Machine Learning Notes - Pytorch
- Pytorch Introduction ¶
- Import PyTorch ¶
- Pytorch Tensor ¶
- Pytorch Autograd ¶
- Pytorch DataLoader ¶
- Pytorch Transforms ¶
- Pytorch Loss Functions ¶
- Torch Activation Functions ¶
- Torch Optimizer ¶
Pytorch Introduction ¶
PyTorch is an open source machine learning framework. You can find more information about PyTorch by following one of the official tutorials or by reading the documentation.
Import PyTorch ¶
torch.cuda.is_available()
# Import pytorch and check its version
import torch
import numpy as np
print(torch.__version__)
print(f'Is cuda available? {torch.cuda.is_available()}')
#
Pytorch Tensor ¶
Tensor Initialization ¶
torch.tensor(),
torch.from_numpy(),
torch.zeros_like(),
torch.ones_like(),
torch.rand(),
torch.ones(),
torch.zeros(),
torch.eye(),
torch.full(),
# Tensor Initialization
# Directly from data
data = [[1, 2],[3, 4]]
x_data = torch.tensor(data)
print(f"Direct Tensor: \n {x_np} \n")
# From a NumPy array
np_array = np.array(data)
x_np = torch.from_numpy(np_array)
print(f"Numpy Tensor: \n {x_np} \n")
# From another tensor:
x_zeros = torch.zeros_like(x_data)
print(f"Zeros Tensor: \n {x_zeros} \n")
x_ones = torch.ones_like(x_data)
print(f"Ones Tensor: \n {x_ones} \n")
x_rand = torch.rand_like(x_data, dtype=torch.float)
print(f"Random Tensor: \n {x_rand} \n")
shape = (2,3,)
rand_tensor = torch.rand(shape)
ones_tensor = torch.ones(shape)
zeros_tensor = torch.zeros(shape)
full_tensor = torch.full(shape, 2)
print(f"Random Tensor: \n {rand_tensor} \n")
print(f"Ones Tensor: \n {ones_tensor} \n")
print(f"Zeros Tensor: \n {zeros_tensor}\n")
print(f"Full twos Tensor: \n {full_tensor}\n")
#
Tensor Attributes ¶
tensor.dim(),
tensor.shape,
tensor.dtype,
tensor.device,
# Tensor Attributes
tensor = torch.rand(3,4)
print(f"Dimension of tensor: {tensor.dim()}\n")
print(f"Shape of tensor: {tensor.shape}\n")
print(f"Datatype of tensor: {tensor.dtype}\n")
print(f"Device tensor is stored on: {tensor.device}\n")
#
Tensor Indexing ¶
tensor[start:stop:step]
# Tensor slicing
x = torch.tensor([[1,2,3,4], [5,6,7,8], [9,10,11,12]])
print('Original tensor:')
print(x)
print('shape: ', x.shape)
# Get row 1, and all columns.
print('\nSingle row:')
print(x[1, :])
print('shape: ', x[1, :].shape, x[1].shape)  # x[1] gives the same row
print('\nSingle column:')
print(x[:, 1])
print('shape: ', x[:, 1].shape)
# Get the first two rows and the last three columns
print('\nFirst two rows, last three columns:')
print(x[:2, -3:])
print('shape: ', x[:2, -3:].shape)
# Get every other row, and columns at index 1 and 2
print('\nEvery other row, middle columns:')
print(x[::2, 1:3])
print('shape: ', x[::2, 1:3].shape)
#
More generally, given index arrays idx0 and idx1 with N elements each, a[idx0, idx1] is equivalent to:
torch.tensor([
    a[idx0[0], idx1[0]],
    a[idx0[1], idx1[1]],
    ...,
    a[idx0[N - 1], idx1[N - 1]]
])
(A similar pattern extends to tensors with more than two dimensions.)
# Integer Index
x = torch.tensor([[1,2,3,4], [5,6,7,8], [9,10,11,12]])
print('Original tensor:')
print(x)
print('shape: ', x.shape)
idx = torch.tensor([3, 2, 1, 0]) # Index arrays can be int64 torch tensors
print('\nReordered columns:')
print(x[:, idx])
a = torch.tensor([0, 1, 0]) # Index arrays can be int64 torch tensors
b = torch.tensor([3, 2, 1]) # Index arrays can be int64 torch tensors
print('\nReordered rows/columns:')
print(x[a, b])
#
# Boolean indexing
x = torch.tensor([[1,2], [3, 4], [5, 6]])
print('Original tensor:')
print(x)
mask = (x > 3)
print('\nMask tensor:')
print(mask)
# We can use the mask to construct a rank-1 tensor containing the elements of a
# that are selected by the mask
print('\nSelecting elements with the mask:')
print(x[mask])
# We can also use boolean masks to modify tensors; for example this sets all
# elements <= 3 to zero:
x[x <= 3] = 0
print('\nAfter modifying with a mask:')
print(x)
#
Tensor Operations ¶
torch.cat(), torch.sum(), torch.mean(), torch.max(), torch.min(), torch.dot(),
torch.mm(), torch.mv(), torch.addmm(), torch.addmv(), torch.bmm(), torch.baddbmm(),
torch.matmul(), torch.broadcast_tensors()
Tensor Stacks ¶
# Tensor Operations
# Standard numpy-like indexing and slicing
tensor = torch.ones(4, 4)
tensor[:,1] = 0
tensor[2,2] = 2
tensor[3,3] = 3
print(f'{tensor}\n')
# When you use `torch.cat` and specify `dim=d`, the size of dimension `d` increases.
t_h = torch.cat([tensor, tensor], dim=1)
print(f'horizontal cat:\n {t_h}\n')
t_v = torch.cat([tensor, tensor], dim=0)
print(f'vertical cat:\n {t_v}\n')
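# Unlike `torch.cat`, `torch.stack` joins tensors along a NEW dimension
# (added illustration to match the section title "Tensor Stacks")
t_s = torch.stack([tensor, tensor], dim=0)
print(f'stack along new dim 0, shape: {t_s.shape}\n')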
#
Elementwise Operations ¶
# Elementwise operations
x = torch.tensor([[1, 2, 3, 4]], dtype=torch.float32)
y = torch.tensor([[5, 6, 7, 8]], dtype=torch.float32)
# Elementwise sum; all give the same result
print('Elementwise sum:')
print(x + y)
print(torch.add(x, y))
print(x.add(y))
# Elementwise difference
print('\nElementwise difference:')
print(x - y)
print(torch.sub(x, y))
print(x.sub(y))
# Elementwise product
print('\nElementwise product:')
print(x * y)
print(torch.mul(x, y))
print(x.mul(y))
# Elementwise division
print('\nElementwise division')
print(x / y)
print(torch.div(x, y))
print(x.div(y))
# Elementwise power
print('\nElementwise power')
print(x ** y)
print(torch.pow(x, y))
print(x.pow(y))
#
Reduction Operations ¶
Reduction operations reduce the rank of tensors: the dimension over which you perform the reduction will be removed from the shape of the output. If you pass keepdim=True to a reduction operation, the specified dimension will not be removed; the output tensor will instead have a shape of 1 in that dimension.
# Reduction
# When you reduce and specify `dim=d`, dimension `d` is removed (unless keepdim=True).
x = torch.tensor([[1, 2, 3],
[4, 5, 6]], dtype=torch.float32)
print('Original tensor:')
print(x)
print('\nSum over entire tensor:')
print(torch.sum(x))
# Summing over dim=0 (the rows) gives the sum of each column:
print('\nSum of each column:')
print(torch.sum(x, dim=0))
# Summing over dim=1 (the columns) gives the sum of each row:
print('\nSum of each row:')
print(torch.sum(x, dim=1))
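# Passing keepdim=True keeps the reduced dimension with size 1
# (added illustration of the keepdim behaviour described above)
print('\nSum of each row with keepdim=True:')
print(torch.sum(x, dim=1, keepdim=True))
print('shape: ', torch.sum(x, dim=1, keepdim=True).shape)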
#
Matrix Operations ¶
- @ performs matrix multiplication, equivalent to torch.matmul(); related routines include torch.mm(), torch.mv(), and torch.bmm().
- torch.matmul()
  - If both tensors are 1-dimensional, the dot product (scalar) is returned.
  - If both arguments are 2-dimensional, the matrix-matrix product is returned.
  - If the first argument is 2-dimensional and the second argument is 1-dimensional, the matrix-vector product is returned.
  - If the first argument is 1-dimensional and the second argument is 2-dimensional, a 1 is prepended to its dimension for the purpose of the matrix multiply. After the matrix multiply, the prepended dimension is removed.
  - Supports broadcasting.
# Matrix Operations
import torch
# torch.dot
x0 = torch.dot(torch.tensor([2, 3]), torch.tensor([2, 1]))
print(f'\n x0 shape: {x0.shape}')
# torch.mm
x1 = torch.mm(torch.randn(2, 3), torch.randn(3, 3))
print(f'\n x1 shape: {x1.shape}')
# torch.mv
x2 = torch.mv(torch.randn(2, 3), torch.randn(3))
print(f'\n x2 shape: {x2.shape}')
# torch.addmm
M = torch.randn(2, 3)
m1 = torch.randn(2, 3)
m2 = torch.randn(3, 3)
# m1 @ m2 + M
x3 = torch.addmm(M, m1, m2)
print(f'\n x3 shape: {x3.shape}')
# torch.bmm
bm1 = torch.randn(10, 3, 4)
bm2 = torch.randn(10, 4, 5)
x4 = torch.bmm(bm1, bm2)
print(f'\n x4 shape: {x4.shape}')
# torch.matmul()
# vector x vector
print(f'\n vector x vector {torch.matmul(torch.randn(3), torch.randn(3)).size()}')
# matrix x vector
print(f' matrix x vector {torch.matmul(torch.randn(3, 4), torch.randn(4)).size()}')
# batched matrix x broadcasted vector
print(f' batched matrix x broadcasted vector {torch.matmul(torch.randn(10, 3, 4), torch.randn(4)).size()}')
# batched matrix x batched matrix
print(f' batched matrix x batched matrix {torch.matmul(torch.randn(10, 3, 4), torch.randn(10, 4, 5)).size()}')
# batched matrix x broadcasted matrix
print(f' batched matrix x broadcasted matrix {torch.matmul(torch.randn(10, 3, 4), torch.randn(4, 5)).size()}')
#
Tensor Broadcasting ¶
- Each tensor has at least one dimension.
- When iterating over the dimension sizes, starting at the trailing dimension, the dimension sizes must either be equal, one of them is 1, or one of them does not exist.
# Tensor Broadcasting
x = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]])
v = torch.tensor([1, 0, 1])
print('Here is x (before broadcasting):')
print(x)
print('x.shape: ', x.shape)
print('\nHere is v (before broadcasting):')
print(v)
print('v.shape: ', v.shape)
xx, vv = torch.broadcast_tensors(x, v)
print('\nHere is xx (after broadcasting):')
print(xx)
print('xx.shape: ', xx.shape)
print('\nHere is vv (after broadcasting):')
print(vv)
print('vv.shape: ', vv.shape)
#
Tensor Data Type ¶
tensor.to(),
tensor.new_zeros(),
tensor.float(),
tensor.double(),
# Tensor Data Type
# Let torch choose the datatype
x0 = torch.tensor([1, 2]) # List of integers
x1 = torch.tensor([1., 2.]) # List of floats
x2 = torch.tensor([1., 2]) # Mixed list
print('dtype when torch chooses for us:')
print('List of integers:', x0.dtype)
print('List of floats:', x1.dtype)
print('Mixed list:', x2.dtype)
# Force a particular datatype
y0 = torch.tensor([1, 2], dtype=torch.float32) # 32-bit float
y1 = torch.tensor([1, 2], dtype=torch.int32) # 32-bit (signed) integer
y2 = torch.tensor([1, 2], dtype=torch.int64) # 64-bit (signed) integer
print('\ndtype when we force a datatype:')
print('32-bit float: ', y0.dtype)
print('32-bit integer: ', y1.dtype)
print('64-bit integer: ', y2.dtype)
# Other creation ops also take a dtype argument
z0 = torch.ones(1, 2) # Let torch choose for us
z1 = torch.ones(1, 2, dtype=torch.int16) # 16-bit (signed) integer
z2 = torch.ones(1, 2, dtype=torch.uint8) # 8-bit (unsigned) integer
print('\ntorch.ones with different dtypes')
print('default dtype:', z0.dtype)
print('16-bit integer:', z1.dtype)
print('8-bit unsigned integer:', z2.dtype)
x0 = torch.eye(3, dtype=torch.int64)
x1 = x0.float() # Cast to 32-bit float
x2 = x0.double() # Cast to 64-bit float
x3 = x0.to(torch.float32) # Alternate way to cast to 32-bit float
x4 = x0.to(torch.float64) # Alternate way to cast to 64-bit float
print('\nx0:', x0.dtype)
print('x1:', x1.dtype)
print('x2:', x2.dtype)
print('x3:', x3.dtype)
print('x4:', x4.dtype)
x0 = torch.eye(3, dtype=torch.float64) # Shape (3, 3), dtype torch.float64
x1 = torch.zeros_like(x0) # Shape (3, 3), dtype torch.float64
x2 = x0.new_zeros(4, 5) # Shape (4, 5), dtype torch.float64
x3 = torch.ones(6, 7).to(x0) # Shape (6, 7), dtype torch.float64
print('\nx0 shape is %r, dtype is %r' % (x0.shape, x0.dtype))
print('x1 shape is %r, dtype is %r' % (x1.shape, x1.dtype))
print('x2 shape is %r, dtype is %r' % (x2.shape, x2.dtype))
print('x3 shape is %r, dtype is %r' % (x3.shape, x3.dtype))
#
Tensor Reshape ¶
tensor.view(), tensor.reshape(), tensor.transpose(), tensor.permute(), tensor.contiguous()
The view() function takes elements in row-major order, so you cannot transpose matrices with view().
# tensor.view()
x0 = torch.tensor([[1, 2, 3, 4], [5, 6, 7, 8]])
print('Original tensor:')
print(x0)
print('shape:', x0.shape)
# Flatten x0 into a rank 1 vector of shape (8,)
# Takes elements in row-major order
x1 = x0.view(-1)
print('\nFlattened tensor:')
print(x1)
print('shape:', x1.shape)
# Rank 2 tensor
x2 = x0.view(-1, 1)
print('\nRank 2 tensor:')
print('shape:', x2.shape)
# View x0 as shape (2, 2, 2)
x3 = x0.view(-1, 2, 2)
print('\nRank 3 tensor:')
print('shape:', x3.shape)
#
# Tensor transpose and permute
# Create a tensor of shape (2, 3, 4)
x0 = torch.tensor([
[[1, 2, 3, 4],
[5, 6, 7, 8],
[9, 10, 11, 12]],
[[13, 14, 15, 16],
[17, 18, 19, 20],
[21, 22, 23, 24]]])
print('Original tensor:')
print(x0)
print('shape:', x0.shape)
# Swap axes 1 and 2; shape is (2, 4, 3)
x1 = x0.transpose(1, 2)
print('\nSwap axes 1 and 2:')
print(x1)
print(x1.shape)
# Permute axes; the argument (1, 2, 0) means:
# - Make the old dimension 1 appear at dimension 0;
# - Make the old dimension 2 appear at dimension 1;
# - Make the old dimension 0 appear at dimension 2
# This results in a tensor of shape (3, 4, 2)
x2 = x0.permute(1, 2, 0)
print('\nPermute axes')
print(x2)
print('shape:', x2.shape)
#
Tensor Test ¶
torch.all(), torch.isclose(), torch.allclose()
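A minimal illustration of these checks (the tensors below are chosen just for demonstration):
# Tensor Test
x = torch.tensor([1.0, 2.0, 3.0])
y = x + 1e-7
print(torch.all(x > 0))        # True if every element satisfies the condition
print(torch.isclose(x, y))     # elementwise closeness within a tolerance
print(torch.allclose(x, y))    # True if all elements are close
#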
Tensor Range ¶
torch.arange(), torch.linspace()
# Tensor Range
# torch.arange(start=0, end, step=1, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False)
print(torch.arange(0, 10, 2))    # tensor([0, 2, 4, 6, 8])
# torch.linspace(start, end, steps, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) → Tensor
print(torch.linspace(0, 1, 5))   # tensor([0.0000, 0.2500, 0.5000, 0.7500, 1.0000])
#
Tensor GPU ¶
# Tensor on GPU
# Construct a tensor on the CPU
x0 = torch.tensor([[1, 2], [3, 4]], dtype=torch.float32)
print('x0 device:', x0.device)
if torch.cuda.is_available():
    # Move it to the GPU using .to()
    x1 = x0.to('cuda')
    print('x1 device:', x1.device)
    # Move it to the GPU using .cuda()
    x2 = x0.cuda()
    print('x2 device:', x2.device)
    # Move it back to the CPU using .to()
    x3 = x1.to('cpu')
    print('x3 device:', x3.device)
    # Move it back to the CPU using .cpu()
    x4 = x2.cpu()
    print('x4 device:', x4.device)
    # We can construct tensors directly on the GPU as well
    y = torch.tensor([[1, 2, 3], [4, 5, 6]], dtype=torch.float64, device='cuda')
    print('y device / dtype:', y.device, y.dtype)
    # Calling x.to(y) where y is a tensor will return a copy of x with the same
    # device and dtype as y
    x5 = x0.to(y)
    print('x5 device / dtype:', x5.device, x5.dtype)
else:
    print('Cuda is not available.')
#
Pytorch Autograd ¶
tensor.requires_grad_(), tensor.detach()
Every Tensor has a flag, requires_grad, that allows for fine-grained exclusion of subgraphs from gradient computation and can increase efficiency.
# Pytorch autograd
import torch
x = torch.randn(5, 5) # requires_grad=False by default
y = torch.randn(5, 5) # requires_grad=False by default
z = torch.randn((5, 5), requires_grad=True)
a = x + y
print(f'a requires_grad: {a.requires_grad}')
b = (a + 2*z).sum()
print(f'b requires_grad: {b.requires_grad}')
b.backward()
print(f'z grad: \n {z.grad}')
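# tensor.requires_grad_() flips the flag in place; tensor.detach() returns a
# tensor that shares data but is excluded from gradient tracking
# (added illustration of the two functions listed above)
w = torch.randn(3)
w.requires_grad_()
print(f'w requires_grad: {w.requires_grad}')
w_detached = w.detach()
print(f'w_detached requires_grad: {w_detached.requires_grad}')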
#
Pytorch DataLoader ¶
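A DataLoader wraps a dataset and provides batching, shuffling, and parallel loading. A minimal sketch, assuming a toy TensorDataset of random features and labels (the data here is only a placeholder):
# Pytorch DataLoader (minimal sketch; the random dataset is only a placeholder)
import torch
from torch.utils.data import TensorDataset, DataLoader
features = torch.randn(100, 4)
labels = torch.randint(0, 2, (100,))
dataset = TensorDataset(features, labels)
loader = DataLoader(dataset, batch_size=16, shuffle=True)
for batch_features, batch_labels in loader:
    print(batch_features.shape, batch_labels.shape)
    break
#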
Pytorch Transforms ¶
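Transforms preprocess and augment data before it is fed to a model. A minimal sketch using torchvision.transforms (assuming torchvision and PIL are installed; the random grayscale image is only a placeholder):
# Pytorch Transforms (minimal torchvision sketch; the dummy image is only a placeholder)
import numpy as np
import torch
from PIL import Image
from torchvision import transforms
transform = transforms.Compose([
    transforms.Resize((32, 32)),
    transforms.ToTensor(),                       # PIL image -> float tensor in [0, 1]
    transforms.Normalize(mean=[0.5], std=[0.5])  # (x - mean) / std per channel
])
img = Image.fromarray(np.random.randint(0, 256, (64, 64), dtype=np.uint8))
out = transform(img)
print(out.shape, out.dtype)
#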
Pytorch Loss Functions ¶
torch.nn.MSELoss(), torch.nn.CrossEntropyLoss(), torch.nn.MultiLabelMarginLoss()
# torch.nn.MSELoss(size_average=None, reduce=None, reduction='mean')
import torch
loss = torch.nn.MSELoss()
x = torch.randn(3, 5, requires_grad=True)
target = torch.randn(3, 5)
output = loss(x, target)
output.backward()
#
CrossEntropyLoss() ¶
- Input: (N, C), N is batch size, C is number of classes
- Target: (N,), $0 \leq target[i] \leq C-1$
- Output: scalar; if reduction is 'none', then (N,)
# torch.nn.CrossEntropyLoss(weight=None, size_average=None, ignore_index=-100, reduce=None, reduction='mean')
import torch
loss = torch.nn.CrossEntropyLoss(reduction='mean')
x = torch.randn(3, 5, requires_grad=True)
target = torch.empty(3, dtype=torch.long).random_(5)
output = loss(x, target)
output.backward()
# Image pixel-level classification with 5 classes
width = 10
height = 10
loss = torch.nn.CrossEntropyLoss(reduction='mean')
x = torch.randn(3, 5, width, height, requires_grad=True)
target = torch.empty(3, width, height, dtype=torch.long).random_(5)
output = loss(x, target)
output.backward()
#
MultiLabelMarginLoss() ¶
Creates a criterion that optimizes a multi-class multi-classification hinge loss; that is, a sample x can have multiple correct labels.
- Input: (C,) or (N, C), N is batch size, C is number of classes
- Target: (C,) or (N, C), label targets after the first -1 are ignored
- Output: Scalar. If reduction is 'none', then (N,)
# torch.nn.MultiLabelMarginLoss(size_average=None, reduce=None, reduction='mean')
import torch
loss = torch.nn.MultiLabelMarginLoss()
x = torch.FloatTensor([[0.1, 0.2, 0.4, 0.8]])
# Single correct label (label 3); targets after the first -1 are ignored
y = torch.LongTensor([[3, -1, -1, -1]])
output = loss(x, y)
print(f'single class loss: {output.item():.4f}')
expected = torch.tensor([0.25 * ((1-(0.8-0.1)) + (1-(0.8-0.2)) + (1-(0.8-0.4)))])
assert(torch.isclose(output, expected))
# Multi-label hinge loss with correct labels 3 and 0
y = torch.LongTensor([[3, 0, -1, -1]])
output = loss(x, y)
print(f'multi-class loss: {output.item():.4f}')
expected = torch.tensor([0.25 * ((1-(0.1-0.2)) + (1-(0.1-0.4)) + (1-(0.8-0.2)) + (1-(0.8-0.4)))])
assert(torch.isclose(output, expected))
#
Torch Activation Functions ¶
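Activation functions introduce non-linearity between layers. A brief illustration of a few common ones (the input values below are chosen just for demonstration):
# Torch Activation Functions (brief illustration)
import torch
import torch.nn as nn
x = torch.linspace(-3, 3, 7)
print('relu:   ', torch.relu(x))
print('sigmoid:', torch.sigmoid(x))
print('tanh:   ', torch.tanh(x))
print('softmax:', torch.softmax(x, dim=0))
# Module versions can be used inside nn.Sequential
act = nn.ReLU()
print('nn.ReLU:', act(x))
#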
Torch Optimizer ¶
torch.optim.SGD(), torch.optim.RMSprop(), torch.optim.Adam()
torch.optim is a package implementing various optimization algorithms. Most commonly used methods are already supported, and the interface is general enough that more sophisticated ones can also be easily integrated in the future.
To use torch.optim you have to construct an optimizer object that will hold the current state and update the parameters based on the computed gradients.
Note: If you need to move a model to GPU via .cuda(), please do so before constructing optimizers for it. Parameters of a model after .cuda() are different objects from those before the call.
Construct Optimizers ¶
# Construct an optimizer
# (`model`, `var1`, and `var2` are placeholders for an existing nn.Module and tensors to optimize)
import torch
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
optimizer = torch.optim.Adam([var1, var2], lr=0.0001)
#
Set Parameter Options ¶
Per-parameter options are set by passing an iterable of dicts (parameter groups) instead of a single iterable of parameters. In the example below, model.layer1's parameters use the default learning rate of 0.001 with a momentum of 0.8, while model.layer2's parameters use a learning rate of 0.01 with the default momentum of 0.9.
# Set Parameter Options
import torch
import torch.nn as nn

class Model(nn.Module):
    def __init__(self, config={}):
        super(Model, self).__init__()
        num_classes = config.setdefault('num_classes', 10)
        self.layer1 = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        self.layer2 = nn.Sequential(
            nn.Conv2d(16, 32, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        self.fc = nn.Linear(7*7*32, num_classes)

    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = out.reshape(out.size(0), -1)
        out = self.fc(out)
        return out

model = Model()
# layer1 uses the default lr (0.001) with momentum 0.8;
# layer2 uses lr 0.01 with the default momentum (0.9)
optimizer = torch.optim.SGD([
    {'params': model.layer1.parameters(), 'momentum': 0.8},
    {'params': model.layer2.parameters(), 'lr': 0.01}
], lr=0.001, momentum=0.9)
print(optimizer)
for i in range(len(optimizer.param_groups)):
    print(f'group_id: {i}, lr: {optimizer.param_groups[i]["lr"]}, momentum: {optimizer.param_groups[i]["momentum"]}')
#
Model Parameter ¶
import torch
import torch.nn as nn

class Model(nn.Module):
    def __init__(self, config={}):
        super(Model, self).__init__()
        num_classes = config.setdefault('num_classes', 10)
        self.layer1 = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        self.layer2 = nn.Sequential(
            nn.Conv2d(16, 32, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        self.fc = nn.Linear(7*7*32, num_classes)

    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = out.reshape(out.size(0), -1)
        out = self.fc(out)
        return out

model = Model()
print('Named parameters...')
for name, param in model.named_parameters():
    print(name)
    print(param.size(), '\n')
# print('Parameters...')
# for param in model.parameters():
#     print(param.size(), '\n')
#
Taking an Optimization Step ¶
# Taking an Optimization Step
# (`dataset`, `model`, and `loss_fn` are placeholders for your data loader, network, and loss)
for input, target in dataset:
    optimizer.zero_grad()
    output = model(input)
    loss = loss_fn(output, target)
    loss.backward()
    optimizer.step()
#