For these exercises, you will need to install `torch` and `torchvision`. Open a terminal and run `python -m pip install torch torchvision`.
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor
import matplotlib.pyplot as plt
from time import time
Exercise 8.1
Tuning the hyperparameters of deep learning models and their optimization routines can be a very time-consuming task. It is essentially a task that requires a lot of experience and patience. In this exercise, we will try to understand the effect of the learning rate parameter in stochastic gradient descent (SGD).
Revisit the fully connected neural network we trained on the FashionMNIST dataset in the lecture notes. Modify the code so that you can easily perform multiple training runs with different learning rates. Then produce a plot that shows, for each learning rate, the classification accuracy after each epoch against the cumulative training time up to that epoch.
# Solution

# FashionMNIST training split, stored under "deep_store" and converted to tensors
training_data = datasets.FashionMNIST(
    root="deep_store",
    train=True,
    download=True,
    transform=ToTensor(),
)

# FashionMNIST test split, prepared the same way
test_data = datasets.FashionMNIST(
    root="deep_store",
    train=False,
    download=True,
    transform=ToTensor(),
)

# Names of the ten clothing categories
classes = [
    "T-shirt/top",
    "Trouser",
    "Pullover",
    "Dress",
    "Coat",
    "Sandal",
    "Shirt",
    "Sneaker",
    "Bag",
    "Ankle boot",
]

# Mini-batch loaders over both splits
batch_size = 128
train_dataloader = DataLoader(training_data, batch_size=batch_size)
test_dataloader = DataLoader(test_data, batch_size=batch_size)

# Look at the first test batch only, to report tensor shapes and the label dtype
for X, y in test_dataloader:
    print(f"Shape of X [N, C, H, W]: {X.shape}")
    print(f"Shape of y: {y.shape} {y.dtype}")
    break
# Choose the compute device: CUDA if present, otherwise MPS, otherwise the CPU
device = torch.device("cpu")
if torch.cuda.is_available():
    device = torch.device("cuda")
    device_name = torch.cuda.get_device_name(0)
elif torch.backends.mps.is_available():
    device = torch.device("mps")
print(f"Using {device} device")
# Define the model
class NeuralNetwork(nn.Module):
    """Fully connected classifier: flatten, Linear/ReLU hidden layers, Linear output.

    With the default arguments this builds the 784-512-512-10 network, with
    the same submodule names and ordering as a hard-coded stack of
    Linear(784, 512), ReLU, Linear(512, 512), ReLU, Linear(512, 10).

    Args:
        hidden_sizes: Widths of the hidden layers, in order. Each hidden
            layer is followed by a ReLU. An empty sequence gives a single
            linear layer from input to output.
        in_features: Number of values per sample after flattening.
        num_classes: Number of output logits.
    """

    def __init__(self, hidden_sizes=(512, 512), in_features=28 * 28, num_classes=10):
        super().__init__()
        self.flatten = nn.Flatten()

        # Build the stack front to back so layers are created (and therefore
        # randomly initialised) in the same order as they are applied.
        layers = []
        width = in_features
        for hidden in hidden_sizes:
            layers.append(nn.Linear(width, hidden))
            layers.append(nn.ReLU())
            width = hidden
        layers.append(nn.Linear(width, num_classes))

        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Flatten each sample in ``x`` and return the raw class logits."""
        return self.linear_relu_stack(self.flatten(x))
# Train and test functions from the lecture notes
def train(dataloader, model, loss_fn, optimizer, *, log_every=None):
    """Run one training pass over ``dataloader``, updating ``model`` in place.

    Args:
        dataloader: Iterable yielding ``(X, y)`` batches. Its ``dataset``
            attribute is only consulted when progress logging is enabled.
        model: Module to train; it is switched to training mode.
        loss_fn: Callable taking ``(predictions, targets)`` and returning a
            loss on which ``backward()`` can be called.
        optimizer: Optimizer whose ``step()`` / ``zero_grad()`` are called
            once per batch.
        log_every: If a positive integer, print the current loss every
            ``log_every`` batches. The default ``None`` prints nothing.

    Batches are moved to the module-level ``device`` before the forward pass.
    """
    model.train()  # switch to train mode

    # The dataset size is only needed for the progress message
    size = len(dataloader.dataset) if log_every else None

    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward()
        optimizer.step()
        # Clear the gradients so they do not accumulate into the next batch
        optimizer.zero_grad()

        if log_every and batch % log_every == 0:
            current = (batch + 1) * len(X)
            print(f"loss: {loss.item():>7f} [{current:>5d}/{size:>5d}]")
def test(dataloader, model, loss_fn):
    """Evaluate ``model`` on ``dataloader``; print a summary and return the accuracy.

    The printed average loss is the mean of the per-batch losses. The
    returned accuracy is the fraction of correctly classified samples,
    measured against ``len(dataloader.dataset)``. Batches are moved to the
    module-level ``device`` before the forward pass.
    """
    n_samples = len(dataloader.dataset)
    n_batches = len(dataloader)
    model.eval()  # switch to evaluation mode

    loss_total = 0
    hits = 0
    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            logits = model(inputs)
            loss_total += loss_fn(logits, targets).item()
            # A sample counts as correct when its largest logit is at the target index
            matches = logits.argmax(1) == targets
            hits += matches.type(torch.float).sum().item()

    test_loss = loss_total / n_batches
    correct = hits / n_samples
    print(f"Accuracy on test set: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    return correct  # return the accuracy
# Every run trains for the same number of epochs
epochs = 5

# A single loss function is shared by all runs
loss_fn = nn.CrossEntropyLoss()

# Learning rates to compare
learning_rates = [1e-3, 1e-2, 1e-1, 5e-1]

for lr in learning_rates:
    print(f"\nTraining with learning rate {lr}")

    # Build a fresh model for each run, reseeding first so that the random
    # number generator is in the same state before every construction
    torch.manual_seed(seed=42)
    model = NeuralNetwork().to(device)
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    # Start the clock for this run
    st = time()

    # First data point: accuracy of the untrained model (not counted as an epoch)
    acc_epoch = [test(test_dataloader, model, loss_fn)]
    time_epoch = [time() - st]

    for t in range(epochs):
        print(f"Epoch {t+1}...")
        train(train_dataloader, model, loss_fn, optimizer)
        # Record test accuracy and elapsed time; the elapsed time therefore
        # includes the evaluation passes as well as the training itself
        acc_epoch.append(test(test_dataloader, model, loss_fn))
        time_epoch.append(time() - st)

    # One curve per learning rate, all drawn on the same axes
    plt.plot(time_epoch, acc_epoch, marker='o', label=f"lr = {lr}")

plt.title("Training Time vs Accuracy")
plt.xlabel("Time (seconds)")
plt.ylabel("Accuracy")
plt.grid(True)
plt.legend()
plt.show()
Shape of X [N, C, H, W]: torch.Size([128, 1, 28, 28]) Shape of y: torch.Size([128]) torch.int64 Using cpu device Training with learning rate 0.001 Accuracy on test set: 16.9%, Avg loss: 2.300619 Epoch 1... Accuracy on test set: 38.3%, Avg loss: 2.240401 Epoch 2... Accuracy on test set: 51.1%, Avg loss: 2.166955 Epoch 3... Accuracy on test set: 54.2%, Avg loss: 2.063158 Epoch 4... Accuracy on test set: 56.2%, Avg loss: 1.915649 Epoch 5... Accuracy on test set: 57.1%, Avg loss: 1.729528 Training with learning rate 0.01 Accuracy on test set: 16.9%, Avg loss: 2.300619 Epoch 1... Accuracy on test set: 63.6%, Avg loss: 1.092849 Epoch 2... Accuracy on test set: 70.6%, Avg loss: 0.789251 Epoch 3... Accuracy on test set: 75.5%, Avg loss: 0.686245 Epoch 4... Accuracy on test set: 78.0%, Avg loss: 0.624007 Epoch 5... Accuracy on test set: 79.8%, Avg loss: 0.582494 Training with learning rate 0.1 Accuracy on test set: 16.9%, Avg loss: 2.300619 Epoch 1... Accuracy on test set: 81.6%, Avg loss: 0.528303 Epoch 2... Accuracy on test set: 83.5%, Avg loss: 0.460536 Epoch 3... Accuracy on test set: 84.7%, Avg loss: 0.430501 Epoch 4... Accuracy on test set: 85.4%, Avg loss: 0.408183 Epoch 5... Accuracy on test set: 86.1%, Avg loss: 0.391376 Training with learning rate 0.5 Accuracy on test set: 16.9%, Avg loss: 2.300619 Epoch 1... Accuracy on test set: 83.0%, Avg loss: 0.453234 Epoch 2... Accuracy on test set: 83.7%, Avg loss: 0.443161 Epoch 3... Accuracy on test set: 84.7%, Avg loss: 0.418452 Epoch 4... Accuracy on test set: 85.4%, Avg loss: 0.398655 Epoch 5... Accuracy on test set: 86.1%, Avg loss: 0.389794
Exercise 8.2
Again starting with the fully connected neural network we trained on the FashionMNIST dataset in the lecture notes, experiment with different model architectures by varying the number of layers and the number of neurons per layer. Produce a plot that shows, for a few different configurations, the classification accuracy after each epoch against the cumulative training time up to that epoch.
Exercise 8.3
Modify the LeNet CNN example from the lecture notes to work with colour images, and train and test it on the CIFAR10 dataset. Compared to FashionMNIST, the images are now in colour (3x32x32 instead of 1x28x28), so the first convolutional layer needs to act on three input channels. This in turn affects the dimensions of the subsequent layers. In addition, replace the Tanh activation functions with ReLU activations, which are more commonly used nowadays.
This PyTorch tutorial might be useful.
# Solution

# CIFAR10 training split, stored under "deep_store" and converted to tensors
training_data = datasets.CIFAR10(
    root="deep_store",
    train=True,
    download=True,
    transform=ToTensor(),
)

# CIFAR10 test split, prepared the same way
test_data = datasets.CIFAR10(
    root="deep_store",
    train=False,
    download=True,
    transform=ToTensor(),
)

# Class names, looked up by integer label when titling the image grids
classes = [
    'plane',
    'car',
    'bird',
    'cat',
    'deer',
    'dog',
    'frog',
    'horse',
    'ship',
    'truck',
]
# Display the first 16 training images in a 4x4 grid, each titled with its class name
fig, axes = plt.subplots(4, 4)
fig.tight_layout()
for i in range(4):
    for j in range(4):
        k = 4 * i + j  # row-major position in the grid doubles as the dataset index
        sample = training_data[k]
        x, y = sample[0], sample[1]
        img = x.data
        panel = axes[i, j]
        # The image tensor is channels-first (3x32x32); imshow expects channels-last (32x32x3)
        panel.imshow(np.transpose(img, (1, 2, 0)))
        panel.axis(False)
        panel.set_title(classes[y])
plt.show()
# Mini-batch loaders over the training and test splits
batch_size = 64
train_dataloader = DataLoader(training_data, batch_size=batch_size)
test_dataloader = DataLoader(test_data, batch_size=batch_size)

# Look at the first test batch only, to report tensor shapes and the label dtype
for X, y in test_dataloader:
    print(f"Shape of X [N, C, H, W]: {X.shape}")
    print(f"Shape of y: {y.shape} {y.dtype}")
    break
# Select where tensors and the model will live: CUDA first, then MPS, else CPU
device = torch.device("cpu")
if torch.cuda.is_available():
    device = torch.device("cuda")
    device_name = torch.cuda.get_device_name(0)
elif torch.backends.mps.is_available():
    device = torch.device("mps")
print(f"Using {device} device")
# Define model
class LeNet5_mod(nn.Module):
    """LeNet-5 style CNN for 3x32x32 inputs with ReLU activations and 10 outputs.

    Two convolution + average-pool stages are followed by three fully
    connected layers. The same ReLU and pooling modules are reused at every
    stage.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 6, kernel_size=5, stride=1, padding=2)
        self.act = nn.ReLU()
        self.pool = nn.AvgPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(6, 16, kernel_size=5, stride=1, padding=0)
        self.flat = nn.Flatten()
        self.fc1 = nn.Linear(576, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Return the 10 class logits for a batch of 3x32x32 images."""
        # Stage 1: 3x32x32 -> conv (padding keeps 32x32) -> 6x32x32 -> pool -> 6x16x16
        features = self.pool(self.act(self.conv1(x)))
        # Stage 2: 6x16x16 -> conv (no padding) -> 16x12x12 -> pool -> 16x6x6
        features = self.pool(self.act(self.conv2(features)))
        # Flatten 16x6x6 into a vector of 576 values per sample
        flattened = self.flat(features)
        # Classifier head: 576 -> 120 -> 84 -> 10, no activation on the output
        hidden = self.act(self.fc1(flattened))
        hidden = self.act(self.fc2(hidden))
        return self.fc3(hidden)
# Seed the random number generator before constructing the network, then
# move the network to the selected device
torch.manual_seed(seed=42)
model = LeNet5_mod().to(device)
print(model, "\n")

# Count only the parameters that require gradients
total_params = sum(
    param.numel() for param in model.parameters() if param.requires_grad
)
print(f'Total number of parameters: {total_params}')
# Train and test functions from the lecture notes
def train(dataloader, model, loss_fn, optimizer):
    """Run one training pass over ``dataloader``, printing the loss every 250 batches.

    Each batch is moved to the module-level ``device``, pushed through
    ``model``, and used for one optimizer step. The progress line shows the
    batch loss and the approximate number of samples processed out of
    ``len(dataloader.dataset)``.
    """
    n_samples = len(dataloader.dataset)
    model.train()  # switch to train mode

    for step, (inputs, targets) in enumerate(dataloader):
        inputs = inputs.to(device)
        targets = targets.to(device)

        # Forward pass and loss for this batch
        batch_loss = loss_fn(model(inputs), targets)

        # Backward pass, parameter update, then clear the gradients
        batch_loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if step % 250 != 0:
            continue
        seen = (step + 1) * len(inputs)
        print(f"loss: {batch_loss.item():>7f} [{seen:>5d}/{n_samples:>5d}]")
def test(dataloader, model, loss_fn):
    """Evaluate ``model`` on ``dataloader``; print a summary and return the accuracy.

    Args:
        dataloader: Iterable yielding ``(X, y)`` batches; ``len(dataloader)``
            and ``len(dataloader.dataset)`` are used for averaging.
        model: Module to evaluate; it is switched to evaluation mode.
        loss_fn: Callable taking ``(predictions, targets)`` and returning a
            scalar tensor.

    Returns:
        The fraction of samples whose highest-scoring class equals the
        target. Callers that only want the printed summary can ignore it.

    Batches are moved to the module-level ``device`` before the forward pass.
    """
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()  # switch to evaluation mode
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches  # mean of the per-batch losses
    correct /= size  # fraction of correctly classified samples
    print(f"Accuracy on test set: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    return correct  # return the accuracy
# Train the model
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())  # Adam with its default settings
epochs = 3

# Time the whole run; each epoch is one training pass followed by a test-set evaluation
st = time()
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dataloader, model, loss_fn, optimizer)
    test(test_dataloader, model, loss_fn)
print("Epochs completed in", time() - st, "seconds.")
# Make some predictions on the test set
model.eval()
fig, axes = plt.subplots(4, 4)
fig.tight_layout()
for i in range(4):
    for j in range(4):
        k = 4 * i + j  # row-major position in the grid doubles as the dataset index
        x, y = test_data[k][0], test_data[k][1]
        img = x.data
        panel = axes[i, j]
        # The image tensor is channels-first (3x32x32); imshow expects channels-last (32x32x3)
        panel.imshow(np.transpose(img, (1, 2, 0)))
        panel.axis(False)
        with torch.no_grad():
            # The model expects a batch, so add a leading dimension: 3x32x32 -> 1x3x32x32
            x = x.unsqueeze(0).to(device)
            pred = model(x)
            predicted = classes[pred[0].argmax(0)]
            actual = classes[y]
            # A trailing asterisk marks a correct prediction
            if predicted == actual:
                predicted += '*'
        panel.set_title(predicted)
plt.show()
Shape of X [N, C, H, W]: torch.Size([64, 3, 32, 32]) Shape of y: torch.Size([64]) torch.int64 Using cpu device LeNet5_mod( (conv1): Conv2d(3, 6, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2)) (act): ReLU() (pool): AvgPool2d(kernel_size=2, stride=2, padding=0) (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1)) (flat): Flatten(start_dim=1, end_dim=-1) (fc1): Linear(in_features=576, out_features=120, bias=True) (fc2): Linear(in_features=120, out_features=84, bias=True) (fc3): Linear(in_features=84, out_features=10, bias=True) ) Total number of parameters: 83126 Epoch 1 ------------------------------- loss: 2.319287 [ 64/50000] loss: 1.838265 [16064/50000] loss: 1.728843 [32064/50000] loss: 1.879978 [48064/50000] Accuracy on test set: 42.5%, Avg loss: 1.601184 Epoch 2 ------------------------------- loss: 1.594288 [ 64/50000] loss: 1.475687 [16064/50000] loss: 1.551362 [32064/50000] loss: 1.666932 [48064/50000] Accuracy on test set: 50.0%, Avg loss: 1.377274 Epoch 3 ------------------------------- loss: 1.318630 [ 64/50000] loss: 1.408956 [16064/50000] loss: 1.483812 [32064/50000] loss: 1.632436 [48064/50000] Accuracy on test set: 52.7%, Avg loss: 1.308082 Epochs completed in 37.01607370376587 seconds.