For these exercises, you will need to install torch and torchvision. Open a terminal and run `python -m pip install torch torchvision`.

In [2]:
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor
import matplotlib.pyplot as plt
from time import time

Exercise 8.1¶

Tuning the hyperparameters of deep learning models and their optimization routines can be a very time-consuming task. It is essentially a task that requires a lot of experience and patience. In this exercise, we will try to understand the effect of the learning rate parameter in stochastic gradient descent (SGD).

Revisit the fully connected neural network we trained on the FashionMNIST dataset in the lecture notes. Modify the code so that you can easily perform multiple training runs with different learning rates. Then produce a plot of the classification accuracy against the cumulative training time up to each epoch, with one curve per learning rate.

In [8]:
# Solution

# Fetch both FashionMNIST splits into the "deep_store" directory,
# applying the ToTensor transform to every image
training_data = datasets.FashionMNIST(
    root="deep_store", train=True, download=True, transform=ToTensor()
)
test_data = datasets.FashionMNIST(
    root="deep_store", train=False, download=True, transform=ToTensor()
)

# Human-readable name for each of the ten label indices
classes = [
    "T-shirt/top", "Trouser", "Pullover", "Dress", "Coat",
    "Sandal", "Shirt", "Sneaker", "Bag", "Ankle boot",
]

# Wrap both splits in mini-batch loaders
batch_size = 128
train_dataloader = DataLoader(training_data, batch_size=batch_size)
test_dataloader = DataLoader(test_data, batch_size=batch_size)

# Peek at the first test batch to confirm tensor shapes and label dtype
X, y = next(iter(test_dataloader))
print(f"Shape of X [N, C, H, W]: {X.shape}")
print(f"Shape of y: {y.shape} {y.dtype}")

# Device preference order: CUDA, then MPS, then CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
    device_name = torch.cuda.get_device_name(0)
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using {device} device")

# Define the model
class NeuralNetwork(nn.Module):
    """Fully connected classifier: 28*28 inputs -> 512 -> 512 -> 10 logits."""

    def __init__(self):
        super().__init__()
        # Collapses each sample into a single feature vector
        self.flatten = nn.Flatten()
        # Two hidden layers with ReLU, followed by a linear output layer
        layers = [
            nn.Linear(28*28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10),
        ]
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Return the raw class scores (logits) for a batch ``x``."""
        return self.linear_relu_stack(self.flatten(x))

# Train and test functions from the lecture notes
def train(dataloader, model, loss_fn, optimizer, log_every=None):
    """Run one pass over ``dataloader``, updating ``model`` after every batch.

    Args:
        dataloader: Iterable of ``(X, y)`` batches. It must expose
            ``.dataset`` only when logging is enabled.
        model: Module to optimise; it is switched to train mode.
        loss_fn: Callable ``loss_fn(pred, y)`` returning a scalar tensor.
        optimizer: Optimizer that holds the model's parameters.
        log_every: If a positive integer, print the batch loss every
            ``log_every`` batches. The default (``None``) prints nothing.

    Each batch is moved to the module-level ``device`` before the forward pass.
    """
    # The dataset size is only needed for the progress message
    size = len(dataloader.dataset) if log_every else None
    model.train() # switch to train mode
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if log_every and batch % log_every == 0:
            current = (batch + 1) * len(X)
            print(f"loss: {loss.item():>7f}  [{current:>5d}/{size:>5d}]")

def test(dataloader, model, loss_fn):
    """Evaluate ``model`` on ``dataloader``; print and return the accuracy.

    The printed loss is the mean of the per-batch losses. The returned
    accuracy is the number of correct predictions divided by the dataset
    size, i.e. a fraction rather than a percentage.
    """
    n_samples = len(dataloader.dataset)
    n_batches = len(dataloader)
    model.eval() # switch to evaluation mode
    loss_total = 0
    n_correct = 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            loss_total += loss_fn(pred, y).item()
            # A prediction is a hit when the highest-scoring class equals the label
            hits = pred.argmax(1) == y
            n_correct += hits.type(torch.float).sum().item()
    test_loss = loss_total / n_batches
    correct = n_correct / n_samples
    print(f"Accuracy on test set: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    return correct # return the accuracy

# Every run gets the same budget of epochs
epochs = 5

# A single loss function is shared by all runs
loss_fn = nn.CrossEntropyLoss()

# Learning rates to compare
learning_rates = [1e-3, 1e-2, 1e-1, 5e-1]

for lr in learning_rates:

    print(f"\nTraining with learning rate {lr}")

    # Re-seed before building the model so every run starts from the same weights
    torch.manual_seed(seed=42)
    model = NeuralNetwork().to(device)
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    # Start the clock for this run
    st = time()

    # First data point: accuracy of the untrained model. This doesn't count as an epoch.
    acc_epoch = [test(test_dataloader, model, loss_fn)]
    time_epoch = [time() - st]

    for t in range(epochs):
        print(f"Epoch {t+1}...")
        train(train_dataloader, model, loss_fn, optimizer)
        # Evaluate first and stamp the time afterwards, so the recorded
        # elapsed time includes computing the test predictions
        acc_epoch.append(test(test_dataloader, model, loss_fn))
        time_epoch.append(time() - st)

    # One curve per learning rate, all drawn on the same axes
    plt.plot(time_epoch, acc_epoch, marker='o', label=f"lr = {lr}")

plt.title("Training Time vs Accuracy")
plt.xlabel("Time (seconds)")
plt.ylabel("Accuracy")
plt.grid(True)
plt.legend()
plt.show()
Shape of X [N, C, H, W]: torch.Size([128, 1, 28, 28])
Shape of y: torch.Size([128]) torch.int64
Using cpu device

Training with learning rate 0.001
Accuracy on test set: 16.9%, Avg loss: 2.300619 

Epoch 1...
Accuracy on test set: 38.3%, Avg loss: 2.240401 

Epoch 2...
Accuracy on test set: 51.1%, Avg loss: 2.166955 

Epoch 3...
Accuracy on test set: 54.2%, Avg loss: 2.063158 

Epoch 4...
Accuracy on test set: 56.2%, Avg loss: 1.915649 

Epoch 5...
Accuracy on test set: 57.1%, Avg loss: 1.729528 


Training with learning rate 0.01
Accuracy on test set: 16.9%, Avg loss: 2.300619 

Epoch 1...
Accuracy on test set: 63.6%, Avg loss: 1.092849 

Epoch 2...
Accuracy on test set: 70.6%, Avg loss: 0.789251 

Epoch 3...
Accuracy on test set: 75.5%, Avg loss: 0.686245 

Epoch 4...
Accuracy on test set: 78.0%, Avg loss: 0.624007 

Epoch 5...
Accuracy on test set: 79.8%, Avg loss: 0.582494 


Training with learning rate 0.1
Accuracy on test set: 16.9%, Avg loss: 2.300619 

Epoch 1...
Accuracy on test set: 81.6%, Avg loss: 0.528303 

Epoch 2...
Accuracy on test set: 83.5%, Avg loss: 0.460536 

Epoch 3...
Accuracy on test set: 84.7%, Avg loss: 0.430501 

Epoch 4...
Accuracy on test set: 85.4%, Avg loss: 0.408183 

Epoch 5...
Accuracy on test set: 86.1%, Avg loss: 0.391376 


Training with learning rate 0.5
Accuracy on test set: 16.9%, Avg loss: 2.300619 

Epoch 1...
Accuracy on test set: 83.0%, Avg loss: 0.453234 

Epoch 2...
Accuracy on test set: 83.7%, Avg loss: 0.443161 

Epoch 3...
Accuracy on test set: 84.7%, Avg loss: 0.418452 

Epoch 4...
Accuracy on test set: 85.4%, Avg loss: 0.398655 

Epoch 5...
Accuracy on test set: 86.1%, Avg loss: 0.389794 

No description has been provided for this image

Exercise 8.2¶

Again starting with the fully connected neural network we trained on the FashionMNIST dataset in the lecture notes, experiment with different model architectures by varying the number of layers and neurons per layer. Produce a plot of the classification accuracy against the cumulative training time up to each epoch for a few different configurations.

Exercise 8.3¶

Modify the LeNet CNN example from the lecture notes to work with coloured images, and train and test it on the CIFAR10 dataset. Compared to FashionMNIST, the images are now coloured (3x32x32 instead of 1x28x28), so the first convolutional layer needs to act on three channels. This in turn affects the dimensions of the following layers. In addition, replace the Tanh activation functions with ReLU activations, which are more commonly used nowadays.

This PyTorch tutorial might be useful.

In [9]:
# Solution

# Download training data
training_data = datasets.CIFAR10(root="deep_store", train=True,
                                 download=True, transform=ToTensor())

# Download test data
test_data = datasets.CIFAR10(root="deep_store", train=False,
                             download=True, transform=ToTensor())

classes = [ 'plane', 'car', 'bird', 'cat', 'deer', 
            'dog', 'frog', 'horse', 'ship', 'truck' ]

# Show the first 16 training images
fig, axes = plt.subplots(4,4)
for i in range(4):
    for j in range(4):
        k = j + 4*i
        # Index the dataset once so the sample is loaded and transformed only once
        x, y = training_data[k]
        # data is 3x32x32 (C, H, W) but imshow expects 32x32x3 (H, W, C)
        img = x.permute(1, 2, 0).numpy()
        axes[i,j].imshow(img)
        axes[i,j].axis(False)
        axes[i,j].set_title(classes[y])
# tight_layout only adjusts the spacing at the moment it is called, so run it
# after the images and titles exist to have them taken into account
fig.tight_layout()
plt.show()

# Create data loaders
batch_size = 64
train_dataloader = DataLoader(training_data, batch_size=batch_size)
test_dataloader = DataLoader(test_data, batch_size=batch_size)

# Inspect one batch to confirm tensor shapes and label dtype
for X, y in test_dataloader:
    print(f"Shape of X [N, C, H, W]: {X.shape}")
    print(f"Shape of y: {y.shape} {y.dtype}")
    break

# Get the best available device 
if torch.cuda.is_available():
    device = torch.device("cuda")
    device_name = torch.cuda.get_device_name(0)
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using {device} device")

# Define model
class LeNet5_mod(nn.Module):
    """LeNet-5 variant with ReLU activations for 32x32 images.

    Args:
        in_channels: Number of channels in the input images. The default of 3
            matches the 3x32x32 inputs used in this notebook.
        num_classes: Number of output logits. The default of 10 matches the
            ten entries of the ``classes`` list.

    The flattened feature size (16*6*6 = 576) assumes 32x32 inputs.
    """

    def __init__(self, in_channels=3, num_classes=10):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, 6, kernel_size=5, stride=1, padding=2)
        # The same activation and pooling modules are reused after each stage
        self.act = nn.ReLU()
        self.pool = nn.AvgPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(6, 16, kernel_size=5, stride=1, padding=0)
        self.flat = nn.Flatten()
        self.fc1 = nn.Linear(16 * 6 * 6, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, num_classes)
        
    def forward(self, x):
        """Return the class logits for a batch of ``in_channels``x32x32 images."""
        # input in_channels x32x32, output 6x32x32
        x = self.act(self.conv1(x))
        # input 6x32x32, output 6x16x16
        x = self.pool(x)
        # input 6x16x16, output 16x12x12
        x = self.act(self.conv2(x))
        # input 16x12x12, output 16x6x6
        x = self.pool(x)
        # input 16x6x6, output 576
        x = self.flat(x)
        # input 576, output 120
        x = self.act(self.fc1(x))
        # input 120, output 84
        x = self.act(self.fc2(x))
        # input 84, output num_classes
        x = self.fc3(x)
        return x

# Seed before construction so the weight initialisation is reproducible
torch.manual_seed(seed=42)
model = LeNet5_mod().to(device)
print(model, "\n")

# Count the elements of every trainable parameter tensor
total_params = 0
for param in model.parameters():
    if param.requires_grad:
        total_params += param.numel()
print(f'Total number of parameters: {total_params}')

# Train and test functions from the lecture notes
def train(dataloader, model, loss_fn, optimizer, log_every=250):
    """Run one pass over ``dataloader``, updating ``model`` after every batch.

    Args:
        dataloader: Iterable of ``(X, y)`` batches that exposes ``.dataset``.
        model: Module to optimise; it is switched to train mode.
        loss_fn: Callable ``loss_fn(pred, y)`` returning a scalar tensor.
        optimizer: Optimizer that holds the model's parameters.
        log_every: Print the batch loss every ``log_every`` batches
            (default 250). Pass ``None`` or 0 to disable the progress output.

    Each batch is moved to the module-level ``device`` before the forward pass.
    """
    size = len(dataloader.dataset)
    model.train() # switch to train mode
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if log_every and batch % log_every == 0:
            # Samples seen so far, assuming every earlier batch had len(X) samples
            current = (batch + 1) * len(X)
            print(f"loss: {loss.item():>7f}  [{current:>5d}/{size:>5d}]")

def test(dataloader, model, loss_fn):
    """Evaluate ``model`` on ``dataloader``; print and return the accuracy.

    The printed loss is the mean of the per-batch losses.

    Returns:
        The number of correct predictions divided by the dataset size,
        i.e. a fraction rather than a percentage.
    """
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval() # switch to evaluation mode
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(f"Accuracy on test set: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    return correct # return the accuracy so callers can record it


# Train the model
loss_fn = nn.CrossEntropyLoss() 
optimizer = torch.optim.Adam(model.parameters()) 

epochs = 3
st = time()
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dataloader, model, loss_fn, optimizer)
    test(test_dataloader, model, loss_fn)
print("Epochs completed in", time()-st, "seconds.")

# Make some predictions on the test set
model.eval()
fig, axes = plt.subplots(4,4)
for i in range(4):
    for j in range(4):
        k = j + 4*i
        # Index the dataset once so the sample is loaded and transformed only once
        x, y = test_data[k]

        # data is 3x32x32 (C, H, W) but imshow expects 32x32x3 (H, W, C)
        axes[i,j].imshow(x.permute(1, 2, 0).numpy())
        axes[i,j].axis(False)

        with torch.no_grad():
            # x is 3x32x32 but model expects a batch 1x3x32x32
            x = x.unsqueeze(0)
            x = x.to(device)
            pred = model(x)
            
        # .item() turns the 0-dim index tensor into a plain int for list indexing
        predicted, actual = classes[pred[0].argmax(0).item()], classes[y]
        # Mark correct predictions with a trailing asterisk
        if predicted == actual:
            predicted += '*'
        axes[i,j].set_title(predicted)
# tight_layout only adjusts the spacing at the moment it is called, so run it
# after the images and titles exist to have them taken into account
fig.tight_layout()
plt.show()
No description has been provided for this image
Shape of X [N, C, H, W]: torch.Size([64, 3, 32, 32])
Shape of y: torch.Size([64]) torch.int64
Using cpu device
LeNet5_mod(
  (conv1): Conv2d(3, 6, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
  (act): ReLU()
  (pool): AvgPool2d(kernel_size=2, stride=2, padding=0)
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (flat): Flatten(start_dim=1, end_dim=-1)
  (fc1): Linear(in_features=576, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
) 

Total number of parameters: 83126
Epoch 1
-------------------------------
loss: 2.319287  [   64/50000]
loss: 1.838265  [16064/50000]
loss: 1.728843  [32064/50000]
loss: 1.879978  [48064/50000]
Accuracy on test set: 42.5%, Avg loss: 1.601184 

Epoch 2
-------------------------------
loss: 1.594288  [   64/50000]
loss: 1.475687  [16064/50000]
loss: 1.551362  [32064/50000]
loss: 1.666932  [48064/50000]
Accuracy on test set: 50.0%, Avg loss: 1.377274 

Epoch 3
-------------------------------
loss: 1.318630  [   64/50000]
loss: 1.408956  [16064/50000]
loss: 1.483812  [32064/50000]
loss: 1.632436  [48064/50000]
Accuracy on test set: 52.7%, Avg loss: 1.308082 

Epochs completed in 37.01607370376587 seconds.
No description has been provided for this image