# Run these once per machine (comment out after first successful run):
# install.packages(c("torch","torchvision","luz","yardstick"))
# torch::install_torch() # downloads libtorch/lantern for your OS
The aim of this lab is to acquire basic familiarity with deep learning.
By the end of this lab session, students should be able to:
- Plot greyscale images in R
- Arrange several plots on a grid
- Train artificial neural networks in R/torch
- Set training controls and hyperparameters for neural networks: the optimizer, regularizer, loss function, activation functions, number of layers, and layer dimensions
Please run the R chunks one by one, look at the output, and make sure that you understand how it is produced. Some questions require a short answer - type it right in this document; others require modifying R code - modify it here. In either case, you can discuss your work with the lab instructor.
We will load the Fashion-MNIST dataset from the torchvision library. The original raw dataset can be found here: https://github.com/zalandoresearch/fashion-mnist
It consists of 60000 training and 10000 test grayscale images, each labelled with one of the following classes:
0 - T-shirt/top
1 - Trouser
2 - Pullover
3 - Dress
4 - Coat
5 - Sandal
6 - Shirt
7 - Sneaker
8 - Bag
9 - Ankle boot
library(tidyverse)
library(patchwork) # For multiple plots on one page
library(janitor) # Some useful helper functions
library(caret)
library(matrixStats)
### The following three libraries enable deep learning:
library(torch)
library(torchvision)
library(luz)
set.seed(199)
class_names <- c(
"T-shirt/top","Trouser","Pullover","Dress","Coat",
"Sandal","Shirt","Sneaker","Bag","Ankle boot"
)
# torchvision's fashion_mnist dataset
transform <- function(x) {
# x arrives as a base R 28x28 matrix with values in 0..255
x <- torch_tensor(x, dtype = torch_float()) # convert to torch tensor
x <- x$unsqueeze(1)$div(255) # make it 1x28x28 and scale to [0,1]
x
}
# create the datasets, applying the transform defined above
train_ds <- fashion_mnist_dataset(
root = "./data", train = TRUE, download = TRUE, transform = transform
)
test_ds <- fashion_mnist_dataset(
root = "./data", train = FALSE, download = TRUE, transform = transform
)
Here is a sample of how the first image is represented in the raw data:
s1 <- train_ds[1]
cat("The dimensions of the tensor are", s1$x$size(), "\n")
## The dimensions of the tensor are 1 28 28
cat("Here is a sample of the tensor:\n")
## Here is a sample of the tensor:
s1$x[1, 10:15, 10:15]
## torch_tensor
## 0.0000 0.0000 0.0000 0.0000 0.7176 0.8824
## 0.0000 0.0000 0.0000 0.0000 0.7569 0.8941
## 0.0039 0.0118 0.0000 0.0471 0.8588 0.8627
## 0.0000 0.0235 0.0000 0.3882 0.9569 0.8706
## 0.0157 0.0000 0.0000 0.2157 0.9255 0.8941
## 0.0000 0.0000 0.0000 0.9294 0.8863 0.8510
## [ CPUFloatType{6,6} ]
cat("The numeric label is", s1$y, "\n")
## The numeric label is 10
cat("The class is", train_ds$classes[s1$y])
## The class is Ankle boot
(Note that R's torch uses 1-based indexing, so the raw labels 0-9 appear here as 1-10; label 10 is therefore "Ankle boot".)
The dataset is perfectly balanced:
tabyl(train_ds$targets)
Below is the actual image of the first training example:
plot_fashion_mnist_image <- function(ds, i, title = NULL) {
img <- as_array(ds[i]$x$squeeze()) # 28x28 numeric [0,1]
df <- as.data.frame(img)
colnames(df) <- seq_len(ncol(df))
df$y <- seq_len(nrow(df))
df <- tidyr::pivot_longer(df, -y, names_to = "x", values_to = "value")
df$x <- as.integer(df$x)
p <- ggplot(df, aes(x = x, y = y, fill = value)) +
geom_tile() +
scale_fill_gradient(low = "white", high = "black", na.value = NA) +
scale_y_reverse() +
theme_minimal() +
theme(panel.grid = element_blank(),
axis.text = element_blank(),
axis.title = element_blank(),
legend.position = "none") +
coord_equal()
if (!is.null(title)) p <- p + ggtitle(title)
p
}
plot_fashion_mnist_image(train_ds, 1)
And here is how we can plot several images on the same plot:
1:25 %>%
map(~plot_fashion_mnist_image(train_ds, .)) %>%
wrap_plots(nrow = 5, ncol = 5)
Change the plotting code so that each image's title shows its class label. Plot the first 16 images with the class labels.
Hint: train_ds$classes are class names and
train_ds$targets are numeric labels.
# Write your code here
1:16 %>%
map(~plot_fashion_mnist_image(
train_ds, .,
title = train_ds$classes[train_ds$targets[.]])) %>%
wrap_plots(nrow = 4, ncol = 4)
Here, we will train a random forest to get some idea of how a familiar model predicts the labels. To do this, we need to convert the training and test datasets from torch tensors (multidimensional arrays) to a familiar tibble or data.frame.
Note that the original tensor has dimensions \(1\times N\times 28\times 28\):
train_ds[1:5]$x %>% dim()
## [1] 1 5 28 28
We can just get rid of the first one with the squeeze() method:
train_ds[1:5]$x$squeeze(1) %>% dim()
## [1] 5 28 28
And we can collapse the \(28\times
28\) table into one long vector of independent variables with the
flatten() method:
train_ds[1:5]$x$squeeze(1)$flatten(start_dim = 2) %>% dim()
## [1] 5 784
Note also that the value of the response variable is contained in the y element:
train_ds[1:5]$y
## [1] 10 1 1 4 1
and classes can be found by subsetting:
train_ds$classes[train_ds[1:5]$y]
## [1] "Ankle boot" "T-shirt/top" "T-shirt/top" "Dress" "T-shirt/top"
Now we will write a helper function to convert such a dataset to a data frame:
torch_to_tibble <- function(torch_data) {
  n <- length(torch_data)
  # look up the class names for all numeric labels
  y <- torch_data$classes[torch_data[1:n]$y]
  # drop the channel dimension and flatten each 28x28 image into 784 columns
  torch_data[1:n]$x$squeeze(1)$flatten(start_dim = 2) %>%
    as_array() %>%
    as_tibble() %>%
    mutate(y = as_factor(y))
}
train_df <- train_ds %>% torch_to_tibble()
test_df <- test_ds %>% torch_to_tibble()
Now we will train a random forest. This may take quite a while, so
feel free to change the grid to, say, just mtry = 28 and
min.node.size = 50.
rf_ctrl <- trainControl(
method = "oob",
verboseIter = TRUE
)
rf_grid <- expand.grid(
mtry = c(16, 28, 48),
splitrule = "gini",
min.node.size = c(5, 20, 50)
)
mod_rf <- train(
y ~ .,
data = train_df,
method = "ranger",
trControl = rf_ctrl,
tuneGrid = rf_grid,
num.trees = 100,
importance = "impurity",
metric = "Accuracy"
)
## + : mtry=16, splitrule=gini, min.node.size= 5
## - : mtry=16, splitrule=gini, min.node.size= 5
## + : mtry=28, splitrule=gini, min.node.size= 5
## - : mtry=28, splitrule=gini, min.node.size= 5
## + : mtry=48, splitrule=gini, min.node.size= 5
## - : mtry=48, splitrule=gini, min.node.size= 5
## + : mtry=16, splitrule=gini, min.node.size=20
## - : mtry=16, splitrule=gini, min.node.size=20
## + : mtry=28, splitrule=gini, min.node.size=20
## - : mtry=28, splitrule=gini, min.node.size=20
## + : mtry=48, splitrule=gini, min.node.size=20
## - : mtry=48, splitrule=gini, min.node.size=20
## + : mtry=16, splitrule=gini, min.node.size=50
## - : mtry=16, splitrule=gini, min.node.size=50
## + : mtry=28, splitrule=gini, min.node.size=50
## - : mtry=28, splitrule=gini, min.node.size=50
## + : mtry=48, splitrule=gini, min.node.size=50
## - : mtry=48, splitrule=gini, min.node.size=50
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 48, splitrule = gini, min.node.size = 5 on full training set
mod_rf
## Random Forest
##
## 60000 samples
## 784 predictor
## 10 classes: 'Ankle boot', 'T-shirt/top', 'Dress', 'Pullover', 'Sneaker', 'Sandal', 'Trouser', 'Shirt', 'Coat', 'Bag'
##
## No pre-processing
## Resampling results across tuning parameters:
##
## mtry min.node.size Accuracy Kappa
## 16 5 0.8780333 0.8644815
## 16 20 0.8751333 0.8612593
## 16 50 0.8709667 0.8566296
## 28 5 0.8797333 0.8663704
## 28 20 0.8764333 0.8627037
## 28 50 0.8739833 0.8599815
## 48 5 0.8807167 0.8674630
## 48 20 0.8784333 0.8649259
## 48 50 0.8766000 0.8628889
##
## Tuning parameter 'splitrule' was held constant at a value of gini
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 48, splitrule = gini
## and min.node.size = 5.
Which classes do you think are the hardest for the model to predict? Which ones will it confuse the most? Think about it and then print the test confusion matrix.
Answer: The model should have difficulty distinguishing items that look similar. It will probably have a hard time telling apart a shirt, a t-shirt, and a pullover; maybe also ankle boots vs. sandals with high heels.
Looking at the confusion matrix, we see that, indeed, “shirt” is the hardest label to predict, but distinguishing sandals from high-heeled boots doesn't seem to be hard - probably the dataset does not have many shoes with high heels.
mod_rf %>%
predict(test_df, type = "raw") %>%
confusionMatrix(test_df$y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Ankle boot Pullover Trouser Shirt Coat Sandal Sneaker Dress Bag
## Ankle boot 949 0 0 0 0 8 35 1 0
## Pullover 0 801 3 120 89 0 0 11 5
## Trouser 0 0 959 1 1 0 0 2 2
## Shirt 1 55 8 608 57 0 0 26 5
## Coat 0 118 3 82 812 0 0 32 4
## Sandal 7 0 0 1 1 964 11 0 2
## Sneaker 41 0 0 0 0 25 954 0 4
## Dress 0 11 23 28 38 1 0 906 2
## Bag 2 3 1 17 2 2 0 2 976
## T-shirt/top 0 12 3 143 0 0 0 20 0
## Reference
## Prediction T-shirt/top
## Ankle boot 0
## Pullover 13
## Trouser 0
## Shirt 85
## Coat 3
## Sandal 1
## Sneaker 0
## Dress 26
## Bag 14
## T-shirt/top 858
##
## Overall Statistics
##
## Accuracy : 0.8787
## 95% CI : (0.8721, 0.885)
## No Information Rate : 0.1
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8652
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Ankle boot Class: Pullover Class: Trouser
## Sensitivity 0.9490 0.8010 0.9590
## Specificity 0.9951 0.9732 0.9993
## Pos Pred Value 0.9557 0.7687 0.9938
## Neg Pred Value 0.9943 0.9778 0.9955
## Prevalence 0.1000 0.1000 0.1000
## Detection Rate 0.0949 0.0801 0.0959
## Detection Prevalence 0.0993 0.1042 0.0965
## Balanced Accuracy 0.9721 0.8871 0.9792
## Class: Shirt Class: Coat Class: Sandal Class: Sneaker
## Sensitivity 0.6080 0.8120 0.9640 0.9540
## Specificity 0.9737 0.9731 0.9974 0.9922
## Pos Pred Value 0.7195 0.7704 0.9767 0.9316
## Neg Pred Value 0.9572 0.9790 0.9960 0.9949
## Prevalence 0.1000 0.1000 0.1000 0.1000
## Detection Rate 0.0608 0.0812 0.0964 0.0954
## Detection Prevalence 0.0845 0.1054 0.0987 0.1024
## Balanced Accuracy 0.7908 0.8926 0.9807 0.9731
## Class: Dress Class: Bag Class: T-shirt/top
## Sensitivity 0.9060 0.9760 0.8580
## Specificity 0.9857 0.9952 0.9802
## Pos Pred Value 0.8754 0.9578 0.8282
## Neg Pred Value 0.9895 0.9973 0.9842
## Prevalence 0.1000 0.1000 0.1000
## Detection Rate 0.0906 0.0976 0.0858
## Detection Prevalence 0.1035 0.1019 0.1036
## Balanced Accuracy 0.9458 0.9856 0.9191
To train a neural network, we prepare the data as mini-batches. Mini-batch gradient descent updates the model after each batch rather than after every single example, which makes training much faster and more stable. We also shuffle the training data at the start of each epoch so that the model does not learn spurious patterns from the fixed order of the data.
batch_size <- 128
train_dl <- dataloader(train_ds, batch_size = batch_size, shuffle = TRUE)
test_dl <- dataloader(test_ds, batch_size = batch_size)
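If you want to see what a mini-batch actually looks like, you can pull one batch from the dataloader and inspect its shape (an optional sanity check, not required for the lab):
it <- dataloader_make_iter(train_dl)
b <- dataloader_next(it)
b$x$size() # expect: 128 1 28 28 (batch, channel, height, width)
b$y$size() # expect: 128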
We will now build a very simple network with one hidden layer. Note that the first layer just transforms a \(28\times 28\) matrix representing an image to a \(1\times 784\) vector, i.e., it does the same transformation as we did manually for training a random forest.
The activation function for the hidden layer is relu.
Note that we do not add a softmax at the end. This is because the
cross-entropy loss function in torch expects raw outputs
(“logits”) and applies softmax internally.
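As a small sanity check of this claim (with made-up logits, not lab data), the cross-entropy loss applied to raw outputs equals the negative log of the softmax probability of the true class:
logits <- torch_tensor(matrix(c(2, 0.5, -1), nrow = 1))
target <- torch_tensor(1L, dtype = torch_long()) # the true class is class 1
nnf_cross_entropy(logits, target)                # loss computed from raw logits
-torch_log(nnf_softmax(logits, dim = 2))[1, 1]   # the same value by hand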
mlp <- nn_module(
initialize = function() {
self$flatten <- nn_flatten()
self$fc1 <- nn_linear(28*28, 128)
self$fc2 <- nn_linear(128, 10)
},
forward = function(x) {
x %>%
self$flatten() %>%
self$fc1() %>% nnf_relu() %>%
self$fc2()
}
)
How many parameters does our model have? First, find the answer based
on your knowledge of lecture material and then verify it by running the
command mlp().
Answer: The input has \(28\times 28=784\) units, i.e., the layer dimensions are \[ n_1=784,\quad n_2=128,\quad n_3=10. \] Thus the total number of parameters is \[ (784+1)\times 128 + (128+1)\times 10=101770. \]
And this is correct:
mlp()
## An `nn_module` containing 101,770 parameters.
##
## ── Modules ─────────────────────────────────────────────────────────────────────
## • flatten: <nn_flatten> #0 parameters
## • fc1: <nn_linear> #100,480 parameters
## • fc2: <nn_linear> #1,290 parameters
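The same count can also be obtained programmatically by summing the number of elements over all parameter tensors (a one-line sketch using torch's parameters list and the numel() method):
sum(sapply(mlp()$parameters, function(p) p$numel())) # expect 101770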
The next step is to use luz to define the settings that control training and to run gradient descent. This is done in one chain of pipes. Note that
setup() defines the loss function, the metric, and the optimizer (the method used to update the weights, which can speed up the convergence of gradient descent). We will use the Adam optimizer, but you are encouraged to try others.
set_hparams() defines model hyperparameters, such as layer dimensions. We don't need any here, so we leave it empty.
set_opt_hparams() defines optimizer hyperparameters, such as the learning rate and weight decay (for regularized models). We will set the learning rate to \(0.001\), which is the default for the Adam optimizer, but you are encouraged to experiment.
fit() trains the model, i.e., runs gradient descent.
Here, we do it for 5 epochs (i.e., 5 runs through the whole training
dataset). We use the test data for validation, which is generally not
recommended - here we do it for simplicity.
torch_manual_seed(199)
mlp_fit <- mlp %>%
setup(
loss = nn_cross_entropy_loss(),
metrics = list(luz_metric_accuracy()),
optimizer = optim_adam
) %>%
set_hparams() %>%
set_opt_hparams(lr = 1e-3) %>%
fit(
train_dl,
epochs = 5,
valid_data = test_dl,
verbose = TRUE
)
mlp_fit
## A `luz_module_fitted`
## ── Time ────────────────────────────────────────────────────────────────────────
## • Total time: 42.1s
## • Avg time per training epoch: 7.4s
##
## ── Results ─────────────────────────────────────────────────────────────────────
## Metrics observed in the last epoch.
##
## ℹ Training:
## loss: 0.3286
## acc: 0.8817
##
## ── Model ───────────────────────────────────────────────────────────────────────
## An `nn_module` containing 101,770 parameters.
##
## ── Modules ─────────────────────────────────────────────────────────────────────
## • flatten: <nn_flatten> #0 parameters
## • fc1: <nn_linear> #100,480 parameters
## • fc2: <nn_linear> #1,290 parameters
The function predict has a method for torch / luz models, but its output is the raw matrix of scores (“logits”) that comes out of the model, i.e., it's just a bunch of numbers.
test_preds <- predict(mlp_fit, test_ds)
cat("The output of predict looks like this:\n")
## The output of predict looks like this:
test_preds[1:5, 1:5]
## torch_tensor
## -7.1110 -10.8535 -7.2900 -7.4961 -8.3498
## -2.0691 -11.6464 7.8082 -6.7344 2.3714
## -0.6569 10.6232 -4.7768 -1.6812 -2.3333
## -2.9600 9.8252 -4.8348 0.9131 -3.2399
## 2.0330 -5.2376 1.0788 -0.8532 -0.2802
## [ MPSFloatType{5,5} ]
cat("Its dimensions are", dim(test_preds), "\n")
## Its dimensions are 10000 10
In order to convert it to estimated probabilities of the classes, we need to apply softmax. Here are the estimated probabilities for the first five test examples, rounded to 4 decimal places for convenience (note that we need to convert to an array before rounding because round() doesn't work with torch objects):
test_probs <- nnf_softmax(test_preds, dim = 2)
test_probs[1:5 , ] %>% as_array() %>% round(4)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] 0.0000 0.0000 0.0000 0.0000 0.0000 0.0683 0.0000 0.0642 7e-04 0.8668
## [2,] 0.0001 0.0000 0.9878 0.0000 0.0043 0.0000 0.0078 0.0000 0e+00 0.0000
## [3,] 0.0000 1.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0e+00 0.0000
## [4,] 0.0000 0.9999 0.0000 0.0001 0.0000 0.0000 0.0000 0.0000 0e+00 0.0000
## [5,] 0.0818 0.0001 0.0315 0.0046 0.0081 0.0000 0.8699 0.0000 4e-03 0.0000
To predict labels, we need to find the position of the maximum number in each row:
pred_test_labels <- test_probs %>% as_array() %>% max.col()
head(pred_test_labels)
## [1] 10 3 2 2 7 2
And to get predicted classes, we use numeric labels to index the pre-defined vector of class names:
pred_test_classes <- as_factor(class_names)[pred_test_labels]
head(pred_test_classes)
## [1] Ankle boot Pullover Trouser Trouser Shirt Trouser
## 10 Levels: T-shirt/top Trouser Pullover Dress Coat Sandal Shirt Sneaker ... Ankle boot
Finally, here is the confusion matrix. (Note that confusionMatrix() expects the predictions as its first argument and the reference as its second; here the truth is passed first, so the rows labelled “Prediction” actually show the true classes.)
confusionMatrix(test_df$y, pred_test_classes)
## Confusion Matrix and Statistics
##
## Reference
## Prediction T-shirt/top Trouser Pullover Dress Coat Sandal Shirt Sneaker Bag
## T-shirt/top 719 4 7 25 4 0 230 1 10
## Trouser 1 976 0 16 3 0 2 0 2
## Pullover 13 5 734 11 126 0 110 0 1
## Dress 22 18 6 860 39 0 50 0 5
## Coat 0 1 73 26 811 0 84 0 5
## Sandal 0 0 0 1 0 950 0 30 2
## Shirt 66 3 70 28 65 0 755 0 13
## Sneaker 0 0 0 0 0 25 0 943 0
## Bag 4 1 2 3 7 5 10 5 963
## Ankle boot 0 0 0 1 0 5 1 36 0
## Reference
## Prediction Ankle boot
## T-shirt/top 0
## Trouser 0
## Pullover 0
## Dress 0
## Coat 0
## Sandal 17
## Shirt 0
## Sneaker 32
## Bag 0
## Ankle boot 957
##
## Overall Statistics
##
## Accuracy : 0.8668
## 95% CI : (0.86, 0.8734)
## No Information Rate : 0.1242
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.852
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: T-shirt/top Class: Trouser Class: Pullover
## Sensitivity 0.8715 0.9683 0.8229
## Specificity 0.9694 0.9973 0.9708
## Pos Pred Value 0.7190 0.9760 0.7340
## Neg Pred Value 0.9882 0.9964 0.9824
## Prevalence 0.0825 0.1008 0.0892
## Detection Rate 0.0719 0.0976 0.0734
## Detection Prevalence 0.1000 0.1000 0.1000
## Balanced Accuracy 0.9204 0.9828 0.8968
## Class: Dress Class: Coat Class: Sandal Class: Shirt
## Sensitivity 0.8857 0.7687 0.9645 0.6079
## Specificity 0.9845 0.9789 0.9945 0.9720
## Pos Pred Value 0.8600 0.8110 0.9500 0.7550
## Neg Pred Value 0.9877 0.9729 0.9961 0.9459
## Prevalence 0.0971 0.1055 0.0985 0.1242
## Detection Rate 0.0860 0.0811 0.0950 0.0755
## Detection Prevalence 0.1000 0.1000 0.1000 0.1000
## Balanced Accuracy 0.9351 0.8738 0.9795 0.7900
## Class: Sneaker Class: Bag Class: Ankle boot
## Sensitivity 0.9291 0.9620 0.9513
## Specificity 0.9937 0.9959 0.9952
## Pos Pred Value 0.9430 0.9630 0.9570
## Neg Pred Value 0.9920 0.9958 0.9946
## Prevalence 0.1015 0.1001 0.1006
## Detection Rate 0.0943 0.0963 0.0957
## Detection Prevalence 0.1000 0.1000 0.1000
## Balanced Accuracy 0.9614 0.9790 0.9733
Now let’s put everything together into a few convenience functions:
multiclass_prediction <- function(torch_model,
torch_data = test_ds,
predefined_classes = torch_data$classes) {
# Here we extract the vector of predicted labels
# out of the model and the dataset
pred_probs <- torch_model %>%
predict(torch_data) %>%
nnf_softmax(dim = 2) %>%
as_array()
pred_test_labels <- pred_probs %>% max.col()
as_factor(predefined_classes)[pred_test_labels]
}
torch_acc <- function(torch_model,
torch_data = test_ds,
predefined_classes = torch_data$classes) {
# Here we compute the overall accuracy.
# Begin by running the function defined above.
preds <- multiclass_prediction(torch_model, torch_data)
actual_y <- as_factor(predefined_classes)[torch_data$targets]
mean(preds == actual_y)
}
torch_conf_mat <- function(torch_model,
torch_data = test_ds,
predefined_classes = torch_data$classes) {
# Here we compute the whole confusion matrix.
# Again, begin by running the function defined above.
preds <- multiclass_prediction(torch_model, torch_data)
actual_y <- as_factor(predefined_classes)[torch_data$targets]
# NB: confusionMatrix(data, reference) expects predictions first;
# passing actual_y first swaps the roles of rows and columns.
confusionMatrix(actual_y, preds)
}
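A quick usage check for the accuracy helper (the result should match the overall accuracy in the confusion matrix below):
torch_acc(mlp_fit) # expect about 0.8668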
torch_conf_mat(mlp_fit)
## Confusion Matrix and Statistics
##
## Reference
## Prediction T-shirt/top Trouser Pullover Dress Coat Sandal Shirt Sneaker Bag
## T-shirt/top 719 4 7 25 4 0 230 1 10
## Trouser 1 976 0 16 3 0 2 0 2
## Pullover 13 5 734 11 126 0 110 0 1
## Dress 22 18 6 860 39 0 50 0 5
## Coat 0 1 73 26 811 0 84 0 5
## Sandal 0 0 0 1 0 950 0 30 2
## Shirt 66 3 70 28 65 0 755 0 13
## Sneaker 0 0 0 0 0 25 0 943 0
## Bag 4 1 2 3 7 5 10 5 963
## Ankle boot 0 0 0 1 0 5 1 36 0
## Reference
## Prediction Ankle boot
## T-shirt/top 0
## Trouser 0
## Pullover 0
## Dress 0
## Coat 0
## Sandal 17
## Shirt 0
## Sneaker 32
## Bag 0
## Ankle boot 957
##
## Overall Statistics
##
## Accuracy : 0.8668
## 95% CI : (0.86, 0.8734)
## No Information Rate : 0.1242
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.852
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: T-shirt/top Class: Trouser Class: Pullover
## Sensitivity 0.8715 0.9683 0.8229
## Specificity 0.9694 0.9973 0.9708
## Pos Pred Value 0.7190 0.9760 0.7340
## Neg Pred Value 0.9882 0.9964 0.9824
## Prevalence 0.0825 0.1008 0.0892
## Detection Rate 0.0719 0.0976 0.0734
## Detection Prevalence 0.1000 0.1000 0.1000
## Balanced Accuracy 0.9204 0.9828 0.8968
## Class: Dress Class: Coat Class: Sandal Class: Shirt
## Sensitivity 0.8857 0.7687 0.9645 0.6079
## Specificity 0.9845 0.9789 0.9945 0.9720
## Pos Pred Value 0.8600 0.8110 0.9500 0.7550
## Neg Pred Value 0.9877 0.9729 0.9961 0.9459
## Prevalence 0.0971 0.1055 0.0985 0.1242
## Detection Rate 0.0860 0.0811 0.0950 0.0755
## Detection Prevalence 0.1000 0.1000 0.1000 0.1000
## Balanced Accuracy 0.9351 0.8738 0.9795 0.7900
## Class: Sneaker Class: Bag Class: Ankle boot
## Sensitivity 0.9291 0.9620 0.9513
## Specificity 0.9937 0.9959 0.9952
## Pos Pred Value 0.9430 0.9630 0.9570
## Neg Pred Value 0.9920 0.9958 0.9946
## Prevalence 0.1015 0.1001 0.1006
## Detection Rate 0.0943 0.0963 0.0957
## Detection Prevalence 0.1000 0.1000 0.1000
## Balanced Accuracy 0.9614 0.9790 0.9733
Find the image for which the model is least confident in its prediction. Did it predict that image's class correctly?
Hint: You will need the function rowMaxs() from the matrixStats library.
Answer: To do this, we need to get the estimated probabilities of the predicted classes, i.e., the maximum number in every row of the matrix of estimated class probabilities, and then find the minimum of all those maxima.
pred_test_probs <- test_probs %>% as_array() %>% rowMaxs()
least_conf_case <- which.min(pred_test_probs)
least_conf_pred <- as.character(pred_test_classes[least_conf_case])
cat("The model is least confident on test example", least_conf_case,"\n")
## The model is least confident on test example 9860
cat("The confidence is", pred_test_probs[least_conf_case], "\n")
## The confidence is 0.1719416
cat("The model predicted", least_conf_pred,"\n")
## The model predicted Bag
cat("The actual label is",
test_ds$classes[test_ds$targets[least_conf_case]],
"\n")
## The actual label is Shirt
cat("The prediction is",
ifelse(least_conf_pred == test_df$y[least_conf_case],
"correct.\n", "incorrect.\n"))
## The prediction is incorrect.
plot_fashion_mnist_image(test_ds, least_conf_case)
Here, we will train a bigger model. To reduce overfitting, we will regularize it with an \(l_2\) penalty applied to all trainable parameters and with a dropout layer.
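For reference, the \(l_2\) penalty enters through the optimizer's weight_decay argument: schematically, the objective being minimized becomes \[ \mathcal{L}_{\text{reg}}(w) = \mathcal{L}(w) + \lambda \sum_j w_j^2, \] where \(\lambda\) corresponds (up to a constant factor) to the weight_decay value passed to set_opt_hparams() below.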
mlp_reg <- nn_module(
initialize = function() {
self$flatten <- nn_flatten()
self$fc1 <- nn_linear(28*28, 500)
self$fc2 <- nn_linear(500, 300)
self$fc3 <- nn_linear(300, 200)
self$fc4 <- nn_linear(200, 100)
self$out <- nn_linear(100, 10)
self$drop <- nn_dropout(p = 0.05)
},
forward = function(x) {
x <- self$flatten(x)
x <- nnf_relu(self$fc1(x))
x <- nnf_relu(self$fc2(x))
x <- self$drop(x)
x <- nnf_relu(self$fc3(x))
x <- nnf_relu(self$fc4(x))
self$out(x)
}
)
mlp_reg_fit <- mlp_reg %>%
setup(
loss = nn_cross_entropy_loss(),
metrics = list(luz_metric_accuracy()),
optimizer = optim_adam
) %>%
set_opt_hparams(lr = 0.001, weight_decay = 0.0005) %>%
fit(train_dl, epochs = 10, valid_data = test_dl, verbose = TRUE)
cat("Test accuracy of regularized model =",
torch_acc(mlp_reg_fit))
## Test accuracy of regularized model = 0.8733
luz records the values of the loss and the metrics after every epoch throughout training, so we can plot how the loss and accuracy change.
Note: accuracy is available only because we added it as a metric when calling setup(). Otherwise only the loss curves could be plotted.
# extract per-epoch losses
train_loss <- map_dbl(mlp_reg_fit$records$metrics$train, ~ .x$loss)
valid_loss <- map_dbl(mlp_reg_fit$records$metrics$valid, ~ .x$loss)
# extract per-epoch accuracy
train_acc <- map_dbl(mlp_reg_fit$records$metrics$train, ~ .x$acc)
valid_acc <- map_dbl(mlp_reg_fit$records$metrics$valid, ~ .x$acc)
log_df <- tibble(
epoch = seq_along(train_loss),
train_loss = train_loss,
valid_loss = valid_loss,
train_acc = train_acc,
valid_acc = valid_acc
)
plot_loss <- log_df %>%
pivot_longer(cols = ends_with("loss"), names_to = "Metric") %>%
ggplot(aes(x = epoch, y = value, color = Metric)) +
geom_line() + geom_point() + theme_minimal() + ggtitle("Loss")
plot_acc <- log_df %>%
pivot_longer(cols = ends_with("acc"), names_to = "Metric") %>%
ggplot(aes(x = epoch, y = value, color = Metric)) +
geom_line() + geom_point() + theme_minimal() + ggtitle("Accuracy")
plot_loss / plot_acc
Note that the operator / for plots comes from the R
library patchwork. It provides a convenient way to arrange
a few plots on the same canvas.
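For example, here is a minimal patchwork sketch with two throwaway plots (p1 and p2 are made up for illustration): | places plots side by side, while / stacks them.
p1 <- ggplot(mtcars, aes(wt, mpg)) + geom_point()
p2 <- ggplot(mtcars, aes(hp, mpg)) + geom_point()
(p1 | p2) / p1 # two plots on top, one below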
Convolutional neural networks dominated image recognition until very recently (now it is common to use general-purpose large language models). Convolutional networks are not included in midterm test 2 - this material is here just for your information.
Below we define one of the first convolutional neural networks called LeNet-5, developed by Yann LeCun in 1998. Note that activation in hidden layers is \(\tanh\) — relu was not in use yet.
lenet <- nn_module(
initialize = function() {
self$conv1 <- nn_conv2d(in_channels = 1, out_channels = 20, kernel_size = 5)
self$conv2 <- nn_conv2d(in_channels = 20, out_channels = 50, kernel_size = 5)
self$drop <- nn_dropout(p = 0.3)
# After two conv+pool layers on 28x28 with valid padding: feature maps are 4x4
self$fc1 <- nn_linear(50*4*4, 500)
self$out <- nn_linear(500, 10)
},
forward = function(x) {
x <- torch_tanh(self$conv1(x)) %>% nnf_max_pool2d(kernel_size = 2, stride = 2)
x <- torch_tanh(self$conv2(x)) %>% nnf_max_pool2d(kernel_size = 2, stride = 2)
x <- self$drop(x)
x <- x$flatten(start_dim = 2)
x <- torch_tanh(self$fc1(x))
x <- self$drop(x)
self$out(x)
}
)
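We can verify the shape comment inside the module with a quick forward pass through the two conv/pool stages (an optional check on random input): each 5x5 convolution without padding shrinks the side by 4 and each pooling halves it, so 28 -> 24 -> 12 -> 8 -> 4.
net <- lenet()
x <- torch_randn(1, 1, 28, 28) # a random "image" just to trace shapes
h <- nnf_max_pool2d(torch_tanh(net$conv1(x)), kernel_size = 2, stride = 2)
h <- nnf_max_pool2d(torch_tanh(net$conv2(h)), kernel_size = 2, stride = 2)
h$size() # expect: 1 50 4 4, so the flattened size is 50*4*4 = 800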
lenet_fit <- lenet %>%
setup(
loss = nn_cross_entropy_loss(),
optimizer = optim_adam
) %>%
set_opt_hparams(lr = 1e-3) %>%
fit(train_dl, epochs = 10, valid_data = test_dl, verbose = TRUE)
lenet_fit
## A `luz_module_fitted`
## ── Time ────────────────────────────────────────────────────────────────────────
## • Total time: 2m 27.4s
## • Avg time per training epoch: 13.5s
##
## ── Results ─────────────────────────────────────────────────────────────────────
## Metrics observed in the last epoch.
##
## ℹ Training:
## loss: 0.2367
##
## ── Model ───────────────────────────────────────────────────────────────────────
## An `nn_module` containing 431,080 parameters.
##
## ── Modules ─────────────────────────────────────────────────────────────────────
## • conv1: <nn_conv2d> #520 parameters
## • conv2: <nn_conv2d> #25,050 parameters
## • drop: <nn_dropout> #0 parameters
## • fc1: <nn_linear> #400,500 parameters
## • out: <nn_linear> #5,010 parameters
Remark: why people often use assignments instead of a single pipe chain:
Debugging / inspection. With assignments (x <- ...), you can stop after any line, print x$size(), and check shapes step by step. With one long pipe chain, this is harder.
Readability. Deep models often get long, with branches, skips, and concatenations. Breaking the code into separate lines makes it clearer.
Consistency with PyTorch style. Python/PyTorch has no pipe operator, so assignment-style code translates more directly.
Now let us evaluate the CNN:
torch_acc(lenet_fit)
## [1] 0.902
Replace tanh with relu and Adam with
RMSprop. Does it help?
lenet_relu <- nn_module(
initialize = function() {
self$conv1 <- nn_conv2d(1, 20, 5)
self$conv2 <- nn_conv2d(20, 50, 5)
self$drop <- nn_dropout(p = 0.3)
self$fc1 <- nn_linear(50*4*4, 500)
self$out <- nn_linear(500, 10)
},
forward = function(x) {
x <- nnf_relu(self$conv1(x)) %>% nnf_max_pool2d(2,2)
x <- nnf_relu(self$conv2(x)) %>% nnf_max_pool2d(2,2)
x <- self$drop(x)
x <- x$flatten(start_dim = 2)
x <- nnf_relu(self$fc1(x))
x <- self$drop(x)
self$out(x)
}
)
lenet_relu_fit <- lenet_relu %>%
setup(
loss = nn_cross_entropy_loss(),
optimizer = optim_rmsprop
) %>%
set_opt_hparams(lr = 1e-3) %>%
fit(train_dl, epochs = 10, valid_data = test_dl, verbose = TRUE)
torch_acc(lenet_relu_fit)
## [1] 0.9007
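Answer: Judging by the final test accuracy (0.9007 vs. 0.902 for the tanh/Adam version), the change makes essentially no difference on this dataset.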
Please fill in the short survey after Lab 7: https://forms.gle/gATMesEyRBmhrk4s5