In the Model Fitting and Recommendation Systems section, you will learn how i applying the machine learning algorithms.
After completing this section, you will be able to:
This section has three parts:
library(tidyverse)
Registered S3 method overwritten by 'dplyr':
method from
print.rowwise_df
[37m-- [1mAttaching packages[22m --------------------------------------- tidyverse 1.2.1 --[39m
[37m[32mv[37m [34mggplot2[37m 3.2.1 [32mv[37m [34mpurrr [37m 0.3.2
[32mv[37m [34mtibble [37m 2.1.3 [32mv[37m [34mdplyr [37m 0.8.3
[32mv[37m [34mtidyr [37m 0.8.3 [32mv[37m [34mstringr[37m 1.4.0
[32mv[37m [34mreadr [37m 1.3.1 [32mv[37m [34mforcats[37m 0.4.0[39m
[37m-- [1mConflicts[22m ------------------------------------------ tidyverse_conflicts() --
[31mx[37m [34mdplyr[37m::[32mfilter()[37m masks [34mstats[37m::filter()
[31mx[37m [34mdplyr[37m::[32mlag()[37m masks [34mstats[37m::lag()[39m
** Key points**
Code
library(dslabs)
mnist <- read_mnist()
names(mnist)
[1] "train" "test"
dim(mnist$train$images)
[1] 60000 784
class(mnist$train$labels)
[1] "integer"
table(mnist$train$labels)
0 1 2 3 4 5 6 7 8 9
5923 6742 5958 6131 5842 5421 5918 6265 5851 5949
# sample 10k rows from training set, 1k rows from test set
set.seed(123)
index <- sample(nrow(mnist$train$images), 10000)
x <- mnist$train$images[index,]
y <- factor(mnist$train$labels[index])
index <- sample(nrow(mnist$test$images), 1000)
x_test <- mnist$test$images[index,]
y_test <- factor(mnist$test$labels[index])
Key points
Code
library(matrixStats)
Attaching package: 㤼㸱matrixStats㤼㸲
The following object is masked from 㤼㸱package:dplyr㤼㸲:
count
sds <- colSds(x)
qplot(sds, bins = 256, color = I('black'))
library(caret)
Loading required package: lattice
Registered S3 method overwritten by 'data.table':
method from
print.data.table
Attaching package: 㤼㸱caret㤼㸲
The following object is masked from 㤼㸱package:purrr㤼㸲:
lift
nzv <- nearZeroVar(x)
image(matrix(1:784 %in% nzv, 28, 28))
col_index <- setdiff(1:ncol(x), nzv) # remove the colums with near-zero-variance
length(col_index)
[1] 249
Key points
Code
# !!! NB: Fatal Error R crach on call train(,..., "Rborist")
colnames(x) <- 1:ncol(mnist$train$images)
colnames(x_test) <- colnames(mnist$train$images)
### MNIST: test algo KNN
# take several minutes to run cross-validation for knn (dist between each point must be calculated)
#control <- trainControl(method = "cv", number = 10, p = .9)
#train_knn <- train(x[,col_index], y,
# method = "knn",
# tuneGrid = data.frame(k = c(1,3,5,7)),
# trControl = control)
#ggplot(train_knn)
# general good idea: test out code on a smaller subset to get an idea of timing
n <- 1000 # nb of rows subset to use
b <- 2 # nb of cross-valdation subset to use
index <- sample(nrow(x), n)
control <- trainControl(method = "cv", number = b, p = .9)
train_knn <- train(x[index ,col_index], y[index],
method = "knn",
tuneGrid = data.frame(k = c(3,5,7)),
trControl = control)
fit_knn <- knn3(x[ ,col_index], y, k = 5)
y_hat_knn <- predict(fit_knn,
x_test[, col_index],
type="class")
cm <- confusionMatrix(y_hat_knn, factor(y_test))
cm$overall["Accuracy"]
Accuracy
0.948
cm$byClass[,1:2]
Sensitivity Specificity
Class: 0 0.9907407 0.9966368
Class: 1 0.9910714 0.9909910
Class: 2 0.9375000 0.9966814
Class: 3 0.9166667 0.9943946
Class: 4 0.9882353 0.9945355
Class: 5 0.9148936 0.9900662
Class: 6 0.9439252 0.9910414
Class: 7 0.9595960 0.9944506
Class: 8 0.9072165 0.9977852
Class: 9 0.9255319 0.9955850
### MNIST: test algo Random Forest
# RF takes even larger time than kNN: 100 of trees to calculate
# use Rborist implementation of RF: less features, but faster
library(Rborist)
Rborist 0.1-17
Type RboristNews() to see new features/changes/bug fixes.
library(randomForest)
randomForest 4.6-14
Type rfNews() to see new features/changes/bug fixes.
Attaching package: 㤼㸱randomForest㤼㸲
The following object is masked from 㤼㸱package:dplyr㤼㸲:
combine
The following object is masked from 㤼㸱package:ggplot2㤼㸲:
margin
# reduce cross-validation to 5 for faster calculations: number = 5
control <- trainControl(method="cv", number = 5, p = 0.8)
# tuning parameters to optimize
grid <- expand.grid(minNode = c(1,5), # minimal node size
predFixed = c(10, 15, 25, 35, 50)) # nb of randomly selected predictors
grid2 <- data.frame(mtry = c(10, 15))
# NB !!! Below commented code crashes R: 'Rborist' BUG ?
train_rf <- train(x[, col_index],
y,
method = 'rf', # !!! BUG: ethod = 'Rborist',
ntree = 10, # nTree = 50, reduce nb of trees that are fit for faster calculations
trControl = control,
tuneGrid = grid2,
nodesize = 1,
nSamp = 5000) # take random subset of observations when constructing each tree
ggplot(train_rf)
train_rf$bestTune
# NB !!! Below commented code crashes R: 'Rborist' BUG ?
# fit_rf <- Rborist(x[, col_index],
# y,
# nTree = 1000,
# nodesize = 1, # Since R crashes above on train_rf : setting it manually minNode = train_rf$bestTune$minNode,
# predFixed = 10) # Since R crashes above on train_rf : setting it manually predFixed = train_rf$bestTune$predFixed)
fit_rf <- randomForest(x[, col_index],
y,
ntree = 1000, # ntree = 1000,
mtry = 10,
nodesize = 2, # nodesize = 1,
samplesize = 5000,
importance = TRUE)
y_hat_rf <- factor(levels(y)[predict(fit_rf, x_test[, col_index])]) #factor(levels(y)[predict(fit_rf, x_test[, col_index])$yPred])
cm <- confusionMatrix(y_hat_rf, y_test)
cm$overall["Accuracy"]
Accuracy
0.956
rafalib::mypar(3,4)
for(i in 1:12){
image(matrix(x_test[i,], 28, 28)[, 28:1],
main = paste("Our prediction:", y_hat_rf[i]),
xaxt="n", yaxt="n")
}
Key points
library(randomForest)
x <- mnist$train$images[index,]
y <- factor(mnist$train$labels[index])
rf <- randomForest(x, y, ntree = 50)
imp <- importance(rf)
# display the top variable importance
as.data.frame(imp) %>%
arrange(desc(MeanDecreaseGini)) %>%
top_n(10)
Selecting by MeanDecreaseGini
image(matrix(imp, 28, 28))
p_max <- predict(fit_knn, x_test[,col_index])
p_max <- apply(p_max, 1, max)
ind <- which(y_hat_knn != y_test)
ind <- ind[order(p_max[ind], decreasing = TRUE)]
rafalib::mypar(3,4)
for(i in ind[1:12]){
image(matrix(x_test[i,], 28, 28)[, 28:1],
main = paste0("Pr(",y_hat_knn[i],")=",
round(p_max[i], 2),
" but is a ", y_test[i]),
xaxt="n", yaxt="n")
}
p_max <- predict(fit_rf, x_test[,col_index], type='prob') # predict(fit_rf, x_test[,col_index])$census
p_max <- p_max / rowSums(p_max)
p_max <- apply(p_max, 1, max)
ind <- which(y_hat_rf != y_test)
ind <- ind[order(p_max[ind], decreasing = TRUE)]
rafalib::mypar(3,4)
for(i in ind[1:12]){
image(matrix(x_test[i,], 28, 28)[, 28:1],
main = paste0("Pr(",y_hat_rf[i],")=",round(p_max[i], 2),
" but is a ",y_test[i]),
xaxt="n", yaxt="n")
}
Key points * Ensembles combine multiple machine learning algorithms into one model to improve predictions. Important: here we combined two methods and got an improvement. In practice, one might ensemble dozens or even hundreds of different methods and get substantial improvement. Code
## proba of each digit by method = Random Forest
p_rf <- predict(fit_rf, x_test[,col_index], type='prob') # predict(fit_rf, x_test[,col_index])$census
p_rf <- p_rf / rowSums(p_rf)
## proba of each digit by method = Random Forest
p_knn <- predict(fit_knn, x_test[,col_index])
## proba combined of the two methods
p <- (p_rf + p_knn)/2
y_pred <- factor(apply(p, 1, which.max)-1)
confusionMatrix(y_pred, y_test)
Confusion Matrix and Statistics
Reference
Prediction 0 1 2 3 4 5 6 7 8 9
0 107 0 0 0 0 0 1 0 1 1
1 0 111 1 0 0 1 1 1 0 0
2 0 1 90 0 0 0 0 0 1 0
3 0 0 0 100 0 4 1 0 1 0
4 0 0 0 0 84 0 0 0 1 4
5 0 0 0 6 0 87 2 0 0 1
6 0 0 1 1 0 2 102 0 2 0
7 0 0 4 0 0 0 0 95 0 0
8 0 0 0 1 0 0 0 0 91 1
9 1 0 0 0 1 0 0 3 0 87
Overall Statistics
Accuracy : 0.954
95% CI : (0.9391, 0.9661)
No Information Rate : 0.112
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.9489
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5 Class: 6 Class: 7 Class: 8 Class: 9
Sensitivity 0.9907 0.9911 0.9375 0.9259 0.9882 0.9255 0.9533 0.9596 0.9381 0.9255
Specificity 0.9966 0.9955 0.9978 0.9933 0.9945 0.9901 0.9933 0.9956 0.9978 0.9945
Pos Pred Value 0.9727 0.9652 0.9783 0.9434 0.9438 0.9062 0.9444 0.9596 0.9785 0.9457
Neg Pred Value 0.9989 0.9989 0.9934 0.9911 0.9989 0.9923 0.9944 0.9956 0.9934 0.9923
Prevalence 0.1080 0.1120 0.0960 0.1080 0.0850 0.0940 0.1070 0.0990 0.0970 0.0940
Detection Rate 0.1070 0.1110 0.0900 0.1000 0.0840 0.0870 0.1020 0.0950 0.0910 0.0870
Detection Prevalence 0.1100 0.1150 0.0920 0.1060 0.0890 0.0960 0.1080 0.0990 0.0930 0.0920
Balanced Accuracy 0.9937 0.9933 0.9676 0.9596 0.9914 0.9578 0.9733 0.9776 0.9680 0.9600
Use the training set to build a model with several of the models available from the caret package. We will test out 10 of the most common machine learning models in this exercise:
models <- c("glm", "lda", "naive_bayes", "svmLinear", "knn", "gamLoess", "multinom", "qda", "rf", "adaboost")
Apply all of these models using train with all the default parameters. You may need to install some packages. Keep in mind that you will probably get some warnings. Also, it will probably take a while to train all of the models - be patient! Run the following code to train the various models:
library(caret)
library(dslabs)
set.seed(1, sample.kind = "Rounding") # in R 3.6 or later
non-uniform 'Rounding' sampler used
data("mnist_27")
fits <- lapply(models, function(model){
print(model)
train(y ~ ., method = model, data = mnist_27$train)
})
[1] "glm"
[1] "lda"
[1] "naive_bayes"
[1] "svmLinear"
[1] "knn"
[1] "gamLoess"
non-list contrasts argument ignoredeval 0.46667upperlimit 0.41586extrapolation not allowed with blendingeval 0.4375upperlimit 0.41586extrapolation not allowed with blendingeval 0.089286lowerlimit 0.10703extrapolation not allowed with blendingeval 0.094737lowerlimit 0.10703extrapolation not allowed with blendingnon-list contrasts argument ignorednon-list contrasts argument ignorednon-list contrasts argument ignoredeval 0.089286lowerlimit 0.092518extrapolation not allowed with blendingeval 0.57895upperlimit 0.54068extrapolation not allowed with blendingnon-list contrasts argument ignoredeval 0.46667upperlimit 0.43969extrapolation not allowed with blendingnon-list contrasts argument ignoredeval 0.46667upperlimit 0.43969extrapolation not allowed with blendingeval 0.089286lowerlimit 0.092518extrapolation not allowed with blendingeval 0.57895upperlimit 0.54068extrapolation not allowed with blendingnon-list contrasts argument ignoredeval 0.46667upperlimit 0.43969extrapolation not allowed with blendingeval 0.089286lowerlimit 0.092316extrapolation not allowed with blendingnon-list contrasts argument ignoredeval 0.089286lowerlimit 0.092316extrapolation not allowed with blendingnon-list contrasts argument ignoredeval 0.53846upperlimit 0.53555extrapolation not allowed with blendingeval 0.57895upperlimit 0.53555extrapolation not allowed with blendingnon-list contrasts argument ignoredeval 0.46667upperlimit 0.43969extrapolation not allowed with blendingeval 0.089286lowerlimit 0.10703extrapolation not allowed with blendingeval 0.094737lowerlimit 0.10703extrapolation not allowed with blendingnon-list contrasts argument ignoredeval 0.57895upperlimit 0.54071extrapolation not allowed with blendingeval 0.46667upperlimit 0.43969extrapolation not allowed with blendingnon-list contrasts argument ignorednon-list contrasts argument ignoredeval 0.089286lowerlimit 0.092316extrapolation not allowed with blendingnon-list contrasts argument ignoredeval 0.46667upperlimit 0.40628extrapolation not allowed with blendingeval 0.41379upperlimit 0.40628extrapolation not allowed with blendingeval 0.4375upperlimit 0.40628extrapolation not allowed with blendingeval 0.089286lowerlimit 0.092518extrapolation not allowed with blendingeval 0.57895upperlimit 0.54068extrapolation not allowed with blendingnon-list contrasts argument ignoredeval 0.089286lowerlimit 0.10703extrapolation not allowed with blendingeval 0.094737lowerlimit 0.10703extrapolation not allowed with blendingnon-list contrasts argument ignoredeval 0.46667upperlimit 0.43969extrapolation not allowed with blendingnon-list contrasts argument ignoredeval 0.46667upperlimit 0.402extrapolation not allowed with blendingeval 0.40323upperlimit 0.402extrapolation not allowed with blendingeval 0.41379upperlimit 0.402extrapolation not allowed with blendingeval 0.40426upperlimit 0.402extrapolation not allowed with blendingeval 0.4375upperlimit 0.402extrapolation not allowed with blendingnon-list contrasts argument ignoredeval 0.46667upperlimit 0.41586extrapolation not allowed with blendingeval 0.4375upperlimit 0.41586extrapolation not allowed with blendingnon-list contrasts argument ignorednon-list contrasts argument ignoredeval 0.46667upperlimit 0.43969extrapolation not allowed with blendingeval 0.57895upperlimit 0.54071extrapolation not allowed with blendingnon-list contrasts argument ignoredeval 0.46667upperlimit 0.41586extrapolation not allowed with blendingeval 0.4375upperlimit 0.41586extrapolation not allowed with blendingeval 0.089286lowerlimit 0.092316extrapolation not allowed with blendingnon-list contrasts argument ignoredeval 0.089286lowerlimit 0.092518extrapolation not allowed with blendingeval 0.57895upperlimit 0.54068extrapolation not allowed with blendingnon-list contrasts argument ignoredeval 0.089286lowerlimit 0.10877extrapolation not allowed with blendingeval 0.094737lowerlimit 0.10877extrapolation not allowed with blendingnon-list contrasts argument ignoredeval 0.089286lowerlimit 0.092316extrapolation not allowed with blendingnon-list contrasts argument ignoredeval 0.46667upperlimit 0.43969extrapolation not allowed with blendingeval 0.53846upperlimit 0.50797extrapolation not allowed with blendingeval 0.51111upperlimit 0.50797extrapolation not allowed with blendingeval 0.57895upperlimit 0.50797extrapolation not allowed with blendingeval 0.53333upperlimit 0.50797extrapolation not allowed with blendingnon-list contrasts argument ignored
[1] "multinom"
# weights: 4 (3 variable)
initial value 554.517744
iter 10 value 384.794809
final value 384.794775
converged
# weights: 4 (3 variable)
initial value 554.517744
final value 421.251454
converged
# weights: 4 (3 variable)
initial value 554.517744
iter 10 value 384.848555
final value 384.848522
converged
# weights: 4 (3 variable)
initial value 554.517744
iter 10 value 358.466023
final value 358.466014
converged
# weights: 4 (3 variable)
initial value 554.517744
final value 400.257332
converged
# weights: 4 (3 variable)
initial value 554.517744
iter 10 value 358.528966
final value 358.528958
converged
# weights: 4 (3 variable)
initial value 554.517744
iter 10 value 345.361326
final value 345.361319
converged
# weights: 4 (3 variable)
initial value 554.517744
final value 389.162400
converged
# weights: 4 (3 variable)
initial value 554.517744
iter 10 value 345.427631
final value 345.427624
converged
# weights: 4 (3 variable)
initial value 554.517744
iter 10 value 370.819967
iter 10 value 370.819967
iter 10 value 370.819967
final value 370.819967
converged
# weights: 4 (3 variable)
initial value 554.517744
final value 411.520894
converged
# weights: 4 (3 variable)
initial value 554.517744
iter 10 value 370.881269
iter 10 value 370.881269
iter 10 value 370.881269
final value 370.881269
converged
# weights: 4 (3 variable)
initial value 554.517744
iter 10 value 338.339240
final value 337.642174
converged
# weights: 4 (3 variable)
initial value 554.517744
final value 389.552735
converged
# weights: 4 (3 variable)
initial value 554.517744
iter 10 value 337.725860
final value 337.725851
converged
# weights: 4 (3 variable)
initial value 554.517744
iter 10 value 362.651997
iter 10 value 362.651996
iter 10 value 362.651996
final value 362.651996
converged
# weights: 4 (3 variable)
initial value 554.517744
final value 404.947235
converged
# weights: 4 (3 variable)
initial value 554.517744
iter 10 value 362.716896
iter 10 value 362.716895
iter 10 value 362.716894
final value 362.716894
converged
# weights: 4 (3 variable)
initial value 554.517744
final value 353.360649
converged
# weights: 4 (3 variable)
initial value 554.517744
final value 396.615883
converged
# weights: 4 (3 variable)
initial value 554.517744
final value 353.427369
converged
# weights: 4 (3 variable)
initial value 554.517744
iter 10 value 331.505876
final value 331.505837
converged
# weights: 4 (3 variable)
initial value 554.517744
final value 382.233327
converged
# weights: 4 (3 variable)
initial value 554.517744
iter 10 value 331.587049
final value 331.587010
converged
# weights: 4 (3 variable)
initial value 554.517744
iter 10 value 364.158073
iter 10 value 364.158073
iter 10 value 364.158073
final value 364.158073
converged
# weights: 4 (3 variable)
initial value 554.517744
final value 400.438283
converged
# weights: 4 (3 variable)
initial value 554.517744
iter 10 value 364.210111
iter 10 value 364.210111
iter 10 value 364.210111
final value 364.210111
converged
# weights: 4 (3 variable)
initial value 554.517744
iter 10 value 343.760429
final value 343.760410
converged
# weights: 4 (3 variable)
initial value 554.517744
final value 387.083157
converged
# weights: 4 (3 variable)
initial value 554.517744
iter 10 value 343.826126
final value 343.826108
converged
# weights: 4 (3 variable)
initial value 554.517744
iter 10 value 377.277862
iter 10 value 377.277862
iter 10 value 377.277861
final value 377.277861
converged
# weights: 4 (3 variable)
initial value 554.517744
final value 413.479657
converged
# weights: 4 (3 variable)
initial value 554.517744
iter 10 value 377.330740
iter 10 value 377.330739
iter 10 value 377.330738
final value 377.330738
converged
# weights: 4 (3 variable)
initial value 554.517744
iter 10 value 363.527477
final value 363.527449
converged
# weights: 4 (3 variable)
initial value 554.517744
final value 405.904614
converged
# weights: 4 (3 variable)
initial value 554.517744
iter 10 value 363.591426
final value 363.591399
converged
# weights: 4 (3 variable)
initial value 554.517744
iter 10 value 346.706756
iter 10 value 346.706754
iter 10 value 346.706754
final value 346.706754
converged
# weights: 4 (3 variable)
initial value 554.517744
final value 393.064300
converged
# weights: 4 (3 variable)
initial value 554.517744
iter 10 value 346.778579
iter 10 value 346.778577
iter 10 value 346.778577
final value 346.778577
converged
# weights: 4 (3 variable)
initial value 554.517744
iter 10 value 350.308158
final value 350.308124
converged
# weights: 4 (3 variable)
initial value 554.517744
final value 394.686750
converged
# weights: 4 (3 variable)
initial value 554.517744
iter 10 value 350.376208
final value 350.376174
converged
# weights: 4 (3 variable)
initial value 554.517744
iter 10 value 365.423988
final value 365.423967
converged
# weights: 4 (3 variable)
initial value 554.517744
final value 407.046095
converged
# weights: 4 (3 variable)
initial value 554.517744
iter 10 value 365.486830
final value 365.486809
converged
# weights: 4 (3 variable)
initial value 554.517744
iter 10 value 375.942875
final value 375.942868
converged
# weights: 4 (3 variable)
initial value 554.517744
final value 412.738783
converged
# weights: 4 (3 variable)
initial value 554.517744
iter 10 value 375.996860
final value 375.996853
converged
# weights: 4 (3 variable)
initial value 554.517744
iter 10 value 369.004020
final value 369.003531
converged
# weights: 4 (3 variable)
initial value 554.517744
final value 407.374841
converged
# weights: 4 (3 variable)
initial value 554.517744
iter 10 value 369.060934
final value 369.060455
converged
# weights: 4 (3 variable)
initial value 554.517744
iter 10 value 360.551961
iter 10 value 360.551959
iter 10 value 360.551959
final value 360.551959
converged
# weights: 4 (3 variable)
initial value 554.517744
final value 400.866217
converged
# weights: 4 (3 variable)
initial value 554.517744
iter 10 value 360.611945
iter 10 value 360.611943
iter 10 value 360.611943
final value 360.611943
converged
# weights: 4 (3 variable)
initial value 554.517744
iter 10 value 370.467778
final value 370.414135
converged
# weights: 4 (3 variable)
initial value 554.517744
final value 406.680836
converged
# weights: 4 (3 variable)
initial value 554.517744
iter 10 value 370.519928
final value 370.466715
converged
# weights: 4 (3 variable)
initial value 554.517744
iter 10 value 355.236387
final value 355.236347
converged
# weights: 4 (3 variable)
initial value 554.517744
final value 401.370189
converged
# weights: 4 (3 variable)
initial value 554.517744
iter 10 value 355.308279
final value 355.308240
converged
# weights: 4 (3 variable)
initial value 554.517744
iter 10 value 364.714111
final value 364.714051
converged
# weights: 4 (3 variable)
initial value 554.517744
final value 407.312950
converged
# weights: 4 (3 variable)
initial value 554.517744
iter 10 value 364.779508
final value 364.779448
converged
# weights: 4 (3 variable)
initial value 554.517744
iter 10 value 347.812292
final value 347.812150
converged
# weights: 4 (3 variable)
initial value 554.517744
iter 10 value 389.764148
iter 10 value 389.764145
iter 10 value 389.764145
final value 389.764145
converged
# weights: 4 (3 variable)
initial value 554.517744
iter 10 value 347.875247
final value 347.875105
converged
# weights: 4 (3 variable)
initial value 554.517744
iter 10 value 319.870357
final value 319.870338
converged
# weights: 4 (3 variable)
initial value 554.517744
final value 372.994080
converged
# weights: 4 (3 variable)
initial value 554.517744
iter 10 value 319.955663
final value 319.955644
converged
# weights: 4 (3 variable)
initial value 554.517744
iter 10 value 312.576095
final value 312.576064
converged
# weights: 4 (3 variable)
initial value 554.517744
iter 10 value 367.284329
iter 10 value 367.284329
iter 10 value 367.284329
final value 367.284329
converged
# weights: 4 (3 variable)
initial value 554.517744
iter 10 value 312.666550
final value 312.666520
converged
# weights: 4 (3 variable)
initial value 554.517744
iter 10 value 363.313712
iter 10 value 363.313712
iter 10 value 363.313712
final value 363.313712
converged
# weights: 4 (3 variable)
initial value 554.517744
final value 403.175943
converged
# weights: 4 (3 variable)
initial value 554.517744
iter 10 value 363.373575
iter 10 value 363.373575
iter 10 value 363.373575
final value 363.373575
converged
# weights: 4 (3 variable)
initial value 554.517744
iter 10 value 358.900453
iter 10 value 358.900452
iter 10 value 358.900452
final value 358.900452
converged
[1] "qda"
[1] "rf"
note: only 1 unique complexity parameters in default grid. Truncating the grid to 1 .
[1] "adaboost"
names(fits) <- models
Now that you have all the trained models in a list, use sapply or map to create a matrix of predictions for the test set. You should end up with a matrix with length(mnist_27\(test\)y) rows and length(models) columns. What are the dimensions of the matrix of predictions? Number of rows, columns:
models_y_hat <- sapply(fits, function(fit_model){
predict(fit_model, mnist_27$test)
})
dim(models_y_hat)
[1] 200 10
Now compute accuracy for each model on the test set. Report the mean accuracy across all models.
#str(mnist_27$test$y)
#str(models_y_hat[,1])
model_accuracies <- colMeans(models_y_hat == mnist_27$test$y)
model_accuracies
glm lda naive_bayes svmLinear knn gamLoess multinom qda rf
0.75 0.75 0.80 0.76 0.84 0.84 0.75 0.82 0.78
adaboost
0.78
mean(model_accuracies)
[1] 0.79
Next, build an ensemble prediction by majority vote and compute the accuracy of the ensemble. What is the accuracy of the ensemble?
df<-as.data.frame(table(models_y_hat[1,]), stringsAsFactors =TRUE)
df$Var1
[1] 2
Levels: 2
y_hat_maj <- sapply(seq(1,nrow(models_y_hat)), function(index_line){
df <- as.data.frame(table(models_y_hat[index_line,]))
df[which.max(df$Freq),]$Var1
})
mean(y_hat_maj == mnist_27$test$y)
[1] 0.81
In Q3, we computed the accuracy of each method on the test set and noticed that the individual accuracies varied. Q: How many of the individual methods do better than the ensemble?
Q: Which individual methods perform better than the ensemble?
model_indices <- model_accuracies > mean(models_y_hat == mnist_27$test$y)
sum(model_indices)
[1] 4
models[model_indices]
[1] "naive_bayes" "knn" "gamLoess" "qda"
It is tempting to remove the methods that do not perform well and re-do the ensemble. The problem with this approach is that we are using the test data to make a decision. However, we could use the minimum accuracy estimates obtained from cross validation with the training data for each model. Obtain these estimates and save them in an object. Report the mean of these training set accuracy estimates. What is the mean of these training set accuracy estimates?
# mean(fits[[1]]$results$Accuracy) +
# mean(fits[[2]]$results$Accuracy) +
# mean(fits[[3]]$results$Accuracy) +
# mean(fits[[4]]$results$Accuracy) +
# mean(fits[[5]]$results$Accuracy) +
# mean(fits[[6]]$results$Accuracy) +
# mean(fits[[7]]$results$Accuracy) +
# mean(fits[[8]]$results$Accuracy) +
# mean(fits[[9]]$results$Accuracy) +
# mean(fits[[10]]$results$Accuracy)
acc_hat <- sapply(fits, function(fit) min(fit$results$Accuracy))
mean(acc_hat)
[1] 0.81
Now let’s only consider the methods with an estimated accuracy of greater than or equal to 0.8 when constructing the ensemble. What is the accuracy of the ensemble now?
!!! NB: It should be mentionned that the Ensemble is decided by majority vote of models
ind <- acc_hat >= 0.8
votes <- rowMeans(models_y_hat[,ind] == "7")
y_hat <- ifelse(votes>=0.5, 7, 2)
mean(y_hat == mnist_27$test$y)
[1] 0.82
We want to explore the tissue_gene_expression predictors by plotting them.
data("tissue_gene_expression")
dim(tissue_gene_expression$x)
[1] 189 500
We want to get an idea of which observations are close to each other, but, as you can see from the dimensions, the predictors are 500-dimensional, making plotting difficult. Plot the first two principal components with color representing tissue type. Which tissue is in a cluster by itself?
pc <- prcomp(tissue_gene_expression$x)
data.frame(pc_1 = pc$x[,1], pc_2 = pc$x[,2],
tissue = tissue_gene_expression$y) %>%
ggplot(aes(pc_1, pc_2, color = tissue)) +
geom_point()
The predictors for each observation are measured using the same device and experimental procedure. This introduces biases that can affect all the predictors from one observation. For each observation, compute the average across all predictors, and then plot this against the first PC with color representing tissue. Report the correlation. What is the correlation?
bias_sample <- rowMeans(tissue_gene_expression$x)
data.frame(pc_1 = pc$x[,1], bias_sample,
tissue = tissue_gene_expression$y) %>%
ggplot(aes(pc_1, bias_sample, color = tissue)) +
geom_point()
cor(pc$x[,1], bias_sample)
[1] 0.6
We see an association with the first PC and the observation averages. Redo the PCA but only after removing the center. Part of the code is provided for you. Which line of code should be used to replace #BLANK below?
#BLANK
x <- with(tissue_gene_expression, sweep(x, 1, rowMeans(x)))
# END BLANK
pc2 <- prcomp(x)
data.frame(pc_1 = pc2$x[,1], pc_2 = pc2$x[,2],
tissue = tissue_gene_expression$y) %>%
ggplot(aes(pc_1, pc_2, color = tissue)) +
geom_point()
For the first 10 PCs, make a boxplot showing the values for each tissue. For the 7th PC, which two tissues have the greatest median difference?
for(i in 1:10){
boxplot(pc$x[,i] ~ tissue_gene_expression$y, main = paste("PC", i))
}
Plot the percent variance explained by PC number. Hint: use the summary function. How many PCs are required to reach a cumulative percent variance explained greater than 50%?
summary(pc2)
Importance of components:
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10 PC11 PC12
Standard deviation 8.876 5.694 4.899 3.7670 3.4397 3.1473 2.8139 2.1831 1.9745 1.8637 1.6449 1.52451
Proportion of Variance 0.336 0.138 0.102 0.0605 0.0505 0.0423 0.0338 0.0203 0.0166 0.0148 0.0115 0.00992
Cumulative Proportion 0.336 0.474 0.577 0.6374 0.6879 0.7301 0.7639 0.7843 0.8009 0.8157 0.8273 0.83717
PC13 PC14 PC15 PC16 PC17 PC18 PC19 PC20 PC21 PC22 PC23
Standard deviation 1.46921 1.27738 1.20250 1.18670 1.12428 1.08062 1.03637 0.97922 0.97284 0.93817 0.90089
Proportion of Variance 0.00921 0.00696 0.00617 0.00601 0.00539 0.00498 0.00458 0.00409 0.00404 0.00376 0.00346
Cumulative Proportion 0.84638 0.85334 0.85951 0.86552 0.87091 0.87589 0.88047 0.88456 0.88860 0.89236 0.89582
PC24 PC25 PC26 PC27 PC28 PC29 PC30 PC31 PC32 PC33 PC34
Standard deviation 0.88067 0.87348 0.82956 0.81636 0.8104 0.78204 0.76460 0.7494 0.73597 0.70857 0.70278
Proportion of Variance 0.00331 0.00326 0.00294 0.00284 0.0028 0.00261 0.00249 0.0024 0.00231 0.00214 0.00211
Cumulative Proportion 0.89913 0.90238 0.90532 0.90816 0.9110 0.91357 0.91607 0.9185 0.92078 0.92292 0.92502
PC35 PC36 PC37 PC38 PC39 PC40 PC41 PC42 PC43 PC44 PC45
Standard deviation 0.67950 0.67018 0.66361 0.65228 0.64571 0.63961 0.6321 0.61613 0.59700 0.59148 0.57580
Proportion of Variance 0.00197 0.00192 0.00188 0.00182 0.00178 0.00175 0.0017 0.00162 0.00152 0.00149 0.00141
Cumulative Proportion 0.92699 0.92891 0.93079 0.93261 0.93438 0.93613 0.9378 0.93945 0.94097 0.94247 0.94388
PC46 PC47 PC48 PC49 PC50 PC51 PC52 PC53 PC54 PC55 PC56
Standard deviation 0.5723 0.56984 0.56191 0.5517 0.54567 0.54000 0.52188 0.51885 0.51268 0.49678 0.49465
Proportion of Variance 0.0014 0.00139 0.00135 0.0013 0.00127 0.00124 0.00116 0.00115 0.00112 0.00105 0.00104
Cumulative Proportion 0.9453 0.94666 0.94801 0.9493 0.95058 0.95182 0.95299 0.95414 0.95526 0.95631 0.95735
PC57 PC58 PC59 PC60 PC61 PC62 PC63 PC64 PC65 PC66 PC67
Standard deviation 0.48808 0.48547 0.47818 0.47557 0.46984 0.46737 0.45302 0.44880 0.44599 0.43880 0.4324
Proportion of Variance 0.00102 0.00101 0.00098 0.00096 0.00094 0.00093 0.00088 0.00086 0.00085 0.00082 0.0008
Cumulative Proportion 0.95837 0.95938 0.96035 0.96132 0.96226 0.96319 0.96407 0.96492 0.96577 0.96659 0.9674
PC68 PC69 PC70 PC71 PC72 PC73 PC74 PC75 PC76 PC77 PC78
Standard deviation 0.42861 0.42529 0.41761 0.41513 0.41408 0.4058 0.40208 0.39515 0.39012 0.38527 0.38438
Proportion of Variance 0.00078 0.00077 0.00074 0.00074 0.00073 0.0007 0.00069 0.00067 0.00065 0.00063 0.00063
Cumulative Proportion 0.96818 0.96895 0.96969 0.97043 0.97116 0.9719 0.97255 0.97322 0.97387 0.97450 0.97513
PC79 PC80 PC81 PC82 PC83 PC84 PC85 PC86 PC87 PC88 PC89
Standard deviation 0.37967 0.3746 0.37200 0.36666 0.36073 0.35835 0.35752 0.35113 0.35070 0.34438 0.3410
Proportion of Variance 0.00061 0.0006 0.00059 0.00057 0.00056 0.00055 0.00055 0.00053 0.00052 0.00051 0.0005
Cumulative Proportion 0.97574 0.9763 0.97693 0.97751 0.97806 0.97861 0.97916 0.97968 0.98021 0.98071 0.9812
PC90 PC91 PC92 PC93 PC94 PC95 PC96 PC97 PC98 PC99 PC100
Standard deviation 0.33846 0.32943 0.32878 0.32675 0.32064 0.31787 0.31474 0.31327 0.30849 0.3074 0.3051
Proportion of Variance 0.00049 0.00046 0.00046 0.00046 0.00044 0.00043 0.00042 0.00042 0.00041 0.0004 0.0004
Cumulative Proportion 0.98170 0.98216 0.98262 0.98308 0.98352 0.98395 0.98437 0.98479 0.98519 0.9856 0.9860
PC101 PC102 PC103 PC104 PC105 PC106 PC107 PC108 PC109 PC110 PC111
Standard deviation 0.30164 0.29439 0.29024 0.28776 0.28449 0.28286 0.27938 0.27392 0.26971 0.26870 0.2659
Proportion of Variance 0.00039 0.00037 0.00036 0.00035 0.00035 0.00034 0.00033 0.00032 0.00031 0.00031 0.0003
Cumulative Proportion 0.98638 0.98675 0.98711 0.98747 0.98781 0.98815 0.98849 0.98881 0.98912 0.98942 0.9897
PC112 PC113 PC114 PC115 PC116 PC117 PC118 PC119 PC120 PC121 PC122
Standard deviation 0.2632 0.26096 0.25970 0.25561 0.25440 0.25247 0.25132 0.24512 0.24181 0.23789 0.23574
Proportion of Variance 0.0003 0.00029 0.00029 0.00028 0.00028 0.00027 0.00027 0.00026 0.00025 0.00024 0.00024
Cumulative Proportion 0.9900 0.99031 0.99060 0.99088 0.99115 0.99143 0.99170 0.99195 0.99220 0.99244 0.99268
PC123 PC124 PC125 PC126 PC127 PC128 PC129 PC130 PC131 PC132 PC133
Standard deviation 0.23265 0.22979 0.22868 0.22644 0.22295 0.2181 0.2143 0.21297 0.21146 0.20931 0.20461
Proportion of Variance 0.00023 0.00023 0.00022 0.00022 0.00021 0.0002 0.0002 0.00019 0.00019 0.00019 0.00018
Cumulative Proportion 0.99291 0.99314 0.99336 0.99358 0.99379 0.9940 0.9942 0.99438 0.99457 0.99476 0.99494
PC134 PC135 PC136 PC137 PC138 PC139 PC140 PC141 PC142 PC143 PC144
Standard deviation 0.20313 0.20106 0.20043 0.19927 0.19474 0.19113 0.18946 0.18713 0.18498 0.18368 0.18073
Proportion of Variance 0.00018 0.00017 0.00017 0.00017 0.00016 0.00016 0.00015 0.00015 0.00015 0.00014 0.00014
Cumulative Proportion 0.99511 0.99529 0.99546 0.99563 0.99579 0.99595 0.99610 0.99625 0.99639 0.99654 0.99668
PC145 PC146 PC147 PC148 PC149 PC150 PC151 PC152 PC153 PC154 PC155
Standard deviation 0.17793 0.17732 0.17342 0.17220 0.17099 0.16690 0.16543 0.16358 0.16158 0.16069 0.15928
Proportion of Variance 0.00014 0.00013 0.00013 0.00013 0.00012 0.00012 0.00012 0.00011 0.00011 0.00011 0.00011
Cumulative Proportion 0.99681 0.99695 0.99707 0.99720 0.99733 0.99744 0.99756 0.99768 0.99779 0.99790 0.99801
PC156 PC157 PC158 PC159 PC160 PC161 PC162 PC163 PC164 PC165 PC166
Standard deviation 0.1567 0.1530 0.1518 0.1493 0.14809 0.14694 0.14512 0.14264 0.13972 0.13829 0.13599
Proportion of Variance 0.0001 0.0001 0.0001 0.0001 0.00009 0.00009 0.00009 0.00009 0.00008 0.00008 0.00008
Cumulative Proportion 0.9981 0.9982 0.9983 0.9984 0.99850 0.99859 0.99868 0.99877 0.99885 0.99893 0.99901
PC167 PC168 PC169 PC170 PC171 PC172 PC173 PC174 PC175 PC176 PC177
Standard deviation 0.13316 0.12935 0.12809 0.12604 0.12141 0.12042 0.11858 0.11711 0.11401 0.11328 0.10979
Proportion of Variance 0.00008 0.00007 0.00007 0.00007 0.00006 0.00006 0.00006 0.00006 0.00006 0.00005 0.00005
Cumulative Proportion 0.99909 0.99916 0.99923 0.99929 0.99936 0.99942 0.99948 0.99954 0.99959 0.99965 0.99970
PC178 PC179 PC180 PC181 PC182 PC183 PC184 PC185 PC186 PC187
Standard deviation 0.10939 0.10731 0.10291 1e-01 0.09661 0.09455 0.08960 3.94e-15 7.33e-16 7.33e-16
Proportion of Variance 0.00005 0.00005 0.00005 4e-05 0.00004 0.00004 0.00003 0.00e+00 0.00e+00 0.00e+00
Cumulative Proportion 0.99975 0.99980 0.99985 1e+00 0.99993 0.99997 1.00000 1.00e+00 1.00e+00 1.00e+00
PC188 PC189
Standard deviation 7.33e-16 7.32e-16
Proportion of Variance 0.00e+00 0.00e+00
Cumulative Proportion 1.00e+00 1.00e+00
# sol: plots the cumulative PCA var
# plot(summary(pc2)$importance[3,])
# my sol: calculative individual PCA vars, plot them
pca_var <- sapply(pc2$sdev[1:10], function(pc_i){
pc_i^2 / (t(pc2$sdev) %*% pc2$sdev)
})
plot(pca_var)
# calculate the cumulative sum of PCA vars
cumsum(pca_var)
[1] 0.34 0.47 0.58 0.64 0.69 0.73 0.76 0.78 0.80 0.82
Netflix Challenge links For more information about the “Netflix Challenge,” you can check out these sites: * https://bits.blogs.nytimes.com/2009/09/21/netflix-awards-1-million-prize-and-starts-a-new-contest/ * http://blog.echen.me/2011/10/24/winning-the-netflix-prize-a-summary/ * https://www.netflixprize.com/assets/GrandPrize2009_BPC_BellKor.pdf Key points * Recommendation systems are more complicated machine learning challenges because each outcome has a different set of predictors. For example, different users rate a different number of movies and rate different movies. * To compare different models or to see how well we’re doing compared to a baseline, we will use root mean squared error (RMSE) as our loss function. We can interpret RMSE similar to standard deviation. If \(N\) is the number of user-movie combinations, \(y_{u,i}\) is the rating for movie \(i\) by user \(u\), and \(\hat{y}_{u,i}\) is our prediction, then RMSE is defined as follows: \[\sqrt{\frac{1}{N}\sum_{u,i}(\hat{y}_{u,i} - y_{u,i})^2}\] Code
library(dslabs)
library(tidyverse)
data("movielens")
head(movielens)
movielens %>%
summarize(n_users = n_distinct(userId),
n_movies = n_distinct(movieId))
keep <- movielens %>%
dplyr::count(movieId) %>%
top_n(5) %>%
pull(movieId)
Selecting by n
tab <- movielens %>%
filter(userId %in% c(13:20)) %>%
filter(movieId %in% keep) %>%
select(userId, title, rating) %>%
spread(title, rating)
tab %>% knitr::kable()
userId | Forrest Gump | Pulp Fiction | Shawshank Redemption, The | Silence of the Lambs, The | Star Wars: Episode IV - A New Hope |
---|---|---|---|---|---|
13 | 5.0 | 3.5 | 4.5 | NA | NA |
15 | 1.0 | 5.0 | 2.0 | 5.0 | 5.0 |
16 | NA | NA | 4.0 | NA | NA |
17 | 2.5 | 5.0 | 5.0 | 4.5 | 3.5 |
18 | NA | NA | NA | NA | 3.0 |
19 | 5.0 | 5.0 | 4.0 | 3.0 | 4.0 |
20 | 2.0 | 0.5 | 4.5 | 0.5 | 1.5 |
users <- sample(unique(movielens$userId), 100)
rafalib::mypar()
movielens %>% filter(userId %in% users) %>%
select(userId, movieId, rating) %>%
mutate(rating = 1) %>%
spread(movieId, rating) %>% select(sample(ncol(.), 100)) %>%
as.matrix() %>% t(.) %>%
image(1:100, 1:100,. , xlab="Movies", ylab="Users")
abline(h=0:100+0.5, v=0:100+0.5, col = "grey")
movielens %>%
dplyr::count(movieId) %>%
ggplot(aes(n)) +
geom_histogram(bins = 30, color = "black") +
scale_x_log10() +
ggtitle("Movies")
movielens %>%
dplyr::count(userId) %>%
ggplot(aes(n)) +
geom_histogram(bins = 30, color = "black") +
scale_x_log10() +
ggtitle("Users")
library(caret)
set.seed(755)
test_index <- createDataPartition(y = movielens$rating, times = 1,
p = 0.2, list = FALSE)
train_set <- movielens[-test_index,]
test_set <- movielens[test_index,]
test_set <- test_set %>%
semi_join(train_set, by = "movieId") %>%
semi_join(train_set, by = "userId")
RMSE <- function(true_ratings, predicted_ratings){
sqrt(mean((true_ratings - predicted_ratings)^2))
}
** Key points * We start with a model that assumes the same rating for all movies and all users, with all the differences explained by random variation: If \(\mu\) represents the true rating for all movies and users and \(\epsilon\) represents independent errors sampled from the same distribution centered at zero, then: \[Y_{u,i} = \mu + \epsilon_{\mu,i}\] * In this case, the least squares estimate of \(\mu\)** — the estimate that minimizes the root mean squared error — is the average rating of all movies across all users. * We can improve our model by adding a term, \(b_i\) that represents the average rating for movie \(i\): \[Y_{u,i} = \mu + b_i + \epsilon_{\mu,i}\] \(b_i\) is the average of Y_{u,i} minus the overall mean for each movie \(i\). * We can further improve our model by adding \(b_u\), the user-specific effect: \[Y_{u,i} = \mu + b_i + b_u + \epsilon_{\mu,i}\] * Note that because there are thousands of \(b\)’s, the lm function will be very slow or cause R to crash, so we don’t recommend using linear regression to calculate these effects. Code
mu_hat <- mean(train_set$rating)
mu_hat
[1] 3.542793
naive_rmse <- RMSE(test_set$rating, mu_hat)
naive_rmse
[1] 1.04822
predictions <- rep(2.5, nrow(test_set))
RMSE(test_set$rating, predictions)
[1] 1.489453
rmse_results <- data_frame(method = "Just the average", RMSE = naive_rmse)
`data_frame()` is deprecated, use `tibble()`.
[90mThis warning is displayed once per session.[39m
# fit <- lm(rating ~ as.factor(userId), data = movielens)
mu <- mean(train_set$rating)
movie_avgs <- train_set %>%
group_by(movieId) %>%
summarize(b_i = mean(rating - mu))
movie_avgs %>% qplot(b_i, geom ="histogram", bins = 10, data = ., color = I("black"))
predicted_ratings <- mu + test_set %>%
left_join(movie_avgs, by='movieId') %>%
.$b_i
model_1_rmse <- RMSE(predicted_ratings, test_set$rating)
rmse_results <- bind_rows(rmse_results,
data_frame(method="Movie Effect Model",
RMSE = model_1_rmse ))
rmse_results %>% knitr::kable()
method | RMSE |
---|---|
Just the average | 1.0482202 |
Movie Effect Model | 0.9862839 |
train_set %>%
group_by(userId) %>%
summarize(b_u = mean(rating)) %>%
filter(n()>=100) %>%
ggplot(aes(b_u)) +
geom_histogram(bins = 30, color = "black")
# lm(rating ~ as.factor(movieId) + as.factor(userId))
user_avgs <- test_set %>%
left_join(movie_avgs, by='movieId') %>%
group_by(userId) %>%
summarize(b_u = mean(rating - mu - b_i))
predicted_ratings <- test_set %>%
left_join(movie_avgs, by='movieId') %>%
left_join(user_avgs, by='userId') %>%
mutate(pred = mu + b_i + b_u) %>%
.$pred
model_2_rmse <- RMSE(predicted_ratings, test_set$rating)
rmse_results <- bind_rows(rmse_results,
data_frame(method="Movie + User Effects Model",
RMSE = model_2_rmse ))
rmse_results %>% knitr::kable()
method | RMSE |
---|---|
Just the average | 1.0482202 |
Movie Effect Model | 0.9862839 |
Movie + User Effects Model | 0.8848688 |
library(dslabs)
library(tidyverse)
data("movielens")
Compute the number of ratings for each movie and then plot it against the year the movie came out. Use the square root transformation on the counts. What year has the highest median number of ratings?
movielens %>%
group_by(movieId) %>%
summarize(n = n_distinct(userId), year = as.character(first(year))) %>%
qplot(year, n, data = ., geom = "boxplot") +
coord_trans(y = "sqrt") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
We see that, on average, movies that came out after 1993 get more ratings. We also see that with newer movies, starting in 1993, the number of ratings decreases with year: the more recent a movie is, the less time users have had to rate it. Among movies that came out in 1993 or later, select the top 25 movies with the highest average number of ratings per year (n/year), and caculate the average rating of each of them. To calculate number of ratings per year, use 2018 as the end year. What is the average rating for the movie The Shawshank Redemption?
What is the average number of ratings per year for the movie Forrest Gump?
res2 <- movielens %>%
filter(year >= 1993) %>%
group_by(movieId) %>%
summarize(avg_rating = mean(rating), n = n(), title=title[1], years=2018 - first(year)) %>%
mutate(n_year = n / years) %>%
top_n(25, n_year) %>%
arrange(desc(n_year))
From the table constructed in Q2, we can see that the most frequently rated movies tend to have above average ratings. This is not surprising: more people watch popular movies. To confirm this, stratify the post-1993 movies by ratings per year and compute their average ratings. To calculate number of ratings per year, use 2018 as the end year. Make a plot of average rating versus ratings per year and show an estimate of the trend. What type of trend do you observe?
movielens %>%
filter(year >= 1993) %>%
group_by(movieId) %>%
summarize(n = n(), years = 2018 - first(year),
title = title[1],
rating = mean(rating)) %>%
mutate(rate = n/years) %>%
ggplot(aes(rate, rating)) +
geom_point() +
geom_smooth()
… #### Question 5 The movielens dataset also includes a time stamp. This variable represents the time and data in which the rating was provided. The units are seconds since January 1, 1970. Create a new column date with the date. Q: Which code correctly creates this new column?
A: The as_datetime function in the lubridate package is particularly useful here
library(lubridate)
Attaching package: 㤼㸱lubridate㤼㸲
The following object is masked from 㤼㸱package:base㤼㸲:
date
movielens <- mutate(movielens, date = as_datetime(timestamp))
Compute the average rating for each week and plot this average against day.
Hint: use the round_date function before you group_by. Q: What type of trend do you observe?
A: There is some evidence of a time effect on average rating.
movielens %>% mutate(date = round_date(date, unit = "week")) %>%
group_by(date) %>%
summarize(rating = mean(rating)) %>%
ggplot(aes(date, rating)) +
geom_point() +
geom_smooth()
… #### Question 8 The movielens data also has a genres column. This column includes every genre that applies to the movie. Some movies fall under several genres. Define a category as whatever combination appears in this column. Keep only categories with more than 1,000 ratings. Then compute the average and standard error for each category. Plot these as error bar plots. Which genre has the lowest average rating?
movielens %>% group_by(genres) %>%
summarize(n = n(), avg = mean(rating), se = sd(rating)/sqrt(n())) %>%
filter(n >= 1000) %>%
mutate(genres = reorder(genres, avg)) %>%
ggplot(aes(x = genres, y = avg, ymin = avg - 2*se, ymax = avg + 2*se)) +
geom_point() +
geom_errorbar() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
… ## 6.3 Regularization ### 6.3.1 Regularization Notes To improve our results, we will use regularization. Regularization constrains the total variability of the effect sizes by penalizing large estimates that come from small sample sizes.( To estimate the \(b\)’s, we will now minimize this equation, which contains a penalty term: \[\frac{1}{N}\sum_{u,i}(y_{u,i} - \mu - b_i)^2 + \lambda \sum_{i} b_i^2\] The first term is the mean squared error and the second is a penalty term that gets larger when many \(b\)’s are large. The values of \(b\) that minimize this equation are given by: \[ \hat{b}_i(\lambda) = \frac{1}{\lambda+n_i}\sum_{u=1}^{n_i} (Y_{u,i} - \hat{\mu})\] where \(n_i\) is a number of ratings \(b\) for movie \(i\). * The larger \(\lambda\) is, the more we shrink. \(\lambda\) is a tuning parameter, so we can use cross-validation to choose it. We should be using full cross-validation on just the training set, without using the test set until the final assessment. * We can also use regularization to estimate the user effect. We will now minimize this equation: \[\frac{1}{N}\sum_{u,i}(y_{u,i} - \mu - b_i - b_u)^2 + \lambda (\sum_{i} b_i^2 + \sum_{u} b_u^2)\] Code
library(dslabs)
library(tidyverse)
library(caret)
data("movielens")
set.seed(755)
test_index <- createDataPartition(y = movielens$rating, times = 1,
p = 0.2, list = FALSE)
train_set <- movielens[-test_index,]
test_set <- movielens[test_index,]
test_set <- test_set %>%
semi_join(train_set, by = "movieId") %>%
semi_join(train_set, by = "userId")
RMSE <- function(true_ratings, predicted_ratings){
sqrt(mean((true_ratings - predicted_ratings)^2))
}
mu_hat <- mean(train_set$rating)
naive_rmse <- RMSE(test_set$rating, mu_hat)
rmse_results <- data_frame(method = "Just the average", RMSE = naive_rmse)
mu <- mean(train_set$rating)
movie_avgs <- train_set %>%
group_by(movieId) %>%
summarize(b_i = mean(rating - mu))
predicted_ratings <- mu + test_set %>%
left_join(movie_avgs, by='movieId') %>%
.$b_i
model_1_rmse <- RMSE(predicted_ratings, test_set$rating)
rmse_results <- bind_rows(rmse_results,
data_frame(method="Movie Effect Model",
RMSE = model_1_rmse ))
user_avgs <- test_set %>%
left_join(movie_avgs, by='movieId') %>%
group_by(userId) %>%
summarize(b_u = mean(rating - mu - b_i))
predicted_ratings <- test_set %>%
left_join(movie_avgs, by='movieId') %>%
left_join(user_avgs, by='userId') %>%
mutate(pred = mu + b_i + b_u) %>%
.$pred
model_2_rmse <- RMSE(predicted_ratings, test_set$rating)
rmse_results <- bind_rows(rmse_results,
data_frame(method="Movie + User Effects Model",
RMSE = model_2_rmse ))
test_set %>%
left_join(movie_avgs, by='movieId') %>%
mutate(residual = rating - (mu + b_i)) %>%
arrange(desc(abs(residual))) %>%
select(title, residual) %>% slice(1:10) %>% knitr::kable()
title | residual |
---|---|
Day of the Beast, The (Día de la Bestia, El) | 4.500000 |
Horror Express | -4.000000 |
No Holds Barred | 4.000000 |
Dear Zachary: A Letter to a Son About His Father | -4.000000 |
Faust | -4.000000 |
Hear My Song | -4.000000 |
Confessions of a Shopaholic | -4.000000 |
Twilight Saga: Breaking Dawn - Part 1, The | -4.000000 |
Taxi Driver | -3.806931 |
Taxi Driver | -3.806931 |
movie_titles <- movielens %>%
select(movieId, title) %>%
distinct()
movie_avgs %>% left_join(movie_titles, by="movieId") %>%
arrange(desc(b_i)) %>%
select(title, b_i) %>%
slice(1:10) %>%
knitr::kable()
title | b_i |
---|---|
Lamerica | 1.457207 |
Love & Human Remains | 1.457207 |
Enfer, L’ | 1.457207 |
Picture Bride (Bijo photo) | 1.457207 |
Red Firecracker, Green Firecracker (Pao Da Shuang Deng) | 1.457207 |
Faces | 1.457207 |
Maya Lin: A Strong Clear Vision | 1.457207 |
Heavy | 1.457207 |
Gate of Heavenly Peace, The | 1.457207 |
Death in the Garden (Mort en ce jardin, La) | 1.457207 |
movie_avgs %>% left_join(movie_titles, by="movieId") %>%
arrange(b_i) %>%
select(title, b_i) %>%
slice(1:10) %>%
knitr::kable()
title | b_i |
---|---|
Santa with Muscles | -3.042793 |
BAP*S | -3.042793 |
3 Ninjas: High Noon On Mega Mountain | -3.042793 |
Barney’s Great Adventure | -3.042793 |
Merry War, A | -3.042793 |
Day of the Beast, The (Día de la Bestia, El) | -3.042793 |
Children of the Corn III | -3.042793 |
Whiteboyz | -3.042793 |
Catfish in Black Bean Sauce | -3.042793 |
Watcher, The | -3.042793 |
train_set %>% dplyr::count(movieId) %>%
left_join(movie_avgs) %>%
left_join(movie_titles, by="movieId") %>%
arrange(desc(b_i)) %>%
select(title, b_i, n) %>%
slice(1:10) %>%
knitr::kable()
Joining, by = "movieId"
title | b_i | n |
---|---|---|
Lamerica | 1.457207 | 1 |
Love & Human Remains | 1.457207 | 3 |
Enfer, L’ | 1.457207 | 1 |
Picture Bride (Bijo photo) | 1.457207 | 1 |
Red Firecracker, Green Firecracker (Pao Da Shuang Deng) | 1.457207 | 3 |
Faces | 1.457207 | 1 |
Maya Lin: A Strong Clear Vision | 1.457207 | 2 |
Heavy | 1.457207 | 1 |
Gate of Heavenly Peace, The | 1.457207 | 1 |
Death in the Garden (Mort en ce jardin, La) | 1.457207 | 1 |
train_set %>% dplyr::count(movieId) %>%
left_join(movie_avgs) %>%
left_join(movie_titles, by="movieId") %>%
arrange(b_i) %>%
select(title, b_i, n) %>%
slice(1:10) %>%
knitr::kable()
Joining, by = "movieId"
title | b_i | n |
---|---|---|
Santa with Muscles | -3.042793 | 1 |
BAP*S | -3.042793 | 1 |
3 Ninjas: High Noon On Mega Mountain | -3.042793 | 1 |
Barney’s Great Adventure | -3.042793 | 1 |
Merry War, A | -3.042793 | 1 |
Day of the Beast, The (Día de la Bestia, El) | -3.042793 | 1 |
Children of the Corn III | -3.042793 | 1 |
Whiteboyz | -3.042793 | 1 |
Catfish in Black Bean Sauce | -3.042793 | 1 |
Watcher, The | -3.042793 | 1 |
lambda <- 3
mu <- mean(train_set$rating)
movie_reg_avgs <- train_set %>%
group_by(movieId) %>%
summarize(b_i = sum(rating - mu)/(n()+lambda), n_i = n())
data_frame(original = movie_avgs$b_i,
regularlized = movie_reg_avgs$b_i,
n = movie_reg_avgs$n_i) %>%
ggplot(aes(original, regularlized, size=sqrt(n))) +
geom_point(shape=1, alpha=0.5)
train_set %>%
dplyr::count(movieId) %>%
left_join(movie_reg_avgs) %>%
left_join(movie_titles, by="movieId") %>%
arrange(desc(b_i)) %>%
select(title, b_i, n) %>%
slice(1:10) %>%
knitr::kable()
Joining, by = "movieId"
title | b_i | n |
---|---|---|
All About Eve | 0.9271514 | 26 |
Shawshank Redemption, The | 0.9206986 | 240 |
Godfather, The | 0.8971328 | 153 |
Godfather: Part II, The | 0.8710751 | 100 |
Maltese Falcon, The | 0.8597749 | 47 |
Best Years of Our Lives, The | 0.8592343 | 11 |
On the Waterfront | 0.8467603 | 23 |
Face in the Crowd, A | 0.8326899 | 4 |
African Queen, The | 0.8322939 | 36 |
All Quiet on the Western Front | 0.8235200 | 11 |
train_set %>%
dplyr::count(movieId) %>%
left_join(movie_reg_avgs) %>%
left_join(movie_titles, by="movieId") %>%
arrange(b_i) %>%
select(title, b_i, n) %>%
slice(1:10) %>%
knitr::kable()
Joining, by = "movieId"
title | b_i | n |
---|---|---|
Battlefield Earth | -2.064653 | 14 |
Joe’s Apartment | -1.779955 | 7 |
Speed 2: Cruise Control | -1.689385 | 20 |
Super Mario Bros. | -1.597269 | 13 |
Police Academy 6: City Under Siege | -1.571379 | 10 |
After Earth | -1.524453 | 4 |
Disaster Movie | -1.521396 | 3 |
Little Nicky | -1.511374 | 17 |
Cats & Dogs | -1.472973 | 6 |
Blade: Trinity | -1.462194 | 11 |
predicted_ratings <- test_set %>%
left_join(movie_reg_avgs, by='movieId') %>%
mutate(pred = mu + b_i) %>%
.$pred
model_3_rmse <- RMSE(predicted_ratings, test_set$rating)
rmse_results <- bind_rows(rmse_results,
data_frame(method="Regularized Movie Effect Model",
RMSE = model_3_rmse ))
rmse_results %>% knitr::kable()
method | RMSE |
---|---|
Just the average | 1.0482202 |
Movie Effect Model | 0.9862839 |
Movie + User Effects Model | 0.8848688 |
Regularized Movie Effect Model | 0.9649457 |
lambdas <- seq(0, 10, 0.25)
mu <- mean(train_set$rating)
just_the_sum <- train_set %>%
group_by(movieId) %>%
summarize(s = sum(rating - mu), n_i = n())
rmses <- sapply(lambdas, function(l){
predicted_ratings <- test_set %>%
left_join(just_the_sum, by='movieId') %>%
mutate(b_i = s/(n_i+l)) %>%
mutate(pred = mu + b_i) %>%
.$pred
return(RMSE(predicted_ratings, test_set$rating))
})
qplot(lambdas, rmses)
lambdas[which.min(rmses)]
[1] 3
lambdas <- seq(0, 10, 0.25)
rmses <- sapply(lambdas, function(l){
mu <- mean(train_set$rating)
b_i <- train_set %>%
group_by(movieId) %>%
summarize(b_i = sum(rating - mu)/(n()+l))
b_u <- train_set %>%
left_join(b_i, by="movieId") %>%
group_by(userId) %>%
summarize(b_u = sum(rating - b_i - mu)/(n()+l))
predicted_ratings <-
test_set %>%
left_join(b_i, by = "movieId") %>%
left_join(b_u, by = "userId") %>%
mutate(pred = mu + b_i + b_u) %>%
.$pred
return(RMSE(predicted_ratings, test_set$rating))
})
qplot(lambdas, rmses)
lambda <- lambdas[which.min(rmses)]
lambda
[1] 3.75
rmse_results <- bind_rows(rmse_results,
data_frame(method="Regularized Movie + User Effect Model",
RMSE = min(rmses)))
rmse_results %>% knitr::kable()
method | RMSE |
---|---|
Just the average | 1.0482202 |
Movie Effect Model | 0.9862839 |
Movie + User Effects Model | 0.8848688 |
Regularized Movie Effect Model | 0.9649457 |
Regularized Movie + User Effect Model | 0.8806419 |
The exercises in Q1-Q8 work with a simulated dataset for 1000 schools. This pre-exercise setup walks you through the code needed to simulate the dataset. An education expert is advocating for smaller schools. The expert bases this recommendation on the fact that among the best performing schools, many are small schools. Let’s simulate a dataset for 1000 schools. First, let’s simulate the number of students in each school, using the following code:
set.seed(1986, sample.kind="Rounding")
non-uniform 'Rounding' sampler used
n <- round(2^rnorm(1000, 8, 1))
Now let’s assign a true quality for each school that is completely independent from size. This is the parameter we want to estimate in our analysis. The true quality can be assigned using the following code:
set.seed(1, sample.kind="Rounding")
non-uniform 'Rounding' sampler used
mu <- round(80 + 2*rt(1000, 5))
range(mu)
[1] 67 94
schools <- data.frame(id = paste("PS",1:1000),
size = n,
quality = mu,
rank = rank(-mu))
We can see the top 10 schools using this code:
schools %>%
top_n(10, quality) %>%
arrange(desc(quality))
Now let’s have the students in the school take a test. There is random variability in test taking, so we will simulate the test scores as normally distributed with the average determined by the school quality with a standard deviation of 30 percentage points. This code will simulate the test scores:
set.seed(1, sample.kind="Rounding")
non-uniform 'Rounding' sampler used
mu <- round(80 + 2*rt(1000, 5))
scores <- sapply(1:nrow(schools), function(i){
scores <- rnorm(schools$size[i], schools$quality[i], 30)
scores
})
schools <- schools %>% mutate(score = sapply(scores, mean))
head(schools)
What are the top schools based on the average score? Show just the ID, size, and the average score. Report the ID of the top school and average score of the 10th school. What is the ID of the top school? Note that the school IDs are given in the form “PS x” - where x is a number. Report the number only. Q: What is the average score of the 10th school?
A:
schools_top10 <- schools %>%
top_n(10, score) %>%
arrange(desc(score))
schools_top10
Compare the median school size to the median school size of the top 10 schools based on the score. Q: What is the median school size overall?
Q: What is the median school size of the of the top 10 schools based on the score?
schools %>%
summarize(median(size))
schools_top10 %>%
summarize(median(size))
According to this analysis, it appears that small schools produce better test scores than large schools. Four out of the top 10 schools have 100 or fewer students. But how can this be? We constructed the simulation so that quality and size were independent. Repeat the exercise for the worst 10 schools. Q: What is the median school size of the bottom 10 schools based on the score?
schools_bottom10 <- schools %>%
top_n(10, -score) %>%
arrange(score)
schools_bottom10
schools_bottom10 %>%
summarize(median(size))
From this analysis, we see that the worst schools are also small. Plot the average score versus school size to see what’s going on. Highlight the top 10 schools based on the true quality. Q: What do you observe?
A: The standard error of the score has larger variability when the school is smaller, which is why both the best and the worst schools are more likely to be small.
schools %>% ggplot(aes(size, score)) +
geom_point(alpha = 0.5) +
geom_point(data = filter(schools, rank<=10), col = 2)
Let’s use regularization to pick the best schools. Remember regularization shrinks deviations from the average towards 0. To apply regularization here, we first need to define the overall average for all schools, using the following code:
overall <- mean(sapply(scores, mean))
Then, we need to define, for each school, how it deviates from that average. Write code that estimates the score above the average for each school but dividing by \(n+\alpha\) instead of \(n\), with \(n\) the school size and \(\alpha\) a regularization parameter. Try \(\alpha = 25\). Q: What is the ID of the top school with regularization?
Q: What is the regularized score of the 10th school?
alpha <- 25
schools5 <- schools %>%
mutate(score_dev = overall + (score - overall) * size / (size + alpha)) %>%
arrange(desc(score_dev))
# mutate(quality_new = score_dev-80)
schools5 %>%
top_n(10, score_dev)
Notice that this improves things a bit. The number of small schools that are not highly ranked is now lower. Is there a better \(\alpha\)?
Using values of \(\alpha\) from 10 to 250, find the \(alpha\) that minimizes the RMSE. Q: What value of \(\alpha\) gives the minimum RMSE? A:
alphas <- seq(10,250)
rmses <- sapply(alphas, function(alpha){
schools %>%
mutate(score_dev = overall + (score - overall) * size / (size + alpha)) %>%
summarize(rmse = sqrt(1/1000 * sum((score_dev-quality)^2))) %>%
pull(rmse)
})
# solution: other way for same result
# rmses <- sapply(alphas, function(alpha){
# score_reg <- sapply(scores, function(x) overall+sum(x-overall)/(length(x)+alpha))
# sqrt(mean((score_reg - schools$quality)^2))
# })
alphas[which.min(rmses)]
[1] 135
plot(alphas, rmses)
Rank the schools based on the average obtained with the best \(\alpha\). Note that no small school is incorrectly included. Q: What is the ID of the top school now?
Q: What is the regularized average score of the 10th school now?
alpha_best <- 135
schools7 <- schools %>%
mutate(score_dev = overall + (score - overall) * size / (size + alpha_best)) %>%
arrange(desc(score_dev)) %>%
top_n(10, score_dev)
schools7
A common mistake made when using regularization is shrinking values towards 0 that are not centered around 0. For example, if we don’t subtract the overall average before shrinking, we actually obtain a very similar result. Confirm this by re-running the code from the exercise in Q6 but without removing the overall mean. What value of \(\alpha\) gives the minimum RMSE here?
alphas <- seq(10,250)
rmses <- sapply(alphas, function(alpha){
score_reg <- sapply(scores, function(x) sum(x)/(length(x)+alpha))
sqrt(mean((score_reg - schools$quality)^2))
})
alphas[which.min(rmses)]
[1] 10
plot(alphas, rmses)
Key points * Our earlier models fail to account for an important source of variation related to the fact that groups of movies and groups of users have similar rating patterns. We can observe these patterns by studying the residuals and converting our data into a matrix where each user gets a row and each movie gets a column: \[ r_{u,i} = y_{u,i} - \hat{b}_i - \hat{b}_u\] where \(y_{u,i}\) is the entry in row \(u\) and column \(i\). * We can factorize the matrix of residuals \(r\) into a vector \(p\) and vector \(q\), \(r_{u,i} = p_u q_i\), allowing us to explain more of the variance using a model like this: \[Y_{u,i} = \mu + b_i + b_u + p_u q_i + \epsilon_{i,j}\] * Because our example is more complicated, we can use two factors to explain the structure and two sets of coefficients to describe users: \[Y_{u,i} = \mu + b_i + b_u + p_{u,1} q_{1,i} + p_{u,2} q_{2,i} + \epsilon_{i,j}\] * To estimate factors using our data instead of constructing them ourselves, we can use principal component analysis (PCA) or singular value decomposition (SVD). Code
train_small <- movielens %>%
group_by(movieId) %>%
filter(n() >= 50 | movieId == 3252) %>% ungroup() %>% #3252 is Scent of a Woman used in example
group_by(userId) %>%
filter(n() >= 50) %>% ungroup()
y <- train_small %>%
select(userId, movieId, rating) %>%
spread(movieId, rating) %>%
as.matrix()
rownames(y)<- y[,1]
y <- y[,-1]
colnames(y) <- with(movie_titles, title[match(colnames(y), movieId)])
y <- sweep(y, 1, rowMeans(y, na.rm=TRUE))
y <- sweep(y, 2, colMeans(y, na.rm=TRUE))
m_1 <- "Godfather, The"
m_2 <- "Godfather: Part II, The"
qplot(y[ ,m_1], y[,m_2], xlab = m_1, ylab = m_2)
m_1 <- "Godfather, The"
m_3 <- "Goodfellas"
qplot(y[ ,m_1], y[,m_3], xlab = m_1, ylab = m_3)
m_4 <- "You've Got Mail"
m_5 <- "Sleepless in Seattle"
qplot(y[ ,m_4], y[,m_5], xlab = m_4, ylab = m_5)
cor(y[, c(m_1, m_2, m_3, m_4, m_5)], use="pairwise.complete") %>%
knitr::kable()
Godfather, The | Godfather: Part II, The | Goodfellas | You’ve Got Mail | Sleepless in Seattle | |
---|---|---|---|---|---|
Godfather, The | 1.0000000 | 0.8320756 | 0.4541425 | -0.4535093 | -0.3540335 |
Godfather: Part II, The | 0.8320756 | 1.0000000 | 0.5400558 | -0.3377691 | -0.3259897 |
Goodfellas | 0.4541425 | 0.5400558 | 1.0000000 | -0.4894054 | -0.3672836 |
You’ve Got Mail | -0.4535093 | -0.3377691 | -0.4894054 | 1.0000000 | 0.5423584 |
Sleepless in Seattle | -0.3540335 | -0.3259897 | -0.3672836 | 0.5423584 | 1.0000000 |
set.seed(1)
options(digits = 2)
Q <- matrix(c(1 , 1, 1, -1, -1), ncol=1)
rownames(Q) <- c(m_1, m_2, m_3, m_4, m_5)
P <- matrix(rep(c(2,0,-2), c(3,5,4)), ncol=1)
rownames(P) <- 1:nrow(P)
X <- jitter(P%*%t(Q))
X %>% knitr::kable(align = "c")
Godfather, The | Godfather: Part II, The | Goodfellas | You’ve Got Mail | Sleepless in Seattle |
---|---|---|---|---|
1.81 | 2.15 | 1.81 | -1.76 | -1.81 |
1.90 | 1.91 | 1.91 | -2.31 | -1.85 |
2.06 | 2.22 | 1.61 | -1.82 | -2.02 |
0.33 | 0.00 | -0.09 | -0.07 | 0.29 |
-0.24 | 0.17 | 0.30 | 0.26 | -0.05 |
0.32 | 0.39 | -0.13 | 0.12 | -0.20 |
0.36 | -0.10 | -0.01 | 0.23 | -0.34 |
0.13 | 0.22 | 0.08 | 0.04 | -0.32 |
-1.90 | -1.65 | -2.01 | 2.02 | 1.85 |
-2.35 | -2.23 | -2.25 | 2.23 | 2.01 |
-2.24 | -1.88 | -1.74 | 1.62 | 2.13 |
-2.26 | -2.30 | -1.87 | 1.98 | 1.93 |
cor(X)
Godfather, The Godfather: Part II, The Goodfellas You've Got Mail Sleepless in Seattle
Godfather, The 1.00 0.99 0.98 -0.98 -0.99
Godfather: Part II, The 0.99 1.00 0.99 -0.98 -0.99
Goodfellas 0.98 0.99 1.00 -0.99 -0.99
You've Got Mail -0.98 -0.98 -0.99 1.00 0.98
Sleepless in Seattle -0.99 -0.99 -0.99 0.98 1.00
t(Q) %>% knitr::kable(aling="c")
Godfather, The | Godfather: Part II, The | Goodfellas | You’ve Got Mail | Sleepless in Seattle |
---|---|---|---|---|
1 | 1 | 1 | -1 | -1 |
P
[,1]
1 2
2 2
3 2
4 0
5 0
6 0
7 0
8 0
9 -2
10 -2
11 -2
12 -2
set.seed(1)
options(digits = 2)
m_6 <- "Scent of a Woman"
Q <- cbind(c(1 , 1, 1, -1, -1, -1),
c(1 , 1, -1, -1, -1, 1))
rownames(Q) <- c(m_1, m_2, m_3, m_4, m_5, m_6)
P <- cbind(rep(c(2,0,-2), c(3,5,4)),
c(-1,1,1,0,0,1,1,1,0,-1,-1,-1))/2
rownames(P) <- 1:nrow(X)
X <- jitter(P%*%t(Q), factor=1)
X %>% knitr::kable(align = "c")
Godfather, The | Godfather: Part II, The | Goodfellas | You’ve Got Mail | Sleepless in Seattle | Scent of a Woman |
---|---|---|---|---|---|
0.45 | 0.54 | 1.45 | -0.44 | -0.45 | -1.42 |
1.47 | 1.48 | 0.48 | -1.58 | -1.46 | -0.54 |
1.51 | 1.55 | 0.40 | -1.46 | -1.50 | -0.51 |
0.08 | 0.00 | -0.02 | -0.02 | 0.07 | -0.03 |
-0.06 | 0.04 | 0.07 | 0.06 | -0.01 | 0.03 |
0.58 | 0.60 | -0.53 | -0.47 | -0.55 | 0.45 |
0.59 | 0.48 | -0.50 | -0.44 | -0.59 | 0.50 |
0.53 | 0.56 | -0.48 | -0.49 | -0.58 | 0.55 |
-0.97 | -0.91 | -1.00 | 1.01 | 0.96 | 0.92 |
-1.59 | -1.56 | -0.56 | 1.56 | 1.50 | 0.58 |
-1.56 | -1.47 | -0.43 | 1.40 | 1.53 | 0.47 |
-1.56 | -1.57 | -0.47 | 1.50 | 1.48 | 0.57 |
cor(X)
Godfather, The Godfather: Part II, The Goodfellas You've Got Mail Sleepless in Seattle
Godfather, The 1.00 1.00 0.53 -1.00 -1.00
Godfather: Part II, The 1.00 1.00 0.55 -1.00 -1.00
Goodfellas 0.53 0.55 1.00 -0.55 -0.53
You've Got Mail -1.00 -1.00 -0.55 1.00 1.00
Sleepless in Seattle -1.00 -1.00 -0.53 1.00 1.00
Scent of a Woman -0.57 -0.59 -0.99 0.60 0.57
Scent of a Woman
Godfather, The -0.57
Godfather: Part II, The -0.59
Goodfellas -0.99
You've Got Mail 0.60
Sleepless in Seattle 0.57
Scent of a Woman 1.00
t(Q) %>% knitr::kable(aling="c")
Godfather, The | Godfather: Part II, The | Goodfellas | You’ve Got Mail | Sleepless in Seattle | Scent of a Woman |
---|---|---|---|---|---|
1 | 1 | 1 | -1 | -1 | -1 |
1 | 1 | -1 | -1 | -1 | 1 |
P
[,1] [,2]
1 1 -0.5
2 1 0.5
3 1 0.5
4 0 0.0
5 0 0.0
6 0 0.5
7 0 0.5
8 0 0.5
9 -1 0.0
10 -1 -0.5
11 -1 -0.5
12 -1 -0.5
six_movies <- c(m_1, m_2, m_3, m_4, m_5, m_6)
tmp <- y[,six_movies]
cor(tmp, use="pairwise.complete")
Godfather, The Godfather: Part II, The Goodfellas You've Got Mail Sleepless in Seattle
Godfather, The 1.00 0.83 0.45 -0.45 -0.35
Godfather: Part II, The 0.83 1.00 0.54 -0.34 -0.33
Goodfellas 0.45 0.54 1.00 -0.49 -0.37
You've Got Mail -0.45 -0.34 -0.49 1.00 0.54
Sleepless in Seattle -0.35 -0.33 -0.37 0.54 1.00
Scent of a Woman 0.07 0.14 -0.17 -0.20 -0.18
Scent of a Woman
Godfather, The 0.07
Godfather: Part II, The 0.14
Goodfellas -0.17
You've Got Mail -0.20
Sleepless in Seattle -0.18
Scent of a Woman 1.00
Key points * You can think of singular value decomposition (SVD) as an algorithm that finds the vectors \(p\) and \(q\) that permit us to write the matrix of residuals \(r\) with \(m\) rows and \(n\) columns in the following way: \[r_{u,i} = p_{u,1} q_{1,i} + p_{u,2} q_{2,i} + p_{u,m} q_{m,i}\] with the variability of these terms decreasing and the \(p\)’s uncorrelated to each other. * SVD also computes the variabilities so that we can know how much of the matrix’s total variability is explained as we add new terms. * The vectors \(q\) are called the principal components and the vectors \(p\) are the user effects. By using principal components analysis (PCA), matrix factorization can capture structure in the data determined by user opinions about movies. Code
y[is.na(y)] <- 0
y <- sweep(y, 1, rowMeans(y))
pca <- prcomp(y)
dim(pca$rotation)
[1] 454 292
dim(pca$x)
[1] 292 292
plot(pca$sdev)
var_explained <- cumsum(pca$sdev^2/sum(pca$sdev^2))
plot(var_explained)
library(ggrepel)
pcs <- data.frame(pca$rotation, name = colnames(y))
pcs %>% ggplot(aes(PC1, PC2)) + geom_point() +
geom_text_repel(aes(PC1, PC2, label=name),
data = filter(pcs,
PC1 < -0.1 | PC1 > 0.1 | PC2 < -0.075 | PC2 > 0.1))
pcs %>% select(name, PC1) %>% arrange(PC1) %>% slice(1:10)
pcs %>% select(name, PC1) %>% arrange(desc(PC1)) %>% slice(1:10)
pcs %>% select(name, PC2) %>% arrange(PC2) %>% slice(1:10)
pcs %>% select(name, PC2) %>% arrange(desc(PC2)) %>% slice(1:10)
In this exercise set, we will be covering a topic useful for understanding matrix factorization: the singular value decomposition (SVD). SVD is a mathematical result that is widely used in machine learning, both in practice and to understand the mathematical properties of some algorithms. This is a rather advanced topic and to complete this exercise set you will have to be familiar with linear algebra concepts such as matrix multiplication, orthogonal matrices, and diagonal matrices. The SVD tells us that we can decompose an \(N*p\) matrix \(Y\) with \(p < N\) as \[Y = UDV^T \] with \(U\) and \(V\) orthogonal of dimensions \(N*p\) and \(p*p\) respectively and \(D\) a \(p*p\) diagonal matrix with the values of the diagonal decreasing: \[d_{1,1} \ge d_{2,2} \ge ... \ge d_{p,p}\] In this exercise, we will see one of the ways that this decomposition can be useful. To do this, we will construct a dataset that represents grade scores for 100 students in 24 different subjects. The overall average has been removed so this data represents the percentage point each student received above or below the average test score. So a 0 represents an average grade (C), a 25 is a high grade (A+), and a -25 represents a low grade (F). You can simulate the data like this:
set.seed(1987, sample.kind="Rounding")
non-uniform 'Rounding' sampler used
n <- 100
k <- 8
Sigma <- 64 * matrix(c(1, .75, .5, .75, 1, .5, .5, .5, 1), 3, 3)
m <- MASS::mvrnorm(n, rep(0, 3), Sigma)
m <- m[order(rowMeans(m), decreasing = TRUE),]
y <- m %x% matrix(rep(1, k), nrow = 1) + matrix(rnorm(matrix(n*k*3)), n, k*3)
colnames(y) <- c(paste(rep("Math",k), 1:k, sep="_"),
paste(rep("Science",k), 1:k, sep="_"),
paste(rep("Arts",k), 1:k, sep="_"))
Our goal is to describe the student performances as succinctly as possible. For example, we want to know if these test results are all just a random independent numbers. Are all students just about as good? Does being good in one subject imply you will be good in another? How does the SVD help with all this? We will go step by step to show that with just three relatively small pairs of vectors we can explain much of the variability in this 100×24 dataset. #### Question 1 You can visualize the 24 test scores for the 100 students by plotting an image: Q: How would you describe the data based on this figure?
A: The students that test well are at the top of the image and there seem to be three groupings by subject
my_image <- function(x, zlim = range(x), ...){
colors = rev(RColorBrewer::brewer.pal(9, "RdBu"))
cols <- 1:ncol(x)
rows <- 1:nrow(x)
image(cols, rows, t(x[rev(rows),,drop=FALSE]), xaxt = "n", yaxt = "n",
xlab="", ylab="", col = colors, zlim = zlim, ...)
abline(h=rows + 0.5, v = cols + 0.5)
axis(side = 1, cols, colnames(x), las = 2)
}
my_image(y)
You can examine the correlation between the test scores directly like this. Q: Which of the following best describes what you see?
A: There is correlation among all tests, but higher if the tests are in science and math and even higher within each subject.
my_image(cor(y), zlim = c(-1,1))
range(cor(y))
[1] 0.49 1.00
axis(side = 2, 1:ncol(y), rev(colnames(y)), las = 2)
Remember that orthogonality means that \(U^T U\) and \(V^T V\) are equal to the identity matrix. This implies that we can also rewrite the decomposition as \[YV = UD\ or\ U^TY = DV^T\] We can think of \(YV\) and \(U^TY\) as two transformations of \(Y\) that preserve the total variability of \(Y\) since \(U\) and \(V\) are orthogonal. Use the function svd to compute the SVD of \(y\). This function will return \(U\), \(V\) and the diagonal entries of \(D\).
s <- svd(y)
names(s)
[1] "d" "u" "v"
You can check that the SVD works by typing:
y_svd <- s$u %*% diag(s$d) %*% t(s$v)
max(abs(y - y_svd))
[1] 5.3e-14
Compute the sum of squares of the columns of \(Y\) and store them in \(ss_y\). Then compute the sum of squares of columns of the transformed \(YV\) and store them in \(ss_yv\). Confirm that \(sum(ss_y) = sum(ss_yv)\). Q: What is the value of sum(ss_y) (and also the value of sum(ss_yv))?
A:
ss_y <- apply(y^2, 2, sum)
ss_yv <- apply((y%*%s$v)^2, 2, sum)
sum(ss_y)
[1] 175435
sum(ss_yv)
[1] 175435
We see that the total sum of squares is preserved. This is because \(V\) is orthogonal. Now to start understanding how \(YV\) is useful, plot \(ss_y\) against the column number and then do the same for \(ss_yv\). Q: What do you observe?
A: ss_yv is decreasing and close to 0 for the 4th column and beyond
plot(seq(1,ncol(y)), ss_y)
plot(seq(1,ncol(y)), ss_yv)
Now notice that we didn’t have to compute ss_yv because we already have the answer. How? Remember that \(YV = UD\) and because \(U\) is orthogonal, we know that the sum of squares of the columns of \(UD\) are the diagonal entries of \(D\) squared. Confirm this by plotting the square root of ss_yv versus the diagonal entries of \(D\).
plot(s$d, sqrt(ss_yv))
So from the above we know that the sum of squares of the columns of \(Y\) (the total sum of squares) adds up to the sum of s$d^2 and that the transformation \(YV\) gives us columns with sums of squares equal to s$d^2. Now compute the percent of the total variability that is explained by just the first three columns of YV. What proportion of the total variability is explained by the first three columns of \(YV\)?
sum(s$d[1:3]^2) / sum(s$d^2)
[1] 0.99
Use the sweep function to compute \(UD\) without constructing diag(s$d) or using matrix multiplication.
identical(s$u %*% diag(s$d), sweep(s$u, 2, s$d, FUN = "*"))
[1] TRUE
We know that \(U_1d_{1,1}\), the first column of \(UD\), has the most variability of all the columns of \(UD\). Earlier we looked at an image of \(Y\) using my_image(y), in which we saw that the student to student variability is quite large and that students that are good in one subject tend to be good in all. This implies that the average (across all subjects) for each student should explain a lot of the variability.
Compute the average score for each student, plot it against \(U_1d_{1,1}\) and describe what you find. Q: What do you observe? A:
rowMeans(y)
[1] 17.535 15.641 13.862 13.525 13.614 11.751 10.686 10.789 10.814 10.057 9.509 9.352 9.480
[14] 8.911 8.569 9.140 8.708 8.696 7.641 7.347 7.213 7.378 7.424 6.449 6.565 5.960
[27] 5.395 4.825 5.194 5.569 5.637 5.351 5.089 4.788 4.368 4.144 4.048 3.524 3.193
[40] 3.702 3.628 3.113 3.016 2.702 2.933 2.390 2.001 1.712 1.784 1.239 1.514 1.446
[53] 1.082 0.946 1.593 1.150 1.030 1.056 1.232 0.439 0.040 0.384 0.052 -0.125 -1.653
[66] -1.548 -1.412 -1.573 -1.437 -1.816 -2.442 -2.289 -2.050 -2.763 -2.694 -2.686 -3.035 -3.061
[79] -3.403 -3.394 -3.439 -3.702 -3.979 -4.106 -4.397 -4.947 -5.609 -6.203 -6.494 -6.761 -6.691
[92] -7.089 -7.066 -9.238 -11.930 -12.206 -12.632 -18.628 -19.490 -19.808
UD <- sweep(s$u, 2, s$d, FUN = "*")
plot(UD[,1], rowMeans(y))
We note that the signs in SVD are arbitrary because: \[UDV^T = (-U)D(-V)^T\] With this in mind we see that the first column of \(UD\) is almost identical to the average score for each student except for the sign. This implies that multiplying \(Y\) by the first column of \(V\) must be performing a similar operation to taking the average. Make an image plot of \(V\) and describe the first column relative to others and how this relates to taking an average. Q: How does the first column relate to the others, and how does this relate to taking an average? A: The first column is very close to being a constant, which implies that the first column of YV is the sum of the rows of Y multiplied by some constant, and is thus proportional to an average.
my_image(s$v)
These exercises will work with the tissue_gene_expression dataset, which is part of the dslabs package. #### Question 1 Load the tissue_gene_expression dataset. Remove the row means and compute the distance between each observation. Store the result in d. Q: Which of the following lines of code correctly does this computation?
A:
library(dslabs)
library(tidyverse)
data("tissue_gene_expression")
#x <- tissue_gene_expression$x
#y <- tissue_gene_expression$y
d <- dist(tissue_gene_expression$x - rowMeans(tissue_gene_expression$x))
Make a hierarchical clustering plot and add the tissue types as labels. You will observe multiple branches. Q: Which tissue type is in the branch farthest to the left?
h <- hclust(d)
plot(h)
Run a k-means clustering on the data with \(K=7\). Make a table comparing the identified clusters to the actual tissue types. Run the algorithm several times to see how the answer changes. Q: What do you observe for the clustering of the liver tissue?
A: Liver is split into two clusters (one large and one small) about 60% of the time. The other 40% of the time it is either in a single cluster or in three clusters at roughly equal frequency.
cl <- kmeans(tissue_gene_expression$x, centers = 7)
table(cl$cluster, tissue_gene_expression$y)
cerebellum colon endometrium hippocampus kidney liver placenta
1 0 34 1 0 0 0 6
2 0 0 0 0 0 6 0
3 0 0 14 0 36 0 0
4 36 0 0 29 0 0 0
5 0 0 0 0 0 14 0
6 0 0 0 0 0 6 0
7 2 0 0 2 3 0 0
Select the 50 most variable genes. Make sure the observations show up in the columns, that the predictor are centered, and add a color bar to show the different tissue types.
Hint: use the ColSideColors argument to assign colors. Also, use \(col = RColorBrewer::brewer.pal(11, "RdBu")\) for a better use of colors. Part of the code is provided for you here: Q: Which line of code should replace #BLANK in the code above?
A: cf. below
library(RColorBrewer)
sds <- matrixStats::colSds(tissue_gene_expression$x)
ind <- order(sds, decreasing = TRUE)[1:50]
colors <- brewer.pal(7, "Dark2")[as.numeric(tissue_gene_expression$y)]
#BLANK
heatmap(t(tissue_gene_expression$x[,ind]), col = brewer.pal(11, "RdBu"), scale = "row", ColSideColors = colors)