knitr::opts_chunk$set(echo = TRUE)
#To improve our results, we will use regularization. Regularization constrains the total variability of the #effect sizes by penalizing large estimates that come from small sample sizes.
#To estimate the b ’s, we will now minimize this equation, which contains a penalty term:
#1N∑u,i(yu,i−μ−bi)2+λ∑ib2i
#The first term is the mean squared error and the second is a penalty term that gets larger when many b ’s #are large.
#The values of b that minimize this equation are given by:
#b^i(λ)=1λ+ni∑u=1ni(Yu,i−μ^),
#where ni is a number of ratings b for movie i .
#The larger λ is, the more we shrink. λ is a tuning parameter, so we can use cross-validation to choose #it. We should be using full cross-validation on just the training set, without using the test set until the #final assessment.
#We can also use regularization to estimate the user effect. We will now minimize this equation:
#1N∑u,i(yu,i−μ−bi−bu)2+λ(∑ib2i+∑ub2u)
#Code
#Please refer to the textbook for an updated version of the code that may contain some corrections.
library(dslabs)
## Warning: package 'dslabs' was built under R version 3.6.3
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.6.3
## -- Attaching packages ----------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.0 v purrr 0.3.3
## v tibble 3.0.0 v dplyr 0.8.5
## v tidyr 1.0.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## Warning: package 'ggplot2' was built under R version 3.6.3
## Warning: package 'tibble' was built under R version 3.6.3
## Warning: package 'tidyr' was built under R version 3.6.3
## Warning: package 'purrr' was built under R version 3.6.3
## Warning: package 'dplyr' was built under R version 3.6.3
## -- Conflicts -------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
library(caret)
## Warning: package 'caret' was built under R version 3.6.3
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 3.6.3
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
data("movielens")
set.seed(755)
test_index <- createDataPartition(y = movielens$rating, times = 1,
p = 0.2, list = FALSE)
train_set <- movielens[-test_index,]
test_set <- movielens[test_index,]
test_set <- test_set %>%
semi_join(train_set, by = "movieId") %>%
semi_join(train_set, by = "userId")
RMSE <- function(true_ratings, predicted_ratings){
sqrt(mean((true_ratings - predicted_ratings)^2))
}
mu_hat <- mean(train_set$rating)
naive_rmse <- RMSE(test_set$rating, mu_hat)
rmse_results <- data_frame(method = "Just the average", RMSE = naive_rmse)
## Warning: `data_frame()` is deprecated as of tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
mu <- mean(train_set$rating)
movie_avgs <- train_set %>%
group_by(movieId) %>%
summarize(b_i = mean(rating - mu))
predicted_ratings <- mu + test_set %>%
left_join(movie_avgs, by='movieId') %>%
.$b_i
model_1_rmse <- RMSE(predicted_ratings, test_set$rating)
rmse_results <- bind_rows(rmse_results,
data_frame(method="Movie Effect Model",
RMSE = model_1_rmse ))
user_avgs <- test_set %>%
left_join(movie_avgs, by='movieId') %>%
group_by(userId) %>%
summarize(b_u = mean(rating - mu - b_i))
predicted_ratings <- test_set %>%
left_join(movie_avgs, by='movieId') %>%
left_join(user_avgs, by='userId') %>%
mutate(pred = mu + b_i + b_u) %>%
.$pred
model_2_rmse <- RMSE(predicted_ratings, test_set$rating)
rmse_results <- bind_rows(rmse_results,
data_frame(method="Movie + User Effects Model",
RMSE = model_2_rmse ))
test_set %>%
left_join(movie_avgs, by='movieId') %>%
mutate(residual = rating - (mu + b_i)) %>%
arrange(desc(abs(residual))) %>%
select(title, residual) %>% slice(1:10) %>% knitr::kable()
| Kingdom, The (Riget) |
-4.000000 |
| Heaven Knows, Mr. Allison |
-4.000000 |
| American Pimp |
-4.000000 |
| Chinatown |
-3.830357 |
| American Beauty |
-3.739011 |
| Apocalypse Now |
-3.729412 |
| Taxi Driver |
-3.714286 |
| Wallace & Gromit: A Close Shave |
-3.696078 |
| Down in the Delta |
-3.666667 |
| Stalag 17 |
-3.647059 |
movie_titles <- movielens %>%
select(movieId, title) %>%
distinct()
movie_avgs %>% left_join(movie_titles, by="movieId") %>%
arrange(desc(b_i)) %>%
select(title, b_i) %>%
slice(1:10) %>%
knitr::kable()
| When Night Is Falling |
1.456695 |
| Lamerica |
1.456695 |
| Mute Witness |
1.456695 |
| Picture Bride (Bijo photo) |
1.456695 |
| Red Firecracker, Green Firecracker (Pao Da Shuang Deng) |
1.456695 |
| Paris, France |
1.456695 |
| Faces |
1.456695 |
| Maya Lin: A Strong Clear Vision |
1.456695 |
| Heavy |
1.456695 |
| Gate of Heavenly Peace, The |
1.456695 |
movie_avgs %>% left_join(movie_titles, by="movieId") %>%
arrange(b_i) %>%
select(title, b_i) %>%
slice(1:10) %>%
knitr::kable()
| Children of the Corn IV: The Gathering |
-3.043305 |
| Barney’s Great Adventure |
-3.043305 |
| Merry War, A |
-3.043305 |
| Whiteboyz |
-3.043305 |
| Catfish in Black Bean Sauce |
-3.043305 |
| Killer Shrews, The |
-3.043305 |
| Horrors of Spider Island (Ein Toter Hing im Netz) |
-3.043305 |
| Monkeybone |
-3.043305 |
| Arthur 2: On the Rocks |
-3.043305 |
| Red Heat |
-3.043305 |
train_set %>% dplyr::count(movieId) %>%
left_join(movie_avgs) %>%
left_join(movie_titles, by="movieId") %>%
arrange(desc(b_i)) %>%
select(title, b_i, n) %>%
slice(1:10) %>%
knitr::kable()
## Joining, by = "movieId"
| When Night Is Falling |
1.456695 |
1 |
| Lamerica |
1.456695 |
1 |
| Mute Witness |
1.456695 |
1 |
| Picture Bride (Bijo photo) |
1.456695 |
1 |
| Red Firecracker, Green Firecracker (Pao Da Shuang Deng) |
1.456695 |
3 |
| Paris, France |
1.456695 |
1 |
| Faces |
1.456695 |
1 |
| Maya Lin: A Strong Clear Vision |
1.456695 |
2 |
| Heavy |
1.456695 |
1 |
| Gate of Heavenly Peace, The |
1.456695 |
1 |
train_set %>% dplyr::count(movieId) %>%
left_join(movie_avgs) %>%
left_join(movie_titles, by="movieId") %>%
arrange(b_i) %>%
select(title, b_i, n) %>%
slice(1:10) %>%
knitr::kable()
## Joining, by = "movieId"
| Children of the Corn IV: The Gathering |
-3.043305 |
1 |
| Barney’s Great Adventure |
-3.043305 |
1 |
| Merry War, A |
-3.043305 |
1 |
| Whiteboyz |
-3.043305 |
1 |
| Catfish in Black Bean Sauce |
-3.043305 |
1 |
| Killer Shrews, The |
-3.043305 |
1 |
| Horrors of Spider Island (Ein Toter Hing im Netz) |
-3.043305 |
1 |
| Monkeybone |
-3.043305 |
1 |
| Arthur 2: On the Rocks |
-3.043305 |
1 |
| Red Heat |
-3.043305 |
1 |
lambda <- 3
mu <- mean(train_set$rating)
movie_reg_avgs <- train_set %>%
group_by(movieId) %>%
summarize(b_i = sum(rating - mu)/(n()+lambda), n_i = n())
data_frame(original = movie_avgs$b_i,
regularlized = movie_reg_avgs$b_i,
n = movie_reg_avgs$n_i) %>%
ggplot(aes(original, regularlized, size=sqrt(n))) +
geom_point(shape=1, alpha=0.5)

train_set %>%
dplyr::count(movieId) %>%
left_join(movie_reg_avgs) %>%
left_join(movie_titles, by="movieId") %>%
arrange(desc(b_i)) %>%
select(title, b_i, n) %>%
slice(1:10) %>%
knitr::kable()
## Joining, by = "movieId"
| Paris Is Burning |
1.0196864 |
7 |
| Shawshank Redemption, The |
0.9789475 |
250 |
| Godfather, The |
0.9371013 |
169 |
| African Queen, The |
0.9248324 |
40 |
| Band of Brothers |
0.8881906 |
17 |
| Paperman |
0.8600188 |
6 |
| On the Waterfront |
0.8404850 |
26 |
| All About Eve |
0.8075258 |
33 |
| Usual Suspects, The |
0.8062874 |
169 |
| Ikiru |
0.8044632 |
6 |
train_set %>%
dplyr::count(movieId) %>%
left_join(movie_reg_avgs) %>%
left_join(movie_titles, by="movieId") %>%
arrange(b_i) %>%
select(title, b_i, n) %>%
slice(1:10) %>%
knitr::kable()
## Joining, by = "movieId"
| Battlefield Earth |
-1.926883 |
11 |
| Joe’s Apartment |
-1.695537 |
6 |
| Super Mario Bros. |
-1.647199 |
15 |
| Speed 2: Cruise Control |
-1.581135 |
20 |
| Dungeons & Dragons |
-1.576949 |
8 |
| Batman & Robin |
-1.539974 |
36 |
| Police Academy 6: City Under Siege |
-1.534026 |
11 |
| Cats & Dogs |
-1.524746 |
4 |
| Disaster Movie |
-1.521653 |
3 |
| Mighty Morphin Power Rangers: The Movie |
-1.494850 |
10 |
predicted_ratings <- test_set %>%
left_join(movie_reg_avgs, by='movieId') %>%
mutate(pred = mu + b_i) %>%
.$pred
model_3_rmse <- RMSE(predicted_ratings, test_set$rating)
rmse_results <- bind_rows(rmse_results,
data_frame(method="Regularized Movie Effect Model",
RMSE = model_3_rmse ))
rmse_results %>% knitr::kable()
| Just the average |
1.0528047 |
| Movie Effect Model |
0.9885931 |
| Movie + User Effects Model |
0.8812843 |
| Regularized Movie Effect Model |
0.9699530 |
lambdas <- seq(0, 10, 0.25)
mu <- mean(train_set$rating)
just_the_sum <- train_set %>%
group_by(movieId) %>%
summarize(s = sum(rating - mu), n_i = n())
rmses <- sapply(lambdas, function(l){
predicted_ratings <- test_set %>%
left_join(just_the_sum, by='movieId') %>%
mutate(b_i = s/(n_i+l)) %>%
mutate(pred = mu + b_i) %>%
.$pred
return(RMSE(predicted_ratings, test_set$rating))
})
qplot(lambdas, rmses)

lambdas[which.min(rmses)]
## [1] 3
lambdas <- seq(0, 10, 0.25)
rmses <- sapply(lambdas, function(l){
mu <- mean(train_set$rating)
b_i <- train_set %>%
group_by(movieId) %>%
summarize(b_i = sum(rating - mu)/(n()+l))
b_u <- train_set %>%
left_join(b_i, by="movieId") %>%
group_by(userId) %>%
summarize(b_u = sum(rating - b_i - mu)/(n()+l))
predicted_ratings <-
test_set %>%
left_join(b_i, by = "movieId") %>%
left_join(b_u, by = "userId") %>%
mutate(pred = mu + b_i + b_u) %>%
.$pred
return(RMSE(predicted_ratings, test_set$rating))
})
qplot(lambdas, rmses)

lambda <- lambdas[which.min(rmses)]
lambda
## [1] 3.25
rmse_results <- bind_rows(rmse_results,
data_frame(method="Regularized Movie + User Effect Model",
RMSE = min(rmses)))
rmse_results %>% knitr::kable()
| Just the average |
1.0528047 |
| Movie Effect Model |
0.9885931 |
| Movie + User Effects Model |
0.8812843 |
| Regularized Movie Effect Model |
0.9699530 |
| Regularized Movie + User Effect Model |
0.8814297 |
# Bookmark this page
#The exercises in Q1-Q8 work with a simulated dataset for 1000 schools. This pre-exercise setup walks you #through the code needed to simulate the dataset.
#If you have not done so already since the Titanic Exercises, please restart R or reset the number of digits #that are printed with options(digits=7).
options(digits=7)
#An education expert is advocating for smaller schools. The expert bases this recommendation on the fact that #among the best performing schools, many are small schools. Let's simulate a dataset for 1000 schools. First, #let's simulate the number of students in each school, using the following code:
library(tidyverse)
#set.seed(1986) #for R 3.5 or earlier
set.seed(1986, sample.kind="Rounding")
## Warning in set.seed(1986, sample.kind = "Rounding"): non-uniform 'Rounding'
## sampler used
n <- round(2^rnorm(1000, 8, 1))
#Now let's assign a true quality for each school that is completely independent from size. This is the parameter we want to estimate in our analysis. The true quality can be assigned using the following code:
#set.seed(1) #for R 3.5 or earlier
set.seed(1, sample.kind="Rounding")
## Warning in set.seed(1, sample.kind = "Rounding"): non-uniform 'Rounding' sampler
## used
mu <- round(80 + 2*rt(1000, 5))
range(mu)
## [1] 67 94
schools <- data.frame(id = paste("PS",1:1000),
size = n,
quality = mu,
rank = rank(-mu))
#We can see the top 10 schools using this code:
schools %>% top_n(10, quality) %>% arrange(desc(quality))
## id size quality rank
## 1 PS 191 1036 94 1.0
## 2 PS 567 121 93 2.0
## 3 PS 95 235 91 3.0
## 4 PS 430 61 90 4.0
## 5 PS 343 78 89 5.0
## 6 PS 981 293 88 6.0
## 7 PS 558 196 87 7.0
## 8 PS 79 105 86 13.5
## 9 PS 113 653 86 13.5
## 10 PS 163 300 86 13.5
## 11 PS 266 2369 86 13.5
## 12 PS 400 550 86 13.5
## 13 PS 451 217 86 13.5
## 14 PS 477 341 86 13.5
## 15 PS 484 967 86 13.5
## 16 PS 561 723 86 13.5
## 17 PS 563 828 86 13.5
## 18 PS 865 586 86 13.5
## 19 PS 963 208 86 13.5
#Now let's have the students in the school take a test. There is random variability in test taking, so we will simulate the test scores as normally distributed with the average determined by the school quality with a standard deviation of 30 percentage points. This code will simulate the test scores:
#set.seed(1) #for R 3.5 or earlier
set.seed(1, sample.kind="Rounding")
## Warning in set.seed(1, sample.kind = "Rounding"): non-uniform 'Rounding' sampler
## used
mu <- round(80 + 2*rt(1000, 5))
scores <- sapply(1:nrow(schools), function(i){
scores <- rnorm(schools$size[i], schools$quality[i], 30)
scores
})
schools <- schools %>% mutate(score = sapply(scores, mean))
#Q1
#0.0/1.0 point (graded)
#What are the top schools based on the average score? Show just the ID, size, and the average score.
#
#Report the ID of the top school and average score of the 10th school.
#What is the ID of the top school?
#Note that the school IDs are given in the form "PS x" - where x is a number. Report the number only.
school_Size <- schools %>% top_n(10, score) %>% arrange(desc(score)) %>% select(id, size, score)
school_Size
## id size score
## 1 PS 567 121 95.84170
## 2 PS 191 1036 93.54249
## 3 PS 330 162 90.99615
## 4 PS 701 83 90.50055
## 5 PS 591 213 89.74194
## 6 PS 205 172 89.28585
## 7 PS 574 199 89.19625
## 8 PS 963 208 89.00446
## 9 PS 430 61 88.72107
## 10 PS 756 245 87.95731
median(school_Size$size)
## [1] 185.5
median(schools$size)
## [1] 261
school_Size_Bott <- schools %>% top_n(-10, score) %>% arrange(score) %>% select(id, size, score)
school_Size_Bott
## id size score
## 1 PS 440 543 65.84809
## 2 PS 409 1078 65.87475
## 3 PS 884 299 67.07405
## 4 PS 734 77 67.20603
## 5 PS 336 217 69.46928
## 6 PS 806 154 70.58705
## 7 PS 623 125 70.78077
## 8 PS 542 276 70.83295
## 9 PS 424 221 70.83953
## 10 PS 155 97 70.89372
median(school_Size_Bott$size)
## [1] 219
plot(school_Size$score,school_Size$quality)

schools %>% ggplot(aes(size, score)) +
geom_point(alpha = 0.5) +
geom_point(data = filter(schools, rank<=10), col = 2)

overall <- mean(sapply(scores, mean))
alpha <- 135
score_reg <- sapply(scores, function(x) overall + sum(x-overall)/(length(x)+alpha))
schools %>% mutate(score_reg = score_reg) %>%
top_n(10, score_reg) %>% arrange(desc(score_reg))
## id size quality rank score score_reg
## 1 PS 191 1036 94 1.0 93.54249 91.98183
## 2 PS 567 121 93 2.0 95.84170 87.49043
## 3 PS 561 723 86 13.5 87.39858 86.23529
## 4 PS 330 162 84 53.5 90.99615 86.00028
## 5 PS 591 213 83 104.5 89.74194 85.96477
## 6 PS 400 550 86 13.5 87.38269 85.92873
## 7 PS 865 586 86 13.5 87.17508 85.83260
## 8 PS 266 2369 86 13.5 85.98176 85.65954
## 9 PS 563 828 86 13.5 86.45072 85.54714
## 10 PS 574 199 84 53.5 89.19625 85.48132
alphas <- seq(10,250)
rmse <- sapply(alphas, function(alpha){
score_reg <- sapply(scores, function(x) overall+sum(x-overall)/(length(x)+alpha))
sqrt(mean((score_reg - schools$quality)^2))
})
plot(alphas, rmse)

alphas[which.min(rmse)]
## [1] 135
##################################
alphas <- seq(10,250)
rmses <- sapply(alphas, function(alpha){
score_reg <- sapply(scores, function(x) sum(x)/(length(x)+alpha))
sqrt(mean((score_reg - schools$quality)^2))
})
alphas[which.min(rmses)]
## [1] 10
###########################
alphas <- seq(10,250)
rmse <- sapply(alphas, function(alpha){
score_reg <- sapply(scores, function(x) sum(x)/(length(x)+alpha))
sqrt(mean((score_reg - schools$quality)^2))
})
plot(alphas, rmse)

alphas[which.min(rmse)]
## [1] 10