Data Science Machine Learning Edx Harvard Ph125.8x

knitr::opts_chunk$set(echo = TRUE)

#To improve our results, we will use regularization. Regularization constrains the total variability of the #effect sizes by penalizing large estimates that come from small sample sizes.
#To estimate the  b ’s, we will now minimize this equation, which contains a penalty term:
#1N∑u,i(yu,i−μ−bi)2+λ∑ib2i 
#The first term is the mean squared error and the second is a penalty term that gets larger when many  b ’s #are large.

#The values of  b  that minimize this equation are given by:
#b^i(λ)=1λ+ni∑u=1ni(Yu,i−μ^), 
#where  ni  is a number of ratings  b  for movie  i .

#The larger  λ  is, the more we shrink.  λ  is a tuning parameter, so we can use cross-validation to choose #it. We should be using full cross-validation on just the training set, without using the test set until the #final assessment.
#We can also use regularization to estimate the user effect. We will now minimize this equation:
#1N∑u,i(yu,i−μ−bi−bu)2+λ(∑ib2i+∑ub2u) 
#Code
#Please refer to the textbook for an updated version of the code that may contain some corrections.
library(dslabs)

## Warning: package 'dslabs' was built under R version 3.6.3

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 3.6.3

## -- Attaching packages ----------------------------------------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.0     v purrr   0.3.3
## v tibble  3.0.0     v dplyr   0.8.5
## v tidyr   1.0.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0

## Warning: package 'ggplot2' was built under R version 3.6.3

## Warning: package 'tibble' was built under R version 3.6.3

## Warning: package 'tidyr' was built under R version 3.6.3

## Warning: package 'purrr' was built under R version 3.6.3

## Warning: package 'dplyr' was built under R version 3.6.3

## -- Conflicts -------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(dplyr)
library(caret)

## Warning: package 'caret' was built under R version 3.6.3

## Loading required package: lattice

## Warning: package 'lattice' was built under R version 3.6.3

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

data("movielens")
set.seed(755)
test_index <- createDataPartition(y = movielens$rating, times = 1,
                                  p = 0.2, list = FALSE)
train_set <- movielens[-test_index,]
test_set <- movielens[test_index,]
test_set <- test_set %>% 
     semi_join(train_set, by = "movieId") %>%
     semi_join(train_set, by = "userId")
RMSE <- function(true_ratings, predicted_ratings){
     sqrt(mean((true_ratings - predicted_ratings)^2))
}
mu_hat <- mean(train_set$rating)
naive_rmse <- RMSE(test_set$rating, mu_hat)
rmse_results <- data_frame(method = "Just the average", RMSE = naive_rmse)

## Warning: `data_frame()` is deprecated as of tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

mu <- mean(train_set$rating) 
movie_avgs <- train_set %>% 
     group_by(movieId) %>% 
     summarize(b_i = mean(rating - mu))
predicted_ratings <- mu + test_set %>% 
     left_join(movie_avgs, by='movieId') %>%
     .$b_i
model_1_rmse <- RMSE(predicted_ratings, test_set$rating)
rmse_results <- bind_rows(rmse_results,
                          data_frame(method="Movie Effect Model",
                                     RMSE = model_1_rmse ))
user_avgs <- test_set %>% 
     left_join(movie_avgs, by='movieId') %>%
     group_by(userId) %>%
     summarize(b_u = mean(rating - mu - b_i))
predicted_ratings <- test_set %>% 
     left_join(movie_avgs, by='movieId') %>%
     left_join(user_avgs, by='userId') %>%
     mutate(pred = mu + b_i + b_u) %>%
     .$pred
model_2_rmse <- RMSE(predicted_ratings, test_set$rating)
rmse_results <- bind_rows(rmse_results,
                          data_frame(method="Movie + User Effects Model",  
                                     RMSE = model_2_rmse ))

test_set %>% 
     left_join(movie_avgs, by='movieId') %>%
     mutate(residual = rating - (mu + b_i)) %>%
     arrange(desc(abs(residual))) %>% 
     select(title,  residual) %>% slice(1:10) %>% knitr::kable()

title	residual
Kingdom, The (Riget)	-4.000000
Heaven Knows, Mr. Allison	-4.000000
American Pimp	-4.000000
Chinatown	-3.830357
American Beauty	-3.739011
Apocalypse Now	-3.729412
Taxi Driver	-3.714286
Wallace & Gromit: A Close Shave	-3.696078
Down in the Delta	-3.666667
Stalag 17	-3.647059

movie_titles <- movielens %>% 
     select(movieId, title) %>%
     distinct()
movie_avgs %>% left_join(movie_titles, by="movieId") %>%
     arrange(desc(b_i)) %>% 
     select(title, b_i) %>% 
     slice(1:10) %>%  
     knitr::kable()

title	b_i
When Night Is Falling	1.456695
Lamerica	1.456695
Mute Witness	1.456695
Picture Bride (Bijo photo)	1.456695
Red Firecracker, Green Firecracker (Pao Da Shuang Deng)	1.456695
Paris, France	1.456695
Faces	1.456695
Maya Lin: A Strong Clear Vision	1.456695
Heavy	1.456695
Gate of Heavenly Peace, The	1.456695

movie_avgs %>% left_join(movie_titles, by="movieId") %>%
     arrange(b_i) %>% 
     select(title, b_i) %>% 
     slice(1:10) %>%  
     knitr::kable()

title	b_i
Children of the Corn IV: The Gathering	-3.043305
Barney’s Great Adventure	-3.043305
Merry War, A	-3.043305
Whiteboyz	-3.043305
Catfish in Black Bean Sauce	-3.043305
Killer Shrews, The	-3.043305
Horrors of Spider Island (Ein Toter Hing im Netz)	-3.043305
Monkeybone	-3.043305
Arthur 2: On the Rocks	-3.043305
Red Heat	-3.043305

train_set %>% dplyr::count(movieId) %>% 
     left_join(movie_avgs) %>%
     left_join(movie_titles, by="movieId") %>%
     arrange(desc(b_i)) %>% 
     select(title, b_i, n) %>% 
     slice(1:10) %>% 
     knitr::kable()

## Joining, by = "movieId"

title	b_i	n
When Night Is Falling	1.456695	1
Lamerica	1.456695	1
Mute Witness	1.456695	1
Picture Bride (Bijo photo)	1.456695	1
Red Firecracker, Green Firecracker (Pao Da Shuang Deng)	1.456695	3
Paris, France	1.456695	1
Faces	1.456695	1
Maya Lin: A Strong Clear Vision	1.456695	2
Heavy	1.456695	1
Gate of Heavenly Peace, The	1.456695	1

train_set %>% dplyr::count(movieId) %>% 
     left_join(movie_avgs) %>%
     left_join(movie_titles, by="movieId") %>%
     arrange(b_i) %>% 
     select(title, b_i, n) %>% 
     slice(1:10) %>% 
     knitr::kable()

## Joining, by = "movieId"

title	b_i	n
Children of the Corn IV: The Gathering	-3.043305	1
Barney’s Great Adventure	-3.043305	1
Merry War, A	-3.043305	1
Whiteboyz	-3.043305	1
Catfish in Black Bean Sauce	-3.043305	1
Killer Shrews, The	-3.043305	1
Horrors of Spider Island (Ein Toter Hing im Netz)	-3.043305	1
Monkeybone	-3.043305	1
Arthur 2: On the Rocks	-3.043305	1
Red Heat	-3.043305	1

lambda <- 3
mu <- mean(train_set$rating)
movie_reg_avgs <- train_set %>% 
     group_by(movieId) %>% 
     summarize(b_i = sum(rating - mu)/(n()+lambda), n_i = n()) 

data_frame(original = movie_avgs$b_i, 
           regularlized = movie_reg_avgs$b_i, 
           n = movie_reg_avgs$n_i) %>%
     ggplot(aes(original, regularlized, size=sqrt(n))) + 
     geom_point(shape=1, alpha=0.5)

train_set %>%
     dplyr::count(movieId) %>% 
     left_join(movie_reg_avgs) %>%
     left_join(movie_titles, by="movieId") %>%
     arrange(desc(b_i)) %>% 
     select(title, b_i, n) %>% 
     slice(1:10) %>% 
     knitr::kable()

## Joining, by = "movieId"

title	b_i	n
Paris Is Burning	1.0196864	7
Shawshank Redemption, The	0.9789475	250
Godfather, The	0.9371013	169
African Queen, The	0.9248324	40
Band of Brothers	0.8881906	17
Paperman	0.8600188	6
On the Waterfront	0.8404850	26
All About Eve	0.8075258	33
Usual Suspects, The	0.8062874	169
Ikiru	0.8044632	6

train_set %>%
     dplyr::count(movieId) %>% 
     left_join(movie_reg_avgs) %>%
     left_join(movie_titles, by="movieId") %>%
     arrange(b_i) %>% 
     select(title, b_i, n) %>% 
     slice(1:10) %>% 
     knitr::kable()

## Joining, by = "movieId"

title	b_i	n
Battlefield Earth	-1.926883	11
Joe’s Apartment	-1.695537	6
Super Mario Bros.	-1.647199	15
Speed 2: Cruise Control	-1.581135	20
Dungeons & Dragons	-1.576949	8
Batman & Robin	-1.539974	36
Police Academy 6: City Under Siege	-1.534026	11
Cats & Dogs	-1.524746	4
Disaster Movie	-1.521653	3
Mighty Morphin Power Rangers: The Movie	-1.494850	10

predicted_ratings <- test_set %>% 
     left_join(movie_reg_avgs, by='movieId') %>%
     mutate(pred = mu + b_i) %>%
     .$pred

model_3_rmse <- RMSE(predicted_ratings, test_set$rating)
rmse_results <- bind_rows(rmse_results,
                          data_frame(method="Regularized Movie Effect Model",  
                                     RMSE = model_3_rmse ))
rmse_results %>% knitr::kable()

method	RMSE
Just the average	1.0528047
Movie Effect Model	0.9885931
Movie + User Effects Model	0.8812843
Regularized Movie Effect Model	0.9699530

lambdas <- seq(0, 10, 0.25)
mu <- mean(train_set$rating)
just_the_sum <- train_set %>% 
     group_by(movieId) %>% 
     summarize(s = sum(rating - mu), n_i = n())
rmses <- sapply(lambdas, function(l){
     predicted_ratings <- test_set %>% 
          left_join(just_the_sum, by='movieId') %>% 
          mutate(b_i = s/(n_i+l)) %>%
          mutate(pred = mu + b_i) %>%
          .$pred
     return(RMSE(predicted_ratings, test_set$rating))
})
qplot(lambdas, rmses)

lambdas[which.min(rmses)]

## [1] 3

lambdas <- seq(0, 10, 0.25)
rmses <- sapply(lambdas, function(l){
     mu <- mean(train_set$rating)
     b_i <- train_set %>%
          group_by(movieId) %>%
          summarize(b_i = sum(rating - mu)/(n()+l))
     b_u <- train_set %>% 
          left_join(b_i, by="movieId") %>%
          group_by(userId) %>%
          summarize(b_u = sum(rating - b_i - mu)/(n()+l))
     predicted_ratings <- 
          test_set %>% 
          left_join(b_i, by = "movieId") %>%
          left_join(b_u, by = "userId") %>%
          mutate(pred = mu + b_i + b_u) %>%
          .$pred
     return(RMSE(predicted_ratings, test_set$rating))
})

qplot(lambdas, rmses)

lambda <- lambdas[which.min(rmses)]
lambda

## [1] 3.25

rmse_results <- bind_rows(rmse_results,
                          data_frame(method="Regularized Movie + User Effect Model",  
                                     RMSE = min(rmses)))
rmse_results %>% knitr::kable()

method	RMSE
Just the average	1.0528047
Movie Effect Model	0.9885931
Movie + User Effects Model	0.8812843
Regularized Movie Effect Model	0.9699530
Regularized Movie + User Effect Model	0.8814297

# Bookmark this page
#The exercises in Q1-Q8 work with a simulated dataset for 1000 schools. This pre-exercise setup walks you #through the code needed to simulate the dataset.

#If you have not done so already since the Titanic Exercises, please restart R or reset the number of digits #that are printed with options(digits=7).
options(digits=7)

#An education expert is advocating for smaller schools. The expert bases this recommendation on the fact that #among the best performing schools, many are small schools. Let's simulate a dataset for 1000 schools. First, #let's simulate the number of students in each school, using the following code:
library(tidyverse)
#set.seed(1986) #for R 3.5 or earlier
set.seed(1986, sample.kind="Rounding")

## Warning in set.seed(1986, sample.kind = "Rounding"): non-uniform 'Rounding'
## sampler used

n <- round(2^rnorm(1000, 8, 1))
#Now let's assign a true quality for each school that is completely independent from size. This is the parameter we want to estimate in our analysis. The true quality can be assigned using the following code:

#set.seed(1) #for R 3.5 or earlier
set.seed(1, sample.kind="Rounding")

## Warning in set.seed(1, sample.kind = "Rounding"): non-uniform 'Rounding' sampler
## used

mu <- round(80 + 2*rt(1000, 5))
range(mu)

## [1] 67 94

schools <- data.frame(id = paste("PS",1:1000),
                      size = n,
                      quality = mu,
                      rank = rank(-mu))
#We can see the top 10 schools using this code: 

schools %>% top_n(10, quality) %>% arrange(desc(quality))

##        id size quality rank
## 1  PS 191 1036      94  1.0
## 2  PS 567  121      93  2.0
## 3   PS 95  235      91  3.0
## 4  PS 430   61      90  4.0
## 5  PS 343   78      89  5.0
## 6  PS 981  293      88  6.0
## 7  PS 558  196      87  7.0
## 8   PS 79  105      86 13.5
## 9  PS 113  653      86 13.5
## 10 PS 163  300      86 13.5
## 11 PS 266 2369      86 13.5
## 12 PS 400  550      86 13.5
## 13 PS 451  217      86 13.5
## 14 PS 477  341      86 13.5
## 15 PS 484  967      86 13.5
## 16 PS 561  723      86 13.5
## 17 PS 563  828      86 13.5
## 18 PS 865  586      86 13.5
## 19 PS 963  208      86 13.5

#Now let's have the students in the school take a test. There is random variability in test taking, so we will simulate the test scores as normally distributed with the average determined by the school quality with a standard deviation of 30 percentage points. This code will simulate the test scores:

#set.seed(1) #for R 3.5 or earlier
set.seed(1, sample.kind="Rounding")

## Warning in set.seed(1, sample.kind = "Rounding"): non-uniform 'Rounding' sampler
## used

mu <- round(80 + 2*rt(1000, 5))

scores <- sapply(1:nrow(schools), function(i){
       scores <- rnorm(schools$size[i], schools$quality[i], 30)
       scores
})
schools <- schools %>% mutate(score = sapply(scores, mean))
#Q1
#0.0/1.0 point (graded)
#What are the top schools based on the average score? Show just the ID, size, and the average score.
#
#Report the ID of the top school and average score of the 10th school.

#What is the ID of the top school?
#Note that the school IDs are given in the form "PS x" - where x is a number. Report the number only.
school_Size <- schools %>% top_n(10, score) %>% arrange(desc(score)) %>% select(id, size, score)
school_Size

##        id size    score
## 1  PS 567  121 95.84170
## 2  PS 191 1036 93.54249
## 3  PS 330  162 90.99615
## 4  PS 701   83 90.50055
## 5  PS 591  213 89.74194
## 6  PS 205  172 89.28585
## 7  PS 574  199 89.19625
## 8  PS 963  208 89.00446
## 9  PS 430   61 88.72107
## 10 PS 756  245 87.95731

median(school_Size$size)

## [1] 185.5

median(schools$size)

## [1] 261

school_Size_Bott <- schools %>% top_n(-10, score) %>% arrange(score) %>% select(id, size, score)
school_Size_Bott

##        id size    score
## 1  PS 440  543 65.84809
## 2  PS 409 1078 65.87475
## 3  PS 884  299 67.07405
## 4  PS 734   77 67.20603
## 5  PS 336  217 69.46928
## 6  PS 806  154 70.58705
## 7  PS 623  125 70.78077
## 8  PS 542  276 70.83295
## 9  PS 424  221 70.83953
## 10 PS 155   97 70.89372

median(school_Size_Bott$size)

## [1] 219

plot(school_Size$score,school_Size$quality)

schools %>% ggplot(aes(size, score)) +
    geom_point(alpha = 0.5) +
    geom_point(data = filter(schools, rank<=10), col = 2)

overall <- mean(sapply(scores, mean))
alpha <- 135
score_reg <- sapply(scores, function(x)  overall + sum(x-overall)/(length(x)+alpha))
schools %>% mutate(score_reg = score_reg) %>%
    top_n(10, score_reg) %>% arrange(desc(score_reg))

##        id size quality  rank    score score_reg
## 1  PS 191 1036      94   1.0 93.54249  91.98183
## 2  PS 567  121      93   2.0 95.84170  87.49043
## 3  PS 561  723      86  13.5 87.39858  86.23529
## 4  PS 330  162      84  53.5 90.99615  86.00028
## 5  PS 591  213      83 104.5 89.74194  85.96477
## 6  PS 400  550      86  13.5 87.38269  85.92873
## 7  PS 865  586      86  13.5 87.17508  85.83260
## 8  PS 266 2369      86  13.5 85.98176  85.65954
## 9  PS 563  828      86  13.5 86.45072  85.54714
## 10 PS 574  199      84  53.5 89.19625  85.48132

alphas <- seq(10,250)
rmse <- sapply(alphas, function(alpha){
    score_reg <- sapply(scores, function(x) overall+sum(x-overall)/(length(x)+alpha))
    sqrt(mean((score_reg - schools$quality)^2))
})
plot(alphas, rmse)

alphas[which.min(rmse)]

## [1] 135

##################################
alphas <- seq(10,250)
rmses <- sapply(alphas, function(alpha){
    score_reg <- sapply(scores, function(x) sum(x)/(length(x)+alpha))
    sqrt(mean((score_reg - schools$quality)^2))
})
alphas[which.min(rmses)]

## [1] 10

###########################
alphas <- seq(10,250)
rmse <- sapply(alphas, function(alpha){
    score_reg <- sapply(scores, function(x) sum(x)/(length(x)+alpha))
    sqrt(mean((score_reg - schools$quality)^2))
})
plot(alphas, rmse)

alphas[which.min(rmse)]

## [1] 10

Data Science Machine Learning Edx Harvard Ph125.8x

Saqib Kareem

April 30, 2020