#Install the pacman package
if(!require(pacman)) install.packages("pacman", repos = "http://cran.us.r-project.org")
#Load the required libraries
#If a package below is missing, p_load will automatically download it from CRAN
pacman::p_load(tidyverse, ggplot2, ggthemes, data.table, lubridate, caret,
knitr, scales, treemapify)
#All Data for the MovieLens Dataset Will be obtained from the following sources:
# https://grouplens.org/datasets/movielens/10m/
# http://files.grouplens.org/datasets/movielens/ml-10m.zip
#Data Preparation
#Download File
dl <- tempfile()
download.file("http://files.grouplens.org/datasets/movielens/ml-10m.zip", dl)
#Create the Data Frame "ratings" using fread from library(data.table)
ratings <- fread(text = gsub("::", "\t", readLines(unzip(dl, "ml-10M100K/ratings.dat"))),
col.names = c("userId", "movieId", "rating", "timestamp"))
movies <- str_split_fixed(readLines(unzip(dl, "ml-10M100K/movies.dat")), "\\::", 3)
colnames(movies) <- c("movieId", "title", "genres")
#Note: as.numeric(levels(movieId))[movieId] assumes movieId is a factor
#(stringsAsFactors = TRUE, the default before R 4.0); on R >= 4.0 use as.numeric(movieId)
movies <- as.data.frame(movies) %>% mutate(movieId = as.numeric(levels(movieId))[movieId],
                                           title = as.character(title),
                                           genres = as.character(genres))
movielens <- left_join(ratings, movies, by = "movieId")
#Validation set will be 10% of MovieLens data
#Set Seed to 1
set.seed(1, sample.kind="Rounding")
test_index <- createDataPartition(y = movielens$rating, times = 1, p = 0.1, list = FALSE)
edx <- movielens[-test_index,]
temp <- movielens[test_index,]
#Make sure userId and movieId in validation set are also in edx set
validation <- temp %>%
semi_join(edx, by = "movieId") %>%
semi_join(edx, by = "userId")
#Add rows removed from validation set back into edx set
removed <- anti_join(temp, validation)
edx <- rbind(edx, removed)
rm(dl, ratings, movies, test_index, temp, movielens, removed)
To begin, we partition the edx data into training and test sets for our later models. As with the validation split, we reserve 10% of the data for the test set.
set.seed(1, sample.kind = "Rounding")
test_index <- createDataPartition(y = edx$rating, times = 1, p = 0.1, list = FALSE)
edx_train <- edx[-test_index,]
edx_temp <- edx[test_index,]
#Again, Confirm userId and movieId are in both the train and test sets
edx_test <- edx_temp %>%
  semi_join(edx_train, by = "movieId") %>%
  semi_join(edx_train, by = "userId")
#Add the rows removed from edx_test back into edx_train
removed <- anti_join(edx_temp, edx_test)
edx_train <- rbind(edx_train, removed)
rm(edx_temp, test_index, removed)
The only changes made to the validation set will be a transformed timestamp column and an added column titled “MovieAge,” so that the original data is otherwise left untouched.
Confirm the data is tidy:
edx %>% as_tibble()
Note that each row contains exactly one observation: a single user's rating of a single movie.
glimpse(edx)
## Rows: 9,000,055
## Columns: 6
## $ userId <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ movieId <dbl> 122, 185, 292, 316, 329, 355, 356, 362, 364, 370, 377, 42...
## $ rating <dbl> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ...
## $ timestamp <int> 838985046, 838983525, 838983421, 838983392, 838983392, 83...
## $ title <chr> "Boomerang (1992)", "Net, The (1995)", "Outbreak (1995)",...
## $ genres <chr> "Comedy|Romance", "Action|Crime|Thriller", "Action|Drama|...
We learn that there are 9,000,055 rows and 6 columns: userId, movieId, rating, timestamp, title & genres.
edx %>% summarize(unique_users = length(unique(userId)),
unique_movies = length(unique(movieId)),
unique_genres = length(unique(genres)))
There are 69,878 unique userIds, 10,677 unique movieIds, and 797 unique combinations of genres.
Let’s visually interpret the distribution of the number of ratings by users:
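The rendered figure is not reproduced here; a minimal sketch of such a histogram (assuming a standard ggplot2 geometry with a log10-scaled x-axis, not the original plotting code) could be:
#Sketch, not the original plotting code: ratings per user on a log10 x-axis
edx %>% count(userId) %>%
  ggplot(aes(n)) +
  geom_histogram(bins = 30, color = "black") +
  scale_x_log10() +
  labs(x = "Number of Ratings (log10 scale)", y = "Number of Users")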
It is clear that most users rate relatively few films and that the distribution is right-skewed.
The distribution of ratings per movie, in contrast, appears nearly symmetric on the log scale, which is logical as more popular films tend to be rated more frequently than less popular films. The fact that a number of films have very few ratings implies potential biases may affect our recommendation system.
First we determine that there are 10 different ratings users can award the films:
length(unique(edx$rating))
## [1] 10
These ratings are: 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5 & 5.0.
unique_ratings <- unique(edx$rating)
sort(unique_ratings)
## [1] 0.5 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0
#View a Tibble of the Ratings Distribution
edx %>% group_by(rating) %>% summarize(ratings_sum = n()) %>%
arrange(desc(ratings_sum))
A rating of “4” is the most popular rating while 0.5 is awarded the least frequently.
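The rendered histogram is not reproduced here; a minimal sketch (assuming half-point bins, not the original plotting code) could be:
#Sketch, not the original plotting code: distribution of rating values
edx %>% ggplot(aes(rating)) +
  geom_histogram(binwidth = 0.5, color = "black") +
  scale_x_continuous(breaks = seq(0.5, 5, 0.5)) +
  labs(x = "Rating", y = "Count")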
The histogram confirms most viewers tended to award a 3 or above.
rp <- edx %>% filter(rating >= 3)
nrow(rp)/nrow(edx)
## [1] 0.8242332
Over 82% of ratings were at or above a 3.
The timestamp feature will be transformed into a column titled “RatingYear” to make the data more accessible; the name reflects the fact that the timestamp records the year in which a rating was made. The release year of a film will later be extracted from the title column.
edx <- edx %>% mutate(timestamp = as.POSIXct(timestamp, origin = "1970-01-01",
tz = "EST"))
edx$timestamp <- format(edx$timestamp, "%Y")
names(edx)[names(edx) == "timestamp"] <- "RatingYear"
head(edx)
Do the same for the validation set.
validation <- validation %>% mutate(timestamp = as.POSIXct(timestamp, origin = "1970-01-01",
tz = "EST"))
validation$timestamp <- format(validation$timestamp, "%Y")
names(validation)[names(validation) == "timestamp"] <- "RatingYear"
head(validation)
This process is also applied to the edx_train & edx_test sets for later modeling purposes.
edx_train <- edx_train %>% mutate(timestamp = as.POSIXct(timestamp, origin = "1970-01-01",
tz = "EST"))
edx_train$timestamp <- format(edx_train$timestamp, "%Y")
names(edx_train)[names(edx_train) == "timestamp"] <- "RatingYear"
head(edx_train)
edx_test <- edx_test %>% mutate(timestamp = as.POSIXct(timestamp, origin = "1970-01-01",
tz = "EST"))
edx_test$timestamp <- format(edx_test$timestamp, "%Y")
names(edx_test)[names(edx_test) == "timestamp"] <- "RatingYear"
head(edx_test)
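Since the same three-step transformation is applied to four data frames, a small helper function (hypothetical, not part of the original script) could reduce the repetition:
#Hypothetical helper: convert the Unix timestamp to a rating year and rename the column
to_rating_year <- function(df) {
  df %>%
    mutate(timestamp = format(as.POSIXct(timestamp, origin = "1970-01-01",
                                         tz = "EST"), "%Y")) %>%
    rename(RatingYear = timestamp)
}
#e.g. edx <- to_rating_year(edx); validation <- to_rating_year(validation)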
A quick check of the range tells us all ratings have taken place between 1995 & 2009.
range(edx$RatingYear)
## [1] "1995" "2009"
#Coerce RatingYear from character to numeric to plot the histogram
edx$RatingYear <- as.numeric(edx$RatingYear)
str(edx)
## Classes 'data.table' and 'data.frame': 9000055 obs. of 6 variables:
## $ userId : int 1 1 1 1 1 1 1 1 1 1 ...
## $ movieId : num 122 185 292 316 329 355 356 362 364 370 ...
## $ rating : num 5 5 5 5 5 5 5 5 5 5 ...
## $ RatingYear: num 1996 1996 1996 1996 1996 ...
## $ title : chr "Boomerang (1992)" "Net, The (1995)" "Outbreak (1995)" "Stargate (1994)" ...
## $ genres : chr "Comedy|Romance" "Action|Crime|Thriller" "Action|Drama|Sci-Fi|Thriller" "Action|Adventure|Sci-Fi" ...
## - attr(*, ".internal.selfref")=<externalptr>
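The rendered histogram is not reproduced here; a minimal sketch (assuming one-year bins, not the original plotting code) could be:
#Sketch, not the original plotting code: number of ratings per RatingYear
edx %>% ggplot(aes(RatingYear)) +
  geom_histogram(binwidth = 1, color = "black") +
  labs(x = "Year of Rating", y = "Number of Ratings")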
We observe that 1996, 2000 and 2005 are the years in which the most ratings were submitted.
A tibble of the top 50 movies with the highest number of ratings reveals the vast majority were hits at the box office like “Batman” & “Forrest Gump.”
edx %>% group_by(RatingYear, title) %>%
summarize(Ratings_Sum = n(), Average_Rating = mean(rating)) %>%
mutate(Average_Rating = sprintf("%0.2f", Average_Rating)) %>%
arrange(-Ratings_Sum) %>% print(n = 50)
## # A tibble: 75,964 x 4
## # Groups: RatingYear [15]
## RatingYear title Ratings_Sum Average_Rating
## <dbl> <chr> <int> <chr>
## 1 1996 Batman (1989) 12016 3.26
## 2 1996 Dances with Wolves (1990) 11524 3.79
## 3 1996 Apollo 13 (1995) 11393 3.99
## 4 1996 Pulp Fiction (1994) 10925 4.01
## 5 1996 Fugitive, The (1993) 10901 4.12
## 6 1996 True Lies (1994) 10838 3.57
## 7 1996 Forrest Gump (1994) 9986 4.12
## 8 1996 Batman Forever (1995) 9907 3.13
## 9 1996 Aladdin (1992) 9856 3.67
## 10 1996 Jurassic Park (1993) 9771 3.84
## 11 1996 Ace Ventura: Pet Detective (1994) 9724 2.96
## 12 1996 Clear and Present Danger (1994) 9484 3.71
## 13 1996 Die Hard: With a Vengeance (1995) 9467 3.48
## 14 1996 Silence of the Lambs, The (1991) 9341 4.29
## 15 1996 Beauty and the Beast (1991) 8895 3.68
## 16 1996 Stargate (1994) 8845 3.33
## 17 1996 Shawshank Redemption, The (1994) 8728 4.48
## 18 1996 Outbreak (1995) 8386 3.56
## 19 1996 Star Trek: Generations (1994) 8284 3.42
## 20 1996 Cliffhanger (1993) 8172 3.21
## 21 1996 Braveheart (1995) 8106 4.26
## 22 1996 Firm, The (1993) 8097 3.54
## 23 1996 Crimson Tide (1995) 8039 3.82
## 24 1996 Terminator 2: Judgment Day (1991) 7994 3.96
## 25 1996 Speed (1994) 7949 3.79
## 26 1996 Dumb & Dumber (1994) 7938 2.83
## 27 1996 Net, The (1995) 7902 3.34
## 28 1996 Lion King, The (1994) 7692 3.81
## 29 1996 While You Were Sleeping (1995) 7674 3.61
## 30 1996 Waterworld (1995) 7601 3.07
## 31 1996 Interview with the Vampire: The Vampir~ 7544 3.41
## 32 1996 GoldenEye (1995) 7421 3.44
## 33 1996 Mrs. Doubtfire (1993) 7391 3.62
## 34 1996 Seven (a.k.a. Se7en) (1995) 7022 3.96
## 35 1996 Pretty Woman (1990) 6998 3.47
## 36 1996 Mask, The (1994) 6945 3.34
## 37 1996 Ghost (1990) 6840 3.61
## 38 1996 Natural Born Killers (1994) 6497 3.15
## 39 1996 Quiz Show (1994) 6417 3.65
## 40 1996 Babe (1995) 6363 3.87
## 41 1996 Sleepless in Seattle (1993) 6334 3.70
## 42 1996 Addams Family Values (1993) 6072 3.06
## 43 1996 Schindler's List (1993) 5894 4.52
## 44 1996 Four Weddings and a Funeral (1994) 5871 3.70
## 45 1996 12 Monkeys (Twelve Monkeys) (1995) 5861 3.90
## 46 1996 Get Shorty (1995) 5817 3.67
## 47 1996 Usual Suspects, The (1995) 5669 4.30
## 48 1996 Home Alone (1990) 5430 3.15
## 49 1996 Disclosure (1994) 5373 3.37
## 50 1996 Clueless (1995) 5360 3.44
## # ... with 75,914 more rows
Additionally, the average rating for every film in the top 50 was above a 3 except for “Ace Ventura: Pet Detective” (2.96) & “Dumb & Dumber” (2.83).
Having determined earlier that there are 797 distinct combinations of genres, we now separate them into individual genres (one genre per row) in a new data frame for further exploratory analysis; note that there are then only 19 distinct genres.
edx_genres <- edx %>% separate_rows(genres, sep = "\\|")
edx_genres %>%
group_by(genres) %>% summarize(Ratings_Sum = n(), Average_Rating = mean(rating)) %>%
arrange(-Ratings_Sum)
Out of the 19 listed genres, the most frequently rated are Drama, Comedy, Action & Thriller, with IMAX at the bottom. Treating IMAX as a genre could muddy modeling, since it is more a viewing format than a genre.
The treemap visually confirms the ranking in the above tibble.
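The rendered treemap is not reproduced here; a minimal sketch using treemapify (an assumption, not the original plotting code) could be:
#Sketch, not the original plotting code: treemap of ratings per genre
edx_genres %>% group_by(genres) %>% summarize(Ratings_Sum = n()) %>%
  ggplot(aes(area = Ratings_Sum, fill = genres, label = genres)) +
  geom_treemap() +
  geom_treemap_text(colour = "white", place = "centre", reflow = TRUE)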
edx_genres %>%
group_by(genres) %>% summarize(Ratings_Sum = n(), Average_Rating = mean(rating)) %>%
arrange(-Average_Rating)
The top 5 genres by mean rating are Film-Noir, War, IMAX, Mystery & Drama. Film-Noir has a far lower ratings sum, which could skew our data.
#First we must coerce genres from characters to factors
edx$genres <- as.factor(edx$genres)
edx_genres$genres <- as.factor(edx_genres$genres)
class(edx_genres$genres)
## [1] "factor"
The visualization confirms that most genres have an average rating between 3 and 4, with Film-Noir at the top and Horror at the bottom.
The title feature includes the release year of each film, so to facilitate further analysis we extract the year from the title column into a new column, “yearReleased”.
yearreleaseda <- as.numeric(str_sub(edx$title, start = -5, end = -2))
edx <- edx %>% mutate(yearReleased = yearreleaseda)
head(edx)
#Do the same for the validation set
yearreleasedb <- as.numeric(str_sub(validation$title, start = -5, end = -2))
validation <- validation %>% mutate(yearReleased = yearreleasedb)
head(validation)
#This is also applied to edx_train & edx_test for later modeling purposes
yearreleasedc <- as.numeric(str_sub(edx_train$title, start = -5, end = -2))
edx_train <- edx_train %>% mutate(yearReleased = yearreleasedc)
head(edx_train)
yearreleasedd <- as.numeric(str_sub(edx_test$title, start = -5, end = -2))
edx_test <- edx_test %>% mutate(yearReleased = yearreleasedd)
head(edx_test)
#MovieAge is computed relative to 2020 (presumably the year this analysis was written)
edx <- edx %>% mutate(MovieAge = 2020 - yearReleased)
validation <- validation %>% mutate(MovieAge = 2020 - yearReleased)
edx_train <- edx_train %>% mutate(MovieAge = 2020 - yearReleased)
edx_test <- edx_test %>% mutate(MovieAge = 2020 - yearReleased)
Data Analysis will be conducted on MovieAge to determine the significance of potential Age Effects.
The Mean MovieAge is 29.78 years while the Median is 26.
summary(edx$MovieAge)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 12.00 22.00 26.00 29.78 33.00 105.00
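The rendered scatter plot is not reproduced here; a minimal sketch (assuming average rating per age with a linear fit, not the original plotting code) could be:
#Sketch, not the original plotting code: average rating by movie age with a line of best fit
edx %>% group_by(MovieAge) %>% summarize(Average_Rating = mean(rating)) %>%
  ggplot(aes(MovieAge, Average_Rating)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(x = "Movie Age (Years)", y = "Average Rating")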
It is apparent that there is a positive correlation between MovieAge and average rating, as evidenced by the generally positive slope of the line of best fit. This could be the result of users awarding higher ratings to older films, which are often regarded as “classics.” As a result, we can infer that including Movie Age Effects could meaningfully augment our modeling.
Based on our exploratory data analysis, User, Movie, & Movie Age Effects seem to influence the overall ratings most heavily. As such, they will be incorporated as either individual or integrated features of our machine learning models.
The individual and cumulative results of the models will be published after each model.
The formula for RMSE is defined as follows, with \(\hat{y}_{u,i}\) = the prediction of movie \(i\) by user \(u\), \(y_{u,i}\) = the rating of movie \(i\) by user \(u\), and \(N\) = the number of user/movie combinations over which the sum is taken.
\[\sqrt{\frac{1}{N}\sum_{u,i}(\hat{y}_{u,i}-y_{u,i})^2}\]
#RMSE: root mean squared error between the true and predicted ratings
RMSE <- function(true_ratings, predicted_ratings){
  sqrt(mean((true_ratings - predicted_ratings)^2))
}
The Benchmarking Model calculates the Naive RMSE based on the mean of the edx_train dataset. The formula can be defined as follows, with \(y_{u,i}\) = the rating, \(\mu\) = the average rating, and \(\epsilon_{u,i}\) = independent errors centered at 0.
\[y_{u,i} = \mu + \epsilon_{u,i}\]
edx_train_mu <- mean(edx_train$rating)
NRMSE_M1 <- RMSE(edx_test$rating, edx_train_mu)
#Table the Results
results_table <-tibble(Model_Type = "NRMSE", RMSE = NRMSE_M1) %>%
mutate(RMSE = sprintf("%0.4f", RMSE))
results_table
The Naive RMSE is found to be about 1.06.
Because the mean minimizes the RMSE among constant predictors, plugging in the median or any other constant will always produce a higher RMSE. The formula can be defined as follows, with \(y_{u,i}\) = the rating, \(M\) = the median of the observed data, and \(\epsilon_{u,i}\) = independent errors centered at 0.
\[y_{u,i} = M + \epsilon_{u,i}\]
edx_train_median <- median(edx_train$rating)
MM_M2 <- RMSE(edx_test$rating, edx_train_median)
#Table the Results
results_table <-tibble(Model_Type = c("NRMSE", "Median_Model"),
RMSE = c(NRMSE_M1, MM_M2)) %>%
mutate(RMSE = sprintf("%0.4f", RMSE))
results_table
The Median Model is shown to have an RMSE of about 1.16.
This model introduces “Movie Effects,” or biases, taking into account the fact that films tend to have different rating distributions. The formula can be defined as follows, with \(y_{u,i}\) = the rating, \(\mu\) = the average rating, \(b_i\) = the movie effect, and \(\epsilon_{u,i}\) = independent errors centered at 0.
\[y_{u,i} = \mu + b_i + \epsilon_{u,i}\]
bi <- edx_train %>% group_by(movieId) %>%
summarize(b_i = mean(rating - edx_train_mu))
Then check the prediction against the test set to determine the RMSE and table the results.
prediction_bi <- edx_train_mu + edx_test %>%
  left_join(bi, by = "movieId") %>% .$b_i
MEM_M3 <- RMSE(edx_test$rating, prediction_bi)
#Table the Results
results_table <-tibble(Model_Type = c("NRMSE", "Median_Model", "Movie Effects"),
RMSE = c(NRMSE_M1, MM_M2, MEM_M3)) %>%
mutate(RMSE = sprintf("%0.4f", RMSE))
results_table
The Movie Effects Model is shown to significantly improve the RMSE, to roughly 0.943.
This model introduces “User Effects”: biases reflecting the fact that individual users rate films according to their own standards, which vary widely in distribution. The formula can be defined as follows, with \(y_{u,i}\) = the rating, \(\mu\) = the average rating, \(b_i\) = the movie effect, \(b_u\) = the user effect, and \(\epsilon_{u,i}\) = independent errors centered at 0.
\[y_{u,i} = \mu + b_i + b_u + \epsilon_{u,i}\]
bu <- edx_train %>% left_join(bi, by = "movieId") %>% group_by(userId) %>%
  summarize(b_u = mean(rating - edx_train_mu - b_i))
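The rendered histogram of the user effects is not reproduced here; a minimal sketch (not the original plotting code) could be:
#Sketch, not the original plotting code: distribution of the user effects b_u
bu %>% ggplot(aes(b_u)) +
  geom_histogram(bins = 30, color = "black") +
  labs(x = "User Effect (b_u)", y = "Number of Users")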
The user effects appear to be nearly normally distributed.
Then check the prediction against the test set to determine the RMSE and table the results.
prediction_bu <- edx_test %>% left_join(bi, by = "movieId") %>%
  left_join(bu, by = "userId") %>%
  mutate(predictions = edx_train_mu + b_i + b_u) %>% .$predictions
UEM_M4 <- RMSE(edx_test$rating, prediction_bu)
#Table the Results
results_table <-tibble(Model_Type = c("NRMSE", "Median_Model", "Movie Effects", "Movie & User Effects"),
RMSE = c(NRMSE_M1, MM_M2, MEM_M3, UEM_M4)) %>%
mutate(RMSE = sprintf("%0.4f", RMSE))
results_table
The Movie & User Effects Model is shown to further improve the RMSE, to 0.8647.
This model introduces “Movie Age Effects”: biases that capture variation in the ratings awarded to films of different ages. The formula can be defined as follows, with \(y_{u,i}\) = the rating, \(\mu\) = the average rating, \(b_i\) = the movie effect, \(b_u\) = the user effect, \(b_a\) = the movie age effect, and \(\epsilon_{u,i}\) = independent errors centered at 0.
\[y_{u,i} = \mu + b_i + b_u + b_a + \epsilon_{u,i}\]
ba <- edx_train %>%
left_join(bi, by="movieId") %>% left_join(bu, by ="userId") %>%
group_by(MovieAge) %>% summarize(b_a = mean(rating - b_i - b_u - edx_train_mu))
The distribution of the movie age effects spans a much wider range, with denser clusters between 0 & 0.5 and outliers around 0.20.
Check the prediction against the test set to determine the RMSE and table the results.
predictions_ma <- edx_test %>%
left_join(bi, by = "movieId") %>% left_join(bu, by = "userId") %>%
left_join(ba, by = "MovieAge") %>% mutate(predictions = edx_train_mu + b_i + b_u + b_a) %>%
.$predictions
UMMAE_M5 <- RMSE(edx_test$rating, predictions_ma)
#Table the results
results_table <-tibble(Model_Type = c("NRMSE", "Median_Model", "Movie Effects",
"Movie & User Effects",
"User, Movie & Movie Age Effects"),
RMSE = c(NRMSE_M1, MM_M2, MEM_M3, UEM_M4, UMMAE_M5)) %>%
mutate(RMSE = sprintf("%0.4f", RMSE))
results_table
The User, Movie & Movie Age Effects Model shows a marginal improvement, with an RMSE of 0.8643.
This model builds a function using sapply to begin regularization: adding a tuning parameter lambda \(\lambda\) to minimize the RMSE. The penalty shrinks the \(b_i\) and \(b_u\) estimates for movies and users with very few ratings, reducing the influence of these noisy outliers on the recommendation system.
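For a movie \(i\) with \(n_i\) ratings, the regularized movie effect computed below is
\[\hat{b}_{i}(\lambda) = \frac{1}{n_{i} + \lambda}\sum_{u=1}^{n_{i}}(y_{u,i} - \hat{\mu})\]
so estimates based on very few ratings are shrunk toward zero, while estimates based on many ratings are barely affected; the user effect \(b_u\) is penalized analogously.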
lambdasR <- seq(0, 10, 1)
RMSES <- sapply(lambdasR, function(l){
  edx_train_mu <- mean(edx_train$rating)
  b_i <- edx_train %>%
    group_by(movieId) %>%
    summarize(b_i = sum(rating - edx_train_mu)/(n() + l))
  b_u <- edx_train %>%
    left_join(b_i, by = "movieId") %>%
    group_by(userId) %>%
    summarize(b_u = sum(rating - b_i - edx_train_mu)/(n() + l))
  predicted_ratings <- edx_test %>%
    left_join(b_i, by = "movieId") %>%
    left_join(b_u, by = "userId") %>%
    mutate(pred = edx_train_mu + b_i + b_u) %>% .$pred
  return(RMSE(predicted_ratings, edx_test$rating))
})
#Determine which lambda minimizes the RMSE
lambda <- lambdasR[which.min(RMSES)]
lambda
## [1] 5
A lambda of 5 is shown to be the best tune.
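A quick plot of the RMSEs against the candidate lambdas (a sketch, not part of the original output) can confirm the minimum:
#Sketch, not the original code: RMSE as a function of lambda
tibble(lambda = lambdasR, RMSE = RMSES) %>%
  ggplot(aes(lambda, RMSE)) +
  geom_point() +
  labs(x = "Lambda", y = "RMSE")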
Then check the prediction against the test set to determine the RMSE and table the results.
b_i <- edx_train %>%
group_by(movieId) %>%
summarize(b_i = sum(rating - edx_train_mu)/(n()+lambda))
b_u <- edx_train %>%
  left_join(b_i, by = "movieId") %>%
  group_by(userId) %>%
  summarize(b_u = sum(rating - b_i - edx_train_mu)/(n() + lambda))
reg_prediction <- edx_test %>%
  left_join(b_i, by = "movieId") %>%
  left_join(b_u, by = "userId") %>%
  mutate(predictions = edx_train_mu + b_i + b_u) %>% .$predictions
UMEM_REG_M6 <- RMSE(edx_test$rating, reg_prediction)
#Table the Results
results_table <-tibble(Model_Type = c("NRMSE", "Median_Model", "Movie Effects",
"Movie & User Effects",
"Movie, User & Movie Age Effects",
"Movie & User Effects w/Regularization"),
RMSE = c(NRMSE_M1, MM_M2, MEM_M3, UEM_M4,
UMMAE_M5, UMEM_REG_M6)) %>%
mutate(RMSE = sprintf("%0.6f", RMSE))
results_table
The Regularized Movie and User Effects Model further reduces the RMSE, to 0.8641.
This model adds movie age effects to the regularized model, once again using sapply with a tuning parameter lambda \(\lambda\) to minimize the RMSE.
lambdasM <- seq(0, 10, 1)
RMSES2 <- sapply(lambdasM, function(l){
  edx_train_mu <- mean(edx_train$rating)
  b_i <- edx_train %>%
    group_by(movieId) %>%
    summarize(b_i = sum(rating - edx_train_mu)/(n() + l))
  b_u <- edx_train %>%
    left_join(b_i, by = "movieId") %>%
    group_by(userId) %>%
    summarize(b_u = sum(rating - b_i - edx_train_mu)/(n() + l))
  b_a <- edx_train %>%
    left_join(b_i, by = "movieId") %>% left_join(b_u, by = "userId") %>%
    group_by(MovieAge) %>%
    summarize(b_a = sum(rating - b_i - b_u - edx_train_mu)/(n() + l))
  predicted_ratings <- edx_test %>%
    left_join(b_i, by = "movieId") %>%
    left_join(b_u, by = "userId") %>%
    left_join(b_a, by = "MovieAge") %>%
    mutate(predictions = edx_train_mu + b_i + b_u + b_a) %>% .$predictions
  return(RMSE(predicted_ratings, edx_test$rating))
})
lambda2 <- lambdasM[which.min(RMSES2)]
lambda2
## [1] 5
A lambda of 5 is once again shown to be the best tune for this model.
Then check the prediction against the test set to determine the RMSE and table the results.
b_i <- edx_train %>%
group_by(movieId) %>%
summarize(b_i = sum(rating - edx_train_mu)/(n()+lambda2))
b_u <- edx_train %>%
  left_join(b_i, by = "movieId") %>%
  group_by(userId) %>%
  summarize(b_u = sum(rating - b_i - edx_train_mu)/(n() + lambda2))
b_a <- edx_train %>%
  left_join(b_i, by = "movieId") %>% left_join(b_u, by = "userId") %>%
  group_by(MovieAge) %>%
  summarize(b_a = sum(rating - b_i - b_u - edx_train_mu)/(n() + lambda2))
reg_prediction2 <- edx_test %>%
  left_join(b_i, by = "movieId") %>%
  left_join(b_u, by = "userId") %>%
  left_join(b_a, by = "MovieAge") %>%
  mutate(pred = edx_train_mu + b_i + b_u + b_a) %>%
  pull(pred)
UMMAE_REG_M7 <- RMSE(edx_test$rating, reg_prediction2)
#Table the Results
results_table <-tibble(Model_Type = c("NRMSE", "Median_Model", "Movie Effects",
"Movie & User Effects",
"User, Movie & Movie Age Effects",
"Movie & User Effects w/Regularization",
"User, Movie & Movie Age Effects w/Regularization"),
RMSE = c(NRMSE_M1, MM_M2, MEM_M3, UEM_M4,
UMMAE_M5, UMEM_REG_M6, UMMAE_REG_M7)) %>%
mutate(RMSE = sprintf("%0.5f", RMSE))
results_table
The final model edges out all previous models with an RMSE of 0.86384, the lowest so far.
Now we will move on to using the edx & validation sets to confirm our final model achieves an RMSE below 0.8649.
The benchmark computes the naive RMSE from the mean of the edx dataset, evaluated on the validation set.
edx_mu <- mean(edx$rating)
FRMSE_M1 <- RMSE(validation$rating, edx_mu)
#Table the Results
results_table <- tibble(Model_Type = "NRMSE",
                        Final_RMSE_Validation = FRMSE_M1) %>%
  mutate(Final_RMSE_Validation = sprintf("%0.5f", Final_RMSE_Validation))
results_table
The Benchmarking Mean Model with validation demonstrates an RMSE of about 1.06.
This model is based on the median of the edx dataset, evaluated on the validation set.
edx_med <- median(edx$rating)
FRMSE_M2 <- RMSE(validation$rating, edx_med)
#Table the Results
results_table <-tibble(Model_Type = c("NRMSE", "Median_Model"),
Final_RMSE_Validation = c(FRMSE_M1, FRMSE_M2)) %>%
mutate(Final_RMSE_Validation = sprintf("%0.5f", Final_RMSE_Validation))
results_table
The Median Model demonstrates an RMSE of about 1.16.
bi <- edx %>% group_by(movieId) %>%
summarize(b_i = mean(rating - edx_mu))
#Prediction
prediction_bi <- edx_mu + validation %>%
  left_join(bi, by = "movieId") %>% .$b_i
FRMSE_M3 <- RMSE(validation$rating, prediction_bi)
#Table the Results
results_table <-tibble(Model_Type = c("NRMSE", "Median_Model", "Movie Effects"),
Final_RMSE_Validation = c(FRMSE_M1, FRMSE_M2, FRMSE_M3)) %>%
mutate(Final_RMSE_Validation = sprintf("%0.5f", Final_RMSE_Validation))
results_table
The Movie Effects Model improves upon the Benchmarking Model with an RMSE of 0.9439.
bu <- edx %>% left_join(bi, by = "movieId") %>% group_by(userId) %>%
  summarize(b_u = mean(rating - edx_mu - b_i))
#Prediction
prediction_bu <- validation %>% left_join(bi, by = "movieId") %>%
  left_join(bu, by = "userId") %>%
  mutate(predictions = edx_mu + b_i + b_u) %>% .$predictions
FRMSE_M4 <- RMSE(validation$rating, prediction_bu)
#Table the Results
results_table <-tibble(Model_Type = c("NRMSE", "Median_Model", "Movie Effects",
"Movie & User Effects"),
Final_RMSE_Validation = c(FRMSE_M1, FRMSE_M2, FRMSE_M3,
FRMSE_M4)) %>%
mutate(Final_RMSE_Validation = sprintf("%0.5f", Final_RMSE_Validation))
results_table
The Movie & User Effects Model lowers the validation RMSE to about 0.8654. The next model combines User, Movie & Movie Age Effects.
ba <- edx %>%
left_join(bi, by = "movieId") %>% left_join(bu, by = "userId") %>%
group_by(MovieAge) %>% summarize(b_a = mean(rating - b_i - b_u - edx_mu))
#Prediction
predictions_ma <- validation %>%
left_join(bi, by = "movieId") %>% left_join(bu, by = "userId") %>%
left_join(ba, by = "MovieAge") %>% mutate(predictions = edx_mu + b_i + b_u + b_a) %>%
.$predictions
FRMSE_M5 <-RMSE(validation$rating, predictions_ma)
#Table the Results
results_table <-tibble(Model_Type = c("NRMSE", "Median_Model", "Movie Effects",
"Movie & User Effects",
"Movie, User, & Movie Age Effects"),
Final_RMSE_Validation = c(FRMSE_M1, FRMSE_M2, FRMSE_M3,
FRMSE_M4, FRMSE_M5)) %>%
mutate(Final_RMSE_Validation = sprintf("%0.5f", Final_RMSE_Validation))
results_table
Adding Movie Age Effects further improves upon the Movie & User Effects Model, with an RMSE of 0.8650.
A lambda of 5 was shown to be the best tune for the train and test sets of this regularized model and therefore will be used on the validation set.
lambda
## [1] 5
#Movie & User Effects Model with Regularization using the validation set
b_i <- edx %>%
  group_by(movieId) %>%
  summarize(b_i = sum(rating - edx_mu)/(n() + lambda))
b_u <- edx %>%
  left_join(b_i, by = "movieId") %>%
  group_by(userId) %>%
  summarize(b_u = sum(rating - b_i - edx_mu)/(n() + lambda))
reg_prediction <- validation %>%
  left_join(b_i, by = "movieId") %>%
  left_join(b_u, by = "userId") %>%
  mutate(predictions = edx_mu + b_i + b_u) %>% .$predictions
FRMSE_M6 <- RMSE(validation$rating, reg_prediction)
#Table the Results
results_table <-tibble(Model_Type = c("NRMSE", "Median_Model", "Movie Effects",
"Movie & User Effects",
"Movie, User, & Movie Age Effects",
"Movie & User Effects w/Regularization"),
Final_RMSE_Validation = c(FRMSE_M1, FRMSE_M2, FRMSE_M3,
FRMSE_M4, FRMSE_M5,
FRMSE_M6)) %>%
mutate(Final_RMSE_Validation = sprintf("%0.5f",
Final_RMSE_Validation))
results_table
This Regularized Model achieves an RMSE of 0.86482, but let’s see if we can do better with our Final Validation Model.
A lambda of 5 was also shown to be the best tune for the train and test sets of this regularized model and therefore will be used on the validation set.
lambda2
## [1] 5
b_i <- edx %>%
group_by(movieId) %>%
summarize(b_i = sum(rating - edx_mu)/(n()+lambda2))
b_u <- edx %>%
  left_join(b_i, by = "movieId") %>%
  group_by(userId) %>%
  summarize(b_u = sum(rating - b_i - edx_mu)/(n() + lambda2))
b_a <- edx %>%
  left_join(b_i, by = "movieId") %>% left_join(b_u, by = "userId") %>%
  group_by(MovieAge) %>%
  summarize(b_a = sum(rating - b_i - b_u - edx_mu)/(n() + lambda2))
reg_prediction2 <- validation %>%
  left_join(b_i, by = "movieId") %>%
  left_join(b_u, by = "userId") %>%
  left_join(b_a, by = "MovieAge") %>%
  mutate(predictions = edx_mu + b_i + b_u + b_a) %>% .$predictions
FRMSE_M7 <- RMSE(validation$rating, reg_prediction2)
#Table the Results
results_table <-tibble(Model_Type = c("NRMSE", "Median_Model", "Movie Effects",
"Movie & User Effects",
"Movie, User, & Movie Age Effects",
"Movie & User Effects w/Regularization",
"Movie, User & Movie Age Effects w/Regularization"),
Final_RMSE_Validation = c(FRMSE_M1, FRMSE_M2, FRMSE_M3,
FRMSE_M4, FRMSE_M5,
FRMSE_M6, FRMSE_M7)) %>%
mutate(Final_RMSE_Validation = sprintf("%0.5f",
Final_RMSE_Validation))
results_table
results_table <-tibble(Model_Type = c("NRMSE", "Median_Model", "Movie Effects",
"Movie & User Effects",
"Movie, User & Movie Age Effects",
"Movie & User Effects w/Regularization",
"User, Movie & Movie Age Effects w/Regularization"),
RMSE = c(NRMSE_M1, MM_M2, MEM_M3, UEM_M4,
UMMAE_M5, UMEM_REG_M6, UMMAE_REG_M7),
Final_RMSE_Validation = c(FRMSE_M1, FRMSE_M2,
FRMSE_M3, FRMSE_M4,
FRMSE_M5, FRMSE_M6,
FRMSE_M7)) %>%
mutate(Final_RMSE_Validation = sprintf("%0.5f",
Final_RMSE_Validation)) %>%
mutate(RMSE = sprintf("%0.5f", RMSE))
results_table
results_table %>% knitr::kable()
| Model_Type | RMSE | Final_RMSE_Validation |
|---|---|---|
| NRMSE | 1.06005 | 1.06120 |
| Median_Model | 1.16676 | 1.16802 |
| Movie Effects | 0.94296 | 0.94391 |
| Movie & User Effects | 0.86468 | 0.86535 |
| Movie, User & Movie Age Effects | 0.86433 | 0.86500 |
| Movie & User Effects w/Regularization | 0.86414 | 0.86482 |
| User, Movie & Movie Age Effects w/Regularization | 0.86384 | 0.86452 |
The lowest RMSE using the validation set is the Final Validation Model featuring Regularized User, Movie & Movie Age Effects. It significantly improves upon the Benchmarking Model’s RMSE of 1.06.