library(dplyr)
library(tidyr)
library(kableExtra)
library(scales)
This simple system recommends how a specific user will rate a movie given the user’s past rating and how the overall movie has been rated
Load the the ratings and movie title datasets
ratings_import <- read.csv("https://raw.githubusercontent.com/dhairavc/DATA612-RecommenderSystems/master/Project1/ratings.csv")[,1:3]
movie_titles <- read.csv("https://raw.githubusercontent.com/dhairavc/DATA612-RecommenderSystems/master/Project1/movies.csv")[,1:2]
For educational purposes create a smaller dense dataset of 5 randomly selected movies with the most reviews and 12 random users with a max 2 movies not rated per user
# Create a subset of random 5 movies which have grather than 150 reviews
m_filter <- ratings_import %>% select(movieId) %>% group_by(movieId) %>% summarise("RatingCount" = n()) %>% arrange(desc(RatingCount)) %>% filter(RatingCount > 150)
m_filter <- m_filter[sample(nrow(m_filter), 5),]
# Create subset of 12 random users which have rated atleast 3 of the 5 movies selected
u_filter <- ratings_import %>% filter(movieId %in% m_filter$movieId) %>% select(userId) %>% group_by(userId) %>% summarise("Rated" = n()) %>% arrange(desc(Rated)) %>% filter(Rated %in% c(3,4,5))
rating_sample <- ratings_import %>% filter(movieId %in% m_filter$movieId) %>% filter(userId %in% sample(u_filter$userId,12))
# Join movie titles and remove movie Ids
rating_sample <- left_join(rating_sample, movie_titles, by = "movieId")
rating_sample <- rating_sample %>% select(-movieId)
ratings <- spread(rating_sample, title, rating)
row.names(ratings) <- c(ratings$userId)
ratings <- ratings[,2:6]
ratings %>% kable() %>% kable_styling(bootstrap_options = "striped", full_width = F)
| Men in Black (a.k.a. MIB) (1997) | Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981) | Saving Private Ryan (1998) | Schindler’s List (1993) | Star Wars: Episode V - The Empire Strikes Back (1980) | |
|---|---|---|---|---|---|
| 17 | NA | 4.5 | 4.5 | 4.5 | 5.0 |
| 18 | 3.5 | 4.0 | 4.0 | 4.5 | 4.0 |
| 45 | 3.0 | 4.0 | 5.0 | NA | 5.0 |
| 195 | NA | NA | 3.0 | 4.0 | 4.0 |
| 199 | 4.0 | 4.0 | 4.0 | 5.0 | 3.5 |
| 266 | 3.0 | 5.0 | 4.0 | NA | 5.0 |
| 274 | 3.0 | 4.0 | 4.0 | 4.0 | 4.5 |
| 282 | 4.0 | NA | 4.5 | 5.0 | NA |
| 354 | 3.5 | NA | 3.5 | 4.0 | 4.0 |
| 494 | NA | 5.0 | 5.0 | NA | 5.0 |
| 570 | 4.0 | 4.0 | 4.0 | 4.5 | 4.0 |
| 600 | NA | 4.0 | NA | 3.0 | 4.0 |
Split the existing dataset into training and testing datasets
n <- nrow(ratings)
p_train <- .6
rowsintrain <- floor(n*p_train)
randomize_order <- ratings[sample(n),]
train_ratings <- randomize_order[1:rowsintrain,]
test_ratings <- randomize_order[(rowsintrain+1):n,]
#training dataset
train_ratings %>% kable() %>% kable_styling(bootstrap_options = "striped", full_width = F)
| Men in Black (a.k.a. MIB) (1997) | Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981) | Saving Private Ryan (1998) | Schindler’s List (1993) | Star Wars: Episode V - The Empire Strikes Back (1980) | |
|---|---|---|---|---|---|
| 18 | 3.5 | 4.0 | 4.0 | 4.5 | 4.0 |
| 17 | NA | 4.5 | 4.5 | 4.5 | 5.0 |
| 274 | 3.0 | 4.0 | 4.0 | 4.0 | 4.5 |
| 354 | 3.5 | NA | 3.5 | 4.0 | 4.0 |
| 600 | NA | 4.0 | NA | 3.0 | 4.0 |
| 195 | NA | NA | 3.0 | 4.0 | 4.0 |
| 570 | 4.0 | 4.0 | 4.0 | 4.5 | 4.0 |
#testing dataset
test_ratings %>% kable() %>% kable_styling(bootstrap_options = "striped", full_width = F)
| Men in Black (a.k.a. MIB) (1997) | Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981) | Saving Private Ryan (1998) | Schindler’s List (1993) | Star Wars: Episode V - The Empire Strikes Back (1980) | |
|---|---|---|---|---|---|
| 282 | 4 | NA | 4.5 | 5 | NA |
| 199 | 4 | 4 | 4.0 | 5 | 3.5 |
| 266 | 3 | 5 | 4.0 | NA | 5.0 |
| 45 | 3 | 4 | 5.0 | NA | 5.0 |
| 494 | NA | 5 | 5.0 | NA | 5.0 |
Compute the raw average mean
raw_average <- mean(as.matrix(train_ratings), na.rm = TRUE)
raw_average
## [1] 3.982759
Create function to calculate the Root Mean Squared Error (RMSE)
RMSE_calc <- function(df, q) {
sqrt(mean(as.matrix(((df-q)^2)),na.rm = TRUE))
}
Calculate the RMSE RMSE for the raw average for both testing and training datasets
train_RMSE <- RMSE_calc(train_ratings, raw_average)
test_RMSE <- RMSE_calc(test_ratings, raw_average)
train_RMSE
## [1] 0.4639181
test_RMSE
## [1] 0.786298
Calculate user and movie bias and calculate the baseline predictions for the training dataset
user_bias <- rowMeans(train_ratings, na.rm = TRUE) - raw_average
movie_bias <- colMeans(train_ratings, na.rm = TRUE) - raw_average
train_ratings_pred <- data.frame(matrix(rep(raw_average, nrow(train_ratings)*ncol(train_ratings)), nrow = nrow(train_ratings)))
colnames(train_ratings_pred) <- colnames(train_ratings)
rownames(train_ratings_pred) <- rownames(train_ratings)
train_ratings_pred <-round(sweep(train_ratings_pred,MARGIN = 2, STATS = movie_bias, FUN = "+")+user_bias,1)
#clip [.5, 5]
train_ratings_pred[which(train_ratings_pred < .5, arr.ind = TRUE)] <- .5
train_ratings_pred[which(train_ratings_pred > 5, arr.ind = TRUE)] <- 5
train_ratings_pred %>% kable() %>% kable_styling(bootstrap_options = "striped", full_width = F)
| Men in Black (a.k.a. MIB) (1997) | Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981) | Saving Private Ryan (1998) | Schindler’s List (1993) | Star Wars: Episode V - The Empire Strikes Back (1980) | |
|---|---|---|---|---|---|
| 18 | 3.5 | 4.1 | 3.9 | 4.1 | 4.2 |
| 17 | 4.1 | 4.7 | 4.5 | 4.7 | 4.9 |
| 274 | 3.4 | 4.0 | 3.8 | 4.0 | 4.1 |
| 354 | 3.3 | 3.9 | 3.6 | 3.8 | 4.0 |
| 600 | 3.2 | 3.8 | 3.5 | 3.8 | 3.9 |
| 195 | 3.2 | 3.8 | 3.5 | 3.8 | 3.9 |
| 570 | 3.6 | 4.2 | 4.0 | 4.2 | 4.3 |
Calculate user bias and calculate the baseline predictions for the training dataset using the movie bias from the training dataset
user_bias_test <- rowMeans(test_ratings, na.rm = TRUE)-raw_average
test_ratings_pred <- data.frame(matrix(rep(raw_average, nrow(test_ratings)*ncol(test_ratings)), nrow = nrow(test_ratings)))
colnames(test_ratings_pred) <- colnames(test_ratings)
rownames(test_ratings_pred) <- rownames(test_ratings)
test_ratings_pred <-round(sweep(test_ratings_pred,MARGIN = 2, STATS = movie_bias, FUN = "+")+user_bias_test,1)
#clip [.5, 5]
test_ratings_pred[which(test_ratings_pred < .5, arr.ind = TRUE)] <- .5
test_ratings_pred[which(test_ratings_pred > 5, arr.ind = TRUE)] <- 5
test_ratings_pred %>% kable() %>% kable_styling(bootstrap_options = "striped", full_width = F)
| Men in Black (a.k.a. MIB) (1997) | Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981) | Saving Private Ryan (1998) | Schindler’s List (1993) | Star Wars: Episode V - The Empire Strikes Back (1980) | |
|---|---|---|---|---|---|
| 282 | 4.0 | 4.6 | 4.4 | 4.6 | 4.7 |
| 199 | 3.6 | 4.2 | 4.0 | 4.2 | 4.3 |
| 266 | 3.8 | 4.4 | 4.1 | 4.3 | 4.5 |
| 45 | 3.8 | 4.4 | 4.1 | 4.3 | 4.5 |
| 494 | 4.5 | 5.0 | 4.9 | 5.0 | 5.0 |
Calculate the RMSE for both predicted datasets
train_RMSE_baseline <- RMSE_calc(train_ratings, train_ratings_pred)
test_RMSE_baseline <- RMSE_calc(test_ratings, test_ratings_pred)
train_RMSE_baseline
## [1] 0.2710134
test_RMSE_baseline
## [1] 0.5015765
Summarize findings and calculate improvement over RMSE base just on the raw average
raw_avg <- c(raw_average, raw_average)
RMSE <- c(train_RMSE, test_RMSE)
RMSE_b <- c(train_RMSE_baseline, test_RMSE_baseline)
imp <- percent(round(1-(RMSE_b/RMSE),5))
sum_df <- data.frame(raw_avg, RMSE, RMSE_b, imp)
colnames(sum_df) <- c("Raw Average", "RMSE", "RMSE Baseline", "Improvement")
rownames(sum_df) <- c("Train Ratings", "Test Ratings")
sum_df %>% kable() %>% kable_styling(bootstrap_options = "striped", full_width = F)
| Raw Average | RMSE | RMSE Baseline | Improvement | |
|---|---|---|---|---|
| Train Ratings | 3.982759 | 0.4639181 | 0.2710134 | 41.58% |
| Test Ratings | 3.982759 | 0.7862980 | 0.5015765 | 36.21% |