Recommender System for Movies.

This is a simple recommender system for movies. GroupLens Research has collected and made available rating data sets from the MovieLens web site (http://movielens.org). The data sets were collected over various periods of time. The selected dataset has ~100K movie ratings (1-5) from ~1000 users on ~1700 movies. Training and testing data sets are prepared and available already with an 80/20 split proportion

Data collection, sampling and preparation
# Loading datasets, Package Installation

train_base <- read.table('https://raw.githubusercontent.com/humbertohpgit/MSDS3rdSem_DATA612/master/ua_train.txt')
test_base <- read.table('https://raw.githubusercontent.com/humbertohpgit/MSDS3rdSem_DATA612/master/ua_test.txt')
item_base <- read.delim('https://raw.githubusercontent.com/humbertohpgit/MSDS3rdSem_DATA612/master/item.txt', sep = '|')
user_base <- read.delim('https://raw.githubusercontent.com/humbertohpgit/MSDS3rdSem_DATA612/master/user.txt', sep = '|')

#install.packages("tidyverse")
library(dplyr)
library(tidyr)
# Complementing, Filtering & Sampling datasets

#Train data set

item_tmp <- item_base[,1:3]
item <- sample_n(item_tmp, 100, replace = FALSE) 
colnames(item) <- c("MovID", "MovTitle", "ReleaseDT")
user_tmp <- user_base[,1:3]
user <- sample_n(user_tmp, 100, replace = FALSE) 
colnames(user) <- c("UserID", "Age", "Gender")
train_tmp <- train_base
colnames(train_tmp) <- c("UserID", "ItemID", "Rating", "TS")
train_tmp <- inner_join(train_tmp, user, by = "UserID")
train_tmp <- inner_join(train_tmp, item, by = c("ItemID" = "MovID"))
train_tmp <- train_tmp[,c(1:3)]

train <- sample_frac(train_tmp, 0.1, replace = FALSE) 
train
##     UserID ItemID Rating
## 142    276   1244      3
## 114    276     76      4
## 5       10    432      4
## 174    303    117      3
## 176    303    181      5
## 393    592    885      2
## 465    815    136      5
## 447    776    444      2
## 453    807    181      5
## 54     108    255      2
## 149    290    180      1
## 419    646    328      3
## 186    303    421      4
## 541    934    190      4
## 208    307    736      3
## 61     150    319      4
## 327    498    181      2
## 296    478     65      4
## 58     146    340      4
## 219    346    117      4
## 369    574    312      4
## 88     185    845      4
## 31      44    432      5
## 1       10    156      4
## 338    524    180      4
## 57     146    328      3
## 73     174     63      4
## 230    350    181      4
## 113    276     65      4
## 177    303    215      5
## 536    919    937      4
## 314    493    693      4
## 86     175    496      5
## 446    776    432      1
## 522    907   1051      5
## 87     185    181      4
## 178    303    219      5
## 294    454    705      3
## 65     160    544      4
## 481    823    215      4
## 267    381   1018      4
## 13      21    219      5
## 431    722    845      5
## 274    388    845      4
## 299    478    340      5
## 228    346   1222      4
## 128    276    496      4
## 91     217    117      4
## 254    380    181      3
## 207    307    505      3
## 71     171    340      3
## 313    493    647      4
## 505    868    181      5
## 117    276    180      5
#Test data set

test_tmp <- test_base
colnames(test_tmp) <- c("UserID", "ItemID", "Rating", "TS")

user_train <- inner_join(train, user, by = "UserID")
user_train <- data.frame(user_train[,1])
colnames(user_train) <- c("UserID")

item_train <- inner_join(train, item, by = c("ItemID" = "MovID"))
item_train <- data.frame(item_train[,2])
colnames(item_train) <- c("ItemID")

test_tmp <- semi_join(test_tmp, user_train, by = "UserID")
test_tmp <- semi_join(test_tmp, item_train, by = "ItemID")
test_tmp <- test_tmp[,c(1:3)]

test <- test_tmp # no sampling due to limited records found in test set (sample_frac(test_tmp, 0.5, replace = FALSE))
test
##    UserID ItemID Rating
## 1     108    181      3
## 2     150    181      5
## 3     160    117      4
## 4     174    937      5
## 5     350    340      4
## 6     381    647      4
## 7     388    117      5
## 8     454    181      3
## 9     646    319      3
## 10    776    496      3
## 11    807    117      4
## 12    907    340      2
Calculate Raw Average and corresponding RMSE
#User-Item Matrix (Train)
train_matrix <- spread(train[,1:3], ItemID, Rating)
train_matrix <- train_matrix[,-1] #discard UserID

#User-Item Matrix (Test)
test_matrix <- spread(test[,1:3], ItemID, Rating)

# Raw Average
mov_raw_avg <- round(mean(as.matrix(train_matrix), na.rm = TRUE),1)
mov_raw_avg
## [1] 3.8
# validating raw average
#sum(as.matrix(train_matrix), na.rm = TRUE) / sum(!is.na(train_matrix))

# RMSE - Test
test_ratings <- test[,3]
rsq_diff <- c()
for (i in test_ratings){
  rsq_diff[length(rsq_diff)+1] <- (i-mov_raw_avg)^2
}

rmse_raw_test <- sqrt(mean(rsq_diff))
rmse_raw_test
## [1] 0.9255629
# RMSE - Train
train_ratings <- train[,3]
rsq_diff <- c()
for (i in train_ratings){
  rsq_diff[length(rsq_diff)+1] <- (i-mov_raw_avg)^2
}

rmse_raw_train <- sqrt(mean(rsq_diff))
rmse_raw_train
## [1] 1.00148
Calculate Baseline Predictor and corresponding RMSE
# TRAIN

# Calculate User and Movie Biases
train_matrix <- spread(train[,1:3], ItemID, Rating)
rownames(train_matrix) <- train_matrix[,1] #make UserIDs as rownames
train_matrix <- train_matrix[,-1] #discard UserID
user_mean <- round(rowMeans(as.matrix(train_matrix), na.rm = TRUE),1)
user_bias <- user_mean-mov_raw_avg
movie_mean <- round(colMeans(as.matrix(train_matrix), na.rm = TRUE),1)
movie_bias <- movie_mean-mov_raw_avg

# Calculating Baseline Predictor Matrix

k <- length(movie_bias)
n <- length(user_bias)
bsln_pred <- matrix(, nrow = n, ncol = k)
for(j in 1:k){
   for(i in 1:n){
        bsln_pred[i,j] = mov_raw_avg + user_bias[i] + movie_bias[j]
        if (bsln_pred[i,j] < 1){
          bsln_pred[i,j] <- 1
        }
        if (bsln_pred[i,j] > 5){
          bsln_pred[i,j] <- 5
        }
        
   }
}

colnames(bsln_pred) <- names(movie_bias)
rownames(bsln_pred) <- names(user_bias)

# RMSE - Train

rsq_diff <- (train_matrix - bsln_pred)^2

rmse_bsln_pred_train <- sqrt(mean(as.matrix(rsq_diff), na.rm = TRUE))
rmse_bsln_pred_train
## [1] 0.5053418
## TEST

test_matrix <- spread(test[,1:3], ItemID, Rating)
rownames(test_matrix) <- test_matrix[,1] #make UserIDs as rownames
test_matrix <- test_matrix[,-1] #discard UserID
# Subsetting baseline predictor matrix for only the Users & Items present in the Test Matrix
user_bias_test <- user_bias[as.character(sort(unique(test$UserID)))]
movie_bias_test <- movie_bias[as.character(sort(unique(test$ItemID)))]

# Calculating Baseline Predictor Matrix

k <- length(movie_bias_test)
n <- length(user_bias_test)
bsln_pred_test <- matrix(, nrow = n, ncol = k)
for(j in 1:k){
   for(i in 1:n){
        bsln_pred_test[i,j] = mov_raw_avg + user_bias_test[i] + movie_bias_test[j]
        if (bsln_pred_test[i,j] < 1){
          bsln_pred[i,j] <- 1
        }
        if (bsln_pred_test[i,j] > 5){
          bsln_pred_test[i,j] <- 5
        }
        
   }
}

colnames(bsln_pred_test) <- names(movie_bias_test)
rownames(bsln_pred_test) <- names(user_bias_test)

# RMSE - Test

rsq_diff_test <- (test_matrix - bsln_pred_test)^2

rmse_bsln_pred_test <- sqrt(mean(as.matrix(rsq_diff_test), na.rm = TRUE))
rmse_bsln_pred_test
## [1] 1.070436
Conclusion
Comparing RMSEs for the 2 solution approaches
# Raw Averages for Train and Test sets:

rmse_raw_train
## [1] 1.00148
rmse_raw_test
## [1] 0.9255629
# Baseline Predictors for Train and Test sets:

rmse_bsln_pred_train
## [1] 0.5053418
rmse_bsln_pred_test
## [1] 1.070436
# Improvement Percentages:

train_improv <- (1- (rmse_bsln_pred_train/rmse_raw_train))*100
train_improv
## [1] 49.54052
test_improv <- (1- (rmse_bsln_pred_test/rmse_raw_test))*100
test_improv
## [1] -15.65244

On average, a baseline predictor approach provides a considerable improvement compared to a raw average approach, it represents a more informed process to assign/predict a specific rating for a User/Item pair. The above statement was proven completely right for the Train set (~50%+ improvement); for the Test set however, there was a negative improvement primarily due to the sparcity of the test set (and the randomness of my sampling on top to make it simpler to validate by hand) which did not allow for more precise biases calculations (User & Item). I prefered to use a more realistic data set (Movilens) rather to creating a synthethic one precisely because I wanted to get a feel of how the processing and actual RMSE calculations were going to be with real data.