Recommender System for Movies - Matrix Factorization Methods

GroupLens Research has collected and made available rating data sets from the MovieLens web site (http://movielens.org). The data sets were collected over various periods of time. The selected dataset has ~100K movie ratings (1-5) from ~600 users on ~9000 movies.

Users were selected at random for inclusion. All selected users had rated at least 20 movies. No demographic information is included. Each user is represented by an id, and no other information is provided.

The data are contained in the files links.csv, movies.csv, ratings.csv and tags.csv.

Data collection
# Loading datasets, Package Installation

movies <- read.csv('https://raw.githubusercontent.com/humbertohpgit/MSDS3rdSem_DATA612/master/movies.csv')
ratings <- read.csv('https://raw.githubusercontent.com/humbertohpgit/MSDS3rdSem_DATA612/master/ratings.csv')


#install.packages("tidyverse")
library(dplyr)
library(tidyr)
Recommender based on SVD approximation with column-mean imputation
# Data Exploration

summary(ratings)
##      userId         movieId           rating        timestamp        
##  Min.   :  1.0   Min.   :     1   Min.   :0.500   Min.   :8.281e+08  
##  1st Qu.:177.0   1st Qu.:  1199   1st Qu.:3.000   1st Qu.:1.019e+09  
##  Median :325.0   Median :  2991   Median :3.500   Median :1.186e+09  
##  Mean   :326.1   Mean   : 19435   Mean   :3.502   Mean   :1.206e+09  
##  3rd Qu.:477.0   3rd Qu.:  8122   3rd Qu.:4.000   3rd Qu.:1.436e+09  
##  Max.   :610.0   Max.   :193609   Max.   :5.000   Max.   :1.538e+09
head(ratings)
##   userId movieId rating timestamp
## 1      1       1      4 964982703
## 2      1       3      4 964981247
## 3      1       6      4 964982224
## 4      1      47      5 964983815
## 5      1      50      5 964982931
## 6      1      70      3 964982400
summary(movies)
##     movieId                                          title     
##  Min.   :     1   Confessions of a Dangerous Mind (2002):   2  
##  1st Qu.:  3248   Emma (1996)                           :   2  
##  Median :  7300   Eros (2004)                           :   2  
##  Mean   : 42200   Saturn 3 (1980)                       :   2  
##  3rd Qu.: 76232   War of the Worlds (2005)              :   2  
##  Max.   :193609   '71 (2014)                            :   1  
##                   (Other)                               :9731  
##             genres    
##  Drama         :1053  
##  Comedy        : 946  
##  Comedy|Drama  : 435  
##  Comedy|Romance: 363  
##  Drama|Romance : 349  
##  Documentary   : 339  
##  (Other)       :6257
head(movies)
##   movieId                              title
## 1       1                   Toy Story (1995)
## 2       2                     Jumanji (1995)
## 3       3            Grumpier Old Men (1995)
## 4       4           Waiting to Exhale (1995)
## 5       5 Father of the Bride Part II (1995)
## 6       6                        Heat (1995)
##                                        genres
## 1 Adventure|Animation|Children|Comedy|Fantasy
## 2                  Adventure|Children|Fantasy
## 3                              Comedy|Romance
## 4                        Comedy|Drama|Romance
## 5                                      Comedy
## 6                       Action|Crime|Thriller
# Build a user matrix with movies as columns

rating_mat <- spread(ratings[,1:3], movieId, rating)
rating_mat <- as.matrix(rating_mat[,-1]) #remove userIds

library(recommenderlab)

#Convert into a recommenderlab sparse matrix
rating_mat <- as(rating_mat, "realRatingMatrix") 

#Exploring parameters of recommendation models
recommender_models <- recommenderRegistry$get_entries(dataType = "realRatingMatrix")
names(recommender_models)
## [1] "ALS_realRatingMatrix"          "ALS_implicit_realRatingMatrix"
## [3] "IBCF_realRatingMatrix"         "POPULAR_realRatingMatrix"     
## [5] "RANDOM_realRatingMatrix"       "RERECOMMEND_realRatingMatrix" 
## [7] "SVD_realRatingMatrix"          "SVDF_realRatingMatrix"        
## [9] "UBCF_realRatingMatrix"
lapply(recommender_models, "[[", "description")
## $ALS_realRatingMatrix
## [1] "Recommender for explicit ratings based on latent factors, calculated by alternating least squares algorithm."
## 
## $ALS_implicit_realRatingMatrix
## [1] "Recommender for implicit data based on latent factors, calculated by alternating least squares algorithm."
## 
## $IBCF_realRatingMatrix
## [1] "Recommender based on item-based collaborative filtering."
## 
## $POPULAR_realRatingMatrix
## [1] "Recommender based on item popularity."
## 
## $RANDOM_realRatingMatrix
## [1] "Produce random recommendations (real ratings)."
## 
## $RERECOMMEND_realRatingMatrix
## [1] "Re-recommends highly rated items (real ratings)."
## 
## $SVD_realRatingMatrix
## [1] "Recommender based on SVD approximation with column-mean imputation."
## 
## $SVDF_realRatingMatrix
## [1] "Recommender based on Funk SVD with gradient descend."
## 
## $UBCF_realRatingMatrix
## [1] "Recommender based on user-based collaborative filtering."
#SVD Parameters
recommender_models$SVD_realRatingMatrix$parameters
## $k
## [1] 10
## 
## $maxiter
## [1] 100
## 
## $normalize
## [1] "center"
#Determine similarity between users (first 4)
similarity_users <- similarity(rating_mat[1:4, ], method = "cosine", which = "users")
as.matrix(similarity_users)
##           1  2         3         4
## 1 0.0000000  1 0.7919033 0.9328096
## 2 1.0000000  0        NA 1.0000000
## 3 0.7919033 NA 0.0000000 1.0000000
## 4 0.9328096  1 1.0000000 0.0000000
image(as.matrix(similarity_users), main = "User similarity")

#Determine similarity between items (movies) (first 4)
similarity_items <- similarity(rating_mat[, 1:4], method = "cosine", which ="items")
as.matrix(similarity_items)
##           1         2         3         4
## 1 0.0000000 0.9644641 0.9715415 0.9838699
## 2 0.9644641 0.0000000 0.9389013 0.9609877
## 3 0.9715415 0.9389013 0.0000000 1.0000000
## 4 0.9838699 0.9609877 1.0000000 0.0000000
image(as.matrix(similarity_items), main = "Movies similarity")

#Explore ratings distribution
vector_ratings <- as.vector(rating_mat@data)
table(vector_ratings)
## vector_ratings
##       0     0.5       1     1.5       2     2.5       3     3.5       4 
## 5830804    1370    2811    1791    7551    5550   20047   13136   26818 
##     4.5       5 
##    8551   13211
#Explore movie performance
views_per_movie <- colCounts(rating_mat) # count views for each movie
table_views <- data.frame(movie = names(views_per_movie),views = views_per_movie)
table_views <- table_views[order(table_views$views, decreasing = TRUE), ] # sort by number of views
table_views$title <- NA

for (i in 1:nrow(table_views)){
  table_views[i,3] <- as.character(subset(movies, movies$movieId == table_views[i,1])$title)
}
head(table_views)
##      movie views                                     title
## 356    356   329                       Forrest Gump (1994)
## 318    318   317          Shawshank Redemption, The (1994)
## 296    296   307                       Pulp Fiction (1994)
## 593    593   279          Silence of the Lambs, The (1991)
## 2571  2571   278                        Matrix, The (1999)
## 260    260   251 Star Wars: Episode IV - A New Hope (1977)
#Consider only movies with total of views higher than 50 views
average_ratings <- colMeans(rating_mat)
average_ratings_relevant <- average_ratings[views_per_movie > 50] 
# Out of 9700+ movies, only 436 have more thab 50 views
length(average_ratings_relevant)
## [1] 436
#Focus on a more relevant set of ratings with the following constraints: Minimum of 50 users per rates movie and 50 views per movie.

ratings_relevant <- rating_mat[rowCounts(rating_mat) > 50,            colCounts(rating_mat) > 50]

ratings_relevant
## 378 x 436 rating matrix of class 'realRatingMatrix' with 36214 ratings.
vector_ratings_relevant <- as.vector(ratings_relevant@data)
table(vector_ratings_relevant)
## vector_ratings_relevant
##      0    0.5      1    1.5      2    2.5      3    3.5      4    4.5 
## 128594    322    694    367   1833   1479   6279   4605  10552   3742 
##      5 
##   6341
#Normalize data

#Defining Train and Test data sets

train_filter <- sample(x = c(TRUE, FALSE), size = nrow(ratings_relevant),replace = TRUE, prob = c(0.8, 0.2))

train_ratings <- as(ratings_relevant[train_filter, ], "realRatingMatrix") 
test_ratings <- as(ratings_relevant[!train_filter, ], "realRatingMatrix")

#Normalize data
train_ratings <- normalize(train_ratings)
test_ratings <- normalize(test_ratings)

#Create Recommender Model. Based on SVD approximation with column-mean imputation

recommender_model <- Recommender(train_ratings, method = "SVD", param=list(k=10,maxiter=100,normalize="center"))


# Top 10 recommendations for users (1-10)
recom <- predict(recommender_model, newdata=test_ratings, n=10, type="topNList") 
recom_list <- as(recom, "list")

recom_result <- list()
for (i in c(1:10)){
 recom_result[[i]] <- movies[as.integer(recom_list[[i]]),2]
}
library(knitr)
recom_result_df <- as.data.frame(recom_result)
colnames(recom_result_df) <- seq(1,10,1)
kable(recom_result_df)
1 2 3 4 5 6 7 8 9 10
Foxfire (1996) NeverEnding Story III, The (1994) S.F.W. (1994) GLOW: The Story of the Gorgeous Ladies of Wrestling (2012) Great White Hype, The (1996) Instinct (1999) Dead Man Walking (1995) Detour (1945) NA Excess Baggage (1997)
Star Wars: Episode VI - Return of the Jedi (1983) Rules of Engagement (2000) Universal Soldier (1992) Picture Perfect (1997) NeverEnding Story III, The (1994) Escort, The (Scorta, La) (1993) 42 Up (1998) Air Force One (1997) S.F.W. (1994) Orgazmo (1997)
Children of the Corn (1984) 20,000 Leagues Under the Sea (1954) Twelve Monkeys (a.k.a. 12 Monkeys) (1995) Escort, The (Scorta, La) (1993) Snatch (2000) 20,000 Leagues Under the Sea (1954) Mulan (1998) Teenage Mutant Ninja Turtles (1990) NA 20,000 Leagues Under the Sea (1954)
He Said, She Said (1991) Sgt. Bilko (1996) Dead Man Walking (1995) Mrs. Dalloway (1997) Jupiter’s Wife (1994) Phantoms (1998) Star Trek: Generations (1994) Mrs. Dalloway (1997) NA Liar Liar (1997)
Joe Versus the Volcano (1990) Full Metal Jacket (1987) 20,000 Leagues Under the Sea (1954) Good German, The (2006) Wonderland (1999) Bread and Chocolate (Pane e cioccolata) (1973) Police Academy 4: Citizens on Patrol (1987) Usual Suspects, The (1995) Beverly Hillbillies, The (1993) Sgt. Bilko (1996)
First Strike (Police Story 4: First Strike) (Ging chaat goo si 4: Ji gaan daan yam mo) (1996) Dead Man Walking (1995) Rules of Engagement (2000) NA Tarantula (1955) NA Masters of the Universe (1987) Wonderland (1999) Surviving the Game (1994) Marvin’s Room (1996)
Nightmare on Elm Street, A (1984) Orgazmo (1997) Instinct (1999) Defiance (2008) Great Expectations (1998) I Like It Like That (1994) Monster in a Box (1992) 20,000 Leagues Under the Sea (1954) NA Cop Land (1997)
Seabiscuit (2003) Flying Tigers (1942) Negotiator, The (1998) Samouraï, Le (Godson, The) (1967) Flesh & Blood (1985) Police Academy 4: Citizens on Patrol (1987) NA Fog, The (1980) Brice Man, The (Brice de Nice) (2005) Striptease (1996)
Cat People (1982) Not Without My Daughter (1991) Full Metal Jacket (1987) Toy Story (1995) Starman (1984) Dracula (Bram Stoker’s Dracula) (1992) Aliens (1986) Sgt. Bilko (1996) NA Great White Hype, The (1996)
Around the World in 80 Days (1956) Kid in King Arthur’s Court, A (1995) Deconstructing Harry (1997) Woman of Paris, A (1923) Vertical Limit (2000) Wallace & Gromit: The Best of Aardman Animation (1996) In Love and War (1996) 42 Up (1998) Flesh & Blood (1985) Alien: Resurrection (1997)
#Ratings assigned to the movies
recomr <- predict(recommender_model, newdata=test_ratings,  type="ratingMatrix") 
recomr_mat <- as(recomr, "matrix")
recomr_mat[1:5,1:5] #First 5 users and first 5 movies
##               1          2        3         6         7
## [1,]  3.9529464  0.1630435 1.163043 0.1630435 0.1630435
## [2,] -1.0121951  3.4961473 3.465581 3.5417891 3.4409979
## [3,]  0.1527778  4.3228326 4.336484 4.3586883 4.3351892
## [4,]  4.2198792 -0.9250000 3.735146 3.8431901 3.8897291
## [5,]  3.5326733  3.6586487 3.632866 0.3666667 3.6426460
#Evaluating Model with Cross-validation

eval_sch <- evaluationScheme(ratings_relevant, method="cross-validation", k=4, given=10, goodRating=3)
recommender_model <- Recommender(getData(eval_sch,"train"), method = "SVD", param=list(k=10,maxiter=100,normalize="center"))
recom <- predict(recommender_model, newdata=getData(eval_sch,"known"), n=10, type="topNList") 

#Performance index of the whole model
eval_accuracy <- calcPredictionAccuracy(x = recom, data = getData(eval_sch, "unknown"), given=10, goodRating=3, byUser = FALSE)
head(eval_accuracy)
##           TP           FP           FN           TN    precision 
##   3.11458333   6.88541667  71.11458333 344.88541667   0.31145833 
##       recall 
##   0.05435035
#Evaluate recommender model depending on the number of items (movies) recommended for every user (multiples of 5 up to 20)
results <- evaluate(x = eval_sch,method = "SVD",n = seq(0,20,5))
## SVD run fold/sample [model time/prediction time]
##   1  [0.03sec/0.06sec] 
##   2  [0.03sec/0.06sec] 
##   3  [0.03sec/0.1sec] 
##   4  [0sec/0.06sec]
head(getConfusionMatrix(results)[[1]])
##          TP        FP       FN       TN precision     recall        TPR
## 0  0.000000  0.000000 74.22917 351.7708       NaN 0.00000000 0.00000000
## 5  1.812500  3.187500 72.41667 348.5833 0.3625000 0.03783824 0.03783824
## 10 3.114583  6.885417 71.11458 344.8854 0.3114583 0.05435035 0.05435035
## 15 4.604167 10.395833 69.62500 341.3750 0.3069444 0.07446513 0.07446513
## 20 5.927083 14.072917 68.30208 337.6979 0.2963542 0.09441751 0.09441751
##            FPR
## 0  0.000000000
## 5  0.008835174
## 10 0.018980418
## 15 0.028729052
## 20 0.038950878