DATA643_Project3: Matrix Factorization

Yun Mai
June 25, 2017

In this project, matrix factorization will be used build modles for collaborative filtering to predict a user's missing rating for a given movie. Singular Value Decomposition (SVD), SVD Approximation(SVDF), and Alternative Least Squares (ALS) will be used to do matrix factorization or laten semantic indexing. SVD Approximation will decompose the original ranking matrix and keep only first r most significance entities.

The data will be using is the Movielense 100k data set. This data set is compromised of 100,000 records containing user ratings of movies on a scale of 1-5 collected during a seven month period from 9/19/1997-4/22/1998. The data contains 943 users.

install.packages("recommenderlab")
install.packages("recosystem")
install.packages("Metrics")
install.packages("svd")

library(devtools)
install.packages("quanteda") 
install_github(repo = "SVDApproximation", username = "tarashnot")

Load packages

#convert to data to spase matrix
suppressWarnings(suppressMessages(library(Matrix)))
#matrix factorization 
suppressWarnings(suppressMessages(library(recommenderlab)))
suppressWarnings(suppressMessages(library(SVDApproximation)))
suppressWarnings(suppressMessages(library(irlba)))
suppressWarnings(suppressMessages(library(recosystem)))
suppressWarnings(suppressMessages(library(SlopeOne)))
#data cleaning and reconstruction
suppressWarnings(suppressMessages(library(dplyr)))
suppressWarnings(suppressMessages(library(tidyr)))
suppressWarnings(suppressMessages(library(stringr)))
suppressWarnings(suppressMessages(library(knitr)))
suppressWarnings(suppressMessages(library(data.table)))

#linear algebra  here used for rmse calculation
suppressWarnings(suppressMessages(library(Metrics)))

#
suppressWarnings(suppressMessages(library(ggplot2)))

1. Data

data(MovieLense)

** 1.1 View the raw data**

kable(head(MovieLense,n=5))
user_id item_id rating
1 101 Dalmatians (1996) 2
1 12 Angry Men (1957) 5
1 20,000 Leagues Under the Sea (1954) 3
1 2001: A Space Odyssey (1968) 4
1 Abyss, The (1989) 3
# convert ratings data to realRatingMatrix for implement of recommenderlab package

ml <- as(MovieLense,"realRatingMatrix")
head(ml)
## 1 x 1664 rating matrix of class 'realRatingMatrix' with 271 ratings.
#Normalize by subtracting the row mean from all ratings in the row
ml_n <- normalize(ml)

image(ml, main = "Raw Movielense Data")

image(ml_n, main = "Normalized Movielens Data")

1.2 Statistics of ratings data

# use visualize_ratings function from SVDApproximation to visualize statistics for all ratings: item count of different ratings,item histogram of users' average ratings, item histogram of items' average ratings, item histogram of number of rated items by user, item histogram of number of scores items have

#distribution of ratings
rating_frq <- as.data.frame(table(MovieLense$rating))

#distribution of rating mean of users
user_summary <-  as.data.frame(cbind( 'mean'=rowMeans(ml),'number'=rowCounts(ml)))
user_summary <-as.data.frame(sapply(user_summary, function(x) as.numeric(as.character(x))))

#distribution of rating mean of items
item_summary <- as.data.frame(cbind('mean'=colMeans(ml), 'number'=colCounts(ml)))
item_summary <-as.data.frame(sapply(item_summary, function(x) as.numeric(as.character(x))))

ggplot(rating_frq,aes(Var1,Freq)) +   
  geom_bar(aes(fill = Var1), position = "dodge", stat="identity",fill="palegreen")+ labs(x = "Score")

par(mfrow=c(2,2))
ggplot(user_summary,aes(mean)) +
  geom_histogram(binwidth = 0.05,col='white',fill="plum") + labs(x = "User Average Score")+geom_vline(xintercept = mean(user_summary$mean),col='grey',size=2)

ggplot(item_summary,aes(mean)) +
  geom_histogram(binwidth = 0.05,col='white',fill="sandybrown") + labs(x = "Item Average Score")+geom_vline(xintercept = mean(item_summary$mean),col='grey',size=2)

ggplot(user_summary,aes(number)) +
  geom_histogram(binwidth = 0.8,fill="plum") + labs(x = "Number of Rated Items")

ggplot(item_summary,aes(number)) +
  geom_histogram(binwidth = 0.8,fill="sandybrown") + labs(x = "Number of Scores Item has")

From the figures we know: 1.Rating distribution is nealy normal and sightly right skewed. Rating four has the highest frequency. 2.Distribution of user average score is normal. 3.Distribution of item average score is normal. 4.Distribution of number of rated items is strongly right skewed. 5.Distribution of number of item's scores is strongly right skewed.

2.Building and testing the SVD, Funk SVD, and ALS recommendation algorithms

######################### SVD SVDF ALS ############################

#divide the data into 90% training 10% test
div <- evaluationScheme(ml[1:943], method="split", train = 0.9, given = 15, goodRating = 5)
div
## Evaluation scheme with 15 items given
## Method: 'split' with 1 run(s).
## Training set proportion: 0.900
## Good ratings: >=5.000000
## Data set: 943 x 1664 rating matrix of class 'realRatingMatrix' with 99392 ratings.
#Create the recommender based on SVD and SVDF using the training data
r.svd <- Recommender(getData(div, "train"), "SVD")
r.svdf <- Recommender(getData(div, "train"), "SVDF")
r.als <- Recommender(getData(div, "train"), "ALS")

#Compute predicted ratings for test data that is known using the UBCF algorithm
p.svd <- predict(r.svd, getData(div, "known"), type = "ratings")
p.svdf <- predict(r.svdf, getData(div, "known"), type = "ratings")
p.als <- predict(r.als, getData(div, "known"), type = "ratings")
getRatingMatrix(p.svd)[1:6,1:6]
## 6 x 6 sparse Matrix of class "dgCMatrix"
##     'Til There Was You (1997) 1-900 (1994) 101 Dalmatians (1996)
## 10                   3.996941     3.997359              3.999921
## 110                  3.397708     3.400934              3.395493
## 112                  3.797367     3.798788              3.795003
## 116                  2.662335     2.665715              2.661648
## 127                  4.469041     4.467892              4.426330
## 131                  3.801799     3.800601              3.792000
##     12 Angry Men (1957) 187 (1997) 2 Days in the Valley (1996)
## 10             4.005228   3.999858                    4.000702
## 110            3.396908   3.399105                    3.401712
## 112            3.806029   3.797275                    3.802845
## 116            2.673332   2.666178                    2.671881
## 127            4.441403   4.463365                    4.464441
## 131            3.797982   3.797902                    3.803770
getRatingMatrix(p.svdf)[1:6,1:6]
## 6 x 6 sparse Matrix of class "dgCMatrix"
##     'Til There Was You (1997) 1-900 (1994) 101 Dalmatians (1996)
## 10                   3.737997     4.011937              3.777990
## 110                  3.127695     3.443084              3.082330
## 112                  3.338245     3.850501              3.365544
## 116                  2.263905     2.796515              2.264469
## 127                  4.517391     4.495813              4.455854
## 131                  3.863479     3.805763              3.898272
##     12 Angry Men (1957) 187 (1997) 2 Days in the Valley (1996)
## 10             4.360445   3.824796                    3.831574
## 110            3.796420   3.270283                    3.235066
## 112            4.367055   3.510299                    3.599670
## 116            3.295692   2.540556                    2.626836
## 127            4.428249   4.451759                    4.456323
## 131            3.931560   3.784952                    3.648976
getRatingMatrix(p.als)[1:6,1:6]
## 6 x 6 sparse Matrix of class "dgCMatrix"
##     'Til There Was You (1997) 1-900 (1994) 101 Dalmatians (1996)
## 10                   2.121811     2.665332              2.756370
## 110                  2.250776     2.182810              2.777425
## 112                  2.761319     1.672027              3.212948
## 116                  1.689270     2.348751              2.128914
## 127                  2.942794     2.396498              3.953979
## 131                  2.040607     2.347141              2.459893
##     12 Angry Men (1957) 187 (1997) 2 Days in the Valley (1996)
## 10             4.529131   3.394102                    3.170678
## 110            3.713515   2.790870                    2.818951
## 112            4.038139   3.472868                    2.599914
## 116            3.346893   2.158193                    2.398878
## 127            3.601322   2.381760                    3.016424
## 131            4.208693   3.658074                    3.079852
#Calculate the error between training prediction and unknown test data
error <- rbind(
  SVD = calcPredictionAccuracy(p.svd, getData(div, "unknown")),
  SVDF = calcPredictionAccuracy(p.svdf, getData(div, "unknown")),
  ALS = calcPredictionAccuracy(p.als, getData(div, "unknown")))
kable(as.data.frame(error))
RMSE MSE MAE
SVD 1.0610806 1.1258921 0.8422431
SVDF 0.9939672 0.9879708 0.7757971
ALS 1.0267436 1.0542024 0.8009489
ml.cross <- evaluationScheme(ml[1:800], method = "cross", k = 4, given = 3, goodRating = 5)

ml.cross.results.svd <- evaluate(ml.cross, method = "SVD", type = "topNList", n = c(1, 3, 5, 10, 15, 20))
## SVD run fold/sample [model time/prediction time]
##   1  [0.16sec/0.58sec] 
##   2  [0.29sec/0.69sec] 
##   3  [0.14sec/0.71sec] 
##   4  [0.13sec/0.69sec]
ml.cross.results.als <- evaluate(ml.cross, method = "ALS", type = "topNList", n = c(1, 3, 5, 10, 15, 20))
## ALS run fold/sample [model time/prediction time]
##   1  [0sec/42.34sec] 
##   2  [0sec/41.38sec] 
##   3  [0sec/42.17sec] 
##   4  [0sec/42.26sec]
plot(ml.cross.results.svd, annotate = TRUE, main = "ROC Curve for SVD Recommender Method")

plot(ml.cross.results.svd, "prec/rec", annotate = TRUE, main = "Precision-Recall for SVD Recommender Method")

plot(ml.cross.results.als, annotate = TRUE, main = "ROC Curve for ALS Recommender Method")

plot(ml.cross.results.als, "prec/rec", annotate = TRUE, main = "Precision-Recall for ALS Recommender Method")

3. Matrix Factorization by Singular Value Decomposition

Next, recommenderlab package and hand write code will be used to build the recommendation model. Effect of different number of singular values (k or r) on accuracy of prediction will be evaluated.

3.1 Replace the movie name with movie ID

# add numeric movie_ID to the data
m <- as.data.frame(as(ml_n, "matrix"))
a <- as.numeric(as.character(rownames(m)))
m <- cbind('user_id'=a, m)
m_2 <- gather(m,item_id,rating,2:1665)
movie <- data.frame('movieID'=1, 'movie'=m_2[,2])
movie <- as.data.frame(table(movie[,2]))
movie$movieID <- seq(1:nrow(movie))
movie <- movie[,c(3,1)]
colnames(movie)<- c('movieID','title')
rating <- merge(m_2, movie, by.x = "item_id",by.y = "title")
rating <- rating[,c(2,4,3)]
# in rating, movie names were replaced with movie ID

#convert to data.table
as.data.table(rating)
##          user_id movieID rating
##       1:       1       1     NA
##       2:      10       1     NA
##       3:     100       1     NA
##       4:     101       1     NA
##       5:     102       1     NA
##      ---                       
## 1569148:      95    1664     NA
## 1569149:      96    1664     NA
## 1569150:      97    1664     NA
## 1569151:      98    1664     NA
## 1569152:      99    1664     NA
# visualize_ratings(ratings_table = rating) visualize_ratings function from SVDApproximation package did not work for rating but worked well for the package build-in data,ratings. So I will view the data by using ggplot2 as shown in following.

3.2 Build Recommendation Model Base On SVD Method

# view the available registried mathods in recommenderlab
recommenderRegistry$get_entries()
## $ALS_realRatingMatrix
## Recommender method: ALS for realRatingMatrix
## Description: Recommender for explicit ratings based on latent factors, calculated by alternating least squares algorithm.
## Reference: Yunhong Zhou, Dennis Wilkinson, Robert Schreiber, Rong Pan (2008). Large-Scale Parallel Collaborative Filtering for the Netflix Prize, 4th Int'l Conf. Algorithmic Aspects in Information and Management, LNCS 5034.
## Parameters:
##   normalize lambda n_factors n_iterations min_item_nr seed
## 1      NULL    0.1        10           10           1 NULL
## 
## $ALS_implicit_realRatingMatrix
## Recommender method: ALS_implicit for realRatingMatrix
## Description: Recommender for implicit data based on latent factors, calculated by alternating least squares algorithm.
## Reference: Yifan Hu, Yehuda Koren, Chris Volinsky (2008). Collaborative Filtering for Implicit Feedback Datasets, ICDM '08 Proceedings of the 2008 Eighth IEEE International Conference on Data Mining, pages 263-272.
## Parameters:
##   lambda alpha n_factors n_iterations min_item_nr seed
## 1    0.1    10        10           10           1 NULL
## 
## $ALS_implicit_binaryRatingMatrix
## Recommender method: ALS_implicit for binaryRatingMatrix
## Description: Recommender for implicit data based on latent factors, calculated by alternating least squares algorithm.
## Reference: Yifan Hu, Yehuda Koren, Chris Volinsky (2008). Collaborative Filtering for Implicit Feedback Datasets, ICDM '08 Proceedings of the 2008 Eighth IEEE International Conference on Data Mining, pages 263-272.
## Parameters:
##   lambda alpha n_factors n_iterations min_item_nr seed
## 1    0.1    10        10           10           1 NULL
## 
## $AR_binaryRatingMatrix
## Recommender method: AR for binaryRatingMatrix
## Description: Recommender based on association rules.
## Reference: NA
## Parameters:
##   support confidence maxlen sort_measure sort_decreasing apriori_control
## 1     0.1        0.8      3 "confidence"            TRUE          list()
##   verbose
## 1   FALSE
## 
## $IBCF_binaryRatingMatrix
## Recommender method: IBCF for binaryRatingMatrix
## Description: Recommender based on item-based collaborative filtering (binary rating data).
## Reference: NA
## Parameters:
##    k    method normalize_sim_matrix alpha
## 1 30 "Jaccard"                FALSE   0.5
## 
## $IBCF_realRatingMatrix
## Recommender method: IBCF for realRatingMatrix
## Description: Recommender based on item-based collaborative filtering.
## Reference: NA
## Parameters:
##    k   method normalize normalize_sim_matrix alpha na_as_zero
## 1 30 "Cosine"  "center"                FALSE   0.5      FALSE
## 
## $POPULAR_binaryRatingMatrix
## Recommender method: POPULAR for binaryRatingMatrix
## Description: Recommender based on item popularity.
## Reference: NA
## Parameters: None
## 
## $POPULAR_realRatingMatrix
## Recommender method: POPULAR for realRatingMatrix
## Description: Recommender based on item popularity.
## Reference: NA
## Parameters:
##   normalize    aggregationRatings aggregationPopularity
## 1  "center" new("standardGeneric" new("standardGeneric"
## 
## $RANDOM_realRatingMatrix
## Recommender method: RANDOM for realRatingMatrix
## Description: Produce random recommendations (real ratings).
## Reference: NA
## Parameters: None
## 
## $RANDOM_binaryRatingMatrix
## Recommender method: RANDOM for binaryRatingMatrix
## Description: Produce random recommendations (binary ratings).
## Reference: NA
## Parameters: None
## 
## $RERECOMMEND_realRatingMatrix
## Recommender method: RERECOMMEND for realRatingMatrix
## Description: Re-recommends highly rated items (real ratings).
## Reference: NA
## Parameters:
##   randomize minRating
## 1         1        NA
## 
## $SVD_realRatingMatrix
## Recommender method: SVD for realRatingMatrix
## Description: Recommender based on SVD approximation with column-mean imputation.
## Reference: NA
## Parameters:
##    k maxiter normalize
## 1 10     100  "center"
## 
## $SVDF_realRatingMatrix
## Recommender method: SVDF for realRatingMatrix
## Description: Recommender based on Funk SVD with gradient descend.
## Reference: NA
## Parameters:
##    k gamma lambda min_epochs max_epochs min_improvement normalize verbose
## 1 10 0.015  0.001         50        200           1e-06  "center"   FALSE
## 
## $UBCF_binaryRatingMatrix
## Recommender method: UBCF for binaryRatingMatrix
## Description: Recommender based on user-based collaborative filtering.
## Reference: NA
## Parameters:
##      method nn weighted sample
## 1 "jaccard" 25     TRUE  FALSE
## 
## $UBCF_realRatingMatrix
## Recommender method: UBCF for realRatingMatrix
## Description: Recommender based on user-based collaborative filtering.
## Reference: NA
## Parameters:
##     method nn sample normalize
## 1 "cosine" 25  FALSE  "center"
gc() #release memory
##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  1852856  99.0    5684620 303.6  5249944 280.4
## Vcells 19424431 148.2   56417825 430.5 55307755 422.0

From the list, we can see SVD (Recommender based on SVD approximation with column-mean imputation) and SVDF (Recommender based on Funk SVD with gradient descend) are available from recommenderlab package.

3.2.1 Prepare Data

# Convert the data to wide dataframe
rating_c <- as.data.frame(rating)
rating_w <- spread(rating_c,movieID,rating)
rm(rating_c)

# get half of the data because of small RAM 
set.seed(124)
sample1 <- sample.int(n = nrow(rating_w), size = floor(.5*nrow(rating_w)), replace = F)
sample2 <- sample.int(n = ncol(rating_w), size = floor(.5*ncol(rating_w)), replace = F)
rating_2 <- rating_w[sample1,sample2]#
rating_3 <- cbind('user_id'=as.numeric(rownames(rating_2)),rating_2)
rating_2 <- rating_3 # raing_2 is the wide table or user-item matrix in a dataframe format

#convert data back to long table
rating_1 <- gather(rating_3,'movieID','rating',2:833)
rating_1$movieID <- as.numeric(rating_1$movieID) # raing_1 is the long table in a dataframe format
## Warning: NAs introduced by coercion
rm(rating_3)
gc() #release memory
##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  1859660  99.4    5684620 303.6  5249944 280.4
## Vcells 22562579 172.2   56417825 430.5 55702308 425.0
# split the data into train set and test set
# set Seed to make sure the reproducibility of the sampling results
set.seed(125)
sample3 <- sample.int(n = nrow(rating_2), size = floor(.8*nrow(rating_2)), replace = F)
train_2 <- rating_2[sample3,] # train_1 is the wide table in a dataframe format
test_2 <- rating_2[-sample3,] # test_1 is the wide table in a dataframe format
#convert data back to long table
train_1 <- gather(train_2,'movieID','rating',2:833) # train_2 is the long table in a dataframe format
train_1$movieID<-as.numeric(train_1$movieID)
## Warning: NAs introduced by coercion
test_1 <- gather(test_2,'movieID','rating',2:833) # test_2 is the long table in a dataframe format
test_1$movieID<-as.numeric(test_1$movieID)
## Warning: NAs introduced by coercion
gc() #release memory
##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  1861449  99.5    5684620 303.6  5249944 280.4
## Vcells 24132726 184.2   56417825 430.5 55702308 425.0
# find out which user and movie have been allocated into train and test set respectively
user_all <- as.list(unique(rating_2$user_id))
movie_all <- as.list(unique(rating_1$movieID))
user_train <- user_all[sample3]
movie_train <- movie_all[sample3]
user_test <- user_all[-sample3]
movie_test <- movie_all[-sample3]
gc() #release memory
##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  1862830  99.5    5684620 303.6  5249944 280.4
## Vcells 24136687 184.2   56417825 430.5 55702308 425.0
# convert the data to realRatingMatrix format
rating_4 <- as.matrix(rating_2[,-1])
rating_m <- as(rating_4,"realRatingMatrix") # rating_m is the realRatingMatrix format for all data
movie_m <- getRatingMatrix(rating_m)
rm(rating_4)

train_3 <- as.matrix(train_2[,-1])
train_m <- as(train_3,"realRatingMatrix") # train_m is the realRatingMatrix format for train set
rm(train_3)

test_3 <- as.matrix(test_2[,-1])
test_m <- as(test_3,"realRatingMatrix") # train_m is the realRatingMatrix format for train set
rm(test_3)

evaluationScheme(rating_m, method="split", train=0.8, k=1, given=3)
## Evaluation scheme with 3 items given
## Method: 'split' with 1 run(s).
## Training set proportion: 0.800
## Good ratings: NA
## Data set: 471 x 832 rating matrix of class 'realRatingMatrix' with 25944 ratings.
gc() #release memory
##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  1863020  99.5    5684620 303.6  5249944 280.4
## Vcells 24219360 184.8   56417825 430.5 55702308 425.0
evaluationScheme(train_m, method="split", train=0.8, k=1, given=3)
## Evaluation scheme with 3 items given
## Method: 'split' with 1 run(s).
## Training set proportion: 0.800
## Good ratings: NA
## Data set: 376 x 832 rating matrix of class 'realRatingMatrix' with 21010 ratings.
evaluationScheme(test_m, method="split", train=0.8, k=1, given=3)
## Evaluation scheme with 3 items given
## Method: 'split' with 1 run(s).
## Training set proportion: 0.800
## Good ratings: NA
## Data set: 95 x 832 rating matrix of class 'realRatingMatrix' with 4934 ratings.
gc() #release memory
##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  1863025  99.5    5684620 303.6  5249944 280.4
## Vcells 24219408 184.8   56417825 430.5 55702308 425.0

3.2.2 Get SVD

r <- Recommender(rating_m, method = "SVD")
names(getModel(r))
## [1] "description" "svd"         "columnMeans" "k"           "maxiter"    
## [6] "normalize"   "verbose"
imt <- getModel(r)$svd
names(imt)
## [1] "d"     "u"     "v"     "iter"  "mprod"
print(paste("the number of latent features (k) is ",getModel(r)$k))
## [1] "the number of latent features (k) is  10"
r_train <- Recommender(train_m, method = "SVD")
imt_train <- getModel(r_train)$svd
print(paste("the number of latent features (k) for train set is ",getModel(r_train)$k))
## [1] "the number of latent features (k) for train set is  10"
r_test <- Recommender(test_m, method = "SVD")
imt_test <- getModel(r_test)$svd
print(paste("the number of latent features (k) for test setis ",getModel(r_test)$k))
## [1] "the number of latent features (k) for test setis  10"
gc() #release memory
##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  1862993  99.5    5684620 303.6  5249944 280.4
## Vcells 25515356 194.7   56417825 430.5 55702308 425.0

3.2.3 Making Predictions From the Decomposed Matrices

Obtain SVD matrices from SVD model.By multiply U, ??, and VT back to get the rank k=10 approximation of original rating matrix.

u_sigma <- imt$u %*% diag(imt$d)
predict_rating_2 <- u_sigma %*% t(imt$v) # predict_rating_2 is the user-item version for predtion rating for all data

u_sigma_train <- imt_train$u %*% diag(imt_train$d)
predict_train_2 <- u_sigma_train %*% t(imt_train$v) # predict_train_2 is the user-item version for predtion rating for train set

predict_test_2 <- predict(r_train, test_m, verbose = TRUE,type="ratingMatrix") # predict_test_2 is the user-item version for predtion rating for test set
gc() #release memory
##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  1862930  99.5    5684620 303.6  5249944 280.4
## Vcells 26348014 201.1   56417825 430.5 55702308 425.0
#u_sigma_test <- imt_test$u %*% diag(imt_test$d)
#predict_test <- u_sigma_test %*% t(imt_test$v)
# set the memory the process needs
memory.limit(size=40000)
## [1] 40000
# convert user-item matrix to long data frame
predict_rating <- as.data.frame(predict_rating_2)
predict_rating <- cbind('user_id'= unlist(user_all),predict_rating)
colnames(predict_rating)<- c('user_id',unlist(movie_all))
predict_all <- gather(predict_rating,movieID, rating,2:833)
predict_all$movieID <- as.numeric(predict_all$movieID)
rm(predict_rating)
gc() #release memory
##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  1863240  99.6    5684620 303.6  5249944 280.4
## Vcells 27525629 210.1   56417825 430.5 55702308 425.0
kable(head(predict_all),caption = "All Prediction")
user_id movieID rating
79 1466 -6.336172
386 1466 -6.355482
485 1466 -6.349115
374 1466 -6.295749
210 1466 -6.322168
275 1466 -6.356418
predict_train <- as.data.frame(predict_train_2)
predict_train <- cbind('user_id'= unlist(user_train),predict_train)
colnames(predict_train)<- c('user_id',unlist(movie_all))
predict_train_1 <- gather(predict_train,movieID, rating,2:833)
rm(predict_train)
gc() #release memory
##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  1863440  99.6    5684620 303.6  5249944 280.4
## Vcells 28464362 217.2   56417825 430.5 55702308 425.0
kable(head(predict_train_1),caption = "Prediction For Train set")
user_id movieID rating
898 1466 -6.252160
221 1466 -6.196706
259 1466 -6.277859
236 1466 -6.236991
495 1466 -6.216564
738 1466 -6.365704
predict_test <- as.data.frame(as.matrix(getRatingMatrix(predict_test_2)))
predict_test <- cbind('user_id'= unlist(user_test),predict_test)
colnames(predict_test)<- c('user_id',unlist(movie_all))
predict_test_1 <- gather(predict_test,movieID, rating,2:833)
rm(predict_test)
gc() #release memory
##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  1863513  99.6    5684620 303.6  5249944 280.4
## Vcells 28701637 219.0   56417825 430.5 55702308 425.0
kable(head(predict_test_1),caption = "Prediction For Test set")
user_id movieID rating
210 1466 3.179707
262 1466 3.080304
721 1466 8.026959
799 1466 79.715527
381 1466 5.202305
690 1466 12.232795

3.2.4 Making Recommendations

Based on the laten index, we know the predicted ratings to all movies for every user. The movie with the high predicted ratings that a user hasn't rated will be returned.

Predition for all data

# find the moive user have not rated yet
predict_all[,c('movieID','rating')] <- lapply(predict_all[,c('movieID','rating')], function(x) as.numeric(x))
old_rated <- rating_1[!(is.na(rating_1$rating)),]
not_rated <- anti_join(predict_all, old_rated, by = c("user_id","movieID"))

# find the movie with high predicted ratings to recommend to users
rec_movie <- not_rated %>% 
  group_by(user_id) %>%
  arrange(desc(rating)) %>%
  filter(row_number() <= 5L) 
kable(head(rec_movie, n=6))
user_id movieID rating
888 1396 8.495888
800 84 5.731576
800 553 5.478352
879 553 4.945058
879 84 4.756719
765 553 4.736794
# use rmse function of Metrics to calculate RMSE for evaluation of accuracy of predictions
new_rating <- semi_join(predict_all, old_rated, by = c("user_id","movieID"))

rmse_1 <-rmse(old_rated$rating,new_rating$rating)
print(paste("RMSE of prediction for all rating is",round(rmse_1,2)))
## [1] "RMSE of prediction for all rating is 11.06"

Predition for train set

# find the moive user have not rated yet
predict_train_1[,c('movieID','rating')] <- lapply(predict_train_1[,c('movieID','rating')], function(x) as.numeric(x))
old_rated <- train_1[!(is.na(train_1$rating)),]
not_rated <- anti_join(predict_train_1, old_rated, by = c("user_id","movieID"))

# find the movie with high predicted ratings to recommend to users
rec_movie <- not_rated %>% 
  group_by(user_id) %>%
  arrange(desc(rating)) %>%
  filter(row_number() <= 5L) 
kable(head(rec_movie, n=6),caption = "Recommendation for train set")
user_id movieID rating
857 581 10.060336
328 1286 4.325148
888 1396 3.453218
879 553 2.908616
765 553 2.064947
858 1268 2.001262
# use rmse function of Metrics to calculate RMSE for evaluation of accuracy of predictions
new_rating <- semi_join(predict_train_1, old_rated, by = c("user_id","movieID"))

rmse_2 <-rmse(old_rated$rating,new_rating$rating)
print(paste("RMSE of prediction for train set is",round(rmse_2,2)))
## [1] "RMSE of prediction for train set is 10.92"

Predition for test set

# find the moive user have not rated yet
predict_test_1[,c('movieID','rating')] <- lapply(predict_test_1[,c('movieID','rating')], function(x) as.numeric(x))
old_rated <- test_1[!(is.na(test_1$rating)),]
not_rated <- anti_join(predict_test_1, old_rated, by = c("user_id","movieID"))

# find the movie with high predicted ratings to recommend to users
rec_movie <- not_rated %>% 
  group_by(user_id) %>%
  arrange(desc(rating)) %>%
  filter(row_number() <= 5L) 
kable(head(rec_movie, n=6),caption = "Recommendation for test set")
user_id movieID rating
895 94 120.21248
895 1323 119.61836
895 91 119.07299
895 1228 118.96159
895 941 118.91728
672 412 99.43448
# use rmse function of Metrics to calculate RMSE for evaluation of accuracy of predictions
new_rating <- semi_join(predict_test_1, old_rated, by = c("user_id","movieID"))

rmse_3 <-rmse(old_rated$rating,new_rating$rating)
print(paste("RMSE of prediction for test set is",round(rmse_3,2)))
## [1] "RMSE of prediction for test set is 16.3"

3.3. Build Recommendation Model Base On Funk SVD Method

3.3.1 Prepare Data

# Convert the data to wide dataframe
rating_c <- as.data.frame(rating)
rating_w <- spread(rating_c,movieID,rating)
rm(rating_c)

# get half of the data because of small RAM 
set.seed(124)
sample1 <- sample.int(n = nrow(rating_w), size = floor(.5*nrow(rating_w)), replace = F)
sample2 <- sample.int(n = ncol(rating_w), size = floor(.5*ncol(rating_w)), replace = F)
rating_2 <- rating_w[sample1,sample2]#
rating_3 <- cbind('user_id'=as.numeric(rownames(rating_2)),rating_2)
rating_2 <- rating_3 # raing_2 is the wide table or user-item matrix in a dataframe format

#convert data back to long table
rating_1 <- gather(rating_3,'movieID','rating',2:833)
rating_1$movieID <- as.numeric(rating_1$movieID) # raing_1 is the long table in a dataframe format
## Warning: NAs introduced by coercion
rm(rating_3)
gc() #release memory
##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  1867309  99.8    5684620 303.6  5249944 280.4
## Vcells 28965032 221.0   56417825 430.5 56071063 427.8
# split the data into train set and test set
# set Seed to make sure the reproducibility of the sampling results
set.seed(125)
sample3 <- sample.int(n = nrow(rating_2), size = floor(.8*nrow(rating_2)), replace = F)
train_2 <- rating_2[sample3,] # train_1 is the wide table in a dataframe format
test_2 <- rating_2[-sample3,] # test_1 is the wide table in a dataframe format
#convert data back to long table
train_1 <- gather(train_2,'movieID','rating',2:833) # train_2 is the long table in a dataframe format
train_1$movieID<-as.numeric(train_1$movieID)
## Warning: NAs introduced by coercion
test_1 <- gather(test_2,'movieID','rating',2:833) # test_2 is the long table in a dataframe format
test_1$movieID<-as.numeric(test_1$movieID)
## Warning: NAs introduced by coercion
gc() #release memory
##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  1867391  99.8    5684620 303.6  5249944 280.4
## Vcells 28964276 221.0   56417825 430.5 56071063 427.8
# find out which user and movie have been allocated into train and test set respectively
user_all <- as.list(unique(rating_2$user_id))
movie_all <- as.list(unique(rating_1$movieID))
user_train <- user_all[sample3]
movie_train <- movie_all[sample3]
user_test <- user_all[-sample3]
movie_test <- movie_all[-sample3]
gc() #release memory
##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  1867441  99.8    5684620 303.6  5249944 280.4
## Vcells 28964325 221.0   56417825 430.5 56071063 427.8
# convert the data to realRatingMatrix format
rating_4 <- as.matrix(rating_2[,-1])
rating_m <- as(rating_4,"realRatingMatrix") # rating_m is the realRatingMatrix format for all data
movie_m <- getRatingMatrix(rating_m)
rm(rating_4)

train_3 <- as.matrix(train_2[,-1])
train_m <- as(train_3,"realRatingMatrix") # train_m is the realRatingMatrix format for train set
rm(train_3)

test_3 <- as.matrix(test_2[,-1])
test_m <- as(test_3,"realRatingMatrix") # train_m is the realRatingMatrix format for train set
rm(test_3)

evaluationScheme(rating_m, method="split", train=0.8, k=1, given=3)
## Evaluation scheme with 3 items given
## Method: 'split' with 1 run(s).
## Training set proportion: 0.800
## Good ratings: NA
## Data set: 471 x 832 rating matrix of class 'realRatingMatrix' with 25944 ratings.
gc() #release memory
##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  1867565  99.8    5684620 303.6  5249944 280.4
## Vcells 28965410 221.0   56417825 430.5 56071063 427.8
evaluationScheme(train_m, method="split", train=0.8, k=1, given=3)
## Evaluation scheme with 3 items given
## Method: 'split' with 1 run(s).
## Training set proportion: 0.800
## Good ratings: NA
## Data set: 376 x 832 rating matrix of class 'realRatingMatrix' with 21010 ratings.
evaluationScheme(test_m, method="split", train=0.8, k=1, given=3)
## Evaluation scheme with 3 items given
## Method: 'split' with 1 run(s).
## Training set proportion: 0.800
## Good ratings: NA
## Data set: 95 x 832 rating matrix of class 'realRatingMatrix' with 4934 ratings.
gc() #release memory
##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  1867568  99.8    5684620 303.6  5249944 280.4
## Vcells 28965458 221.0   56417825 430.5 56071063 427.8

3.3.2 Get SVD

r <- Recommender(rating_m, method = "SVDF")
names(getModel(r))
##  [1] "description"     "svd"             "k"              
##  [4] "gamma"           "lambda"          "min_epochs"     
##  [7] "max_epochs"      "min_improvement" "normalize"      
## [10] "verbose"
imt <- getModel(r)$svd
names(imt)
## [1] "U"          "V"          "parameters"
print(paste("the number of latent features (k) is ",getModel(r)$k))
## [1] "the number of latent features (k) is  10"
r_train <- Recommender(train_m, method = "SVDF")
imt_train <- getModel(r_train)$svd
print(paste("the number of latent features (k) for train set is ",getModel(r_train)$k))
## [1] "the number of latent features (k) for train set is  10"
#r_test <- Recommender(test_m, method = "SVDF")
#imt_test <- getModel(r_test)$svd
#print(paste("the number of latent features (k) for test setis ",getModel(r_test)$k))
gc() #release memory
##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  1866654  99.7    5684620 303.6  5249944 280.4
## Vcells 27899158 212.9   56417825 430.5 56417825 430.5

3.3.3 Making Predictions From the Decomposed Matrices

Obtain SVD matrices from SVD model.By multiply U, ??, and VT back to get the rank k=10 approximation of original rating matrix.

predict_rating_2 <- imt$U %*% t(imt$V) # predict_rating_2 is the user-item version for predtion rating for all data

predict_train_2 <- imt_train$U %*% t(imt_train$V) # predict_train_2 is the user-item version for predtion rating for train set

predict_test_2 <- predict(r_train, test_m, verbose = TRUE,type="ratingMatrix") # predict_test_2 is the user-item version for predtion rating for test set
gc() #release memory
##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  1866534  99.7    5684620 303.6  5684620 303.6
## Vcells 27787596 212.1   56417825 430.5 56417825 430.5
# set the memory the process needs
memory.limit(size=40000)
## [1] 40000
# convert user-item matrix to long data frame
predict_rating <- as.data.frame(predict_rating_2)
predict_rating <- cbind('user_id'= unlist(user_all),predict_rating)
colnames(predict_rating)<- c('user_id',unlist(movie_all))
predict_all <- gather(predict_rating,movieID, rating,2:833)
predict_all$movieID <- as.numeric(predict_all$movieID)
rm(predict_rating)
gc() #release memory
##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  1866857  99.8    5684620 303.6  5684620 303.6
## Vcells 27789849 212.1   56417825 430.5 56417825 430.5
kable(head(predict_all),caption = "All Prediction")
user_id movieID rating
79 1466 NaN
386 1466 NaN
485 1466 NaN
374 1466 NaN
210 1466 NaN
275 1466 NaN
predict_train <- as.data.frame(predict_train_2)
predict_train <- cbind('user_id'= unlist(user_train),predict_train)
colnames(predict_train)<- c('user_id',unlist(movie_all))
predict_train_1 <- gather(predict_train,movieID, rating,2:833)
rm(predict_train)
gc() #release memory
##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  1866908  99.8    5684620 303.6  5684620 303.6
## Vcells 27789933 212.1   56417825 430.5 56417825 430.5
kable(head(predict_train_1),caption = "Prediction For Test set")
user_id movieID rating
898 1466 NaN
221 1466 NaN
259 1466 NaN
236 1466 NaN
495 1466 NaN
738 1466 NaN
predict_test <- as.data.frame(as.matrix(getRatingMatrix(predict_test_2)))
predict_test <- cbind('user_id'= unlist(user_test),predict_test)
colnames(predict_test)<- c('user_id',unlist(movie_all))
predict_test_1 <- gather(predict_test,movieID, rating,2:833)
rm(predict_test)
gc() #release memory
##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  1866963  99.8    5684620 303.6  5684620 303.6
## Vcells 27790027 212.1   56417825 430.5 56417825 430.5
kable(head(predict_test_1),caption = "Prediction For Test set")
user_id movieID rating
210 1466 0
262 1466 0
721 1466 0
799 1466 0
381 1466 0
690 1466 0

3.3.4 Making Recommendations

Based on the laten index, we know the predicted ratings to all movies for every user. The movie with the high predicted ratings that a user hasn't rated will be returned.

Predition for all data

# find the moive user have not rated yet
predict_all[,c('movieID','rating')] <- lapply(predict_all[,c('movieID','rating')], function(x) as.numeric(x))
old_rated <- rating_1[!(is.na(rating_1$rating)),]
not_rated <- anti_join(predict_all, old_rated, by = c("user_id","movieID"))

# find the movie with high predicted ratings to recommend to users
rec_movie <- not_rated %>% 
  group_by(user_id) %>%
  arrange(desc(rating)) %>%
  filter(row_number() <= 5L) 
kable(head(rec_movie, n=6))
user_id movieID rating
747 1110 NaN
843 691 NaN
835 1402 NaN
92 1101 NaN
110 1381 NaN
261 818 NaN
# use rmse function of Metrics to calculate RMSE for evaluation of accuracy of predictions
new_rating <- semi_join(predict_all, old_rated, by = c("user_id","movieID"))

rmse_1 <-rmse(old_rated$rating,new_rating$rating)
print(paste("RMSE of prediction for all rating is",round(rmse_1,2)))
## [1] "RMSE of prediction for all rating is NaN"

Predition for train set

# find the moive user have not rated yet
predict_train_1[,c('movieID','rating')] <- lapply(predict_train_1[,c('movieID','rating')], function(x) as.numeric(x))
old_rated <- train_1[!(is.na(train_1$rating)),]
not_rated <- anti_join(predict_train_1, old_rated, by = c("user_id","movieID"))

# find the movie with high predicted ratings to recommend to users
rec_movie <- not_rated %>% 
  group_by(user_id) %>%
  arrange(desc(rating)) %>%
  filter(row_number() <= 5L) 
kable(head(rec_movie, n=6),caption = "Recommendation for train set")
user_id movieID rating
673 894 NaN
292 677 NaN
841 399 NaN
791 1606 NaN
795 1084 NaN
596 337 NaN
# use rmse function of Metrics to calculate RMSE for evaluation of accuracy of predictions
new_rating <- semi_join(predict_train_1, old_rated, by = c("user_id","movieID"))

rmse_2 <-rmse(old_rated$rating,new_rating$rating)
print(paste("RMSE of prediction for train set is",round(rmse_2,2)))
## [1] "RMSE of prediction for train set is NaN"

Predition for test set

# find the moive user have not rated yet
predict_test_1[,c('movieID','rating')] <- lapply(predict_test_1[,c('movieID','rating')], function(x) as.numeric(x))
old_rated <- test_1[!(is.na(test_1$rating)),]
not_rated <- anti_join(predict_test_1, old_rated, by = c("user_id","movieID"))

# find the movie with high predicted ratings to recommend to users
rec_movie <- not_rated %>% 
  group_by(user_id) %>%
  arrange(desc(rating)) %>%
  filter(row_number() <= 5L) 
kable(head(rec_movie, n=6),caption = "Recommendation for test set")
user_id movieID rating
257 126 0
262 961 0
272 184 0
767 1157 0
771 1403 0
690 7 0
# use rmse function of Metrics to calculate RMSE for evaluation of accuracy of predictions
new_rating <- semi_join(predict_test_1, old_rated, by = c("user_id","movieID"))

rmse_3 <-rmse(old_rated$rating,new_rating$rating)
print(paste("RMSE of prediction for test set is",round(rmse_3,2)))
## [1] "RMSE of prediction for test set is 16.3"

In terms of RMSE, SVD based matrix decomposition performed better when the model was built on the normalized ratings instead of buidt on the original rating(data not shown).

Funk SVD is not appropreiate as it generate none predction.

4.Find the optimized number of latent features (r or k) by minimizing the Root Mean Square Error (using irlba package to try different r/k)**

#The irlba function only computes the number of singular values corresponding to the maximum of the desired singular vectors, max(nu, nv).
# max(nu, nv) must be strictly less than min(nrow(dtm), ncol(dtm)), so first see the 
max_nunv <-min(nrow(movie_m), ncol(movie_m))
print(paste("Number of left singular vectors to estimate, nu, and number of right, nv, should be less than",max_nunv,"."))
## [1] "Number of left singular vectors to estimate, nu, and number of right, nv, should be less than 471 ."
# try several r to see which one give the best prediction
SVD_5=irlba(getRatingMatrix(rating_m),nu=5,nv=5)
SVD_10=irlba(getRatingMatrix(rating_m),nu=10,nv=10)
SVD_20=irlba(getRatingMatrix(rating_m),nu=20,nv=20)
SVD_50=irlba(getRatingMatrix(rating_m),nu=50,nv=50)
SVD_100=irlba(getRatingMatrix(rating_m),nu=100,nv=100)
plot(SVD_10$d, main="Largest singular values (r=10)")

r=10

u_sigma <- SVD_10$u %*% diag(SVD_10$d)
predict_rating <- u_sigma %*% t(SVD_10$v)

# convert user-item matrix to long data frame
predict_rating <- as.data.frame(predict_rating)
predict_rating <- cbind('user_id'= unlist(user_all),predict_rating)
colnames(predict_rating)<- c('user_id',unlist(movie_all))
predict <- gather(predict_rating,movieID, rating,2:833)

# find the moive user have not rated yet
predict[,c('movieID','rating')] <- lapply(predict[,c('movieID','rating')], function(x) as.numeric(x))
old_rated <- rating_1[!(is.na(rating_1$rating)),]
not_rated <- anti_join(predict, old_rated, by = c("user_id","movieID"))

# find the movie with high predicted ratings to recommend to users
rec_movie <- not_rated %>% 
  group_by(user_id) %>%
  arrange(desc(rating)) %>%
  filter(row_number() <= 5L) 
kable(head(rec_movie, n=6),caption = "Recommendation for all rating")
user_id movieID rating
374 1502 1.761509
21 1502 1.598378
374 34 1.573657
174 1319 1.403222
533 572 1.091957
389 1283 1.067422
# use rmse function of Metrics to calculate RMSE for evaluation of accuracy of predictions
new_rating <- semi_join(predict, old_rated, by = c("user_id","movieID"))

rmse_4 <-rmse(old_rated$rating,new_rating$rating)
print(paste("RMSE of prediction for all rating when r=10 is",round(rmse_4,2)))
## [1] "RMSE of prediction for all rating when r=10 is 0.86"

The RMSE derived from SVD_10 calculated by irlba pacakge is different from that calclated by recommenderlab. irlba use truncated SVD.

Several number of singular values(5, 20, 50, and 100) will be compared with r=10.

r=5

u_sigma <- SVD_5$u %*% diag(SVD_5$d)
predict_rating <- u_sigma %*% t(SVD_5$v)

# convert user-item matrix to long data frame
predict_rating <- as.data.frame(predict_rating)
predict_rating <- cbind('user_id'= unlist(user_all),predict_rating)
colnames(predict_rating)<- c('user_id',unlist(movie_all))
predict <- gather(predict_rating,movieID, rating,2:833)

# find the moive user have not rated yet
predict[,c('movieID','rating')] <- lapply(predict[,c('movieID','rating')], function(x) as.numeric(x))
old_rated <- rating_1[!(is.na(rating_1$rating)),]
not_rated <- anti_join(predict, old_rated, by = c("user_id","movieID"))

# find the movie with high predicted ratings to recommend to users
rec_movie <- not_rated %>% 
  group_by(user_id) %>%
  arrange(desc(rating)) %>%
  filter(row_number() <= 5L) 
kable(head(rec_movie, n=6),caption = "Recommendation for all rating")
user_id movieID rating
174 1319 1.733024
174 1283 1.513814
846 500 1.348276
833 391 1.250866
532 1319 1.229991
374 1502 1.187603
# use rmse function of Metrics to calculate RMSE for evaluation of accuracy of predictions
new_rating <- semi_join(predict, old_rated, by = c("user_id","movieID"))

rmse_5 <-rmse(old_rated$rating,new_rating$rating)
print(paste("RMSE of prediction for all rating when r=5 is",round(rmse_5,2)))
## [1] "RMSE of prediction for all rating when r=5 is 0.92"

r=20

u_sigma <- SVD_20$u %*% diag(SVD_20$d)
predict_rating <- u_sigma %*% t(SVD_20$v)

# convert user-item matrix to long data frame
predict_rating <- as.data.frame(predict_rating)
predict_rating <- cbind('user_id'= unlist(user_all),predict_rating)
colnames(predict_rating)<- c('user_id',unlist(movie_all))
predict <- gather(predict_rating,movieID, rating,2:833)

# find the moive user have not rated yet
predict[,c('movieID','rating')] <- lapply(predict[,c('movieID','rating')], function(x) as.numeric(x))
old_rated <- rating_1[!(is.na(rating_1$rating)),]
not_rated <- anti_join(predict, old_rated, by = c("user_id","movieID"))

# find the movie with high predicted ratings to recommend to users
rec_movie <- not_rated %>% 
  group_by(user_id) %>%
  arrange(desc(rating)) %>%
  filter(row_number() <= 5L) 
kable(head(rec_movie, n=6),caption = "Recommendation for all rating")
user_id movieID rating
561 306 1.300612
437 391 1.289824
21 1502 1.207223
561 615 1.198636
561 1286 1.193565
881 34 1.143125
# use rmse function of Metrics to calculate RMSE for evaluation of accuracy of predictions
new_rating <- semi_join(predict, old_rated, by = c("user_id","movieID"))

rmse_6 <-rmse(old_rated$rating,new_rating$rating)
print(paste("RMSE of prediction for all rating when r=20 is",round(rmse_6,2)))
## [1] "RMSE of prediction for all rating when r=20 is 0.78"

r=50

u_sigma <- SVD_50$u %*% diag(SVD_50$d)
predict_rating <- u_sigma %*% t(SVD_50$v)

# convert user-item matrix to long data frame
predict_rating <- as.data.frame(predict_rating)
predict_rating <- cbind('user_id'= unlist(user_all),predict_rating)
colnames(predict_rating)<- c('user_id',unlist(movie_all))
predict <- gather(predict_rating,movieID, rating,2:833)

# find the moive user have not rated yet
predict[,c('movieID','rating')] <- lapply(predict[,c('movieID','rating')], function(x) as.numeric(x))
old_rated <- rating_1[!(is.na(rating_1$rating)),]
not_rated <- anti_join(predict, old_rated, by = c("user_id","movieID"))

# find the movie with high predicted ratings to recommend to users
rec_movie <- not_rated %>% 
  group_by(user_id) %>%
  arrange(desc(rating)) %>%
  filter(row_number() <= 5L) 
kable(head(rec_movie, n=6),caption = "Recommendation for all rating")
user_id movieID rating
451 45 1.0361451
617 435 1.0183612
474 416 1.0006906
664 1458 1.0006452
21 45 0.9610552
533 572 0.9530407
# use rmse function of Metrics to calculate RMSE for evaluation of accuracy of predictions
new_rating <- semi_join(predict, old_rated, by = c("user_id","movieID"))

rmse_7 <-rmse(old_rated$rating,new_rating$rating)
print(paste("RMSE of prediction for all rating when r=50 is",round(rmse_7,2)))
## [1] "RMSE of prediction for all rating when r=50 is 0.58"

r=100

u_sigma <- SVD_100$u %*% diag(SVD_100$d)
predict_rating <- u_sigma %*% t(SVD_100$v)

# convert user-item matrix to long data frame
predict_rating <- as.data.frame(predict_rating)
predict_rating <- cbind('user_id'= unlist(user_all),predict_rating)
colnames(predict_rating)<- c('user_id',unlist(movie_all))
predict <- gather(predict_rating,movieID, rating,2:833)

# find the moive user have not rated yet
predict[,c('movieID','rating')] <- lapply(predict[,c('movieID','rating')], function(x) as.numeric(x))
old_rated <- rating_1[!(is.na(rating_1$rating)),]
not_rated <- anti_join(predict, old_rated, by = c("user_id","movieID"))

# find the movie with high predicted ratings to recommend to users
rec_movie <- not_rated %>% 
  group_by(user_id) %>%
  arrange(desc(rating)) %>%
  filter(row_number() <= 5L) 
kable(head(rec_movie, n=6),caption = "Recommendation for all rating")
user_id movieID rating
102 401 0.7731909
63 885 0.7143455
934 615 0.6763654
448 1022 0.6555326
618 1393 0.6495313
167 696 0.6441336
# use rmse function of Metrics to calculate RMSE for evaluation of accuracy of predictions
new_rating <- semi_join(predict, old_rated, by = c("user_id","movieID"))

rmse_8 <-rmse(old_rated$rating,new_rating$rating)
print(paste("RMSE of prediction for all rating when r=100 is",round(rmse_8,2)))
## [1] "RMSE of prediction for all rating when r=100 is 0.37"
kable(RMSE <- data.frame('r'=c(5,10,20,50,100),'RMSE'=c(rmse_5,rmse_4,rmse_6,rmse_7,rmse_8)))
r RMSE
5 0.9169962
10 0.8628613
20 0.7763696
50 0.5848391
100 0.3744922
ggplot(RMSE,aes(r,RMSE))+
  geom_line(col='slateblue2',size=1)+geom_point(col='slateblue2',size=2)+labs(x="Number of Singular Values",y="RMSE")

Conlcuion: The RMSE decreased as the number of singular values increase, suggesting that SVD approximation get closer to real rating as we increased the latent features.

Acknowledge

Thank you Kelly Shaffer's discussion for the part 1(Building and testing the SVD, Funk SVD, and ALS recommendation algorithms)

Reference:

1.Bregt Verreet, The alternating least squares algorithm in recommenderlab. https://beckernick.github.io/matrix-factorization-recommender/

2.Nick Becker, Matrix Factorization for Movie Recommendations in Python http://www.infofarm.be/articles/alternating-least-squares-algorithm-recommenderlab