DATA643_Project3: Matrix Factorization

Yun Mai
June 25, 2017

In this project, matrix factorization will be used build modles for collaborative filtering to predict a user's missing rating for a given movie. Singular Value Decomposition (SVD), SVD Approximation(SVDF), and Alternative Least Squares (ALS) will be used to do matrix factorization or laten semantic indexing. SVD Approximation will decompose the original ranking matrix and keep only first r most significance entities.

The data will be using is the Movielense 100k data set. This data set is compromised of 100,000 records containing user ratings of movies on a scale of 1-5 collected during a seven month period from 9/19/1997-4/22/1998. The data contains 943 users.

install.packages("recommenderlab")
install.packages("recosystem")
install.packages("Metrics")
install.packages("svd")

library(devtools)
install.packages("quanteda") 
install_github(repo = "SVDApproximation", username = "tarashnot")

Load packages

#convert to data to spase matrix
suppressWarnings(suppressMessages(library(Matrix)))
#matrix factorization 
suppressWarnings(suppressMessages(library(recommenderlab)))
suppressWarnings(suppressMessages(library(SVDApproximation)))
suppressWarnings(suppressMessages(library(irlba)))
suppressWarnings(suppressMessages(library(recosystem)))
suppressWarnings(suppressMessages(library(SlopeOne)))
#data cleaning and reconstruction
suppressWarnings(suppressMessages(library(dplyr)))
suppressWarnings(suppressMessages(library(tidyr)))
suppressWarnings(suppressMessages(library(stringr)))
suppressWarnings(suppressMessages(library(knitr)))
suppressWarnings(suppressMessages(library(data.table)))

#linear algebra  here used for rmse calculation
suppressWarnings(suppressMessages(library(Metrics)))

#
suppressWarnings(suppressMessages(library(ggplot2)))

1. Data

data(MovieLense)

** 1.1 View the raw data**

kable(head(MovieLense,n=5))

user_id	item_id	rating
1	101 Dalmatians (1996)	2
1	12 Angry Men (1957)	5
1	20,000 Leagues Under the Sea (1954)	3
1	2001: A Space Odyssey (1968)	4
1	Abyss, The (1989)	3

# convert ratings data to realRatingMatrix for implement of recommenderlab package

ml <- as(MovieLense,"realRatingMatrix")
head(ml)

## 1 x 1664 rating matrix of class 'realRatingMatrix' with 271 ratings.

#Normalize by subtracting the row mean from all ratings in the row
ml_n <- normalize(ml)

image(ml, main = "Raw Movielense Data")

image(ml_n, main = "Normalized Movielens Data")

1.2 Statistics of ratings data

# use visualize_ratings function from SVDApproximation to visualize statistics for all ratings: item count of different ratings,item histogram of users' average ratings, item histogram of items' average ratings, item histogram of number of rated items by user, item histogram of number of scores items have

#distribution of ratings
rating_frq <- as.data.frame(table(MovieLense$rating))

#distribution of rating mean of users
user_summary <-  as.data.frame(cbind( 'mean'=rowMeans(ml),'number'=rowCounts(ml)))
user_summary <-as.data.frame(sapply(user_summary, function(x) as.numeric(as.character(x))))

#distribution of rating mean of items
item_summary <- as.data.frame(cbind('mean'=colMeans(ml), 'number'=colCounts(ml)))
item_summary <-as.data.frame(sapply(item_summary, function(x) as.numeric(as.character(x))))

ggplot(rating_frq,aes(Var1,Freq)) +   
  geom_bar(aes(fill = Var1), position = "dodge", stat="identity",fill="palegreen")+ labs(x = "Score")

par(mfrow=c(2,2))
ggplot(user_summary,aes(mean)) +
  geom_histogram(binwidth = 0.05,col='white',fill="plum") + labs(x = "User Average Score")+geom_vline(xintercept = mean(user_summary$mean),col='grey',size=2)

ggplot(item_summary,aes(mean)) +
  geom_histogram(binwidth = 0.05,col='white',fill="sandybrown") + labs(x = "Item Average Score")+geom_vline(xintercept = mean(item_summary$mean),col='grey',size=2)

ggplot(user_summary,aes(number)) +
  geom_histogram(binwidth = 0.8,fill="plum") + labs(x = "Number of Rated Items")

ggplot(item_summary,aes(number)) +
  geom_histogram(binwidth = 0.8,fill="sandybrown") + labs(x = "Number of Scores Item has")

From the figures we know: 1.Rating distribution is nealy normal and sightly right skewed. Rating four has the highest frequency. 2.Distribution of user average score is normal. 3.Distribution of item average score is normal. 4.Distribution of number of rated items is strongly right skewed. 5.Distribution of number of item's scores is strongly right skewed.

2.Building and testing the SVD, Funk SVD, and ALS recommendation algorithms

######################### SVD SVDF ALS ############################

#divide the data into 90% training 10% test
div <- evaluationScheme(ml[1:943], method="split", train = 0.9, given = 15, goodRating = 5)
div

## Evaluation scheme with 15 items given
## Method: 'split' with 1 run(s).
## Training set proportion: 0.900
## Good ratings: >=5.000000
## Data set: 943 x 1664 rating matrix of class 'realRatingMatrix' with 99392 ratings.

#Create the recommender based on SVD and SVDF using the training data
r.svd <- Recommender(getData(div, "train"), "SVD")
r.svdf <- Recommender(getData(div, "train"), "SVDF")
r.als <- Recommender(getData(div, "train"), "ALS")

#Compute predicted ratings for test data that is known using the UBCF algorithm
p.svd <- predict(r.svd, getData(div, "known"), type = "ratings")
p.svdf <- predict(r.svdf, getData(div, "known"), type = "ratings")
p.als <- predict(r.als, getData(div, "known"), type = "ratings")
getRatingMatrix(p.svd)[1:6,1:6]

## 6 x 6 sparse Matrix of class "dgCMatrix"
##     'Til There Was You (1997) 1-900 (1994) 101 Dalmatians (1996)
## 10                   3.996941     3.997359              3.999921
## 110                  3.397708     3.400934              3.395493
## 112                  3.797367     3.798788              3.795003
## 116                  2.662335     2.665715              2.661648
## 127                  4.469041     4.467892              4.426330
## 131                  3.801799     3.800601              3.792000
##     12 Angry Men (1957) 187 (1997) 2 Days in the Valley (1996)
## 10             4.005228   3.999858                    4.000702
## 110            3.396908   3.399105                    3.401712
## 112            3.806029   3.797275                    3.802845
## 116            2.673332   2.666178                    2.671881
## 127            4.441403   4.463365                    4.464441
## 131            3.797982   3.797902                    3.803770

getRatingMatrix(p.svdf)[1:6,1:6]

## 6 x 6 sparse Matrix of class "dgCMatrix"
##     'Til There Was You (1997) 1-900 (1994) 101 Dalmatians (1996)
## 10                   3.737997     4.011937              3.777990
## 110                  3.127695     3.443084              3.082330
## 112                  3.338245     3.850501              3.365544
## 116                  2.263905     2.796515              2.264469
## 127                  4.517391     4.495813              4.455854
## 131                  3.863479     3.805763              3.898272
##     12 Angry Men (1957) 187 (1997) 2 Days in the Valley (1996)
## 10             4.360445   3.824796                    3.831574
## 110            3.796420   3.270283                    3.235066
## 112            4.367055   3.510299                    3.599670
## 116            3.295692   2.540556                    2.626836
## 127            4.428249   4.451759                    4.456323
## 131            3.931560   3.784952                    3.648976

getRatingMatrix(p.als)[1:6,1:6]

## 6 x 6 sparse Matrix of class "dgCMatrix"
##     'Til There Was You (1997) 1-900 (1994) 101 Dalmatians (1996)
## 10                   2.121811     2.665332              2.756370
## 110                  2.250776     2.182810              2.777425
## 112                  2.761319     1.672027              3.212948
## 116                  1.689270     2.348751              2.128914
## 127                  2.942794     2.396498              3.953979
## 131                  2.040607     2.347141              2.459893
##     12 Angry Men (1957) 187 (1997) 2 Days in the Valley (1996)
## 10             4.529131   3.394102                    3.170678
## 110            3.713515   2.790870                    2.818951
## 112            4.038139   3.472868                    2.599914
## 116            3.346893   2.158193                    2.398878
## 127            3.601322   2.381760                    3.016424
## 131            4.208693   3.658074                    3.079852

#Calculate the error between training prediction and unknown test data
error <- rbind(
  SVD = calcPredictionAccuracy(p.svd, getData(div, "unknown")),
  SVDF = calcPredictionAccuracy(p.svdf, getData(div, "unknown")),
  ALS = calcPredictionAccuracy(p.als, getData(div, "unknown")))
kable(as.data.frame(error))

	RMSE	MSE	MAE
SVD	1.0610806	1.1258921	0.8422431
SVDF	0.9939672	0.9879708	0.7757971
ALS	1.0267436	1.0542024	0.8009489

ml.cross <- evaluationScheme(ml[1:800], method = "cross", k = 4, given = 3, goodRating = 5)

ml.cross.results.svd <- evaluate(ml.cross, method = "SVD", type = "topNList", n = c(1, 3, 5, 10, 15, 20))

## SVD run fold/sample [model time/prediction time]
##   1  [0.16sec/0.58sec] 
##   2  [0.29sec/0.69sec] 
##   3  [0.14sec/0.71sec] 
##   4  [0.13sec/0.69sec]

ml.cross.results.als <- evaluate(ml.cross, method = "ALS", type = "topNList", n = c(1, 3, 5, 10, 15, 20))

## ALS run fold/sample [model time/prediction time]
##   1  [0sec/42.34sec] 
##   2  [0sec/41.38sec] 
##   3  [0sec/42.17sec] 
##   4  [0sec/42.26sec]

plot(ml.cross.results.svd, annotate = TRUE, main = "ROC Curve for SVD Recommender Method")

plot(ml.cross.results.svd, "prec/rec", annotate = TRUE, main = "Precision-Recall for SVD Recommender Method")

plot(ml.cross.results.als, annotate = TRUE, main = "ROC Curve for ALS Recommender Method")

plot(ml.cross.results.als, "prec/rec", annotate = TRUE, main = "Precision-Recall for ALS Recommender Method")

3. Matrix Factorization by Singular Value Decomposition

Next, recommenderlab package and hand write code will be used to build the recommendation model. Effect of different number of singular values (k or r) on accuracy of prediction will be evaluated.

3.1 Replace the movie name with movie ID

# add numeric movie_ID to the data
m <- as.data.frame(as(ml_n, "matrix"))
a <- as.numeric(as.character(rownames(m)))
m <- cbind('user_id'=a, m)
m_2 <- gather(m,item_id,rating,2:1665)
movie <- data.frame('movieID'=1, 'movie'=m_2[,2])
movie <- as.data.frame(table(movie[,2]))
movie$movieID <- seq(1:nrow(movie))
movie <- movie[,c(3,1)]
colnames(movie)<- c('movieID','title')
rating <- merge(m_2, movie, by.x = "item_id",by.y = "title")
rating <- rating[,c(2,4,3)]
# in rating, movie names were replaced with movie ID

#convert to data.table
as.data.table(rating)

##          user_id movieID rating
##       1:       1       1     NA
##       2:      10       1     NA
##       3:     100       1     NA
##       4:     101       1     NA
##       5:     102       1     NA
##      ---                       
## 1569148:      95    1664     NA
## 1569149:      96    1664     NA
## 1569150:      97    1664     NA
## 1569151:      98    1664     NA
## 1569152:      99    1664     NA

# visualize_ratings(ratings_table = rating) visualize_ratings function from SVDApproximation package did not work for rating but worked well for the package build-in data,ratings. So I will view the data by using ggplot2 as shown in following.

3.2 Build Recommendation Model Base On SVD Method

# view the available registried mathods in recommenderlab
recommenderRegistry$get_entries()

## $ALS_realRatingMatrix
## Recommender method: ALS for realRatingMatrix
## Description: Recommender for explicit ratings based on latent factors, calculated by alternating least squares algorithm.
## Reference: Yunhong Zhou, Dennis Wilkinson, Robert Schreiber, Rong Pan (2008). Large-Scale Parallel Collaborative Filtering for the Netflix Prize, 4th Int'l Conf. Algorithmic Aspects in Information and Management, LNCS 5034.
## Parameters:
##   normalize lambda n_factors n_iterations min_item_nr seed
## 1      NULL    0.1        10           10           1 NULL
## 
## $ALS_implicit_realRatingMatrix
## Recommender method: ALS_implicit for realRatingMatrix
## Description: Recommender for implicit data based on latent factors, calculated by alternating least squares algorithm.
## Reference: Yifan Hu, Yehuda Koren, Chris Volinsky (2008). Collaborative Filtering for Implicit Feedback Datasets, ICDM '08 Proceedings of the 2008 Eighth IEEE International Conference on Data Mining, pages 263-272.
## Parameters:
##   lambda alpha n_factors n_iterations min_item_nr seed
## 1    0.1    10        10           10           1 NULL
## 
## $ALS_implicit_binaryRatingMatrix
## Recommender method: ALS_implicit for binaryRatingMatrix
## Description: Recommender for implicit data based on latent factors, calculated by alternating least squares algorithm.
## Reference: Yifan Hu, Yehuda Koren, Chris Volinsky (2008). Collaborative Filtering for Implicit Feedback Datasets, ICDM '08 Proceedings of the 2008 Eighth IEEE International Conference on Data Mining, pages 263-272.
## Parameters:
##   lambda alpha n_factors n_iterations min_item_nr seed
## 1    0.1    10        10           10           1 NULL
## 
## $AR_binaryRatingMatrix
## Recommender method: AR for binaryRatingMatrix
## Description: Recommender based on association rules.
## Reference: NA
## Parameters:
##   support confidence maxlen sort_measure sort_decreasing apriori_control
## 1     0.1        0.8      3 "confidence"            TRUE          list()
##   verbose
## 1   FALSE
## 
## $IBCF_binaryRatingMatrix
## Recommender method: IBCF for binaryRatingMatrix
## Description: Recommender based on item-based collaborative filtering (binary rating data).
## Reference: NA
## Parameters:
##    k    method normalize_sim_matrix alpha
## 1 30 "Jaccard"                FALSE   0.5
## 
## $IBCF_realRatingMatrix
## Recommender method: IBCF for realRatingMatrix
## Description: Recommender based on item-based collaborative filtering.
## Reference: NA
## Parameters:
##    k   method normalize normalize_sim_matrix alpha na_as_zero
## 1 30 "Cosine"  "center"                FALSE   0.5      FALSE
## 
## $POPULAR_binaryRatingMatrix
## Recommender method: POPULAR for binaryRatingMatrix
## Description: Recommender based on item popularity.
## Reference: NA
## Parameters: None
## 
## $POPULAR_realRatingMatrix
## Recommender method: POPULAR for realRatingMatrix
## Description: Recommender based on item popularity.
## Reference: NA
## Parameters:
##   normalize    aggregationRatings aggregationPopularity
## 1  "center" new("standardGeneric" new("standardGeneric"
## 
## $RANDOM_realRatingMatrix
## Recommender method: RANDOM for realRatingMatrix
## Description: Produce random recommendations (real ratings).
## Reference: NA
## Parameters: None
## 
## $RANDOM_binaryRatingMatrix
## Recommender method: RANDOM for binaryRatingMatrix
## Description: Produce random recommendations (binary ratings).
## Reference: NA
## Parameters: None
## 
## $RERECOMMEND_realRatingMatrix
## Recommender method: RERECOMMEND for realRatingMatrix
## Description: Re-recommends highly rated items (real ratings).
## Reference: NA
## Parameters:
##   randomize minRating
## 1         1        NA
## 
## $SVD_realRatingMatrix
## Recommender method: SVD for realRatingMatrix
## Description: Recommender based on SVD approximation with column-mean imputation.
## Reference: NA
## Parameters:
##    k maxiter normalize
## 1 10     100  "center"
## 
## $SVDF_realRatingMatrix
## Recommender method: SVDF for realRatingMatrix
## Description: Recommender based on Funk SVD with gradient descend.
## Reference: NA
## Parameters:
##    k gamma lambda min_epochs max_epochs min_improvement normalize verbose
## 1 10 0.015  0.001         50        200           1e-06  "center"   FALSE
## 
## $UBCF_binaryRatingMatrix
## Recommender method: UBCF for binaryRatingMatrix
## Description: Recommender based on user-based collaborative filtering.
## Reference: NA
## Parameters:
##      method nn weighted sample
## 1 "jaccard" 25     TRUE  FALSE
## 
## $UBCF_realRatingMatrix
## Recommender method: UBCF for realRatingMatrix
## Description: Recommender based on user-based collaborative filtering.
## Reference: NA
## Parameters:
##     method nn sample normalize
## 1 "cosine" 25  FALSE  "center"

gc() #release memory

##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  1852856  99.0    5684620 303.6  5249944 280.4
## Vcells 19424431 148.2   56417825 430.5 55307755 422.0

From the list, we can see SVD (Recommender based on SVD approximation with column-mean imputation) and SVDF (Recommender based on Funk SVD with gradient descend) are available from recommenderlab package.

3.2.1 Prepare Data

# Convert the data to wide dataframe
rating_c <- as.data.frame(rating)
rating_w <- spread(rating_c,movieID,rating)
rm(rating_c)

# get half of the data because of small RAM 
set.seed(124)
sample1 <- sample.int(n = nrow(rating_w), size = floor(.5*nrow(rating_w)), replace = F)
sample2 <- sample.int(n = ncol(rating_w), size = floor(.5*ncol(rating_w)), replace = F)
rating_2 <- rating_w[sample1,sample2]#
rating_3 <- cbind('user_id'=as.numeric(rownames(rating_2)),rating_2)
rating_2 <- rating_3 # raing_2 is the wide table or user-item matrix in a dataframe format

#convert data back to long table
rating_1 <- gather(rating_3,'movieID','rating',2:833)
rating_1$movieID <- as.numeric(rating_1$movieID) # raing_1 is the long table in a dataframe format

## Warning: NAs introduced by coercion

rm(rating_3)
gc() #release memory

##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  1859660  99.4    5684620 303.6  5249944 280.4
## Vcells 22562579 172.2   56417825 430.5 55702308 425.0

# split the data into train set and test set
# set Seed to make sure the reproducibility of the sampling results
set.seed(125)
sample3 <- sample.int(n = nrow(rating_2), size = floor(.8*nrow(rating_2)), replace = F)
train_2 <- rating_2[sample3,] # train_1 is the wide table in a dataframe format
test_2 <- rating_2[-sample3,] # test_1 is the wide table in a dataframe format
#convert data back to long table
train_1 <- gather(train_2,'movieID','rating',2:833) # train_2 is the long table in a dataframe format
train_1$movieID<-as.numeric(train_1$movieID)

## Warning: NAs introduced by coercion

test_1 <- gather(test_2,'movieID','rating',2:833) # test_2 is the long table in a dataframe format
test_1$movieID<-as.numeric(test_1$movieID)

## Warning: NAs introduced by coercion

gc() #release memory

##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  1861449  99.5    5684620 303.6  5249944 280.4
## Vcells 24132726 184.2   56417825 430.5 55702308 425.0

# find out which user and movie have been allocated into train and test set respectively
user_all <- as.list(unique(rating_2$user_id))
movie_all <- as.list(unique(rating_1$movieID))
user_train <- user_all[sample3]
movie_train <- movie_all[sample3]
user_test <- user_all[-sample3]
movie_test <- movie_all[-sample3]
gc() #release memory

##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  1862830  99.5    5684620 303.6  5249944 280.4
## Vcells 24136687 184.2   56417825 430.5 55702308 425.0

# convert the data to realRatingMatrix format
rating_4 <- as.matrix(rating_2[,-1])
rating_m <- as(rating_4,"realRatingMatrix") # rating_m is the realRatingMatrix format for all data
movie_m <- getRatingMatrix(rating_m)
rm(rating_4)

train_3 <- as.matrix(train_2[,-1])
train_m <- as(train_3,"realRatingMatrix") # train_m is the realRatingMatrix format for train set
rm(train_3)

test_3 <- as.matrix(test_2[,-1])
test_m <- as(test_3,"realRatingMatrix") # train_m is the realRatingMatrix format for train set
rm(test_3)

evaluationScheme(rating_m, method="split", train=0.8, k=1, given=3)

## Evaluation scheme with 3 items given
## Method: 'split' with 1 run(s).
## Training set proportion: 0.800
## Good ratings: NA
## Data set: 471 x 832 rating matrix of class 'realRatingMatrix' with 25944 ratings.

gc() #release memory

##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  1863020  99.5    5684620 303.6  5249944 280.4
## Vcells 24219360 184.8   56417825 430.5 55702308 425.0

evaluationScheme(train_m, method="split", train=0.8, k=1, given=3)

## Evaluation scheme with 3 items given
## Method: 'split' with 1 run(s).
## Training set proportion: 0.800
## Good ratings: NA
## Data set: 376 x 832 rating matrix of class 'realRatingMatrix' with 21010 ratings.

evaluationScheme(test_m, method="split", train=0.8, k=1, given=3)

## Evaluation scheme with 3 items given
## Method: 'split' with 1 run(s).
## Training set proportion: 0.800
## Good ratings: NA
## Data set: 95 x 832 rating matrix of class 'realRatingMatrix' with 4934 ratings.

gc() #release memory

##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  1863025  99.5    5684620 303.6  5249944 280.4
## Vcells 24219408 184.8   56417825 430.5 55702308 425.0

3.2.2 Get SVD

r <- Recommender(rating_m, method = "SVD")
names(getModel(r))

## [1] "description" "svd"         "columnMeans" "k"           "maxiter"    
## [6] "normalize"   "verbose"

imt <- getModel(r)$svd
names(imt)

## [1] "d"     "u"     "v"     "iter"  "mprod"

print(paste("the number of latent features (k) is ",getModel(r)$k))

## [1] "the number of latent features (k) is  10"

r_train <- Recommender(train_m, method = "SVD")
imt_train <- getModel(r_train)$svd
print(paste("the number of latent features (k) for train set is ",getModel(r_train)$k))

## [1] "the number of latent features (k) for train set is  10"

r_test <- Recommender(test_m, method = "SVD")
imt_test <- getModel(r_test)$svd
print(paste("the number of latent features (k) for test setis ",getModel(r_test)$k))

## [1] "the number of latent features (k) for test setis  10"

gc() #release memory

##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  1862993  99.5    5684620 303.6  5249944 280.4
## Vcells 25515356 194.7   56417825 430.5 55702308 425.0

3.2.3 Making Predictions From the Decomposed Matrices

Obtain SVD matrices from SVD model.By multiply U, ??, and V^T back to get the rank k=10 approximation of original rating matrix.

u_sigma <- imt$u %*% diag(imt$d)
predict_rating_2 <- u_sigma %*% t(imt$v) # predict_rating_2 is the user-item version for predtion rating for all data

u_sigma_train <- imt_train$u %*% diag(imt_train$d)
predict_train_2 <- u_sigma_train %*% t(imt_train$v) # predict_train_2 is the user-item version for predtion rating for train set

predict_test_2 <- predict(r_train, test_m, verbose = TRUE,type="ratingMatrix") # predict_test_2 is the user-item version for predtion rating for test set
gc() #release memory

##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  1862930  99.5    5684620 303.6  5249944 280.4
## Vcells 26348014 201.1   56417825 430.5 55702308 425.0

#u_sigma_test <- imt_test$u %*% diag(imt_test$d)
#predict_test <- u_sigma_test %*% t(imt_test$v)

# set the memory the process needs
memory.limit(size=40000)

## [1] 40000

# convert user-item matrix to long data frame
predict_rating <- as.data.frame(predict_rating_2)
predict_rating <- cbind('user_id'= unlist(user_all),predict_rating)
colnames(predict_rating)<- c('user_id',unlist(movie_all))
predict_all <- gather(predict_rating,movieID, rating,2:833)
predict_all$movieID <- as.numeric(predict_all$movieID)
rm(predict_rating)
gc() #release memory

##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  1863240  99.6    5684620 303.6  5249944 280.4
## Vcells 27525629 210.1   56417825 430.5 55702308 425.0

kable(head(predict_all),caption = "All Prediction")

user_id	movieID	rating
79	1466	-6.336172
386	1466	-6.355482
485	1466	-6.349115
374	1466	-6.295749
210	1466	-6.322168
275	1466	-6.356418

predict_train <- as.data.frame(predict_train_2)
predict_train <- cbind('user_id'= unlist(user_train),predict_train)
colnames(predict_train)<- c('user_id',unlist(movie_all))
predict_train_1 <- gather(predict_train,movieID, rating,2:833)
rm(predict_train)
gc() #release memory

##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  1863440  99.6    5684620 303.6  5249944 280.4
## Vcells 28464362 217.2   56417825 430.5 55702308 425.0

kable(head(predict_train_1),caption = "Prediction For Train set")

user_id	movieID	rating
898	1466	-6.252160
221	1466	-6.196706
259	1466	-6.277859
236	1466	-6.236991
495	1466	-6.216564
738	1466	-6.365704

predict_test <- as.data.frame(as.matrix(getRatingMatrix(predict_test_2)))
predict_test <- cbind('user_id'= unlist(user_test),predict_test)
colnames(predict_test)<- c('user_id',unlist(movie_all))
predict_test_1 <- gather(predict_test,movieID, rating,2:833)
rm(predict_test)
gc() #release memory

##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  1863513  99.6    5684620 303.6  5249944 280.4
## Vcells 28701637 219.0   56417825 430.5 55702308 425.0

kable(head(predict_test_1),caption = "Prediction For Test set")

user_id	movieID	rating
210	1466	3.179707
262	1466	3.080304
721	1466	8.026959
799	1466	79.715527
381	1466	5.202305
690	1466	12.232795

3.2.4 Making Recommendations

Based on the laten index, we know the predicted ratings to all movies for every user. The movie with the high predicted ratings that a user hasn't rated will be returned.

Predition for all data

# find the moive user have not rated yet
predict_all[,c('movieID','rating')] <- lapply(predict_all[,c('movieID','rating')], function(x) as.numeric(x))
old_rated <- rating_1[!(is.na(rating_1$rating)),]
not_rated <- anti_join(predict_all, old_rated, by = c("user_id","movieID"))

# find the movie with high predicted ratings to recommend to users
rec_movie <- not_rated %>% 
  group_by(user_id) %>%
  arrange(desc(rating)) %>%
  filter(row_number() <= 5L) 
kable(head(rec_movie, n=6))

user_id	movieID	rating
888	1396	8.495888
800	84	5.731576
800	553	5.478352
879	553	4.945058
879	84	4.756719
765	553	4.736794

# use rmse function of Metrics to calculate RMSE for evaluation of accuracy of predictions
new_rating <- semi_join(predict_all, old_rated, by = c("user_id","movieID"))

rmse_1 <-rmse(old_rated$rating,new_rating$rating)
print(paste("RMSE of prediction for all rating is",round(rmse_1,2)))

## [1] "RMSE of prediction for all rating is 11.06"

Predition for train set

# find the moive user have not rated yet
predict_train_1[,c('movieID','rating')] <- lapply(predict_train_1[,c('movieID','rating')], function(x) as.numeric(x))
old_rated <- train_1[!(is.na(train_1$rating)),]
not_rated <- anti_join(predict_train_1, old_rated, by = c("user_id","movieID"))

# find the movie with high predicted ratings to recommend to users
rec_movie <- not_rated %>% 
  group_by(user_id) %>%
  arrange(desc(rating)) %>%
  filter(row_number() <= 5L) 
kable(head(rec_movie, n=6),caption = "Recommendation for train set")

user_id	movieID	rating
857	581	10.060336
328	1286	4.325148
888	1396	3.453218
879	553	2.908616
765	553	2.064947
858	1268	2.001262

# use rmse function of Metrics to calculate RMSE for evaluation of accuracy of predictions
new_rating <- semi_join(predict_train_1, old_rated, by = c("user_id","movieID"))

rmse_2 <-rmse(old_rated$rating,new_rating$rating)
print(paste("RMSE of prediction for train set is",round(rmse_2,2)))

## [1] "RMSE of prediction for train set is 10.92"

Predition for test set

# find the moive user have not rated yet
predict_test_1[,c('movieID','rating')] <- lapply(predict_test_1[,c('movieID','rating')], function(x) as.numeric(x))
old_rated <- test_1[!(is.na(test_1$rating)),]
not_rated <- anti_join(predict_test_1, old_rated, by = c("user_id","movieID"))

# find the movie with high predicted ratings to recommend to users
rec_movie <- not_rated %>% 
  group_by(user_id) %>%
  arrange(desc(rating)) %>%
  filter(row_number() <= 5L) 
kable(head(rec_movie, n=6),caption = "Recommendation for test set")

user_id	movieID	rating
895	94	120.21248
895	1323	119.61836
895	91	119.07299
895	1228	118.96159
895	941	118.91728
672	412	99.43448

# use rmse function of Metrics to calculate RMSE for evaluation of accuracy of predictions
new_rating <- semi_join(predict_test_1, old_rated, by = c("user_id","movieID"))

rmse_3 <-rmse(old_rated$rating,new_rating$rating)
print(paste("RMSE of prediction for test set is",round(rmse_3,2)))

## [1] "RMSE of prediction for test set is 16.3"

3.3. Build Recommendation Model Base On Funk SVD Method

3.3.1 Prepare Data

# Convert the data to wide dataframe
rating_c <- as.data.frame(rating)
rating_w <- spread(rating_c,movieID,rating)
rm(rating_c)

# get half of the data because of small RAM 
set.seed(124)
sample1 <- sample.int(n = nrow(rating_w), size = floor(.5*nrow(rating_w)), replace = F)
sample2 <- sample.int(n = ncol(rating_w), size = floor(.5*ncol(rating_w)), replace = F)
rating_2 <- rating_w[sample1,sample2]#
rating_3 <- cbind('user_id'=as.numeric(rownames(rating_2)),rating_2)
rating_2 <- rating_3 # raing_2 is the wide table or user-item matrix in a dataframe format

#convert data back to long table
rating_1 <- gather(rating_3,'movieID','rating',2:833)
rating_1$movieID <- as.numeric(rating_1$movieID) # raing_1 is the long table in a dataframe format

## Warning: NAs introduced by coercion

rm(rating_3)
gc() #release memory

##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  1867309  99.8    5684620 303.6  5249944 280.4
## Vcells 28965032 221.0   56417825 430.5 56071063 427.8

# split the data into train set and test set
# set Seed to make sure the reproducibility of the sampling results
set.seed(125)
sample3 <- sample.int(n = nrow(rating_2), size = floor(.8*nrow(rating_2)), replace = F)
train_2 <- rating_2[sample3,] # train_1 is the wide table in a dataframe format
test_2 <- rating_2[-sample3,] # test_1 is the wide table in a dataframe format
#convert data back to long table
train_1 <- gather(train_2,'movieID','rating',2:833) # train_2 is the long table in a dataframe format
train_1$movieID<-as.numeric(train_1$movieID)

## Warning: NAs introduced by coercion

test_1 <- gather(test_2,'movieID','rating',2:833) # test_2 is the long table in a dataframe format
test_1$movieID<-as.numeric(test_1$movieID)

## Warning: NAs introduced by coercion

gc() #release memory

##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  1867391  99.8    5684620 303.6  5249944 280.4
## Vcells 28964276 221.0   56417825 430.5 56071063 427.8

# find out which user and movie have been allocated into train and test set respectively
user_all <- as.list(unique(rating_2$user_id))
movie_all <- as.list(unique(rating_1$movieID))
user_train <- user_all[sample3]
movie_train <- movie_all[sample3]
user_test <- user_all[-sample3]
movie_test <- movie_all[-sample3]
gc() #release memory

##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  1867441  99.8    5684620 303.6  5249944 280.4
## Vcells 28964325 221.0   56417825 430.5 56071063 427.8

# convert the data to realRatingMatrix format
rating_4 <- as.matrix(rating_2[,-1])
rating_m <- as(rating_4,"realRatingMatrix") # rating_m is the realRatingMatrix format for all data
movie_m <- getRatingMatrix(rating_m)
rm(rating_4)

train_3 <- as.matrix(train_2[,-1])
train_m <- as(train_3,"realRatingMatrix") # train_m is the realRatingMatrix format for train set
rm(train_3)

test_3 <- as.matrix(test_2[,-1])
test_m <- as(test_3,"realRatingMatrix") # train_m is the realRatingMatrix format for train set
rm(test_3)

evaluationScheme(rating_m, method="split", train=0.8, k=1, given=3)

## Evaluation scheme with 3 items given
## Method: 'split' with 1 run(s).
## Training set proportion: 0.800
## Good ratings: NA
## Data set: 471 x 832 rating matrix of class 'realRatingMatrix' with 25944 ratings.

gc() #release memory

##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  1867565  99.8    5684620 303.6  5249944 280.4
## Vcells 28965410 221.0   56417825 430.5 56071063 427.8

evaluationScheme(train_m, method="split", train=0.8, k=1, given=3)

## Evaluation scheme with 3 items given
## Method: 'split' with 1 run(s).
## Training set proportion: 0.800
## Good ratings: NA
## Data set: 376 x 832 rating matrix of class 'realRatingMatrix' with 21010 ratings.

evaluationScheme(test_m, method="split", train=0.8, k=1, given=3)

## Evaluation scheme with 3 items given
## Method: 'split' with 1 run(s).
## Training set proportion: 0.800
## Good ratings: NA
## Data set: 95 x 832 rating matrix of class 'realRatingMatrix' with 4934 ratings.

gc() #release memory

##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  1867568  99.8    5684620 303.6  5249944 280.4
## Vcells 28965458 221.0   56417825 430.5 56071063 427.8

3.3.2 Get SVD

r <- Recommender(rating_m, method = "SVDF")
names(getModel(r))

##  [1] "description"     "svd"             "k"              
##  [4] "gamma"           "lambda"          "min_epochs"     
##  [7] "max_epochs"      "min_improvement" "normalize"      
## [10] "verbose"

imt <- getModel(r)$svd
names(imt)

## [1] "U"          "V"          "parameters"

print(paste("the number of latent features (k) is ",getModel(r)$k))

## [1] "the number of latent features (k) is  10"

r_train <- Recommender(train_m, method = "SVDF")
imt_train <- getModel(r_train)$svd
print(paste("the number of latent features (k) for train set is ",getModel(r_train)$k))

## [1] "the number of latent features (k) for train set is  10"

#r_test <- Recommender(test_m, method = "SVDF")
#imt_test <- getModel(r_test)$svd
#print(paste("the number of latent features (k) for test setis ",getModel(r_test)$k))
gc() #release memory

##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  1866654  99.7    5684620 303.6  5249944 280.4
## Vcells 27899158 212.9   56417825 430.5 56417825 430.5

3.3.3 Making Predictions From the Decomposed Matrices

Obtain SVD matrices from SVD model.By multiply U, ??, and V^T back to get the rank k=10 approximation of original rating matrix.

predict_rating_2 <- imt$U %*% t(imt$V) # predict_rating_2 is the user-item version for predtion rating for all data

predict_train_2 <- imt_train$U %*% t(imt_train$V) # predict_train_2 is the user-item version for predtion rating for train set

predict_test_2 <- predict(r_train, test_m, verbose = TRUE,type="ratingMatrix") # predict_test_2 is the user-item version for predtion rating for test set
gc() #release memory

##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  1866534  99.7    5684620 303.6  5684620 303.6
## Vcells 27787596 212.1   56417825 430.5 56417825 430.5

# set the memory the process needs
memory.limit(size=40000)

## [1] 40000

# convert user-item matrix to long data frame
predict_rating <- as.data.frame(predict_rating_2)
predict_rating <- cbind('user_id'= unlist(user_all),predict_rating)
colnames(predict_rating)<- c('user_id',unlist(movie_all))
predict_all <- gather(predict_rating,movieID, rating,2:833)
predict_all$movieID <- as.numeric(predict_all$movieID)
rm(predict_rating)
gc() #release memory

##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  1866857  99.8    5684620 303.6  5684620 303.6
## Vcells 27789849 212.1   56417825 430.5 56417825 430.5

kable(head(predict_all),caption = "All Prediction")

user_id	movieID	rating
79	1466	NaN
386	1466	NaN
485	1466	NaN
374	1466	NaN
210	1466	NaN
275	1466	NaN

predict_train <- as.data.frame(predict_train_2)
predict_train <- cbind('user_id'= unlist(user_train),predict_train)
colnames(predict_train)<- c('user_id',unlist(movie_all))
predict_train_1 <- gather(predict_train,movieID, rating,2:833)
rm(predict_train)
gc() #release memory

##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  1866908  99.8    5684620 303.6  5684620 303.6
## Vcells 27789933 212.1   56417825 430.5 56417825 430.5

kable(head(predict_train_1),caption = "Prediction For Test set")

user_id	movieID	rating
898	1466	NaN
221	1466	NaN
259	1466	NaN
236	1466	NaN
495	1466	NaN
738	1466	NaN

predict_test <- as.data.frame(as.matrix(getRatingMatrix(predict_test_2)))
predict_test <- cbind('user_id'= unlist(user_test),predict_test)
colnames(predict_test)<- c('user_id',unlist(movie_all))
predict_test_1 <- gather(predict_test,movieID, rating,2:833)
rm(predict_test)
gc() #release memory

##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  1866963  99.8    5684620 303.6  5684620 303.6
## Vcells 27790027 212.1   56417825 430.5 56417825 430.5

kable(head(predict_test_1),caption = "Prediction For Test set")

user_id	movieID	rating
210	1466	0
262	1466	0
721	1466	0
799	1466	0
381	1466	0
690	1466	0

3.3.4 Making Recommendations

Based on the laten index, we know the predicted ratings to all movies for every user. The movie with the high predicted ratings that a user hasn't rated will be returned.

Predition for all data

# find the moive user have not rated yet
predict_all[,c('movieID','rating')] <- lapply(predict_all[,c('movieID','rating')], function(x) as.numeric(x))
old_rated <- rating_1[!(is.na(rating_1$rating)),]
not_rated <- anti_join(predict_all, old_rated, by = c("user_id","movieID"))

# find the movie with high predicted ratings to recommend to users
rec_movie <- not_rated %>% 
  group_by(user_id) %>%
  arrange(desc(rating)) %>%
  filter(row_number() <= 5L) 
kable(head(rec_movie, n=6))

user_id	movieID	rating
747	1110	NaN
843	691	NaN
835	1402	NaN
92	1101	NaN
110	1381	NaN
261	818	NaN

# use rmse function of Metrics to calculate RMSE for evaluation of accuracy of predictions
new_rating <- semi_join(predict_all, old_rated, by = c("user_id","movieID"))

rmse_1 <-rmse(old_rated$rating,new_rating$rating)
print(paste("RMSE of prediction for all rating is",round(rmse_1,2)))

## [1] "RMSE of prediction for all rating is NaN"

Predition for train set

# find the moive user have not rated yet
predict_train_1[,c('movieID','rating')] <- lapply(predict_train_1[,c('movieID','rating')], function(x) as.numeric(x))
old_rated <- train_1[!(is.na(train_1$rating)),]
not_rated <- anti_join(predict_train_1, old_rated, by = c("user_id","movieID"))

# find the movie with high predicted ratings to recommend to users
rec_movie <- not_rated %>% 
  group_by(user_id) %>%
  arrange(desc(rating)) %>%
  filter(row_number() <= 5L) 
kable(head(rec_movie, n=6),caption = "Recommendation for train set")

user_id	movieID	rating
673	894	NaN
292	677	NaN
841	399	NaN
791	1606	NaN
795	1084	NaN
596	337	NaN

# use rmse function of Metrics to calculate RMSE for evaluation of accuracy of predictions
new_rating <- semi_join(predict_train_1, old_rated, by = c("user_id","movieID"))

rmse_2 <-rmse(old_rated$rating,new_rating$rating)
print(paste("RMSE of prediction for train set is",round(rmse_2,2)))

## [1] "RMSE of prediction for train set is NaN"

Predition for test set

# find the moive user have not rated yet
predict_test_1[,c('movieID','rating')] <- lapply(predict_test_1[,c('movieID','rating')], function(x) as.numeric(x))
old_rated <- test_1[!(is.na(test_1$rating)),]
not_rated <- anti_join(predict_test_1, old_rated, by = c("user_id","movieID"))

# find the movie with high predicted ratings to recommend to users
rec_movie <- not_rated %>% 
  group_by(user_id) %>%
  arrange(desc(rating)) %>%
  filter(row_number() <= 5L) 
kable(head(rec_movie, n=6),caption = "Recommendation for test set")

user_id	movieID	rating
257	126	0
262	961	0
272	184	0
767	1157	0
771	1403	0
690	7	0

# use rmse function of Metrics to calculate RMSE for evaluation of accuracy of predictions
new_rating <- semi_join(predict_test_1, old_rated, by = c("user_id","movieID"))

rmse_3 <-rmse(old_rated$rating,new_rating$rating)
print(paste("RMSE of prediction for test set is",round(rmse_3,2)))

## [1] "RMSE of prediction for test set is 16.3"

In terms of RMSE, SVD based matrix decomposition performed better when the model was built on the normalized ratings instead of buidt on the original rating(data not shown).

Funk SVD is not appropreiate as it generate none predction.

4.Find the optimized number of latent features (r or k) by minimizing the Root Mean Square Error (using irlba package to try different r/k)**

#The irlba function only computes the number of singular values corresponding to the maximum of the desired singular vectors, max(nu, nv).
# max(nu, nv) must be strictly less than min(nrow(dtm), ncol(dtm)), so first see the 
max_nunv <-min(nrow(movie_m), ncol(movie_m))
print(paste("Number of left singular vectors to estimate, nu, and number of right, nv, should be less than",max_nunv,"."))

## [1] "Number of left singular vectors to estimate, nu, and number of right, nv, should be less than 471 ."

# try several r to see which one give the best prediction
SVD_5=irlba(getRatingMatrix(rating_m),nu=5,nv=5)
SVD_10=irlba(getRatingMatrix(rating_m),nu=10,nv=10)
SVD_20=irlba(getRatingMatrix(rating_m),nu=20,nv=20)
SVD_50=irlba(getRatingMatrix(rating_m),nu=50,nv=50)
SVD_100=irlba(getRatingMatrix(rating_m),nu=100,nv=100)
plot(SVD_10$d, main="Largest singular values (r=10)")

r=10

u_sigma <- SVD_10$u %*% diag(SVD_10$d)
predict_rating <- u_sigma %*% t(SVD_10$v)

# convert user-item matrix to long data frame
predict_rating <- as.data.frame(predict_rating)
predict_rating <- cbind('user_id'= unlist(user_all),predict_rating)
colnames(predict_rating)<- c('user_id',unlist(movie_all))
predict <- gather(predict_rating,movieID, rating,2:833)

# find the moive user have not rated yet
predict[,c('movieID','rating')] <- lapply(predict[,c('movieID','rating')], function(x) as.numeric(x))
old_rated <- rating_1[!(is.na(rating_1$rating)),]
not_rated <- anti_join(predict, old_rated, by = c("user_id","movieID"))

# find the movie with high predicted ratings to recommend to users
rec_movie <- not_rated %>% 
  group_by(user_id) %>%
  arrange(desc(rating)) %>%
  filter(row_number() <= 5L) 
kable(head(rec_movie, n=6),caption = "Recommendation for all rating")

user_id	movieID	rating
374	1502	1.761509
21	1502	1.598378
374	34	1.573657
174	1319	1.403222
533	572	1.091957
389	1283	1.067422

# use rmse function of Metrics to calculate RMSE for evaluation of accuracy of predictions
new_rating <- semi_join(predict, old_rated, by = c("user_id","movieID"))

rmse_4 <-rmse(old_rated$rating,new_rating$rating)
print(paste("RMSE of prediction for all rating when r=10 is",round(rmse_4,2)))

## [1] "RMSE of prediction for all rating when r=10 is 0.86"

The RMSE derived from SVD_10 calculated by irlba pacakge is different from that calclated by recommenderlab. irlba use truncated SVD.

Several number of singular values(5, 20, 50, and 100) will be compared with r=10.

r=5

u_sigma <- SVD_5$u %*% diag(SVD_5$d)
predict_rating <- u_sigma %*% t(SVD_5$v)

# convert user-item matrix to long data frame
predict_rating <- as.data.frame(predict_rating)
predict_rating <- cbind('user_id'= unlist(user_all),predict_rating)
colnames(predict_rating)<- c('user_id',unlist(movie_all))
predict <- gather(predict_rating,movieID, rating,2:833)

# find the moive user have not rated yet
predict[,c('movieID','rating')] <- lapply(predict[,c('movieID','rating')], function(x) as.numeric(x))
old_rated <- rating_1[!(is.na(rating_1$rating)),]
not_rated <- anti_join(predict, old_rated, by = c("user_id","movieID"))

# find the movie with high predicted ratings to recommend to users
rec_movie <- not_rated %>% 
  group_by(user_id) %>%
  arrange(desc(rating)) %>%
  filter(row_number() <= 5L) 
kable(head(rec_movie, n=6),caption = "Recommendation for all rating")

user_id	movieID	rating
174	1319	1.733024
174	1283	1.513814
846	500	1.348276
833	391	1.250866
532	1319	1.229991
374	1502	1.187603

# use rmse function of Metrics to calculate RMSE for evaluation of accuracy of predictions
new_rating <- semi_join(predict, old_rated, by = c("user_id","movieID"))

rmse_5 <-rmse(old_rated$rating,new_rating$rating)
print(paste("RMSE of prediction for all rating when r=5 is",round(rmse_5,2)))

## [1] "RMSE of prediction for all rating when r=5 is 0.92"

r=20

u_sigma <- SVD_20$u %*% diag(SVD_20$d)
predict_rating <- u_sigma %*% t(SVD_20$v)

# convert user-item matrix to long data frame
predict_rating <- as.data.frame(predict_rating)
predict_rating <- cbind('user_id'= unlist(user_all),predict_rating)
colnames(predict_rating)<- c('user_id',unlist(movie_all))
predict <- gather(predict_rating,movieID, rating,2:833)

# find the moive user have not rated yet
predict[,c('movieID','rating')] <- lapply(predict[,c('movieID','rating')], function(x) as.numeric(x))
old_rated <- rating_1[!(is.na(rating_1$rating)),]
not_rated <- anti_join(predict, old_rated, by = c("user_id","movieID"))

# find the movie with high predicted ratings to recommend to users
rec_movie <- not_rated %>% 
  group_by(user_id) %>%
  arrange(desc(rating)) %>%
  filter(row_number() <= 5L) 
kable(head(rec_movie, n=6),caption = "Recommendation for all rating")

user_id	movieID	rating
561	306	1.300612
437	391	1.289824
21	1502	1.207223
561	615	1.198636
561	1286	1.193565
881	34	1.143125

# use rmse function of Metrics to calculate RMSE for evaluation of accuracy of predictions
new_rating <- semi_join(predict, old_rated, by = c("user_id","movieID"))

rmse_6 <-rmse(old_rated$rating,new_rating$rating)
print(paste("RMSE of prediction for all rating when r=20 is",round(rmse_6,2)))

## [1] "RMSE of prediction for all rating when r=20 is 0.78"

r=50

u_sigma <- SVD_50$u %*% diag(SVD_50$d)
predict_rating <- u_sigma %*% t(SVD_50$v)

# convert user-item matrix to long data frame
predict_rating <- as.data.frame(predict_rating)
predict_rating <- cbind('user_id'= unlist(user_all),predict_rating)
colnames(predict_rating)<- c('user_id',unlist(movie_all))
predict <- gather(predict_rating,movieID, rating,2:833)

# find the moive user have not rated yet
predict[,c('movieID','rating')] <- lapply(predict[,c('movieID','rating')], function(x) as.numeric(x))
old_rated <- rating_1[!(is.na(rating_1$rating)),]
not_rated <- anti_join(predict, old_rated, by = c("user_id","movieID"))

# find the movie with high predicted ratings to recommend to users
rec_movie <- not_rated %>% 
  group_by(user_id) %>%
  arrange(desc(rating)) %>%
  filter(row_number() <= 5L) 
kable(head(rec_movie, n=6),caption = "Recommendation for all rating")

user_id	movieID	rating
451	45	1.0361451
617	435	1.0183612
474	416	1.0006906
664	1458	1.0006452
21	45	0.9610552
533	572	0.9530407

# use rmse function of Metrics to calculate RMSE for evaluation of accuracy of predictions
new_rating <- semi_join(predict, old_rated, by = c("user_id","movieID"))

rmse_7 <-rmse(old_rated$rating,new_rating$rating)
print(paste("RMSE of prediction for all rating when r=50 is",round(rmse_7,2)))

## [1] "RMSE of prediction for all rating when r=50 is 0.58"

r=100

u_sigma <- SVD_100$u %*% diag(SVD_100$d)
predict_rating <- u_sigma %*% t(SVD_100$v)

# convert user-item matrix to long data frame
predict_rating <- as.data.frame(predict_rating)
predict_rating <- cbind('user_id'= unlist(user_all),predict_rating)
colnames(predict_rating)<- c('user_id',unlist(movie_all))
predict <- gather(predict_rating,movieID, rating,2:833)

# find the moive user have not rated yet
predict[,c('movieID','rating')] <- lapply(predict[,c('movieID','rating')], function(x) as.numeric(x))
old_rated <- rating_1[!(is.na(rating_1$rating)),]
not_rated <- anti_join(predict, old_rated, by = c("user_id","movieID"))

# find the movie with high predicted ratings to recommend to users
rec_movie <- not_rated %>% 
  group_by(user_id) %>%
  arrange(desc(rating)) %>%
  filter(row_number() <= 5L) 
kable(head(rec_movie, n=6),caption = "Recommendation for all rating")

user_id	movieID	rating
102	401	0.7731909
63	885	0.7143455
934	615	0.6763654
448	1022	0.6555326
618	1393	0.6495313
167	696	0.6441336

# use rmse function of Metrics to calculate RMSE for evaluation of accuracy of predictions
new_rating <- semi_join(predict, old_rated, by = c("user_id","movieID"))

rmse_8 <-rmse(old_rated$rating,new_rating$rating)
print(paste("RMSE of prediction for all rating when r=100 is",round(rmse_8,2)))

## [1] "RMSE of prediction for all rating when r=100 is 0.37"

kable(RMSE <- data.frame('r'=c(5,10,20,50,100),'RMSE'=c(rmse_5,rmse_4,rmse_6,rmse_7,rmse_8)))

r	RMSE
5	0.9169962
10	0.8628613
20	0.7763696
50	0.5848391
100	0.3744922

ggplot(RMSE,aes(r,RMSE))+
  geom_line(col='slateblue2',size=1)+geom_point(col='slateblue2',size=2)+labs(x="Number of Singular Values",y="RMSE")

Conlcuion: The RMSE decreased as the number of singular values increase, suggesting that SVD approximation get closer to real rating as we increased the latent features.

Acknowledge

Thank you Kelly Shaffer's discussion for the part 1(Building and testing the SVD, Funk SVD, and ALS recommendation algorithms)

Reference:

1.Bregt Verreet, The alternating least squares algorithm in recommenderlab. https://beckernick.github.io/matrix-factorization-recommender/

2.Nick Becker, Matrix Factorization for Movie Recommendations in Python http://www.infofarm.be/articles/alternating-least-squares-algorithm-recommenderlab