singular value decomposition (SVD)

SVD builds features that may or may not map neatly to items (such as movie genres or news topics). As in many areas of machine learning, the lack of explainability can be an issue).

SVD requires that there are no missing values. There are various ways to handle this, including (1) imputation of missing values, (2) mean-centering values around 0, or (3) using a more advance technique, such as stochastic gradient descent to simulate SVD in populating the factored matrices.

Calculating the SVD matrices can be computationally expensive, although calculating ratings once the factorization is completed is very fast. You may need to create a subset of your data for SVD calculations to be successfully performed, especially on a machine with a small RAM footprint.

Modeling using SVDApproximation library

library(data.table)
library(devtools)
if("SVDApproximation" %in% rownames(installed.packages()) == FALSE){
  install_github(repo = "SVDApproximation", username = "tarashnot")
}
library(SVDApproximation)
## Warning: replacing previous import 'data.table::melt' by 'reshape2::melt'
## when loading 'SVDApproximation'
## Warning: replacing previous import 'data.table::dcast' by 'reshape2::dcast'
## when loading 'SVDApproximation'
dim(ratings)
## [1] 1000209       3
head(ratings)
##    user item rating
## 1:    1    1      5
## 2:    6    1      4
## 3:    8    1      4
## 4:    9    1      5
## 5:   10    1      5
## 6:   18    1      4
summary(ratings)
##       user           item          rating     
##  Min.   :   1   Min.   :   1   Min.   :1.000  
##  1st Qu.:1506   1st Qu.: 966   1st Qu.:3.000  
##  Median :3070   Median :1658   Median :4.000  
##  Mean   :3025   Mean   :1731   Mean   :3.582  
##  3rd Qu.:4476   3rd Qu.:2566   3rd Qu.:4.000  
##  Max.   :6040   Max.   :3706   Max.   :5.000
visualize_ratings(ratings_table = ratings)

set.seed(1)
mtx <- split_ratings(ratings_table = ratings,
                     proportion = c(0.7, 0.15, 0.15))

model <- svd_build(mtx)

model_tunes <- svd_tune(model, r = 2:50)
## Model is evaluated for r = 2 
## Model is evaluated for r = 3 
## Model is evaluated for r = 4 
## Model is evaluated for r = 5 
## Model is evaluated for r = 6 
## Model is evaluated for r = 7 
## Model is evaluated for r = 8 
## Model is evaluated for r = 9 
## Model is evaluated for r = 10 
## Model is evaluated for r = 11 
## Model is evaluated for r = 12 
## Model is evaluated for r = 13 
## Model is evaluated for r = 14 
## Model is evaluated for r = 15 
## Model is evaluated for r = 16 
## Model is evaluated for r = 17 
## Model is evaluated for r = 18 
## Model is evaluated for r = 19 
## Model is evaluated for r = 20 
## Model is evaluated for r = 21 
## Model is evaluated for r = 22 
## Model is evaluated for r = 23 
## Model is evaluated for r = 24 
## Model is evaluated for r = 25 
## Model is evaluated for r = 26 
## Model is evaluated for r = 27 
## Model is evaluated for r = 28 
## Model is evaluated for r = 29 
## Model is evaluated for r = 30 
## Model is evaluated for r = 31 
## Model is evaluated for r = 32 
## Model is evaluated for r = 33 
## Model is evaluated for r = 34 
## Model is evaluated for r = 35 
## Model is evaluated for r = 36 
## Model is evaluated for r = 37 
## Model is evaluated for r = 38 
## Model is evaluated for r = 39 
## Model is evaluated for r = 40 
## Model is evaluated for r = 41 
## Model is evaluated for r = 42 
## Model is evaluated for r = 43 
## Model is evaluated for r = 44 
## Model is evaluated for r = 45 
## Model is evaluated for r = 46 
## Model is evaluated for r = 47 
## Model is evaluated for r = 48 
## Model is evaluated for r = 49 
## Model is evaluated for r = 50
model_tunes$train_vs_valid

rmse_svd <- svd_rmse(model, r = model_tunes$r_best, rmse_type = c("test"))
rmse_svd
## [1] 0.9313392

Alternating Least Squares (ALS)

if("recommenderlab" %in% rownames(installed.packages()) == FALSE){
  install_github("mhahsler/recommenderlab")
}
library(recommenderlab)
## Loading required package: Matrix
## Loading required package: arules
## 
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
## 
##     abbreviate, write
## Loading required package: proxy
## 
## Attaching package: 'proxy'
## The following object is masked from 'package:Matrix':
## 
##     as.matrix
## The following objects are masked from 'package:stats':
## 
##     as.dist, dist
## The following object is masked from 'package:base':
## 
##     as.matrix
## Loading required package: registry
library(ggplot2)

# binary data random sample from anonymous web click-stream data from microsoft.com
data("MovieLense")

class(MovieLense)
## [1] "realRatingMatrix"
## attr(,"package")
## [1] "recommenderlab"
(dim(MovieLense))
## [1]  943 1664
slotNames(MovieLense)
## [1] "data"      "normalize"
class(MovieLense@data)
## [1] "dgCMatrix"
## attr(,"package")
## [1] "Matrix"
(dim(MovieLense@data))
## [1]  943 1664
vector_ratings <- as.vector(MovieLense@data)
(unique(vector_ratings))
## [1] 5 4 0 3 1 2
# remove missing values presented by 0
vector_ratings <- vector_ratings[vector_ratings != 0]

# visulization
qplot(factor(vector_ratings)) +
  ggtitle("Distribution of the ratings")

# popularity of the data
min_n_movies <- quantile(rowCounts(MovieLense), .99)
min_n_users <- quantile(colCounts(MovieLense), .99)
image(MovieLense[rowCounts(MovieLense) > min_n_movies, 
                 colCounts(MovieLense) > min_n_users], 
                 main = 'Heatmap of the top users and movies')

# data selection
ratings_movies <- MovieLense[rowCounts(MovieLense) > 50, colCounts(MovieLense) > 100]
dim(ratings_movies)
## [1] 560 332
# normalization
ratings_movies_norm <- normalize(ratings_movies)
sum(rowMeans(ratings_movies_norm) > .00001)
## [1] 0
min_movies <- quantile(rowCounts(ratings_movies), .98)
min_users <- quantile(colCounts(ratings_movies), .98)
image(ratings_movies[rowCounts(ratings_movies) > min_movies, 
                 colCounts(ratings_movies) > min_users], 
                 main = 'Heatmap of the top users and movies')

image(ratings_movies_norm[rowCounts(ratings_movies_norm) > min_movies, 
                 colCounts(ratings_movies_norm) > min_users], 
                 main = 'Heatmap of the top users and movies')

# split data
which_train <- sample(x = c(TRUE, FALSE), size = nrow(ratings_movies), replace = TRUE, prob = c(0.8, 0.2))
recc_data_train <- ratings_movies[which_train, ]
recc_data_test <- ratings_movies[!which_train, ]

rec <- Recommender(recc_data_train, method = "ALS", parameter = list(lambda=0.1, n_factors=10,
                                  n_iterations=10, seed = NULL, verbose = FALSE))
model_details <- getModel(rec)

# predict 6 items to each of the users in test data set
recom <- predict(rec, recc_data_test, n = 5)
recom
## Recommendations as 'topNList' with n = 5 for 108 users.
recc_matrix <- sapply(recom@items, function(x) {
  colnames(ratings_movies)[x]
})
recc_matrix[, 1:4]
##      7                            10                            
## [1,] "Wrong Trousers, The (1993)" "Schindler's List (1993)"     
## [2,] "Close Shave, A (1995)"      "Wrong Trousers, The (1993)"  
## [3,] "Good Will Hunting (1997)"   "Close Shave, A (1995)"       
## [4,] "As Good As It Gets (1997)"  "To Kill a Mockingbird (1962)"
## [5,] "L.A. Confidential (1997)"   "Good Will Hunting (1997)"    
##      12                                
## [1,] "Shawshank Redemption, The (1994)"
## [2,] "Titanic (1997)"                  
## [3,] "Good Will Hunting (1997)"        
## [4,] "Braveheart (1995)"               
## [5,] "Casablanca (1942)"               
##      22                                                                           
## [1,] "Wrong Trousers, The (1993)"                                                 
## [2,] "Close Shave, A (1995)"                                                      
## [3,] "Usual Suspects, The (1995)"                                                 
## [4,] "Pulp Fiction (1994)"                                                        
## [5,] "Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963)"
# evaluate
eval_sets <- evaluationScheme(data = ratings_movies, method="split", train = .9, given = 5, k= NULL)
eval_reco <- Recommender(data = getData(eval_sets, 'train'), method = 'ALS', parameter = NULL)
eval_prediction <- predict(object = eval_reco, newdata = getData(eval_sets, "known"),  n = 10, type = 'ratings')

accuracy_table <- function(scheme, algorithm, parameter){
  als <- Recommender(getData(scheme, "train"), algorithm, parameter = parameter)
  p <- predict(als, getData(scheme, "known"), type="ratings")
  acc_list <- calcPredictionAccuracy(p, getData(scheme, "unknown"))
  total_list <- c(algorithm =algorithm, acc_list)
  total_list <- total_list[sapply(total_list, function(x) !is.null(x))]
  return(data.frame(as.list(total_list)))
}

table_ALS_1 <- accuracy_table(eval_sets, algorithm = "ALS",
                  parameter = list(lambda=0.1, n_factors=200,
                  n_iterations=10, seed = 1234, verbose = FALSE))

table_ALS_1$RMSE
## [1] 1.02449062340107
## Levels: 1.02449062340107
# evaluate with normalized the data
eval_sets2 <- evaluationScheme(data = ratings_movies_norm, method="split", train = .9, given = 5, k= NULL)
eval_reco2 <- Recommender(data = getData(eval_sets2, 'train'), method = 'ALS', parameter = NULL)
eval_prediction2 <- predict(object = eval_reco2, newdata = getData(eval_sets2, "known"), n = 10, type = 'ratings')

table_ALS_2 <- accuracy_table(eval_sets2, algorithm = "ALS",
                  parameter = list(lambda=0.1, n_factors=200,
                  n_iterations=10, seed = 1234, verbose = FALSE))

table_ALS_2$RMSE
## [1] 0.978723960614586
## Levels: 0.978723960614586

Summary of findings and recommendations

I learned 2 recommender system algorithms in this project SVD and ALS, visualized the data, and compared some results of the models.

The model using SVDApproximation library has RMSE .93; the model with ALS algorithm has 2 RMSE results for raw data and normalized data respectively: 1.02 and 0.98.

Two different datasets are used for SVD and ALS models, so the RMSEs of SVD and ALS will not be compared. I compared the results of 2 ALS RMSEs 1.02 and .98, it showed that modeling with normalized data produced a better result than using raw data, reducing RMSE by 4% ((1.02-.98)/0.98).

Reference:

http://www.infofarm.be/articles/alternating-least-squares-algorithm-recommenderlab

https://rpubs.com/tarashnot/recommender_comparison

https://www.r-bloggers.com/recosystem-recommender-system-using-parallel-matrix-factorization/

https://ashokharnal.wordpress.com/2014/12/18/using-recommenderlab-for-predicting-ratings-for-movielens-data/

http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.168.213&rep=rep1&type=pdf

https://rpubs.com/waltw/285262