SVD builds features that may or may not map neatly to items (such as movie genres or news topics). As in many areas of machine learning, the lack of explainability can be an issue).
SVD requires that there are no missing values. There are various ways to handle this, including (1) imputation of missing values, (2) mean-centering values around 0, or (3)
Calculating the SVD matrices can be computationally expensive, although calculating ratings once the factorization is completed is very fast. You may need to create a subset of your data for SVD calculations to be successfully performed, especially on a machine with a small RAM footprint.
library(data.table)
library(devtools)
if("SVDApproximation" %in% rownames(installed.packages()) == FALSE){
install_github(repo = "SVDApproximation", username = "tarashnot")
}
library(SVDApproximation)
## Warning: replacing previous import 'data.table::melt' by 'reshape2::melt'
## when loading 'SVDApproximation'
## Warning: replacing previous import 'data.table::dcast' by 'reshape2::dcast'
## when loading 'SVDApproximation'
dim(ratings)
## [1] 1000209 3
head(ratings)
## user item rating
## 1: 1 1 5
## 2: 6 1 4
## 3: 8 1 4
## 4: 9 1 5
## 5: 10 1 5
## 6: 18 1 4
summary(ratings)
## user item rating
## Min. : 1 Min. : 1 Min. :1.000
## 1st Qu.:1506 1st Qu.: 966 1st Qu.:3.000
## Median :3070 Median :1658 Median :4.000
## Mean :3025 Mean :1731 Mean :3.582
## 3rd Qu.:4476 3rd Qu.:2566 3rd Qu.:4.000
## Max. :6040 Max. :3706 Max. :5.000
visualize_ratings(ratings_table = ratings)
set.seed(1)
mtx <- split_ratings(ratings_table = ratings,
proportion = c(0.7, 0.15, 0.15))
model <- svd_build(mtx)
model_tunes <- svd_tune(model, r = 2:50)
## Model is evaluated for r = 2
## Model is evaluated for r = 3
## Model is evaluated for r = 4
## Model is evaluated for r = 5
## Model is evaluated for r = 6
## Model is evaluated for r = 7
## Model is evaluated for r = 8
## Model is evaluated for r = 9
## Model is evaluated for r = 10
## Model is evaluated for r = 11
## Model is evaluated for r = 12
## Model is evaluated for r = 13
## Model is evaluated for r = 14
## Model is evaluated for r = 15
## Model is evaluated for r = 16
## Model is evaluated for r = 17
## Model is evaluated for r = 18
## Model is evaluated for r = 19
## Model is evaluated for r = 20
## Model is evaluated for r = 21
## Model is evaluated for r = 22
## Model is evaluated for r = 23
## Model is evaluated for r = 24
## Model is evaluated for r = 25
## Model is evaluated for r = 26
## Model is evaluated for r = 27
## Model is evaluated for r = 28
## Model is evaluated for r = 29
## Model is evaluated for r = 30
## Model is evaluated for r = 31
## Model is evaluated for r = 32
## Model is evaluated for r = 33
## Model is evaluated for r = 34
## Model is evaluated for r = 35
## Model is evaluated for r = 36
## Model is evaluated for r = 37
## Model is evaluated for r = 38
## Model is evaluated for r = 39
## Model is evaluated for r = 40
## Model is evaluated for r = 41
## Model is evaluated for r = 42
## Model is evaluated for r = 43
## Model is evaluated for r = 44
## Model is evaluated for r = 45
## Model is evaluated for r = 46
## Model is evaluated for r = 47
## Model is evaluated for r = 48
## Model is evaluated for r = 49
## Model is evaluated for r = 50
model_tunes$train_vs_valid
rmse_svd <- svd_rmse(model, r = model_tunes$r_best, rmse_type = c("test"))
rmse_svd
## [1] 0.9313392
if("recommenderlab" %in% rownames(installed.packages()) == FALSE){
install_github("mhahsler/recommenderlab")
}
library(recommenderlab)
## Loading required package: Matrix
## Loading required package: arules
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
## Loading required package: proxy
##
## Attaching package: 'proxy'
## The following object is masked from 'package:Matrix':
##
## as.matrix
## The following objects are masked from 'package:stats':
##
## as.dist, dist
## The following object is masked from 'package:base':
##
## as.matrix
## Loading required package: registry
library(ggplot2)
# binary data random sample from anonymous web click-stream data from microsoft.com
data("MovieLense")
class(MovieLense)
## [1] "realRatingMatrix"
## attr(,"package")
## [1] "recommenderlab"
(dim(MovieLense))
## [1] 943 1664
slotNames(MovieLense)
## [1] "data" "normalize"
class(MovieLense@data)
## [1] "dgCMatrix"
## attr(,"package")
## [1] "Matrix"
(dim(MovieLense@data))
## [1] 943 1664
vector_ratings <- as.vector(MovieLense@data)
(unique(vector_ratings))
## [1] 5 4 0 3 1 2
# remove missing values presented by 0
vector_ratings <- vector_ratings[vector_ratings != 0]
# visulization
qplot(factor(vector_ratings)) +
ggtitle("Distribution of the ratings")
# popularity of the data
min_n_movies <- quantile(rowCounts(MovieLense), .99)
min_n_users <- quantile(colCounts(MovieLense), .99)
image(MovieLense[rowCounts(MovieLense) > min_n_movies,
colCounts(MovieLense) > min_n_users],
main = 'Heatmap of the top users and movies')
# data selection
ratings_movies <- MovieLense[rowCounts(MovieLense) > 50, colCounts(MovieLense) > 100]
dim(ratings_movies)
## [1] 560 332
# normalization
ratings_movies_norm <- normalize(ratings_movies)
sum(rowMeans(ratings_movies_norm) > .00001)
## [1] 0
min_movies <- quantile(rowCounts(ratings_movies), .98)
min_users <- quantile(colCounts(ratings_movies), .98)
image(ratings_movies[rowCounts(ratings_movies) > min_movies,
colCounts(ratings_movies) > min_users],
main = 'Heatmap of the top users and movies')
image(ratings_movies_norm[rowCounts(ratings_movies_norm) > min_movies,
colCounts(ratings_movies_norm) > min_users],
main = 'Heatmap of the top users and movies')
# split data
which_train <- sample(x = c(TRUE, FALSE), size = nrow(ratings_movies), replace = TRUE, prob = c(0.8, 0.2))
recc_data_train <- ratings_movies[which_train, ]
recc_data_test <- ratings_movies[!which_train, ]
rec <- Recommender(recc_data_train, method = "ALS", parameter = list(lambda=0.1, n_factors=10,
n_iterations=10, seed = NULL, verbose = FALSE))
model_details <- getModel(rec)
# predict 6 items to each of the users in test data set
recom <- predict(rec, recc_data_test, n = 5)
recom
## Recommendations as 'topNList' with n = 5 for 108 users.
recc_matrix <- sapply(recom@items, function(x) {
colnames(ratings_movies)[x]
})
recc_matrix[, 1:4]
## 7 10
## [1,] "Wrong Trousers, The (1993)" "Schindler's List (1993)"
## [2,] "Close Shave, A (1995)" "Wrong Trousers, The (1993)"
## [3,] "Good Will Hunting (1997)" "Close Shave, A (1995)"
## [4,] "As Good As It Gets (1997)" "To Kill a Mockingbird (1962)"
## [5,] "L.A. Confidential (1997)" "Good Will Hunting (1997)"
## 12
## [1,] "Shawshank Redemption, The (1994)"
## [2,] "Titanic (1997)"
## [3,] "Good Will Hunting (1997)"
## [4,] "Braveheart (1995)"
## [5,] "Casablanca (1942)"
## 22
## [1,] "Wrong Trousers, The (1993)"
## [2,] "Close Shave, A (1995)"
## [3,] "Usual Suspects, The (1995)"
## [4,] "Pulp Fiction (1994)"
## [5,] "Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963)"
# evaluate
eval_sets <- evaluationScheme(data = ratings_movies, method="split", train = .9, given = 5, k= NULL)
eval_reco <- Recommender(data = getData(eval_sets, 'train'), method = 'ALS', parameter = NULL)
eval_prediction <- predict(object = eval_reco, newdata = getData(eval_sets, "known"), n = 10, type = 'ratings')
accuracy_table <- function(scheme, algorithm, parameter){
als <- Recommender(getData(scheme, "train"), algorithm, parameter = parameter)
p <- predict(als, getData(scheme, "known"), type="ratings")
acc_list <- calcPredictionAccuracy(p, getData(scheme, "unknown"))
total_list <- c(algorithm =algorithm, acc_list)
total_list <- total_list[sapply(total_list, function(x) !is.null(x))]
return(data.frame(as.list(total_list)))
}
table_ALS_1 <- accuracy_table(eval_sets, algorithm = "ALS",
parameter = list(lambda=0.1, n_factors=200,
n_iterations=10, seed = 1234, verbose = FALSE))
table_ALS_1$RMSE
## [1] 1.02449062340107
## Levels: 1.02449062340107
# evaluate with normalized the data
eval_sets2 <- evaluationScheme(data = ratings_movies_norm, method="split", train = .9, given = 5, k= NULL)
eval_reco2 <- Recommender(data = getData(eval_sets2, 'train'), method = 'ALS', parameter = NULL)
eval_prediction2 <- predict(object = eval_reco2, newdata = getData(eval_sets2, "known"), n = 10, type = 'ratings')
table_ALS_2 <- accuracy_table(eval_sets2, algorithm = "ALS",
parameter = list(lambda=0.1, n_factors=200,
n_iterations=10, seed = 1234, verbose = FALSE))
table_ALS_2$RMSE
## [1] 0.978723960614586
## Levels: 0.978723960614586
I learned 2 recommender system algorithms in this project SVD and ALS, visualized the data, and compared some results of the models.
The model using SVDApproximation library has RMSE .93; the model with ALS algorithm has 2 RMSE results for raw data and normalized data respectively: 1.02 and 0.98.
Two different datasets are used for SVD and ALS models, so the RMSEs of SVD and ALS will not be compared. I compared the results of 2 ALS RMSEs 1.02 and .98, it showed that modeling with normalized data produced a better result than using raw data, reducing RMSE by 4% ((1.02-.98)/0.98).
http://www.infofarm.be/articles/alternating-least-squares-algorithm-recommenderlab
https://rpubs.com/tarashnot/recommender_comparison
https://www.r-bloggers.com/recosystem-recommender-system-using-parallel-matrix-factorization/
http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.168.213&rep=rep1&type=pdf