Matrix Factorization with Gradient Descent

# Install recosystem only when it is not already available.
# requireNamespace() checks for this one package directly, instead of building
# the full installed.packages() table just to look up a single name.
if (!requireNamespace("recosystem", quietly = TRUE)) {
  install.packages("recosystem")
}
library(recosystem)
library(SVDApproximation)
## Warning: replacing previous import 'data.table::melt' by 'reshape2::melt'
## when loading 'SVDApproximation'
## Warning: replacing previous import 'data.table::dcast' by 'reshape2::dcast'
## when loading 'SVDApproximation'
# Randomly hold out a subset of rating rows as the test set; all remaining
# rows form the training set.
set.seed(1)  # fixed seed so the split is reproducible

n_ratings <- nrow(ratings)
# Test-set size: 20% of the number of distinct users (rounded), times 5.
n_test <- round(0.2 * length(unique(ratings$user)), 0) * 5

in_train <- rep(TRUE, n_ratings)
# sample.int() draws the same indices as sample(1:n, ...) but, unlike
# 1:nrow(ratings), does not degenerate to c(1, 0) when there are no rows.
in_train[sample.int(n_ratings, size = n_test)] <- FALSE

# NOTE(review): single-bracket logical indexing selects ROWS only if `ratings`
# is a data.table; on a plain data.frame it would select columns -- confirm
# the class of `ratings`.
ratings_train <- ratings[(in_train)]
ratings_test <- ratings[(!in_train)]

# Write a ratings table as space-separated text with no header row and no
# row names.
write_ratings <- function(tbl, path) {
  write.table(tbl, file = path, sep = " ", row.names = FALSE, col.names = FALSE)
}

# Persist both halves of the split to the working directory.
write_ratings(ratings_train, "trainset.txt")
write_ratings(ratings_test, "testset.txt")

# Recommender object; it is trained and queried via r$train() / r$predict().
r <- Reco()

# Data sources backed by the two text files written to disk.
# index1 = TRUE presumably marks user/item ids as 1-based -- confirm against
# the recosystem documentation.
train_data <- data_file("trainset.txt", index1 = TRUE)
test_data <- data_file("testset.txt", index1 = TRUE)

# tune model, select best tuning parameters
# opts <- r$tune(train_data,
#                 opts = list(dim = c(1:20),
#                             lrate = c(0.05),
#                             nthread = 4,
#                             costp_l1 = c(0, 0.1),
#                             costp_l2 = c(0.01, 0.1),
#                             costq_l1 = c(0, 0.1),
#                             costq_l2 = c(0.01, 0.1),
#                             niter = 200,
#                             nfold = 10,
#                             verbose = FALSE))


# save(opts, file = 'opts.RData')
# Restore the tuned options object (`opts`) saved by the tuning step above.
# load() puts it straight into the workspace; attach() would instead push the
# file onto the search path (once more on every re-run) where a global `opts`
# could silently mask it.
load("opts.RData")

# Train the model: start from the tuned parameter set in opts$min and append
# the run settings (thread count, iteration count, quiet output).
train_opts <- c(opts$min, nthread = 4, niter = 500, verbose = FALSE)
r$train(train_data, opts = train_opts)

# Predict ratings for the held-out test set, writing them to a temporary file.
pred_path <- tempfile()
out_pred <- out_file(pred_path)
r$predict(test_data, out_pred)
## prediction output generated at C:\Users\10121760\AppData\Local\Temp\Rtmp6z72J5\file13d544d775d41
# Observed ratings: third column of the test file (read without a header).
test_table <- read.table("testset.txt", header = FALSE, sep = " ")
scores_real <- test_table[["V3"]]
# Predicted ratings: read back from the prediction output file.
scores_pred <- scan(out_pred@dest)

# Root-mean-squared error between observed and predicted ratings.
score_errors <- scores_real - scores_pred
rmse_mf <- sqrt(mean(score_errors^2))
rmse_mf
## [1] 0.8452224
# Predict a rating for every (user, movie) pair with ids 1 to 20.
user <- seq_len(20L)
movie <- seq_len(20L)
# One row per pair: 20 x 20 = 400 combinations.
pred <- expand.grid(user = user, movie = movie)
# In-memory data source holding only the id pairs (no ratings supplied).
test_set <- data_memory(pred$user, pred$movie, index1 = TRUE)
# out_memory() asks for the predictions to be returned rather than written out.
pred[["rating"]] <- r$predict(test_set, out_memory())

# Heatmap of the predicted ratings: movies on x, users on y, fill = rating.
library(ggplot2)
ggplot(pred, aes(x = movie, y = user, fill = rating)) +
  geom_raster() +
  scale_fill_gradient(name = "Rating", low = "#d6e685", high = "#1e6823") +
  labs(x = "Movie ID", y = "User ID") +
  coord_fixed() +
  theme_bw(base_size = 22)

Summary and Findings:

  • Recosystem was introduced by Yu-Chin Juan, Wei-Sheng Chin, Yong Zhuang, Bo-Wen Yuan, Meng-Yuan Yang, and Chih-Jen Lin
  • Built on top of LIBMF
  • Parallel Matrix Factorization
  • Stores the model on the hard disk to reduce memory use
  • Create data source with data_file()