Using the sparklyr package in R to implement a recommender system
if (!require('sparklyr')) install.packages('sparklyr')
## Loading required package: sparklyr
if (!require('dplyr')) install.packages('dplyr')
## Loading required package: dplyr
## Warning: package 'dplyr' was built under R version 3.4.1
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
if (!require('SlopeOne')) install.packages('SlopeOne')
## Loading required package: SlopeOne
library(SVDApproximation)
## Warning: replacing previous import 'data.table::melt' by 'reshape2::melt'
## when loading 'SVDApproximation'
## Warning: replacing previous import 'data.table::dcast' by 'reshape2::dcast'
## when loading 'SVDApproximation'
# Spark version used throughout; the install and the connection must agree,
# otherwise a machine with several Spark installations may start another one.
spark_version <- "1.6.2"
sparklyr::spark_install(version = spark_version)
# Point Spark at a specific Java 8 runtime, but only when that directory
# actually exists: on other machines an already configured JAVA_HOME is left
# untouched instead of being replaced by a path that is not there.
java_path <- normalizePath(
  "C:/Program Files/Java/jre1.8.0_131",
  mustWork = FALSE
)
if (dir.exists(java_path)) {
  Sys.setenv(JAVA_HOME = java_path)
}
# Open a local Spark connection, pinned to the version installed above.
sc <- spark_connect(master = "local", version = spark_version)
## * Using Spark: 1.6.2
# Load the `ratings` data set into the session (presumably the one bundled
# with SVDApproximation, attached above -- verify).
data(ratings)
# Check the object's class and size; the recorded output shows a data.table
# with 1,000,209 rows and 3 columns.
class(ratings)
## [1] "data.table" "data.frame"
dim(ratings)
## [1] 1000209 3
# Preview the first rows; the columns are user, item and rating.
head(ratings)
## user item rating
## 1: 1 1 5
## 2: 6 1 4
## 3: 8 1 4
## 4: 9 1 5
## 5: 10 1 5
## 6: 18 1 4
# Per-column summary; in the recorded output ratings range from 1 to 5,
# user ids from 1 to 6040 and item ids from 1 to 3706.
summary(ratings)
## user item rating
## Min. : 1 Min. : 1 Min. :1.000
## 1st Qu.:1506 1st Qu.: 966 1st Qu.:3.000
## Median :3070 Median :1658 Median :4.000
## Mean :3025 Mean :1731 Mean :3.582
## 3rd Qu.:4476 3rd Qu.:2566 3rd Qu.:4.000
## Max. :6040 Max. :3706 Max. :5.000
# Keep only the users with an id below 50 so the example does not run out
# of memory.
ratings <- ratings[user < 50]
# Upload the reduced ratings to Spark, overwriting any existing copy.
ratings_tbl <- sdf_copy_to(sc, ratings, overwrite = TRUE)
# Split the Spark table 50/50 into 'training' and 'test' partitions, using a
# fixed seed.
partitions <- sdf_partition(
  ratings_tbl,
  training = 0.5,
  test = 0.5,
  seed = 1099
)
# Fit an ALS matrix-factorisation model on the training partition.
# The ratings are explicit 1-5 scores and the model is evaluated below by
# RMSE against those scores, so ALS has to run in explicit-feedback mode.
# With implicit.preferences = TRUE the predictions are preference scores
# rather than ratings, which made the RMSE meaningless (it came out close to
# the mean rating itself).
model <- ml_als_factorization(
  partitions$training,
  rating.column = "rating",
  user.column = "user",
  item.column = "item",
  iter.max = 5,
  regularization.parameter = 0.01,
  implicit.preferences = FALSE,
  alpha = 1.0 # only relevant to the implicit-feedback variant; kept as-is
)
summary(model)
## Length Class Mode
## item.factors 11 data.frame list
## user.factors 11 data.frame list
## data 2 spark_jobj environment
## ml.options 6 ml_options list
## model.parameters 2 -none- list
## .call 9 -none- call
## .model 2 spark_jobj environment
# Score the held-out test partition by invoking the fitted Spark model's
# `transform` method on the underlying Spark DataFrame, then collect the
# result into R.
predictions <- model$.model %>%
  invoke("transform", spark_dataframe(partitions$test)) %>%
  collect()
# Because of the data selection some predictions come back missing
# (presumably users or items that only occur in the test partition --
# verify); drop those rows before computing the error.
predictions <- predictions[!is.na(predictions$prediction), ]
# Root-mean-squared error between predicted and observed ratings.
RMSE <- sqrt(mean(with(predictions, prediction - rating)^2))
RMSE
## (output not re-recorded after switching to explicit feedback; the previous
## value, 3.578469, was produced with implicit.preferences = TRUE)
# Disconnect from the Spark connection `sc` opened earlier in the script.
spark_disconnect(sc)