DATA 643 Project 5 | Implementing a Recommender System on Spark
library(sparklyr)
## Warning: package 'sparklyr' was built under R version 3.6.2
library(recommenderlab)
## Loading required package: Matrix
## Loading required package: arules
## Warning: package 'arules' was built under R version 3.6.2
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
## Loading required package: proxy
## Warning: package 'proxy' was built under R version 3.6.2
##
## Attaching package: 'proxy'
## The following object is masked from 'package:Matrix':
##
## as.matrix
## The following objects are masked from 'package:stats':
##
## as.dist, dist
## The following object is masked from 'package:base':
##
## as.matrix
## Loading required package: registry
## Registered S3 methods overwritten by 'registry':
## method from
## print.registry_field proxy
## print.registry_entry proxy
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1 ✔ purrr 0.3.2
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 1.0.0 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::expand() masks Matrix::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ purrr::invoke() masks sparklyr::invoke()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tidyr::pack() masks Matrix::pack()
## ✖ dplyr::recode() masks arules::recode()
## ✖ tidyr::unpack() masks Matrix::unpack()
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
Distributed Recommender System
# Make sure a local Spark 2.1.0 distribution is available to sparklyr.
spark_install(version = "2.1.0")
# Load the MovieLense rating matrix and flatten it into a long data frame;
# head() below shows its columns: user, item, rating.
data(MovieLense)
movielense <- MovieLense
movies <- as(object = movielense, Class = "data.frame")
head(movies, n = 6L)
## user item rating
## 1 1 Toy Story (1995) 5
## 453 1 GoldenEye (1995) 3
## 584 1 Four Rooms (1995) 4
## 674 1 Get Shorty (1995) 3
## 883 1 Copycat (1995) 3
## 969 1 Shanghai Triad (Yao a yao yao dao waipo qiao) (1995) 5
# Replace the user and item columns with numeric ids. For items the result is
# a numeric code rather than the title (the printed output shows
# "Toy Story (1995)" becoming 1525).
movies1 <- mutate(
  movies,
  user = as.numeric(user),
  item = as.numeric(item)
)
head(movies1, n = 6L)
## user item rating
## 1 1 1525 5
## 2 1 618 3
## 3 1 555 4
## 4 1 594 3
## 5 1 344 3
## 6 1 1318 5
# Connect to a local Spark instance. The version is pinned to the one passed to
# spark_install() earlier in this file so the connection cannot silently pick
# up a different local installation.
sc <- spark_connect(master = "local", version = "2.1.0")
# Timing starts after the connection is established; it covers the data copy,
# the split, model fitting, prediction and collecting results back into R.
start_spark <- Sys.time()
# copy data to spark
rating_matrix <- sdf_copy_to(sc, movies1, "sdf_rating_matrix", overwrite = TRUE)
# split dataset in spark; a fixed seed makes the 80/20 split (and therefore the
# reported RMSE) reproducible between runs
partitioned <- rating_matrix %>%
  sdf_random_split(training = 0.8, testing = 0.2, seed = 643)
# fit the model and make prediction
# ml_als() is called without column arguments, so it relies on sparklyr's
# default rating/user/item column names, which match the columns of movies1.
als_model <- ml_als(partitioned$training, max_iter = 5)
# collect() pulls the predictions for the test split back into a local table
als_prd <- ml_transform(als_model, partitioned$testing) %>% collect()
end_spark <- Sys.time()
spark_disconnect(sc)
Recommender system from previous projects
# Fix the RNG state so the random train/test split below is reproducible.
# Done before the timer starts so it does not affect the measured time.
set.seed(643)
start_prv <- Sys.time()
# Keep only users with more than 50 ratings and movies with more than 100
# ratings.
# NOTE(review): this run uses a filtered subset and a 90/10 split, whereas the
# Spark run uses all of movies1 with an 80/20 split, so the two results are not
# measured on identical data.
ratings_m <- MovieLense[rowCounts(MovieLense) > 50, colCounts(MovieLense) > 100]
scheme <- evaluationScheme(
  ratings_m,
  method = "split", train = 0.9, given = 15, goodRating = 3.5
)
# Train ALS on the training portion and predict ratings for the test users
# from their "known" ratings.
als_train <- Recommender(getData(scheme, "train"), "ALS")
als_pred <- predict(als_train, getData(scheme, "known"), type = "ratings")
end_prv <- Sys.time()
Evaluation
1. RMSE
# Score the recommenderlab predictions against the "unknown" ratings of the
# evaluation scheme.
unknown_ratings <- getData(scheme, "unknown")
rmse_ALS <- calcPredictionAccuracy(als_pred, unknown_ratings)
# Root mean squared error between observed and predicted values.
#
# o: numeric vector of observed values.
# p: numeric vector of predicted values.
# Returns the RMSE rounded to two decimal places; pairs where the squared
# error is missing are ignored.
rmse <- function(o, p) {
  squared_error <- (o - p)^2
  mean_squared_error <- mean(squared_error, na.rm = TRUE)
  round(sqrt(mean_squared_error), digits = 2)
}
# RMSE of the Spark predictions, computed with the rmse() helper on the
# collected test-set results.
rmse_spark <- rmse(als_prd[["rating"]], als_prd[["prediction"]])
# Side-by-side table; the first element of rmse_ALS is used as the
# recommenderlab value.
rmse_table <- cbind(rmse_ALS[[1]], rmse_spark)
kable_styling(
  kable(rmse_table, col.names = c("recommenderlab", "sparklyr")),
  bootstrap_options = "striped",
  position = "right",
  font_size = 10
)
| recommenderlab | sparklyr |
|---|---|
| 0.8991867 | 0.93 |
2. Processing time
# Express both elapsed times in seconds explicitly. Subtracting two Sys.time()
# values gives a difftime whose unit is picked automatically per value, and the
# table only shows bare numbers, so without a fixed unit one column could be in
# seconds and the other in minutes.
elapsed_prv <- as.numeric(difftime(end_prv, start_prv, units = "secs"))
elapsed_spark <- as.numeric(difftime(end_spark, start_spark, units = "secs"))
kable(cbind(elapsed_prv, elapsed_spark), col.names = c("recommenderlab", "sparklyr")) %>%
  kable_styling("striped", position = "right", font_size = 10)
| recommenderlab | sparklyr |
|---|---|
| 14.65515 | 18.32197 |