Data612 project 5

DATA 643 Project 5 | Implementing a Recommender System on Spark

library(sparklyr)

## Warning: package 'sparklyr' was built under R version 3.6.2

library(recommenderlab)

## Loading required package: Matrix

## Loading required package: arules

## Warning: package 'arules' was built under R version 3.6.2

## 
## Attaching package: 'arules'

## The following objects are masked from 'package:base':
## 
##     abbreviate, write

## Loading required package: proxy

## Warning: package 'proxy' was built under R version 3.6.2

## 
## Attaching package: 'proxy'

## The following object is masked from 'package:Matrix':
## 
##     as.matrix

## The following objects are masked from 'package:stats':
## 
##     as.dist, dist

## The following object is masked from 'package:base':
## 
##     as.matrix

## Loading required package: registry

## Registered S3 methods overwritten by 'registry':
##   method               from 
##   print.registry_field proxy
##   print.registry_entry proxy

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──

## ✔ ggplot2 3.2.1     ✔ purrr   0.3.2
## ✔ tibble  2.1.3     ✔ dplyr   0.8.3
## ✔ tidyr   1.0.0     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.5.0

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::expand() masks Matrix::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ purrr::invoke() masks sparklyr::invoke()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ tidyr::pack()   masks Matrix::pack()
## ✖ dplyr::recode() masks arules::recode()
## ✖ tidyr::unpack() masks Matrix::unpack()

library(kableExtra)

## 
## Attaching package: 'kableExtra'

## The following object is masked from 'package:dplyr':
## 
##     group_rows

Distributed Recommender System

# Install a local copy of Spark 2.1.0 for sparklyr to connect to.
spark_install(version = "2.1.0")

# Load the MovieLense ratings and flatten them into a long data frame with
# one row per (user, item, rating) triple.
data("MovieLense")

movielense <- MovieLense
movies <- as(object = movielense, Class = "data.frame")
head(x = movies)

##     user                                                 item rating
## 1      1                                     Toy Story (1995)      5
## 453    1                                     GoldenEye (1995)      3
## 584    1                                    Four Rooms (1995)      4
## 674    1                                    Get Shorty (1995)      3
## 883    1                                       Copycat (1995)      3
## 969    1 Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)      5

# Replace the user and item columns with numeric codes so that every column
# handed to Spark is numeric; the rating column is left untouched.
movies1 <- mutate(
  movies,
  user = as.numeric(user),
  item = as.numeric(item)
)
head(movies1)

##   user item rating
## 1    1 1525      5
## 2    1  618      3
## 3    1  555      4
## 4    1  594      3
## 5    1  344      3
## 6    1 1318      5

# Open a local Spark connection; the timer starts after the connection is up,
# so connection start-up is not counted in the Spark processing time.
sc <- spark_connect(master = "local")

start_spark <- Sys.time()
# copy data to spark
rating_matrix <- sdf_copy_to(sc, movies1, "sdf_rating_matrix", overwrite = TRUE)

# split dataset in spark (80% training / 20% testing); the fixed seed makes
# the split, and therefore the reported test set, the same on every run
partitioned <- rating_matrix %>%
  sdf_random_split(training = 0.8, testing = 0.2, seed = 643)

# fit the model and make prediction; collect() pulls the scored test rows
# (with their prediction column) back into a local R data frame
als_model <- ml_als(partitioned$training, max_iter = 5)
als_prd <- ml_transform(als_model, partitioned$testing) %>% collect()

end_spark <- Sys.time()
spark_disconnect(sc)

Recommender system from previous projects

# Fix the RNG seed so the random train/test split below (and any other random
# step in the fit) is the same on every run. Done before the timer starts so
# it does not count towards the processing time.
set.seed(643)

start_prv <- Sys.time()
# Keep only rows with more than 50 ratings and columns with more than 100.
ratings_m <- MovieLense[rowCounts(MovieLense) > 50, colCounts(MovieLense) > 100]
# 90% of the rows go to training; for the remaining rows 15 ratings are
# "known" to the recommender and the rest are held out as "unknown".
scheme <- evaluationScheme(ratings_m, method = "split", train = 0.9, given = 15, goodRating = 3.5)
als_train <- Recommender(getData(scheme, "train"), "ALS")
als_pred <- predict(als_train, getData(scheme, "known"), type = "ratings")

end_prv <- Sys.time()

Evaluation

1. RMSE

# Score the recommenderlab predictions against the held-out ("unknown")
# ratings of the evaluation scheme.
rmse_ALS <- calcPredictionAccuracy(
  x = als_pred,
  data = getData(scheme, "unknown")
)

#' Root-mean-square error between observed and predicted values
#'
#' Positions where the squared difference is `NA` (a missing value in either
#' vector) are ignored.
#'
#' @param o Numeric vector of observed values.
#' @param p Numeric vector of predicted values.
#' @param digits Number of decimal places to round the result to. Defaults to
#'   2, which reproduces the original behaviour; pass `NULL` to get the
#'   unrounded value.
#' @return A single numeric value.
rmse <- function(o, p, digits = 2) {
  error <- sqrt(mean((o - p)^2, na.rm = TRUE))
  if (is.null(digits)) {
    return(error)
  }
  round(error, digits)
}

# RMSE of the Spark predictions on the collected test rows (rounded to two
# decimals by rmse()).
rmse_spark <- rmse(als_prd$rating, als_prd$prediction)

# Pick the recommenderlab RMSE by name rather than by position, and round it
# to the same two decimals so both columns are shown at the same precision.
rmse_recommenderlab <- round(rmse_ALS[["RMSE"]], 2)

kable(
  cbind(rmse_recommenderlab, rmse_spark),
  col.names = c("recommenderlab", "sparklyr")
) %>%
  kable_styling("striped", position = "right", font_size = 10)

recommenderlab	sparklyr
0.8991867	0.93

2. Processing time

# Convert both elapsed times to seconds explicitly. Subtracting two Sys.time()
# values gives a difftime whose unit (secs, mins, ...) is chosen automatically,
# and cbind() drops that unit, so without this the two columns could silently
# be reported in different units.
kable(
  cbind(
    as.numeric(difftime(end_prv, start_prv, units = "secs")),
    as.numeric(difftime(end_spark, start_spark, units = "secs"))
  ),
  col.names = c("recommenderlab (secs)", "sparklyr (secs)")
) %>%
  kable_styling("striped", position = "right", font_size = 10)

recommenderlab	sparklyr
14.65515	18.32197

Conclusion

For your given recommender system’s data, algorithm(s), and (envisioned) implementation, at what point would you see moving to a distributed platform such as Spark becoming necessary?

Building a recommender system with Spark was a great opportunity to learn something new. In terms of accuracy, the two RMSE values are close, but no improvement was observed when Spark was in use: the Spark RMSE (0.93) was slightly higher than the recommenderlab RMSE (about 0.90). Note that the two models were also fitted and evaluated on different subsets and splits of the data, so this comparison is only approximate. In terms of processing time, Spark took a little longer to run, and there is additional overhead when loading the packages. So, for a dataset of this size, the observed processing time was not improved by using Spark. I think, however, that there should be efficiency advantages when dealing with larger datasets.