library(Matrix)
library(reshape2)
library(data.table)
library(tidyr)
library(dplyr)
library(kableExtra)
library(scales)
library(tidytext)
library(psych)
library(knitr)
library(ggplot2)
library(ggthemes)
library(tictoc)
library(stringr)
suppressWarnings(suppressMessages(library(recommenderlab)))
Using the MovieLens dataset this time.
Per Chapter 3 of the book “Building a Recommendation System with R”.
data(MovieLense)
MovieLense
## 943 x 1664 rating matrix of class 'realRatingMatrix' with 99392 ratings.
ratings <- MovieLense[rowCounts(MovieLense) > 50, colCounts(MovieLense) > 100]
ratings
## 560 x 332 rating matrix of class 'realRatingMatrix' with 55298 ratings.
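As a sanity check on this filtering step, a back-of-the-envelope density calculation from the counts printed above shows why it helps: the retained matrix is almost five times denser than the original.
# Density of the rating matrix before and after filtering,
# computed from the dimensions and rating counts reported above.
density_before <- 99392 / (943 * 1664)  # ~0.063, i.e. ~6.3% of cells rated
density_after  <- 55298 / (560 * 332)   # ~0.297, i.e. ~29.7% of cells rated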
items_to_keep <- 15 # using the book's recommendation
rating_threshold <- 3 # min rating considered good (rating range: 1-5)
set.seed(123)
percentage_training <- 0.8
n_eval <- 1
eval_sets <- evaluationScheme(data = ratings, method = "split",
                              train = percentage_training, given = items_to_keep,
                              goodRating = rating_threshold, k = n_eval)
getData(eval_sets, "train")
## 448 x 332 rating matrix of class 'realRatingMatrix' with 44457 ratings.
getData(eval_sets, "known")
## 112 x 332 rating matrix of class 'realRatingMatrix' with 1680 ratings.
getData(eval_sets, "unknown")
## 112 x 332 rating matrix of class 'realRatingMatrix' with 9161 ratings.
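The split behaves as expected: each of the 112 test users contributes exactly given = 15 ratings to the “known” set (112 x 15 = 1680), with their remaining ratings held out as “unknown”. A quick illustrative check:
# Every test user should have exactly items_to_keep ratings in "known".
all(rowCounts(getData(eval_sets, "known")) == items_to_keep) # expect TRUE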
# Set up a data frame to hold training and prediction timings
timing <- data.frame(Method = character(), Model = factor(),
                     Training = double(), Predicting = double())
model_to_evaluate_ALS <- "ALS" # method name
model_parameters_ALS <- NULL   # NULL falls back to the method's default parameters
# start timing for training
tic()
eval_recommender_ALS <- Recommender(data = getData(eval_sets, "train"),
                                    method = model_to_evaluate_ALS,
                                    parameter = model_parameters_ALS)
t <- toc(quiet = TRUE)
train_time <- round(t$toc - t$tic, 2)
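For reference, parameter = NULL above means recommenderlab's default ALS settings are used. A hedged sketch of what explicit tuning could look like (the parameter names follow recommenderlab's ALS registry entry; the values are illustrative placeholders, not tuned for this data):
# Illustrative only: explicit ALS parameters instead of the defaults.
# n_factors = number of latent factors, n_iterations = ALS sweeps,
# lambda = regularization strength. Values are placeholders, not tuned.
model_parameters_ALS_tuned <- list(n_factors = 10, n_iterations = 10, lambda = 0.1)
eval_recommender_ALS_tuned <- Recommender(data = getData(eval_sets, "train"),
                                          method = "ALS",
                                          parameter = model_parameters_ALS_tuned)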
items_to_recommend <- 10
# start timing for prediction
tic()
eval_prediction_ALS <- predict(object = eval_recommender_ALS,
                               newdata = getData(eval_sets, "known"),
                               n = items_to_recommend, type = "ratings")
t <- toc(quiet = TRUE) # stop the prediction timer; without this, predict_time would silently reuse the training timer
predict_time <- round(t$toc - t$tic, 2)
# Record the timings for the recommenderlab run
timing <- rbind(timing, data.frame(Method = "Regular R: Recommenderlab",
                                   Model = as.factor(model_to_evaluate_ALS),
                                   Training = as.double(train_time),
                                   Predicting = as.double(predict_time)))
# Accuracy metrics (RMSE, MSE, MAE) of the algorithm
eval_accuracy_ALS <- calcPredictionAccuracy(x = eval_prediction_ALS,
                                            data = getData(eval_sets, "unknown"),
                                            byUser = FALSE)
# listing the accuracy metrics
accuracy <- eval_accuracy_ALS
accuracy
## RMSE MSE MAE
## 0.9118787 0.8315227 0.7252335
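As a consistency check, RMSE is by definition the square root of MSE, and the reported numbers agree:
# sqrt(0.8315227) = 0.9118787, matching the RMSE reported above.
sqrt(accuracy["MSE"])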
# Chart of the per-user ALS RMSE
eval_accuracy_ALS <- calcPredictionAccuracy(x = eval_prediction_ALS,
                                            data = getData(eval_sets, "unknown"),
                                            byUser = TRUE)
ggplot(data.frame(RMSE = eval_accuracy_ALS[, "RMSE"]), aes(x = RMSE)) +
  geom_histogram(binwidth = 0.09, fill = "orange") +
  ggtitle("Fig1: Distribution of the ALS's RMSE (80/20 split)") +
  theme_economist()
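To complement Fig 1 numerically, a quick sketch summarizing the per-user RMSE values plotted above:
# Five-number summary (plus mean) of the per-user RMSE distribution in Fig 1.
summary(eval_accuracy_ALS[, "RMSE"])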
Observation 2:
Below, the same exercise is repeated in a Spark environment so the two performances can be compared.
A Spark connection is created, linking the master/local node to the Spark environment, and the sparklyr library is loaded.
spset_start <- proc.time()
library(sparklyr)
# installed local version of Spark
#spark_install()
#SparkR::sparkR.session()
sc <- spark_connect(master = "local")
# Check the version of Spark
spark_version(sc)
## [1] '2.4.3'
movies <- as(ratings, "data.frame")
dim(movies)
## [1] 55298 3
movies <- transform(movies, itemid = as.integer(factor(item)))
colnames(movies) <- c("user", "item", "rating", "itemid")
dim(movies)
## [1] 55298 4
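Since Spark's ALS implementation needs integer IDs, the film titles were mapped to integers via their factor codes. A quick illustrative check that this mapping is one-to-one:
# Each film title should map to exactly one integer itemid.
length(unique(movies$item)) == length(unique(movies$itemid)) # expect TRUE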
# Create a table of only user, rating and itemid
moviename <- movies %>% select(user, rating, itemid) %>% distinct(user, rating, itemid)
# Spark requires integer user and item columns
moviename$user <- as.integer(moviename$user)
moviename$itemid <- as.integer(moviename$itemid)
# rename the itemid column back to "item", which is the name Spark will use
moviesdata <- moviename %>% rename(item = itemid)
# Create the Spark DataFrame
spmovies <- sdf_copy_to(sc, moviesdata, "spmovies", overwrite = TRUE)
src_tbls(sc) # check that the Spark DataFrame was created
## [1] "spmovies"
spset_end <- proc.time()
spset_end - spset_start
## user system elapsed
## 1.08 0.17 15.78
#Spark Dataframes
head(spmovies)
## # Source: spark<?> [?? x 3]
## user rating item
## <int> <dbl> <int>
## 1 1 5 309
## 2 1 3 134
## 3 1 3 128
## 4 1 4 314
## 5 1 1 25
## 6 1 5 82
Partitioning the data set into an 80/20 split ratio
Building the model in Spark
partitions <- spmovies %>% sdf_random_split(training = 0.8, testing = 0.2)
# training and testing data sets
spmovies_training <- partitions$training
spmovies_testing <- partitions$testing
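A quick illustrative check of the resulting split sizes, computed inside Spark:
# Row counts of the two partitions; expect roughly an 80/20 split of the ~55k ratings.
sdf_nrow(spmovies_training)
sdf_nrow(spmovies_testing)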
# Build the model
tic()
model <- ml_als(spmovies_training, max_iter = 5, nonnegative = TRUE,
                rating_col = "rating", user_col = "user", item_col = "item")
t <- toc(quiet = TRUE)
train_time <- round(t$toc - t$tic, 2)
# Run the prediction
tic()
sparkPred <- ml_transform(model, spmovies_testing) %>% collect()
t <- toc(quiet = TRUE) # stop the prediction timer; without this, predict_time would silently reuse the training timer
predict_time <- round(t$toc - t$tic, 2)
# Remove NaN predictions: with a random split, some test users/items never
# appear in training, so ALS cannot score them (the cold-start problem)
sparkPred <- sparkPred[!is.na(sparkPred$prediction), ]
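An alternative to filtering after the fact is to let Spark handle the cold-start rows itself: sparklyr's ml_als() exposes a cold_start_strategy argument. The call below is an illustrative sketch, not what was run above:
# Illustrative: with cold_start_strategy = "drop", Spark removes un-scorable
# (cold-start) rows at prediction time, so no NaN filtering is needed afterwards.
model_drop <- ml_als(spmovies_training, max_iter = 5, nonnegative = TRUE,
                     rating_col = "rating", user_col = "user", item_col = "item",
                     cold_start_strategy = "drop")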
# Model name: ml_als
model_to_evaluate_ALS_SP <- "ml_ALS"
timing <- rbind(timing, data.frame(Method = "Spark",
                                   Model = as.factor(model_to_evaluate_ALS_SP),
                                   Training = as.double(train_time),
                                   Predicting = as.double(predict_time)))
# Calculate errors in Spark; these are the same MSE/RMSE/MAE definitions that
# recommenderlab's calcPredictionAccuracy() uses, so the two runs are directly comparable
mseSpark <- mean((sparkPred$rating - sparkPred$prediction)^2)
rmseSpark <- sqrt(mseSpark)
maeSpark <- mean(abs(sparkPred$rating - sparkPred$prediction))
# Disconnect Spark
spark_disconnect(sc)
accuracy <- rbind(accuracy, data.frame(RMSE = rmseSpark, MSE = mseSpark, MAE = maeSpark))
rownames(accuracy) <- c("Recommenderlab ALS", "Spark ALS")
knitr::kable(accuracy, format = "html") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover"))
| | RMSE | MSE | MAE |
|---|---|---|---|
| Recommenderlab ALS | 0.9118787 | 0.8315227 | 0.7252335 |
| Spark ALS | 0.8999893 | 0.8099807 | 0.7239071 |
knitr::kable(timing, format = "html", row.names = FALSE) %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover"))
| Method | Model | Training | Predicting |
|---|---|---|---|
| Regular R: Recommenderlab | ALS | 0.01 | 0.01 |
| Spark | ml_ALS | 4.17 | 4.17 |
Summary of Findings:
Overall, the accuracy metrics were slightly better in Spark than in recommenderlab. However, the training and prediction times were worse than in regular R, which was unexpected: with its distributed computing power, Spark would be expected to outperform a single-process R run. One plausible explanation is overhead: on a single local node, Spark's JVM startup, task scheduling, and serialization costs can dominate the actual computation for a dataset this small (~55k ratings). As noted, the Spark run was repeated to confirm the first result was not a fluke. Further testing and validation is needed to understand this anomaly.
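One way to probe the timing anomaly further is to average several timed runs instead of relying on a single tic()/toc() pair. A minimal sketch, assuming the objects above are still in scope:
# Hypothetical follow-up: average five training runs to smooth out one-off
# overheads (the local Spark run, e.g. JVM warm-up, would need the same treatment).
times <- replicate(5, {
  tic()
  Recommender(data = getData(eval_sets, "train"), method = "ALS")
  t <- toc(quiet = TRUE)
  t$toc - t$tic
})
mean(times) # mean training time in seconds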