library(Matrix)
library(reshape2)
library(data.table)
library(tidyr)
library(dplyr)
library(kableExtra)
library(scales)
library(tidytext)
library(psych)
library(knitr)
library(ggplot2)
library(ggthemes)
library(tictoc)
library(stringr)
suppressWarnings(suppressMessages(library(recommenderlab)))
Using the MovieLens dataset this time.
Per Chapter 3 of the book “Building a Recommendation System with R”.
data(MovieLense)
MovieLense
## 943 x 1664 rating matrix of class 'realRatingMatrix' with 99392 ratings.
ratings <- MovieLense[rowCounts(MovieLense) > 50, colCounts(MovieLense) > 100]
ratings
## 560 x 332 rating matrix of class 'realRatingMatrix' with 55298 ratings.
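As a sanity check on this filtering step, a back-of-the-envelope density calculation from the counts printed above shows why it helps: the retained matrix is almost five times denser than the original.
# Density of the rating matrix before and after filtering,
# computed from the dimensions and rating counts reported above.
density_before <- 99392 / (943 * 1664)  # ~0.063, i.e. ~6.3% of cells rated
density_after  <- 55298 / (560 * 332)   # ~0.297, i.e. ~29.7% of cells rated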
items_to_keep <- 15 # using the book's recommendation
rating_threshold <- 3 # min rating considered good (rating range: 1-5)
set.seed(123)
percentage_training <- 0.8
n_eval <- 1
eval_sets <- evaluationScheme(data = ratings, method = "split",
                              train = percentage_training, given = items_to_keep,
                              goodRating = rating_threshold, k = n_eval)
getData(eval_sets, "train")
## 448 x 332 rating matrix of class 'realRatingMatrix' with 44457 ratings.
getData(eval_sets, "known")
## 112 x 332 rating matrix of class 'realRatingMatrix' with 1680 ratings.
getData(eval_sets, "unknown")
## 112 x 332 rating matrix of class 'realRatingMatrix' with 9161 ratings.
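The split behaves as expected: each of the 112 test users contributes exactly given = 15 ratings to the “known” set (112 x 15 = 1680), with their remaining ratings held out as “unknown”. A quick illustrative check:
# Every test user should have exactly items_to_keep ratings in "known".
all(rowCounts(getData(eval_sets, "known")) == items_to_keep) # expect TRUE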
# Set up a data frame to hold training and prediction timings
timing <- data.frame(Method = character(), Model = factor(),
                     Training = double(), Predicting = double())
model_to_evaluate_ALS <- "ALS" # method name
model_parameters_ALS <- NULL   # NULL falls back to the method's default parameters
# start timing for training
tic()
eval_recommender_ALS <- Recommender(data = getData(eval_sets, "train"),
                                    method = model_to_evaluate_ALS,
                                    parameter = model_parameters_ALS)
t <- toc(quiet = TRUE)
train_time <- round(t$toc - t$tic, 2)
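For reference, parameter = NULL above means recommenderlab's default ALS settings are used. A hedged sketch of what explicit tuning could look like (the parameter names follow recommenderlab's ALS registry entry; the values are illustrative placeholders, not tuned for this data):
# Illustrative only: explicit ALS parameters instead of the defaults.
# n_factors = number of latent factors, n_iterations = ALS sweeps,
# lambda = regularization strength. Values are placeholders, not tuned.
model_parameters_ALS_tuned <- list(n_factors = 10, n_iterations = 10, lambda = 0.1)
eval_recommender_ALS_tuned <- Recommender(data = getData(eval_sets, "train"),
                                          method = "ALS",
                                          parameter = model_parameters_ALS_tuned)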
items_to_recommend <- 10
# start timing for prediction
tic()
eval_prediction_ALS <- predict(object = eval_recommender_ALS,
                               newdata = getData(eval_sets, "known"),
                               n = items_to_recommend, type = "ratings")
t <- toc(quiet = TRUE) # stop the prediction timer; without this, predict_time would silently reuse the training timer
predict_time <- round(t$toc - t$tic, 2)
# Record the timings for the recommenderlab run
timing <- rbind(timing, data.frame(Method = "Regular R: Recommenderlab",
                                   Model = as.factor(model_to_evaluate_ALS),
                                   Training = as.double(train_time),
                                   Predicting = as.double(predict_time)))
# Accuracy metrics (RMSE, MSE, MAE) of the algorithm
eval_accuracy_ALS <- calcPredictionAccuracy(x = eval_prediction_ALS,
                                            data = getData(eval_sets, "unknown"),
                                            byUser = FALSE)
# listing the accuracy metrics
accuracy <- eval_accuracy_ALS
accuracy
## RMSE MSE MAE
## 0.9118787 0.8315227 0.7252335
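As a consistency check, RMSE is by definition the square root of MSE, and the reported numbers agree:
# sqrt(0.8315227) = 0.9118787, matching the RMSE reported above.
sqrt(accuracy["MSE"])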
# Chart of the per-user ALS RMSE
eval_accuracy_ALS <- calcPredictionAccuracy(x = eval_prediction_ALS,
                                            data = getData(eval_sets, "unknown"),
                                            byUser = TRUE)
ggplot(data.frame(RMSE = eval_accuracy_ALS[, "RMSE"]), aes(x = RMSE)) +
  geom_histogram(binwidth = 0.09, fill = "orange") +
  ggtitle("Fig1: Distribution of the ALS's RMSE (80/20 split)") +
  theme_economist()
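To complement Fig 1 numerically, a quick sketch summarizing the per-user RMSE values plotted above:
# Five-number summary (plus mean) of the per-user RMSE distribution in Fig 1.
summary(eval_accuracy_ALS[, "RMSE"])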
Observation 2:
Below, the same exercise is repeated in a Spark environment so the two performances can be compared.
A Spark connection is created, linking the master/local node to the Spark environment, and the sparklyr library is loaded.
spset_start <- proc.time()
library(sparklyr)
# installed local version of Spark
#spark_install()
#SparkR::sparkR.session()
sc <- spark_connect(master = "local")
# Check the version of Spark
spark_version(sc)
## [1] '2.4.3'
movies <- as(ratings, "data.frame")
dim(movies)
## [1] 55298 3
movies <- transform(movies, itemid = as.integer(factor(item)))
colnames(movies) <- c("user", "item", "rating", "itemid")
dim(movies)
## [1] 55298 4
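Since Spark's ALS implementation needs integer IDs, the film titles were mapped to integers via their factor codes. A quick illustrative check that this mapping is one-to-one:
# Each film title should map to exactly one integer itemid.
length(unique(movies$item)) == length(unique(movies$itemid)) # expect TRUE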
# Create a table of only user, rating and itemid
moviename <- movies %>% select(user, rating, itemid) %>% distinct(user, rating, itemid)
# Spark requires integer user and item columns
moviename$user <- as.integer(moviename$user)
moviename$itemid <- as.integer(moviename$itemid)
# rename the itemid column back to "item", which is the name Spark will use
moviesdata <- moviename %>% rename(item = itemid)
# Create the Spark DataFrame
spmovies <- sdf_copy_to(sc, moviesdata, "spmovies", overwrite = TRUE)
src_tbls(sc) # check that the Spark DataFrame was created
## [1] "spmovies"
spset_end <- proc.time()
spset_end - spset_start
## user system elapsed
## 1.08 0.17 15.78
#Spark Dataframes
head(spmovies)
## # Source: spark<?> [?? x 3]
## user rating item
## <int> <dbl> <int>
## 1 1 5 309
## 2 1 3 134
## 3 1 3 128
## 4 1 4 314
## 5 1 1 25
## 6 1 5 82
Partitioning the data set into an 80/20 split ratio
Building the model in Spark
partitions <- spmovies %>% sdf_random_split(training = 0.8, testing = 0.2)
# training and testing data sets
spmovies_training <- partitions$training
spmovies_testing <- partitions$testing
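A quick illustrative check of the resulting split sizes, computed inside Spark:
# Row counts of the two partitions; expect roughly an 80/20 split of the ~55k ratings.
sdf_nrow(spmovies_training)
sdf_nrow(spmovies_testing)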
# Build the model
tic()
model <- ml_als(spmovies_training, max_iter = 5, nonnegative = TRUE,
                rating_col = "rating", user_col = "user", item_col = "item")
t <- toc(quiet = TRUE)
train_time <- round(t$toc - t$tic, 2)
# Run the prediction
tic()
sparkPred <- ml_transform(model, spmovies_testing) %>% collect()
t <- toc(quiet = TRUE) # stop the prediction timer; without this, predict_time would silently reuse the training timer
predict_time <- round(t$toc - t$tic, 2)
# Remove NaN predictions: with a random split, some test users/items never
# appear in training, so ALS cannot score them (the cold-start problem)
sparkPred <- sparkPred[!is.na(sparkPred$prediction), ]
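An alternative to filtering after the fact is to let Spark handle the cold-start rows itself: sparklyr's ml_als() exposes a cold_start_strategy argument. The call below is an illustrative sketch, not what was run above:
# Illustrative: with cold_start_strategy = "drop", Spark removes un-scorable
# (cold-start) rows at prediction time, so no NaN filtering is needed afterwards.
model_drop <- ml_als(spmovies_training, max_iter = 5, nonnegative = TRUE,
                     rating_col = "rating", user_col = "user", item_col = "item",
                     cold_start_strategy = "drop")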
# Model name: ml_als
model_to_evaluate_ALS_SP <- "ml_ALS"
timing <- rbind(timing, data.frame(Method = "Spark",
                                   Model = as.factor(model_to_evaluate_ALS_SP),
                                   Training = as.double(train_time),
                                   Predicting = as.double(predict_time)))
# Calculate errors in Spark; these are the same MSE/RMSE/MAE definitions that
# recommenderlab's calcPredictionAccuracy() uses, so the two runs are directly comparable
mseSpark <- mean((sparkPred$rating - sparkPred$prediction)^2)
rmseSpark <- sqrt(mseSpark)
maeSpark <- mean(abs(sparkPred$rating - sparkPred$prediction))
# Disconnect Spark
spark_disconnect(sc)
accuracy <- rbind(accuracy, data.frame(RMSE = rmseSpark, MSE = mseSpark, MAE = maeSpark))
rownames(accuracy) <- c("Recommenderlab ALS", "Spark ALS")
knitr::kable(accuracy, format = "html") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover"))
| | RMSE | MSE | MAE |
|---|---|---|---|
| Recommenderlab ALS | 0.9118787 | 0.8315227 | 0.7252335 |
| Spark ALS | 0.8999893 | 0.8099807 | 0.7239071 |
knitr::kable(timing, format = "html", row.names = FALSE) %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover"))
| Method | Model | Training | Predicting |
|---|---|---|---|
| Regular R: Recommenderlab | ALS | 0.01 | 0.01 |
| Spark | ml_ALS | 4.17 | 4.17 |
Summary of Findings:
Overall, the accuracy metrics were slightly better in Spark than in recommenderlab. However, the training and prediction times were worse than in regular R, which was unexpected: with its distributed computing power, Spark would be expected to outperform a single-process R run. One plausible explanation is overhead: on a single local node, Spark's JVM startup, task scheduling, and serialization costs can dominate the actual computation for a dataset this small (~55k ratings). As noted, the Spark run was repeated to confirm the first result was not a fluke. Further testing and validation is needed to understand this anomaly.
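One way to probe the timing anomaly further is to average several timed runs instead of relying on a single tic()/toc() pair. A minimal sketch, assuming the objects above are still in scope:
# Hypothetical follow-up: average five training runs to smooth out one-off
# overheads (the local Spark run, e.g. JVM warm-up, would need the same treatment).
times <- replicate(5, {
  tic()
  Recommender(data = getData(eval_sets, "train"), method = "ALS")
  t <- toc(quiet = TRUE)
  t$toc - t$tic
})
mean(times) # mean training time in seconds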