if (!require("knitr")) install.packages("knitr")
if (!require("tidyverse")) install.packages("tidyverse")
if (!require("kableExtra")) install.packages("kableExtra")
if (!require("dplyr")) install.packages("dplyr")
if (!require("ggrepel")) install.packages("ggrepel")
if (!require("recommenderlab")) install.packages("recommenderlab")
if (!require("tictoc")) install.packages("tictoc")
if (!require("sparklyr")) install.packages("sparklyr")
I used the MovieLens small dataset: 100,000 ratings and 3,600 tag applications applied to 9,000 movies by 600 users.
| userId | movieId | rating | timestamp |
|---|---|---|---|
| 1 | 1 | 4 | 964982703 |
| 1 | 3 | 4 | 964981247 |
| 1 | 6 | 4 | 964982224 |
| 1 | 47 | 5 | 964983815 |
| 1 | 50 | 5 | 964982931 |
| 1 | 70 | 3 | 964982400 |
I used realRatingMatrix from ‘recommenderlab’ to transform data.
# Store user ids as a factor before building the rating matrix.
ratings_data[["userId"]] <- as.factor(ratings_data[["userId"]])

# Coerce the ratings data frame into a recommenderlab realRatingMatrix.
UI <- as(ratings_data, "realRatingMatrix")

# Dimensions of the matrix held in the `data` slot
# (presumably users x movies -- confirm).
dim(slot(UI, "data"))
## [1] 610 9724
Split the dataset into a training set (80%) and a testing set (20%).
recommenderlab

I used realRatingMatrix from ‘recommenderlab’ to transform data.
#dt1 <- data.table(ratings_data, key = "movieId")
#dt2 <- data.table(movie_data, key = "movieId")
#ratings <- dt1[dt2]
# ---- recommenderlab: training ----
# NOTE(review): `train`, `known` and `unknown` are not defined in the visible
# part of this file; presumably they come from the train/test split -- confirm.
tic("T-R")
Recommenderlab_CRS_Model <- Recommender(train, method = "ALS")
t <- toc(quiet = TRUE)
# Elapsed time between tic() and toc(), rounded to 3 decimals.
train_time <- round(t[["toc"]] - t[["tic"]], digits = 3)

# ---- recommenderlab: predicting ----
tic("P-R")
Recommenderlab_CRS_Predict <- predict(
  Recommenderlab_CRS_Model,
  newdata = known,
  type = "ratings"
)
t <- toc(quiet = TRUE)
predict_time <- round(t[["toc"]] - t[["tic"]], digits = 3)

# Compare the predicted ratings with `unknown` and show the result.
recclab_error <- calcPredictionAccuracy(Recommenderlab_CRS_Predict, unknown)
recclab_error
## RMSE MSE MAE
## 0.9266040 0.8585949 0.7170423
# Fix the RNG seed so the random train/test split below is reproducible.
set.seed(10)

# Connect to Spark locally.
sc <- spark_connect(master = "local")

# Spark data processing: draw one TRUE/FALSE flag per row (TRUE with
# probability 0.8) marking the rows that go to the training set.
# NOTE(review): `spark_data` is not defined in the visible part of this file;
# presumably it is the ratings data frame -- confirm.
training_records <- sample(
  x = c(TRUE, FALSE),
  size = nrow(spark_data),
  replace = TRUE,
  prob = c(0.8, 0.2)
)
training <- spark_data[training_records, ]
# `training_records` is a logical mask, so its complement must be taken with
# `!`. Unary minus would coerce it to the integers -1/0 and only drop row 1,
# leaving almost every training row in the test set as well.
testing <- spark_data[!training_records, ]

# Move both data frames to Spark.
spark_training <- sdf_copy_to(sc, training, "train_ratings", overwrite = TRUE)
spark_testing <- sdf_copy_to(sc, testing, "test_ratings", overwrite = TRUE)

# Start the timer for Spark model training.
tic("T-S")
# Fit the ALS model on the Spark training table, then stop the training timer.
sdf_als_model <- ml_als(
  spark_training,
  rating_col = "rating",
  user_col = "userId",
  item_col = "movieId",
  max_iter = 5,
  nonnegative = TRUE
)
ts <- toc(quiet = TRUE)
# Elapsed training time, rounded to 3 decimals.
trainingTime_spark <- round(ts[["toc"]] - ts[["tic"]], digits = 3)

# Score the test table with the fitted model and pull the result into R;
# the timer covers both the transform and the collect.
tic("P-S")
prediction <- collect(ml_transform(sdf_als_model, spark_testing))
ps <- toc(quiet = TRUE)
predictionTime_spark <- round(ps[["toc"]] - ps[["tic"]], digits = 3)
# Error metrics for the Spark predictions against the observed ratings.
# The `spark_mse` assignment used to sit on the same line as a comment, so it
# never ran and `spark_rmse` failed on an undefined variable.
# Missing predictions are skipped (na.rm = TRUE); presumably these can occur
# for users or movies absent from the training table -- confirm.
spark_residuals <- prediction$rating - prediction$prediction
spark_mse <- mean(spark_residuals^2, na.rm = TRUE)
spark_rmse <- sqrt(spark_mse)
spark_mae <- mean(abs(spark_residuals), na.rm = TRUE)

# Same names and order as `recclab_error` (RMSE, MSE, MAE) so that the two
# vectors can be stacked with rbind().
sparklyr_error <- c(
  "RMSE" = spark_rmse,
  "MSE" = spark_mse,
  "MAE" = spark_mae
)
# Stack the accuracy metrics of both systems into one table, one row each.
comparison <- rbind(recclab_error, sparklyr_error)
rownames(comparison) <- c("Centralized System", "Distributed System")
kable(comparison) %>% kable_styling("striped", full_width = FALSE)
# Rendered output recorded in the report (first column holds the row names):
## | | RMSE | MSE | MAE |
## |---|---|---|---|
## | Centralized System | 0.9266040 | 0.8585949 | 0.7170423 |
## | Distributed System | 0.6656635 | 0.4431079 | 0.4978245 |
# Build a two-row table of timings as returned by tictoc: one row per system,
# columns for training time and prediction time.
comparison1 <- cbind(train_time, predict_time)
comparison2 <- cbind(trainingTime_spark, predictionTime_spark)
comparison_time <- rbind(comparison1, comparison2)
rownames(comparison_time) <- c("Centralized System", "Distributed System")
# Use one set of column names for both rows; the Spark variables are named
# differently from the recommenderlab ones.
colnames(comparison_time) <- c("train_time", "predict_time")
kable(comparison_time) %>% kable_styling("striped", full_width = FALSE)
# Rendered output recorded in the report (first column holds the row names):
## | | train_time | predict_time |
## |---|---|---|
## | Centralized System | 0.02 | 225.89 |
## | Distributed System | 5.63 | 3.19 |