This project performs collaborative filtering on the MovieLens dataset of user-item ratings and also analyses the predictions made with Spark ALS.
Collaborative filtering is a method of making automatic predictions (filtering) about the interests of a user by collecting preferences or taste information from many users (collaborating).
In Spark ALS collaborative filtering, users and products are described by a small set of latent factors that can be used to predict the missing entries of the rating matrix.
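For reference, a common way to write the ALS objective for explicit ratings (standard matrix-factorization notation, not taken from the project code) is:

$$
\min_{X,Y} \sum_{(u,i)\ \text{observed}} \left( r_{ui} - x_u^{\top} y_i \right)^2 + \lambda \left( \sum_u \lVert x_u \rVert^2 + \sum_i \lVert y_i \rVert^2 \right)
$$

ALS alternates between solving for the user factors $x_u$ with the item factors $y_i$ held fixed and vice versa; each step is a regularized least-squares problem.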
MovieLens 1M movie ratings is a stable benchmark dataset: 1 million ratings from 6,000 users on 4,000 movies, released 2/2003. The dataset can be downloaded from https://grouplens.org/datasets/movielens/1m/ and was chosen for this analysis.
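The setup chunk with library calls is not shown; a plausible set covering the functions used in this report (the exact list is an assumption) would be:

# Assumed setup chunk: packages covering the calls used throughout the report
library(dplyr)          # %>% pipes and count()
library(ggplot2)        # ggplot()/qplot() charts
library(data.table)     # tstrsplit() for splitting genre strings
library(reshape2)       # dcast() for the user x movie rating matrix
library(knitr)          # kable()
library(kableExtra)     # kable_styling(), row_spec(), scroll_box()
library(recommenderlab) # Recommender(), evaluationScheme(), similarity(), evaluate()
library(sparklyr)       # spark_connect(), ml_als()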
movies <- read.csv("movies.csv",stringsAsFactors=FALSE)
ratings <- read.csv("ratings.csv",stringsAsFactors=FALSE)
## movieId title genres
## Min. : 1 Length:10329 Length:10329
## 1st Qu.: 3240 Class :character Class :character
## Median : 7088 Mode :character Mode :character
## Mean : 31924
## 3rd Qu.: 59900
## Max. :149532
## userId movieId rating timestamp
## Min. : 1.0 Min. : 1 Min. :0.500 Min. :8.286e+08
## 1st Qu.:192.0 1st Qu.: 1073 1st Qu.:3.000 1st Qu.:9.711e+08
## Median :383.0 Median : 2497 Median :3.500 Median :1.115e+09
## Mean :364.9 Mean : 13381 Mean :3.517 Mean :1.130e+09
## 3rd Qu.:557.0 3rd Qu.: 5991 3rd Qu.:4.000 3rd Qu.:1.275e+09
## Max. :668.0 Max. :149532 Max. :5.000 Max. :1.452e+09
## [1] 105339
par(mfrow=c(2,2))
ratings %>%
count(movieId) %>%
ggplot(aes(n)) +
geom_histogram( bins=30, color = "red") +
scale_x_log10() +
ggtitle("Movies") +
labs(subtitle ="number of ratings by movieId",
x="movieId" ,
y="number of ratings",
caption ="source data : ratings set") +
theme(panel.border = element_rect(colour="black", fill=NA))
ratings %>%
count(userId) %>%
ggplot(aes(n)) +
geom_histogram( bins=30, color = "gold") +
scale_x_log10() +
ggtitle("Users") +
labs(subtitle ="number of ratings by UserId",
x="userId" ,
y="number of ratings") +
theme(panel.border = element_rect(colour="black", fill=NA))
kable(head(ratings),caption = "Sample Ratings Dataset") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
row_spec(0, bold = T, color = "white", background = "#ea7872")

| userId | movieId | rating | timestamp |
|---|---|---|---|
| 1 | 16 | 4.0 | 1217897793 |
| 1 | 24 | 1.5 | 1217895807 |
| 1 | 32 | 4.0 | 1217896246 |
| 1 | 47 | 4.0 | 1217896556 |
| 1 | 50 | 4.0 | 1217896523 |
| 1 | 110 | 4.0 | 1217896150 |
kable(head(movies),caption = "Sample Movies Dataset") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
row_spec(0, bold = T, color = "white", background = "#ea7872")

| movieId | title | genres |
|---|---|---|
| 1 | Toy Story (1995) | Adventure|Animation|Children|Comedy|Fantasy |
| 2 | Jumanji (1995) | Adventure|Children|Fantasy |
| 3 | Grumpier Old Men (1995) | Comedy|Romance |
| 4 | Waiting to Exhale (1995) | Comedy|Drama|Romance |
| 5 | Father of the Bride Part II (1995) | Comedy |
| 6 | Heat (1995) | Action|Crime|Thriller |
Pre-processing the data for model building: separating the genres of the movies.
genres <- as.data.frame(movies$genres, stringsAsFactors=FALSE)
genres2 <- as.data.frame(tstrsplit(genres[,1], '[|]',
type.convert=TRUE),
stringsAsFactors=FALSE)
colnames(genres2) <- c(1:10)
genre_list <- c("Action", "Adventure", "Animation", "Children",
"Comedy", "Crime","Documentary", "Drama", "Fantasy",
"Film-Noir", "Horror", "Musical", "Mystery","Romance",
"Sci-Fi", "Thriller", "War", "Western") # we have 18 genres in total
genre_matrix <- matrix(0,10330,18) #empty matrix, 10330=no of movies+1, 18=no of genres
genre_matrix[1,] <- genre_list #set first row to genre list
colnames(genre_matrix) <- genre_list #set column names to genre list
#iterate through matrix
for (i in 1:nrow(genres2)) {
for (c in 1:ncol(genres2)) {
genmat_col = which(genre_matrix[1,] == genres2[i,c])
genre_matrix[i+1,genmat_col] <- 1
}
}
#convert into dataframe
genre_matrix2 <- as.data.frame(genre_matrix[-1,], stringsAsFactors=FALSE) #remove first row, which was the genre list
for (c in 1:ncol(genre_matrix2)) {
genre_matrix2[,c] <- as.integer(genre_matrix2[,c]) #convert from characters to integers
}
search_matrix <- cbind(movies[,1:2], genre_matrix2)
kable(head(search_matrix),caption = "Sample Movies Dataset") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
row_spec(0, bold = T, color = "white", background = "#ea7872") %>%
scroll_box(width = "100%", height = "300px")

| movieId | title | Action | Adventure | Animation | Children | Comedy | Crime | Documentary | Drama | Fantasy | Film-Noir | Horror | Musical | Mystery | Romance | Sci-Fi | Thriller | War | Western |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | Toy Story (1995) | 0 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | Jumanji (1995) | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | Grumpier Old Men (1995) | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 4 | Waiting to Exhale (1995) | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 5 | Father of the Bride Part II (1995) | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 6 | Heat (1995) | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
IBCF and UBCF models are built, and their performance is compared.
Sparse Matrix Conversion
#Create ratings matrix. Rows = userId, Columns = movieId
ratingmat <- dcast(ratings, userId~movieId, value.var = "rating", na.rm=FALSE)
ratingmat <- as.matrix(ratingmat[,-1]) #remove userIds
#Convert rating matrix into a recommenderlab sparse matrix
ratingmat <- as(ratingmat, "realRatingMatrix")
ratingmat
## 668 x 10325 rating matrix of class 'realRatingMatrix' with 105339 ratings.
Identifying the available algorithms and recommendation models
recommender_models <- recommenderRegistry$get_entries(dataType = "realRatingMatrix")
names(recommender_models)
## [1] "HYBRID_realRatingMatrix" "ALS_realRatingMatrix"
## [3] "ALS_implicit_realRatingMatrix" "IBCF_realRatingMatrix"
## [5] "LIBMF_realRatingMatrix" "POPULAR_realRatingMatrix"
## [7] "RANDOM_realRatingMatrix" "RERECOMMEND_realRatingMatrix"
## [9] "SVD_realRatingMatrix" "SVDF_realRatingMatrix"
## [11] "UBCF_realRatingMatrix"
## $HYBRID_realRatingMatrix
## [1] "Hybrid recommender that aggegates several recommendation strategies using weighted averages."
##
## $ALS_realRatingMatrix
## [1] "Recommender for explicit ratings based on latent factors, calculated by alternating least squares algorithm."
##
## $ALS_implicit_realRatingMatrix
## [1] "Recommender for implicit data based on latent factors, calculated by alternating least squares algorithm."
##
## $IBCF_realRatingMatrix
## [1] "Recommender based on item-based collaborative filtering."
##
## $LIBMF_realRatingMatrix
## [1] "Matrix factorization with LIBMF via package recosystem (https://cran.r-project.org/web/packages/recosystem/vignettes/introduction.html)."
##
## $POPULAR_realRatingMatrix
## [1] "Recommender based on item popularity."
##
## $RANDOM_realRatingMatrix
## [1] "Produce random recommendations (real ratings)."
##
## $RERECOMMEND_realRatingMatrix
## [1] "Re-recommends highly rated items (real ratings)."
##
## $SVD_realRatingMatrix
## [1] "Recommender based on SVD approximation with column-mean imputation."
##
## $SVDF_realRatingMatrix
## [1] "Recommender based on Funk SVD with gradient descend (https://sifter.org/~simon/journal/20061211.html)."
##
## $UBCF_realRatingMatrix
## [1] "Recommender based on user-based collaborative filtering."
## $k
## [1] 30
##
## $method
## [1] "Cosine"
##
## $normalize
## [1] "center"
##
## $normalize_sim_matrix
## [1] FALSE
##
## $alpha
## [1] 0.5
##
## $na_as_zero
## [1] FALSE
## $method
## [1] "cosine"
##
## $nn
## [1] 25
##
## $sample
## [1] FALSE
##
## $weighted
## [1] TRUE
##
## $normalize
## [1] "center"
##
## $min_matching_items
## [1] 0
##
## $min_predictive_items
## [1] 0
similarity_users <- similarity(ratingmat[1:4, ],
method = "cosine",
which = "users")
par(mfrow=c(1,2))
as.matrix(similarity_users)
## 1 2 3 4
## 1 0.0000000 0.9760860 0.9641723 0.9914398
## 2 0.9760860 0.0000000 0.9925732 0.9374253
## 3 0.9641723 0.9925732 0.0000000 0.9888968
## 4 0.9914398 0.9374253 0.9888968 0.0000000
image(as.matrix(similarity_users), main = "User similarity")
similarity_items <- similarity(ratingmat[, 1:4], method =
"cosine", which = "items")
as.matrix(similarity_items)
## 1 2 3 4
## 1 0.0000000 0.9669732 0.9559341 0.9101276
## 2 0.9669732 0.0000000 0.9658757 0.9412416
## 3 0.9559341 0.9658757 0.0000000 0.9864877
## 4 0.9101276 0.9412416 0.9864877 0.0000000
vector_ratings <- as.vector(ratingmat@data)
unique(vector_ratings) # what are unique values of ratings
## [1] 0.0 5.0 4.0 3.0 4.5 1.5 2.0 3.5 1.0 2.5 0.5
table_ratings <- table(vector_ratings) # what is the count of each rating value
vector_ratings <- vector_ratings[vector_ratings != 0] # rating == 0 are NA values
vector_ratings <- factor(vector_ratings)
par(mfrow=c(1,2))
qplot(vector_ratings) +
ggtitle("Distribution of the vector ratings")+
theme(plot.title = element_text(hjust = 0.5))
### Data Setup
Filtering the data: keeping only users who have rated at least 50 movies and movies that have been rated by at least 50 users.
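The filtered matrix `ratings_movies` used from here on is not shown being built; a minimal sketch, assuming the 50/50 thresholds described above and the `ratingmat` object created earlier, would be:

# Hypothetical filtering step (thresholds from the text above):
# keep users with at least 50 ratings and movies with at least 50 ratings
ratings_movies <- ratingmat[rowCounts(ratingmat) >= 50,
                            colCounts(ratingmat) >= 50]
ratings_movies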
We will be exploring item-based and user-based collaborative filtering models.
Item-item collaborative filtering (also called item-based or item-to-item) is a form of collaborative filtering for recommender systems based on the similarity between items, calculated from users' ratings of those items.
Defining training/test datasets as below
which_train <- sample(x = c(TRUE, FALSE),
size = nrow(ratings_movies),
replace = TRUE,
prob = c(0.8, 0.2))
recc_data_train <- ratings_movies[which_train, ]
recc_data_test <- ratings_movies[!which_train, ]
recommender_models <- recommenderRegistry$get_entries(dataType ="realRatingMatrix")
recommender_models$IBCF_realRatingMatrix$parameters
## $k
## [1] 30
##
## $method
## [1] "Cosine"
##
## $normalize
## [1] "center"
##
## $normalize_sim_matrix
## [1] FALSE
##
## $alpha
## [1] 0.5
##
## $na_as_zero
## [1] FALSE
recc_model <- Recommender(data = recc_data_train,
method = "IBCF",
parameter = list(k = 30))
recc_model
## Recommender of type 'IBCF' for 'realRatingMatrix'
## learned using 331 users.
## [1] "Recommender"
## attr(,"package")
## [1] "recommenderlab"
## [1] "dgCMatrix"
## attr(,"package")
## [1] "Matrix"
## [1] 447 447
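The `model_details` object used below is never created in the shown code; the outputs above ('dgCMatrix', 447 x 447) suggest it is extracted from the fitted recommender, for example:

# Assumed step: extract the learned item-item similarity matrix from the IBCF model
model_details <- getModel(recc_model)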
n_items_top <- 20
image(model_details$sim[1:n_items_top, 1:n_items_top],
main = "Heatmap of the first rows and columns")## row_sums
## 30
## 447
col_sums <- colSums(model_details$sim > 0)
qplot(col_sums) + stat_bin(binwidth = 1) + ggtitle("Distribution of the column count")
n_recommended <- 10 # the number of items to recommend to each user
recc_predicted <- predict(object = recc_model,
newdata = recc_data_test,
n = n_recommended)
recc_predicted
## Recommendations as 'topNList' with n = 10 for 89 users.
recc_user_1 <- recc_predicted@items[[1]] # recommendation for the first user
movies_user_1 <- recc_predicted@itemLabels[recc_user_1]
movies_user_2 <- movies_user_1
for (i in 1:10){
movies_user_2[i] <- as.character(subset(movies,
movies$movieId == movies_user_1[i])$title)
}
movies_user_2
## [1] "Heat (1995)"
## [2] "Ace Ventura: When Nature Calls (1995)"
## [3] "From Dusk Till Dawn (1996)"
## [4] "Broken Arrow (1996)"
## [5] "Taxi Driver (1976)"
## [6] "Congo (1995)"
## [7] "Desperado (1995)"
## [8] "First Knight (1995)"
## [9] "Net, The (1995)"
## [10] "Species (1995)"
recc_matrix <- sapply(recc_predicted@items,
function(x){ as.integer(colnames(ratings_movies)[x]) }) # matrix with the recommendations for each user
#dim(recc_matrix)
recc_matrix[,1:4]
## [,1] [,2] [,3] [,4]
## [1,] 6 70 1199 2000
## [2,] 19 196 2028 529
## [3,] 70 288 3114 3052
## [4,] 95 300 3175 2918
## [5,] 111 380 4720 4896
## [6,] 160 494 919 4022
## [7,] 163 551 3418 2599
## [8,] 168 595 2011 1517
## [9,] 185 852 1625 1784
## [10,] 196 1094 508 3114
number_of_items <- factor(table(recc_matrix))
chart_title <- "Distribution of the number of items for IBCF"
qplot(number_of_items) + ggtitle(chart_title)
number_of_items_sorted <- sort(number_of_items, decreasing = TRUE)
number_of_items_top <- head(number_of_items_sorted, n = 4)
table_top <- data.frame(as.integer(names(number_of_items_top)),
number_of_items_top)
for (i in 1:4){
table_top[i,1] <- as.character(subset(movies,
movies$movieId == table_top[i,1])$title)
}
colnames(table_top) <- c("Movie title", "No of items")
head(table_top)

IBCF recommends items on the basis of the similarity matrix. Item-item models work well in systems that have more users than items, because they use rating distributions per item rather than per user. With more users than items, each item tends to have more ratings than each user, so an item's average rating usually doesn't change quickly. This leads to more stable rating distributions in the model, so the model doesn't have to be rebuilt as often. When a user consumes and then rates an item, that item's most similar items are picked from the existing model and added to the user's recommendations.
The main idea behind UB-CF is that people with similar characteristics share similar tastes. The method identifies users that are similar to the queried user and estimates the desired rating as the weighted average of the ratings of these similar users.
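In standard notation (not taken from the recommenderlab implementation), the UBCF prediction for user $u$ and item $i$, given a neighbourhood $N(u)$ of similar users, can be written as:

$$
\hat{r}_{ui} = \bar{r}_u + \frac{\sum_{v \in N(u)} \operatorname{sim}(u,v)\,(r_{vi} - \bar{r}_v)}{\sum_{v \in N(u)} \lvert \operatorname{sim}(u,v) \rvert}
$$

where $\bar{r}_u$ is user $u$'s mean rating and $\operatorname{sim}(u,v)$ is, for example, the cosine similarity used above.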
recommender_models <- recommenderRegistry$get_entries(dataType ="realRatingMatrix")
recommender_models$UBCF_realRatingMatrix$parameters
## $method
## [1] "cosine"
##
## $nn
## [1] 25
##
## $sample
## [1] FALSE
##
## $weighted
## [1] TRUE
##
## $normalize
## [1] "center"
##
## $min_matching_items
## [1] 0
##
## $min_predictive_items
## [1] 0
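The fitted UBCF recommender summarised below is not shown being created; a minimal sketch, assuming the same training split used for IBCF, would be:

# Assumed step: fit a user-based CF model on the training split and inspect it
recc_model <- Recommender(data = recc_data_train, method = "UBCF")
recc_model
model_details <- getModel(recc_model)
model_details$data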
## Recommender of type 'UBCF' for 'realRatingMatrix'
## learned using 331 users.
## 331 x 447 rating matrix of class 'realRatingMatrix' with 29688 ratings.
## Normalized using center on rows.
n_recommended <- 10
recc_predicted <- predict(object = recc_model,
newdata = recc_data_test,
n = n_recommended)
recc_predicted
## Recommendations as 'topNList' with n = 10 for 89 users.
recc_matrix <- sapply(recc_predicted@items,
function(x){ as.integer(colnames(ratings_movies)[x]) })
#dim(recc_matrix)
recc_matrix[, 1:4]
## [,1] [,2] [,3] [,4]
## [1,] 49530 1250 1225 1088
## [2,] 2395 208 1230 1584
## [3,] 2324 1276 1234 920
## [4,] 2078 1204 2791 49530
## [5,] 1396 7438 2916 5669
## [6,] 1673 1252 594 750
## [7,] 4034 2268 1307 1380
## [8,] 4995 2640 1258 1968
## [9,] 70 2918 2268 1394
## [10,] 555 368 2640 1213
number_of_items <- factor(table(recc_matrix))
chart_title <- "Distribution of the number of items for UBCF"
qplot(number_of_items) + ggtitle(chart_title)
number_of_items_sorted <- sort(number_of_items, decreasing = TRUE)
number_of_items_top <- head(number_of_items_sorted, n = 4)
table_top <- data.frame(as.integer(names(number_of_items_top)), number_of_items_top)
for (i in 1:4){
table_top[i,1] <- as.character(subset(movies,
movies$movieId == table_top[i,1])$title)
}
colnames(table_top) <- c("Movie title", "No of items")
head(table_top)

UBCF appears to be slightly more accurate than IBCF.
## [1] 8
percentage_training <- 0.8 # training proportion; not defined in the shown code, value assumed from the scheme output below
items_to_keep <- 5 # number of items given per user to generate recommendations
rating_threshold <- 3 # threshold with the minimum rating that is considered good
n_eval <- 1 # number of times to run the evaluation
eval_sets <- evaluationScheme(data = ratings_movies,
method = "split",
train = percentage_training,
given = items_to_keep,
goodRating = rating_threshold,
k = n_eval)
eval_sets
## Evaluation scheme with 5 items given
## Method: 'split' with 1 run(s).
## Training set proportion: 0.800
## Good ratings: >=3.000000
## Data set: 420 x 447 rating matrix of class 'realRatingMatrix' with 38341 ratings.
## 336 x 447 rating matrix of class 'realRatingMatrix' with 30035 ratings.
## 84 x 447 rating matrix of class 'realRatingMatrix' with 420 ratings.
## 84 x 447 rating matrix of class 'realRatingMatrix' with 7886 ratings.
# Bootstrapping the data
#Bootstrapping is another approach to split the data. The same user can be sampled more than once and, if the training set has the same size as it did earlier, there will be more users in the test set.
eval_sets <- evaluationScheme(data = ratings_movies,
method = "bootstrap",
train = percentage_training,
given = items_to_keep,
goodRating = rating_threshold,
k = n_eval)
table_train <- table(eval_sets@runsTrain[[1]])
n_repetitions <- factor(as.vector(table_train))
# Using cross-validation to validate models
#The k-fold cross-validation approach is the most accurate one: we split the data into k chunks, take one chunk out as the test set, and evaluate the accuracy. Then we do the same with each of the other chunks and compute the average accuracy.
n_fold <- 4
eval_sets <- evaluationScheme(data = ratings_movies,
method = "cross-validation",
k = n_fold,
given = items_to_keep,
goodRating = rating_threshold)
## Evaluating the ratings
#k-fold approach has been picked for evaluation
eval_sets <- evaluationScheme(data = ratings_movies,
method = "cross-validation",
k = n_fold,
given = items_to_keep,
goodRating = rating_threshold)
model_to_evaluate <- "IBCF"
model_parameters <- NULL
eval_recommender <- Recommender(data = getData(eval_sets, "train"),
method = model_to_evaluate,
parameter = model_parameters)
items_to_recommend <- 10
eval_prediction <- predict(object = eval_recommender,
newdata = getData(eval_sets, "known"),
n = items_to_recommend,
type = "ratings")
eval_accuracy <- calcPredictionAccuracy(x = eval_prediction,
data = getData(eval_sets, "unknown"),
byUser = TRUE)

Next, the recommendations are compared with the held-out views having a positive rating.
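The `results` object summarised below (and reused later via `getConfusionMatrix(results)`) is not created in the shown code; a plausible call, given the list lengths reported in the confusion matrix, would be:

# Assumed step: evaluate top-N IBCF recommendations for increasing list lengths
results <- evaluate(x = eval_sets,
                    method = model_to_evaluate,
                    n = seq(10, 100, 10))
head(getConfusionMatrix(results)[[1]])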
## IBCF run fold/sample [model time/prediction time]
## 1 [0.22sec/0.02sec]
## 2 [0.22sec/0.02sec]
## 3 [0.21sec/0.01sec]
## 4 [0.2sec/0.03sec]
## TP FP FN TN precision recall TPR
## 10 1.790476 8.019048 79.08571 353.1048 0.1825243 0.02189570 0.02189570
## 20 3.495238 16.057143 77.38095 345.0667 0.1794623 0.04346245 0.04346245
## 30 5.161905 23.828571 75.71429 337.2952 0.1794814 0.06280803 0.06280803
## 40 6.771429 31.352381 74.10476 329.7714 0.1795942 0.08200694 0.08200694
## 50 8.180952 38.447619 72.69524 322.6762 0.1778116 0.09797314 0.09797314
## 60 9.304762 44.704762 71.57143 316.4190 0.1752430 0.11023134 0.11023134
## FPR
## 10 0.02248640
## 20 0.04473487
## 30 0.06644079
## 40 0.08745188
## 50 0.10740354
## 60 0.12504793
columns_to_sum <- c("TP", "FP", "FN", "TN")
indices_summed <- Reduce("+", getConfusionMatrix(results))[, columns_to_sum]
head(indices_summed)
## TP FP FN TN
## 10 7.085714 32.13333 294.8095 1433.971
## 20 13.352381 64.71429 288.5429 1401.390
## 30 19.438095 96.94286 282.4571 1369.162
## 40 25.409524 128.00000 276.4857 1338.105
## 50 30.685714 157.82857 271.2095 1308.276
## 60 35.495238 185.60952 266.4000 1280.495
If a larger percentage of rated movies is recommended, the precision decreases; on the other hand, the higher the percentage of rated movies recommended, the higher the recall.
The following models are considered for comparison:
models_to_evaluate <- list(
IBCF_cos = list(name = "IBCF",
param = list(method = "cosine")),
IBCF_cor = list(name = "IBCF",
param = list(method = "pearson")),
UBCF_cos = list(name = "UBCF",
param = list(method = "cosine")),
UBCF_cor = list(name = "UBCF",
param = list(method = "pearson")),
random = list(name = "RANDOM", param=NULL)
)
n_recommendations <- c(1, 5, seq(10, 100, 10))
list_results <- evaluate(x = eval_sets,
method = models_to_evaluate,
n = n_recommendations)
## IBCF run fold/sample [model time/prediction time]
## 1 [0.19sec/0.03sec]
## 2 [0.22sec/0.03sec]
## 3 [0.22sec/0.01sec]
## 4 [0.2sec/0.03sec]
## IBCF run fold/sample [model time/prediction time]
## 1 [0.24sec/0.03sec]
## 2 [0.27sec/0.03sec]
## 3 [0.24sec/0.03sec]
## 4 [0.24sec/0.03sec]
## UBCF run fold/sample [model time/prediction time]
## 1 [0.02sec/0.17sec]
## 2 [0sec/0.2sec]
## 3 [0sec/0.17sec]
## 4 [0sec/0.2sec]
## UBCF run fold/sample [model time/prediction time]
## 1 Error in neighbors[, x] : incorrect number of dimensions
## RANDOM run fold/sample [model time/prediction time]
## 1 [0sec/0.04sec]
## 2 [0sec/0.03sec]
## 3 [0sec/0.03sec]
## 4 [0sec/0.03sec]
## IBCF_cos IBCF_cor UBCF_cos random
## TRUE TRUE TRUE TRUE
## precision recall TPR FPR
## 1 0.1651492 0.002841408 0.002841408 0.002315147
## 5 0.1753453 0.011946948 0.011946948 0.011168830
## 10 0.1807146 0.025319337 0.025319337 0.022107715
## 20 0.1719541 0.046062531 0.046062531 0.044347231
## 30 0.1689343 0.065892210 0.065892210 0.066344915
## 40 0.1680604 0.084312413 0.084312413 0.087589773
Comparing the charts gives an opportunity to pick the best-performing model; the curves can be plotted as sketched below.
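A minimal sketch of how these comparison charts are typically drawn with recommenderlab (the exact plotting calls are an assumption, since they are not shown in the original code):

# ROC curves (TPR vs FPR) for all evaluated models
plot(list_results, annotate = 1, legend = "topleft")
title("ROC curve")
# Precision-recall curves
plot(list_results, "prec/rec", annotate = 1, legend = "bottomright")
title("Precision-recall")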
The graph shows that UBCF with cosine distance is the best-performing model.
Building an Alternating Least Squares (ALS) model using Spark ML and predicting the ratings.
# connect to spark
config <- spark_config()
config$`sparklyr.shell.driver-memory` <- "8G"
config$`sparklyr.shell.executor-memory` <- "8G"
# The memory limit could not be configured on my low-resource laptop.
sc <- spark_connect(master = "local", config = config)
#Assign the dataset to Spark
spark_df <- ratings
#Building Train and Test dataset
train <- sample(x = c(TRUE, FALSE), size = nrow(spark_df),
replace = TRUE, prob = c(0.8, 0.2))
train_df <- spark_df[train, ]
test_df <- spark_df[!train, ]
#Copy function to Spark
spark_train <- sdf_copy_to(sc, train_df, "train_ratings", overwrite = TRUE)
spark_test <- sdf_copy_to(sc, test_df, "test_ratings", overwrite = TRUE)
sparkALS <- ml_als(spark_train, max_iter = 5, nonnegative = TRUE,
rating_col = "rating", user_col = "userId", item_col = "movieId")
sparkPred <- sparkALS$.jobj %>%
invoke("transform", spark_dataframe(spark_test)) %>% collect()
sparkPred <- sparkPred[!is.na(sparkPred$prediction), ]
#Calculate error
mse_spark <- mean((sparkPred$rating - sparkPred$prediction)^2)
rmse_spark <- sqrt(mse_spark)
mae_spark <- mean(abs(sparkPred$rating - sparkPred$prediction))
# disconnect from spark
spark_disconnect(sc)