YouTube Presentation Link: https://www.youtube.com/watch?v=-8GULl1pLFQ

Introduction

This project performs collaborative filtering analysis on the MovieLens dataset of user-item ratings and also analyses the predictions produced by Spark ALS.

Collaborative filtering is a method of making automatic predictions (filtering) about the interests of a user by collecting preferences or taste information from many users (collaborating).

In Spark ALS collaborative filtering, users and products are described by a small set of latent factors that can be used to predict missing entries.

Dataset

MovieLens 1M movie ratings: a stable benchmark dataset of 1 million ratings from 6,000 users on 4,000 movies, released 2/2003. The dataset can be downloaded from https://grouplens.org/datasets/movielens/1m/. This dataset was chosen for the analysis; note that the CSV files actually loaded below contain 105,339 ratings from 668 users on 10,329 movies (a smaller MovieLens extract), as the data exploration confirms.
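
The analysis assumes a setup chunk that attaches the packages used throughout the report; a minimal sketch (the original does not show its setup chunk):

library(dplyr)          # pipes, count(), n_distinct()
library(ggplot2)        # histograms, qplot()
library(data.table)     # tstrsplit()
library(reshape2)       # dcast()
library(knitr)          # kable()
library(kableExtra)     # kable_styling(), row_spec(), scroll_box()
library(recommenderlab) # Recommender(), similarity(), evaluationScheme()
library(sparklyr)       # spark_connect(), ml_als()
library(DT)             # datatable()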

movies <- read.csv("movies.csv",stringsAsFactors=FALSE)
ratings <- read.csv("ratings.csv",stringsAsFactors=FALSE)

Data Exploration

summary(movies)
##     movieId          title              genres         
##  Min.   :     1   Length:10329       Length:10329      
##  1st Qu.:  3240   Class :character   Class :character  
##  Median :  7088   Mode  :character   Mode  :character  
##  Mean   : 31924                                        
##  3rd Qu.: 59900                                        
##  Max.   :149532
summary(ratings)
##      userId         movieId           rating        timestamp        
##  Min.   :  1.0   Min.   :     1   Min.   :0.500   Min.   :8.286e+08  
##  1st Qu.:192.0   1st Qu.:  1073   1st Qu.:3.000   1st Qu.:9.711e+08  
##  Median :383.0   Median :  2497   Median :3.500   Median :1.115e+09  
##  Mean   :364.9   Mean   : 13381   Mean   :3.517   Mean   :1.130e+09  
##  3rd Qu.:557.0   3rd Qu.:  5991   3rd Qu.:4.000   3rd Qu.:1.275e+09  
##  Max.   :668.0   Max.   :149532   Max.   :5.000   Max.   :1.452e+09
nrow(ratings)
## [1] 105339
ratings %>% summarize(n_users = n_distinct(userId),n_movies = n_distinct(movieId))
# note: par(mfrow = c(2, 2)) would have no effect here, since ggplot2 (grid-based) ignores par();
# the two histograms below therefore render as separate figures
ratings %>% 
  count(movieId) %>% 
  ggplot(aes(n)) + 
  geom_histogram(bins = 30, color = "red") +
  scale_x_log10() + 
  ggtitle("Movies") +
  labs(subtitle = "number of ratings by movieId", 
       x = "number of ratings (log10)", 
       y = "number of movies", 
       caption = "source data: ratings set") +
  theme(panel.border = element_rect(colour = "black", fill = NA)) 

ratings %>% 
  count(userId) %>% 
  ggplot(aes(n)) + 
  geom_histogram(bins = 30, color = "gold") +
  scale_x_log10() + 
  ggtitle("Users") +
  labs(subtitle = "number of ratings by userId", 
       x = "number of ratings (log10)", 
       y = "number of users") +
  theme(panel.border = element_rect(colour = "black", fill = NA)) 

kable(head(ratings),caption = "Sample Ratings Dataset") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
  row_spec(0, bold = T, color = "white", background = "#ea7872") 
Sample Ratings Dataset
userId movieId rating timestamp
1 16 4.0 1217897793
1 24 1.5 1217895807
1 32 4.0 1217896246
1 47 4.0 1217896556
1 50 4.0 1217896523
1 110 4.0 1217896150
kable(head(movies),caption = "Sample Movies Dataset") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
  row_spec(0, bold = T, color = "white", background = "#ea7872")
Sample Movies Dataset
movieId title genres
1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy
2 Jumanji (1995) Adventure|Children|Fantasy
3 Grumpier Old Men (1995) Comedy|Romance
4 Waiting to Exhale (1995) Comedy|Drama|Romance
5 Father of the Bride Part II (1995) Comedy
6 Heat (1995) Action|Crime|Thriller

Data Wrangling

Pre-processing the data for model building: separating the movie genres into one indicator column per genre.

genres <- as.data.frame(movies$genres, stringsAsFactors=FALSE)
genres2 <- as.data.frame(tstrsplit(genres[,1], '[|]', 
                                   type.convert=TRUE), 
                         stringsAsFactors=FALSE)
colnames(genres2) <- c(1:10)
genre_list <- c("Action", "Adventure", "Animation", "Children", 
                "Comedy", "Crime","Documentary", "Drama", "Fantasy",
                "Film-Noir", "Horror", "Musical", "Mystery","Romance",
                "Sci-Fi", "Thriller", "War", "Western") # we have 18 genres in total
genre_matrix <- matrix(0, nrow(movies) + 1, length(genre_list)) #empty matrix: one header row plus one row per movie
genre_matrix[1,] <- genre_list #set first row to genre list
colnames(genre_matrix) <- genre_list #set column names to genre list
#iterate through matrix
for (i in 1:nrow(genres2)) {
  for (c in 1:ncol(genres2)) {
    genmat_col = which(genre_matrix[1,] == genres2[i,c])
    genre_matrix[i+1,genmat_col] <- 1
  }
}
#convert into dataframe
genre_matrix2 <- as.data.frame(genre_matrix[-1,], stringsAsFactors=FALSE) #remove first row, which was the genre list
for (c in 1:ncol(genre_matrix2)) {
  genre_matrix2[,c] <- as.integer(genre_matrix2[,c])  #convert from characters to integers
} 
search_matrix <- cbind(movies[,1:2], genre_matrix2)

kable(head(search_matrix),caption = "Sample Movies Dataset") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
  row_spec(0, bold = T, color = "white", background = "#ea7872") %>%
    scroll_box(width = "100%", height = "300px")
Sample Movies Dataset
movieId title Action Adventure Animation Children Comedy Crime Documentary Drama Fantasy Film-Noir Horror Musical Mystery Romance Sci-Fi Thriller War Western
1 Toy Story (1995) 0 1 1 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0
2 Jumanji (1995) 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0
3 Grumpier Old Men (1995) 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0
4 Waiting to Exhale (1995) 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0
5 Father of the Bride Part II (1995) 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
6 Heat (1995) 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0
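
For reference, the same one-hot encoding can be written more compactly with tidyr; a sketch assuming tidyr (>= 1.0) is also attached. Unlike the fixed 18-genre list above, this keeps every genre string that occurs in the data (e.g. "IMAX" or "(no genres listed)" if present):

library(tidyr)  # separate_rows(), pivot_wider()
genre_onehot <- movies %>%
  separate_rows(genres, sep = "\\|") %>%   # one row per movie-genre pair
  mutate(value = 1L) %>%
  pivot_wider(names_from = genres, values_from = value, values_fill = 0L)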

Model Exploration

IBCF and UBCF models are built and compared for performance.

Sparse Matrix Conversion

#Create ratings matrix. Rows = userId, Columns = movieId
ratingmat <- dcast(ratings, userId~movieId, value.var = "rating", na.rm=FALSE)
ratingmat <- as.matrix(ratingmat[,-1]) #remove userIds
#Convert rating matrix into a recommenderlab sparse matrix
ratingmat <- as(ratingmat, "realRatingMatrix")
ratingmat
## 668 x 10325 rating matrix of class 'realRatingMatrix' with 105339 ratings.
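
A quick sparsity check (a sketch using recommenderlab's rowCounts()) shows why the sparse representation matters:

total_ratings <- sum(rowCounts(ratingmat))                          # 105339
sparsity <- 1 - total_ratings / (nrow(ratingmat) * ncol(ratingmat))
sparsity  # ~0.985: about 98.5% of the user-movie cells are empty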

Identifying the algorithms and recommendation models available for realRatingMatrix data

recommender_models <- recommenderRegistry$get_entries(dataType = "realRatingMatrix")
names(recommender_models)
##  [1] "HYBRID_realRatingMatrix"       "ALS_realRatingMatrix"         
##  [3] "ALS_implicit_realRatingMatrix" "IBCF_realRatingMatrix"        
##  [5] "LIBMF_realRatingMatrix"        "POPULAR_realRatingMatrix"     
##  [7] "RANDOM_realRatingMatrix"       "RERECOMMEND_realRatingMatrix" 
##  [9] "SVD_realRatingMatrix"          "SVDF_realRatingMatrix"        
## [11] "UBCF_realRatingMatrix"
lapply(recommender_models, "[[", "description")
## $HYBRID_realRatingMatrix
## [1] "Hybrid recommender that aggegates several recommendation strategies using weighted averages."
## 
## $ALS_realRatingMatrix
## [1] "Recommender for explicit ratings based on latent factors, calculated by alternating least squares algorithm."
## 
## $ALS_implicit_realRatingMatrix
## [1] "Recommender for implicit data based on latent factors, calculated by alternating least squares algorithm."
## 
## $IBCF_realRatingMatrix
## [1] "Recommender based on item-based collaborative filtering."
## 
## $LIBMF_realRatingMatrix
## [1] "Matrix factorization with LIBMF via package recosystem (https://cran.r-project.org/web/packages/recosystem/vignettes/introduction.html)."
## 
## $POPULAR_realRatingMatrix
## [1] "Recommender based on item popularity."
## 
## $RANDOM_realRatingMatrix
## [1] "Produce random recommendations (real ratings)."
## 
## $RERECOMMEND_realRatingMatrix
## [1] "Re-recommends highly rated items (real ratings)."
## 
## $SVD_realRatingMatrix
## [1] "Recommender based on SVD approximation with column-mean imputation."
## 
## $SVDF_realRatingMatrix
## [1] "Recommender based on Funk SVD with gradient descend (https://sifter.org/~simon/journal/20061211.html)."
## 
## $UBCF_realRatingMatrix
## [1] "Recommender based on user-based collaborative filtering."
recommender_models$IBCF_realRatingMatrix$parameters
## $k
## [1] 30
## 
## $method
## [1] "Cosine"
## 
## $normalize
## [1] "center"
## 
## $normalize_sim_matrix
## [1] FALSE
## 
## $alpha
## [1] 0.5
## 
## $na_as_zero
## [1] FALSE
recommender_models$UBCF_realRatingMatrix$parameters
## $method
## [1] "cosine"
## 
## $nn
## [1] 25
## 
## $sample
## [1] FALSE
## 
## $weighted
## [1] TRUE
## 
## $normalize
## [1] "center"
## 
## $min_matching_items
## [1] 0
## 
## $min_predictive_items
## [1] 0
similarity_users <- similarity(ratingmat[1:4, ], 
                               method = "cosine", 
                               which = "users")
par(mfrow=c(1,2))
as.matrix(similarity_users)
##           1         2         3         4
## 1 0.0000000 0.9760860 0.9641723 0.9914398
## 2 0.9760860 0.0000000 0.9925732 0.9374253
## 3 0.9641723 0.9925732 0.0000000 0.9888968
## 4 0.9914398 0.9374253 0.9888968 0.0000000
image(as.matrix(similarity_users), main = "User similarity")

similarity_items <- similarity(ratingmat[, 1:4], method =
                                 "cosine", which = "items")
as.matrix(similarity_items)
##           1         2         3         4
## 1 0.0000000 0.9669732 0.9559341 0.9101276
## 2 0.9669732 0.0000000 0.9658757 0.9412416
## 3 0.9559341 0.9658757 0.0000000 0.9864877
## 4 0.9101276 0.9412416 0.9864877 0.0000000
image(as.matrix(similarity_items), main = "Movies similarity")

Data Distribution

vector_ratings <- as.vector(ratingmat@data)
unique(vector_ratings) # what are unique values of ratings
##  [1] 0.0 5.0 4.0 3.0 4.5 1.5 2.0 3.5 1.0 2.5 0.5
table_ratings <- table(vector_ratings) # what is the count of each rating value
vector_ratings <- vector_ratings[vector_ratings != 0] # rating == 0 are NA values
vector_ratings <- factor(vector_ratings)
par(mfrow=c(1,2)) # affects only the base-graphics image() below; the ggplot-based qplot() ignores par()
qplot(vector_ratings) + 
  ggtitle("Distribution of the vector ratings")+
  theme(plot.title = element_text(hjust = 0.5))

image(ratingmat[1:20, 1:25], main = "Heatmap of the first 20 rows and 25 columns")

Data Setup

Keeping only users who have rated more than 50 movies and movies that have been rated by more than 50 users:

ratings_movies <- ratingmat[rowCounts(ratingmat) > 50,
                             colCounts(ratingmat) > 50]
ratings_movies_norm <- normalize(ratings_movies)
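
normalize() centers each user's ratings by default; a quick sanity check (sketch):

sum(rowMeans(ratings_movies_norm) > 0.00001)  # expect 0: every user's mean rating is ~0 after centering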

Collaborative Filtering Model

We will explore the item-based and user-based collaborative filtering models.

Item-based Collaborative Filtering

Item-item collaborative filtering (item-based, or item-to-item) is a form of collaborative filtering for recommender systems based on the similarity between items, calculated using people’s ratings of those items.

Defining training/test datasets as below
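
For a reproducible split, one would fix the RNG seed before sampling (the original does not set one; the value below is an arbitrary assumption):

set.seed(123) # hypothetical seed so the train/test split is repeatable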

which_train <- sample(x = c(TRUE, FALSE), 
                      size = nrow(ratings_movies),
                      replace = TRUE, 
                      prob = c(0.8, 0.2))
recc_data_train <- ratings_movies[which_train, ]
recc_data_test <- ratings_movies[!which_train, ]

Building the IBCF model

recommender_models <- recommenderRegistry$get_entries(dataType ="realRatingMatrix")
recommender_models$IBCF_realRatingMatrix$parameters
## $k
## [1] 30
## 
## $method
## [1] "Cosine"
## 
## $normalize
## [1] "center"
## 
## $normalize_sim_matrix
## [1] FALSE
## 
## $alpha
## [1] 0.5
## 
## $na_as_zero
## [1] FALSE
recc_model <- Recommender(data = recc_data_train, 
                          method = "IBCF",
                          parameter = list(k = 30))
recc_model
## Recommender of type 'IBCF' for 'realRatingMatrix' 
## learned using 331 users.
class(recc_model)
## [1] "Recommender"
## attr(,"package")
## [1] "recommenderlab"
model_details <- getModel(recc_model)
class(model_details$sim) # this contains a similarity matrix
## [1] "dgCMatrix"
## attr(,"package")
## [1] "Matrix"
dim(model_details$sim)
## [1] 447 447
n_items_top <- 20
image(model_details$sim[1:n_items_top, 1:n_items_top],
      main = "Heatmap of the first rows and columns")

row_sums <- rowSums(model_details$sim > 0)
table(row_sums)
## row_sums
##  30 
## 447
col_sums <- colSums(model_details$sim > 0)
qplot(col_sums) + stat_bin(binwidth = 1) + ggtitle("Distribution of the column count")
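
To see which movies occur most often among the k nearest neighbours of other items, the column counts can be sorted (a sketch on the same objects; the row labels are movieIds):

which_max <- order(col_sums, decreasing = TRUE)[1:3]
rownames(model_details$sim)[which_max]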

n_recommended <- 10 # the number of items to recommend to each user
recc_predicted <- predict(object = recc_model, 
                          newdata = recc_data_test, 
                          n = n_recommended)
recc_predicted
## Recommendations as 'topNList' with n = 10 for 89 users.
recc_user_1 <- recc_predicted@items[[1]] # recommendation for the first user
movies_user_1 <- recc_predicted@itemLabels[recc_user_1]
movies_user_2 <- movies_user_1
for (i in 1:10){
  movies_user_2[i] <- as.character(subset(movies, 
                                         movies$movieId == movies_user_1[i])$title)
}
movies_user_2
##  [1] "Heat (1995)"                          
##  [2] "Ace Ventura: When Nature Calls (1995)"
##  [3] "From Dusk Till Dawn (1996)"           
##  [4] "Broken Arrow (1996)"                  
##  [5] "Taxi Driver (1976)"                   
##  [6] "Congo (1995)"                         
##  [7] "Desperado (1995)"                     
##  [8] "First Knight (1995)"                  
##  [9] "Net, The (1995)"                      
## [10] "Species (1995)"
recc_matrix <- sapply(recc_predicted@items, 
                      function(x){ as.integer(colnames(ratings_movies)[x]) }) # matrix with the recommendations for each user
#dim(recc_matrix)
recc_matrix[,1:4]
##       [,1] [,2] [,3] [,4]
##  [1,]    6   70 1199 2000
##  [2,]   19  196 2028  529
##  [3,]   70  288 3114 3052
##  [4,]   95  300 3175 2918
##  [5,]  111  380 4720 4896
##  [6,]  160  494  919 4022
##  [7,]  163  551 3418 2599
##  [8,]  168  595 2011 1517
##  [9,]  185  852 1625 1784
## [10,]  196 1094  508 3114
number_of_items <- factor(table(recc_matrix))
chart_title <- "Distribution of the number of items for IBCF"
qplot(number_of_items) + ggtitle(chart_title)

number_of_items_sorted <- sort(number_of_items, decreasing = TRUE)
number_of_items_top <- head(number_of_items_sorted, n = 4)
table_top <- data.frame(as.integer(names(number_of_items_top)),
                       number_of_items_top)
for (i in 1:4){
  table_top[i,1] <- as.character(subset(movies, 
                                         movies$movieId == table_top[i,1])$title)
}
colnames(table_top) <- c("Movie title", "No of items")
head(table_top)

IBCF recommends items on the basis of the item similarity matrix. Item-item models address the scalability and sparsity problems that arise in systems with more users than items: they use rating distributions per item, not per user. With more users than items, each item tends to have more ratings than each user, so an item’s average rating usually does not change quickly. This leads to more stable rating distributions in the model, so the model does not have to be rebuilt as often. When a user consumes and then rates an item, that item’s similar items are picked from the existing model and added to the user’s recommendations.
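
To make the scoring mechanics concrete, here is a toy sketch of similarity-weighted IBCF prediction (illustrative only; recommenderlab's internals differ in details such as keeping only the k most similar items):

# toy item-similarity matrix for 3 movies and one user's ratings
sim <- matrix(c( 1, .8, .1,
                .8,  1, .2,
                .1, .2,  1), nrow = 3,
              dimnames = list(paste0("m", 1:3), paste0("m", 1:3)))
user <- c(m1 = 5, m2 = NA, m3 = 2)   # m2 is unrated
rated <- !is.na(user)
# predicted score for m2: similarity-weighted average of the rated items
sum(sim["m2", rated] * user[rated]) / sum(sim["m2", rated])  # (0.8*5 + 0.2*2) / (0.8+0.2) = 4.4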

User-based Collaborative Filtering Model

The main idea behind UBCF is that people with similar characteristics share similar tastes. The method identifies users that are similar to the queried user and estimates the desired rating as the weighted average of those similar users’ ratings.
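
As a toy sketch of that weighted average (illustrative only, not the package internals):

# predict user u's rating of item i from two similar users v1, v2
sims    <- c(v1 = 0.9, v2 = 0.4)   # similarity of u to v1 and v2
ratings <- c(v1 = 4.0, v2 = 2.0)   # v1's and v2's ratings of item i
sum(sims * ratings) / sum(sims)    # (0.9*4 + 0.4*2) / 1.3 ≈ 3.38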

Building the UBCF model

recommender_models <- recommenderRegistry$get_entries(dataType ="realRatingMatrix")
recommender_models$UBCF_realRatingMatrix$parameters
## $method
## [1] "cosine"
## 
## $nn
## [1] 25
## 
## $sample
## [1] FALSE
## 
## $weighted
## [1] TRUE
## 
## $normalize
## [1] "center"
## 
## $min_matching_items
## [1] 0
## 
## $min_predictive_items
## [1] 0
recc_model <- Recommender(data = recc_data_train, method = "UBCF")
recc_model
## Recommender of type 'UBCF' for 'realRatingMatrix' 
## learned using 331 users.
model_details <- getModel(recc_model)
#names(model_details)
model_details$data
## 331 x 447 rating matrix of class 'realRatingMatrix' with 29688 ratings.
## Normalized using center on rows.
n_recommended <- 10
recc_predicted <- predict(object = recc_model,
                          newdata = recc_data_test, 
                          n = n_recommended) 
recc_predicted
## Recommendations as 'topNList' with n = 10 for 89 users.
recc_matrix <- sapply(recc_predicted@items, 
                      function(x){ as.integer(colnames(ratings_movies)[x]) })
#dim(recc_matrix)
recc_matrix[, 1:4]
##        [,1] [,2] [,3]  [,4]
##  [1,] 49530 1250 1225  1088
##  [2,]  2395  208 1230  1584
##  [3,]  2324 1276 1234   920
##  [4,]  2078 1204 2791 49530
##  [5,]  1396 7438 2916  5669
##  [6,]  1673 1252  594   750
##  [7,]  4034 2268 1307  1380
##  [8,]  4995 2640 1258  1968
##  [9,]    70 2918 2268  1394
## [10,]   555  368 2640  1213
number_of_items <- factor(table(recc_matrix))
chart_title <- "Distribution of the number of items for UBCF"
qplot(number_of_items) + ggtitle(chart_title)

number_of_items_sorted <- sort(number_of_items, decreasing = TRUE)
number_of_items_top <- head(number_of_items_sorted, n = 4)
table_top <- data.frame(as.integer(names(number_of_items_top)), number_of_items_top)
for (i in 1:4){
  table_top[i,1] <- as.character(subset(movies, 
                                         movies$movieId == table_top[i,1])$title)
}
colnames(table_top) <- c("Movie title", "No of items")
head(table_top)

At this stage UBCF appears to give slightly better recommendations than IBCF; the evaluation below compares the models formally.

Evaluation of model

percentage_training <- 0.8

min(rowCounts(ratings_movies)) 
## [1] 8
items_to_keep <- 5      # items given per user to make recommendations; must be below the minimum user count (8 above)
rating_threshold <- 3   # minimum rating that is considered good
n_eval <- 1             # number of times to run the evaluation
eval_sets <- evaluationScheme(data = ratings_movies, 
                              method = "split",
                              train = percentage_training, 
                              given = items_to_keep, 
                              goodRating = rating_threshold, 
                              k = n_eval) 
eval_sets
## Evaluation scheme with 5 items given
## Method: 'split' with 1 run(s).
## Training set proportion: 0.800
## Good ratings: >=3.000000
## Data set: 420 x 447 rating matrix of class 'realRatingMatrix' with 38341 ratings.
getData(eval_sets, "train") # training set
## 336 x 447 rating matrix of class 'realRatingMatrix' with 30035 ratings.
getData(eval_sets, "known") # set with the items used to build the recommendations
## 84 x 447 rating matrix of class 'realRatingMatrix' with 420 ratings.
getData(eval_sets, "unknown") # set with the items used to test the recommendations
## 84 x 447 rating matrix of class 'realRatingMatrix' with 7886 ratings.
# Bootstrapping the data

#Bootstrapping is another approach to splitting the data: the same user can be sampled more than once and, if the training set has the same size as before, there will be more users in the test set.

eval_sets <- evaluationScheme(data = ratings_movies, 
                              method = "bootstrap", 
                              train = percentage_training, 
                              given = items_to_keep,
                              goodRating = rating_threshold, 
                              k = n_eval)
table_train <- table(eval_sets@runsTrain[[1]])
n_repetitions <- factor(as.vector(table_train))

# Using cross-validation to validate models

#The k-fold cross-validation approach is the most accurate one: we split the data into k chunks, hold one chunk out as the test set, and evaluate the accuracy; repeating this for every chunk and averaging gives the overall accuracy.

n_fold <- 4
eval_sets <- evaluationScheme(data = ratings_movies, 
                              method = "cross-validation",
                              k = n_fold, 
                              given = items_to_keep, 
                              goodRating = rating_threshold)

## Evaluating the ratings

#k-fold approach has been picked for evaluation

model_to_evaluate <- "IBCF"
model_parameters <- NULL
eval_recommender <- Recommender(data = getData(eval_sets, "train"),
                                method = model_to_evaluate, 
                                parameter = model_parameters)
items_to_recommend <- 10
eval_prediction <- predict(object = eval_recommender, 
                           newdata = getData(eval_sets, "known"), 
                           n = items_to_recommend, 
                           type = "ratings")

eval_accuracy <- calcPredictionAccuracy(x = eval_prediction, 
                                        data = getData(eval_sets, "unknown"), 
                                        byUser = TRUE)

Accuracy Measures

eval_accuracy <- calcPredictionAccuracy(x = eval_prediction, 
                                        data = getData(eval_sets, "unknown"), 
                                        byUser = FALSE) 
eval_accuracy
##     RMSE      MSE      MAE 
## 1.391840 1.937219 1.036975

The measures of accuracy are useful to compare the performance of different models on the same data.
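
For reference, the three measures reduce to simple functions of the per-rating error e = actual - predicted (a sketch):

rmse <- function(e) sqrt(mean(e^2, na.rm = TRUE))  # root mean squared error
mse  <- function(e) mean(e^2, na.rm = TRUE)        # mean squared error
mae  <- function(e) mean(abs(e), na.rm = TRUE)     # mean absolute error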

Probability thresholds

Comparing the recommendations with the views having a positive rating.

results <- evaluate(x = eval_sets, 
                    method = model_to_evaluate, 
                    n = seq(10, 100, 10))
## IBCF run fold/sample [model time/prediction time]
##   1  [0.22sec/0.02sec] 
##   2  [0.22sec/0.02sec] 
##   3  [0.21sec/0.01sec] 
##   4  [0.2sec/0.03sec]
head(getConfusionMatrix(results)[[1]])
##          TP        FP       FN       TN precision     recall        TPR
## 10 1.790476  8.019048 79.08571 353.1048 0.1825243 0.02189570 0.02189570
## 20 3.495238 16.057143 77.38095 345.0667 0.1794623 0.04346245 0.04346245
## 30 5.161905 23.828571 75.71429 337.2952 0.1794814 0.06280803 0.06280803
## 40 6.771429 31.352381 74.10476 329.7714 0.1795942 0.08200694 0.08200694
## 50 8.180952 38.447619 72.69524 322.6762 0.1778116 0.09797314 0.09797314
## 60 9.304762 44.704762 71.57143 316.4190 0.1752430 0.11023134 0.11023134
##           FPR
## 10 0.02248640
## 20 0.04473487
## 30 0.06644079
## 40 0.08745188
## 50 0.10740354
## 60 0.12504793
columns_to_sum <- c("TP", "FP", "FN", "TN")
indices_summed <- Reduce("+", getConfusionMatrix(results))[, columns_to_sum]
head(indices_summed)
##           TP        FP       FN       TN
## 10  7.085714  32.13333 294.8095 1433.971
## 20 13.352381  64.71429 288.5429 1401.390
## 30 19.438095  96.94286 282.4571 1369.162
## 40 25.409524 128.00000 276.4857 1338.105
## 50 30.685714 157.82857 271.2095 1308.276
## 60 35.495238 185.60952 266.4000 1280.495
plot(results, annotate = TRUE, main = "ROC curve")

plot(results, "prec/rec", annotate = TRUE, main = "Precision-recall")

If only a small number of rated movies is recommended, precision is higher but recall is lower; as the number of recommendations grows, recall rises while precision falls.
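
The precision and recall columns above follow the usual definitions, precision = TP/(TP+FP) and recall = TP/(TP+FN); a sketch that recomputes them from the counts (the values differ slightly from the package's, which averages per-user rates rather than taking ratios of averaged counts):

cm <- as.data.frame(getConfusionMatrix(results)[[1]])
cm$precision_check <- cm$TP / (cm$TP + cm$FP)
cm$recall_check    <- cm$TP / (cm$TP + cm$FN)
head(cm[, c("precision", "precision_check", "recall", "recall_check")])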

Comparing models

The following models are considered for comparison

  • Item-based collaborative filtering, using the Cosine as the distance function
  • Item-based collaborative filtering, using the Pearson correlation as the distance function
  • User-based collaborative filtering, using the Cosine as the distance function
  • User-based collaborative filtering, using the Pearson correlation as the distance function
  • Random recommendations as a baseline
models_to_evaluate <- list(
  IBCF_cos = list(name = "IBCF", param = list(method = "cosine")),
  IBCF_cor = list(name = "IBCF", param = list(method = "pearson")),
  UBCF_cos = list(name = "UBCF", param = list(method = "cosine")),
  UBCF_cor = list(name = "UBCF", param = list(method = "pearson")),
  random   = list(name = "RANDOM", param = NULL)
)

n_recommendations <- c(1, 5, seq(10, 100, 10))
list_results <- evaluate(x = eval_sets, 
                         method = models_to_evaluate, 
                         n = n_recommendations)
## IBCF run fold/sample [model time/prediction time]
##   1  [0.19sec/0.03sec] 
##   2  [0.22sec/0.03sec] 
##   3  [0.22sec/0.01sec] 
##   4  [0.2sec/0.03sec] 
## IBCF run fold/sample [model time/prediction time]
##   1  [0.24sec/0.03sec] 
##   2  [0.27sec/0.03sec] 
##   3  [0.24sec/0.03sec] 
##   4  [0.24sec/0.03sec] 
## UBCF run fold/sample [model time/prediction time]
##   1  [0.02sec/0.17sec] 
##   2  [0sec/0.2sec] 
##   3  [0sec/0.17sec] 
##   4  [0sec/0.2sec] 
## UBCF run fold/sample [model time/prediction time]
##   1  Error in neighbors[, x] : incorrect number of dimensions
## RANDOM run fold/sample [model time/prediction time]
##   1  [0sec/0.04sec] 
##   2  [0sec/0.03sec] 
##   3  [0sec/0.03sec] 
##   4  [0sec/0.03sec]
Note that the Pearson-based UBCF run failed with an error (see the log above), so only four models appear in the results.

sapply(list_results, class) == "evaluationResults"
## IBCF_cos IBCF_cor UBCF_cos   random 
##     TRUE     TRUE     TRUE     TRUE
avg_matrices <- lapply(list_results, avg)
head(avg_matrices$IBCF_cos[, 5:8])
##    precision      recall         TPR         FPR
## 1  0.1651492 0.002841408 0.002841408 0.002315147
## 5  0.1753453 0.011946948 0.011946948 0.011168830
## 10 0.1807146 0.025319337 0.025319337 0.022107715
## 20 0.1719541 0.046062531 0.046062531 0.044347231
## 30 0.1689343 0.065892210 0.065892210 0.066344915
## 40 0.1680604 0.084312413 0.084312413 0.087589773

Ideal model

Comparing the charts gives an opportunity to pick the ideal model.

plot(list_results, annotate = 1, legend = "topleft") 
title("ROC curve")

plot(list_results, "prec/rec", annotate = 1, legend = "bottomright")
title("Precision-recall")

The graphs show that UBCF with cosine distance is the best-performing model.

Spark Implementation

Building an Alternating Least Squares (ALS) model using Spark ML and predicting the ratings.
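
ALS learns a low-rank factorization of the rating matrix: alternating between solving for the user factors with the item factors fixed and vice versa, it minimizes the standard regularized objective (a sketch of the usual formulation):

$$\min_{X, Y} \sum_{(u,i)\ \text{observed}} \left( r_{ui} - x_u^{\top} y_i \right)^2 + \lambda \left( \sum_u \lVert x_u \rVert^2 + \sum_i \lVert y_i \rVert^2 \right)$$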

# connect to spark
config <- spark_config()
config$`sparklyr.shell.driver-memory` <- "8G"
config$`sparklyr.shell.executor-memory` <- "8G"

# The memory limit could not be configured on my low-resource laptop.
sc <- spark_connect(master = "local", config = config)

#Keep a local copy of the ratings data frame (copied into Spark below)
spark_df <- ratings

#Building Train and Test dataset
train <- sample(x = c(TRUE, FALSE), size = nrow(spark_df),
                      replace = TRUE, prob = c(0.8, 0.2))
train_df <- spark_df[train, ]
test_df <- spark_df[!train, ]

#Copy the train/test data frames into Spark
spark_train <- sdf_copy_to(sc, train_df, "train_ratings", overwrite = TRUE)
spark_test <- sdf_copy_to(sc, test_df, "test_ratings", overwrite = TRUE)


sparkALS <- ml_als(spark_train, max_iter = 5, nonnegative = TRUE, 
                   rating_col = "rating", user_col = "userId", item_col = "movieId")

#Score the test set; ml_predict() is the idiomatic sparklyr call for this
sparkPred <- ml_predict(sparkALS, spark_test) %>% collect()



sparkPred <- sparkPred[!is.na(sparkPred$prediction), ]  # drop cold-start rows where ALS yields NaN

#Calculate error
mse_spark <- mean((sparkPred$rating - sparkPred$prediction)^2)
rmse_spark <- sqrt(mse_spark)
mae_spark <- mean(abs(sparkPred$rating - sparkPred$prediction))

# disconnect from spark
spark_disconnect(sc)

Spark Accuracy Measures

#Rename the model's 'prediction' column to 'spark_prediction' (dplyr::rename takes new_name = old_name)
spark_pred_datatable <- dplyr::rename(sparkPred, spark_prediction = prediction)
spark_pred_datatable <- subset(spark_pred_datatable, select = c(userId, movieId, rating, spark_prediction)) 

spark_error <- data.frame(RMSE = rmse_spark, MSE = mse_spark, MAE = mae_spark)
spark_error

Spark Prediction

datatable(spark_pred_datatable, rownames = FALSE, filter="top", options = list(pageLength = 5, scrollX=T) )

Summary

  • Collaborative filtering analysis with IBCF and UBCF on the MovieLens dataset shows that UBCF with cosine distance is the best performer.
  • The Spark ALS model's predictions track the ratings given in the MovieLens dataset closely.