The goal of this assignment is for you to try out different ways of implementing and configuring a recommender, and to evaluate your different approaches. For assignment 2, start with an existing dataset of user-item ratings, such as our toy books dataset, MovieLens, Jester [http://eigentaste.berkeley.edu/dataset/] or another dataset of your choosing. Implement at least two of these recommendation algorithms:

  • Content-Based Filtering
  • User-User Collaborative Filtering
  • Item-Item Collaborative Filtering
# Loading required libraries
library(recommenderlab)
library(reshape2)
library(RCurl)
library(dplyr)
library(ggplot2)
library(knitr)
library(tidyverse)

1 Data Loading

#Read the Movie data
movie.url <- getURL("https://raw.githubusercontent.com/niteen11/MSDS/master/DATA643/Project%202/dataset/movies.csv")
movies.df <- read.csv(text = movie.url,header = TRUE, stringsAsFactors = FALSE)
#movies.df <- read.csv('C:\\NITEEN\\CUNY\\Spring 2018\\DATA 643\\ml-latest-small\\movies.csv',header = TRUE, stringsAsFactors = FALSE)
kable(head(movies.df))
movieId title genres
1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy
2 Jumanji (1995) Adventure|Children|Fantasy
3 Grumpier Old Men (1995) Comedy|Romance
4 Waiting to Exhale (1995) Comedy|Drama|Romance
5 Father of the Bride Part II (1995) Comedy
6 Heat (1995) Action|Crime|Thriller
## read Ratings data, remove timestamp column.
rating.url <- getURL("https://raw.githubusercontent.com/niteen11/MSDS/master/DATA643/Project%202/dataset/ratings.csv")
rating.df <- read.csv(text = rating.url,header = TRUE, stringsAsFactors = FALSE)
#rating.df <- read.csv('C:\\NITEEN\\CUNY\\Spring 2018\\DATA 643\\ml-latest-small\\ratings.csv',header = TRUE, stringsAsFactors = FALSE)
# rating.df <- rating.df %>% 
#   select(userId, movieId, rating)
kable(head(rating.df))
userId movieId rating timestamp
1 31 2.5 1260759144
1 1029 3.0 1260759179
1 1061 3.0 1260759182
1 1129 2.0 1260759185
1 1172 4.0 1260759205
1 1263 2.0 1260759151

Reshaping the data and making it ready for data visualization and data modeling purpose

# Convert to matrix
movieMatrix <- rating.df %>% 
  select(-timestamp) %>% 
  spread(movieId, rating)
row.names(movieMatrix) <- movieMatrix[,1]
movieMatrix <- movieMatrix[-c(1)]
movieMatrix <- as(as.matrix(movieMatrix), "realRatingMatrix")

movieMatrix
## 671 x 9066 rating matrix of class 'realRatingMatrix' with 100004 ratings.
class(movieMatrix)
## [1] "realRatingMatrix"
## attr(,"package")
## [1] "recommenderlab"
methods(class = class(movieMatrix))
##  [1] [                      [<-                    binarize              
##  [4] calcPredictionAccuracy coerce                 colCounts             
##  [7] colMeans               colSds                 colSums               
## [10] denormalize            dim                    dimnames              
## [13] dimnames<-             dissimilarity          evaluationScheme      
## [16] getData.frame          getList                getNormalize          
## [19] getRatingMatrix        getRatings             getTopNLists          
## [22] image                  normalize              nratings              
## [25] Recommender            removeKnownRatings     rowCounts             
## [28] rowMeans               rowSds                 rowSums               
## [31] sample                 show                   similarity            
## see '?methods' for accessing help and source code

2 Data Exploration & Visualization

similarity_users <- similarity(movieMatrix[1:5, ], method ="cosine", which = "userId")

(as.matrix(similarity_users))
##           1         2         3         4         5
## 1 0.0000000        NA        NA 0.9085757 1.0000000
## 2        NA 0.0000000 0.9556143 0.9627467 0.9758399
## 3        NA 0.9556143 0.0000000 0.9752314 0.9651193
## 4 0.9085757 0.9627467 0.9752314 0.0000000 0.9694769
## 5 1.0000000 0.9758399 0.9651193 0.9694769 0.0000000
image(as.matrix(similarity_users), main = "User similarity")

recommender_models <- recommenderRegistry$get_entries(dataType ="realRatingMatrix")
names(recommender_models)
## [1] "ALS_realRatingMatrix"          "ALS_implicit_realRatingMatrix"
## [3] "IBCF_realRatingMatrix"         "POPULAR_realRatingMatrix"     
## [5] "RANDOM_realRatingMatrix"       "RERECOMMEND_realRatingMatrix" 
## [7] "SVD_realRatingMatrix"          "SVDF_realRatingMatrix"        
## [9] "UBCF_realRatingMatrix"
recommender_models$IBCF_realRatingMatrix$parameters
## $k
## [1] 30
## 
## $method
## [1] "Cosine"
## 
## $normalize
## [1] "center"
## 
## $normalize_sim_matrix
## [1] FALSE
## 
## $alpha
## [1] 0.5
## 
## $na_as_zero
## [1] FALSE
dim(movieMatrix)
## [1]  671 9066
slotNames(movieMatrix)
## [1] "data"      "normalize"
class(movieMatrix@data)
## [1] "dgCMatrix"
## attr(,"package")
## [1] "Matrix"
dim(movieMatrix@data)
## [1]  671 9066
vector_ratings <- as.vector(movieMatrix@data)
unique(vector_ratings)
##  [1] 0.0 3.0 4.0 5.0 2.0 3.5 1.0 2.5 4.5 1.5 0.5
## [1] 5 4 0 3 1 2

table_ratings <- table(vector_ratings)
(table_ratings)
## vector_ratings
##       0     0.5       1     1.5       2     2.5       3     3.5       4 
## 5983282    1101    3326    1687    7271    4449   20064   10538   28750 
##     4.5       5 
##    7723   15095

Selecting the most relevant data

 ratings_movies <- movieMatrix[rowCounts(movieMatrix) > 50, 
                               colCounts(movieMatrix) > 50] 

Exploring the most relevant data

min_movies <- quantile(rowCounts(ratings_movies), 0.95)
min_users <- quantile(colCounts(ratings_movies), 0.95)

image(ratings_movies[rowCounts(ratings_movies) > min_movies, 
                 colCounts(ratings_movies) > min_users], 
      main = "Heatmap of the Top Users and Movies (Not Normalized)")

average_ratings_per_user <- rowMeans(ratings_movies)
ggplot() + aes(average_ratings_per_user) + 
  geom_histogram(binwidth = 0.1) +
  ggtitle("Distribution of the average rating per user")+
  xlab("Average Rating") + ylab("No of Ratings")

Normalizing the data

ratings_movies_norm <- normalize(ratings_movies)
avg <- round(rowMeans(ratings_movies_norm),5)
table(avg)
## avg
##   0 
## 421
image(ratings_movies_norm[rowCounts(ratings_movies_norm) > min_movies, 
                 colCounts(ratings_movies_norm) > min_users], 
      main = "Heatmap of the Top Users and Movies (Normalized)")

3 Train and Test Datasets

which_train <- sample(x = c(TRUE, FALSE), size = nrow(ratings_movies),replace = TRUE, prob = c(0.8, 0.2))
head(which_train,10)
##  [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE
recc_data_train <- ratings_movies[which_train, ]
recc_data_test <- ratings_movies[!which_train, ]

4 Recommendation model

4.1 IBCF

Item-based collaborative filtering

recommender_models <- recommenderRegistry$get_entries(dataType ="realRatingMatrix")
recommender_models$IBCF_realRatingMatrix$parameters
## $k
## [1] 30
## 
## $method
## [1] "Cosine"
## 
## $normalize
## [1] "center"
## 
## $normalize_sim_matrix
## [1] FALSE
## 
## $alpha
## [1] 0.5
## 
## $na_as_zero
## [1] FALSE

Train

IBCF.recc_model <- Recommender(data = recc_data_train, method = "IBCF",parameter = list(k = 30))
IBCF.recc_model
## Recommender of type 'IBCF' for 'realRatingMatrix' 
## learned using 340 users.

Predict

IBCF.recc_predicted <- predict(object = IBCF.recc_model, newdata = recc_data_test, n = 6)
IBCF.recc_predicted
## Recommendations as 'topNList' with n = 6 for 81 users.

Recommendation for the first user

IBCF.recc_predicted@items[[1]]
## [1] 10 30 32 33 41 63

4.2 UBCF

User-based collaborative filtering

recommender_models <- recommenderRegistry$get_entries(dataType ="realRatingMatrix")
recommender_models$UBCF_realRatingMatrix$parameters
## $method
## [1] "cosine"
## 
## $nn
## [1] 25
## 
## $sample
## [1] FALSE
## 
## $normalize
## [1] "center"
UBCF_recc_model <- Recommender(data = recc_data_train, method = "UBCF",parameter = list(k = 30))
## Warning: Unknown parameter: k
## Available parameter (with default values):
## method    =  cosine
## nn    =  25
## sample    =  FALSE
## normalize     =  center
## verbose   =  FALSE
UBCF_recc_model
## Recommender of type 'UBCF' for 'realRatingMatrix' 
## learned using 340 users.

Predict

UBCF.recc_predicted <- predict(object = UBCF_recc_model,newdata = recc_data_test, n = 6) 
UBCF.recc_predicted
## Recommendations as 'topNList' with n = 6 for 81 users.

Recommendation for the first user

UBCF.recc_predicted@items[[1]]
## [1]  85  28 103  53  57  82

4.3 Evaluation

eval_sets <- evaluationScheme(ratings_movies, method = "split", train = .8, given = 4, goodRating=3)
algorithms <- list("user-based CF" = list(name="UBCF"), "item-based CF" = list(name="IBCF"))
results <- evaluate(eval_sets, algorithms, n=c(2, 5, 7, 9, 11, 13))
## UBCF run fold/sample [model time/prediction time]
##   1  [0.03sec/0.16sec] 
## IBCF run fold/sample [model time/prediction time]
##   1  [0.57sec/0.19sec]
recommenderlab::plot(results, annotate = 1:4, legend="topleft")

UBCF appears to be better than IBCF.