Final Project

For the final project, I will be using the MovieLense dataset since, I have been using it for my previous projects also and I feel familiar with the dataset and its contents.

Dataset

This dataset is available publicily online at https://grouplens.org/datasets/movielens/

I have downloaded the dataset and put it in my github account.

#import relevant libraries
library(ggplot2)
library(recommenderlab)
## Loading required package: Matrix
## Loading required package: arules
## Warning: package 'arules' was built under R version 3.6.2
## 
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
## 
##     abbreviate, write
## Loading required package: proxy
## 
## Attaching package: 'proxy'
## The following object is masked from 'package:Matrix':
## 
##     as.matrix
## The following objects are masked from 'package:stats':
## 
##     as.dist, dist
## The following object is masked from 'package:base':
## 
##     as.matrix
## Loading required package: registry
## Registered S3 methods overwritten by 'registry':
##   method               from 
##   print.registry_field proxy
##   print.registry_entry proxy
library(data.table)

#import required datasets
data <- read.csv("https://raw.githubusercontent.com/maharjansudhan/DATA612/master/ml-latest-small/movies.csv")
head(data)
##   movieId                              title
## 1       1                   Toy Story (1995)
## 2       2                     Jumanji (1995)
## 3       3            Grumpier Old Men (1995)
## 4       4           Waiting to Exhale (1995)
## 5       5 Father of the Bride Part II (1995)
## 6       6                        Heat (1995)
##                                        genres
## 1 Adventure|Animation|Children|Comedy|Fantasy
## 2                  Adventure|Children|Fantasy
## 3                              Comedy|Romance
## 4                        Comedy|Drama|Romance
## 5                                      Comedy
## 6                       Action|Crime|Thriller
ratings <- read.csv("https://raw.githubusercontent.com/maharjansudhan/DATA612/master/ml-latest-small/ratings.csv")
head(ratings)
##   userId movieId rating timestamp
## 1      1       1      4 964982703
## 2      1       3      4 964981247
## 3      1       6      4 964982224
## 4      1      47      5 964983815
## 5      1      50      5 964982931
## 6      1      70      3 964982400

Summary of moives and ratings:

summary(data)
##     movieId                                          title     
##  Min.   :     1   Confessions of a Dangerous Mind (2002):   2  
##  1st Qu.:  3248   Emma (1996)                           :   2  
##  Median :  7300   Eros (2004)                           :   2  
##  Mean   : 42200   Saturn 3 (1980)                       :   2  
##  3rd Qu.: 76232   War of the Worlds (2005)              :   2  
##  Max.   :193609   ¡Three Amigos! (1986)                 :   1  
##                   (Other)                               :9731  
##             genres    
##  Drama         :1053  
##  Comedy        : 946  
##  Comedy|Drama  : 435  
##  Comedy|Romance: 363  
##  Drama|Romance : 349  
##  Documentary   : 339  
##  (Other)       :6257
summary(ratings)
##      userId         movieId           rating        timestamp        
##  Min.   :  1.0   Min.   :     1   Min.   :0.500   Min.   :8.281e+08  
##  1st Qu.:177.0   1st Qu.:  1199   1st Qu.:3.000   1st Qu.:1.019e+09  
##  Median :325.0   Median :  2991   Median :3.500   Median :1.186e+09  
##  Mean   :326.1   Mean   : 19435   Mean   :3.502   Mean   :1.206e+09  
##  3rd Qu.:477.0   3rd Qu.:  8122   3rd Qu.:4.000   3rd Qu.:1.436e+09  
##  Max.   :610.0   Max.   :193609   Max.   :5.000   Max.   :1.538e+09

Extracting Genre of the movie list

Below is the code to crete a matrix of generes for each movie

genre <- as.data.frame(data$genres)

genre2 <- as.data.frame(tstrsplit(genre[,1], '[|]', type.convert=TRUE))
colnames(genre2) <- c(1:10)

genre_list <- c("Action", "Adventure", "Animation", "Children", "Comedy", "Crime","Documentary", "Drama", "Fantasy",
                "Film-Noir", "Horror", "Musical", "Mystery","Romance", "Sci-Fi", "Thriller", "War", "Western") 

genre_matrix <- matrix(0, 9743,18)  #number of movies = 9742, number of genres = 18 
genre_matrix[1,] <- genre_list 
colnames(genre_matrix) <- genre_list  #set the column names to genre list


#let's iterate the matrix 
for (i in 1:nrow(genre2)) {
  for (c in 1:ncol(genre2)) {
    genre_matrix_col = which(genre_matrix[1,] == genre2[i,c])
    genre_matrix[i+1, genre_matrix_col] <- 1
  }
}


#now convert it into dataframe
final_genre_matrix <- as.data.frame(genre_matrix[-1,], stringsAsFactors=FALSE) 
for (c in 1:ncol(final_genre_matrix)) {
  final_genre_matrix[,c] <- as.integer(final_genre_matrix[,c])  #convert to integers
} 

head(final_genre_matrix)
##   Action Adventure Animation Children Comedy Crime Documentary Drama Fantasy
## 1      0         1         1        1      1     0           0     0       1
## 2      0         1         0        1      0     0           0     0       1
## 3      0         0         0        0      1     0           0     0       0
## 4      0         0         0        0      1     0           0     1       0
## 5      0         0         0        0      1     0           0     0       0
## 6      1         0         0        0      0     1           0     0       0
##   Film-Noir Horror Musical Mystery Romance Sci-Fi Thriller War Western
## 1         0      0       0       0       0      0        0   0       0
## 2         0      0       0       0       0      0        0   0       0
## 3         0      0       0       0       1      0        0   0       0
## 4         0      0       0       0       1      0        0   0       0
## 5         0      0       0       0       0      0        0   0       0
## 6         0      0       0       0       0      0        1   0       0

Convert the ratings matrix into sparse matrix

#Create ratings matrix
rating_matrix <- dcast(ratings, userId~movieId, value.var = "rating", na.rm=FALSE)
## Warning in dcast(ratings, userId ~ movieId, value.var = "rating", na.rm =
## FALSE): The dcast generic in data.table has been passed a data.frame and
## will attempt to redirect to the reshape2::dcast; please note that reshape2
## is deprecated, and this redirection is now deprecated as well. Please do this
## redirection yourself like reshape2::dcast(ratings). In the next version, this
## warning will become an error.
rating_matrix <- as.matrix(rating_matrix[,-1]) 

#Convert ratings matrix into sparse matrix
rating_matrix <- as(rating_matrix, "realRatingMatrix")
rating_matrix
## 610 x 9724 rating matrix of class 'realRatingMatrix' with 100836 ratings.
#exploring recommender_models
recommender_model <- recommenderRegistry$get_entries(dataType = "realRatingMatrix")
recommender_model
## $ALS_realRatingMatrix
## Recommender method: ALS for realRatingMatrix
## Description: Recommender for explicit ratings based on latent factors, calculated by alternating least squares algorithm.
## Reference: Yunhong Zhou, Dennis Wilkinson, Robert Schreiber, Rong Pan (2008). Large-Scale Parallel Collaborative Filtering for the Netflix Prize, 4th Int'l Conf. Algorithmic Aspects in Information and Management, LNCS 5034.
## Parameters:
##   normalize lambda n_factors n_iterations min_item_nr seed
## 1      NULL    0.1        10           10           1 NULL
## 
## $ALS_implicit_realRatingMatrix
## Recommender method: ALS_implicit for realRatingMatrix
## Description: Recommender for implicit data based on latent factors, calculated by alternating least squares algorithm.
## Reference: Yifan Hu, Yehuda Koren, Chris Volinsky (2008). Collaborative Filtering for Implicit Feedback Datasets, ICDM '08 Proceedings of the 2008 Eighth IEEE International Conference on Data Mining, pages 263-272.
## Parameters:
##   lambda alpha n_factors n_iterations min_item_nr seed
## 1    0.1    10        10           10           1 NULL
## 
## $IBCF_realRatingMatrix
## Recommender method: IBCF for realRatingMatrix
## Description: Recommender based on item-based collaborative filtering.
## Reference: NA
## Parameters:
##    k   method normalize normalize_sim_matrix alpha na_as_zero
## 1 30 "Cosine"  "center"                FALSE   0.5      FALSE
## 
## $LIBMF_realRatingMatrix
## Recommender method: LIBMF for realRatingMatrix
## Description: Matrix factorization with LIBMF via package recosystem (https://cran.r-project.org/web/packages/recosystem/vignettes/introduction.html).
## Reference: NA
## Parameters:
##   dim costp_l2 costq_l2 nthread
## 1  10     0.01     0.01       1
## 
## $POPULAR_realRatingMatrix
## Recommender method: POPULAR for realRatingMatrix
## Description: Recommender based on item popularity.
## Reference: NA
## Parameters:
##   normalize
## 1  "center"
##                                                      aggregationRatings
## 1 new("standardGeneric", .Data = function (x, na.rm = FALSE, dims = 1, 
##                                                   aggregationPopularity
## 1 new("standardGeneric", .Data = function (x, na.rm = FALSE, dims = 1, 
## 
## $RANDOM_realRatingMatrix
## Recommender method: RANDOM for realRatingMatrix
## Description: Produce random recommendations (real ratings).
## Reference: NA
## Parameters: None
## 
## $RERECOMMEND_realRatingMatrix
## Recommender method: RERECOMMEND for realRatingMatrix
## Description: Re-recommends highly rated items (real ratings).
## Reference: NA
## Parameters:
##   randomize minRating
## 1         1        NA
## 
## $SVD_realRatingMatrix
## Recommender method: SVD for realRatingMatrix
## Description: Recommender based on SVD approximation with column-mean imputation.
## Reference: NA
## Parameters:
##    k maxiter normalize
## 1 10     100  "center"
## 
## $SVDF_realRatingMatrix
## Recommender method: SVDF for realRatingMatrix
## Description: Recommender based on Funk SVD with gradient descend (https://sifter.org/~simon/journal/20061211.html).
## Reference: NA
## Parameters:
##    k gamma lambda min_epochs max_epochs min_improvement normalize verbose
## 1 10 0.015  0.001         50        200           1e-06  "center"   FALSE
## 
## $UBCF_realRatingMatrix
## Recommender method: UBCF for realRatingMatrix
## Description: Recommender based on user-based collaborative filtering.
## Reference: NA
## Parameters:
##     method nn sample normalize
## 1 "cosine" 25  FALSE  "center"
#So many models but I will be focusing only on IBCF and UBCF
recommender_model$IBCF_realRatingMatrix$parameters
## $k
## [1] 30
## 
## $method
## [1] "Cosine"
## 
## $normalize
## [1] "center"
## 
## $normalize_sim_matrix
## [1] FALSE
## 
## $alpha
## [1] 0.5
## 
## $na_as_zero
## [1] FALSE

Collaborative Filtering Algorithm

In order to see the similarity between users or between items, we can use collaboratice filtering algorithm.

Let’s create the matrix and visualize it.

similarity_betn_users <- as.matrix(similarity(rating_matrix[1:4, ], method="cosine", which="users"))
similarity_betn_users
##           1  2         3         4
## 1 0.0000000  1 0.7919033 0.9328096
## 2 1.0000000  0        NA 1.0000000
## 3 0.7919033 NA 0.0000000 1.0000000
## 4 0.9328096  1 1.0000000 0.0000000
image(similarity_betn_users, main = "Similarity between Users")

Similarly let’s do the same for the items.

similarity_betn_items <- as.matrix(similarity(rating_matrix[, 1:4], method="cosine", which="items"))
similarity_betn_items
##           1         2         3         4
## 1 0.0000000 0.9644641 0.9715415 0.9838699
## 2 0.9644641 0.0000000 0.9389013 0.9609877
## 3 0.9715415 0.9389013 0.0000000 1.0000000
## 4 0.9838699 0.9609877 1.0000000 0.0000000
image(similarity_betn_items, main = "Similarity between Items")

Now, we need to explore values of ratings

vector_ratings <- as.vector(rating_matrix@data)
unique(vector_ratings)
##  [1] 4.0 0.0 4.5 2.5 3.5 3.0 5.0 0.5 2.0 1.5 1.0
#Let's omit a missing values, 0 represents a missing value
vector_ratings <- vector_ratings[vector_ratings != 0] 
vector_ratings <- factor(vector_ratings)

#let's plot the dataset
qplot(vector_ratings) + 
  ggtitle("Distribution of the ratings")

According to the graph, the ratings are mostly between 3 to 5. There are some ratings between 0.5 to 2.5 but we can say that mostly people prefer rating 3 for dislike just to be on safe side.

Calculate the most viewed movies

#to calculate the most viewed movies

# count views for each movie
views_per_movie <- colCounts(rating_matrix) 

# create new dataframe
table_views <- data.frame(movie_Num = names(views_per_movie), Views = views_per_movie) 

# sort by Views
table_views <- table_views[order(table_views$Views, decreasing = TRUE), ] 
table_views$title <- NA

for (i in 1:9724)
  {
  table_views[i,3] <- as.character(subset(data, data$movieId == table_views[i,1])$title)
  }

table_views[1:6,]
##      movie_Num Views                                     title
## 356        356   329                       Forrest Gump (1994)
## 318        318   317          Shawshank Redemption, The (1994)
## 296        296   307                       Pulp Fiction (1994)
## 593        593   279          Silence of the Lambs, The (1991)
## 2571      2571   278                        Matrix, The (1999)
## 260        260   251 Star Wars: Episode IV - A New Hope (1977)
#let's plot top viewed movies
ggplot(table_views[1:6, ], 
       aes(x = title, y = Views)) +
  geom_bar(stat="identity") + 
  theme(axis.text.x = element_text(angle = 60,hjust=1)) + 
  ggtitle("Top movies -- total views")

Forrest Gump is the movie that has the highest views with 329 views.

Distribution of movies average ratings

average_ratings <- colMeans(rating_matrix)

qplot(average_ratings) + 
  stat_bin(binwidth = 0.1) +
  ggtitle("Distribution of the movies average ratings")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Distribution of items average ratings

average_ratings_relevant <- average_ratings[views_per_movie > 50] 
qplot(average_ratings_relevant) + 
  stat_bin(binwidth = 0.1) +
  ggtitle(paste("Distribution of the relevant average ratings"))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Data Preparation

Let’s select the data and normalize it for further computation.

#Here, I have standardize the row and columns value by greater than 50 to make the data meaningful.
movies_ratings <- rating_matrix[rowCounts(rating_matrix) > 50, colCounts(rating_matrix) > 50]
movies_ratings
## 378 x 436 rating matrix of class 'realRatingMatrix' with 36214 ratings.

Now we have only 378 users and 436 movies.

##here

Using this approach, I have put a top 5 percent of movies and users in a heatmap below:

minimum_movies <- quantile(rowCounts(movies_ratings), 0.95)
minimum_users <- quantile(colCounts(movies_ratings), 0.95)
image(movies_ratings[rowCounts(movies_ratings) > minimum_movies,
                     colCounts(movies_ratings) > minimum_users], 
main = "Heatmap of the top users and movies",xlab='Movies',ylab='Users')

#normalize the dataset
normalized_movies_ratings <- normalize(movies_ratings)
sum(rowMeans(normalized_movies_ratings) > 0.00001)
## [1] 0
#visualize the normalized matrix of the top user and movies
image(normalized_movies_ratings[rowCounts(normalized_movies_ratings) > minimum_movies,
                          colCounts(normalized_movies_ratings) > minimum_users], 
main = "Heatmap of the top users and movies", xlab = 'Movies', ylab='Users')

IBCF Model

Building the model

#building a dataset with 80-20 ratio
build_dataset <- sample(x = c(TRUE, FALSE), size = nrow(movies_ratings),replace = TRUE, prob = c(0.8, 0.2))

data_train <- movies_ratings[build_dataset, ]
data_test <- movies_ratings[!build_dataset, ]
#train dataset, "IBCF"
recommender_model <- Recommender(data = data_train, method = "IBCF")

recommender_model
## Recommender of type 'IBCF' for 'realRatingMatrix' 
## learned using 305 users.
#explore the model
recommender_model_details <- getModel(recommender_model)

class(recommender_model_details$sim)
## [1] "dgCMatrix"
## attr(,"package")
## [1] "Matrix"
# number of items to recommend to each user
num_recommended <- 10 

recommender_predicted <- predict(object = recommender_model, 
                          newdata = data_test, 
                          n = num_recommended)
recommender_predicted
## Recommendations as 'topNList' with n = 10 for 73 users.
# recommendation for the first user
recommender_user_1 <- recommender_predicted@items[[1]] 
movies_user_1 <- recommender_predicted@itemLabels[recommender_user_1]
movies_user_2 <- movies_user_1
for (i in 1:10)
  {
  movies_user_2[i] <- as.character(subset(data, data$movieId == movies_user_1[i])$title)
  }
movies_user_2
##  [1] "Get Shorty (1995)"                                  
##  [2] "Judge Dredd (1995)"                                 
##  [3] "Disclosure (1994)"                                  
##  [4] "Natural Born Killers (1994)"                        
##  [5] "Four Weddings and a Funeral (1994)"                 
##  [6] "Maverick (1994)"                                    
##  [7] "City Slickers II: The Legend of Curly's Gold (1994)"
##  [8] "Cliffhanger (1993)"                                 
##  [9] "Ghost (1990)"                                       
## [10] "Pretty Woman (1990)"
# matrix with the recommendations for each user
recommender_matrix <- sapply(recommender_predicted@items, 
                      function(x){ as.integer(colnames(movies_ratings)[x]) }) 
recommender_matrix[,1:4]
##       [,1]  [,2]  [,3] [,4]
##  [1,]   21  2174  3033    3
##  [2,]  173  2716  1247   16
##  [3,]  225  3751  6016   19
##  [4,]  288  4027  5299   36
##  [5,]  357 48385 56367  104
##  [6,]  368 68157  3081  223
##  [7,]  432 76093  3671  300
##  [8,]  434 30707  3481  316
##  [9,]  587  1073   300  317
## [10,]  597  8644  4776  318
num_of_items <- factor(table(recommender_matrix))

qplot(num_of_items) + 
  ggtitle("Distribution for IBCF")

num_of_items_sorted <- sort(num_of_items, decreasing = TRUE)
num_of_items_top_movies <- head(num_of_items_sorted, n = 4)
table_top <- data.frame(as.integer(names(num_of_items_top_movies)), num_of_items_top_movies)

for (i in 1:4)
  {
  table_top[i,1] <- as.character(subset(data, data$movieId == table_top[i,1])$title)
}

colnames(table_top) <- c("Movie", "Number of items")
head(table_top)
##                        Movie Number of items
## 2470 Crocodile Dundee (1986)              11
## 10          GoldenEye (1995)               9
## 3    Grumpier Old Men (1995)               8
## 16             Casino (1995)               8

UBCF

Building the model

recommender_model <- recommenderRegistry$get_entries(dataType ="realRatingMatrix")
recommender_model$UBCF_realRatingMatrix$parameters
## $method
## [1] "cosine"
## 
## $nn
## [1] 25
## 
## $sample
## [1] FALSE
## 
## $normalize
## [1] "center"
#train dataset, "UBCF"
recommender_model <- Recommender(data = data_train, method = "UBCF")

recommender_model
## Recommender of type 'UBCF' for 'realRatingMatrix' 
## learned using 305 users.
recommeder_model_details <- getModel(recommender_model)

class(recommender_model_details$sim)
## [1] "dgCMatrix"
## attr(,"package")
## [1] "Matrix"
num_recommended <- 10 # number of items to recommend to each user

recommender_predicted <- predict(object = recommender_model, newdata = data_test, n = num_recommended)
recommender_predicted
## Recommendations as 'topNList' with n = 10 for 73 users.
recommender_user_1 <- recommender_predicted@items[[1]] # recommendation for the first user
movies_user_1 <- recommender_predicted@itemLabels[recommender_user_1]
movies_user_2 <- movies_user_1
for (i in 1:10)
  {
  movies_user_2[i] <- as.character(subset(data, data$movieId == movies_user_1[i])$title)
}
movies_user_2
##  [1] "Rear Window (1954)"                                            
##  [2] "Lord of the Rings: The Return of the King, The (2003)"         
##  [3] "Incredibles, The (2004)"                                       
##  [4] "Godfather, The (1972)"                                         
##  [5] "Shawshank Redemption, The (1994)"                              
##  [6] "Beautiful Mind, A (2001)"                                      
##  [7] "Léon: The Professional (a.k.a. The Professional) (Léon) (1994)"
##  [8] "One Flew Over the Cuckoo's Nest (1975)"                        
##  [9] "Lord of the Rings: The Fellowship of the Ring, The (2001)"     
## [10] "Spirited Away (Sen to Chihiro no kamikakushi) (2001)"
# matrix with the recommendations for each user
recommender_matrix <- sapply(recommender_predicted@items, 
                      function(x){ as.integer(colnames(movies_ratings)[x]) }) 
recommender_matrix[,1:4]
##       [,1] [,2] [,3]  [,4]
##  [1,]  904  318  593   318
##  [2,] 7153  457  318 58559
##  [3,] 8961  908  457  5952
##  [4,]  858  253    6  4993
##  [5,]  318  590 1617   904
##  [6,] 4995 1089 1732 68954
##  [7,]  293  608 2028  7153
##  [8,] 1193  527  150 60069
##  [9,] 4993  904 1089   356
## [10,] 5618   34 1197  6711
num_of_items <- factor(table(recommender_matrix))

qplot(num_of_items) + 
  ggtitle("Distribution for IBCF")

num_of_items_sorted <- sort(num_of_items, decreasing = TRUE)
num_of_items_top_movies <- head(num_of_items_sorted, n = 4)
table_top <- data.frame(as.integer(names(num_of_items_top_movies)), num_of_items_top_movies)

for (i in 1:4)
  {
  table_top[i,1] <- as.character(subset(data, data$movieId == table_top[i,1])$title)
}

colnames(table_top) <- c("Movie", "Number of items")
head(table_top)
##                                Movie Number of items
## 457             Fugitive, The (1993)              33
## 318 Shawshank Redemption, The (1994)              23
## 223                    Clerks (1994)              20
## 110                Braveheart (1995)              18

Evaluating the models

min(rowCounts(movies_ratings)) 
## [1] 11
percent_training <- 0.8
items_to_keep <- 5 
rating_threshold <- 3 
num_of_eval <- 1 

evaluation_sets <- evaluationScheme(data = movies_ratings, 
                                    method = "split",
                                    train = percent_training, 
                                    given = items_to_keep, 
                                    goodRating = rating_threshold, 
                                    k = num_of_eval) 
evaluation_sets
## Evaluation scheme with 5 items given
## Method: 'split' with 1 run(s).
## Training set proportion: 0.800
## Good ratings: >=3.000000
## Data set: 378 x 436 rating matrix of class 'realRatingMatrix' with 36214 ratings.
getData(evaluation_sets, "train") # training set
## 302 x 436 rating matrix of class 'realRatingMatrix' with 28867 ratings.
getData(evaluation_sets, "known") # set with the items used to build the recommendations
## 76 x 436 rating matrix of class 'realRatingMatrix' with 380 ratings.
getData(evaluation_sets, "unknown") # set with the items used to test the recommendations
## 76 x 436 rating matrix of class 'realRatingMatrix' with 6967 ratings.
qplot(rowCounts(getData(evaluation_sets, "unknown"))) + 
  geom_histogram(binwidth = 10) + 
  ggtitle("Unknown items by the users")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

evaluation_sets <- evaluationScheme(data = movies_ratings, 
                                  method = "bootstrap", 
                                  train = percent_training, 
                                  given = items_to_keep,
                                  goodRating = rating_threshold, 
                                  k = num_of_eval)

table_train <- table(evaluation_sets@runsTrain[[1]])
num_of_repetitions <- factor(as.vector(table_train))
qplot(num_of_repetitions) + 
  ggtitle("Number of repetitions in the training dataset")

num_of_fold <- 4
evaluation_sets <- evaluationScheme(data = movies_ratings, 
                                    method = "cross-validation",
                                    k = num_of_fold, 
                                    given = items_to_keep, 
                                    goodRating = rating_threshold)
size_sets <- sapply(evaluation_sets@runsTrain, length)
size_sets
## [1] 282 282 282 282

Evaluating the rating using k-fold

evaluation_sets <- evaluationScheme(data = movies_ratings, 
                                    method = "cross-validation",
                                    k = num_of_fold, 
                                    given = items_to_keep, 
                                    goodRating = rating_threshold)

evaluation_recommender <- Recommender(data = getData(evaluation_sets, "train"),
                                      method = "IBCF", 
                                      parameter = NULL)

items_to_recommend <- 10
evaluation_prediction <- predict(object = evaluation_recommender, 
                                 newdata = getData(evaluation_sets, "known"), 
                                 n = items_to_recommend, 
                                 type = "ratings")

qplot(rowCounts(evaluation_prediction)) + 
  geom_histogram(binwidth = 10) +
  ggtitle("Distribution of movies per user")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

evaluation_accuracy <- calcPredictionAccuracy(x = evaluation_prediction, 
                                              data = getData(evaluation_sets, "unknown"), 
                                              byUser = FALSE) 
evaluation_accuracy
##     RMSE      MSE      MAE 
## 1.380663 1.906230 1.027218
models_to_evaluate <- list(
IBCF_cos = list(name = "IBCF", 
                param = list(method = "cosine")),
IBCF_cor = list(name = "IBCF", 
                param = list(method = "pearson")),
UBCF_cos = list(name = "UBCF", 
                param = list(method = "cosine")),
UBCF_cor = list(name = "UBCF", 
                param = list(method = "pearson")),
random = list(name = "RANDOM", param=NULL)
)
num_of_recommendations <- c(1, 5, seq(10, 100, 10))
evaluation_list_results <- evaluate(x = evaluation_sets, 
                         method = models_to_evaluate, 
                         n = num_of_recommendations)
## IBCF run fold/sample [model time/prediction time]
##   1  [0.591sec/0.025sec] 
##   2  [0.596sec/0.021sec] 
##   3  [0.599sec/0.019sec] 
##   4  [0.566sec/0.019sec] 
## IBCF run fold/sample [model time/prediction time]
##   1  [0.596sec/0.02sec] 
##   2  [0.593sec/0.019sec] 
##   3  [0.614sec/0.021sec] 
##   4  [0.6sec/0.019sec] 
## UBCF run fold/sample [model time/prediction time]
##   1  [0.003sec/0.089sec] 
##   2  [0.002sec/0.088sec] 
##   3  [0.004sec/0.096sec] 
##   4  [0.003sec/0.087sec] 
## UBCF run fold/sample [model time/prediction time]
##   1  [0.002sec/0.115sec] 
##   2  [0.002sec/0.1sec] 
##   3  [0.002sec/0.098sec] 
##   4  [0.002sec/0.099sec] 
## RANDOM run fold/sample [model time/prediction time]
##   1  [0.001sec/0.029sec] 
##   2  [0.001sec/0.03sec] 
##   3  [0.001sec/0.028sec] 
##   4  [0sec/0.031sec]
sapply(evaluation_list_results, class) == "evaluationResults"
## IBCF_cos IBCF_cor UBCF_cos UBCF_cor   random 
##     TRUE     TRUE     TRUE     TRUE     TRUE
avg_matrices <- lapply(evaluation_list_results, avg)
head(avg_matrices$IBCF_cos[, 5:8])
##    precision      recall         TPR         FPR
## 1  0.2440090 0.003151194 0.003151194 0.002150107
## 5  0.2039948 0.013571678 0.013571678 0.011330510
## 10 0.1885165 0.024610156 0.024610156 0.023110442
## 20 0.1888598 0.046791884 0.046791884 0.045743931
## 30 0.1905377 0.070589940 0.070589940 0.067793227
## 40 0.1877560 0.091071761 0.091071761 0.089952070

Plot the ROC Curve and Precision - recall

plot(evaluation_list_results, annotate = 1, legend = "topleft") 
title("ROC curve")

plot(evaluation_list_results, annotate = 1, legend = "topright")
title("Precision - recall")

It seems UBCF is the better recommender system. It is more accurate than IBCF.