Final Project

For the final project, I will be using the MovieLense dataset since, I have been using it for my previous projects also and I feel familiar with the dataset and its contents.

Dataset

This dataset is available publicily online at https://grouplens.org/datasets/movielens/

I have downloaded the dataset and put it in my github account.

#import relevant libraries
library(ggplot2)
library(recommenderlab)

## Loading required package: Matrix

## Loading required package: arules

## Warning: package 'arules' was built under R version 3.6.2

## 
## Attaching package: 'arules'

## The following objects are masked from 'package:base':
## 
##     abbreviate, write

## Loading required package: proxy

## 
## Attaching package: 'proxy'

## The following object is masked from 'package:Matrix':
## 
##     as.matrix

## The following objects are masked from 'package:stats':
## 
##     as.dist, dist

## The following object is masked from 'package:base':
## 
##     as.matrix

## Loading required package: registry

## Registered S3 methods overwritten by 'registry':
##   method               from 
##   print.registry_field proxy
##   print.registry_entry proxy

library(data.table)

#import required datasets
data <- read.csv("https://raw.githubusercontent.com/maharjansudhan/DATA612/master/ml-latest-small/movies.csv")
head(data)

##   movieId                              title
## 1       1                   Toy Story (1995)
## 2       2                     Jumanji (1995)
## 3       3            Grumpier Old Men (1995)
## 4       4           Waiting to Exhale (1995)
## 5       5 Father of the Bride Part II (1995)
## 6       6                        Heat (1995)
##                                        genres
## 1 Adventure|Animation|Children|Comedy|Fantasy
## 2                  Adventure|Children|Fantasy
## 3                              Comedy|Romance
## 4                        Comedy|Drama|Romance
## 5                                      Comedy
## 6                       Action|Crime|Thriller

ratings <- read.csv("https://raw.githubusercontent.com/maharjansudhan/DATA612/master/ml-latest-small/ratings.csv")
head(ratings)

##   userId movieId rating timestamp
## 1      1       1      4 964982703
## 2      1       3      4 964981247
## 3      1       6      4 964982224
## 4      1      47      5 964983815
## 5      1      50      5 964982931
## 6      1      70      3 964982400

Summary of moives and ratings:

summary(data)

##     movieId                                          title     
##  Min.   :     1   Confessions of a Dangerous Mind (2002):   2  
##  1st Qu.:  3248   Emma (1996)                           :   2  
##  Median :  7300   Eros (2004)                           :   2  
##  Mean   : 42200   Saturn 3 (1980)                       :   2  
##  3rd Qu.: 76232   War of the Worlds (2005)              :   2  
##  Max.   :193609   ¡Three Amigos! (1986)                 :   1  
##                   (Other)                               :9731  
##             genres    
##  Drama         :1053  
##  Comedy        : 946  
##  Comedy|Drama  : 435  
##  Comedy|Romance: 363  
##  Drama|Romance : 349  
##  Documentary   : 339  
##  (Other)       :6257

summary(ratings)

##      userId         movieId           rating        timestamp        
##  Min.   :  1.0   Min.   :     1   Min.   :0.500   Min.   :8.281e+08  
##  1st Qu.:177.0   1st Qu.:  1199   1st Qu.:3.000   1st Qu.:1.019e+09  
##  Median :325.0   Median :  2991   Median :3.500   Median :1.186e+09  
##  Mean   :326.1   Mean   : 19435   Mean   :3.502   Mean   :1.206e+09  
##  3rd Qu.:477.0   3rd Qu.:  8122   3rd Qu.:4.000   3rd Qu.:1.436e+09  
##  Max.   :610.0   Max.   :193609   Max.   :5.000   Max.   :1.538e+09

Extracting Genre of the movie list

Below is the code to crete a matrix of generes for each movie

genre <- as.data.frame(data$genres)

genre2 <- as.data.frame(tstrsplit(genre[,1], '[|]', type.convert=TRUE))
colnames(genre2) <- c(1:10)

genre_list <- c("Action", "Adventure", "Animation", "Children", "Comedy", "Crime","Documentary", "Drama", "Fantasy",
                "Film-Noir", "Horror", "Musical", "Mystery","Romance", "Sci-Fi", "Thriller", "War", "Western") 

genre_matrix <- matrix(0, 9743,18)  #number of movies = 9742, number of genres = 18 
genre_matrix[1,] <- genre_list 
colnames(genre_matrix) <- genre_list  #set the column names to genre list


#let's iterate the matrix 
for (i in 1:nrow(genre2)) {
  for (c in 1:ncol(genre2)) {
    genre_matrix_col = which(genre_matrix[1,] == genre2[i,c])
    genre_matrix[i+1, genre_matrix_col] <- 1
  }
}


#now convert it into dataframe
final_genre_matrix <- as.data.frame(genre_matrix[-1,], stringsAsFactors=FALSE) 
for (c in 1:ncol(final_genre_matrix)) {
  final_genre_matrix[,c] <- as.integer(final_genre_matrix[,c])  #convert to integers
} 

head(final_genre_matrix)

##   Action Adventure Animation Children Comedy Crime Documentary Drama Fantasy
## 1      0         1         1        1      1     0           0     0       1
## 2      0         1         0        1      0     0           0     0       1
## 3      0         0         0        0      1     0           0     0       0
## 4      0         0         0        0      1     0           0     1       0
## 5      0         0         0        0      1     0           0     0       0
## 6      1         0         0        0      0     1           0     0       0
##   Film-Noir Horror Musical Mystery Romance Sci-Fi Thriller War Western
## 1         0      0       0       0       0      0        0   0       0
## 2         0      0       0       0       0      0        0   0       0
## 3         0      0       0       0       1      0        0   0       0
## 4         0      0       0       0       1      0        0   0       0
## 5         0      0       0       0       0      0        0   0       0
## 6         0      0       0       0       0      0        1   0       0

Convert the ratings matrix into sparse matrix

#Create ratings matrix
rating_matrix <- dcast(ratings, userId~movieId, value.var = "rating", na.rm=FALSE)

## Warning in dcast(ratings, userId ~ movieId, value.var = "rating", na.rm =
## FALSE): The dcast generic in data.table has been passed a data.frame and
## will attempt to redirect to the reshape2::dcast; please note that reshape2
## is deprecated, and this redirection is now deprecated as well. Please do this
## redirection yourself like reshape2::dcast(ratings). In the next version, this
## warning will become an error.

rating_matrix <- as.matrix(rating_matrix[,-1]) 

#Convert ratings matrix into sparse matrix
rating_matrix <- as(rating_matrix, "realRatingMatrix")
rating_matrix

## 610 x 9724 rating matrix of class 'realRatingMatrix' with 100836 ratings.

#exploring recommender_models
recommender_model <- recommenderRegistry$get_entries(dataType = "realRatingMatrix")
recommender_model

## $ALS_realRatingMatrix
## Recommender method: ALS for realRatingMatrix
## Description: Recommender for explicit ratings based on latent factors, calculated by alternating least squares algorithm.
## Reference: Yunhong Zhou, Dennis Wilkinson, Robert Schreiber, Rong Pan (2008). Large-Scale Parallel Collaborative Filtering for the Netflix Prize, 4th Int'l Conf. Algorithmic Aspects in Information and Management, LNCS 5034.
## Parameters:
##   normalize lambda n_factors n_iterations min_item_nr seed
## 1      NULL    0.1        10           10           1 NULL
## 
## $ALS_implicit_realRatingMatrix
## Recommender method: ALS_implicit for realRatingMatrix
## Description: Recommender for implicit data based on latent factors, calculated by alternating least squares algorithm.
## Reference: Yifan Hu, Yehuda Koren, Chris Volinsky (2008). Collaborative Filtering for Implicit Feedback Datasets, ICDM '08 Proceedings of the 2008 Eighth IEEE International Conference on Data Mining, pages 263-272.
## Parameters:
##   lambda alpha n_factors n_iterations min_item_nr seed
## 1    0.1    10        10           10           1 NULL
## 
## $IBCF_realRatingMatrix
## Recommender method: IBCF for realRatingMatrix
## Description: Recommender based on item-based collaborative filtering.
## Reference: NA
## Parameters:
##    k   method normalize normalize_sim_matrix alpha na_as_zero
## 1 30 "Cosine"  "center"                FALSE   0.5      FALSE
## 
## $LIBMF_realRatingMatrix
## Recommender method: LIBMF for realRatingMatrix
## Description: Matrix factorization with LIBMF via package recosystem (https://cran.r-project.org/web/packages/recosystem/vignettes/introduction.html).
## Reference: NA
## Parameters:
##   dim costp_l2 costq_l2 nthread
## 1  10     0.01     0.01       1
## 
## $POPULAR_realRatingMatrix
## Recommender method: POPULAR for realRatingMatrix
## Description: Recommender based on item popularity.
## Reference: NA
## Parameters:
##   normalize
## 1  "center"
##                                                      aggregationRatings
## 1 new("standardGeneric", .Data = function (x, na.rm = FALSE, dims = 1, 
##                                                   aggregationPopularity
## 1 new("standardGeneric", .Data = function (x, na.rm = FALSE, dims = 1, 
## 
## $RANDOM_realRatingMatrix
## Recommender method: RANDOM for realRatingMatrix
## Description: Produce random recommendations (real ratings).
## Reference: NA
## Parameters: None
## 
## $RERECOMMEND_realRatingMatrix
## Recommender method: RERECOMMEND for realRatingMatrix
## Description: Re-recommends highly rated items (real ratings).
## Reference: NA
## Parameters:
##   randomize minRating
## 1         1        NA
## 
## $SVD_realRatingMatrix
## Recommender method: SVD for realRatingMatrix
## Description: Recommender based on SVD approximation with column-mean imputation.
## Reference: NA
## Parameters:
##    k maxiter normalize
## 1 10     100  "center"
## 
## $SVDF_realRatingMatrix
## Recommender method: SVDF for realRatingMatrix
## Description: Recommender based on Funk SVD with gradient descend (https://sifter.org/~simon/journal/20061211.html).
## Reference: NA
## Parameters:
##    k gamma lambda min_epochs max_epochs min_improvement normalize verbose
## 1 10 0.015  0.001         50        200           1e-06  "center"   FALSE
## 
## $UBCF_realRatingMatrix
## Recommender method: UBCF for realRatingMatrix
## Description: Recommender based on user-based collaborative filtering.
## Reference: NA
## Parameters:
##     method nn sample normalize
## 1 "cosine" 25  FALSE  "center"

#So many models but I will be focusing only on IBCF and UBCF
recommender_model$IBCF_realRatingMatrix$parameters

## $k
## [1] 30
## 
## $method
## [1] "Cosine"
## 
## $normalize
## [1] "center"
## 
## $normalize_sim_matrix
## [1] FALSE
## 
## $alpha
## [1] 0.5
## 
## $na_as_zero
## [1] FALSE

Collaborative Filtering Algorithm

In order to see the similarity between users or between items, we can use collaboratice filtering algorithm.

Let’s create the matrix and visualize it.

similarity_betn_users <- as.matrix(similarity(rating_matrix[1:4, ], method="cosine", which="users"))
similarity_betn_users

##           1  2         3         4
## 1 0.0000000  1 0.7919033 0.9328096
## 2 1.0000000  0        NA 1.0000000
## 3 0.7919033 NA 0.0000000 1.0000000
## 4 0.9328096  1 1.0000000 0.0000000

image(similarity_betn_users, main = "Similarity between Users")

Similarly let’s do the same for the items.

similarity_betn_items <- as.matrix(similarity(rating_matrix[, 1:4], method="cosine", which="items"))
similarity_betn_items

##           1         2         3         4
## 1 0.0000000 0.9644641 0.9715415 0.9838699
## 2 0.9644641 0.0000000 0.9389013 0.9609877
## 3 0.9715415 0.9389013 0.0000000 1.0000000
## 4 0.9838699 0.9609877 1.0000000 0.0000000

image(similarity_betn_items, main = "Similarity between Items")

Now, we need to explore values of ratings

vector_ratings <- as.vector(rating_matrix@data)
unique(vector_ratings)

##  [1] 4.0 0.0 4.5 2.5 3.5 3.0 5.0 0.5 2.0 1.5 1.0

#Let's omit a missing values, 0 represents a missing value
vector_ratings <- vector_ratings[vector_ratings != 0] 
vector_ratings <- factor(vector_ratings)

#let's plot the dataset
qplot(vector_ratings) + 
  ggtitle("Distribution of the ratings")

According to the graph, the ratings are mostly between 3 to 5. There are some ratings between 0.5 to 2.5 but we can say that mostly people prefer rating 3 for dislike just to be on safe side.

Calculate the most viewed movies

#to calculate the most viewed movies

# count views for each movie
views_per_movie <- colCounts(rating_matrix) 

# create new dataframe
table_views <- data.frame(movie_Num = names(views_per_movie), Views = views_per_movie) 

# sort by Views
table_views <- table_views[order(table_views$Views, decreasing = TRUE), ] 
table_views$title <- NA

for (i in 1:9724)
  {
  table_views[i,3] <- as.character(subset(data, data$movieId == table_views[i,1])$title)
  }

table_views[1:6,]

##      movie_Num Views                                     title
## 356        356   329                       Forrest Gump (1994)
## 318        318   317          Shawshank Redemption, The (1994)
## 296        296   307                       Pulp Fiction (1994)
## 593        593   279          Silence of the Lambs, The (1991)
## 2571      2571   278                        Matrix, The (1999)
## 260        260   251 Star Wars: Episode IV - A New Hope (1977)

#let's plot top viewed movies
ggplot(table_views[1:6, ], 
       aes(x = title, y = Views)) +
  geom_bar(stat="identity") + 
  theme(axis.text.x = element_text(angle = 60,hjust=1)) + 
  ggtitle("Top movies -- total views")

Forrest Gump is the movie that has the highest views with 329 views.

Distribution of movies average ratings

average_ratings <- colMeans(rating_matrix)

qplot(average_ratings) + 
  stat_bin(binwidth = 0.1) +
  ggtitle("Distribution of the movies average ratings")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Distribution of items average ratings

average_ratings_relevant <- average_ratings[views_per_movie > 50] 
qplot(average_ratings_relevant) + 
  stat_bin(binwidth = 0.1) +
  ggtitle(paste("Distribution of the relevant average ratings"))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Data Preparation

Let’s select the data and normalize it for further computation.

#Here, I have standardize the row and columns value by greater than 50 to make the data meaningful.
movies_ratings <- rating_matrix[rowCounts(rating_matrix) > 50, colCounts(rating_matrix) > 50]
movies_ratings

## 378 x 436 rating matrix of class 'realRatingMatrix' with 36214 ratings.

Now we have only 378 users and 436 movies.

##here

Using this approach, I have put a top 5 percent of movies and users in a heatmap below:

minimum_movies <- quantile(rowCounts(movies_ratings), 0.95)
minimum_users <- quantile(colCounts(movies_ratings), 0.95)
image(movies_ratings[rowCounts(movies_ratings) > minimum_movies,
                     colCounts(movies_ratings) > minimum_users], 
main = "Heatmap of the top users and movies",xlab='Movies',ylab='Users')

#normalize the dataset
normalized_movies_ratings <- normalize(movies_ratings)
sum(rowMeans(normalized_movies_ratings) > 0.00001)

## [1] 0

#visualize the normalized matrix of the top user and movies
image(normalized_movies_ratings[rowCounts(normalized_movies_ratings) > minimum_movies,
                          colCounts(normalized_movies_ratings) > minimum_users], 
main = "Heatmap of the top users and movies", xlab = 'Movies', ylab='Users')

IBCF Model

Building the model

#building a dataset with 80-20 ratio
build_dataset <- sample(x = c(TRUE, FALSE), size = nrow(movies_ratings),replace = TRUE, prob = c(0.8, 0.2))

data_train <- movies_ratings[build_dataset, ]
data_test <- movies_ratings[!build_dataset, ]

#train dataset, "IBCF"
recommender_model <- Recommender(data = data_train, method = "IBCF")

recommender_model

## Recommender of type 'IBCF' for 'realRatingMatrix' 
## learned using 305 users.

#explore the model
recommender_model_details <- getModel(recommender_model)

class(recommender_model_details$sim)

## [1] "dgCMatrix"
## attr(,"package")
## [1] "Matrix"

# number of items to recommend to each user
num_recommended <- 10 

recommender_predicted <- predict(object = recommender_model, 
                          newdata = data_test, 
                          n = num_recommended)
recommender_predicted

## Recommendations as 'topNList' with n = 10 for 73 users.

# recommendation for the first user
recommender_user_1 <- recommender_predicted@items[[1]] 
movies_user_1 <- recommender_predicted@itemLabels[recommender_user_1]
movies_user_2 <- movies_user_1
for (i in 1:10)
  {
  movies_user_2[i] <- as.character(subset(data, data$movieId == movies_user_1[i])$title)
  }
movies_user_2

##  [1] "Get Shorty (1995)"                                  
##  [2] "Judge Dredd (1995)"                                 
##  [3] "Disclosure (1994)"                                  
##  [4] "Natural Born Killers (1994)"                        
##  [5] "Four Weddings and a Funeral (1994)"                 
##  [6] "Maverick (1994)"                                    
##  [7] "City Slickers II: The Legend of Curly's Gold (1994)"
##  [8] "Cliffhanger (1993)"                                 
##  [9] "Ghost (1990)"                                       
## [10] "Pretty Woman (1990)"

# matrix with the recommendations for each user
recommender_matrix <- sapply(recommender_predicted@items, 
                      function(x){ as.integer(colnames(movies_ratings)[x]) }) 
recommender_matrix[,1:4]

##       [,1]  [,2]  [,3] [,4]
##  [1,]   21  2174  3033    3
##  [2,]  173  2716  1247   16
##  [3,]  225  3751  6016   19
##  [4,]  288  4027  5299   36
##  [5,]  357 48385 56367  104
##  [6,]  368 68157  3081  223
##  [7,]  432 76093  3671  300
##  [8,]  434 30707  3481  316
##  [9,]  587  1073   300  317
## [10,]  597  8644  4776  318

num_of_items <- factor(table(recommender_matrix))

qplot(num_of_items) + 
  ggtitle("Distribution for IBCF")

num_of_items_sorted <- sort(num_of_items, decreasing = TRUE)
num_of_items_top_movies <- head(num_of_items_sorted, n = 4)
table_top <- data.frame(as.integer(names(num_of_items_top_movies)), num_of_items_top_movies)

for (i in 1:4)
  {
  table_top[i,1] <- as.character(subset(data, data$movieId == table_top[i,1])$title)
}

colnames(table_top) <- c("Movie", "Number of items")
head(table_top)

##                        Movie Number of items
## 2470 Crocodile Dundee (1986)              11
## 10          GoldenEye (1995)               9
## 3    Grumpier Old Men (1995)               8
## 16             Casino (1995)               8

UBCF

Building the model

recommender_model <- recommenderRegistry$get_entries(dataType ="realRatingMatrix")
recommender_model$UBCF_realRatingMatrix$parameters

## $method
## [1] "cosine"
## 
## $nn
## [1] 25
## 
## $sample
## [1] FALSE
## 
## $normalize
## [1] "center"

#train dataset, "UBCF"
recommender_model <- Recommender(data = data_train, method = "UBCF")

recommender_model

## Recommender of type 'UBCF' for 'realRatingMatrix' 
## learned using 305 users.

recommeder_model_details <- getModel(recommender_model)

class(recommender_model_details$sim)

## [1] "dgCMatrix"
## attr(,"package")
## [1] "Matrix"

num_recommended <- 10 # number of items to recommend to each user

recommender_predicted <- predict(object = recommender_model, newdata = data_test, n = num_recommended)
recommender_predicted

## Recommendations as 'topNList' with n = 10 for 73 users.

recommender_user_1 <- recommender_predicted@items[[1]] # recommendation for the first user
movies_user_1 <- recommender_predicted@itemLabels[recommender_user_1]
movies_user_2 <- movies_user_1
for (i in 1:10)
  {
  movies_user_2[i] <- as.character(subset(data, data$movieId == movies_user_1[i])$title)
}
movies_user_2

##  [1] "Rear Window (1954)"                                            
##  [2] "Lord of the Rings: The Return of the King, The (2003)"         
##  [3] "Incredibles, The (2004)"                                       
##  [4] "Godfather, The (1972)"                                         
##  [5] "Shawshank Redemption, The (1994)"                              
##  [6] "Beautiful Mind, A (2001)"                                      
##  [7] "Léon: The Professional (a.k.a. The Professional) (Léon) (1994)"
##  [8] "One Flew Over the Cuckoo's Nest (1975)"                        
##  [9] "Lord of the Rings: The Fellowship of the Ring, The (2001)"     
## [10] "Spirited Away (Sen to Chihiro no kamikakushi) (2001)"

# matrix with the recommendations for each user
recommender_matrix <- sapply(recommender_predicted@items, 
                      function(x){ as.integer(colnames(movies_ratings)[x]) }) 
recommender_matrix[,1:4]

##       [,1] [,2] [,3]  [,4]
##  [1,]  904  318  593   318
##  [2,] 7153  457  318 58559
##  [3,] 8961  908  457  5952
##  [4,]  858  253    6  4993
##  [5,]  318  590 1617   904
##  [6,] 4995 1089 1732 68954
##  [7,]  293  608 2028  7153
##  [8,] 1193  527  150 60069
##  [9,] 4993  904 1089   356
## [10,] 5618   34 1197  6711

num_of_items <- factor(table(recommender_matrix))

qplot(num_of_items) + 
  ggtitle("Distribution for IBCF")

num_of_items_sorted <- sort(num_of_items, decreasing = TRUE)
num_of_items_top_movies <- head(num_of_items_sorted, n = 4)
table_top <- data.frame(as.integer(names(num_of_items_top_movies)), num_of_items_top_movies)

for (i in 1:4)
  {
  table_top[i,1] <- as.character(subset(data, data$movieId == table_top[i,1])$title)
}

colnames(table_top) <- c("Movie", "Number of items")
head(table_top)

##                                Movie Number of items
## 457             Fugitive, The (1993)              33
## 318 Shawshank Redemption, The (1994)              23
## 223                    Clerks (1994)              20
## 110                Braveheart (1995)              18

Evaluating the models

min(rowCounts(movies_ratings))

## [1] 11

percent_training <- 0.8
items_to_keep <- 5 
rating_threshold <- 3 
num_of_eval <- 1 

evaluation_sets <- evaluationScheme(data = movies_ratings, 
                                    method = "split",
                                    train = percent_training, 
                                    given = items_to_keep, 
                                    goodRating = rating_threshold, 
                                    k = num_of_eval) 
evaluation_sets

## Evaluation scheme with 5 items given
## Method: 'split' with 1 run(s).
## Training set proportion: 0.800
## Good ratings: >=3.000000
## Data set: 378 x 436 rating matrix of class 'realRatingMatrix' with 36214 ratings.

getData(evaluation_sets, "train") # training set

## 302 x 436 rating matrix of class 'realRatingMatrix' with 28867 ratings.

getData(evaluation_sets, "known") # set with the items used to build the recommendations

## 76 x 436 rating matrix of class 'realRatingMatrix' with 380 ratings.

getData(evaluation_sets, "unknown") # set with the items used to test the recommendations

## 76 x 436 rating matrix of class 'realRatingMatrix' with 6967 ratings.

qplot(rowCounts(getData(evaluation_sets, "unknown"))) + 
  geom_histogram(binwidth = 10) + 
  ggtitle("Unknown items by the users")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

evaluation_sets <- evaluationScheme(data = movies_ratings, 
                                  method = "bootstrap", 
                                  train = percent_training, 
                                  given = items_to_keep,
                                  goodRating = rating_threshold, 
                                  k = num_of_eval)

table_train <- table(evaluation_sets@runsTrain[[1]])
num_of_repetitions <- factor(as.vector(table_train))
qplot(num_of_repetitions) + 
  ggtitle("Number of repetitions in the training dataset")

num_of_fold <- 4
evaluation_sets <- evaluationScheme(data = movies_ratings, 
                                    method = "cross-validation",
                                    k = num_of_fold, 
                                    given = items_to_keep, 
                                    goodRating = rating_threshold)
size_sets <- sapply(evaluation_sets@runsTrain, length)
size_sets

## [1] 282 282 282 282

Evaluating the rating using k-fold

evaluation_sets <- evaluationScheme(data = movies_ratings, 
                                    method = "cross-validation",
                                    k = num_of_fold, 
                                    given = items_to_keep, 
                                    goodRating = rating_threshold)

evaluation_recommender <- Recommender(data = getData(evaluation_sets, "train"),
                                      method = "IBCF", 
                                      parameter = NULL)

items_to_recommend <- 10
evaluation_prediction <- predict(object = evaluation_recommender, 
                                 newdata = getData(evaluation_sets, "known"), 
                                 n = items_to_recommend, 
                                 type = "ratings")

qplot(rowCounts(evaluation_prediction)) + 
  geom_histogram(binwidth = 10) +
  ggtitle("Distribution of movies per user")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

evaluation_accuracy <- calcPredictionAccuracy(x = evaluation_prediction, 
                                              data = getData(evaluation_sets, "unknown"), 
                                              byUser = FALSE) 
evaluation_accuracy

##     RMSE      MSE      MAE 
## 1.380663 1.906230 1.027218

models_to_evaluate <- list(
IBCF_cos = list(name = "IBCF", 
                param = list(method = "cosine")),
IBCF_cor = list(name = "IBCF", 
                param = list(method = "pearson")),
UBCF_cos = list(name = "UBCF", 
                param = list(method = "cosine")),
UBCF_cor = list(name = "UBCF", 
                param = list(method = "pearson")),
random = list(name = "RANDOM", param=NULL)
)

num_of_recommendations <- c(1, 5, seq(10, 100, 10))
evaluation_list_results <- evaluate(x = evaluation_sets, 
                         method = models_to_evaluate, 
                         n = num_of_recommendations)

## IBCF run fold/sample [model time/prediction time]
##   1  [0.591sec/0.025sec] 
##   2  [0.596sec/0.021sec] 
##   3  [0.599sec/0.019sec] 
##   4  [0.566sec/0.019sec] 
## IBCF run fold/sample [model time/prediction time]
##   1  [0.596sec/0.02sec] 
##   2  [0.593sec/0.019sec] 
##   3  [0.614sec/0.021sec] 
##   4  [0.6sec/0.019sec] 
## UBCF run fold/sample [model time/prediction time]
##   1  [0.003sec/0.089sec] 
##   2  [0.002sec/0.088sec] 
##   3  [0.004sec/0.096sec] 
##   4  [0.003sec/0.087sec] 
## UBCF run fold/sample [model time/prediction time]
##   1  [0.002sec/0.115sec] 
##   2  [0.002sec/0.1sec] 
##   3  [0.002sec/0.098sec] 
##   4  [0.002sec/0.099sec] 
## RANDOM run fold/sample [model time/prediction time]
##   1  [0.001sec/0.029sec] 
##   2  [0.001sec/0.03sec] 
##   3  [0.001sec/0.028sec] 
##   4  [0sec/0.031sec]

sapply(evaluation_list_results, class) == "evaluationResults"

## IBCF_cos IBCF_cor UBCF_cos UBCF_cor   random 
##     TRUE     TRUE     TRUE     TRUE     TRUE

avg_matrices <- lapply(evaluation_list_results, avg)
head(avg_matrices$IBCF_cos[, 5:8])

##    precision      recall         TPR         FPR
## 1  0.2440090 0.003151194 0.003151194 0.002150107
## 5  0.2039948 0.013571678 0.013571678 0.011330510
## 10 0.1885165 0.024610156 0.024610156 0.023110442
## 20 0.1888598 0.046791884 0.046791884 0.045743931
## 30 0.1905377 0.070589940 0.070589940 0.067793227
## 40 0.1877560 0.091071761 0.091071761 0.089952070

Plot the ROC Curve and Precision - recall

plot(evaluation_list_results, annotate = 1, legend = "topleft") 
title("ROC curve")

plot(evaluation_list_results, annotate = 1, legend = "topright")
title("Precision - recall")

It seems UBCF is the better recommender system. It is more accurate than IBCF.

DATA612_final_project

Sudhan Maharjan

5/19/2020

Final Project

Dataset

Summary of moives and ratings:

Extracting Genre of the movie list

Convert the ratings matrix into sparse matrix

Collaborative Filtering Algorithm

Calculate the most viewed movies

Distribution of movies average ratings

Distribution of items average ratings

Data Preparation

IBCF Model

Building the model

UBCF

Building the model

Evaluating the models

Evaluating the rating using k-fold

Plot the ROC Curve and Precision - recall