Load the libraries

library(recommenderlab)  # Matrix/recommender functions
## Warning: package 'recommenderlab' was built under R version 3.6.3
## Loading required package: Matrix
## Loading required package: arules
## Warning: package 'arules' was built under R version 3.6.3
## 
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
## 
##     abbreviate, write
## Loading required package: proxy
## Warning: package 'proxy' was built under R version 3.6.3
## 
## Attaching package: 'proxy'
## The following object is masked from 'package:Matrix':
## 
##     as.matrix
## The following objects are masked from 'package:stats':
## 
##     as.dist, dist
## The following object is masked from 'package:base':
## 
##     as.matrix
## Loading required package: registry
## Registered S3 methods overwritten by 'registry':
##   method               from 
##   print.registry_field proxy
##   print.registry_entry proxy
library(dplyr)  # Data manipulation
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:arules':
## 
##     intersect, recode, setdiff, setequal, union
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)  # Data manipulation
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:Matrix':
## 
##     expand
library(ggplot2)  # Plotting
library(tictoc)  # Operation timing
set.seed(42)

Subsetting Dataset

Now let’s see the process of reducing the data from the main dataset

Import original file and select sample for project

ratings <- read.csv('C:\\DATA612\\DATA612ASSINGMENTS-master\\ml-20m\\ratings.csv')
ratings <- ratings[1:600000,]

Explore

head(ratings)
##   userId movieId rating  timestamp
## 1      1       2    3.5 1112486027
## 2      1      29    3.5 1112484676
## 3      1      32    3.5 1112484819
## 4      1      47    3.5 1112484727
## 5      1      50    3.5 1112484580
## 6      1     112    3.5 1094785740
hist(ratings$rating)

Convert to realRatingMatrix

ratingsMatrix <- sparseMatrix(as.integer(ratings$userId), as.integer(ratings$movieId),  x = ratings$rating)
colnames(ratingsMatrix) <- levels(ratings$movieId)
rownames(ratingsMatrix) <- levels(ratings$userId)
mratings <- as(ratingsMatrix, "realRatingMatrix")

Explore

mratings
## 4022 x 130642 rating matrix of class 'realRatingMatrix' with 600000 ratings.
hist(rowCounts(mratings))

#table(rowCounts(mratings))
hist(colCounts(mratings))

#table(colCounts(mratings))

Select Subset 1 and Subset 2

(ratingShort <- mratings[rowCounts(mratings) > 90, colCounts(mratings) > 60])
## 1688 x 2047 rating matrix of class 'realRatingMatrix' with 392641 ratings.

Step-6) Check and Remove Empty Lines

ratingShort <- mratings[, colCounts(mratings) > 60]
ratingShort <- ratingShort[rowCounts(ratingShort) > 90, ]
ratingShort
## 1552 x 2047 rating matrix of class 'realRatingMatrix' with 381907 ratings.
#table(rowCounts(ratingShort))
#table(colCounts(ratingShort))
(ratings_movies <- ratingShort[, colCounts(ratingShort) != 0])
## 1552 x 2047 rating matrix of class 'realRatingMatrix' with 381907 ratings.

Evaluating Recommender Techniques

Evaluating the model

Preparing the data for validation

n_fold <- 4
items_to_keep <- 25
rating_threshold <- 3
eval_sets <- evaluationScheme(data = ratings_movies, method = "cross-validation", k = n_fold,
                              given = items_to_keep, goodRating = rating_threshold)

Train and Test sets

Train/test split

Split the dataset into test and train sets to build the model.

train <- getData(eval_sets, "train")
known <- getData(eval_sets, "known")
unknown <- getData(eval_sets, "unknown")

Evaluate the item-based collaborative filtering recommender

model_to_evaluate <- "IBCF"
model_parameters <- NULL

Build the model

eval_recommender <- Recommender(data = getData(eval_sets,"train"), method = model_to_evaluate,
                                parameter = model_parameters)

Specify the number of items to recommend

items_to_recommend <- 10

Build the matrix with the predicted ratings

eval_prediction <- predict(object = eval_recommender, newdata = getData(eval_sets, "known"),
                           n = items_to_recommend, type = "ratings")
class(eval_prediction)
## [1] "realRatingMatrix"
## attr(,"package")
## [1] "recommenderlab"

Number of movies recommend to each user

qplot(rowCounts(eval_prediction)) + geom_histogram(binwidth = 50) +
  ggtitle("Distribution of movies per users")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Metrics that mesure the accuracy:

  • Root mean square error(RMSE)
  • Mean squared error (MSE)
  • Mean absollute error(MAE) #### Metrics by user
eval_accuracy <- calcPredictionAccuracy(x = eval_prediction, data = getData(eval_sets, "unknown"),
                                        byUser = TRUE)
head(eval_accuracy)
##           RMSE      MSE       MAE
## [1,] 0.0000000 0.000000 0.0000000
## [2,] 1.7793729 3.166168 1.5728125
## [3,] 1.1649647 1.357143 0.9285714
## [4,] 2.8577380 8.166667 2.8333333
## [5,] 0.7071068 0.500000 0.5000000
## [6,] 0.0000000 0.000000 0.0000000

Distribution of RMSE by user

qplot(eval_accuracy[,"RMSE"]) + geom_histogram(binwidth = 0.1) +
  ggtitle("Distribution of the RMSE by user")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing non-finite values (stat_bin).

## Warning: Removed 1 rows containing non-finite values (stat_bin).

Accuracy of the whole model

calcPredictionAccuracy(x = eval_prediction, data = getData(eval_sets, "unknown"),
                                        byUser = FALSE)
##     RMSE      MSE      MAE 
## 1.495287 2.235883 1.114363

Evaluation by Recommendation

Comparing the recommendations with the purchases having positive rating

results <- evaluate(x = eval_sets, method = model_to_evaluate, n = seq(10,100,10))
## IBCF run fold/sample [model time/prediction time]
##   1  [50.39sec/0.17sec] 
##   2  [48.92sec/0.18sec] 
##   3  [50.09sec/0.14sec] 
##   4  [49.27sec/0.22sec]
class(results)
## [1] "evaluationResults"
## attr(,"package")
## [1] "recommenderlab"

Build the ROC curve

plot(results, annotate = TRUE, main = "ROC curve")

Accuracy metrics Precision and Recall

plot(results, "prec/rec", annotate = TRUE, main = "Precision-Recall")

Compare the models

models_to_evaluate <- list(IBCF_cos = list(name = "IBCF", param = list(method = "cosine")),
                           IBCF_cor = list(name = "IBCF", param = list(method = "pearson")),
                           UBCF_cos = list(name = "UBCF", param = list(method = "cosine")),
                           UBCF_cor = list(name = "UBCF", param = list(method = "pearson")),
                           random = list(name = "RANDOM", param = NULL))

Number of the recommendations items to test the models

n_recommendations <- c(1, 5, seq(10, 100,10))

Run and evaluate the model

list_results <- evaluate(x = eval_sets, method = models_to_evaluate, n = n_recommendations)
## IBCF run fold/sample [model time/prediction time]
##   1  [47.24sec/0.19sec] 
##   2  [46.26sec/0.14sec] 
##   3  [47.55sec/0.15sec] 
##   4  [47.74sec/0.39sec] 
## IBCF run fold/sample [model time/prediction time]
##   1  [34.23sec/0.18sec] 
##   2  [34.04sec/0.19sec] 
##   3  [33.77sec/0.14sec] 
##   4  [33.38sec/0.17sec] 
## UBCF run fold/sample [model time/prediction time]
##   1
## Timing stopped at: 7.96 0.07 8.22
## Error in validObject(.Object) : 
##   invalid class "topNList" object: invalid object for slot "itemLabels" in class "topNList": got class "NULL", should be or extend class "character"
## UBCF run fold/sample [model time/prediction time]
##   1
## Timing stopped at: 4.26 0.08 4.44
## Error in validObject(.Object) : 
##   invalid class "topNList" object: invalid object for slot "itemLabels" in class "topNList": got class "NULL", should be or extend class "character"
## RANDOM run fold/sample [model time/prediction time]
##   1
## Timing stopped at: 0 0 0
## Error in object@predict(object@model, newdata, n = n, data = data, type = type,  : 
##   number of items in newdata does not match model.
## Warning in .local(x, method, ...): 
##   Recommender 'UBCF_cos' has failed and has been removed from the results!
##   Recommender 'UBCF_cor' has failed and has been removed from the results!
##   Recommender 'random' has failed and has been removed from the results!
class(list_results)
## [1] "evaluationResultList"
## attr(,"package")
## [1] "recommenderlab"

Identify the most suitable model

Bild the chart displaying ROC curves

plot(list_results, annotate = 1, legend = "topleft")
title("ROC curves")

UBCF with pearson correlation has the better Area Under the Curve(AUC). It is the best-performing technique.

Precision-Recall chart

plot(list_results, "prec/rec", annotate = 1, legend = "bottomright")
title("Precision-Recall")