library(recommenderlab) # Matrix/recommender functions
## Warning: package 'recommenderlab' was built under R version 3.6.3
## Loading required package: Matrix
## Loading required package: arules
## Warning: package 'arules' was built under R version 3.6.3
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
## Loading required package: proxy
## Warning: package 'proxy' was built under R version 3.6.3
##
## Attaching package: 'proxy'
## The following object is masked from 'package:Matrix':
##
## as.matrix
## The following objects are masked from 'package:stats':
##
## as.dist, dist
## The following object is masked from 'package:base':
##
## as.matrix
## Loading required package: registry
## Registered S3 methods overwritten by 'registry':
## method from
## print.registry_field proxy
## print.registry_entry proxy
library(dplyr) # Data manipulation
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:arules':
##
## intersect, recode, setdiff, setequal, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr) # Data manipulation
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:Matrix':
##
## expand
library(ggplot2) # Plotting
library(tictoc) # Operation timing
set.seed(42)
Now let's walk through the process of reducing the data from the main dataset.
Import the original file and select a sample for the project.
ratings <- read.csv('C:\\DATA612\\DATA612ASSINGMENTS-master\\ml-20m\\ratings.csv')
ratings <- ratings[1:600000,]
Explore
head(ratings)
## userId movieId rating timestamp
## 1 1 2 3.5 1112486027
## 2 1 29 3.5 1112484676
## 3 1 32 3.5 1112484819
## 4 1 47 3.5 1112484727
## 5 1 50 3.5 1112484580
## 6 1 112 3.5 1094785740
hist(ratings$rating)
Convert to realRatingMatrix
ratingsMatrix <- sparseMatrix(as.integer(ratings$userId), as.integer(ratings$movieId), x = ratings$rating)
colnames(ratingsMatrix) <- levels(ratings$movieId)
rownames(ratingsMatrix) <- levels(ratings$userId)
mratings <- as(ratingsMatrix, "realRatingMatrix")
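A caveat before moving on: userId and movieId come out of read.csv as plain integer columns, so levels() returns NULL and the two dimname assignments above are silent no-ops, leaving the matrix without row or column labels. This is most likely why the UBCF and RANDOM recommenders fail later with the "itemLabels ... got class NULL" error during model comparison. A minimal labeling sketch, assuming row i corresponds to userId i and column j to movieId j (the raw IDs were used directly as matrix indices):
# Label rows and columns explicitly; levels() is NULL for integer columns.
dimnames(ratingsMatrix) <- list(as.character(seq_len(nrow(ratingsMatrix))),
                                as.character(seq_len(ncol(ratingsMatrix))))
mratings <- as(ratingsMatrix, "realRatingMatrix")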
Explore
mratings
## 4022 x 130642 rating matrix of class 'realRatingMatrix' with 600000 ratings.
hist(rowCounts(mratings))
#table(rowCounts(mratings))
hist(colCounts(mratings))
#table(colCounts(mratings))
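The raw matrix is extremely sparse, which motivates the filtering below; a quick check of its density:
# Fraction of cells holding a rating: 600,000 of 4022 x 130642, about 0.1%.
sum(rowCounts(mratings)) / (nrow(mratings) * ncol(mratings))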
Select a denser subset: users with more than 90 ratings and movies with more than 60 ratings.
(ratingShort <- mratings[rowCounts(mratings) > 90, colCounts(mratings) > 60])
## 1688 x 2047 rating matrix of class 'realRatingMatrix' with 392641 ratings.
Check for and remove empty rows: apply the filters sequentially, filtering the columns first and then re-counting the rows on the reduced matrix.
ratingShort <- mratings[, colCounts(mratings) > 60]
ratingShort <- ratingShort[rowCounts(ratingShort) > 90, ]
ratingShort
## 1552 x 2047 rating matrix of class 'realRatingMatrix' with 381907 ratings.
#table(rowCounts(ratingShort))
#table(colCounts(ratingShort))
(ratings_movies <- ratingShort[, colCounts(ratingShort) != 0])
## 1552 x 2047 rating matrix of class 'realRatingMatrix' with 381907 ratings.
n_fold <- 4 # 4-fold cross-validation
items_to_keep <- 25 # ratings per test user given to the recommender
rating_threshold <- 3 # ratings of 3 or above count as positive
eval_sets <- evaluationScheme(data = ratings_movies, method = "cross-validation", k = n_fold,
given = items_to_keep, goodRating = rating_threshold)
Split the dataset into train, known, and unknown sets to build and evaluate the model.
train <- getData(eval_sets, "train")
known <- getData(eval_sets, "known")
unknown <- getData(eval_sets, "unknown")
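A quick sanity check on the scheme: with 4-fold cross-validation, three quarters of the users land in the training set; each test user contributes 25 "known" ratings that are fed to the recommender, while the rest are held back as "unknown" for scoring.
# Dimensions (users x items) of the three pieces.
dim(train)
dim(known)
dim(unknown)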
model_to_evaluate <- "IBCF" # item-based collaborative filtering
model_parameters <- NULL # use the recommenderlab defaults
Build the model
eval_recommender <- Recommender(data = getData(eval_sets,"train"), method = model_to_evaluate,
parameter = model_parameters)
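Before predicting, it can help to peek inside the fitted object. A short sketch, assuming the recommenderlab defaults for IBCF (cosine similarity, k = 30 neighbours):
# The IBCF model stores a k-nearest-neighbour item-item similarity matrix.
model_details <- getModel(eval_recommender)
model_details$k # neighbourhood size
dim(model_details$sim) # items x items similarity matrix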
Specify the number of items to recommend
items_to_recommend <- 10
Build the matrix with the predicted ratings
eval_prediction <- predict(object = eval_recommender, newdata = getData(eval_sets, "known"),
n = items_to_recommend, type = "ratings")
class(eval_prediction)
## [1] "realRatingMatrix"
## attr(,"package")
## [1] "recommenderlab"
Number of movies recommended to each user
qplot(rowCounts(eval_prediction), binwidth = 50) +
ggtitle("Distribution of movies per user")
eval_accuracy <- calcPredictionAccuracy(x = eval_prediction, data = getData(eval_sets, "unknown"),
byUser = TRUE)
head(eval_accuracy)
## RMSE MSE MAE
## [1,] 0.0000000 0.000000 0.0000000
## [2,] 1.7793729 3.166168 1.5728125
## [3,] 1.1649647 1.357143 0.9285714
## [4,] 2.8577380 8.166667 2.8333333
## [5,] 0.7071068 0.500000 0.5000000
## [6,] 0.0000000 0.000000 0.0000000
Distribution of RMSE by user
qplot(eval_accuracy[, "RMSE"], binwidth = 0.1) +
ggtitle("Distribution of the RMSE by user")
## Warning: Removed 1 rows containing non-finite values (stat_bin).
calcPredictionAccuracy(x = eval_prediction, data = getData(eval_sets, "unknown"),
byUser = FALSE)
## RMSE MSE MAE
## 1.495287 2.235883 1.114363
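Note that byUser = FALSE pools all held-out ratings before computing the error, which is not the same as averaging the per-user RMSEs (users with no scorable predictions yield NaN and must be dropped):
# Average of the per-user RMSEs, for comparison with the pooled value above.
mean(eval_accuracy[, "RMSE"], na.rm = TRUE)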
Compare the recommendations with the held-out items that have a positive rating (goodRating = 3).
results <- evaluate(x = eval_sets, method = model_to_evaluate, n = seq(10,100,10))
## IBCF run fold/sample [model time/prediction time]
## 1 [50.39sec/0.17sec]
## 2 [48.92sec/0.18sec]
## 3 [50.09sec/0.14sec]
## 4 [49.27sec/0.22sec]
class(results)
## [1] "evaluationResults"
## attr(,"package")
## [1] "recommenderlab"
Plot the ROC and precision-recall curves
plot(results, annotate = TRUE, main = "ROC curve")
plot(results, "prec/rec", annotate = TRUE, main = "Precision-Recall")
models_to_evaluate <- list(IBCF_cos = list(name = "IBCF", param = list(method = "cosine")),
IBCF_cor = list(name = "IBCF", param = list(method = "pearson")),
UBCF_cos = list(name = "UBCF", param = list(method = "cosine")),
UBCF_cor = list(name = "UBCF", param = list(method = "pearson")),
random = list(name = "RANDOM", param = NULL))
Numbers of recommended items at which to test the models
n_recommendations <- c(1, 5, seq(10, 100,10))
Run and evaluate the models
list_results <- evaluate(x = eval_sets, method = models_to_evaluate, n = n_recommendations)
## IBCF run fold/sample [model time/prediction time]
## 1 [47.24sec/0.19sec]
## 2 [46.26sec/0.14sec]
## 3 [47.55sec/0.15sec]
## 4 [47.74sec/0.39sec]
## IBCF run fold/sample [model time/prediction time]
## 1 [34.23sec/0.18sec]
## 2 [34.04sec/0.19sec]
## 3 [33.77sec/0.14sec]
## 4 [33.38sec/0.17sec]
## UBCF run fold/sample [model time/prediction time]
## 1
## Timing stopped at: 7.96 0.07 8.22
## Error in validObject(.Object) :
## invalid class "topNList" object: invalid object for slot "itemLabels" in class "topNList": got class "NULL", should be or extend class "character"
## UBCF run fold/sample [model time/prediction time]
## 1
## Timing stopped at: 4.26 0.08 4.44
## Error in validObject(.Object) :
## invalid class "topNList" object: invalid object for slot "itemLabels" in class "topNList": got class "NULL", should be or extend class "character"
## RANDOM run fold/sample [model time/prediction time]
## 1
## Timing stopped at: 0 0 0
## Error in object@predict(object@model, newdata, n = n, data = data, type = type, :
## number of items in newdata does not match model.
## Warning in .local(x, method, ...):
## Recommender 'UBCF_cos' has failed and has been removed from the results!
## Recommender 'UBCF_cor' has failed and has been removed from the results!
## Recommender 'random' has failed and has been removed from the results!
class(list_results)
## [1] "evaluationResultList"
## attr(,"package")
## [1] "recommenderlab"
Build the chart displaying the ROC curves
plot(list_results, annotate = 1, legend = "topleft")
title("ROC curves")
Only the two IBCF variants survived the comparison: the UBCF and RANDOM recommenders failed and were removed from the results, most likely because the rating matrix carries no item labels (see the note after the sparseMatrix conversion above). Among the curves that are plotted, the model with the larger Area Under the Curve (AUC) is the better-performing technique.
Precision-Recall chart
plot(list_results, "prec/rec", annotate = 1, legend = "bottomright")
title("Precision-Recall")