For the final project, I will be using the MovieLens dataset, since I have used it in my previous projects as well and I am familiar with its contents.
This dataset is publicly available online at https://grouplens.org/datasets/movielens/.
I have downloaded the dataset and uploaded it to my GitHub account.
#import relevant libraries
library(ggplot2)
library(recommenderlab)
## Loading required package: Matrix
## Loading required package: arules
## Warning: package 'arules' was built under R version 3.6.2
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
## Loading required package: proxy
##
## Attaching package: 'proxy'
## The following object is masked from 'package:Matrix':
##
## as.matrix
## The following objects are masked from 'package:stats':
##
## as.dist, dist
## The following object is masked from 'package:base':
##
## as.matrix
## Loading required package: registry
## Registered S3 methods overwritten by 'registry':
## method from
## print.registry_field proxy
## print.registry_entry proxy
library(data.table)
#import required datasets
data <- read.csv("https://raw.githubusercontent.com/maharjansudhan/DATA612/master/ml-latest-small/movies.csv")
head(data)
## movieId title
## 1 1 Toy Story (1995)
## 2 2 Jumanji (1995)
## 3 3 Grumpier Old Men (1995)
## 4 4 Waiting to Exhale (1995)
## 5 5 Father of the Bride Part II (1995)
## 6 6 Heat (1995)
## genres
## 1 Adventure|Animation|Children|Comedy|Fantasy
## 2 Adventure|Children|Fantasy
## 3 Comedy|Romance
## 4 Comedy|Drama|Romance
## 5 Comedy
## 6 Action|Crime|Thriller
ratings <- read.csv("https://raw.githubusercontent.com/maharjansudhan/DATA612/master/ml-latest-small/ratings.csv")
head(ratings)
## userId movieId rating timestamp
## 1 1 1 4 964982703
## 2 1 3 4 964981247
## 3 1 6 4 964982224
## 4 1 47 5 964983815
## 5 1 50 5 964982931
## 6 1 70 3 964982400
summary(data)
## movieId title
## Min. : 1 Confessions of a Dangerous Mind (2002): 2
## 1st Qu.: 3248 Emma (1996) : 2
## Median : 7300 Eros (2004) : 2
## Mean : 42200 Saturn 3 (1980) : 2
## 3rd Qu.: 76232 War of the Worlds (2005) : 2
## Max. :193609 ¡Three Amigos! (1986) : 1
## (Other) :9731
## genres
## Drama :1053
## Comedy : 946
## Comedy|Drama : 435
## Comedy|Romance: 363
## Drama|Romance : 349
## Documentary : 339
## (Other) :6257
summary(ratings)
## userId movieId rating timestamp
## Min. : 1.0 Min. : 1 Min. :0.500 Min. :8.281e+08
## 1st Qu.:177.0 1st Qu.: 1199 1st Qu.:3.000 1st Qu.:1.019e+09
## Median :325.0 Median : 2991 Median :3.500 Median :1.186e+09
## Mean :326.1 Mean : 19435 Mean :3.502 Mean :1.206e+09
## 3rd Qu.:477.0 3rd Qu.: 8122 3rd Qu.:4.000 3rd Qu.:1.436e+09
## Max. :610.0 Max. :193609 Max. :5.000 Max. :1.538e+09
Below is the code to create a matrix of genres for each movie.
genre <- as.data.frame(data$genres)
genre2 <- as.data.frame(tstrsplit(genre[,1], '[|]', type.convert=TRUE))
colnames(genre2) <- c(1:10)
genre_list <- c("Action", "Adventure", "Animation", "Children", "Comedy", "Crime","Documentary", "Drama", "Fantasy",
"Film-Noir", "Horror", "Musical", "Mystery","Romance", "Sci-Fi", "Thriller", "War", "Western")
genre_matrix <- matrix(0, 9743, 18) #9742 movies plus one header row for the genre names, 18 genres
genre_matrix[1,] <- genre_list
colnames(genre_matrix) <- genre_list #set the column names to genre list
#let's iterate the matrix
for (i in 1:nrow(genre2)) {
  for (c in 1:ncol(genre2)) {
    genre_matrix_col <- which(genre_matrix[1,] == genre2[i,c])
    genre_matrix[i+1, genre_matrix_col] <- 1
  }
}
#now convert it into dataframe
final_genre_matrix <- as.data.frame(genre_matrix[-1,], stringsAsFactors=FALSE)
for (c in 1:ncol(final_genre_matrix)) {
  final_genre_matrix[,c] <- as.integer(final_genre_matrix[,c]) #convert to integers
}
head(final_genre_matrix)
## Action Adventure Animation Children Comedy Crime Documentary Drama Fantasy
## 1 0 1 1 1 1 0 0 0 1
## 2 0 1 0 1 0 0 0 0 1
## 3 0 0 0 0 1 0 0 0 0
## 4 0 0 0 0 1 0 0 1 0
## 5 0 0 0 0 1 0 0 0 0
## 6 1 0 0 0 0 1 0 0 0
## Film-Noir Horror Musical Mystery Romance Sci-Fi Thriller War Western
## 1 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 1 0 0 0 0
## 4 0 0 0 0 1 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 1 0 0
#Create ratings matrix
#call reshape2::dcast directly, as the deprecation warning from data.table's redirection advises
rating_matrix <- reshape2::dcast(ratings, userId~movieId, value.var = "rating")
rating_matrix <- as.matrix(rating_matrix[,-1])
#Convert ratings matrix into sparse matrix
rating_matrix <- as(rating_matrix, "realRatingMatrix")
rating_matrix
## 610 x 9724 rating matrix of class 'realRatingMatrix' with 100836 ratings.
#exploring recommender_models
recommender_model <- recommenderRegistry$get_entries(dataType = "realRatingMatrix")
recommender_model
## $ALS_realRatingMatrix
## Recommender method: ALS for realRatingMatrix
## Description: Recommender for explicit ratings based on latent factors, calculated by alternating least squares algorithm.
## Reference: Yunhong Zhou, Dennis Wilkinson, Robert Schreiber, Rong Pan (2008). Large-Scale Parallel Collaborative Filtering for the Netflix Prize, 4th Int'l Conf. Algorithmic Aspects in Information and Management, LNCS 5034.
## Parameters:
## normalize lambda n_factors n_iterations min_item_nr seed
## 1 NULL 0.1 10 10 1 NULL
##
## $ALS_implicit_realRatingMatrix
## Recommender method: ALS_implicit for realRatingMatrix
## Description: Recommender for implicit data based on latent factors, calculated by alternating least squares algorithm.
## Reference: Yifan Hu, Yehuda Koren, Chris Volinsky (2008). Collaborative Filtering for Implicit Feedback Datasets, ICDM '08 Proceedings of the 2008 Eighth IEEE International Conference on Data Mining, pages 263-272.
## Parameters:
## lambda alpha n_factors n_iterations min_item_nr seed
## 1 0.1 10 10 10 1 NULL
##
## $IBCF_realRatingMatrix
## Recommender method: IBCF for realRatingMatrix
## Description: Recommender based on item-based collaborative filtering.
## Reference: NA
## Parameters:
## k method normalize normalize_sim_matrix alpha na_as_zero
## 1 30 "Cosine" "center" FALSE 0.5 FALSE
##
## $LIBMF_realRatingMatrix
## Recommender method: LIBMF for realRatingMatrix
## Description: Matrix factorization with LIBMF via package recosystem (https://cran.r-project.org/web/packages/recosystem/vignettes/introduction.html).
## Reference: NA
## Parameters:
## dim costp_l2 costq_l2 nthread
## 1 10 0.01 0.01 1
##
## $POPULAR_realRatingMatrix
## Recommender method: POPULAR for realRatingMatrix
## Description: Recommender based on item popularity.
## Reference: NA
## Parameters:
## normalize
## 1 "center"
## aggregationRatings
## 1 new("standardGeneric", .Data = function (x, na.rm = FALSE, dims = 1,
## aggregationPopularity
## 1 new("standardGeneric", .Data = function (x, na.rm = FALSE, dims = 1,
##
## $RANDOM_realRatingMatrix
## Recommender method: RANDOM for realRatingMatrix
## Description: Produce random recommendations (real ratings).
## Reference: NA
## Parameters: None
##
## $RERECOMMEND_realRatingMatrix
## Recommender method: RERECOMMEND for realRatingMatrix
## Description: Re-recommends highly rated items (real ratings).
## Reference: NA
## Parameters:
## randomize minRating
## 1 1 NA
##
## $SVD_realRatingMatrix
## Recommender method: SVD for realRatingMatrix
## Description: Recommender based on SVD approximation with column-mean imputation.
## Reference: NA
## Parameters:
## k maxiter normalize
## 1 10 100 "center"
##
## $SVDF_realRatingMatrix
## Recommender method: SVDF for realRatingMatrix
## Description: Recommender based on Funk SVD with gradient descend (https://sifter.org/~simon/journal/20061211.html).
## Reference: NA
## Parameters:
## k gamma lambda min_epochs max_epochs min_improvement normalize verbose
## 1 10 0.015 0.001 50 200 1e-06 "center" FALSE
##
## $UBCF_realRatingMatrix
## Recommender method: UBCF for realRatingMatrix
## Description: Recommender based on user-based collaborative filtering.
## Reference: NA
## Parameters:
## method nn sample normalize
## 1 "cosine" 25 FALSE "center"
#There are many models available, but I will focus only on IBCF and UBCF
recommender_model$IBCF_realRatingMatrix$parameters
## $k
## [1] 30
##
## $method
## [1] "Cosine"
##
## $normalize
## [1] "center"
##
## $normalize_sim_matrix
## [1] FALSE
##
## $alpha
## [1] 0.5
##
## $na_as_zero
## [1] FALSE
To see how similar users or items are to each other, we can compute the similarity measures used by collaborative filtering algorithms.
Let's create the similarity matrix and visualize it.
similarity_betn_users <- as.matrix(similarity(rating_matrix[1:4, ], method="cosine", which="users"))
similarity_betn_users
## 1 2 3 4
## 1 0.0000000 1 0.7919033 0.9328096
## 2 1.0000000 0 NA 1.0000000
## 3 0.7919033 NA 0.0000000 1.0000000
## 4 0.9328096 1 1.0000000 0.0000000
image(similarity_betn_users, main = "Similarity between Users")
Similarly let’s do the same for the items.
similarity_betn_items <- as.matrix(similarity(rating_matrix[, 1:4], method="cosine", which="items"))
similarity_betn_items
## 1 2 3 4
## 1 0.0000000 0.9644641 0.9715415 0.9838699
## 2 0.9644641 0.0000000 0.9389013 0.9609877
## 3 0.9715415 0.9389013 0.0000000 1.0000000
## 4 0.9838699 0.9609877 1.0000000 0.0000000
image(similarity_betn_items, main = "Similarity between Items")
Now, let's explore the distribution of the rating values.
vector_ratings <- as.vector(rating_matrix@data)
unique(vector_ratings)
## [1] 4.0 0.0 4.5 2.5 3.5 3.0 5.0 0.5 2.0 1.5 1.0
#Let's omit missing values; a 0 represents a missing rating
vector_ratings <- vector_ratings[vector_ratings != 0]
vector_ratings <- factor(vector_ratings)
#let's plot the dataset
qplot(vector_ratings) +
ggtitle("Distribution of the ratings")
According to the graph, most ratings fall between 3 and 5. There are some ratings between 0.5 and 2.5, but they are relatively rare; it appears that many users give a 3 even to movies they dislike rather than using the bottom of the scale.
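To attach a number to this observation, here is a small sketch (reusing the vector_ratings factor created above) that computes the share of non-missing ratings at or above 3:
#Hedged sketch: proportion of non-missing ratings that are 3 or higher
mean(as.numeric(as.character(vector_ratings)) >= 3)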
#to calculate the most viewed movies
# count views for each movie
views_per_movie <- colCounts(rating_matrix)
# create new dataframe
table_views <- data.frame(movie_Num = names(views_per_movie), Views = views_per_movie)
# sort by Views
table_views <- table_views[order(table_views$Views, decreasing = TRUE), ]
table_views$title <- NA
for (i in 1:9724) {
  table_views[i,3] <- as.character(subset(data, data$movieId == table_views[i,1])$title)
}
table_views[1:6,]
## movie_Num Views title
## 356 356 329 Forrest Gump (1994)
## 318 318 317 Shawshank Redemption, The (1994)
## 296 296 307 Pulp Fiction (1994)
## 593 593 279 Silence of the Lambs, The (1991)
## 2571 2571 278 Matrix, The (1999)
## 260 260 251 Star Wars: Episode IV - A New Hope (1977)
#let's plot top viewed movies
ggplot(table_views[1:6, ],
aes(x = title, y = Views)) +
geom_bar(stat="identity") +
theme(axis.text.x = element_text(angle = 60,hjust=1)) +
ggtitle("Top movies -- total views")
Forrest Gump (1994) is the most-viewed movie, with 329 views.
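This can be confirmed programmatically with a one-line check on the table_views data frame built above:
#Hedged sketch: the single most-viewed title, taken straight from table_views
table_views[which.max(table_views$Views), c("title", "Views")]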
average_ratings <- colMeans(rating_matrix)
qplot(average_ratings) +
stat_bin(binwidth = 0.1) +
ggtitle("Distribution of the movies average ratings")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
average_ratings_relevant <- average_ratings[views_per_movie > 50]
qplot(average_ratings_relevant) +
stat_bin(binwidth = 0.1) +
ggtitle(paste("Distribution of the relevant average ratings"))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Let's select the most relevant data and normalize it for further computation.
#Keep only the users and movies with more than 50 ratings each so the data is more meaningful
movies_ratings <- rating_matrix[rowCounts(rating_matrix) > 50, colCounts(rating_matrix) > 50]
movies_ratings
## 378 x 436 rating matrix of class 'realRatingMatrix' with 36214 ratings.
Now we have only 378 users and 436 movies.
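As a quick check of what this filtering buys us, here is a short sketch (using the rating_matrix and movies_ratings objects defined above) that compares the density of ratings before and after the filter:
#Hedged sketch: fraction of cells that contain a rating, before and after filtering
density_before <- sum(rowCounts(rating_matrix)) / (nrow(rating_matrix) * ncol(rating_matrix))
density_after <- sum(rowCounts(movies_ratings)) / (nrow(movies_ratings) * ncol(movies_ratings))
c(before = density_before, after = density_after)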
Using this filtered matrix, the top 5 percent of users and movies are shown in the heatmap below:
minimum_movies <- quantile(rowCounts(movies_ratings), 0.95)
minimum_users <- quantile(colCounts(movies_ratings), 0.95)
image(movies_ratings[rowCounts(movies_ratings) > minimum_movies,
colCounts(movies_ratings) > minimum_users],
main = "Heatmap of the top users and movies",xlab='Movies',ylab='Users')
#normalize the dataset
normalized_movies_ratings <- normalize(movies_ratings)
sum(rowMeans(normalized_movies_ratings) > 0.00001) #after row-centering, each user's mean rating should be (numerically) zero, so this should return 0
## [1] 0
#visualize the normalized matrix of the top user and movies
image(normalized_movies_ratings[rowCounts(normalized_movies_ratings) > minimum_movies,
colCounts(normalized_movies_ratings) > minimum_users],
main = "Heatmap of the top users and movies", xlab = 'Movies', ylab='Users')
#building a dataset with 80-20 ratio
build_dataset <- sample(x = c(TRUE, FALSE), size = nrow(movies_ratings),replace = TRUE, prob = c(0.8, 0.2))
data_train <- movies_ratings[build_dataset, ]
data_test <- movies_ratings[!build_dataset, ]
#train dataset, "IBCF"
recommender_model <- Recommender(data = data_train, method = "IBCF")
recommender_model
## Recommender of type 'IBCF' for 'realRatingMatrix'
## learned using 305 users.
#explore the model
recommender_model_details <- getModel(recommender_model)
class(recommender_model_details$sim)
## [1] "dgCMatrix"
## attr(,"package")
## [1] "Matrix"
# number of items to recommend to each user
num_recommended <- 10
recommender_predicted <- predict(object = recommender_model,
newdata = data_test,
n = num_recommended)
recommender_predicted
## Recommendations as 'topNList' with n = 10 for 73 users.
# recommendation for the first user
recommender_user_1 <- recommender_predicted@items[[1]]
movies_user_1 <- recommender_predicted@itemLabels[recommender_user_1]
movies_user_2 <- movies_user_1
for (i in 1:10) {
  movies_user_2[i] <- as.character(subset(data, data$movieId == movies_user_1[i])$title)
}
movies_user_2
## [1] "Get Shorty (1995)"
## [2] "Judge Dredd (1995)"
## [3] "Disclosure (1994)"
## [4] "Natural Born Killers (1994)"
## [5] "Four Weddings and a Funeral (1994)"
## [6] "Maverick (1994)"
## [7] "City Slickers II: The Legend of Curly's Gold (1994)"
## [8] "Cliffhanger (1993)"
## [9] "Ghost (1990)"
## [10] "Pretty Woman (1990)"
# matrix with the recommendations for each user
recommender_matrix <- sapply(recommender_predicted@items,
function(x){ as.integer(colnames(movies_ratings)[x]) })
recommender_matrix[,1:4]
## [,1] [,2] [,3] [,4]
## [1,] 21 2174 3033 3
## [2,] 173 2716 1247 16
## [3,] 225 3751 6016 19
## [4,] 288 4027 5299 36
## [5,] 357 48385 56367 104
## [6,] 368 68157 3081 223
## [7,] 432 76093 3671 300
## [8,] 434 30707 3481 316
## [9,] 587 1073 300 317
## [10,] 597 8644 4776 318
num_of_items <- factor(table(recommender_matrix))
qplot(num_of_items) +
ggtitle("Distribution for IBCF")
num_of_items_sorted <- sort(num_of_items, decreasing = TRUE)
num_of_items_top_movies <- head(num_of_items_sorted, n = 4)
table_top <- data.frame(as.integer(names(num_of_items_top_movies)), num_of_items_top_movies)
for (i in 1:4) {
  table_top[i,1] <- as.character(subset(data, data$movieId == table_top[i,1])$title)
}
colnames(table_top) <- c("Movie", "Number of items")
head(table_top)
## Movie Number of items
## 2470 Crocodile Dundee (1986) 11
## 10 GoldenEye (1995) 9
## 3 Grumpier Old Men (1995) 8
## 16 Casino (1995) 8
recommender_model <- recommenderRegistry$get_entries(dataType ="realRatingMatrix")
recommender_model$UBCF_realRatingMatrix$parameters
## $method
## [1] "cosine"
##
## $nn
## [1] 25
##
## $sample
## [1] FALSE
##
## $normalize
## [1] "center"
#train dataset, "UBCF"
recommender_model <- Recommender(data = data_train, method = "UBCF")
recommender_model
## Recommender of type 'UBCF' for 'realRatingMatrix'
## learned using 305 users.
recommender_model_details <- getModel(recommender_model)
class(recommender_model_details$sim)
## [1] "dgCMatrix"
## attr(,"package")
## [1] "Matrix"
num_recommended <- 10 # number of items to recommend to each user
recommender_predicted <- predict(object = recommender_model, newdata = data_test, n = num_recommended)
recommender_predicted
## Recommendations as 'topNList' with n = 10 for 73 users.
recommender_user_1 <- recommender_predicted@items[[1]] # recommendation for the first user
movies_user_1 <- recommender_predicted@itemLabels[recommender_user_1]
movies_user_2 <- movies_user_1
for (i in 1:10) {
  movies_user_2[i] <- as.character(subset(data, data$movieId == movies_user_1[i])$title)
}
movies_user_2
## [1] "Rear Window (1954)"
## [2] "Lord of the Rings: The Return of the King, The (2003)"
## [3] "Incredibles, The (2004)"
## [4] "Godfather, The (1972)"
## [5] "Shawshank Redemption, The (1994)"
## [6] "Beautiful Mind, A (2001)"
## [7] "Léon: The Professional (a.k.a. The Professional) (Léon) (1994)"
## [8] "One Flew Over the Cuckoo's Nest (1975)"
## [9] "Lord of the Rings: The Fellowship of the Ring, The (2001)"
## [10] "Spirited Away (Sen to Chihiro no kamikakushi) (2001)"
# matrix with the recommendations for each user
recommender_matrix <- sapply(recommender_predicted@items,
function(x){ as.integer(colnames(movies_ratings)[x]) })
recommender_matrix[,1:4]
## [,1] [,2] [,3] [,4]
## [1,] 904 318 593 318
## [2,] 7153 457 318 58559
## [3,] 8961 908 457 5952
## [4,] 858 253 6 4993
## [5,] 318 590 1617 904
## [6,] 4995 1089 1732 68954
## [7,] 293 608 2028 7153
## [8,] 1193 527 150 60069
## [9,] 4993 904 1089 356
## [10,] 5618 34 1197 6711
num_of_items <- factor(table(recommender_matrix))
qplot(num_of_items) +
ggtitle("Distribution for IBCF")
num_of_items_sorted <- sort(num_of_items, decreasing = TRUE)
num_of_items_top_movies <- head(num_of_items_sorted, n = 4)
table_top <- data.frame(as.integer(names(num_of_items_top_movies)), num_of_items_top_movies)
for (i in 1:4) {
  table_top[i,1] <- as.character(subset(data, data$movieId == table_top[i,1])$title)
}
colnames(table_top) <- c("Movie", "Number of items")
head(table_top)
## Movie Number of items
## 457 Fugitive, The (1993) 33
## 318 Shawshank Redemption, The (1994) 23
## 223 Clerks (1994) 20
## 110 Braveheart (1995) 18
min(rowCounts(movies_ratings))
## [1] 11
percent_training <- 0.8 #80% of the users go into the training set
items_to_keep <- 5      #ratings per test user given to the recommender; must be below the minimum of 11 found above
rating_threshold <- 3   #ratings of 3 or above count as "good"
num_of_eval <- 1        #number of evaluation runs
evaluation_sets <- evaluationScheme(data = movies_ratings,
method = "split",
train = percent_training,
given = items_to_keep,
goodRating = rating_threshold,
k = num_of_eval)
evaluation_sets
## Evaluation scheme with 5 items given
## Method: 'split' with 1 run(s).
## Training set proportion: 0.800
## Good ratings: >=3.000000
## Data set: 378 x 436 rating matrix of class 'realRatingMatrix' with 36214 ratings.
getData(evaluation_sets, "train") # training set
## 302 x 436 rating matrix of class 'realRatingMatrix' with 28867 ratings.
getData(evaluation_sets, "known") # set with the items used to build the recommendations
## 76 x 436 rating matrix of class 'realRatingMatrix' with 380 ratings.
getData(evaluation_sets, "unknown") # set with the items used to test the recommendations
## 76 x 436 rating matrix of class 'realRatingMatrix' with 6967 ratings.
qplot(rowCounts(getData(evaluation_sets, "unknown"))) +
geom_histogram(binwidth = 10) +
ggtitle("Unknown items by the users")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
evaluation_sets <- evaluationScheme(data = movies_ratings,
method = "bootstrap",
train = percent_training,
given = items_to_keep,
goodRating = rating_threshold,
k = num_of_eval)
table_train <- table(evaluation_sets@runsTrain[[1]])
num_of_repetitions <- factor(as.vector(table_train))
qplot(num_of_repetitions) +
ggtitle("Number of repetitions in the training dataset")
num_of_fold <- 4
evaluation_sets <- evaluationScheme(data = movies_ratings,
method = "cross-validation",
k = num_of_fold,
given = items_to_keep,
goodRating = rating_threshold)
size_sets <- sapply(evaluation_sets@runsTrain, length)
size_sets
## [1] 282 282 282 282
evaluation_sets <- evaluationScheme(data = movies_ratings,
method = "cross-validation",
k = num_of_fold,
given = items_to_keep,
goodRating = rating_threshold)
evaluation_recommender <- Recommender(data = getData(evaluation_sets, "train"),
method = "IBCF",
parameter = NULL)
items_to_recommend <- 10
evaluation_prediction <- predict(object = evaluation_recommender,
newdata = getData(evaluation_sets, "known"),
n = items_to_recommend,
type = "ratings")
qplot(rowCounts(evaluation_prediction)) +
geom_histogram(binwidth = 10) +
ggtitle("Distribution of movies per user")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
evaluation_accuracy <- calcPredictionAccuracy(x = evaluation_prediction,
data = getData(evaluation_sets, "unknown"),
byUser = FALSE)
evaluation_accuracy
## RMSE MSE MAE
## 1.380663 1.906230 1.027218
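The same accuracy function can also report per-user errors, which helps spot the users the model serves poorly; here is a short sketch under the same objects:
#Hedged sketch: per-user RMSE/MSE/MAE for the same IBCF predictions
evaluation_accuracy_by_user <- calcPredictionAccuracy(x = evaluation_prediction,
                                                      data = getData(evaluation_sets, "unknown"),
                                                      byUser = TRUE)
head(evaluation_accuracy_by_user)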
models_to_evaluate <- list(
IBCF_cos = list(name = "IBCF",
param = list(method = "cosine")),
IBCF_cor = list(name = "IBCF",
param = list(method = "pearson")),
UBCF_cos = list(name = "UBCF",
param = list(method = "cosine")),
UBCF_cor = list(name = "UBCF",
param = list(method = "pearson")),
random = list(name = "RANDOM", param=NULL)
)
num_of_recommendations <- c(1, 5, seq(10, 100, 10))
evaluation_list_results <- evaluate(x = evaluation_sets,
method = models_to_evaluate,
n = num_of_recommendations)
## IBCF run fold/sample [model time/prediction time]
## 1 [0.591sec/0.025sec]
## 2 [0.596sec/0.021sec]
## 3 [0.599sec/0.019sec]
## 4 [0.566sec/0.019sec]
## IBCF run fold/sample [model time/prediction time]
## 1 [0.596sec/0.02sec]
## 2 [0.593sec/0.019sec]
## 3 [0.614sec/0.021sec]
## 4 [0.6sec/0.019sec]
## UBCF run fold/sample [model time/prediction time]
## 1 [0.003sec/0.089sec]
## 2 [0.002sec/0.088sec]
## 3 [0.004sec/0.096sec]
## 4 [0.003sec/0.087sec]
## UBCF run fold/sample [model time/prediction time]
## 1 [0.002sec/0.115sec]
## 2 [0.002sec/0.1sec]
## 3 [0.002sec/0.098sec]
## 4 [0.002sec/0.099sec]
## RANDOM run fold/sample [model time/prediction time]
## 1 [0.001sec/0.029sec]
## 2 [0.001sec/0.03sec]
## 3 [0.001sec/0.028sec]
## 4 [0sec/0.031sec]
sapply(evaluation_list_results, class) == "evaluationResults"
## IBCF_cos IBCF_cor UBCF_cos UBCF_cor random
## TRUE TRUE TRUE TRUE TRUE
avg_matrices <- lapply(evaluation_list_results, avg)
head(avg_matrices$IBCF_cos[, 5:8])
## precision recall TPR FPR
## 1 0.2440090 0.003151194 0.003151194 0.002150107
## 5 0.2039948 0.013571678 0.013571678 0.011330510
## 10 0.1885165 0.024610156 0.024610156 0.023110442
## 20 0.1888598 0.046791884 0.046791884 0.045743931
## 30 0.1905377 0.070589940 0.070589940 0.067793227
## 40 0.1877560 0.091071761 0.091071761 0.089952070
plot(evaluation_list_results, annotate = 1, legend = "topleft")
title("ROC curve")
plot(evaluation_list_results, annotate = 1, legend = "topright")
title("Precision - recall")
Based on both the ROC and the precision-recall curves, UBCF is the better recommender system here: its curves dominate those of IBCF, so it is the more accurate of the two on this dataset.
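As a supporting check, here is a minimal sketch (reusing evaluation_sets and the train/known/unknown splits defined above; the exact numbers will vary with the random folds) that compares the rating-prediction error of IBCF and UBCF directly:
#Hedged sketch: fit both methods on the same split and compare RMSE/MSE/MAE
error_comparison <- sapply(c(IBCF = "IBCF", UBCF = "UBCF"), function(m) {
  rec <- Recommender(data = getData(evaluation_sets, "train"), method = m)
  pred <- predict(rec, newdata = getData(evaluation_sets, "known"), type = "ratings")
  calcPredictionAccuracy(pred, data = getData(evaluation_sets, "unknown"), byUser = FALSE)
})
error_comparison
A lower RMSE for UBCF would corroborate the curve-based comparison above.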