Accuracy and Beyond

  1. Compare the accuracy of at least two recommender system algorithms against your offline data.
  2. Implement support for at least one business or user experience goal such as increased serendipity, novelty, or diversity.
  3. Compare and report on any change in accuracy before and after you’ve made the change in #2.
  4. As part of your textual conclusion, discuss one or more additional experiments that could be performed and/or metrics that could be evaluated only if online evaluation was possible. Also, briefly propose how you would design a reasonable online evaluation environment.
library(recommenderlab)
## Loading required package: Matrix
## Loading required package: arules
## 
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
## 
##     abbreviate, write
## Loading required package: proxy
## 
## Attaching package: 'proxy'
## The following object is masked from 'package:Matrix':
## 
##     as.matrix
## The following objects are masked from 'package:stats':
## 
##     as.dist, dist
## The following object is masked from 'package:base':
## 
##     as.matrix
## Loading required package: registry
library(ggplot2)
data("MovieLense")
library(pander)

# data selection: keep users with more than 50 ratings and movies rated by more than 100 users
(ratings_movies <- MovieLense[rowCounts(MovieLense) > 50, colCounts(MovieLense) > 100])
## 560 x 332 rating matrix of class 'realRatingMatrix' with 55298 ratings.
# split the data into training and test sets
percentage_training <- .8

# setting parameters
# check the minimum number of items rated by any user,
# so 'given' can be set low enough that no test user is left without items to evaluate
min(rowCounts(ratings_movies))
## [1] 18
items_to_keep <- 15

# minimum rating that is considered good
rating_threshold <- 3
# number of times to run the evaluation
n_eval <- 1

eval_sets <- evaluationScheme(
                data = ratings_movies,
                method = 'split',
                train = percentage_training, 
                given = items_to_keep,
                goodRating = rating_threshold,
                k = n_eval
             )

eval_sets
## Evaluation scheme with 15 items given
## Method: 'split' with 1 run(s).
## Training set proportion: 0.800
## Good ratings: >=3.000000
## Data set: 560 x 332 rating matrix of class 'realRatingMatrix' with 55298 ratings.
getData(eval_sets, 'train')
## 448 x 332 rating matrix of class 'realRatingMatrix' with 45095 ratings.
(nrow(getData(eval_sets, 'train'))/nrow(ratings_movies))
## [1] 0.8
getData(eval_sets, 'known')
## 112 x 332 rating matrix of class 'realRatingMatrix' with 1680 ratings.
getData(eval_sets, 'unknown')
## 112 x 332 rating matrix of class 'realRatingMatrix' with 8523 ratings.
nrow(getData(eval_sets, 'known'))/nrow(ratings_movies)
## [1] 0.2
unique(rowCounts(getData(eval_sets, 'known')))
## [1] 15
qplot(rowCounts(getData(eval_sets, 'unknown')), binwidth = 10) +
  ggtitle('Unknown items by the users')
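Each test user contributes 15 'known' ratings that the model sees, while the remaining ratings are held out as 'unknown' for scoring. As a quick sanity check (a small sketch, not from the text), the two counts should add back up to each test user's original rating count:

# known + unknown ratings should equal each test user's total ratings
test_users <- rownames(getData(eval_sets, 'known'))
all(rowCounts(getData(eval_sets, 'known')) +
      rowCounts(getData(eval_sets, 'unknown')) ==
      rowCounts(ratings_movies[test_users, ]))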

# switch to k-fold cross-validation so that every user is used for testing exactly once
n_fold <- 4
eval_sets <- evaluationScheme(
  data = ratings_movies, 
  method = 'cross-validation',
  k = n_fold,
  given = items_to_keep,
  goodRating = rating_threshold)
size_sets <- sapply(eval_sets@runsTrain, length)
size_sets
## [1] 420 420 420 420
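With 4-fold cross-validation each run trains on three of the four folds, so the 420 users per training fold follow directly from the 560 users kept in ratings_movies:

# 560 users * 3/4 of the folds = 420 users per training fold
nrow(ratings_movies) * (n_fold - 1) / n_fold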
model_to_evaluate <- "IBCF"
model_parameters <- NULL   # NULL keeps the default IBCF parameters

## build model
eval_recommender <- Recommender(data = getData(eval_sets, "train"),
                                method = model_to_evaluate,
                                parameter = model_parameters)

items_to_recommend <- 10

## predict ratings for the items the test users have not been given
## (note: with type = "ratings" the n argument does not limit the output)
eval_prediction <- predict(object = eval_recommender,
                           newdata = getData(eval_sets, "known"),
                           n = items_to_recommend,
                           type = "ratings")

class(eval_prediction)
## [1] "realRatingMatrix"
## attr(,"package")
## [1] "recommenderlab"
## distribution of the number of predicted movies per user
qplot(rowCounts(eval_prediction), binwidth = 10) +
  ggtitle("Distribution of movies per user")

eval_accuracy <- calcPredictionAccuracy(
  x = eval_prediction,
  data = getData(eval_sets, "unknown"),
  byUser = TRUE)

pander(head(eval_accuracy))
| user | RMSE  | MSE    | MAE    |
|------|-------|--------|--------|
| 2    | 1.338 | 1.789  | 0.8071 |
| 3    | 1.147 | 1.316  | 0.9055 |
| 5    | 1.678 | 2.816  | 1.311  |
| 10   | 0.616 | 0.3795 | 0.441  |
| 14   | 1.141 | 1.302  | 0.9438 |
| 22   | 1.592 | 2.536  | 1.172  |
qplot(eval_accuracy[, "RMSE"], binwidth = 0.1) +
  ggtitle("Distribution of the RMSE by user")

eval_accuracy <- calcPredictionAccuracy(
  x = eval_prediction,
  data = getData(eval_sets, "unknown"),
  byUser = FALSE)
eval_accuracy
##      RMSE       MSE       MAE 
## 1.2017507 1.4442047 0.8989494
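For item 1 of the assignment, the same RMSE/MSE/MAE computation can be repeated for a second algorithm on the identical train/known/unknown split, giving a direct accuracy comparison on the offline data. A minimal sketch using a user-based CF model:

## build and score a UBCF model on the same evaluation scheme
ubcf_recommender <- Recommender(data = getData(eval_sets, "train"),
                                method = "UBCF")
ubcf_prediction <- predict(object = ubcf_recommender,
                           newdata = getData(eval_sets, "known"),
                           type = "ratings")
calcPredictionAccuracy(x = ubcf_prediction,
                       data = getData(eval_sets, "unknown"),
                       byUser = FALSE)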
results <- evaluate(x = eval_sets,
                    method = model_to_evaluate,
                    n = seq(10, 100, 10))
## IBCF run fold/sample [model time/prediction time]
##   1  [1.14sec/0.36sec] 
##   2  [0.68sec/0.15sec] 
##   3  [0.59sec/0.28sec] 
##   4  [0.57sec/0.1sec]
class(results)
## [1] "evaluationResults"
## attr(,"package")
## [1] "recommenderlab"
head(getConfusionMatrix(results)[[1]])
##           TP        FP       FN       TN precision     recall        TPR
## 10  2.807143  7.192857 71.26429 235.7357 0.2807143 0.03868631 0.03868631
## 20  5.357143 14.642857 68.71429 228.2857 0.2678571 0.07522148 0.07522148
## 30  7.650000 22.350000 66.42143 220.5786 0.2550000 0.10726173 0.10726173
## 40 10.328571 29.671429 63.74286 213.2571 0.2582143 0.14290043 0.14290043
## 50 12.850000 37.150000 61.22143 205.7786 0.2570000 0.17838537 0.17838537
## 60 15.450000 44.550000 58.62143 198.3786 0.2575000 0.21785722 0.21785722
##           FPR
## 10 0.02927602
## 20 0.06003685
## 30 0.09198549
## 40 0.12181104
## 50 0.15252608
## 60 0.18312719
pander(head(getConfusionMatrix(results)[[1]]))
| n  | TP    | FP    | FN    | TN    | precision | recall  | TPR     | FPR     |
|----|-------|-------|-------|-------|-----------|---------|---------|---------|
| 10 | 2.807 | 7.193 | 71.26 | 235.7 | 0.2807    | 0.03869 | 0.03869 | 0.02928 |
| 20 | 5.357 | 14.64 | 68.71 | 228.3 | 0.2679    | 0.07522 | 0.07522 | 0.06004 |
| 30 | 7.65  | 22.35 | 66.42 | 220.6 | 0.255     | 0.1073  | 0.1073  | 0.09199 |
| 40 | 10.33 | 29.67 | 63.74 | 213.3 | 0.2582    | 0.1429  | 0.1429  | 0.1218  |
| 50 | 12.85 | 37.15 | 61.22 | 205.8 | 0.257     | 0.1784  | 0.1784  | 0.1525  |
| 60 | 15.45 | 44.55 | 58.62 | 198.4 | 0.2575    | 0.2179  | 0.2179  | 0.1831  |
# sum the confusion-matrix counts across the 4 folds
columns_to_sum <- c("TP", "FP", "FN", "TN")
indices_summed <- Reduce("+", getConfusionMatrix(results))[, columns_to_sum]
head(indices_summed)
##          TP        FP       FN       TN
## 10 10.86429  29.13571 281.3286 946.6714
## 20 20.99286  59.00714 271.2000 916.8000
## 30 30.57857  89.42143 261.6143 886.3857
## 40 40.68571 119.31429 251.5071 856.4929
## 50 50.02857 149.97143 242.1643 825.8357
## 60 60.53571 179.46429 231.6571 796.3429
pander(head(indices_summed))
| n  | TP    | FP    | FN    | TN    |
|----|-------|-------|-------|-------|
| 10 | 10.86 | 29.14 | 281.3 | 946.7 |
| 20 | 20.99 | 59.01 | 271.2 | 916.8 |
| 30 | 30.58 | 89.42 | 261.6 | 886.4 |
| 40 | 40.69 | 119.3 | 251.5 | 856.5 |
| 50 | 50.03 | 150   | 242.2 | 825.8 |
| 60 | 60.54 | 179.5 | 231.7 | 796.3 |
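Summing the counts across folds also makes it possible to rebuild the ROC points by hand, which serves as a cross-check on the plot() call below. A small sketch (not from the text), using the columns shown above:

# true-positive and false-positive rates from the summed counts
indices_rates <- as.data.frame(indices_summed)
indices_rates$TPR <- indices_rates$TP / (indices_rates$TP + indices_rates$FN)
indices_rates$FPR <- indices_rates$FP / (indices_rates$FP + indices_rates$TN)
qplot(indices_rates$FPR, indices_rates$TPR, geom = "line") +
  ggtitle("ROC rebuilt from the summed confusion counts")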
plot(results,
     annotate = TRUE,
     main = "ROC curve")

plot(results, "prec/rec",
     annotate = TRUE,
     main = "Precision-recall")

# candidate models: item- and user-based CF with two similarity measures, plus a random baseline
models_to_evaluate <- list(
  IBCF_cos = list(name = "IBCF", param = list(method = "cosine")),
  IBCF_cor = list(name = "IBCF", param = list(method = "pearson")),
  UBCF_cos = list(name = "UBCF", param = list(method = "cosine")),
  UBCF_cor = list(name = "UBCF", param = list(method = "pearson")),
  random = list(name = "RANDOM", param=NULL)
)

n_recommendations <- c(1, 5, seq(10, 100, 10))
list_results <- evaluate(x = eval_sets,
                    method = models_to_evaluate,
                    n = n_recommendations)
## IBCF run fold/sample [model time/prediction time]
##   1  [0.62sec/0.11sec] 
##   2  [0.83sec/0.09sec] 
##   3  [0.7sec/0.1sec] 
##   4  [0.73sec/0.09sec] 
## IBCF run fold/sample [model time/prediction time]
##   1  [0.57sec/0.08sec] 
##   2  [0.54sec/0.1sec] 
##   3  [0.71sec/0.07sec] 
##   4  [0.66sec/0.09sec] 
## UBCF run fold/sample [model time/prediction time]
##   1  [0sec/0.39sec] 
##   2  [0.05sec/0.38sec] 
##   3  [0sec/0.53sec] 
##   4  [0sec/0.37sec] 
## UBCF run fold/sample [model time/prediction time]
##   1  [0sec/0.53sec] 
##   2  [0sec/0.38sec] 
##   3  [0.02sec/0.48sec] 
##   4  [0sec/0.42sec] 
## RANDOM run fold/sample [model time/prediction time]
##   1  [0.01sec/0.11sec] 
##   2  [0.05sec/0.11sec] 
##   3  [0sec/0.27sec] 
##   4  [0sec/0.11sec]
class(list_results)
## [1] "evaluationResultList"
## attr(,"package")
## [1] "recommenderlab"
class(list_results[[1]])
## [1] "evaluationResults"
## attr(,"package")
## [1] "recommenderlab"
sapply(list_results, class) == "evaluationResults"
## IBCF_cos IBCF_cor UBCF_cos UBCF_cor   random 
##     TRUE     TRUE     TRUE     TRUE     TRUE
# average the confusion matrices over the 4 folds for each model
avg_matrices <- lapply(list_results, avg)
pander(head(avg_matrices$IBCF_cos)[, 5:8])
| n  | precision | recall   | TPR      | FPR      |
|----|-----------|----------|----------|----------|
| 1  | 0.3125    | 0.004225 | 0.004225 | 0.002754 |
| 5  | 0.2893    | 0.01986  | 0.01986  | 0.01438  |
| 10 | 0.2716    | 0.03687  | 0.03687  | 0.02958  |
| 20 | 0.2624    | 0.07248  | 0.07248  | 0.06016  |
| 30 | 0.2548    | 0.1053   | 0.1053   | 0.09123  |
| 40 | 0.2543    | 0.1412   | 0.1412   | 0.1218   |
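To compare the models numerically as well as visually, a single operating point can be pulled out of every averaged matrix. A small sketch (not from the text), assuming the third row of each matrix corresponds to n = 10, as in n_recommendations:

# precision and recall of each model at n = 10 recommendations
sapply(avg_matrices, function(m) m[3, c("precision", "recall")])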
## compare the results of different models
plot(list_results,
     annotate = 1,
     legend = "topleft")
title("ROC curve")

plot(list_results,
     "prec/rec",
     annotate = 1,
     legend = "bottomright")
title("Precision-recall")

# candidate neighbourhood sizes for IBCF (k = number of most similar items kept per item)
vector_k <- c(5, 10, 20, 30, 40)
models_to_evaluate <- lapply(vector_k, function(k){
  list(name = "IBCF",
       param = list(method = "cosine",
                    k = k))
})
names(models_to_evaluate) <- paste0("IBCF_k_", vector_k)

## re-run the evaluation for each candidate k
n_recommendations <- c(1, 5, seq(10, 100, 10))
list_results <- evaluate(x = eval_sets,
                         method = models_to_evaluate,
                         n = n_recommendations)
## IBCF run fold/sample [model time/prediction time]
##   1  [0.57sec/0.03sec] 
##   2  [0.67sec/0.05sec] 
##   3  [0.64sec/0.05sec] 
##   4  [0.69sec/0.05sec] 
## IBCF run fold/sample [model time/prediction time]
##   1  [0.49sec/0.06sec] 
##   2  [0.62sec/0.05sec] 
##   3  [0.64sec/0.07sec] 
##   4  [0.7sec/0.06sec] 
## IBCF run fold/sample [model time/prediction time]
##   1  [0.66sec/0.08sec] 
##   2  [0.51sec/0.09sec] 
##   3  [0.52sec/0.08sec] 
##   4  [0.64sec/0.07sec] 
## IBCF run fold/sample [model time/prediction time]
##   1  [0.67sec/0.09sec] 
##   2  [0.67sec/0.09sec] 
##   3  [0.67sec/0.09sec] 
##   4  [0.67sec/0.09sec] 
## IBCF run fold/sample [model time/prediction time]
##   1  [0.67sec/0.1sec] 
##   2  [0.65sec/0.08sec] 
##   3  [0.58sec/0.27sec] 
##   4  [0.6sec/0.1sec]
## ROC curves for the different values of k
plot(list_results,
     annotate = 1,
     legend = "topleft")
title("ROC curve")

## precision-recall curves for the different values of k
plot(list_results,
     "prec/rec",
     annotate = 1,
     legend = "bottomright")
title("Precision-recall")

Conclusion:

I worked through the example code in Chapter 4 of the text 'Building a Recommendation System with R' to become familiar with evaluating and comparing recommender models: splitting and cross-validating the MovieLense ratings, measuring RMSE/MSE/MAE, and comparing IBCF and UBCF variants (plus a random baseline) with ROC and precision-recall curves.

Reference:

Gorakala, S. and Usuelli, M. (2015). Building a Recommendation System with R. Packt Publishing.