Loading Libraries

library(Matrix)
library(reshape2)
library(data.table)
library(tidyr)
library(dplyr)
library(kableExtra)
library("scales")
library("recommenderlab")
library(tidytext)
library(psych)
library(knitr)
library(ggplot2)
library(ggthemes)

Loading the built-in Jester5k dataset from recommenderlab

data(Jester5k)
Jester5k
## 5000 x 100 rating matrix of class 'realRatingMatrix' with 362106 ratings.

Selecting only the most relevant data: users with more than 50 ratings and jokes rated more than 100 times

ratings <- Jester5k[rowCounts(Jester5k) > 50, colCounts(Jester5k) > 100]
ratings
## 3875 x 100 rating matrix of class 'realRatingMatrix' with 314302 ratings.

Parameters of the evaluation

items_to_keep <- 15     # number of items given per test user, per the book's recommendation
rating_threshold <- 5   # minimum rating for a joke to be considered good (Jester ratings range from -10 to +10)

K-fold cross-validation

set.seed(123)
n_fold <- 5
eval_sets <- evaluationScheme(data = ratings, method = "cross-validation", k = n_fold, given = items_to_keep, goodRating = rating_threshold)

getData(eval_sets, "train")
## 3100 x 100 rating matrix of class 'realRatingMatrix' with 251394 ratings.
getData(eval_sets, "known")
## 775 x 100 rating matrix of class 'realRatingMatrix' with 11625 ratings.
getData(eval_sets, "unknown")
## 775 x 100 rating matrix of class 'realRatingMatrix' with 51283 ratings.
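As a quick sanity check on the split (a sketch, not part of the original analysis), every test user should keep exactly items_to_keep ratings in the "known" partition, with the remainder of that user's ratings held out in "unknown":

# each test user keeps exactly 15 ratings in "known"
table(rowCounts(getData(eval_sets, "known")))
# the rest of each test user's ratings are held out for evaluation
summary(rowCounts(getData(eval_sets, "unknown")))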

Building the IBCF Model

model_to_evaluate_IBCF <- "IBCF"
model_parameters_IBCF <- NULL

eval_recommender_IBCF <- Recommender(data = getData(eval_sets, "train"),method = model_to_evaluate_IBCF, parameter = model_parameters_IBCF)

items_to_recommend <- 10

eval_prediction_IBCF <- predict(object = eval_recommender_IBCF, newdata =
getData(eval_sets, "known"), n = items_to_recommend, type = "ratings")
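To see what the prediction object contains, it can be coerced to a matrix (a sketch, not from the original write-up); jokes that were already in a user's "known" set come back as NA because only the held-out items are estimated:

# estimated ratings for the first test user's first ten jokes (NA = already known)
head(as(eval_prediction_IBCF, "matrix")[1, ], 10)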

Building the UBCF Model

model_to_evaluate_UBCF <- "UBCF"
model_parameters_UBCF <- NULL

eval_recommender_UBCF <- Recommender(data = getData(eval_sets, "train"),method = model_to_evaluate_UBCF, parameter = model_parameters_UBCF)

items_to_recommend <- 10

eval_prediction_UBCF <- predict(object = eval_recommender_UBCF, newdata =
getData(eval_sets, "known"), n = items_to_recommend, type = "ratings")

Building the SVD Model

model_to_evaluate_SVD <- "SVD"
model_parameters_SVD <- NULL

eval_recommender_SVD <- Recommender(data = getData(eval_sets, "train"),method = model_to_evaluate_SVD, parameter = model_parameters_SVD)

items_to_recommend <- 10

eval_prediction_SVD <- predict(object = eval_recommender_SVD, newdata =
getData(eval_sets, "known"), n = items_to_recommend, type = "ratings")
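The three model blocks above repeat the same fit-and-predict steps. A small helper like the sketch below (fit_and_predict is a hypothetical name, not from the original code) would keep that logic in one place:

# fit a recommender on the training split and predict ratings for the "known" split
fit_and_predict <- function(method, parameter = NULL, n = 10) {
  rec <- Recommender(data = getData(eval_sets, "train"), method = method, parameter = parameter)
  predict(object = rec, newdata = getData(eval_sets, "known"), n = n, type = "ratings")
}
# e.g. eval_prediction_SVD <- fit_and_predict("SVD")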

Comparing RMSE of ratings

# RMSEs of the 3 algorithms
eval_accuracy_IBCF <- calcPredictionAccuracy( x = eval_prediction_IBCF, data = getData(eval_sets, "unknown"), byUser =FALSE)

eval_accuracy_UBCF <- calcPredictionAccuracy( x = eval_prediction_UBCF, data = getData(eval_sets, "unknown"), byUser =FALSE)

eval_accuracy_SVD <- calcPredictionAccuracy( x = eval_prediction_SVD, data = getData(eval_sets, "unknown"), byUser =FALSE)
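RMSE is simply the square root of the mean squared difference between the predicted and the held-out ratings. A manual cross-check against calcPredictionAccuracy for the UBCF model (a sketch; the intermediate names are just for illustration):

pred_mat   <- as(eval_prediction_UBCF, "matrix")
actual_mat <- as(getData(eval_sets, "unknown"), "matrix")
# averages over the rating pairs present in both matrices; should agree with eval_accuracy_UBCF["RMSE"]
sqrt(mean((pred_mat - actual_mat)^2, na.rm = TRUE))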

#comparing the 3 together
accuracy <- rbind(eval_accuracy_IBCF , eval_accuracy_UBCF )
accuracy <- rbind(accuracy, eval_accuracy_SVD)
rownames(accuracy) <- c("IBCF", "UBCF", "SVD")
knitr::kable(accuracy, format = "html", caption = "Table 1: Accuracy Metrics of the 3 Models") %>%
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover"))
Table 1: Accuracy Metrics of the 3 Models

           RMSE      MSE      MAE
IBCF   5.223588 27.28587 4.066770
UBCF   4.485514 20.11984 3.505529
SVD    4.604490 21.20132 3.631866

Observation 1:


Distribution of the per-user RMSE for each algorithm

# Chart of per-user IBCF RMSE
eval_accuracy_IBCF <- calcPredictionAccuracy(x = eval_prediction_IBCF, data = getData(eval_sets, "unknown"), byUser = TRUE)
qplot(eval_accuracy_IBCF[, "RMSE"], geom = "histogram", binwidth = 0.5, fill = I("orange")) +
  ggtitle("Fig1a: Distribution of the IBCF's RMSE (k-fold=5)") + theme_economist()

# Chart of per-user UBCF RMSE
eval_accuracy_UBCF <- calcPredictionAccuracy(x = eval_prediction_UBCF, data = getData(eval_sets, "unknown"), byUser = TRUE)
qplot(eval_accuracy_UBCF[, "RMSE"], geom = "histogram", binwidth = 0.5, fill = I("orange")) +
  ggtitle("Fig1b: Distribution of the UBCF's RMSE (k-fold=5)") + theme_economist()

# Chart of per-user SVD RMSE
eval_accuracy_SVD <- calcPredictionAccuracy(x = eval_prediction_SVD, data = getData(eval_sets, "unknown"), byUser = TRUE)
qplot(eval_accuracy_SVD[, "RMSE"], geom = "histogram", binwidth = 0.5, fill = I("orange")) +
  ggtitle("Fig1c: Distribution of the SVD's RMSE (k-fold=5)") + theme_economist()
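A numeric summary of the same per-user RMSE distributions (a sketch) complements the histograms above:

# five-number summary plus mean of per-user RMSE for each algorithm
sapply(list(IBCF = eval_accuracy_IBCF, UBCF = eval_accuracy_UBCF, SVD = eval_accuracy_SVD),
       function(x) summary(x[, "RMSE"]))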


Observation 2:


Evaluating the recommendations

columns_to_sum <- c("TP", "FP", "FN", "TN")
#confusion matrix for IBCF
results_IBCF <- evaluate(x = eval_sets, method = model_to_evaluate_IBCF, n =seq(10, 100, 10))
## IBCF run fold/sample [model time/prediction time]
##   1  [0.2sec/0.11sec] 
##   2  [0.2sec/0.3sec] 
##   3  [0.19sec/0.14sec] 
##   4  [0.17sec/0.13sec] 
##   5  [0.2sec/0.16sec]
indices_summed_IBCF <- Reduce("+", getConfusionMatrix(results_IBCF))[, columns_to_sum]
knitr::kable(indices_summed_IBCF , format = "html",caption = "Table 2a: Confusion Matrix of IBCF") %>%
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover"))
Table 2a: Confusion Matrix of IBCF

  n        TP        FP         FN        TN
 10  9.184516  40.81548 75.4387097 299.56129
 20 19.246452  80.75355 65.3767742 259.62323
 30 29.752258 120.24774 54.8709677 220.12903
 40 40.621935 159.37806 44.0012903 180.99871
 50 51.298065 198.70194 33.3251613 141.67484
 60 61.775484 238.22452 22.8477419 102.15226
 70 71.411613 278.57935 13.2116129  61.79742
 80 80.489032 319.43742  4.1341935  20.93935
 90 84.410323 339.13677  0.2129032   1.24000
100 84.410323 339.13677  0.2129032   1.24000
#confusion matrix for UBCF
results_UBCF <- evaluate(x = eval_sets, method = model_to_evaluate_UBCF, n =seq(10, 100, 10))
## UBCF run fold/sample [model time/prediction time]
##   1  [0.03sec/2.33sec] 
##   2  [0.03sec/2.19sec] 
##   3  [0.04sec/2.3sec] 
##   4  [0.04sec/2.23sec] 
##   5  [0.03sec/2.28sec]
indices_summed_UBCF <- Reduce("+", getConfusionMatrix(results_UBCF))[, columns_to_sum]
knitr::kable(indices_summed_UBCF , format = "html",caption = "Table 2b: Confusion Matrix of UBCF") %>%
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover"))
Table 2b: Confusion Matrix of UBCF

  n       TP        FP        FN        TN
 10 19.36258  30.63742 65.260645 309.73935
 20 34.12258  65.87742 50.500645 274.49935
 30 44.92774 105.07226 39.695484 235.30452
 40 53.69290 146.30710 30.930323 194.06968
 50 61.73161 188.26839 22.891613 152.10839
 60 68.96000 231.04000 15.663226 109.33677
 70 76.02452 273.97548  8.598710  66.40129
 80 82.20903 317.79097  2.414194  22.58581
 90 84.62323 340.37677  0.000000   0.00000
100 84.62323 340.37677  0.000000   0.00000
#confusion matrix for SVD
results_SVD <- evaluate(x = eval_sets, method = model_to_evaluate_SVD, n =seq(10, 100, 10))
## SVD run fold/sample [model time/prediction time]
##   1  [0.09sec/0.11sec] 
##   2  [0.09sec/0.11sec] 
##   3  [0.05sec/0.11sec] 
##   4  [0.06sec/0.27sec] 
##   5  [0.08sec/0.11sec]
indices_summed_SVD <- Reduce("+", getConfusionMatrix(results_SVD))[, columns_to_sum]
knitr::kable(indices_summed_SVD  , format = "html",caption = "Table 2c: Confusion Matrix of SVD") %>%
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover"))
Table 2c: Confusion Matrix of SVD

  n       TP        FP        FN        TN
 10 17.43097  32.56903 67.192258 307.80774
 20 31.18452  68.81548 53.438710 271.56129
 30 42.04387 107.95613 42.579355 232.42065
 40 51.51613 148.48387 33.107097 191.89290
 50 59.91871 190.08129 24.704516 150.29548
 60 67.68774 232.31226 16.935484 108.06452
 70 74.95871 275.04129  9.664516  65.33548
 80 81.87226 318.12774  2.750968  22.24903
 90 84.62323 340.37677  0.000000   0.00000
100 84.62323 340.37677  0.000000   0.00000
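Precision and recall can be derived directly from the pooled counts in Tables 2a-2c (a sketch using prec_rec, a hypothetical helper not in the original code):

# precision = TP / (TP + FP), recall = TP / (TP + FN), computed from the counts summed over the 5 folds
prec_rec <- function(cm) cbind(precision = cm[, "TP"] / (cm[, "TP"] + cm[, "FP"]),
                               recall    = cm[, "TP"] / (cm[, "TP"] + cm[, "FN"]))
prec_rec(indices_summed_UBCF)  # rows correspond to the list lengths n = 10, 20, ..., 100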

Overall Model performance

# creating a list of models to try out
models <- list(
  IBCF = list(name = "IBCF", param = list(method = "cosine")),
  UBCF = list(name = "UBCF", param = list(method = "cosine")),
  SVD  = list(name = "SVD",  param = list(k = 20))) # k = 20 latent factors, an arbitrary choice for truncating the SVD

nRecommendations<-c(1,5,seq(10,90,10))
results<-evaluate(x=eval_sets,method = models,n=nRecommendations)
## IBCF run fold/sample [model time/prediction time]
##   1  [0.2sec/0.12sec] 
##   2  [0.2sec/0.15sec] 
##   3  [0.2sec/0.13sec] 
##   4  [0.19sec/0.14sec] 
##   5  [0.19sec/0.14sec] 
## UBCF run fold/sample [model time/prediction time]
##   1  [0.01sec/2.3sec] 
##   2  [0.03sec/2.3sec] 
##   3  [0.03sec/2.24sec] 
##   4  [0.03sec/2.26sec] 
##   5  [0.02sec/2.19sec] 
## SVD run fold/sample [model time/prediction time]
##   1  [0.13sec/0.09sec] 
##   2  [0.25sec/0.09sec] 
##   3  [0.11sec/0.09sec] 
##   4  [0.09sec/0.24sec] 
##   5  [0.09sec/0.09sec]
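Rather than summing, the per-fold confusion matrices can also be averaged with recommenderlab's avg() helper (a sketch; avg_conf is just an illustrative variable name):

# one averaged confusion matrix per model, with entries averaged over the 5 folds
avg_conf <- lapply(results, avg)
avg_conf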

ROC curve

par(bg="lightgray")
plot(results, annotate = TRUE ,type="l",legend = "bottomright")
title( "Fig2a: ROC curves")

Precision-Recall plot

par(bg="lightgray")
plot(results, "prec/rec", annotate = TRUE,type="l",legend = "bottomright")
title( "Fig 2b:Precision-recall")


Observation 3:


Diversity

matrix <- as(ratings, "matrix")
lowRatingsInd <- which(matrix < 1, arr.ind = TRUE) # row/column indices of jokes with poor ratings (below 1)
# change the ratings mix by raising 10% of these low ratings
lowRatingsIndchangeSize <- round(nrow(lowRatingsInd) * 0.1)
lowRatingsIndToChange <- sample(nrow(lowRatingsInd), lowRatingsIndchangeSize)

matrix[lowRatingsInd[lowRatingsIndToChange, , drop = FALSE]] <- 5 # turn the sampled low ratings into ratings of 5

dataDiversity <- as(as.matrix(matrix), "realRatingMatrix")
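A quick check (a sketch, not in the original analysis) that the injection actually shifted the rating mix: the share of "good" ratings (>= 5) before versus after the change.

mean(as(ratings, "matrix") >= 5, na.rm = TRUE) # original data
mean(matrix >= 5, na.rm = TRUE)                # after raising 10% of the low ratings to 5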

Repeating the k-fold cross-validation as before

eval_sets_diversity <- evaluationScheme(data = dataDiversity , method = "cross-validation", k = n_fold, given = items_to_keep, goodRating = 5)#still considering 5 to be "good" ratings

Using the UBCF model only

model_to_evaluate_UBCF <- "UBCF"
model_parameters_UBCF <- NULL

eval_recommender_UBCF_diversity <- Recommender(data = getData(eval_sets_diversity , "train"),method = model_to_evaluate_UBCF, parameter = model_parameters_UBCF)

items_to_recommend <- 10

eval_prediction_UBCF_diversity  <- predict(object = eval_recommender_UBCF_diversity, newdata =
getData(eval_sets_diversity , "known"), n = items_to_recommend, type = "ratings")

evalAccuracy_diversity<-calcPredictionAccuracy(x=eval_prediction_UBCF_diversity,data=getData(eval_sets_diversity,"unknown"),byUser=FALSE)
# RMSE of ratings with more diversity in the ratings mix
print("Accuracy metrics of Data with diversity included:")
## [1] "Accuracy metrics of Data with diversity included:"
evalAccuracy_diversity
##      RMSE       MSE       MAE 
##  4.618249 21.328225  3.665731
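A side-by-side view (a sketch) of the aggregated UBCF accuracy on the original data (from Table 1) versus the data with the altered ratings mix:

rbind(UBCF_original = accuracy["UBCF", ], UBCF_diversity = evalAccuracy_diversity)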
models11 <- list(
  UBCF = list(name = "UBCF", param = list(method = "cosine")))

nRecommendations<-c(1,5,seq(10,90,10))
results11<-evaluate(x=eval_sets_diversity,method = models11,n=nRecommendations)
## UBCF run fold/sample [model time/prediction time]
##   1  [0.02sec/2.13sec] 
##   2  [0.03sec/2.3sec] 
##   3  [0.03sec/2.27sec] 
##   4  [0.02sec/2.1sec] 
##   5  [0.01sec/2.11sec]
# plot the ROC curve
par(bg="lightgray")
plot(results11, annotate = TRUE ,type="l",legend = "topleft",col ="red")
title( "Fig2c: ROC curve of Data set with Diversity")


Observation 4:


Summary

As in the previous project, the evaluation steps followed the procedure in Chapter 4 of the book. K-fold cross-validation was chosen as the accuracy evaluation method because it ensures that every section of the dataset is used for both training and testing, giving a more complete test harness in which all data points are accounted for. The scheme used 5 folds, 15 given items per test user, and a minimum "good" rating threshold of 5, and three models were evaluated as test cases:

1) IBCF Model

2) UBCF Model

3) SVD Model

The UBCF model was the strongest of the three, with the lowest RMSE and the best ROC and precision-recall curves. When more diversity was introduced by raising a sample of low ratings to 5, the RMSE degraded slightly, and the ROC curve in Fig 2c flattened relative to the original data.

Conclusion

The evaluation techniques above have proven themselves academically in an offline setting because of their measurability; RMSE, for instance, is easy to compute. Once a recommender goes into production online, however, other measures become more critical[1]. From a business perspective, click-through rate (CTR) and conversion rate (CR) are more in tune with the business's bottom line. In an A/B test I would use these two metrics in tandem, monitoring CTR and CR concurrently: CTR measures traffic, while CR measures how much of that traffic converts into business dollars. In other words, the conversion rate counts the recommendations that made both your boss and your user happy.
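To make the two metrics concrete, here is how they would be computed from online interaction logs; the numbers below are purely illustrative, not results from this project:

impressions <- 10000   # times a recommendation was shown (hypothetical)
clicks      <- 320     # times a shown recommendation was clicked (hypothetical)
conversions <- 24      # clicks that ended in a purchase or signup (hypothetical)
c(CTR = clicks / impressions, CR = conversions / clicks)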

Furthermore, one quick strategy for improving the evaluation environment is to use the right training data in the design phase; two things that can be done to the online training data to quickly improve a recommender system are[2]:


References:

  1. https://medium.com/recombee-blog/evaluating-recommender-systems-choosing-the-best-one-for-your-business-c688ab781a35

  2. https://towardsdatascience.com/evaluation-metrics-for-recommender-systems-df56c6611093