library(Matrix)
library(reshape2)
library(data.table)
library(tidyr)
library(dplyr)
library(kableExtra)
library("scales")
library("recommenderlab")
library(tidytext)
library(psych)
library(knitr)
library(ggplot2)
require(ggthemes)
Using the Jester5k dataset this time.
Per Chapter 3 of the book "Building a Recommendation System with R".
data(Jester5k)
Jester5k
## 5000 x 100 rating matrix of class 'realRatingMatrix' with 362106 ratings.
ratings <- Jester5k[rowCounts(Jester5k) > 50, colCounts(Jester5k) > 100]
ratings
## 3875 x 100 rating matrix of class 'realRatingMatrix' with 314302 ratings.
items_to_keep <- 15 # number of items given per user for evaluation, using the book's recommendation
rating_threshold <- 5 # min rating threshold for a joke to be considered good (Jester ratings range from -10 to 10)
set.seed(123)
n_fold <- 5
eval_sets <- evaluationScheme(data = ratings, method = "cross-validation", k = n_fold, given = items_to_keep, goodRating = rating_threshold)
getData(eval_sets, "train")
## 3100 x 100 rating matrix of class 'realRatingMatrix' with 251394 ratings.
getData(eval_sets, "known")
## 775 x 100 rating matrix of class 'realRatingMatrix' with 11625 ratings.
getData(eval_sets, "unknown")
## 775 x 100 rating matrix of class 'realRatingMatrix' with 51283 ratings.
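As a quick sanity check on the scheme, every user in the "known" set should hold exactly items_to_keep = 15 ratings, which is consistent with the 775 x 15 = 11625 ratings reported above. A minimal sketch using recommenderlab's rowCounts():
# Each user in the "known" set should contribute exactly `given` = 15 ratings
table(rowCounts(getData(eval_sets, "known")))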
model_to_evaluate_IBCF <- "IBCF"
model_parameters_IBCF <- NULL
eval_recommender_IBCF <- Recommender(data = getData(eval_sets, "train"),method = model_to_evaluate_IBCF, parameter = model_parameters_IBCF)
items_to_recommend <- 10
eval_prediction_IBCF <- predict(object = eval_recommender_IBCF, newdata =
getData(eval_sets, "known"), n = items_to_recommend, type = "ratings")
model_to_evaluate_UBCF <- "UBCF"
model_parameters_UBCF <- NULL
eval_recommender_UBCF <- Recommender(data = getData(eval_sets, "train"),method = model_to_evaluate_UBCF, parameter = model_parameters_UBCF)
items_to_recommend <- 10
eval_prediction_UBCF <- predict(object = eval_recommender_UBCF, newdata =
getData(eval_sets, "known"), n = items_to_recommend, type = "ratings")
model_to_evaluate_SVD <- "SVD"
model_parameters_SVD <- NULL
eval_recommender_SVD <- Recommender(data = getData(eval_sets, "train"),method = model_to_evaluate_SVD, parameter = model_parameters_SVD)
items_to_recommend <- 10
eval_prediction_SVD <- predict(object = eval_recommender_SVD, newdata =
getData(eval_sets, "known"), n = items_to_recommend, type = "ratings")
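The same fit-and-predict pattern is repeated for each of the three algorithms above; a small helper could avoid the duplication. This is an optional sketch assuming the eval_sets object created earlier; fit_and_predict is a hypothetical convenience function, not part of recommenderlab.
# Optional helper to avoid repeating the Recommender()/predict() boilerplate for IBCF, UBCF, and SVD
fit_and_predict <- function(method, eval_scheme, n = 10) {
  rec <- Recommender(data = getData(eval_scheme, "train"), method = method)
  predict(rec, newdata = getData(eval_scheme, "known"), n = n, type = "ratings")
}
# e.g. eval_prediction_SVD <- fit_and_predict("SVD", eval_sets)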
# RMSEs of the 3 algorithms
eval_accuracy_IBCF <- calcPredictionAccuracy( x = eval_prediction_IBCF, data = getData(eval_sets, "unknown"), byUser =FALSE)
eval_accuracy_UBCF <- calcPredictionAccuracy( x = eval_prediction_UBCF, data = getData(eval_sets, "unknown"), byUser =FALSE)
eval_accuracy_SVD <- calcPredictionAccuracy( x = eval_prediction_SVD, data = getData(eval_sets, "unknown"), byUser =FALSE)
#comparing the 3 together
accuracy <- rbind(eval_accuracy_IBCF , eval_accuracy_UBCF )
accuracy <- rbind(accuracy, eval_accuracy_SVD)
rownames(accuracy) <- c("IBCF", "UBCF", "SVD")
knitr::kable(accuracy, format = "html", caption = "Table 1: Accuracy Metrics of the 3 Models") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover"))
|      |     RMSE |      MSE |      MAE |
|:-----|---------:|---------:|---------:|
| IBCF | 5.223588 | 27.28587 | 4.066770 |
| UBCF | 4.485514 | 20.11984 | 3.505529 |
| SVD  | 4.604490 | 21.20132 | 3.631866 |
Observation 1: Per Table 1, UBCF has the lowest RMSE, MSE, and MAE of the three models. The distributions of per-user RMSE for each model are plotted below.
# Chart of IBCF RMSE per user
eval_accuracy_IBCF <- calcPredictionAccuracy( x = eval_prediction_IBCF, data = getData(eval_sets, "unknown"), byUser =TRUE)
qplot(eval_accuracy_IBCF[, "RMSE"], geom = "histogram", binwidth = 0.5, fill = I("orange"), xlab = "RMSE") + ggtitle("Fig1a: Distribution of the IBCF's RMSE (k-fold=5)") + theme_economist()
# Chart of UBCF RMSE per user
eval_accuracy_UBCF <- calcPredictionAccuracy( x = eval_prediction_UBCF, data = getData(eval_sets, "unknown"), byUser =TRUE)
qplot(eval_accuracy_UBCF[, "RMSE"], geom = "histogram", binwidth = 0.5, fill = I("orange"), xlab = "RMSE") + ggtitle("Fig1b: Distribution of the UBCF's RMSE (k-fold=5)") + theme_economist()
# Chart of SVD RMSE per user
eval_accuracy_SVD <- calcPredictionAccuracy( x = eval_prediction_SVD, data = getData(eval_sets, "unknown"), byUser =TRUE)
qplot(eval_accuracy_SVD[, "RMSE"], geom = "histogram", binwidth = 0.5, fill = I("orange"), xlab = "RMSE") + ggtitle("Fig1c: Distribution of the SVD's RMSE (k-fold=5)") + theme_economist()
Observation 2:
Confusion Matrix
Accumulating all the cross-validation splits at the same time.
n is the number of jokes recommended to each user; by varying n over a range, we can visualize the performance of the recommender system.
columns_to_sum <- c("TP", "FP", "FN", "TN")
#confusion matrix for IBCF
results_IBCF <- evaluate(x = eval_sets, method = model_to_evaluate_IBCF, n =seq(10, 100, 10))
## IBCF run fold/sample [model time/prediction time]
## 1 [0.2sec/0.11sec]
## 2 [0.2sec/0.3sec]
## 3 [0.19sec/0.14sec]
## 4 [0.17sec/0.13sec]
## 5 [0.2sec/0.16sec]
indices_summed_IBCF <- Reduce("+", getConfusionMatrix(results_IBCF))[, columns_to_sum]
knitr::kable(indices_summed_IBCF , format = "html",caption = "Table 2a: Confusion Matrix of IBCF") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover"))
|   n |        TP |        FP |         FN |        TN |
|----:|----------:|----------:|-----------:|----------:|
|  10 |  9.184516 |  40.81548 | 75.4387097 | 299.56129 |
|  20 | 19.246452 |  80.75355 | 65.3767742 | 259.62323 |
|  30 | 29.752258 | 120.24774 | 54.8709677 | 220.12903 |
|  40 | 40.621935 | 159.37806 | 44.0012903 | 180.99871 |
|  50 | 51.298065 | 198.70194 | 33.3251613 | 141.67484 |
|  60 | 61.775484 | 238.22452 | 22.8477419 | 102.15226 |
|  70 | 71.411613 | 278.57935 | 13.2116129 |  61.79742 |
|  80 | 80.489032 | 319.43742 |  4.1341935 |  20.93935 |
|  90 | 84.410323 | 339.13677 |  0.2129032 |   1.24000 |
| 100 | 84.410323 | 339.13677 |  0.2129032 |   1.24000 |
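From the summed counts in Table 2a, aggregate precision and recall at each value of n follow directly; a quick sketch using the indices_summed_IBCF object computed above:
# Precision = TP / (TP + FP); Recall (TPR) = TP / (TP + FN), at each value of n
precision_IBCF <- indices_summed_IBCF[, "TP"] / (indices_summed_IBCF[, "TP"] + indices_summed_IBCF[, "FP"])
recall_IBCF <- indices_summed_IBCF[, "TP"] / (indices_summed_IBCF[, "TP"] + indices_summed_IBCF[, "FN"])
round(cbind(precision = precision_IBCF, recall = recall_IBCF), 3)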
#confusion matrix for UBCF
results_UBCF <- evaluate(x = eval_sets, method = model_to_evaluate_UBCF, n =seq(10, 100, 10))
## UBCF run fold/sample [model time/prediction time]
## 1 [0.03sec/2.33sec]
## 2 [0.03sec/2.19sec]
## 3 [0.04sec/2.3sec]
## 4 [0.04sec/2.23sec]
## 5 [0.03sec/2.28sec]
indices_summed_UBCF <- Reduce("+", getConfusionMatrix(results_UBCF))[, columns_to_sum]
knitr::kable(indices_summed_UBCF , format = "html",caption = "Table 2b: Confusion Matrix of UBCF") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover"))
|   n |       TP |        FP |        FN |        TN |
|----:|---------:|----------:|----------:|----------:|
|  10 | 19.36258 |  30.63742 | 65.260645 | 309.73935 |
|  20 | 34.12258 |  65.87742 | 50.500645 | 274.49935 |
|  30 | 44.92774 | 105.07226 | 39.695484 | 235.30452 |
|  40 | 53.69290 | 146.30710 | 30.930323 | 194.06968 |
|  50 | 61.73161 | 188.26839 | 22.891613 | 152.10839 |
|  60 | 68.96000 | 231.04000 | 15.663226 | 109.33677 |
|  70 | 76.02452 | 273.97548 |  8.598710 |  66.40129 |
|  80 | 82.20903 | 317.79097 |  2.414194 |  22.58581 |
|  90 | 84.62323 | 340.37677 |  0.000000 |   0.00000 |
| 100 | 84.62323 | 340.37677 |  0.000000 |   0.00000 |
#confusion matrix for SVD
results_SVD <- evaluate(x = eval_sets, method = model_to_evaluate_SVD, n =seq(10, 100, 10))
## SVD run fold/sample [model time/prediction time]
## 1 [0.09sec/0.11sec]
## 2 [0.09sec/0.11sec]
## 3 [0.05sec/0.11sec]
## 4 [0.06sec/0.27sec]
## 5 [0.08sec/0.11sec]
indices_summed_SVD <- Reduce("+", getConfusionMatrix(results_SVD))[, columns_to_sum]
knitr::kable(indices_summed_SVD , format = "html",caption = "Table 2c: Confusion Matrix of SVD") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover"))
|   n |       TP |        FP |        FN |        TN |
|----:|---------:|----------:|----------:|----------:|
|  10 | 17.43097 |  32.56903 | 67.192258 | 307.80774 |
|  20 | 31.18452 |  68.81548 | 53.438710 | 271.56129 |
|  30 | 42.04387 | 107.95613 | 42.579355 | 232.42065 |
|  40 | 51.51613 | 148.48387 | 33.107097 | 191.89290 |
|  50 | 59.91871 | 190.08129 | 24.704516 | 150.29548 |
|  60 | 67.68774 | 232.31226 | 16.935484 | 108.06452 |
|  70 | 74.95871 | 275.04129 |  9.664516 |  65.33548 |
|  80 | 81.87226 | 318.12774 |  2.750968 |  22.24903 |
|  90 | 84.62323 | 340.37677 |  0.000000 |   0.00000 |
| 100 | 84.62323 | 340.37677 |  0.000000 |   0.00000 |
# Create a list of models to try out
models <- list(
  IBCF = list(name = "IBCF", param = list(method = "cosine")),
  UBCF = list(name = "UBCF", param = list(method = "cosine")),
  SVD  = list(name = "SVD",  param = list(k = 20))) # truncate the SVD to k = 20 latent factors (an arbitrary choice)
nRecommendations<-c(1,5,seq(10,90,10))
results<-evaluate(x=eval_sets,method = models,n=nRecommendations)
## IBCF run fold/sample [model time/prediction time]
## 1 [0.2sec/0.12sec]
## 2 [0.2sec/0.15sec]
## 3 [0.2sec/0.13sec]
## 4 [0.19sec/0.14sec]
## 5 [0.19sec/0.14sec]
## UBCF run fold/sample [model time/prediction time]
## 1 [0.01sec/2.3sec]
## 2 [0.03sec/2.3sec]
## 3 [0.03sec/2.24sec]
## 4 [0.03sec/2.26sec]
## 5 [0.02sec/2.19sec]
## SVD run fold/sample [model time/prediction time]
## 1 [0.13sec/0.09sec]
## 2 [0.25sec/0.09sec]
## 3 [0.11sec/0.09sec]
## 4 [0.09sec/0.24sec]
## 5 [0.09sec/0.09sec]
par(bg="lightgray")
plot(results, annotate = TRUE ,type="l",legend = "bottomright")
title( "Fig2a: ROC curves")
par(bg="lightgray")
plot(results, "prec/rec", annotate = TRUE,type="l",legend = "bottomright")
title( "Fig 2b:Precision-recall")
Observation 3:
Although the confusion matrices over the range of n (number of items recommended) in Table 2 provided the details of the evaluation, a better approach was to plot the ROC and Precision-recall curves. The curves of Fig2a and Fig2b allowed the analyst to quickly tell which model provided the more accurate classifications. In Fig2a, the UBCF and SVD curves exhibited more "arching" away from the 45-degree diagonal line; this meant that, at a given false-positive rate, models UBCF and SVD captured more True Positives relative to False Positives than model IBCF.
The 45-degree diagonal is the baseline curve: its AUC is 0.5, and a classifier on that line is no better than flipping a coin. The IBCF curve in this case sat close to that baseline.
Both Fig2a and Fig2b showed UBCF to be the superior model, which was consistent with the RMSE calculations: UBCF had the lowest RMSE of the 3 models.
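As a rough numerical complement to Fig2a, the area under each ROC curve can be approximated from the fold-averaged TPR/FPR points with the trapezoidal rule. This is a minimal sketch assuming the results object produced by evaluate() above; auc_trapz is a hypothetical helper, not a recommenderlab function.
# Approximate AUC per model by trapezoidal integration of the averaged ROC points
auc_trapz <- function(cm) {
  fpr <- c(0, cm[, "FPR"], 1) # anchor the curve at (0, 0) and (1, 1)
  tpr <- c(0, cm[, "TPR"], 1)
  sum(diff(fpr) * (head(tpr, -1) + tail(tpr, -1)) / 2)
}
sapply(lapply(results, avg), auc_trapz) # avg() returns the fold-averaged confusion matrix for each model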
To increase diversity, items with poor ratings were forced to show up in the recommender's results.
10% of the ratings below 1 were changed to 5 (our so-called good rating).
matrix <- as(ratings, "matrix")
lowRatingsInd <- which(matrix < 1, arr.ind = TRUE) # (row, col) indices of jokes with poor ratings (below 1)
# change 10% of those low ratings; nrow() counts the low ratings, since lowRatingsInd has one row per rating
lowRatingsIndchangeSize <- round(nrow(lowRatingsInd) * 0.1, 0)
lowRatingsIndToChange <- sample(nrow(lowRatingsInd), lowRatingsIndchangeSize)
for (i in 1:length(lowRatingsIndToChange)) {
  matrix[lowRatingsInd[lowRatingsIndToChange[i], 1], lowRatingsInd[lowRatingsIndToChange[i], 2]] <- 5 # overwrite the low rating with a "good" rating of 5
}
dataDiversity <- as(as.matrix(matrix), "realRatingMatrix")
eval_sets_diversity <- evaluationScheme(data = dataDiversity , method = "cross-validation", k = n_fold, given = items_to_keep, goodRating = 5)#still considering 5 to be "good" ratings
model_to_evaluate_UBCF <- "UBCF"
model_parameters_UBCF <- NULL
eval_recommender_UBCF_diversity <- Recommender(data = getData(eval_sets_diversity , "train"),method = model_to_evaluate_UBCF, parameter = model_parameters_UBCF)
items_to_recommend <- 10
eval_prediction_UBCF_diversity <- predict(object = eval_recommender_UBCF_diversity, newdata =
getData(eval_sets_diversity , "known"), n = items_to_recommend, type = "ratings")
evalAccuracy_diversity<-calcPredictionAccuracy(x=eval_prediction_UBCF_diversity,data=getData(eval_sets_diversity,"unknown"),byUser=FALSE)
# RMSE of the data set with more diversity in the ratings mix
print("Accuracy metrics of Data with diversity included:")
## [1] "Accuracy metrics of Data with diversity included:"
evalAccuracy_diversity
## RMSE MSE MAE
## 4.618249 21.328225 3.665731
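For a side-by-side view, the aggregate UBCF accuracy on the original data is recomputed below (a small sketch; eval_accuracy_UBCF was overwritten with per-user values for Fig1b, so it cannot be reused directly):
# Recompute aggregate UBCF accuracy on the original data and compare with the diversity run
eval_accuracy_UBCF_overall <- calcPredictionAccuracy(x = eval_prediction_UBCF, data = getData(eval_sets, "unknown"), byUser = FALSE)
rbind(original = eval_accuracy_UBCF_overall, with_diversity = evalAccuracy_diversity)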
models11<-list(
UBCF=list(name="UBCF",param=list(method="cosine")))
nRecommendations<-c(1,5,seq(10,90,10))
results11<-evaluate(x=eval_sets_diversity,method = models11,n=nRecommendations)
## UBCF run fold/sample [model time/prediction time]
## 1 [0.02sec/2.13sec]
## 2 [0.03sec/2.3sec]
## 3 [0.03sec/2.27sec]
## 4 [0.02sec/2.1sec]
## 5 [0.01sec/2.11sec]
# plot the ROC curve
par(bg="lightgray")
plot(results11, annotate = TRUE ,type="l",legend = "topleft",col ="red")
title( "Fig2c: ROC curve of Data set with Diversity")
Observation 4:
After boosting 10% of the poorly rated jokes to a "good" rating of 5 in the original data set, UBCF's RMSE increased from about 4.49 to about 4.62.
The RMSE degraded when diversity was introduced into the original "refined" data set. On the other hand, including more diverse items is a good thing, as it is a more realistic depiction of data sets in the real world.
Again for this project, the evaluation steps were performed per Chapter 4 of the book. K-fold cross-validation was the accuracy-evaluation scheme of choice, as it ensures that all sections of the dataset are used for both training and testing; it also makes for a more complete test harness, since all data points are accounted for. A 5-fold scheme with 15 given items per user and a minimum "good"-rating threshold of 5 was selected for this project, and 3 models were used as test cases:
1) IBCF Model
2) UBCF Model
3) SVD Model
The UBCF Model was found to be the best of the 3, with the better RMSE and the better ROC and Precision-Recall curves. When diversity in the form of low-rated items was thrown into the mix, the RMSE degraded slightly and the ROC curve of Fig 2c flattened after the diversity was included.
The above evaluation techniques have proven themselves well academically on an offline basis because of their measurability; for instance, it is easy to compute RMSE. Once one goes into production online, other measures become more critical [1]. From a business perspective, Click-Through Rate (CTR) and Conversion Rate (CR) are more in tune with the business's bottom line. In my A/B testing, I would use these 2 metrics in tandem, monitoring CTRs and CRs concurrently: CTR shows traffic, while CR shows the conversion of that traffic into business dollars. In other words, conversion rate = the number of recommendations that made both your boss and your user happy!
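As a purely hypothetical illustration of tracking those two metrics together in an A/B test (all counts below are made up for the example):
# Hypothetical A/B counts: variant A = current recommender, variant B = UBCF candidate
ab <- data.frame(variant = c("A", "B"),
                 impressions = c(50000, 50000),
                 clicks = c(1500, 1800),
                 conversions = c(120, 170))
ab$CTR <- ab$clicks / ab$impressions # click-through rate: clicks per impression
ab$CR <- ab$conversions / ab$clicks # conversion rate: conversions per click
ab
prop.test(ab$clicks, ab$impressions) # two-proportion test: is B's CTR lift statistically meaningful?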
Furthermore, one quick strategy to improve the evaluation environment is to use the right training data in the design phase. Two things can be done to the online training data to quickly improve a recommender system [2] (a sketch of the first strategy follows the list):
1) Remove popular items from the training data. (This is appropriate in cases where users can discover these items on their own and may not find these recommendations useful.)
2) Scale item ratings by the user's value, such as average transaction value. This can help a model learn to recommend items that lead to loyal or high-value customers.
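A minimal sketch of strategy 1, dropping the most popular jokes (here the top 10% by number of ratings, an arbitrary illustrative cutoff) from the ratings matrix before training:
# Drop the most-rated (most "popular") jokes before building the recommender
popularity <- colCounts(ratings) # how many users rated each joke
ratings_no_popular <- ratings[, popularity < quantile(popularity, 0.90)] # keep all but the top decile
ratings_no_popular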
References: