Recommender System Evaluation - Accuracy and Beyond

Recommendation system for music artists based on user preferences according to the songs listening frequencies (called weights) used as ratings

This dataset contains social networking, tagging, and music artist listening information from a set of 2K users from Last.fm online music system. The dataset is released in the framework of the 2nd International Workshop on Information Heterogeneity and Fusion in Recommender Systems (HetRec 2011) http://ir.ii.uam.es/hetrec2011 http://www.last.fm

Data Statistics: 1892 users 17632 artists 92834 user-listened artist relations, i.e. tuples [user, artist, listeningCount]

Data collection, exploration and preparation
# Loading datasets, Package Installation

artists <- read.delim('https://raw.githubusercontent.com/humbertohpgit/MSDS3rdSem_DATA612/master/artists.dat', header = TRUE, sep="\t")
artists <- artists[,1:2]
user_artists <- read.delim('https://raw.githubusercontent.com/humbertohpgit/MSDS3rdSem_DATA612/master/user_artists.dat', header = TRUE, sep="\t")

#install.packages("tidyverse")
#install.packages("recommenderlab")
#install.packages("recosystem")
library(dplyr)
library(tidyr)
library(recommenderlab)
library(recosystem)

#Exploring data sets Transforming weights to ratings
head(artists)
##   id              name
## 1  1      MALICE MIZER
## 2  2   Diary of Dreams
## 3  3 Carpathian Forest
## 4  4      Moi dix Mois
## 5  5       Bella Morte
## 6  6         Moonspell
head(user_artists)
##   userID artistID weight
## 1      2       51  13883
## 2      2       52  11690
## 3      2       53  11351
## 4      2       54  10300
## 5      2       55   8983
## 6      2       56   6152
# Reduce data set to the most relevant users (with minimum listening frequency = 10) and the top artists based on total listening frequency

user_artists_mat <- user_artists %>% spread(key=artistID,value=weight)

artist_freq <- colSums(user_artists_mat[,-1],na.rm = TRUE)

top_user_artist <- user_artists_mat[,order(artist_freq,decreasing = TRUE)[1:100]]
list_freq <- apply(top_user_artist,1,function(x) return(sum(!is.na(x))))
top_user_artist <-  top_user_artist[list_freq > 10,]
dim(top_user_artist)
## [1]  36 100
# Transforming weights to ratings

quantile(user_artists$weight, na.rm=TRUE) #determine ratings limits
##     0%    25%    50%    75%   100% 
##      1    107    260    614 352698
quantile(user_artists$weight, probs=0.90,na.rm=TRUE)
##  90% 
## 1387
user_artist_trans <- top_user_artist
user_artist_trans[user_artist_trans$weight < 107] <- 1
user_artist_trans[user_artist_trans >= 107 & user_artist_trans < 260] <- 2
user_artist_trans[user_artist_trans >= 260 & user_artist_trans < 614] <- 3
user_artist_trans[user_artist_trans >= 614 & user_artist_trans < 1387] <- 4
user_artist_trans[user_artist_trans >= 1387] <- 5

head(user_artist_trans,3)
##    288 71 88 291 497 66 287 700 226 299 332 343 377 678 294 510 460 485
## 4   NA NA NA  NA  NA NA  NA  NA   2  NA  NA  NA  NA  NA  NA  NA  NA  NA
## 6    5 NA NA   5  NA NA  NA  NA  NA   5   3  NA  NA  NA   5  NA  NA  NA
## 16   2 NA NA  NA  NA NA  NA   2  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
##    189 162 54 153 465 256 706 916 791 50 64 474 202 156 206 197 376 290
## 4   NA  NA NA  NA  NA  NA  NA  NA  NA NA NA  NA   3  NA   3   3  NA  NA
## 6   NA  NA NA  NA  NA  NA  NA  NA  NA NA NA  NA  NA  NA  NA  NA  NA   5
## 16  NA  NA NA  NA  NA  NA  NA  NA  NA NA NA  NA  NA  NA  NA  NA  NA  NA
##    613 172 502 686 902 301 186 1411 1097 1671 457 228 233 305 55 324 532
## 4   NA  NA  NA  NA  NA  NA  NA   NA   NA   NA  NA   2   2  NA NA  NA  NA
## 6   NA  NA  NA  NA  NA   5  NA   NA   NA   NA  NA  NA  NA   4  5   3  NA
## 16  NA  NA  NA   3  NA  NA  NA   NA   NA   NA  NA  NA  NA  NA NA   3  NA
##    293 232 208 229 454 158 227 298 680 703 2043 417 1248 598 1368 348 423
## 4   NA   2   2   2  NA  NA   2  NA  NA  NA   NA  NA   NA  NA   NA  NA  NA
## 6    5  NA  NA  NA  NA  NA  NA   5  NA  NA   NA  NA   NA  NA   NA  NA  NA
## 16  NA  NA  NA   3  NA  NA  NA   2   4   2   NA  NA   NA  NA   NA  NA  NA
##    1103 297 1458 317 284 1245 315 219 543 679 958 194 688 309 685 917 411
## 4    NA  NA   NA  NA  NA   NA  NA   2  NA  NA  NA   4  NA  NA  NA  NA  NA
## 6    NA   5   NA   4  NA   NA   4  NA  NA  NA  NA  NA  NA   4  NA  NA  NA
## 16   NA  NA   NA  NA  NA   NA  NA  NA  NA   5  NA  NA   3  NA   3  NA  NA
##    438 216 2101 705 440 160 2178  6 867 888 330 1047 463
## 4   NA   2   NA  NA  NA  NA   NA NA  NA  NA  NA   NA  NA
## 6   NA  NA   NA  NA  NA  NA   NA NA  NA  NA   3   NA  NA
## 16  NA  NA   NA  NA  NA  NA   NA NA  NA  NA  NA   NA  NA
# Create sparse matrix and the corresponding Train and Test data sets

user_artist_trans_mat <- as(as.matrix(user_artist_trans),"realRatingMatrix")
user_artist_trans_mat
## 36 x 100 rating matrix of class 'realRatingMatrix' with 422 ratings.
set.seed(100)
eval_sch <- evaluationScheme(data = user_artist_trans_mat, method = "split", train = .8, given = 3, goodRating = 3)
recommenderlab - Different Recommender Methods
#Exploring parameters of recommendation methods in recommenderlab

recommender_models <- recommenderRegistry$get_entries(dataType = "realRatingMatrix")
names(recommender_models)
## [1] "ALS_realRatingMatrix"          "ALS_implicit_realRatingMatrix"
## [3] "IBCF_realRatingMatrix"         "POPULAR_realRatingMatrix"     
## [5] "RANDOM_realRatingMatrix"       "RERECOMMEND_realRatingMatrix" 
## [7] "SVD_realRatingMatrix"          "SVDF_realRatingMatrix"        
## [9] "UBCF_realRatingMatrix"
lapply(recommender_models, "[[", "description")
## $ALS_realRatingMatrix
## [1] "Recommender for explicit ratings based on latent factors, calculated by alternating least squares algorithm."
## 
## $ALS_implicit_realRatingMatrix
## [1] "Recommender for implicit data based on latent factors, calculated by alternating least squares algorithm."
## 
## $IBCF_realRatingMatrix
## [1] "Recommender based on item-based collaborative filtering."
## 
## $POPULAR_realRatingMatrix
## [1] "Recommender based on item popularity."
## 
## $RANDOM_realRatingMatrix
## [1] "Produce random recommendations (real ratings)."
## 
## $RERECOMMEND_realRatingMatrix
## [1] "Re-recommends highly rated items (real ratings)."
## 
## $SVD_realRatingMatrix
## [1] "Recommender based on SVD approximation with column-mean imputation."
## 
## $SVDF_realRatingMatrix
## [1] "Recommender based on Funk SVD with gradient descend."
## 
## $UBCF_realRatingMatrix
## [1] "Recommender based on user-based collaborative filtering."
# Create recommender model - User-based Collaborative Filtering (UCBF)

recommender_model_ubcf <- Recommender(data=getData(eval_sch,"train"),method="UBCF")

# Recommendation for the first 5 users

recom_ubcf <- predict(recommender_model_ubcf, getData(eval_sch,"known")[1:5], n=5, type = "topNList")
recom_ubcf_list <- as(recom_ubcf, "list") #convert recommenderlab object to readable list
recom_ubcf_list 
## $`4`
## character(0)
## 
## $`16`
## [1] "685"  "1047" "88"   "497"  "66"  
## 
## $`19`
## [1] "330" "465" "317" "680" "332"
## 
## $`747`
## [1] "497" "66"  "287" "700" "226"
## 
## $`793`
## [1] "290" "680" "332" "465" "543"
# Get artist names

recom_ubcf_result <- list()
for (i in c(1:5)){
 recom_ubcf_result[[i]] <- artists[artists$id %in% as.vector(recom_ubcf_list[[i]]),2]
}

# Top 5 recommendations for the first 5 users
recom_ubcf_result
## [[1]]
## factor(0)
## 14035 Levels: -123 min. -OZ- -t de sangre !!! !deladap !DISTAIN ... ZZ Top
## 
## [[2]]
## [1] Faithless                       Gorillaz                       
## [3] Papa Roach                      Nick Jonas & The Administration
## [5] McFly                          
## 14035 Levels: -123 min. -OZ- -t de sangre !!! !deladap !DISTAIN ... ZZ Top
## 
## [[3]]
## [1] Keri Hilson     T.I.            Kelly Rowland   Jessica Simpson
## [5] The Veronicas  
## 14035 Levels: -123 min. -OZ- -t de sangre !!! !deladap !DISTAIN ... ZZ Top
## 
## [[4]]
## [1] Faithless               Queens of the Stone Age Monica                 
## [4] Papa Roach              Boyce Avenue           
## 14035 Levels: -123 min. -OZ- -t de sangre !!! !deladap !DISTAIN ... ZZ Top
## 
## [[5]]
## [1] Jordin Sparks      Kelly Rowland      Jessica Simpson   
## [4] Nicole Scherzinger The Veronicas     
## 14035 Levels: -123 min. -OZ- -t de sangre !!! !deladap !DISTAIN ... ZZ Top
# Evaluation

# Prediction of ratings for all users

recom_ubcf <- predict(recommender_model_ubcf, getData(eval_sch,"known"), type = "ratings")

accuracy_ubcf <- calcPredictionAccuracy(x = recom_ubcf, data = getData(eval_sch, "unknown"), given=3, goodRating=3)

accuracy_ubcf
##       RMSE        MSE        MAE 
##  21.586661 465.983924   6.846728
# Create recommender model - Alternating least squares (ALS)

recommender_model_als <- Recommender(data=getData(eval_sch,"train"),method="ALS")

# Recommendation for the first 5 users

recom_als <- predict(recommender_model_als, getData(eval_sch,"known")[1:5], n=5, type = "topNList")
recom_als_list <- as(recom_als, "list") #convert recommenderlab object to readable list
recom_als_list 
## $`4`
## [1] "497" "66"  "287" "226" "343"
## 
## $`16`
## [1] "497" "66"  "287" "226" "343"
## 
## $`19`
## [1] "497" "66"  "287" "226" "343"
## 
## $`747`
## [1] "290" "497" "66"  "287" "226"
## 
## $`793`
## [1] "497" "66"  "287" "226" "343"
# Get artist names

recom_als_result <- list()
for (i in c(1:5)){
 recom_als_result[[i]] <- artists[artists$id %in% as.vector(recom_als_list[[i]]),2]
}

# Top 5 recommendations for the first 5 users
recom_als_result
## [[1]]
## [1] Faithless               Queens of the Stone Age Monica                 
## [4] Annett Louisan          Papa Roach             
## 14035 Levels: -123 min. -OZ- -t de sangre !!! !deladap !DISTAIN ... ZZ Top
## 
## [[2]]
## [1] Faithless               Queens of the Stone Age Monica                 
## [4] Annett Louisan          Papa Roach             
## 14035 Levels: -123 min. -OZ- -t de sangre !!! !deladap !DISTAIN ... ZZ Top
## 
## [[3]]
## [1] Faithless               Queens of the Stone Age Monica                 
## [4] Annett Louisan          Papa Roach             
## 14035 Levels: -123 min. -OZ- -t de sangre !!! !deladap !DISTAIN ... ZZ Top
## 
## [[4]]
## [1] Faithless               Queens of the Stone Age Monica                 
## [4] Jordin Sparks           Papa Roach             
## 14035 Levels: -123 min. -OZ- -t de sangre !!! !deladap !DISTAIN ... ZZ Top
## 
## [[5]]
## [1] Faithless               Queens of the Stone Age Monica                 
## [4] Annett Louisan          Papa Roach             
## 14035 Levels: -123 min. -OZ- -t de sangre !!! !deladap !DISTAIN ... ZZ Top
# Evaluation

# Prediction of ratings for all users

recom_als <- predict(recommender_model_als, getData(eval_sch,"known"), type = "ratings")

accuracy_als <- calcPredictionAccuracy(x = recom_als, data = getData(eval_sch, "unknown"), given=3, goodRating=3)

accuracy_als
##       RMSE        MSE        MAE 
##  16.205748 262.626279   4.973237
recommenderlab - Recommender Methods Comparison
comp_methods <- rbind(accuracy_ubcf, accuracy_als)
rownames(comp_methods) <- c("UBCF", "ALS")
library(knitr)
kable(comp_methods)
RMSE MSE MAE
UBCF 21.58666 465.9839 6.846728
ALS 16.20575 262.6263 4.973236
# ROC & Precision & Recall

rec_methods <- list(
  "UBCF" = list(name = "UBCF", param = NULL),
  "ALS" = list(name = "ALS", param = NULL))
eval_results <- evaluate(x = eval_sch, method = rec_methods, n = c(1, 5, 10, 15, 20,25,30))
## UBCF run fold/sample [model time/prediction time]
##   1  [0sec/0.03sec] 
## ALS run fold/sample [model time/prediction time]
##   1  [0sec/0.58sec]
plot(eval_results, 
     annotate = TRUE, legend = "topleft", main = "ROC Curve")

plot(eval_results, "prec/rec", 
     annotate = TRUE, legend = "topright", main = "Precision-Recall")

Observations

In terms of model method accuracy, Alternate Least Square (ALS) provides better RMSE and MSE than the UBCF. Interestingly in the ROC curve, UBCF appears to perform better that ALS, meaning that in general the classification power of UBCF is better even though that in accuracy terms, ALS, shows less deviation from the actual rating values when it classifies correctly