Recommendation system for music artists based on user preferences according to the songs listening frequencies (called weights) used as ratings
This dataset contains social networking, tagging, and music artist listening information from a set of 2K users from Last.fm online music system. The dataset is released in the framework of the 2nd International Workshop on Information Heterogeneity and Fusion in Recommender Systems (HetRec 2011) http://ir.ii.uam.es/hetrec2011 http://www.last.fm
Data Statistics: 1892 users 17632 artists 92834 user-listened artist relations, i.e. tuples [user, artist, listeningCount]
# Loading datasets, Package Installation
artists <- read.delim('https://raw.githubusercontent.com/humbertohpgit/MSDS3rdSem_DATA612/master/artists.dat', header = TRUE, sep="\t")
artists <- artists[,1:2]
user_artists <- read.delim('https://raw.githubusercontent.com/humbertohpgit/MSDS3rdSem_DATA612/master/user_artists.dat', header = TRUE, sep="\t")
#install.packages("tidyverse")
#install.packages("recommenderlab")
#install.packages("recosystem")
library(dplyr)
library(tidyr)
library(recommenderlab)
library(recosystem)
#Exploring data sets Transforming weights to ratings
head(artists)
## id name
## 1 1 MALICE MIZER
## 2 2 Diary of Dreams
## 3 3 Carpathian Forest
## 4 4 Moi dix Mois
## 5 5 Bella Morte
## 6 6 Moonspell
head(user_artists)
## userID artistID weight
## 1 2 51 13883
## 2 2 52 11690
## 3 2 53 11351
## 4 2 54 10300
## 5 2 55 8983
## 6 2 56 6152
# Reduce data set to the most relevant users (with minimum listening frequency = 10) and the top artists based on total listening frequency
user_artists_mat <- user_artists %>% spread(key=artistID,value=weight)
artist_freq <- colSums(user_artists_mat[,-1],na.rm = TRUE)
top_user_artist <- user_artists_mat[,order(artist_freq,decreasing = TRUE)[1:100]]
list_freq <- apply(top_user_artist,1,function(x) return(sum(!is.na(x))))
top_user_artist <- top_user_artist[list_freq > 10,]
dim(top_user_artist)
## [1] 36 100
# Transforming weights to ratings
quantile(user_artists$weight, na.rm=TRUE) #determine ratings limits
## 0% 25% 50% 75% 100%
## 1 107 260 614 352698
quantile(user_artists$weight, probs=0.90,na.rm=TRUE)
## 90%
## 1387
user_artist_trans <- top_user_artist
user_artist_trans[user_artist_trans$weight < 107] <- 1
user_artist_trans[user_artist_trans >= 107 & user_artist_trans < 260] <- 2
user_artist_trans[user_artist_trans >= 260 & user_artist_trans < 614] <- 3
user_artist_trans[user_artist_trans >= 614 & user_artist_trans < 1387] <- 4
user_artist_trans[user_artist_trans >= 1387] <- 5
head(user_artist_trans,3)
## 288 71 88 291 497 66 287 700 226 299 332 343 377 678 294 510 460 485
## 4 NA NA NA NA NA NA NA NA 2 NA NA NA NA NA NA NA NA NA
## 6 5 NA NA 5 NA NA NA NA NA 5 3 NA NA NA 5 NA NA NA
## 16 2 NA NA NA NA NA NA 2 NA NA NA NA NA NA NA NA NA NA
## 189 162 54 153 465 256 706 916 791 50 64 474 202 156 206 197 376 290
## 4 NA NA NA NA NA NA NA NA NA NA NA NA 3 NA 3 3 NA NA
## 6 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA 5
## 16 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## 613 172 502 686 902 301 186 1411 1097 1671 457 228 233 305 55 324 532
## 4 NA NA NA NA NA NA NA NA NA NA NA 2 2 NA NA NA NA
## 6 NA NA NA NA NA 5 NA NA NA NA NA NA NA 4 5 3 NA
## 16 NA NA NA 3 NA NA NA NA NA NA NA NA NA NA NA 3 NA
## 293 232 208 229 454 158 227 298 680 703 2043 417 1248 598 1368 348 423
## 4 NA 2 2 2 NA NA 2 NA NA NA NA NA NA NA NA NA NA
## 6 5 NA NA NA NA NA NA 5 NA NA NA NA NA NA NA NA NA
## 16 NA NA NA 3 NA NA NA 2 4 2 NA NA NA NA NA NA NA
## 1103 297 1458 317 284 1245 315 219 543 679 958 194 688 309 685 917 411
## 4 NA NA NA NA NA NA NA 2 NA NA NA 4 NA NA NA NA NA
## 6 NA 5 NA 4 NA NA 4 NA NA NA NA NA NA 4 NA NA NA
## 16 NA NA NA NA NA NA NA NA NA 5 NA NA 3 NA 3 NA NA
## 438 216 2101 705 440 160 2178 6 867 888 330 1047 463
## 4 NA 2 NA NA NA NA NA NA NA NA NA NA NA
## 6 NA NA NA NA NA NA NA NA NA NA 3 NA NA
## 16 NA NA NA NA NA NA NA NA NA NA NA NA NA
# Create sparse matrix and the corresponding Train and Test data sets
user_artist_trans_mat <- as(as.matrix(user_artist_trans),"realRatingMatrix")
user_artist_trans_mat
## 36 x 100 rating matrix of class 'realRatingMatrix' with 422 ratings.
set.seed(100)
eval_sch <- evaluationScheme(data = user_artist_trans_mat, method = "split", train = .8, given = 3, goodRating = 3)
#Exploring parameters of recommendation methods in recommenderlab
recommender_models <- recommenderRegistry$get_entries(dataType = "realRatingMatrix")
names(recommender_models)
## [1] "ALS_realRatingMatrix" "ALS_implicit_realRatingMatrix"
## [3] "IBCF_realRatingMatrix" "POPULAR_realRatingMatrix"
## [5] "RANDOM_realRatingMatrix" "RERECOMMEND_realRatingMatrix"
## [7] "SVD_realRatingMatrix" "SVDF_realRatingMatrix"
## [9] "UBCF_realRatingMatrix"
lapply(recommender_models, "[[", "description")
## $ALS_realRatingMatrix
## [1] "Recommender for explicit ratings based on latent factors, calculated by alternating least squares algorithm."
##
## $ALS_implicit_realRatingMatrix
## [1] "Recommender for implicit data based on latent factors, calculated by alternating least squares algorithm."
##
## $IBCF_realRatingMatrix
## [1] "Recommender based on item-based collaborative filtering."
##
## $POPULAR_realRatingMatrix
## [1] "Recommender based on item popularity."
##
## $RANDOM_realRatingMatrix
## [1] "Produce random recommendations (real ratings)."
##
## $RERECOMMEND_realRatingMatrix
## [1] "Re-recommends highly rated items (real ratings)."
##
## $SVD_realRatingMatrix
## [1] "Recommender based on SVD approximation with column-mean imputation."
##
## $SVDF_realRatingMatrix
## [1] "Recommender based on Funk SVD with gradient descend."
##
## $UBCF_realRatingMatrix
## [1] "Recommender based on user-based collaborative filtering."
# Create recommender model - User-based Collaborative Filtering (UCBF)
recommender_model_ubcf <- Recommender(data=getData(eval_sch,"train"),method="UBCF")
# Recommendation for the first 5 users
recom_ubcf <- predict(recommender_model_ubcf, getData(eval_sch,"known")[1:5], n=5, type = "topNList")
recom_ubcf_list <- as(recom_ubcf, "list") #convert recommenderlab object to readable list
recom_ubcf_list
## $`4`
## character(0)
##
## $`16`
## [1] "685" "1047" "88" "497" "66"
##
## $`19`
## [1] "330" "465" "317" "680" "332"
##
## $`747`
## [1] "497" "66" "287" "700" "226"
##
## $`793`
## [1] "290" "680" "332" "465" "543"
# Get artist names
recom_ubcf_result <- list()
for (i in c(1:5)){
recom_ubcf_result[[i]] <- artists[artists$id %in% as.vector(recom_ubcf_list[[i]]),2]
}
# Top 5 recommendations for the first 5 users
recom_ubcf_result
## [[1]]
## factor(0)
## 14035 Levels: -123 min. -OZ- -t de sangre !!! !deladap !DISTAIN ... ZZ Top
##
## [[2]]
## [1] Faithless Gorillaz
## [3] Papa Roach Nick Jonas & The Administration
## [5] McFly
## 14035 Levels: -123 min. -OZ- -t de sangre !!! !deladap !DISTAIN ... ZZ Top
##
## [[3]]
## [1] Keri Hilson T.I. Kelly Rowland Jessica Simpson
## [5] The Veronicas
## 14035 Levels: -123 min. -OZ- -t de sangre !!! !deladap !DISTAIN ... ZZ Top
##
## [[4]]
## [1] Faithless Queens of the Stone Age Monica
## [4] Papa Roach Boyce Avenue
## 14035 Levels: -123 min. -OZ- -t de sangre !!! !deladap !DISTAIN ... ZZ Top
##
## [[5]]
## [1] Jordin Sparks Kelly Rowland Jessica Simpson
## [4] Nicole Scherzinger The Veronicas
## 14035 Levels: -123 min. -OZ- -t de sangre !!! !deladap !DISTAIN ... ZZ Top
# Evaluation
# Prediction of ratings for all users
recom_ubcf <- predict(recommender_model_ubcf, getData(eval_sch,"known"), type = "ratings")
accuracy_ubcf <- calcPredictionAccuracy(x = recom_ubcf, data = getData(eval_sch, "unknown"), given=3, goodRating=3)
accuracy_ubcf
## RMSE MSE MAE
## 21.586661 465.983924 6.846728
# Create recommender model - Alternating least squares (ALS)
recommender_model_als <- Recommender(data=getData(eval_sch,"train"),method="ALS")
# Recommendation for the first 5 users
recom_als <- predict(recommender_model_als, getData(eval_sch,"known")[1:5], n=5, type = "topNList")
recom_als_list <- as(recom_als, "list") #convert recommenderlab object to readable list
recom_als_list
## $`4`
## [1] "497" "66" "287" "226" "343"
##
## $`16`
## [1] "497" "66" "287" "226" "343"
##
## $`19`
## [1] "497" "66" "287" "226" "343"
##
## $`747`
## [1] "290" "497" "66" "287" "226"
##
## $`793`
## [1] "497" "66" "287" "226" "343"
# Get artist names
recom_als_result <- list()
for (i in c(1:5)){
recom_als_result[[i]] <- artists[artists$id %in% as.vector(recom_als_list[[i]]),2]
}
# Top 5 recommendations for the first 5 users
recom_als_result
## [[1]]
## [1] Faithless Queens of the Stone Age Monica
## [4] Annett Louisan Papa Roach
## 14035 Levels: -123 min. -OZ- -t de sangre !!! !deladap !DISTAIN ... ZZ Top
##
## [[2]]
## [1] Faithless Queens of the Stone Age Monica
## [4] Annett Louisan Papa Roach
## 14035 Levels: -123 min. -OZ- -t de sangre !!! !deladap !DISTAIN ... ZZ Top
##
## [[3]]
## [1] Faithless Queens of the Stone Age Monica
## [4] Annett Louisan Papa Roach
## 14035 Levels: -123 min. -OZ- -t de sangre !!! !deladap !DISTAIN ... ZZ Top
##
## [[4]]
## [1] Faithless Queens of the Stone Age Monica
## [4] Jordin Sparks Papa Roach
## 14035 Levels: -123 min. -OZ- -t de sangre !!! !deladap !DISTAIN ... ZZ Top
##
## [[5]]
## [1] Faithless Queens of the Stone Age Monica
## [4] Annett Louisan Papa Roach
## 14035 Levels: -123 min. -OZ- -t de sangre !!! !deladap !DISTAIN ... ZZ Top
# Evaluation
# Prediction of ratings for all users
recom_als <- predict(recommender_model_als, getData(eval_sch,"known"), type = "ratings")
accuracy_als <- calcPredictionAccuracy(x = recom_als, data = getData(eval_sch, "unknown"), given=3, goodRating=3)
accuracy_als
## RMSE MSE MAE
## 16.205748 262.626279 4.973237
comp_methods <- rbind(accuracy_ubcf, accuracy_als)
rownames(comp_methods) <- c("UBCF", "ALS")
library(knitr)
kable(comp_methods)
RMSE | MSE | MAE | |
---|---|---|---|
UBCF | 21.58666 | 465.9839 | 6.846728 |
ALS | 16.20575 | 262.6263 | 4.973236 |
# ROC & Precision & Recall
rec_methods <- list(
"UBCF" = list(name = "UBCF", param = NULL),
"ALS" = list(name = "ALS", param = NULL))
eval_results <- evaluate(x = eval_sch, method = rec_methods, n = c(1, 5, 10, 15, 20,25,30))
## UBCF run fold/sample [model time/prediction time]
## 1 [0sec/0.03sec]
## ALS run fold/sample [model time/prediction time]
## 1 [0sec/0.58sec]
plot(eval_results,
annotate = TRUE, legend = "topleft", main = "ROC Curve")
plot(eval_results, "prec/rec",
annotate = TRUE, legend = "topright", main = "Precision-Recall")
In terms of model method accuracy, Alternate Least Square (ALS) provides better RMSE and MSE than the UBCF. Interestingly in the ROC curve, UBCF appears to perform better that ALS, meaning that in general the classification power of UBCF is better even though that in accuracy terms, ALS, shows less deviation from the actual rating values when it classifies correctly