ACCURACY AND BEYOND

The goal of this assignment is to give you prcatice working with accuracy and other recommender system metrics.

For this Project, I am using the dataset from UCI repository. http://eigentaste.berkeley.edu/dataset/

I am using the “Jester” Joke Recommender System dataset which has more than 6 million ratings of jokes by different users.

#import required library, recommenderlab also contains the actual jokes in JesterJokes
library(recommenderlab)
## Loading required package: Matrix
## Loading required package: arules
## 
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
## 
##     abbreviate, write
## Loading required package: proxy
## 
## Attaching package: 'proxy'
## The following object is masked from 'package:Matrix':
## 
##     as.matrix
## The following objects are masked from 'package:stats':
## 
##     as.dist, dist
## The following object is masked from 'package:base':
## 
##     as.matrix
## Loading required package: registry
## Registered S3 methods overwritten by 'registry':
##   method               from 
##   print.registry_field proxy
##   print.registry_entry proxy
library(ggplot2)

#using Jester5k(Sample) for ratings of package recommenderlab (5000 users and 100 jokes with ratings between -10.00 to 10.00)
data(Jester5k)
Jester <- Jester5k

# number of ratings -- 362106
nratings(Jester)
## [1] 362106
# number of ratings per user
summary(rowCounts(Jester))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   36.00   53.00   72.00   72.42  100.00  100.00

present the ratings into histogram

# Histogram of rating distribution
hist(getRatings(Jester), main="Distribution of ratings",xlab="Rating",ylab="Frequency")

# Histogram rating distribution: normalized
hist(getRatings(normalize(Jester)),
     main="Distribution of ratings",
     xlab="Rating", 
     ylab="Frequency")

# 'best' joke with highest average rating
best <- which.max(colMeans(Jester))
cat(JesterJokes[best])
## A guy goes into confession and says to the priest, "Father, I'm 80 years old, widower, with 11 grandchildren. Last night I met two beautiful flight attendants. They took me home and I made love to both of them. Twice." The priest said: "Well, my son, when was the last time you were in confession?" "Never Father, I'm Jewish." "So then, why are you telling me?" "I'm telling everybody."
# 'worst' joke with lowest average rating
worst <- which.min(colMeans(Jester))
cat(JesterJokes[worst])
## How many teddybears does it take to change a lightbulb? It takes only one teddybear, but it takes a whole lot of lightbulbs.

calculate the jokes that have been rated

#calculate jokes and their ratings
ratings_per_joke<-colCounts(Jester)
ratings_per_joke
##   j1   j2   j3   j4   j5   j6   j7   j8   j9  j10  j11  j12  j13  j14  j15 
## 3314 3648 3338 3142 4998 4073 4999 5000 3173 4057 4353 4478 5000 4494 5000 
##  j16  j17  j18  j19  j20  j21  j22  j23  j24  j25  j26  j27  j28  j29  j30 
## 4999 5000 5000 5000 5000 4983 4282 4025 3194 4172 4764 4981 4781 4992 3616 
##  j31  j32  j33  j34  j35  j36  j37  j38  j39  j40  j41  j42  j43  j44  j45 
## 4937 4993 3366 4334 4994 4998 3379 4575 4611 4442 3764 4910 3515 3248 4276 
##  j46  j47  j48  j49  j50  j51  j52  j53  j54  j55  j56  j57  j58  j59  j60 
## 4722 4463 4964 4995 4999 3803 4017 4997 4933 3972 4954 3223 3135 3651 3597 
##  j61  j62  j63  j64  j65  j66  j67  j68  j69  j70  j71  j72  j73  j74  j75 
## 4964 4992 4047 3474 4951 4989 3532 4989 4987 4066 1706 1740 1699 1726 1751 
##  j76  j77  j78  j79  j80  j81  j82  j83  j84  j85  j86  j87  j88  j89  j90 
## 1745 1758 1750 1799 1760 1819 1784 1835 1854 1864 1860 1872 1917 1901 1946 
##  j91  j92  j93  j94  j95  j96  j97  j98  j99 j100 
## 1931 1958 2002 2026 2047 2076 2088 2131 2179 1968
#display the calculated jokes and ratings in a data.frame
jokes<-data.frame(joke=names(ratings_per_joke),ratings=ratings_per_joke)

#display the jokes in descending order
jokes<-jokes[order(jokes$ratings,decreasing = TRUE),]

#display jokes and their ratings from higest ratings to the lowest one, display only first 10 highest ratings
jokes[1:10, ]
##     joke ratings
## j8    j8    5000
## j13  j13    5000
## j15  j15    5000
## j17  j17    5000
## j18  j18    5000
## j19  j19    5000
## j20  j20    5000
## j7    j7    4999
## j16  j16    4999
## j50  j50    4999

convert matrix into vector

matrix_to_vector_ratings <- as.vector(Jester@data)

calculate the jokes that have been rated

#calculate jokes and their ratings
ratings_per_joke<-colCounts(Jester)
head(ratings_per_joke)
##   j1   j2   j3   j4   j5   j6 
## 3314 3648 3338 3142 4998 4073
#display the calculated jokes and ratings in a data.frame
jokes<-data.frame(joke=names(ratings_per_joke),ratings=ratings_per_joke)

#display the jokes in descending order
jokes<-jokes[order(jokes$ratings,decreasing = TRUE),]

#display jokes and their ratings from higest ratings to the lowest one
jokes[1:10, ]
##     joke ratings
## j8    j8    5000
## j13  j13    5000
## j15  j15    5000
## j17  j17    5000
## j18  j18    5000
## j19  j19    5000
## j20  j20    5000
## j7    j7    4999
## j16  j16    4999
## j50  j50    4999

calculate jokes with only higher ratings

# calculating jokes ratings higher than 100 and raters who have rated more than 50 jokes
jokes_with_higher_ratings_only <- Jester5k[rowCounts(Jester) > 50,
                                           colCounts(Jester) > 100]
jokes_with_higher_ratings_only
## 3875 x 100 rating matrix of class 'realRatingMatrix' with 314302 ratings.

split data in train and test dataset

split_data_train <- sample(x=c(TRUE,FALSE), size=nrow(jokes_with_higher_ratings_only),
                     replace=TRUE, prob = c(0.80,0.20))

train <- jokes_with_higher_ratings_only[split_data_train, ]
test <- jokes_with_higher_ratings_only[split_data_train, ]

split the dataset into different groups

split_data_set <- sample(x = 1:4, size=nrow(jokes_with_higher_ratings_only),
                    replace = TRUE)
for(i in 1:4) {
  split_data_train <- split_data_set == i
  train <- jokes_with_higher_ratings_only[split_data_train, ]
  test <- jokes_with_higher_ratings_only[!split_data_train, ]
}

create Item Based Collaborative Filtering recommender model

#create the IBCF model
model1 <- Recommender(data = train, method = "IBCF",
                     parameter = list(k=30))
model1
## Recommender of type 'IBCF' for 'realRatingMatrix' 
## learned using 993 users.

use IBCF model on test dataset – IBFC model

predicted <- predict(object=model1, newdata=test, n=4)
predicted
## Recommendations as 'topNList' with n = 4 for 2882 users.

create User Based Collaborative Filtering model

#create the IBCF model
model2 <- Recommender(data = train, method = "UBCF",
                     parameter = list(k=30))
## Warning: Unknown parameter: k
## Available parameter (with default values):
## method    =  cosine
## nn    =  25
## sample    =  FALSE
## normalize     =  center
## verbose   =  FALSE
model2
## Recommender of type 'UBCF' for 'realRatingMatrix' 
## learned using 993 users.

use UBCF model on test dataset – UBCF model

predicted <- predict(object=model2, newdata=test, n=4)
predicted
## Recommendations as 'topNList' with n = 4 for 2882 users.

let’s evaluate IBCF model

evaluation_sets <- evaluationScheme(data = jokes_with_higher_ratings_only, 
                              method = "cross-validation", k = 4,
                              given = 15, goodRating = 3)

evaluation_recommender <- Recommender(data = getData(evaluation_sets, "train"),
                                method = "IBCF", parameter = NULL)

evaluation_predicted <- predict(object = evaluation_recommender, 
                          newdata = getData(evaluation_sets, "known"), n=4,
                          type = "ratings")

evaluation_accuracy_IBCF <- calcPredictionAccuracy(x = evaluation_predicted,
                                        data = getData(evaluation_sets, "unknown"),
                                        byUser = FALSE)
evaluation_accuracy_IBCF
##      RMSE       MSE       MAE 
##  5.218667 27.234486  4.067909

now let’s evalue UBCF model

evaluation_sets <- evaluationScheme(data = jokes_with_higher_ratings_only, 
                              method = "cross-validation", k = 4,
                              given = 15, goodRating = 3)

evaluation_recommender <- Recommender(data = getData(evaluation_sets, "train"),
                                method = "UBCF", parameter = NULL)

evaluation_predicted <- predict(object = evaluation_recommender, 
                          newdata = getData(evaluation_sets, "known"), n=4,
                          type = "ratings")

evaluation_accuracy_UBCF <- calcPredictionAccuracy(x = evaluation_predicted,
                                        data = getData(evaluation_sets, "unknown"),
                                        byUser = FALSE)
evaluation_accuracy_UBCF
##      RMSE       MSE       MAE 
##  4.532507 20.543623  3.555387

Summary

Here, IBCF model has better score for RMSE, MSE, and MAE than UBCF. That does not mean one model if better than other. They both have their own benefits. We have to always compare models and come to a conclusion. Every model can show you different kinds of information which can benefit you.