The goal of this assignment is to give you prcatice working with accuracy and other recommender system metrics.
For this Project, I am using the dataset from UCI repository. http://eigentaste.berkeley.edu/dataset/
I am using the “Jester” Joke Recommender System dataset which has more than 6 million ratings of jokes by different users.
#import required library, recommenderlab also contains the actual jokes in JesterJokes
library(recommenderlab)
## Loading required package: Matrix
## Loading required package: arules
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
## Loading required package: proxy
##
## Attaching package: 'proxy'
## The following object is masked from 'package:Matrix':
##
## as.matrix
## The following objects are masked from 'package:stats':
##
## as.dist, dist
## The following object is masked from 'package:base':
##
## as.matrix
## Loading required package: registry
## Registered S3 methods overwritten by 'registry':
## method from
## print.registry_field proxy
## print.registry_entry proxy
library(ggplot2)
#using Jester5k(Sample) for ratings of package recommenderlab (5000 users and 100 jokes with ratings between -10.00 to 10.00)
data(Jester5k)
Jester <- Jester5k
# number of ratings -- 362106
nratings(Jester)
## [1] 362106
# number of ratings per user
summary(rowCounts(Jester))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 36.00 53.00 72.00 72.42 100.00 100.00
# Histogram of rating distribution
hist(getRatings(Jester), main="Distribution of ratings",xlab="Rating",ylab="Frequency")
# Histogram rating distribution: normalized
hist(getRatings(normalize(Jester)),
main="Distribution of ratings",
xlab="Rating",
ylab="Frequency")
# 'best' joke with highest average rating
best <- which.max(colMeans(Jester))
cat(JesterJokes[best])
## A guy goes into confession and says to the priest, "Father, I'm 80 years old, widower, with 11 grandchildren. Last night I met two beautiful flight attendants. They took me home and I made love to both of them. Twice." The priest said: "Well, my son, when was the last time you were in confession?" "Never Father, I'm Jewish." "So then, why are you telling me?" "I'm telling everybody."
# 'worst' joke with lowest average rating
worst <- which.min(colMeans(Jester))
cat(JesterJokes[worst])
## How many teddybears does it take to change a lightbulb? It takes only one teddybear, but it takes a whole lot of lightbulbs.
#calculate jokes and their ratings
ratings_per_joke<-colCounts(Jester)
ratings_per_joke
## j1 j2 j3 j4 j5 j6 j7 j8 j9 j10 j11 j12 j13 j14 j15
## 3314 3648 3338 3142 4998 4073 4999 5000 3173 4057 4353 4478 5000 4494 5000
## j16 j17 j18 j19 j20 j21 j22 j23 j24 j25 j26 j27 j28 j29 j30
## 4999 5000 5000 5000 5000 4983 4282 4025 3194 4172 4764 4981 4781 4992 3616
## j31 j32 j33 j34 j35 j36 j37 j38 j39 j40 j41 j42 j43 j44 j45
## 4937 4993 3366 4334 4994 4998 3379 4575 4611 4442 3764 4910 3515 3248 4276
## j46 j47 j48 j49 j50 j51 j52 j53 j54 j55 j56 j57 j58 j59 j60
## 4722 4463 4964 4995 4999 3803 4017 4997 4933 3972 4954 3223 3135 3651 3597
## j61 j62 j63 j64 j65 j66 j67 j68 j69 j70 j71 j72 j73 j74 j75
## 4964 4992 4047 3474 4951 4989 3532 4989 4987 4066 1706 1740 1699 1726 1751
## j76 j77 j78 j79 j80 j81 j82 j83 j84 j85 j86 j87 j88 j89 j90
## 1745 1758 1750 1799 1760 1819 1784 1835 1854 1864 1860 1872 1917 1901 1946
## j91 j92 j93 j94 j95 j96 j97 j98 j99 j100
## 1931 1958 2002 2026 2047 2076 2088 2131 2179 1968
#display the calculated jokes and ratings in a data.frame
jokes<-data.frame(joke=names(ratings_per_joke),ratings=ratings_per_joke)
#display the jokes in descending order
jokes<-jokes[order(jokes$ratings,decreasing = TRUE),]
#display jokes and their ratings from higest ratings to the lowest one, display only first 10 highest ratings
jokes[1:10, ]
## joke ratings
## j8 j8 5000
## j13 j13 5000
## j15 j15 5000
## j17 j17 5000
## j18 j18 5000
## j19 j19 5000
## j20 j20 5000
## j7 j7 4999
## j16 j16 4999
## j50 j50 4999
matrix_to_vector_ratings <- as.vector(Jester@data)
#calculate jokes and their ratings
ratings_per_joke<-colCounts(Jester)
head(ratings_per_joke)
## j1 j2 j3 j4 j5 j6
## 3314 3648 3338 3142 4998 4073
#display the calculated jokes and ratings in a data.frame
jokes<-data.frame(joke=names(ratings_per_joke),ratings=ratings_per_joke)
#display the jokes in descending order
jokes<-jokes[order(jokes$ratings,decreasing = TRUE),]
#display jokes and their ratings from higest ratings to the lowest one
jokes[1:10, ]
## joke ratings
## j8 j8 5000
## j13 j13 5000
## j15 j15 5000
## j17 j17 5000
## j18 j18 5000
## j19 j19 5000
## j20 j20 5000
## j7 j7 4999
## j16 j16 4999
## j50 j50 4999
# calculating jokes ratings higher than 100 and raters who have rated more than 50 jokes
jokes_with_higher_ratings_only <- Jester5k[rowCounts(Jester) > 50,
colCounts(Jester) > 100]
jokes_with_higher_ratings_only
## 3875 x 100 rating matrix of class 'realRatingMatrix' with 314302 ratings.
split_data_train <- sample(x=c(TRUE,FALSE), size=nrow(jokes_with_higher_ratings_only),
replace=TRUE, prob = c(0.80,0.20))
train <- jokes_with_higher_ratings_only[split_data_train, ]
test <- jokes_with_higher_ratings_only[split_data_train, ]
split_data_set <- sample(x = 1:4, size=nrow(jokes_with_higher_ratings_only),
replace = TRUE)
for(i in 1:4) {
split_data_train <- split_data_set == i
train <- jokes_with_higher_ratings_only[split_data_train, ]
test <- jokes_with_higher_ratings_only[!split_data_train, ]
}
#create the IBCF model
model1 <- Recommender(data = train, method = "IBCF",
parameter = list(k=30))
model1
## Recommender of type 'IBCF' for 'realRatingMatrix'
## learned using 993 users.
predicted <- predict(object=model1, newdata=test, n=4)
predicted
## Recommendations as 'topNList' with n = 4 for 2882 users.
#create the IBCF model
model2 <- Recommender(data = train, method = "UBCF",
parameter = list(k=30))
## Warning: Unknown parameter: k
## Available parameter (with default values):
## method = cosine
## nn = 25
## sample = FALSE
## normalize = center
## verbose = FALSE
model2
## Recommender of type 'UBCF' for 'realRatingMatrix'
## learned using 993 users.
predicted <- predict(object=model2, newdata=test, n=4)
predicted
## Recommendations as 'topNList' with n = 4 for 2882 users.
evaluation_sets <- evaluationScheme(data = jokes_with_higher_ratings_only,
method = "cross-validation", k = 4,
given = 15, goodRating = 3)
evaluation_recommender <- Recommender(data = getData(evaluation_sets, "train"),
method = "IBCF", parameter = NULL)
evaluation_predicted <- predict(object = evaluation_recommender,
newdata = getData(evaluation_sets, "known"), n=4,
type = "ratings")
evaluation_accuracy_IBCF <- calcPredictionAccuracy(x = evaluation_predicted,
data = getData(evaluation_sets, "unknown"),
byUser = FALSE)
evaluation_accuracy_IBCF
## RMSE MSE MAE
## 5.218667 27.234486 4.067909
evaluation_sets <- evaluationScheme(data = jokes_with_higher_ratings_only,
method = "cross-validation", k = 4,
given = 15, goodRating = 3)
evaluation_recommender <- Recommender(data = getData(evaluation_sets, "train"),
method = "UBCF", parameter = NULL)
evaluation_predicted <- predict(object = evaluation_recommender,
newdata = getData(evaluation_sets, "known"), n=4,
type = "ratings")
evaluation_accuracy_UBCF <- calcPredictionAccuracy(x = evaluation_predicted,
data = getData(evaluation_sets, "unknown"),
byUser = FALSE)
evaluation_accuracy_UBCF
## RMSE MSE MAE
## 4.532507 20.543623 3.555387
Here, IBCF model has better score for RMSE, MSE, and MAE than UBCF. That does not mean one model if better than other. They both have their own benefits. We have to always compare models and come to a conclusion. Every model can show you different kinds of information which can benefit you.