In this project we will implment a content based and user to user colloborations. For user colloboration we will use the MovieLense and MovieLenseMeta dataset. MovieLense dataset contains 943 rows and 1664 columns MovieLenseMeta dataset contains 1664 rows and 22 columns

Table and histogram below displays ratings frequencies. We will exclude movies with 0 ratings.

data("MovieLense")
d = as.vector(MovieLense@data)
table_ratings <- table(d)
kable_styling (kable(table_ratings),bootstrap_options = c("striped", "hover", "condensed", "responsive"))
d Freq
0 1469760
1 6059
2 11307
3 27002
4 33947
5 21077
d= d[d!= 0]
ggplot() + aes(d) + 
  geom_histogram(binwidth = 0.5) +
  xlab("Rating") + ylab("Number of Ratings")

Below table displays some of the elements from moviemeta dataset

kable_styling (kable(head(MovieLenseMeta)),bootstrap_options = c("striped", "hover", "condensed", "responsive"))
title year url unknown Action Adventure Animation Children’s Comedy Crime Documentary Drama Fantasy Film-Noir Horror Musical Mystery Romance Sci-Fi Thriller War Western
Toy Story (1995) 1995 http://us.imdb.com/M/title-exact?Toy%20Story%20(1995) 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0
GoldenEye (1995) 1995 http://us.imdb.com/M/title-exact?GoldenEye%20(1995) 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
Four Rooms (1995) 1995 http://us.imdb.com/M/title-exact?Four%20Rooms%20(1995) 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
Get Shorty (1995) 1995 http://us.imdb.com/M/title-exact?Get%20Shorty%20(1995) 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0
Copycat (1995) 1995 http://us.imdb.com/M/title-exact?Copycat%20(1995) 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 1 0 0
Shanghai Triad (Yao a yao yao dao waipo qiao) (1995) 1995 http://us.imdb.com/Title?Yao+a+yao+yao+dao+waipo+qiao+(1995) 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0

Train/Test Sets

We will use 80% of the data for training and 20% for testing. Since the dataset is large and sparse, we will only look at the users who have rated more than 50 movies and movies with more than 100 reviews.

 MovieLense <- MovieLense[rowCounts(MovieLense) > 50, colCounts(MovieLense) > 100]

set.seed(1)
testsample <- sample(x = c(TRUE, FALSE), size = nrow(MovieLense),
replace = TRUE, prob = c(0.8, 0.2))
train <- MovieLense[testsample, ]
test <- MovieLense[!testsample, ]

Item-Based Collaborative Filtering

We will apply cosine similarity to identify 20 neighboring items with similar profile and use that for recommendations

ibcfModel <- Recommender(data = train, method = "IBCF", parameter = list(k = 20, method = "Cosine"))
ibcfModel
## Recommender of type 'IBCF' for 'realRatingMatrix' 
## learned using 451 users.
ibcfModelPredicted <- predict(object = ibcfModel, newdata = test, n = 10)
ibcfModelPredicted
## Recommendations as 'topNList' with n = 10 for 109 users.

Below table displays some of the recommendatiosn for a user.

recommendations <- data.frame(user = sort(rep(1:length(ibcfModelPredicted@items), ibcfModelPredicted@n)), 
    rating = unlist(ibcfModelPredicted@ratings), index = unlist(ibcfModelPredicted@items))

recommendations$title <- ibcfModelPredicted@itemLabels[recommendations$index]
recommendations$year <- MovieLenseMeta$year[recommendations$index]
recommendations <- recommendations %>% group_by(user) 

kable_styling (kable(head((recommendations[recommendations$user %in% (1:20), ]))),bootstrap_options = c("striped", "hover", "condensed", "responsive"))
user rating index title year
1 5.000000 5 Babe (1995) 1995
1 5.000000 45 Remains of the Day, The (1993) 1994
1 5.000000 300 Being There (1979) 1997
1 5.000000 311 Dave (1993) 1997
1 4.668660 33 Ace Ventura: Pet Detective (1994) 1995
1 4.662706 264 African Queen, The (1951) 1997

Model : User-Based Collaborative Filtering

We will apply cosine similarity to identify 20 neighboring users with similar profile and use that for recommendations

ubcfModel <- Recommender(data = train, method = "UBCF", parameter = list(k = 20, 
    method = "Cosine"))
## Warning: Unknown parameters: k
## Available parameter (with default values):
## method    =  cosine
## nn    =  25
## sample    =  FALSE
## normalize     =  center
## verbose   =  FALSE
ubcfModelPredicted <- predict(object = ubcfModel, newdata = test, n = 10)

Recommendations for a user are displayed below

recommendations <- data.frame(user = sort(rep(1:length(ubcfModelPredicted@items), ubcfModelPredicted@n)), 
    rating = unlist(ubcfModelPredicted@ratings), index = unlist(ubcfModelPredicted@items))

recommendations$title <- ubcfModelPredicted@itemLabels[recommendations$index]
recommendations$year <- MovieLenseMeta$year[recommendations$index]
recommendations <- recommendations %>% group_by(user) %>% top_n(20)
## Selecting by year
kable_styling (kable(head((recommendations[recommendations$user %in% (1:20), ]))),bootstrap_options = c("striped", "hover", "condensed", "responsive"))
user rating index title year
1 3.718585 193 L.A. Confidential (1997) 1983
1 3.713766 173 Good Will Hunting (1997) 1987
1 3.525676 199 As Good As It Gets (1997) 1957
1 3.498421 201 Schindler’s List (1993) 1987
1 3.498134 69 Godfather, The (1972) 1994
1 3.481718 318 Amistad (1997) 1996

Evaluating different distance functions

build_Model_Evaluate <- function(data, modelname, method){
eval_sets <- evaluationScheme(data = data, given = 10)

model_to_evaluate <-modelname
model_parameters <- list(normalize = "Z-score",method=method)
eval_recommender <- Recommender(data = getData(eval_sets, "train"),
                                method = model_to_evaluate,
                                parameter = model_parameters)



eval_prediction <- predict(object = eval_recommender,
                           newdata = getData(eval_sets, "known"),
                           n = 10,
                           type = "ratings")
class(eval_prediction)


eval_accuracy <- calcPredictionAccuracy(
  x = eval_prediction,
  data = getData(eval_sets, "unknown"),
  byUser = F)
 
return(eval_accuracy)
}

Below table displays RMSE, MSE and MAE for Pearson and cosine distance functions. In this particualar case Pearson functions is better than Cosine function.

Pearson_IBCF <- build_Model_Evaluate(MovieLense, 'IBCF','pearson')
Cosine_IBCF <- build_Model_Evaluate(MovieLense, 'IBCF','cosine')

kable_styling (kable( rbind(Pearson_IBCF, Cosine_IBCF)),bootstrap_options = c("striped", "hover", "condensed", "responsive"))
RMSE MSE MAE
Pearson_IBCF 1.313823 1.726131 0.9725531
Cosine_IBCF 1.400038 1.960106 1.0461014

Below table displays RMSE, MSE and MAE for Pearson and cosine distance functions. In this case, we see that Pearson is better than cosine function.

mm <- MovieLenseMeta[-2]
mm <- mm[-2]
mm <- as(mm, 'realRatingMatrix')
mm <- binarize(mm, minRating = 1)
Pearson_UBCF <- build_Model_Evaluate(MovieLense, 'UBCF','pearson')
Cosine_UBCF <- build_Model_Evaluate(MovieLense, 'UBCF','cosine')

kable_styling (kable( rbind(Pearson_UBCF, Cosine_UBCF)),bootstrap_options = c("striped", "hover", "condensed", "responsive"))
RMSE MSE MAE
Pearson_UBCF 0.9684725 0.9379389 0.7550760
Cosine_UBCF 1.0491982 1.1008168 0.8338011

Comparing Models

We will compare different distance functions using cross validation with following parameters.

k=4 given = 15 goodRating = 3

Item based

Based on the ROC curve and precision-recal plot we see that pearson is better distance funtion than cosine.

set.seed(1)
ratings_movies <- MovieLense[rowCounts(MovieLense) > 50,
                             colCounts(MovieLense) > 100]

n_fold <- 4
items_to_keep <- 15
rating_threshold <- 3
eval_sets <- evaluationScheme(data = ratings_movies,
                              method = "cross-validation",
                              k = n_fold,
                              given = items_to_keep,
                              goodRating = rating_threshold)
set.seed(1)
models_to_evaluate <- list(
  IBCF_cos = list(name = "IBCF", param = list(method = "cosine")),
  IBCF_Pearson = list(name = "IBCF", param = list(method = "pearson"))

  
)

n_recommendations <- c(1, 5, seq(10, 100, 10))

results <- evaluate(x = eval_sets,
                         method = models_to_evaluate,
                         n = n_recommendations)
## IBCF run fold/sample [model time/prediction time]
##   1  [0.15sec/0.02sec] 
##   2  [0.16sec/0.01sec] 
##   3  [0.15sec/0.02sec] 
##   4  [0.16sec/0.01sec] 
## IBCF run fold/sample [model time/prediction time]
##   1  [0.18sec/0.03sec] 
##   2  [0.2sec/0.02sec] 
##   3  [0.22sec/0.02sec] 
##   4  [0.22sec/0.02sec]
plot(results,annotate = 1,legend = "topleft")

title("ROC curve")

plot(results, "prec/rec", annotate = 1, legend = "bottomright")
title("Precision-recall")

User Based

Based on the ROC curve and precision-recal plot we see that pearson is better distance funtion than cosine. Above we had

set.seed(1)
models_to_evaluate <- list(
  UBCF_cos = list(name = "UBCF", param = list(method = "cosine")),
  UBCF_Pearson = list(name = "UBCF", param = list(method = "pearson"))

  
)
results <- evaluate(x = eval_sets,
                         method = models_to_evaluate,
                         n = n_recommendations)
## UBCF run fold/sample [model time/prediction time]
##   1  [0sec/0.14sec] 
##   2  [0sec/0.16sec] 
##   3  [0sec/0.16sec] 
##   4  [0sec/0.14sec] 
## UBCF run fold/sample [model time/prediction time]
##   1  [0sec/0.12sec] 
##   2  [0.02sec/0.14sec] 
##   3  [0sec/0.16sec] 
##   4  [0sec/0.44sec]
plot(results,annotate = 1,legend = "topleft")

title("ROC curve")

plot(results, "prec/rec", annotate = 1, legend = "bottomright")
title("Precision-recall")