Project 2

In this project we will implment a content based and user to user colloborations. For user colloboration we will use the MovieLense and MovieLenseMeta dataset. MovieLense dataset contains 943 rows and 1664 columns MovieLenseMeta dataset contains 1664 rows and 22 columns

Table and histogram below displays ratings frequencies. We will exclude movies with 0 ratings.

data("MovieLense")
d = as.vector(MovieLense@data)
table_ratings <- table(d)
kable_styling (kable(table_ratings),bootstrap_options = c("striped", "hover", "condensed", "responsive"))

d	Freq
0	1469760
1	6059
2	11307
3	27002
4	33947
5	21077

d= d[d!= 0]
ggplot() + aes(d) + 
  geom_histogram(binwidth = 0.5) +
  xlab("Rating") + ylab("Number of Ratings")

Below table displays some of the elements from moviemeta dataset

kable_styling (kable(head(MovieLenseMeta)),bootstrap_options = c("striped", "hover", "condensed", "responsive"))

title	year	url	Action	Adventure	Animation	Children’s	Comedy	Crime	Drama	Thriller
Toy Story (1995)	1995	http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)	0	0	1	1	1	0	0	0
GoldenEye (1995)	1995	http://us.imdb.com/M/title-exact?GoldenEye%20(1995)	1	1	0	0	0	0	0	1
Four Rooms (1995)	1995	http://us.imdb.com/M/title-exact?Four%20Rooms%20(1995)	0	0	0	0	0	0	0	1
Get Shorty (1995)	1995	http://us.imdb.com/M/title-exact?Get%20Shorty%20(1995)	1	0	0	0	1	0	1	0
Copycat (1995)	1995	http://us.imdb.com/M/title-exact?Copycat%20(1995)	0	0	0	0	0	1	1	1
Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)	1995	http://us.imdb.com/Title?Yao+a+yao+yao+dao+waipo+qiao+(1995)	0	0	0	0	0	0	1	0

Train/Test Sets

We will use 80% of the data for training and 20% for testing. Since the dataset is large and sparse, we will only look at the users who have rated more than 50 movies and movies with more than 100 reviews.

 MovieLense <- MovieLense[rowCounts(MovieLense) > 50, colCounts(MovieLense) > 100]

set.seed(1)
testsample <- sample(x = c(TRUE, FALSE), size = nrow(MovieLense),
replace = TRUE, prob = c(0.8, 0.2))
train <- MovieLense[testsample, ]
test <- MovieLense[!testsample, ]

Item-Based Collaborative Filtering

We will apply cosine similarity to identify 20 neighboring items with similar profile and use that for recommendations

ibcfModel <- Recommender(data = train, method = "IBCF", parameter = list(k = 20, method = "Cosine"))
ibcfModel

## Recommender of type 'IBCF' for 'realRatingMatrix' 
## learned using 451 users.

ibcfModelPredicted <- predict(object = ibcfModel, newdata = test, n = 10)
ibcfModelPredicted

## Recommendations as 'topNList' with n = 10 for 109 users.

Below table displays some of the recommendatiosn for a user.

recommendations <- data.frame(user = sort(rep(1:length(ibcfModelPredicted@items), ibcfModelPredicted@n)), 
    rating = unlist(ibcfModelPredicted@ratings), index = unlist(ibcfModelPredicted@items))

recommendations$title <- ibcfModelPredicted@itemLabels[recommendations$index]
recommendations$year <- MovieLenseMeta$year[recommendations$index]
recommendations <- recommendations %>% group_by(user) 

kable_styling (kable(head((recommendations[recommendations$user %in% (1:20), ]))),bootstrap_options = c("striped", "hover", "condensed", "responsive"))

user	rating	index	title	year
1	5.000000	5	Babe (1995)	1995
1	5.000000	45	Remains of the Day, The (1993)	1994
1	5.000000	300	Being There (1979)	1997
1	5.000000	311	Dave (1993)	1997
1	4.668660	33	Ace Ventura: Pet Detective (1994)	1995
1	4.662706	264	African Queen, The (1951)	1997

Model : User-Based Collaborative Filtering

We will apply cosine similarity to identify 20 neighboring users with similar profile and use that for recommendations

ubcfModel <- Recommender(data = train, method = "UBCF", parameter = list(k = 20, 
    method = "Cosine"))

## Warning: Unknown parameters: k

## Available parameter (with default values):
## method    =  cosine
## nn    =  25
## sample    =  FALSE
## normalize     =  center
## verbose   =  FALSE

ubcfModelPredicted <- predict(object = ubcfModel, newdata = test, n = 10)

Recommendations for a user are displayed below

recommendations <- data.frame(user = sort(rep(1:length(ubcfModelPredicted@items), ubcfModelPredicted@n)), 
    rating = unlist(ubcfModelPredicted@ratings), index = unlist(ubcfModelPredicted@items))

recommendations$title <- ubcfModelPredicted@itemLabels[recommendations$index]
recommendations$year <- MovieLenseMeta$year[recommendations$index]
recommendations <- recommendations %>% group_by(user) %>% top_n(20)

## Selecting by year

kable_styling (kable(head((recommendations[recommendations$user %in% (1:20), ]))),bootstrap_options = c("striped", "hover", "condensed", "responsive"))

user	rating	index	title	year
1	3.718585	193	L.A. Confidential (1997)	1983
1	3.713766	173	Good Will Hunting (1997)	1987
1	3.525676	199	As Good As It Gets (1997)	1957
1	3.498421	201	Schindler’s List (1993)	1987
1	3.498134	69	Godfather, The (1972)	1994
1	3.481718	318	Amistad (1997)	1996

Evaluating different distance functions

build_Model_Evaluate <- function(data, modelname, method){
eval_sets <- evaluationScheme(data = data, given = 10)

model_to_evaluate <-modelname
model_parameters <- list(normalize = "Z-score",method=method)
eval_recommender <- Recommender(data = getData(eval_sets, "train"),
                                method = model_to_evaluate,
                                parameter = model_parameters)



eval_prediction <- predict(object = eval_recommender,
                           newdata = getData(eval_sets, "known"),
                           n = 10,
                           type = "ratings")
class(eval_prediction)


eval_accuracy <- calcPredictionAccuracy(
  x = eval_prediction,
  data = getData(eval_sets, "unknown"),
  byUser = F)
 
return(eval_accuracy)
}

Below table displays RMSE, MSE and MAE for Pearson and cosine distance functions. In this particualar case Pearson functions is better than Cosine function.

Pearson_IBCF <- build_Model_Evaluate(MovieLense, 'IBCF','pearson')
Cosine_IBCF <- build_Model_Evaluate(MovieLense, 'IBCF','cosine')

kable_styling (kable( rbind(Pearson_IBCF, Cosine_IBCF)),bootstrap_options = c("striped", "hover", "condensed", "responsive"))

	RMSE	MSE	MAE
Pearson_IBCF	1.313823	1.726131	0.9725531
Cosine_IBCF	1.400038	1.960106	1.0461014

Below table displays RMSE, MSE and MAE for Pearson and cosine distance functions. In this case, we see that Pearson is better than cosine function.

mm <- MovieLenseMeta[-2]
mm <- mm[-2]
mm <- as(mm, 'realRatingMatrix')
mm <- binarize(mm, minRating = 1)
Pearson_UBCF <- build_Model_Evaluate(MovieLense, 'UBCF','pearson')
Cosine_UBCF <- build_Model_Evaluate(MovieLense, 'UBCF','cosine')

kable_styling (kable( rbind(Pearson_UBCF, Cosine_UBCF)),bootstrap_options = c("striped", "hover", "condensed", "responsive"))

	RMSE	MSE	MAE
Pearson_UBCF	0.9684725	0.9379389	0.7550760
Cosine_UBCF	1.0491982	1.1008168	0.8338011

Comparing Models

We will compare different distance functions using cross validation with following parameters.

k=4 given = 15 goodRating = 3

Item based

Based on the ROC curve and precision-recal plot we see that pearson is better distance funtion than cosine.

set.seed(1)
ratings_movies <- MovieLense[rowCounts(MovieLense) > 50,
                             colCounts(MovieLense) > 100]

n_fold <- 4
items_to_keep <- 15
rating_threshold <- 3
eval_sets <- evaluationScheme(data = ratings_movies,
                              method = "cross-validation",
                              k = n_fold,
                              given = items_to_keep,
                              goodRating = rating_threshold)

set.seed(1)
models_to_evaluate <- list(
  IBCF_cos = list(name = "IBCF", param = list(method = "cosine")),
  IBCF_Pearson = list(name = "IBCF", param = list(method = "pearson"))

  
)

n_recommendations <- c(1, 5, seq(10, 100, 10))

results <- evaluate(x = eval_sets,
                         method = models_to_evaluate,
                         n = n_recommendations)

## IBCF run fold/sample [model time/prediction time]
##   1  [0.15sec/0.02sec] 
##   2  [0.16sec/0.01sec] 
##   3  [0.15sec/0.02sec] 
##   4  [0.16sec/0.01sec] 
## IBCF run fold/sample [model time/prediction time]
##   1  [0.18sec/0.03sec] 
##   2  [0.2sec/0.02sec] 
##   3  [0.22sec/0.02sec] 
##   4  [0.22sec/0.02sec]

plot(results,annotate = 1,legend = "topleft")

title("ROC curve")

plot(results, "prec/rec", annotate = 1, legend = "bottomright")
title("Precision-recall")

User Based

Based on the ROC curve and precision-recal plot we see that pearson is better distance funtion than cosine. Above we had

set.seed(1)
models_to_evaluate <- list(
  UBCF_cos = list(name = "UBCF", param = list(method = "cosine")),
  UBCF_Pearson = list(name = "UBCF", param = list(method = "pearson"))

  
)
results <- evaluate(x = eval_sets,
                         method = models_to_evaluate,
                         n = n_recommendations)

## UBCF run fold/sample [model time/prediction time]
##   1  [0sec/0.14sec] 
##   2  [0sec/0.16sec] 
##   3  [0sec/0.16sec] 
##   4  [0sec/0.14sec] 
## UBCF run fold/sample [model time/prediction time]
##   1  [0sec/0.12sec] 
##   2  [0.02sec/0.14sec] 
##   3  [0sec/0.16sec] 
##   4  [0sec/0.44sec]

plot(results,annotate = 1,legend = "topleft")

title("ROC curve")

plot(results, "prec/rec", annotate = 1, legend = "bottomright")
title("Precision-recall")