In this project we will implment a content based and user to user colloborations. For user colloboration we will use the MovieLense and MovieLenseMeta dataset. MovieLense dataset contains 943 rows and 1664 columns MovieLenseMeta dataset contains 1664 rows and 22 columns
Table and histogram below displays ratings frequencies. We will exclude movies with 0 ratings.
data("MovieLense")
d = as.vector(MovieLense@data)
table_ratings <- table(d)
kable_styling (kable(table_ratings),bootstrap_options = c("striped", "hover", "condensed", "responsive"))
d | Freq |
---|---|
0 | 1469760 |
1 | 6059 |
2 | 11307 |
3 | 27002 |
4 | 33947 |
5 | 21077 |
d= d[d!= 0]
ggplot() + aes(d) +
geom_histogram(binwidth = 0.5) +
xlab("Rating") + ylab("Number of Ratings")
Below table displays some of the elements from moviemeta dataset
kable_styling (kable(head(MovieLenseMeta)),bootstrap_options = c("striped", "hover", "condensed", "responsive"))
title | year | url | unknown | Action | Adventure | Animation | Children’s | Comedy | Crime | Documentary | Drama | Fantasy | Film-Noir | Horror | Musical | Mystery | Romance | Sci-Fi | Thriller | War | Western |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Toy Story (1995) | 1995 | http://us.imdb.com/M/title-exact?Toy%20Story%20(1995) | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
GoldenEye (1995) | 1995 | http://us.imdb.com/M/title-exact?GoldenEye%20(1995) | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
Four Rooms (1995) | 1995 | http://us.imdb.com/M/title-exact?Four%20Rooms%20(1995) | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
Get Shorty (1995) | 1995 | http://us.imdb.com/M/title-exact?Get%20Shorty%20(1995) | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
Copycat (1995) | 1995 | http://us.imdb.com/M/title-exact?Copycat%20(1995) | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
Shanghai Triad (Yao a yao yao dao waipo qiao) (1995) | 1995 | http://us.imdb.com/Title?Yao+a+yao+yao+dao+waipo+qiao+(1995) | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
We will use 80% of the data for training and 20% for testing. Since the dataset is large and sparse, we will only look at the users who have rated more than 50 movies and movies with more than 100 reviews.
MovieLense <- MovieLense[rowCounts(MovieLense) > 50, colCounts(MovieLense) > 100]
set.seed(1)
testsample <- sample(x = c(TRUE, FALSE), size = nrow(MovieLense),
replace = TRUE, prob = c(0.8, 0.2))
train <- MovieLense[testsample, ]
test <- MovieLense[!testsample, ]
We will apply cosine similarity to identify 20 neighboring items with similar profile and use that for recommendations
ibcfModel <- Recommender(data = train, method = "IBCF", parameter = list(k = 20, method = "Cosine"))
ibcfModel
## Recommender of type 'IBCF' for 'realRatingMatrix'
## learned using 451 users.
ibcfModelPredicted <- predict(object = ibcfModel, newdata = test, n = 10)
ibcfModelPredicted
## Recommendations as 'topNList' with n = 10 for 109 users.
Below table displays some of the recommendatiosn for a user.
recommendations <- data.frame(user = sort(rep(1:length(ibcfModelPredicted@items), ibcfModelPredicted@n)),
rating = unlist(ibcfModelPredicted@ratings), index = unlist(ibcfModelPredicted@items))
recommendations$title <- ibcfModelPredicted@itemLabels[recommendations$index]
recommendations$year <- MovieLenseMeta$year[recommendations$index]
recommendations <- recommendations %>% group_by(user)
kable_styling (kable(head((recommendations[recommendations$user %in% (1:20), ]))),bootstrap_options = c("striped", "hover", "condensed", "responsive"))
user | rating | index | title | year |
---|---|---|---|---|
1 | 5.000000 | 5 | Babe (1995) | 1995 |
1 | 5.000000 | 45 | Remains of the Day, The (1993) | 1994 |
1 | 5.000000 | 300 | Being There (1979) | 1997 |
1 | 5.000000 | 311 | Dave (1993) | 1997 |
1 | 4.668660 | 33 | Ace Ventura: Pet Detective (1994) | 1995 |
1 | 4.662706 | 264 | African Queen, The (1951) | 1997 |
We will apply cosine similarity to identify 20 neighboring users with similar profile and use that for recommendations
ubcfModel <- Recommender(data = train, method = "UBCF", parameter = list(k = 20,
method = "Cosine"))
## Warning: Unknown parameters: k
## Available parameter (with default values):
## method = cosine
## nn = 25
## sample = FALSE
## normalize = center
## verbose = FALSE
ubcfModelPredicted <- predict(object = ubcfModel, newdata = test, n = 10)
Recommendations for a user are displayed below
recommendations <- data.frame(user = sort(rep(1:length(ubcfModelPredicted@items), ubcfModelPredicted@n)),
rating = unlist(ubcfModelPredicted@ratings), index = unlist(ubcfModelPredicted@items))
recommendations$title <- ubcfModelPredicted@itemLabels[recommendations$index]
recommendations$year <- MovieLenseMeta$year[recommendations$index]
recommendations <- recommendations %>% group_by(user) %>% top_n(20)
## Selecting by year
kable_styling (kable(head((recommendations[recommendations$user %in% (1:20), ]))),bootstrap_options = c("striped", "hover", "condensed", "responsive"))
user | rating | index | title | year |
---|---|---|---|---|
1 | 3.718585 | 193 | L.A. Confidential (1997) | 1983 |
1 | 3.713766 | 173 | Good Will Hunting (1997) | 1987 |
1 | 3.525676 | 199 | As Good As It Gets (1997) | 1957 |
1 | 3.498421 | 201 | Schindler’s List (1993) | 1987 |
1 | 3.498134 | 69 | Godfather, The (1972) | 1994 |
1 | 3.481718 | 318 | Amistad (1997) | 1996 |
build_Model_Evaluate <- function(data, modelname, method){
eval_sets <- evaluationScheme(data = data, given = 10)
model_to_evaluate <-modelname
model_parameters <- list(normalize = "Z-score",method=method)
eval_recommender <- Recommender(data = getData(eval_sets, "train"),
method = model_to_evaluate,
parameter = model_parameters)
eval_prediction <- predict(object = eval_recommender,
newdata = getData(eval_sets, "known"),
n = 10,
type = "ratings")
class(eval_prediction)
eval_accuracy <- calcPredictionAccuracy(
x = eval_prediction,
data = getData(eval_sets, "unknown"),
byUser = F)
return(eval_accuracy)
}
Below table displays RMSE, MSE and MAE for Pearson and cosine distance functions. In this particualar case Pearson functions is better than Cosine function.
Pearson_IBCF <- build_Model_Evaluate(MovieLense, 'IBCF','pearson')
Cosine_IBCF <- build_Model_Evaluate(MovieLense, 'IBCF','cosine')
kable_styling (kable( rbind(Pearson_IBCF, Cosine_IBCF)),bootstrap_options = c("striped", "hover", "condensed", "responsive"))
RMSE | MSE | MAE | |
---|---|---|---|
Pearson_IBCF | 1.313823 | 1.726131 | 0.9725531 |
Cosine_IBCF | 1.400038 | 1.960106 | 1.0461014 |
Below table displays RMSE, MSE and MAE for Pearson and cosine distance functions. In this case, we see that Pearson is better than cosine function.
mm <- MovieLenseMeta[-2]
mm <- mm[-2]
mm <- as(mm, 'realRatingMatrix')
mm <- binarize(mm, minRating = 1)
Pearson_UBCF <- build_Model_Evaluate(MovieLense, 'UBCF','pearson')
Cosine_UBCF <- build_Model_Evaluate(MovieLense, 'UBCF','cosine')
kable_styling (kable( rbind(Pearson_UBCF, Cosine_UBCF)),bootstrap_options = c("striped", "hover", "condensed", "responsive"))
RMSE | MSE | MAE | |
---|---|---|---|
Pearson_UBCF | 0.9684725 | 0.9379389 | 0.7550760 |
Cosine_UBCF | 1.0491982 | 1.1008168 | 0.8338011 |
We will compare different distance functions using cross validation with following parameters.
k=4 given = 15 goodRating = 3
Based on the ROC curve and precision-recal plot we see that pearson is better distance funtion than cosine.
set.seed(1)
ratings_movies <- MovieLense[rowCounts(MovieLense) > 50,
colCounts(MovieLense) > 100]
n_fold <- 4
items_to_keep <- 15
rating_threshold <- 3
eval_sets <- evaluationScheme(data = ratings_movies,
method = "cross-validation",
k = n_fold,
given = items_to_keep,
goodRating = rating_threshold)
set.seed(1)
models_to_evaluate <- list(
IBCF_cos = list(name = "IBCF", param = list(method = "cosine")),
IBCF_Pearson = list(name = "IBCF", param = list(method = "pearson"))
)
n_recommendations <- c(1, 5, seq(10, 100, 10))
results <- evaluate(x = eval_sets,
method = models_to_evaluate,
n = n_recommendations)
## IBCF run fold/sample [model time/prediction time]
## 1 [0.15sec/0.02sec]
## 2 [0.16sec/0.01sec]
## 3 [0.15sec/0.02sec]
## 4 [0.16sec/0.01sec]
## IBCF run fold/sample [model time/prediction time]
## 1 [0.18sec/0.03sec]
## 2 [0.2sec/0.02sec]
## 3 [0.22sec/0.02sec]
## 4 [0.22sec/0.02sec]
plot(results,annotate = 1,legend = "topleft")
title("ROC curve")
plot(results, "prec/rec", annotate = 1, legend = "bottomright")
title("Precision-recall")
Based on the ROC curve and precision-recal plot we see that pearson is better distance funtion than cosine. Above we had
set.seed(1)
models_to_evaluate <- list(
UBCF_cos = list(name = "UBCF", param = list(method = "cosine")),
UBCF_Pearson = list(name = "UBCF", param = list(method = "pearson"))
)
results <- evaluate(x = eval_sets,
method = models_to_evaluate,
n = n_recommendations)
## UBCF run fold/sample [model time/prediction time]
## 1 [0sec/0.14sec]
## 2 [0sec/0.16sec]
## 3 [0sec/0.16sec]
## 4 [0sec/0.14sec]
## UBCF run fold/sample [model time/prediction time]
## 1 [0sec/0.12sec]
## 2 [0.02sec/0.14sec]
## 3 [0sec/0.16sec]
## 4 [0sec/0.44sec]
plot(results,annotate = 1,legend = "topleft")
title("ROC curve")
plot(results, "prec/rec", annotate = 1, legend = "bottomright")
title("Precision-recall")