The goal of this assignment is for you to try out different ways of implementing and configuring a recommender, and to evaluate your different approaches. For assignment 2, start with an existing dataset of user-item ratings, such as our toy books dataset, MovieLens, Jester [http://eigentaste.berkeley.edu/dataset/] or another dataset of your choosing. Implement at least two of these recommendation algorithms:
# Loading required libraries
library(recommenderlab)
library(reshape2)
library(RCurl)
library(dplyr)
library(ggplot2)
library(knitr)
library(tidyverse)
#Read the Movie data
movie.url <- getURL("https://raw.githubusercontent.com/niteen11/MSDS/master/DATA643/Project%202/dataset/movies.csv")
movies.df <- read.csv(text = movie.url,header = TRUE, stringsAsFactors = FALSE)
#movies.df <- read.csv('C:\\NITEEN\\CUNY\\Spring 2018\\DATA 643\\ml-latest-small\\movies.csv',header = TRUE, stringsAsFactors = FALSE)
kable(head(movies.df))
movieId | title | genres |
---|---|---|
1 | Toy Story (1995) | Adventure|Animation|Children|Comedy|Fantasy |
2 | Jumanji (1995) | Adventure|Children|Fantasy |
3 | Grumpier Old Men (1995) | Comedy|Romance |
4 | Waiting to Exhale (1995) | Comedy|Drama|Romance |
5 | Father of the Bride Part II (1995) | Comedy |
6 | Heat (1995) | Action|Crime|Thriller |
## read Ratings data, remove timestamp column.
rating.url <- getURL("https://raw.githubusercontent.com/niteen11/MSDS/master/DATA643/Project%202/dataset/ratings.csv")
rating.df <- read.csv(text = rating.url,header = TRUE, stringsAsFactors = FALSE)
#rating.df <- read.csv('C:\\NITEEN\\CUNY\\Spring 2018\\DATA 643\\ml-latest-small\\ratings.csv',header = TRUE, stringsAsFactors = FALSE)
# rating.df <- rating.df %>%
# select(userId, movieId, rating)
kable(head(rating.df))
userId | movieId | rating | timestamp |
---|---|---|---|
1 | 31 | 2.5 | 1260759144 |
1 | 1029 | 3.0 | 1260759179 |
1 | 1061 | 3.0 | 1260759182 |
1 | 1129 | 2.0 | 1260759185 |
1 | 1172 | 4.0 | 1260759205 |
1 | 1263 | 2.0 | 1260759151 |
Reshaping the data and making it ready for data visualization and data modeling purpose
# Convert to matrix
movieMatrix <- rating.df %>%
select(-timestamp) %>%
spread(movieId, rating)
row.names(movieMatrix) <- movieMatrix[,1]
movieMatrix <- movieMatrix[-c(1)]
movieMatrix <- as(as.matrix(movieMatrix), "realRatingMatrix")
movieMatrix
## 671 x 9066 rating matrix of class 'realRatingMatrix' with 100004 ratings.
class(movieMatrix)
## [1] "realRatingMatrix"
## attr(,"package")
## [1] "recommenderlab"
methods(class = class(movieMatrix))
## [1] [ [<- binarize
## [4] calcPredictionAccuracy coerce colCounts
## [7] colMeans colSds colSums
## [10] denormalize dim dimnames
## [13] dimnames<- dissimilarity evaluationScheme
## [16] getData.frame getList getNormalize
## [19] getRatingMatrix getRatings getTopNLists
## [22] image normalize nratings
## [25] Recommender removeKnownRatings rowCounts
## [28] rowMeans rowSds rowSums
## [31] sample show similarity
## see '?methods' for accessing help and source code
similarity_users <- similarity(movieMatrix[1:5, ], method ="cosine", which = "userId")
(as.matrix(similarity_users))
## 1 2 3 4 5
## 1 0.0000000 NA NA 0.9085757 1.0000000
## 2 NA 0.0000000 0.9556143 0.9627467 0.9758399
## 3 NA 0.9556143 0.0000000 0.9752314 0.9651193
## 4 0.9085757 0.9627467 0.9752314 0.0000000 0.9694769
## 5 1.0000000 0.9758399 0.9651193 0.9694769 0.0000000
image(as.matrix(similarity_users), main = "User similarity")
recommender_models <- recommenderRegistry$get_entries(dataType ="realRatingMatrix")
names(recommender_models)
## [1] "ALS_realRatingMatrix" "ALS_implicit_realRatingMatrix"
## [3] "IBCF_realRatingMatrix" "POPULAR_realRatingMatrix"
## [5] "RANDOM_realRatingMatrix" "RERECOMMEND_realRatingMatrix"
## [7] "SVD_realRatingMatrix" "SVDF_realRatingMatrix"
## [9] "UBCF_realRatingMatrix"
recommender_models$IBCF_realRatingMatrix$parameters
## $k
## [1] 30
##
## $method
## [1] "Cosine"
##
## $normalize
## [1] "center"
##
## $normalize_sim_matrix
## [1] FALSE
##
## $alpha
## [1] 0.5
##
## $na_as_zero
## [1] FALSE
dim(movieMatrix)
## [1] 671 9066
slotNames(movieMatrix)
## [1] "data" "normalize"
class(movieMatrix@data)
## [1] "dgCMatrix"
## attr(,"package")
## [1] "Matrix"
dim(movieMatrix@data)
## [1] 671 9066
vector_ratings <- as.vector(movieMatrix@data)
unique(vector_ratings)
## [1] 0.0 3.0 4.0 5.0 2.0 3.5 1.0 2.5 4.5 1.5 0.5
## [1] 5 4 0 3 1 2
table_ratings <- table(vector_ratings)
(table_ratings)
## vector_ratings
## 0 0.5 1 1.5 2 2.5 3 3.5 4
## 5983282 1101 3326 1687 7271 4449 20064 10538 28750
## 4.5 5
## 7723 15095
Selecting the most relevant data
ratings_movies <- movieMatrix[rowCounts(movieMatrix) > 50,
colCounts(movieMatrix) > 50]
Exploring the most relevant data
min_movies <- quantile(rowCounts(ratings_movies), 0.95)
min_users <- quantile(colCounts(ratings_movies), 0.95)
image(ratings_movies[rowCounts(ratings_movies) > min_movies,
colCounts(ratings_movies) > min_users],
main = "Heatmap of the Top Users and Movies (Not Normalized)")
average_ratings_per_user <- rowMeans(ratings_movies)
ggplot() + aes(average_ratings_per_user) +
geom_histogram(binwidth = 0.1) +
ggtitle("Distribution of the average rating per user")+
xlab("Average Rating") + ylab("No of Ratings")
Normalizing the data
ratings_movies_norm <- normalize(ratings_movies)
avg <- round(rowMeans(ratings_movies_norm),5)
table(avg)
## avg
## 0
## 421
image(ratings_movies_norm[rowCounts(ratings_movies_norm) > min_movies,
colCounts(ratings_movies_norm) > min_users],
main = "Heatmap of the Top Users and Movies (Normalized)")
which_train <- sample(x = c(TRUE, FALSE), size = nrow(ratings_movies),replace = TRUE, prob = c(0.8, 0.2))
head(which_train,10)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE TRUE TRUE
recc_data_train <- ratings_movies[which_train, ]
recc_data_test <- ratings_movies[!which_train, ]
Item-based collaborative filtering
recommender_models <- recommenderRegistry$get_entries(dataType ="realRatingMatrix")
recommender_models$IBCF_realRatingMatrix$parameters
## $k
## [1] 30
##
## $method
## [1] "Cosine"
##
## $normalize
## [1] "center"
##
## $normalize_sim_matrix
## [1] FALSE
##
## $alpha
## [1] 0.5
##
## $na_as_zero
## [1] FALSE
Train
IBCF.recc_model <- Recommender(data = recc_data_train, method = "IBCF",parameter = list(k = 30))
IBCF.recc_model
## Recommender of type 'IBCF' for 'realRatingMatrix'
## learned using 340 users.
Predict
IBCF.recc_predicted <- predict(object = IBCF.recc_model, newdata = recc_data_test, n = 6)
IBCF.recc_predicted
## Recommendations as 'topNList' with n = 6 for 81 users.
Recommendation for the first user
IBCF.recc_predicted@items[[1]]
## [1] 10 30 32 33 41 63
User-based collaborative filtering
recommender_models <- recommenderRegistry$get_entries(dataType ="realRatingMatrix")
recommender_models$UBCF_realRatingMatrix$parameters
## $method
## [1] "cosine"
##
## $nn
## [1] 25
##
## $sample
## [1] FALSE
##
## $normalize
## [1] "center"
UBCF_recc_model <- Recommender(data = recc_data_train, method = "UBCF",parameter = list(k = 30))
## Warning: Unknown parameter: k
## Available parameter (with default values):
## method = cosine
## nn = 25
## sample = FALSE
## normalize = center
## verbose = FALSE
UBCF_recc_model
## Recommender of type 'UBCF' for 'realRatingMatrix'
## learned using 340 users.
Predict
UBCF.recc_predicted <- predict(object = UBCF_recc_model,newdata = recc_data_test, n = 6)
UBCF.recc_predicted
## Recommendations as 'topNList' with n = 6 for 81 users.
Recommendation for the first user
UBCF.recc_predicted@items[[1]]
## [1] 85 28 103 53 57 82
eval_sets <- evaluationScheme(ratings_movies, method = "split", train = .8, given = 4, goodRating=3)
algorithms <- list("user-based CF" = list(name="UBCF"), "item-based CF" = list(name="IBCF"))
results <- evaluate(eval_sets, algorithms, n=c(2, 5, 7, 9, 11, 13))
## UBCF run fold/sample [model time/prediction time]
## 1 [0.03sec/0.16sec]
## IBCF run fold/sample [model time/prediction time]
## 1 [0.57sec/0.19sec]
recommenderlab::plot(results, annotate = 1:4, legend="topleft")
UBCF appears to be better than IBCF.