library(recommenderlab)
library(reshape2)
library(RCurl)
library(dplyr)
library(ggplot2)
library(knitr)
library(tidyverse)
# Load the movie catalogue and preview its first rows as a table.
movies.df <- read.csv(
  file = "movies.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
kable(x = head(movies.df))
| movieId | title | genres |
|---|---|---|
| 1 | Toy Story (1995) | Adventure\|Animation\|Children\|Comedy\|Fantasy |
| 2 | Jumanji (1995) | Adventure\|Children\|Fantasy |
| 3 | Grumpier Old Men (1995) | Comedy\|Romance |
| 4 | Waiting to Exhale (1995) | Comedy\|Drama\|Romance |
| 5 | Father of the Bride Part II (1995) | Comedy |
| 6 | Heat (1995) | Action\|Crime\|Thriller |
# Load the ratings table and preview its first rows.
rating.df <- read.csv(
  file = "ratings.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
# Disabled alternative: narrow the table to the three rating columns right
# after loading.
# rating.df <- rating.df %>%
# select(userId, movieId, rating)
kable(x = head(rating.df))
| userId | movieId | rating | timestamp |
|---|---|---|---|
| 1 | 1 | 4 | 964982703 |
| 1 | 3 | 4 | 964981247 |
| 1 | 6 | 4 | 964982224 |
| 1 | 47 | 5 | 964983815 |
| 1 | 50 | 5 | 964982931 |
| 1 | 70 | 3 | 964982400 |
# Build the user x movie rating matrix:
#   1. drop the timestamp and spread movieId into columns (one row per user),
#   2. move the first column (userId) into the row names,
#   3. coerce the result to recommenderlab's "realRatingMatrix" class.
movieMatrix <- spread(
  select(rating.df, -timestamp),
  key = movieId,
  value = rating
)
rownames(movieMatrix) <- movieMatrix[, 1]
movieMatrix <- movieMatrix[-1]
movieMatrix <- as(as.matrix(movieMatrix), "realRatingMatrix")
movieMatrix
## 610 x 9724 rating matrix of class 'realRatingMatrix' with 100836 ratings.
# Confirm the class of the coerced rating matrix.
class(movieMatrix)
## [1] "realRatingMatrix"
## attr(,"package")
## [1] "recommenderlab"
# List the methods available for that class.
methods(class = class(movieMatrix))
## [1] [ [<- binarize
## [4] calcPredictionAccuracy coerce colCounts
## [7] colMeans colSds colSums
## [10] denormalize dim dimnames
## [13] dimnames<- dissimilarity evaluationScheme
## [16] getData.frame getList getNormalize
## [19] getRatingMatrix getRatings getTopNLists
## [22] image normalize nratings
## [25] Recommender removeKnownRatings rowCounts
## [28] rowMeans rowSds rowSums
## [31] sample show similarity
## see '?methods' for accessing help and source code
# Cosine similarity between the first five users (rows of the rating matrix).
# `which` selects between "users" and "items"; the former value "userId" is
# not one of the documented choices.
similarity_users <- similarity(
  movieMatrix[1:5, ],
  method = "cosine",
  which = "users"
)
as.matrix(similarity_users)
## 1 2 3 4 5
## 1 0.0000000 1 0.7919033 0.9328096 0.9707699
## 2 1.0000000 0 NA 1.0000000 1.0000000
## 3 0.7919033 NA 0.0000000 1.0000000 1.0000000
## 4 0.9328096 1 1.0000000 0.0000000 0.9011374
## 5 0.9707699 1 1.0000000 0.9011374 0.0000000
# Draw the user-to-user similarity matrix as a heatmap.
image(as.matrix(similarity_users), main = "User similarity")

# Fetch every recommender registered for "realRatingMatrix" data and list
# the entry names.
recommender_models <- recommenderRegistry$get_entries(
  dataType = "realRatingMatrix"
)
names(recommender_models)
## [1] "ALS_realRatingMatrix" "ALS_implicit_realRatingMatrix"
## [3] "IBCF_realRatingMatrix" "LIBMF_realRatingMatrix"
## [5] "POPULAR_realRatingMatrix" "RANDOM_realRatingMatrix"
## [7] "RERECOMMEND_realRatingMatrix" "SVD_realRatingMatrix"
## [9] "SVDF_realRatingMatrix" "UBCF_realRatingMatrix"
# Show the parameters registered for the item-based CF (IBCF) entry.
recommender_models$IBCF_realRatingMatrix$parameters
## $k
## [1] 30
##
## $method
## [1] "Cosine"
##
## $normalize
## [1] "center"
##
## $normalize_sim_matrix
## [1] FALSE
##
## $alpha
## [1] 0.5
##
## $na_as_zero
## [1] FALSE
# Dimensions of the rating matrix.
dim(movieMatrix)
## [1] 610 9724
# Slots of the rating-matrix object.
slotNames(movieMatrix)
## [1] "data" "normalize"
# Class of the storage held in the `data` slot.
class(movieMatrix@data)
## [1] "dgCMatrix"
## attr(,"package")
## [1] "Matrix"
# Dimensions of that underlying storage matrix.
dim(movieMatrix@data)
## [1] 610 9724
# Flatten the rating storage into one plain vector and list its distinct
# values. The 0 entries are user/movie cells without a rating, not zero-star
# ratings: their count equals the 610 x 9724 cells minus the 100836 ratings.
vector_ratings <- as.vector(movieMatrix@data)
unique(vector_ratings)
## [1] 4.0 0.0 4.5 2.5 3.5 3.0 5.0 0.5 2.0 1.5 1.0
# Frequency of each value in the flattened vector.
table_ratings <- table(vector_ratings)
table_ratings
## vector_ratings
## 0 0.5 1 1.5 2 2.5 3 3.5 4 4.5
## 5830804 1370 2811 1791 7551 5550 20047 13136 26818 8551
## 5
## 13211
# Keep only rows (users) and columns (movies) holding more than 50 ratings.
ratings_movies <- movieMatrix[
  rowCounts(movieMatrix) > 50,
  colCounts(movieMatrix) > 50
]

# 95th-percentile rating counts per row and per column, used as cut-offs
# for the heatmaps of the most active users and most rated movies.
min_movies <- quantile(rowCounts(ratings_movies), probs = 0.95)
min_users <- quantile(colCounts(ratings_movies), probs = 0.95)

# Heatmap restricted to the users and movies above those cut-offs.
image(
  ratings_movies[
    rowCounts(ratings_movies) > min_movies,
    colCounts(ratings_movies) > min_users
  ],
  main = "Heatmap of the Top Users and Movies (Not Normalized)"
)
# Histogram of each user's mean rating. Every observation is one user, so
# the y axis counts users (the former "No of Ratings" label was misleading).
average_ratings_per_user <- rowMeans(ratings_movies)
ggplot() +
  aes(average_ratings_per_user) +
  geom_histogram(binwidth = 0.1) +
  ggtitle("Distribution of the average rating per user") +
  xlab("Average Rating") +
  ylab("Number of Users")
# Normalise the ratings, then tabulate each user's mean normalised rating
# rounded to 5 decimals as a sanity check.
ratings_movies_norm <- normalize(ratings_movies)
avg <- round(rowMeans(ratings_movies_norm), digits = 5)
table(avg)
## avg
## 0
## 378
# Same heatmap as for the raw ratings, drawn from the normalised matrix and
# using the same user/movie cut-offs.
image(
  ratings_movies_norm[
    rowCounts(ratings_movies_norm) > min_movies,
    colCounts(ratings_movies_norm) > min_users
  ],
  main = "Heatmap of the Top Users and Movies (Normalized)"
)
# Randomly flag each user as training (TRUE, probability 0.8) or test
# (FALSE, probability 0.2). The seed is fixed first so that the split, and
# every model and prediction derived from it, is reproducible across runs.
set.seed(123)
which_train <- sample(
  x = c(TRUE, FALSE),
  size = nrow(ratings_movies),
  replace = TRUE,
  prob = c(0.8, 0.2)
)
head(which_train, 10)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE TRUE
# Split the filtered rating matrix row-wise into test and training users.
recc_data_test <- ratings_movies[!which_train, ]
recc_data_train <- ratings_movies[which_train, ]

# Look up the registered recommenders again and show the IBCF parameters.
recommender_models <- recommenderRegistry$get_entries(
  dataType = "realRatingMatrix"
)
recommender_models$IBCF_realRatingMatrix$parameters
## $k
## [1] 30
##
## $method
## [1] "Cosine"
##
## $normalize
## [1] "center"
##
## $normalize_sim_matrix
## [1] FALSE
##
## $alpha
## [1] 0.5
##
## $na_as_zero
## [1] FALSE
Train the model
# Fit the item-based collaborative filtering (IBCF) recommender on the
# training users with k = 30, then print its summary.
IBCF.recc_model <- Recommender(
  data = recc_data_train,
  method = "IBCF",
  parameter = list(k = 30)
)
IBCF.recc_model
## Recommender of type 'IBCF' for 'realRatingMatrix'
## learned using 302 users.
Predict
# Ask the IBCF model for 6 recommendations per test user and print the
# resulting summary.
IBCF.recc_predicted <- predict(
  object = IBCF.recc_model,
  newdata = recc_data_test,
  n = 6
)
IBCF.recc_predicted
## Recommendations as 'topNList' with n = 6 for 76 users.
Recommendations for the first user
# First element of the `items` slot, i.e. the entries for the first test user
# (presumably item column indices rather than movieIds — TODO confirm).
IBCF.recc_predicted@items[[1]]
## [1] 368 396 394 325 213 11
# Look up the registered recommenders once more and show the parameters of
# the user-based CF (UBCF) entry.
recommender_models <- recommenderRegistry$get_entries(
  dataType = "realRatingMatrix"
)
recommender_models$UBCF_realRatingMatrix$parameters
## $method
## [1] "cosine"
##
## $nn
## [1] 25
##
## $sample
## [1] FALSE
##
## $normalize
## [1] "center"
# Fit the user-based collaborative filtering (UBCF) recommender on the
# training users. UBCF has no `k` parameter: passing it only raised an
# "Unknown parameter: k" warning. Its available parameters are method, nn,
# sample, normalize and verbose, so the intended neighbourhood size of 30 is
# requested through `nn`.
UBCF_recc_model <- Recommender(
  data = recc_data_train,
  method = "UBCF",
  parameter = list(nn = 30)
)
UBCF_recc_model
## Recommender of type 'UBCF' for 'realRatingMatrix'
## learned using 302 users.
# Ask the UBCF model for 6 recommendations per test user and print the
# resulting summary.
UBCF.recc_predicted <- predict(
  object = UBCF_recc_model,
  newdata = recc_data_test,
  n = 6
)
UBCF.recc_predicted
## Recommendations as 'topNList' with n = 6 for 76 users.
Recommendations for the first user
# First element of the `items` slot, i.e. the entries for the first test user
# (presumably item column indices rather than movieIds — TODO confirm).
UBCF.recc_predicted@items[[1]]
## [1] 40 97 64 78 96 24
# Evaluation setup: a single "split" scheme with train = 0.8, given = 4 and
# a goodRating threshold of 3.
eval_sets <- evaluationScheme(
  ratings_movies,
  method = "split",
  train = 0.8,
  given = 4,
  goodRating = 3
)

# Algorithms to compare, keyed by the labels used in the results.
algorithms <- list(
  "user-based CF" = list(name = "UBCF"),
  "item-based CF" = list(name = "IBCF")
)

# Evaluate both algorithms for each of the listed values of n.
results <- evaluate(eval_sets, algorithms, n = c(2, 5, 7, 9, 11, 13))
## UBCF run fold/sample [model time/prediction time]
## 1 [0sec/0.08sec]
## IBCF run fold/sample [model time/prediction time]
## 1 [0.55sec/0.01sec]
# Plot the evaluation results for both algorithms, with the legend placed in
# the top-left corner.
recommenderlab::plot(results, annotate = 1:4, legend="topleft")
Based on the evaluation plot above, user-based CF (UBCF) appears to perform better than item-based CF (IBCF).