This system recommends movies to users using different recommendation algorithms and evaluates those different approaches. The data source is Movielens dataset.
Load the required libraries
## load libraries ####
library(recommenderlab)
library(reshape2)
library(RCurl)
library(dplyr)
library(ggplot2)
Data Acquisition & Clean up
Load the required data - movies and ratings
#Read the Movie data
moviesurl <- getURL("https://raw.githubusercontent.com/srajeev1/MSDA-IS643/master/projects/project1/ml-latest-small/movies.csv")
moviesDF <- read.csv(text = moviesurl,header = TRUE, stringsAsFactors = FALSE)
knitr::kable(head(moviesDF))
| movieId | title | genres |
|---|---|---|
| 1 | Toy Story (1995) | Adventure|Animation|Children|Comedy|Fantasy |
| 2 | Jumanji (1995) | Adventure|Children|Fantasy |
| 3 | Grumpier Old Men (1995) | Comedy|Romance |
| 4 | Waiting to Exhale (1995) | Comedy|Drama|Romance |
| 5 | Father of the Bride Part II (1995) | Comedy |
| 6 | Heat (1995) | Action|Crime|Thriller |
## read Ratings data, remove timestamp column.
ratingsurl <- getURL("https://raw.githubusercontent.com/srajeev1/MSDA-IS643/master/projects/project1/ml-latest-small/ratings.csv")
ratingDF <- read.csv(text = ratingsurl,header = TRUE, stringsAsFactors = FALSE)
ratingDF <- ratingDF %>% select(userId, movieId, rating)
knitr::kable(head(ratingDF))
| userId | movieId | rating |
|---|---|---|
| 1 | 16 | 4.0 |
| 1 | 24 | 1.5 |
| 1 | 32 | 4.0 |
| 1 | 47 | 4.0 |
| 1 | 50 | 4.0 |
| 1 | 110 | 4.0 |
Reshape the data
ratingDF_horizontal <-acast(ratingDF, userId ~ movieId, value.var="rating")
#Lets find the top 3 movies by mean rating.
cmn <- colMeans(ratingDF_horizontal, na.rm = TRUE)
nd <- order(cmn, decreasing = TRUE)
moviesDF %>% filter(movieId %in% nd[1:3])
## movieId title genres
## 1 111 Taxi Driver (1976) Crime|Drama|Thriller
## 2 198 Strange Days (1995) Action|Crime|Drama|Mystery|Sci-Fi|Thriller
## 3 367 Mask, The (1994) Action|Comedy|Crime|Fantasy
Prepare ratings matrix
# Convert ratingDF_horizontal into realRatingMatrix data structure
# realRatingMatrix is a recommenderlab sparse-matrix like data-structure
(ratingDF.ratingMatrix <- as(as.matrix(ratingDF_horizontal), "realRatingMatrix"))
## 668 x 10325 rating matrix of class 'realRatingMatrix' with 105339 ratings.
#. Users who have rated at least 10 movies
#. Movies that have been watched at least 20 times
ratingDF.ratingMatrix <- ratingDF.ratingMatrix[rowCounts(ratingDF.ratingMatrix) > 10,colCounts(ratingDF.ratingMatrix) > 20]
#mean user ratings
hist(rowMeans(ratingDF.ratingMatrix), breaks=100, main="Histogram of Mean User Ratings", xlab="Ratings", col= "light blue")
#mean rating for eash show
hist(colMeans(ratingDF.ratingMatrix), breaks=100, main="Histogram of Mean Movie Ratings", xlab="Ratings", col= "light green")
min_movies <- quantile(rowCounts(ratingDF.ratingMatrix), 0.98)
min_users <- quantile(colCounts(ratingDF.ratingMatrix), 0.98)
# build the heatmap:
image(ratingDF.ratingMatrix[rowCounts(ratingDF.ratingMatrix) > min_movies,colCounts(ratingDF.ratingMatrix) > min_users], main = "Heatmap of the top users and movies")
ratingMatrix.normalized <- normalize(ratingDF.ratingMatrix)
# visualize the Normalized top matrix
min_movies <- quantile(rowCounts(ratingMatrix.normalized), 0.98)
min_users <- quantile(colCounts(ratingMatrix.normalized), 0.98)
# build the heatmap:
image(ratingMatrix.normalized[rowCounts(ratingMatrix.normalized) > min_movies,colCounts(ratingMatrix.normalized) > min_users], main = "Heatmap of the top users and movies")
## 75% of the sample size
smp_size <- floor(0.75 * nrow(ratingMatrix.normalized))
set.seed(123)
train_ind <- sample(seq_len(nrow(ratingMatrix.normalized)), size = smp_size)
train.RatingMat <- ratingMatrix.normalized[train_ind, ]
test.RatingMat <- ratingMatrix.normalized[-train_ind, ]
Lets build models using different recommender configurations, like UBCF (User Based Collab Filtering, Item Based Collab Filtering, with various similarity methods like Cosine, Jaccard)
createModel <-function (movieRatingMat,method, param)
{
model <- Recommender(movieRatingMat, method = method, param)
names(getModel(model))
getModel(model)$method
getModel(model)$nn
print(model)
return (model)
}
#UBCF, Cosine similarity
ubd.rec=createModel(train.RatingMat, method="UBCF", param=list(normalize = "Z-score",method="Cosine",nn=5, minRating=1))
## Available parameter (with default values):
## method = cosine
## nn = 25
## sample = FALSE
## normalize = center
## verbose = FALSE
## Recommender of type 'UBCF' for 'realRatingMatrix'
## learned using 501 users.
#UBCF, Jaccard similarity
ubd.rec.jac=createModel(train.RatingMat, method="UBCF", param=list(normalize = "Z-score",method="Jaccard",nn=5, minRating=1))
## Available parameter (with default values):
## method = cosine
## nn = 25
## sample = FALSE
## normalize = center
## verbose = FALSE
## Recommender of type 'UBCF' for 'realRatingMatrix'
## learned using 501 users.
#IBCF, Jaccard similarity
ibcf.rec.jac=createModel(train.RatingMat, method="IBCF", param=list(normalize = "Z-score",method="Jaccard",nn=5, minRating=1))
## Available parameter (with default values):
## k = 30
## method = Cosine
## normalize = center
## normalize_sim_matrix = FALSE
## alpha = 0.5
## na_as_zero = FALSE
## verbose = FALSE
## Recommender of type 'IBCF' for 'realRatingMatrix'
## learned using 501 users.
Generate recommendation(s) leveraging the above models:
Recommend top 3 movies for a given user.
recommendations <- function(movieRatingMat, model, userID, n)
{
### predict top n recommendations for given user
topN_recommendList <-predict(model,movieRatingMat[userID],n=n)
topN_recommendList@items[[1]]
return(topN_recommendList)
}
#Let us get the top 3 recommendations for user 1, Using UBCF - Cosine Similarity
userID <- 1
topN <- 3
predict_list <-recommendations(train.RatingMat, ubd.rec, userID, topN)
predict_list@items[[1]]
## [1] 550 78 746
subset(moviesDF, movieId %in% predict_list@items[[1]])
## movieId title genres
## 72 78 Crossing Guard, The (1995) Action|Crime|Drama|Thriller
## 494 550 Threesome (1994) Comedy|Romance
## 628 746 Force of Evil (1948) Film-Noir
#Let us get the top 3 recommendations for user 1, Using UBCF - Jaccard Similarity
predict_list <-recommendations(train.RatingMat, ubd.rec.jac, userID, topN)
predict_list@items[[1]]
## [1] 78 746 1218
subset(moviesDF, movieId %in% predict_list@items[[1]])
## movieId title
## 72 78 Crossing Guard, The (1995)
## 628 746 Force of Evil (1948)
## 980 1218 Killer, The (Die xue shuang xiong) (1989)
## genres
## 72 Action|Crime|Drama|Thriller
## 628 Film-Noir
## 980 Action|Crime|Drama|Thriller
#Let us get the top 3 recommendations for user 1 using IBCF Model
predict_list <-recommendations(train.RatingMat, ibcf.rec.jac, userID, topN)
predict_list@items[[1]]
## [1] 401 899 350
subset(moviesDF, movieId %in% predict_list@items[[1]])
## movieId title genres
## 311 350 Client, The (1994) Drama|Mystery|Thriller
## 721 899 Singin' in the Rain (1952) Comedy|Musical|Romance
Let’s evaluate different recommender algorithms and see what performs best for our ratings matrix:
scheme <- evaluationScheme(ratingDF.ratingMatrix, method = "split", train = .9, given = 4, goodRating = 4)
algorithms <- list(
"random items" = list(name="RANDOM", param=list(normalize = "Z-score")),
"popular items" = list(name="POPULAR", param=list(normalize = "Z-score")),
"user-based CF" = list(name="UBCF", param=list(normalize = "Z-score",
method="Cosine",
nn=10, minRating=3)),
"item-based CF" = list(name="IBCF", param=list(normalize = "Z-score")))
# run algorithms, predict next n movies
results1 <- evaluate(scheme, algorithms, n=c(1, 3, 5, 10, 15, 20))
## RANDOM run fold/sample [model time/prediction time]
## 1 [0sec/0.17sec]
## POPULAR run fold/sample [model time/prediction time]
## 1 [0.03sec/0.19sec]
## UBCF run fold/sample [model time/prediction time]
## 1
## Warning: Unknown parameters: minRating
## Available parameter (with default values):
## method = cosine
## nn = 25
## sample = FALSE
## normalize = center
## verbose = FALSE
## [0.04sec/7.68sec]
## IBCF run fold/sample [model time/prediction time]
## 1 [79.28sec/0.07sec]
#Draw ROC curve [true positive rate (Sensitivity) is plotted in function of the false positive rate (100-Specificity) for different cut-off points]
recommenderlab::plot(results1, annotate = 1:4, legend="topleft")
# See precision / recall
recommenderlab::plot(results1, "prec/rec", annotate=3)
From the above evaluation it seems like the UBCF did better than the IBCF. RANDOM items approach is the worst here, but surprisingly the POPULAR items did the best here!