DATA 643 - Project 2

Build a basic recommender system with multiple recommender configurations

Description

This system recommends movies to users using different recommendation algorithms and evaluates those different approaches. The data source is Movielens dataset.

Load the data & Analyse

Load the required libraries

## load libraries ####
library(recommenderlab)
library(reshape2)
library(RCurl)
library(dplyr)
library(ggplot2)

Data Acquisition & Clean up

Load the required data - movies and ratings

#Read the Movie data
moviesurl <- getURL("https://raw.githubusercontent.com/srajeev1/MSDA-IS643/master/projects/project1/ml-latest-small/movies.csv")
moviesDF <- read.csv(text = moviesurl,header = TRUE, stringsAsFactors = FALSE)
knitr::kable(head(moviesDF))

movieId	title	genres
1	Toy Story (1995)	Adventure\|Animation\|Children\|Comedy\|Fantasy
2	Jumanji (1995)	Adventure\|Children\|Fantasy
3	Grumpier Old Men (1995)	Comedy\|Romance
4	Waiting to Exhale (1995)	Comedy\|Drama\|Romance
5	Father of the Bride Part II (1995)	Comedy
6	Heat (1995)	Action\|Crime\|Thriller

## read Ratings data, remove timestamp column.
ratingsurl <- getURL("https://raw.githubusercontent.com/srajeev1/MSDA-IS643/master/projects/project1/ml-latest-small/ratings.csv")
ratingDF <- read.csv(text = ratingsurl,header = TRUE, stringsAsFactors = FALSE)
ratingDF <- ratingDF %>% select(userId, movieId, rating)
knitr::kable(head(ratingDF))

userId	movieId	rating
1	16	4.0
1	24	1.5
1	32	4.0
1	47	4.0
1	50	4.0
1	110	4.0

Reshape the data

ratingDF_horizontal <-acast(ratingDF, userId ~ movieId, value.var="rating")

#Lets find the top 3 movies by mean rating.
cmn <- colMeans(ratingDF_horizontal, na.rm = TRUE)
nd <- order(cmn, decreasing = TRUE)
moviesDF %>% filter(movieId %in% nd[1:3])

##   movieId               title                                     genres
## 1     111  Taxi Driver (1976)                       Crime|Drama|Thriller
## 2     198 Strange Days (1995) Action|Crime|Drama|Mystery|Sci-Fi|Thriller
## 3     367    Mask, The (1994)                Action|Comedy|Crime|Fantasy

Prepare ratings matrix

# Convert ratingDF_horizontal into realRatingMatrix data structure
# realRatingMatrix is a recommenderlab sparse-matrix like data-structure

(ratingDF.ratingMatrix <- as(as.matrix(ratingDF_horizontal), "realRatingMatrix"))

## 668 x 10325 rating matrix of class 'realRatingMatrix' with 105339 ratings.

#. Users who have rated at least 10 movies
#. Movies that have been watched at least 20 times
ratingDF.ratingMatrix <- ratingDF.ratingMatrix[rowCounts(ratingDF.ratingMatrix) > 10,colCounts(ratingDF.ratingMatrix) > 20]

Visualize and Normalize

#mean user ratings
hist(rowMeans(ratingDF.ratingMatrix), breaks=100, main="Histogram of Mean User Ratings", xlab="Ratings", col= "light blue")

#mean rating for eash show
hist(colMeans(ratingDF.ratingMatrix), breaks=100, main="Histogram of Mean Movie Ratings", xlab="Ratings", col= "light green")

min_movies <- quantile(rowCounts(ratingDF.ratingMatrix), 0.98)
min_users <- quantile(colCounts(ratingDF.ratingMatrix), 0.98)

# build the heatmap:
image(ratingDF.ratingMatrix[rowCounts(ratingDF.ratingMatrix) > min_movies,colCounts(ratingDF.ratingMatrix) > min_users], main = "Heatmap of the top users and movies")

ratingMatrix.normalized <- normalize(ratingDF.ratingMatrix)

# visualize the Normalized top matrix
min_movies <- quantile(rowCounts(ratingMatrix.normalized), 0.98)
min_users <- quantile(colCounts(ratingMatrix.normalized), 0.98)

# build the heatmap:
image(ratingMatrix.normalized[rowCounts(ratingMatrix.normalized) > min_movies,colCounts(ratingMatrix.normalized) > min_users], main = "Heatmap of the top users and movies")

Split the data frame into training and test

## 75% of the sample size
smp_size <- floor(0.75 * nrow(ratingMatrix.normalized))
set.seed(123)
train_ind <- sample(seq_len(nrow(ratingMatrix.normalized)), size = smp_size)

train.RatingMat <- ratingMatrix.normalized[train_ind, ]
test.RatingMat <- ratingMatrix.normalized[-train_ind, ]

Model Building

Lets build models using different recommender configurations, like UBCF (User Based Collab Filtering, Item Based Collab Filtering, with various similarity methods like Cosine, Jaccard)

createModel <-function (movieRatingMat,method, param)
{
  
  model <- Recommender(movieRatingMat, method = method, param)
  names(getModel(model))
  getModel(model)$method
  getModel(model)$nn
  print(model)  
  
  return (model)
}


#UBCF, Cosine similarity
ubd.rec=createModel(train.RatingMat, method="UBCF", param=list(normalize = "Z-score",method="Cosine",nn=5, minRating=1))

## Available parameter (with default values):
## method    =  cosine
## nn    =  25
## sample    =  FALSE
## normalize     =  center
## verbose   =  FALSE
## Recommender of type 'UBCF' for 'realRatingMatrix' 
## learned using 501 users.

#UBCF, Jaccard similarity
ubd.rec.jac=createModel(train.RatingMat, method="UBCF", param=list(normalize = "Z-score",method="Jaccard",nn=5, minRating=1))

## Available parameter (with default values):
## method    =  cosine
## nn    =  25
## sample    =  FALSE
## normalize     =  center
## verbose   =  FALSE
## Recommender of type 'UBCF' for 'realRatingMatrix' 
## learned using 501 users.

#IBCF, Jaccard similarity
ibcf.rec.jac=createModel(train.RatingMat, method="IBCF", param=list(normalize = "Z-score",method="Jaccard",nn=5, minRating=1))

## Available parameter (with default values):
## k     =  30
## method    =  Cosine
## normalize     =  center
## normalize_sim_matrix  =  FALSE
## alpha     =  0.5
## na_as_zero    =  FALSE
## verbose   =  FALSE
## Recommender of type 'IBCF' for 'realRatingMatrix' 
## learned using 501 users.

Recommendations

Generate recommendation(s) leveraging the above models:

Recommend top 3 movies for a given user.

recommendations <- function(movieRatingMat, model, userID, n)
{
  
  ### predict top n recommendations for given user
  topN_recommendList <-predict(model,movieRatingMat[userID],n=n) 
  topN_recommendList@items[[1]]
  return(topN_recommendList)
  
}


#Let us get the top 3 recommendations for user 1, Using UBCF - Cosine Similarity
userID <- 1
topN <- 3
predict_list <-recommendations(train.RatingMat, ubd.rec, userID, topN)
predict_list@items[[1]]

## [1] 550  78 746

subset(moviesDF, movieId %in% predict_list@items[[1]])

##     movieId                      title                      genres
## 72       78 Crossing Guard, The (1995) Action|Crime|Drama|Thriller
## 494     550           Threesome (1994)              Comedy|Romance
## 628     746       Force of Evil (1948)                   Film-Noir

#Let us get the top 3 recommendations for user 1, Using UBCF - Jaccard Similarity
predict_list <-recommendations(train.RatingMat, ubd.rec.jac, userID, topN)
predict_list@items[[1]]

## [1]   78  746 1218

subset(moviesDF, movieId %in% predict_list@items[[1]])

##     movieId                                     title
## 72       78                Crossing Guard, The (1995)
## 628     746                      Force of Evil (1948)
## 980    1218 Killer, The (Die xue shuang xiong) (1989)
##                          genres
## 72  Action|Crime|Drama|Thriller
## 628                   Film-Noir
## 980 Action|Crime|Drama|Thriller

#Let us get the top 3 recommendations for user 1 using IBCF Model
predict_list <-recommendations(train.RatingMat, ibcf.rec.jac, userID, topN)
predict_list@items[[1]]

## [1] 401 899 350

subset(moviesDF, movieId %in% predict_list@items[[1]])

##     movieId                      title                 genres
## 311     350         Client, The (1994) Drama|Mystery|Thriller
## 721     899 Singin' in the Rain (1952) Comedy|Musical|Romance

Evaluation

Let’s evaluate different recommender algorithms and see what performs best for our ratings matrix:

scheme <- evaluationScheme(ratingDF.ratingMatrix, method = "split", train = .9, given = 4, goodRating = 4)
algorithms <- list(
  "random items" = list(name="RANDOM", param=list(normalize = "Z-score")),
  "popular items" = list(name="POPULAR", param=list(normalize = "Z-score")),
  "user-based CF" = list(name="UBCF", param=list(normalize = "Z-score",
                                                 method="Cosine",
                                                 nn=10, minRating=3)),
  "item-based CF" = list(name="IBCF", param=list(normalize = "Z-score")))

 
 

# run algorithms, predict next n movies
results1 <- evaluate(scheme, algorithms, n=c(1, 3, 5, 10, 15, 20))

## RANDOM run fold/sample [model time/prediction time]
##   1  [0sec/0.17sec] 
## POPULAR run fold/sample [model time/prediction time]
##   1  [0.03sec/0.19sec] 
## UBCF run fold/sample [model time/prediction time]
##   1

## Warning: Unknown parameters: minRating

## Available parameter (with default values):
## method    =  cosine
## nn    =  25
## sample    =  FALSE
## normalize     =  center
## verbose   =  FALSE
## [0.04sec/7.68sec] 
## IBCF run fold/sample [model time/prediction time]
##   1  [79.28sec/0.07sec]

#Draw ROC curve [true positive rate (Sensitivity) is plotted in function of the false positive rate (100-Specificity) for different cut-off points]
recommenderlab::plot(results1, annotate = 1:4, legend="topleft")

# See precision / recall
recommenderlab::plot(results1, "prec/rec", annotate=3)

Conclusion

From the above evaluation it seems like the UBCF did better than the IBCF. RANDOM items approach is the worst here, but surprisingly the POPULAR items did the best here!