Data 612 - Project 2

library(recommenderlab)
library(reshape2)
library(RCurl)
library(dplyr)
library(ggplot2)
library(knitr)
library(tidyverse)

# Load the movie catalogue from disk and preview its first rows as a table.
movies.df <- read.csv(
  file = 'movies.csv',
  header = TRUE,
  stringsAsFactors = FALSE
)
movies.df %>%
  head() %>%
  kable()

movieId	title	genres
1	Toy Story (1995)	Adventure\|Animation\|Children\|Comedy\|Fantasy
2	Jumanji (1995)	Adventure\|Children\|Fantasy
3	Grumpier Old Men (1995)	Comedy\|Romance
4	Waiting to Exhale (1995)	Comedy\|Drama\|Romance
5	Father of the Bride Part II (1995)	Comedy
6	Heat (1995)	Action\|Crime\|Thriller

# Load the user ratings from disk; strings are kept as character, not factor.
rating.df <- read.csv(
  file = 'ratings.csv',
  header = TRUE,
  stringsAsFactors = FALSE
)
# Column trimming is left disabled here; the timestamp column is removed
# later, when the rating matrix is built.
# rating.df <- rating.df %>%
#   select(userId, movieId, rating)
rating.df %>%
  head() %>%
  kable()

userId	movieId	rating	timestamp
1	1	4	964982703
1	3	4	964981247
1	6	4	964982224
1	47	5	964983815
1	50	5	964982931
1	70	3	964982400

# Convert the long ratings table into a user x movie realRatingMatrix.
# pivot_wider() replaces the superseded spread(). arrange() and
# names_sort = TRUE make the row (userId) and column (movieId) order explicit
# and ascending, matching the layout spread() produced.
# User/movie pairs without a rating become NA in the wide table.
movieMatrix <- rating.df %>%
  select(-timestamp) %>%
  arrange(userId) %>%
  pivot_wider(
    names_from = movieId,
    values_from = rating,
    names_sort = TRUE
  ) %>%
  column_to_rownames(var = "userId")
movieMatrix <- as(as.matrix(movieMatrix), "realRatingMatrix")

# Print the rating matrix summary (dimensions and number of ratings).
movieMatrix

## 610 x 9724 rating matrix of class 'realRatingMatrix' with 100836 ratings.

# Confirm the class of the converted object and the package that defines it.
class(movieMatrix)

## [1] "realRatingMatrix"
## attr(,"package")
## [1] "recommenderlab"

# List the methods that are available for objects of this class.
methods(class = class(movieMatrix))

##  [1] [                      [<-                    binarize              
##  [4] calcPredictionAccuracy coerce                 colCounts             
##  [7] colMeans               colSds                 colSums               
## [10] denormalize            dim                    dimnames              
## [13] dimnames<-             dissimilarity          evaluationScheme      
## [16] getData.frame          getList                getNormalize          
## [19] getRatingMatrix        getRatings             getTopNLists          
## [22] image                  normalize              nratings              
## [25] Recommender            removeKnownRatings     rowCounts             
## [28] rowMeans               rowSds                 rowSums               
## [31] sample                 show                   similarity            
## see '?methods' for accessing help and source code

Section 2 : Exploration & Visualization

# Pairwise cosine similarity between the first five users.
# `which` selects the dimension to compare and must be "users" or "items";
# the previous value "userId" is not one of the accepted choices.
similarity_users <- similarity(
  movieMatrix[1:5, ],
  method = "cosine",
  which = "users"
)
as.matrix(similarity_users)

##           1  2         3         4         5
## 1 0.0000000  1 0.7919033 0.9328096 0.9707699
## 2 1.0000000  0        NA 1.0000000 1.0000000
## 3 0.7919033 NA 0.0000000 1.0000000 1.0000000
## 4 0.9328096  1 1.0000000 0.0000000 0.9011374
## 5 0.9707699  1 1.0000000 0.9011374 0.0000000

# Draw the user-by-user similarity matrix as a heatmap.
as.matrix(similarity_users) %>%
  image(main = "User similarity")

# Fetch every algorithm registered for realRatingMatrix data and show the
# names of the registry entries.
recommender_models <- recommenderRegistry$get_entries(
  dataType = "realRatingMatrix"
)
names(recommender_models)

##  [1] "ALS_realRatingMatrix"          "ALS_implicit_realRatingMatrix"
##  [3] "IBCF_realRatingMatrix"         "LIBMF_realRatingMatrix"       
##  [5] "POPULAR_realRatingMatrix"      "RANDOM_realRatingMatrix"      
##  [7] "RERECOMMEND_realRatingMatrix"  "SVD_realRatingMatrix"         
##  [9] "SVDF_realRatingMatrix"         "UBCF_realRatingMatrix"

# Show the parameter list stored in the registry entry for IBCF.
recommender_models$IBCF_realRatingMatrix$parameters

## $k
## [1] 30
## 
## $method
## [1] "Cosine"
## 
## $normalize
## [1] "center"
## 
## $normalize_sim_matrix
## [1] FALSE
## 
## $alpha
## [1] 0.5
## 
## $na_as_zero
## [1] FALSE

# Dimensions of the rating matrix (rows x columns).
dim(movieMatrix)

## [1]  610 9724

# Names of the S4 slots that make up the rating matrix object.
slotNames(movieMatrix)

## [1] "data"      "normalize"

# Class of the object held in the `data` slot.
class(movieMatrix@data)

## [1] "dgCMatrix"
## attr(,"package")
## [1] "Matrix"

# Dimensions of the object held in the `data` slot.
dim(movieMatrix@data)

## [1]  610 9724

# Flatten the contents of the `data` slot into one plain vector and list the
# distinct values it contains. The value 0 shows up for cells that hold no
# rating, alongside the real rating values.
vector_ratings <- movieMatrix@data %>%
  as.vector()
unique(vector_ratings)

##  [1] 4.0 0.0 4.5 2.5 3.5 3.0 5.0 0.5 2.0 1.5 1.0

# Count how often each value occurs in the flattened vector, then print the
# resulting frequency table.
table_ratings <- table(vector_ratings)
(table_ratings)

## vector_ratings
##       0     0.5       1     1.5       2     2.5       3     3.5       4     4.5 
## 5830804    1370    2811    1791    7551    5550   20047   13136   26818    8551 
##       5 
##   13211

# Keep only rows with more than 50 ratings and columns with more than 50
# ratings, so the later models work on the densest part of the matrix.
dense_rows <- rowCounts(movieMatrix) > 50
dense_cols <- colCounts(movieMatrix) > 50
ratings_movies <- movieMatrix[dense_rows, dense_cols]

# 95th percentile of ratings per row / per column; used as heatmap cut-offs.
min_movies <- quantile(rowCounts(ratings_movies), probs = 0.95)
min_users <- quantile(colCounts(ratings_movies), probs = 0.95)

# Heatmap restricted to rows and columns above those cut-offs.
heatmap_rows <- rowCounts(ratings_movies) > min_movies
heatmap_cols <- colCounts(ratings_movies) > min_users
image(
  ratings_movies[heatmap_rows, heatmap_cols],
  main = "Heatmap of the Top Users and Movies (Not Normalized)"
)

# Mean rating of every user (one value per row of the filtered matrix).
average_ratings_per_user <- rowMeans(ratings_movies)

# Histogram of those per-user means. Each bar counts users, so the y-axis is
# labelled as a number of users rather than a number of ratings.
ggplot(
  data.frame(average_rating = average_ratings_per_user),
  aes(x = average_rating)
) +
  geom_histogram(binwidth = 0.1) +
  ggtitle("Distribution of the average rating per user") +
  xlab("Average Rating") +
  ylab("Number of Users")

# Normalize the filtered matrix with normalize()'s default settings, then
# tabulate the per-row means (rounded to 5 decimals) to check the result.
ratings_movies_norm <- normalize(ratings_movies)
avg <- ratings_movies_norm %>%
  rowMeans() %>%
  round(digits = 5)
table(avg)

## avg
##   0 
## 378

# Same heatmap as for the raw ratings, drawn from the normalized matrix and
# using the same 95th-percentile cut-offs (min_movies, min_users).
norm_heatmap_rows <- rowCounts(ratings_movies_norm) > min_movies
norm_heatmap_cols <- colCounts(ratings_movies_norm) > min_users
image(
  ratings_movies_norm[norm_heatmap_rows, norm_heatmap_cols],
  main = "Heatmap of the Top Users and Movies (Normalized)"
)

Train and Test Datasets

# Fix the random seed so the train/test assignment is the same on every run;
# without it each knit produces a different split and different results.
set.seed(612)

# Draw one TRUE/FALSE flag per row of ratings_movies: TRUE with probability
# 0.8 (training), FALSE with probability 0.2 (test).
which_train <- sample(
  x = c(TRUE, FALSE),
  size = nrow(ratings_movies),
  replace = TRUE,
  prob = c(0.8, 0.2)
)
head(which_train, 10)

##  [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE  TRUE

# Rows flagged TRUE in which_train form the training set; the remaining rows
# form the test set.
recc_data_train <- ratings_movies[which_train, ]
recc_data_test <- ratings_movies[!which_train, ]

Recommendation model

IBCF : Item-based collaborative filtering

# Query the registry for realRatingMatrix algorithms and display the
# parameter list of the IBCF entry before training.
recommender_models <- recommenderRegistry$get_entries(
  dataType = "realRatingMatrix"
)
recommender_models$IBCF_realRatingMatrix$parameters

## $k
## [1] 30
## 
## $method
## [1] "Cosine"
## 
## $normalize
## [1] "center"
## 
## $normalize_sim_matrix
## [1] FALSE
## 
## $alpha
## [1] 0.5
## 
## $na_as_zero
## [1] FALSE

Train the model

# Fit the item-based model on the training rows. k = 30 is passed explicitly
# and equals the value listed in the registry parameters above.
IBCF.recc_model <- Recommender(
  data = recc_data_train,
  method = "IBCF",
  parameter = list(k = 30)
)
IBCF.recc_model

## Recommender of type 'IBCF' for 'realRatingMatrix' 
## learned using 302 users.

Predict

# Ask the item-based model for n = 6 recommendations per row of the test set.
IBCF.recc_predicted <- predict(
  object = IBCF.recc_model,
  newdata = recc_data_test,
  n = 6
)
IBCF.recc_predicted

## Recommendations as 'topNList' with n = 6 for 76 users.

Recommendation for the first user

# First element of the prediction's `items` slot: the entries recommended to
# the first test user.
IBCF.recc_predicted@items[[1]]

## [1] 368 396 394 325 213  11

UBCF : User-based collaborative filtering

# Query the registry for realRatingMatrix algorithms and display the
# parameter list of the UBCF entry before training.
recommender_models <- recommenderRegistry$get_entries(
  dataType = "realRatingMatrix"
)
recommender_models$UBCF_realRatingMatrix$parameters

## $method
## [1] "cosine"
## 
## $nn
## [1] 25
## 
## $sample
## [1] FALSE
## 
## $normalize
## [1] "center"

# Fit the user-based model on the training rows.
# UBCF has no `k` parameter: its neighbourhood size is `nn` (see the UBCF
# parameter list above). Passing `k = 30` only raises "Unknown parameter: k"
# and leaves nn at its default of 25, so the value is supplied as `nn`.
UBCF_recc_model <- Recommender(
  data = recc_data_train,
  method = "UBCF",
  parameter = list(nn = 30)
)

## Warning: Unknown parameter: k

## Available parameter (with default values):
## method    =  cosine
## nn    =  25
## sample    =  FALSE
## normalize     =  center
## verbose   =  FALSE

# Print a short description of the fitted user-based model.
UBCF_recc_model

## Recommender of type 'UBCF' for 'realRatingMatrix' 
## learned using 302 users.

# Ask the user-based model for n = 6 recommendations per row of the test set.
UBCF.recc_predicted <- predict(
  object = UBCF_recc_model,
  newdata = recc_data_test,
  n = 6
)
UBCF.recc_predicted

## Recommendations as 'topNList' with n = 6 for 76 users.

Recommendation for the first user

# First element of the prediction's `items` slot: the entries recommended to
# the first test user.
UBCF.recc_predicted@items[[1]]

## [1] 40 97 64 78 96 24

Evaluation

# Build a single 80/20 split evaluation scheme on the filtered matrix, with
# the same given / goodRating settings for both algorithms.
eval_sets <- evaluationScheme(
  ratings_movies,
  method = "split",
  train = .8,
  given = 4,
  goodRating = 3
)

# Algorithms to compare, each run with its default parameters.
algorithms <- list(
  "user-based CF" = list(name = "UBCF"),
  "item-based CF" = list(name = "IBCF")
)

# Evaluate top-N lists of several lengths for every algorithm.
results <- evaluate(eval_sets, algorithms, n = c(2, 5, 7, 9, 11, 13))

## UBCF run fold/sample [model time/prediction time]
##   1  [0sec/0.08sec] 
## IBCF run fold/sample [model time/prediction time]
##   1  [0.55sec/0.01sec]

# Plot the evaluation results of both algorithms in one chart.
# NOTE(review): only two algorithms are evaluated above, yet annotate = 1:4
# refers to four result sets - confirm this is intended.
recommenderlab::plot(results, annotate = 1:4, legend="topleft")

Based on the evaluation plot above, UBCF appears to perform better than IBCF on this data set.

Data 612 - Project 2

Murali Kunissery

2/27/2020

Section 2 : Exploration & Visualization

Train and Test Datasets

Recommendation model

IBCF : Item-based collaborative filtering

UBCF : User-based collaborative filtering

Evaluation