library(recommenderlab)
library(reshape2)
library(RCurl)
library(pander)
library(ggplot2)
library(knitr)
library(dplyr)
library(tidyr)
library(ggplot2)

1.0 Objective

Select an existing dataset containing user-item ratings and implement at least two of these recommendation algorithms. Also evaluate and compare different approaches

2.0 Load the data & transform

2.1 Data Load

Load the sample dataset with titles and ratings for movies from Movielens - https://grouplens.org/datasets/movielens/

# Movie data
movies_DF <- read.csv("movies.csv", stringsAsFactors = FALSE)
knitr::kable(head(movies_DF))
movieId title genres
1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy
2 Jumanji (1995) Adventure|Children|Fantasy
3 Grumpier Old Men (1995) Comedy|Romance
4 Waiting to Exhale (1995) Comedy|Drama|Romance
5 Father of the Bride Part II (1995) Comedy
6 Heat (1995) Action|Crime|Thriller
# Ratings data
rating_DF <- read.csv("ratings.csv",header = TRUE, stringsAsFactors = FALSE)

# Remove timestamp & column
rating_DF <- rating_DF %>% select(userId, movieId, rating)
knitr::kable(head(rating_DF))
userId movieId rating
1 31 2.5
1 1029 3.0
1 1061 3.0
1 1129 2.0
1 1172 4.0
1 1263 2.0

2.2 Data Transformation

Also find average rating by item (movie) and the top 3

rating_DF_horizontal <- acast(rating_DF, userId ~ movieId, value.var="rating")
# change Na's to 0
#rating_DF_horizontal_test <- rating_DF_horizontal
#rating_DF_horizontal_test[is.na(rating_DF_horizontal_test)] <- 0

#Lets find the top 3 movies by mean rating.

movie_mean <- colMeans(rating_DF_horizontal,na.rm = TRUE)
movie_ordered <- order(movie_mean, decreasing = TRUE)

movies_DF %>% filter(movieId %in% movie_ordered[1:4])
##   movieId                 title        genres
## 1     162          Crumb (1994)   Documentary
## 2     270    Love Affair (1994) Drama|Romance
## 3     278 Miami Rhapsody (1995)        Comedy

Now, lets create a Matrix for the ratings using recommenderlab package.

rating_DF.ratingMatrix <- as(rating_DF_horizontal, "realRatingMatrix")

#. Users who have rated at least 25 movies and movies that have been watched at least 50 times

rating_DF.ratingMatrix <- rating_DF.ratingMatrix[rowCounts(rating_DF.ratingMatrix) > 25,colCounts(rating_DF.ratingMatrix) > 50] 

3.0 Data Visualization

# Calculate mean of user ratings
h1 <- hist(rowMeans(rating_DF.ratingMatrix), breaks=100, main="Histogram - Mean User Ratings", xlab="Ratings", col = "green")

# Calculate mean rating for eash movie
h2 <- hist(colMeans(rating_DF.ratingMatrix), breaks=100, main="Histogram - Mean Movie Ratings", xlab="Ratings", col = "blue")

# Create a heatmap:

min_movies <- quantile(rowCounts(rating_DF.ratingMatrix), 0.98)
min_raters <- quantile(colCounts(rating_DF.ratingMatrix), 0.98)

hm1 <- image(rating_DF.ratingMatrix[rowCounts(rating_DF.ratingMatrix) > min_movies, colCounts(rating_DF.ratingMatrix) > min_raters], main = "Heatmap - #Top raters and movies", xlab = "Items (movies)", ylab = "Users (Raters)")
hm1

# Create a Normalized matrix
ratingMatrix.normalized <- normalize(rating_DF.ratingMatrix)

min_movies_n <- quantile(rowCounts(ratingMatrix.normalized), 0.98)
min_raters_n <- quantile(colCounts(ratingMatrix.normalized), 0.98)

hm2 <- image(ratingMatrix.normalized[rowCounts(ratingMatrix.normalized) > min_movies_n,colCounts(ratingMatrix.normalized) > min_raters_n], main = "Heatmap - Top raters and movies", xlab = "Items (movies)", ylab = "Users (Raters)")
hm2

4.0 Split data into training and test datasets

## Picking 50% of the sample

sample_df <- floor(0.50 * nrow(rating_DF.ratingMatrix))
set.seed(465)

rating_sample <- sample(seq_len(nrow(rating_DF.ratingMatrix)), size = sample_df)


train.RatingMatrix <- rating_DF.ratingMatrix[rating_sample, ]
test.RatingMatrix <- rating_DF.ratingMatrix[-rating_sample, ]
head(test.RatingMatrix)
## 1 x 444 rating matrix of class 'realRatingMatrix' with 55 ratings.

5.0 Model Building

5.1 Item-Item Collaborative Filtering (IBCF)

ibcf <- function(data,test_data,method,n_users,n_recommended)
{
  ub_model <- Recommender(data,method = method, param = list(method = "Cosine",k = n_users))
  ub_predict <-
    predict(ub_model, newdata = test_data, n = n_recommended)
  
  result_list <- as(ub_predict, "list")
  result <- matrix(0, n_recommended)
  for (i in c(1:n_recommended)) {
    result[i] <- movies_DF$title[as.integer(result_list[[1]][i])]
  }
  
  return(result)
}

5.2 User-User Collaborative Filtering (UBCF)

ubcf <- function(data,test_data,method,n_users,n_recommended)
{
  ub_model <- Recommender(data,method = method, param = list(method = "Cosine",nn = n_users))
  ub_predict <-
    predict(ub_model, newdata = test_data, n = n_recommended)
  
  result_list <- as(ub_predict, "list")
  result <- matrix(0, n_recommended)
  for (i in c(1:n_recommended)) {
    result[i] <- movies_DF$title[as.integer(result_list[[1]][i])]
  }
  
  return(result)
}

6.0 Evaluation

6.1 Item-item collaborative Filtering

For our first evauluation We will select 30 similar users and choose the top 5 recommendations.

eval_1 <- ibcf(train.RatingMatrix,test.RatingMatrix,"IBCF",30,5)
eval_1
##      [,1]                                                
## [1,] "Across the Sea of Time (1995)"                     
## [2,] "Funeral, The (1996)"                               
## [3,] "Cheech and Chong's Up in Smoke (1978)"             
## [4,] "Austin Powers: International Man of Mystery (1997)"
## [5,] "Say Anything... (1989)"

For our second evauluation We will select 15 similar users and choose the top 5 recommendations.

eval_2 <- ibcf(train.RatingMatrix,test.RatingMatrix,"IBCF",15,5)
eval_2
##      [,1]                                    
## [1,] "Dracula (Bram Stoker's Dracula) (1992)"
## [2,] "Designated Mourner, The (1997)"        
## [3,] "Palmetto (1998)"                       
## [4,] "Roger & Me (1989)"                     
## [5,] "Mummy, The (1959)"

6.2 User-User collaborative Filtering

For our third evauluation We will select 30 similar users and choose the top 5 recommendations.

eval_3 <- ubcf(train.RatingMatrix,test.RatingMatrix,"UBCF",30,5)
eval_3
##      [,1]                                                    
## [1,] "Crooklyn (1994)"                                       
## [2,] "Crash (1996)"                                          
## [3,] "Wallace & Gromit: The Best of Aardman Animation (1996)"
## [4,] "Toy Story (1995)"                                      
## [5,] "Black and White (1999)"

For our fourth evauluation We will select 15 similar users and choose the top 5 recommendations.

eval_4 <- ubcf(train.RatingMatrix,test.RatingMatrix,"UBCF",15,5)
eval_4
##      [,1]                                                          
## [1,] "Crooklyn (1994)"                                             
## [2,] "Toy Story (1995)"                                            
## [3,] "Raise the Red Lantern (Da hong deng long gao gao gua) (1991)"
## [4,] "Wallace & Gromit: The Best of Aardman Animation (1996)"      
## [5,] "Amityville Horror, The (1979)"

7.0 Summary

We noticed that the number of neighbors influences the recommendation. Item-item collaborative filtering recommeded Sense and Sensibility (1995) with 30 neighbors but recommended Dead Presidents (1995) with 15 neighbors.

output <- data.frame(IBCF_30_5 = eval_1 , UBCF_30_5 = eval_3 ,IBCF_15_5 = eval_2, UBCF15_5 = eval_4)
knitr::kable(output)
IBCF_30_5 UBCF_30_5 IBCF_15_5 UBCF15_5
Across the Sea of Time (1995) Crooklyn (1994) Dracula (Bram Stoker’s Dracula) (1992) Crooklyn (1994)
Funeral, The (1996) Crash (1996) Designated Mourner, The (1997) Toy Story (1995)
Cheech and Chong’s Up in Smoke (1978) Wallace & Gromit: The Best of Aardman Animation (1996) Palmetto (1998) Raise the Red Lantern (Da hong deng long gao gao gua) (1991)
Austin Powers: International Man of Mystery (1997) Toy Story (1995) Roger & Me (1989) Wallace & Gromit: The Best of Aardman Animation (1996)
Say Anything… (1989) Black and White (1999) Mummy, The (1959) Amityville Horror, The (1979)

Comparing the results of UBCF with IBCF helps in understanding the algorithm better. UBCF needs to access the initial data, so it is a lazy-learning model. Since it needs to keep the entire database in memory, it doesn’t work well in the presence of a big rating matrix.

Also, building the similarity matrix requires a lot of computing power and time. However, UBCF’s accuracy is proven to be slightly more accurate than IBCF, so it’s a good option if the dataset is not too big.

Also note that collaborative filtering has some limitations. When dealing with new users and/ or new items, the algorithm has these potential problems:

`