library(recommenderlab)
library(reshape2)
library(RCurl)
library(pander)
library(ggplot2)
library(knitr)
library(dplyr)
library(tidyr)
library(ggplot2)
Select an existing dataset containing user-item ratings and implement at least two of these recommendation algorithms. Also evaluate and compare different approaches
Load the sample dataset with titles and ratings for movies from Movielens - https://grouplens.org/datasets/movielens/
# Movie data
movies_DF <- read.csv("movies.csv", stringsAsFactors = FALSE)
knitr::kable(head(movies_DF))
| movieId | title | genres |
|---|---|---|
| 1 | Toy Story (1995) | Adventure|Animation|Children|Comedy|Fantasy |
| 2 | Jumanji (1995) | Adventure|Children|Fantasy |
| 3 | Grumpier Old Men (1995) | Comedy|Romance |
| 4 | Waiting to Exhale (1995) | Comedy|Drama|Romance |
| 5 | Father of the Bride Part II (1995) | Comedy |
| 6 | Heat (1995) | Action|Crime|Thriller |
# Ratings data
rating_DF <- read.csv("ratings.csv",header = TRUE, stringsAsFactors = FALSE)
# Remove timestamp & column
rating_DF <- rating_DF %>% select(userId, movieId, rating)
knitr::kable(head(rating_DF))
| userId | movieId | rating |
|---|---|---|
| 1 | 31 | 2.5 |
| 1 | 1029 | 3.0 |
| 1 | 1061 | 3.0 |
| 1 | 1129 | 2.0 |
| 1 | 1172 | 4.0 |
| 1 | 1263 | 2.0 |
Also find average rating by item (movie) and the top 3
rating_DF_horizontal <- acast(rating_DF, userId ~ movieId, value.var="rating")
# change Na's to 0
#rating_DF_horizontal_test <- rating_DF_horizontal
#rating_DF_horizontal_test[is.na(rating_DF_horizontal_test)] <- 0
#Lets find the top 3 movies by mean rating.
movie_mean <- colMeans(rating_DF_horizontal,na.rm = TRUE)
movie_ordered <- order(movie_mean, decreasing = TRUE)
movies_DF %>% filter(movieId %in% movie_ordered[1:4])
## movieId title genres
## 1 162 Crumb (1994) Documentary
## 2 270 Love Affair (1994) Drama|Romance
## 3 278 Miami Rhapsody (1995) Comedy
Now, lets create a Matrix for the ratings using recommenderlab package.
rating_DF.ratingMatrix <- as(rating_DF_horizontal, "realRatingMatrix")
#. Users who have rated at least 25 movies and movies that have been watched at least 50 times
rating_DF.ratingMatrix <- rating_DF.ratingMatrix[rowCounts(rating_DF.ratingMatrix) > 25,colCounts(rating_DF.ratingMatrix) > 50]
# Calculate mean of user ratings
h1 <- hist(rowMeans(rating_DF.ratingMatrix), breaks=100, main="Histogram - Mean User Ratings", xlab="Ratings", col = "green")
# Calculate mean rating for eash movie
h2 <- hist(colMeans(rating_DF.ratingMatrix), breaks=100, main="Histogram - Mean Movie Ratings", xlab="Ratings", col = "blue")
# Create a heatmap:
min_movies <- quantile(rowCounts(rating_DF.ratingMatrix), 0.98)
min_raters <- quantile(colCounts(rating_DF.ratingMatrix), 0.98)
hm1 <- image(rating_DF.ratingMatrix[rowCounts(rating_DF.ratingMatrix) > min_movies, colCounts(rating_DF.ratingMatrix) > min_raters], main = "Heatmap - #Top raters and movies", xlab = "Items (movies)", ylab = "Users (Raters)")
hm1
# Create a Normalized matrix
ratingMatrix.normalized <- normalize(rating_DF.ratingMatrix)
min_movies_n <- quantile(rowCounts(ratingMatrix.normalized), 0.98)
min_raters_n <- quantile(colCounts(ratingMatrix.normalized), 0.98)
hm2 <- image(ratingMatrix.normalized[rowCounts(ratingMatrix.normalized) > min_movies_n,colCounts(ratingMatrix.normalized) > min_raters_n], main = "Heatmap - Top raters and movies", xlab = "Items (movies)", ylab = "Users (Raters)")
hm2
## Picking 50% of the sample
sample_df <- floor(0.50 * nrow(rating_DF.ratingMatrix))
set.seed(465)
rating_sample <- sample(seq_len(nrow(rating_DF.ratingMatrix)), size = sample_df)
train.RatingMatrix <- rating_DF.ratingMatrix[rating_sample, ]
test.RatingMatrix <- rating_DF.ratingMatrix[-rating_sample, ]
head(test.RatingMatrix)
## 1 x 444 rating matrix of class 'realRatingMatrix' with 55 ratings.
ibcf <- function(data,test_data,method,n_users,n_recommended)
{
ub_model <- Recommender(data,method = method, param = list(method = "Cosine",k = n_users))
ub_predict <-
predict(ub_model, newdata = test_data, n = n_recommended)
result_list <- as(ub_predict, "list")
result <- matrix(0, n_recommended)
for (i in c(1:n_recommended)) {
result[i] <- movies_DF$title[as.integer(result_list[[1]][i])]
}
return(result)
}
ubcf <- function(data,test_data,method,n_users,n_recommended)
{
ub_model <- Recommender(data,method = method, param = list(method = "Cosine",nn = n_users))
ub_predict <-
predict(ub_model, newdata = test_data, n = n_recommended)
result_list <- as(ub_predict, "list")
result <- matrix(0, n_recommended)
for (i in c(1:n_recommended)) {
result[i] <- movies_DF$title[as.integer(result_list[[1]][i])]
}
return(result)
}
For our first evauluation We will select 30 similar users and choose the top 5 recommendations.
eval_1 <- ibcf(train.RatingMatrix,test.RatingMatrix,"IBCF",30,5)
eval_1
## [,1]
## [1,] "Across the Sea of Time (1995)"
## [2,] "Funeral, The (1996)"
## [3,] "Cheech and Chong's Up in Smoke (1978)"
## [4,] "Austin Powers: International Man of Mystery (1997)"
## [5,] "Say Anything... (1989)"
For our second evauluation We will select 15 similar users and choose the top 5 recommendations.
eval_2 <- ibcf(train.RatingMatrix,test.RatingMatrix,"IBCF",15,5)
eval_2
## [,1]
## [1,] "Dracula (Bram Stoker's Dracula) (1992)"
## [2,] "Designated Mourner, The (1997)"
## [3,] "Palmetto (1998)"
## [4,] "Roger & Me (1989)"
## [5,] "Mummy, The (1959)"
For our third evauluation We will select 30 similar users and choose the top 5 recommendations.
eval_3 <- ubcf(train.RatingMatrix,test.RatingMatrix,"UBCF",30,5)
eval_3
## [,1]
## [1,] "Crooklyn (1994)"
## [2,] "Crash (1996)"
## [3,] "Wallace & Gromit: The Best of Aardman Animation (1996)"
## [4,] "Toy Story (1995)"
## [5,] "Black and White (1999)"
For our fourth evauluation We will select 15 similar users and choose the top 5 recommendations.
eval_4 <- ubcf(train.RatingMatrix,test.RatingMatrix,"UBCF",15,5)
eval_4
## [,1]
## [1,] "Crooklyn (1994)"
## [2,] "Toy Story (1995)"
## [3,] "Raise the Red Lantern (Da hong deng long gao gao gua) (1991)"
## [4,] "Wallace & Gromit: The Best of Aardman Animation (1996)"
## [5,] "Amityville Horror, The (1979)"
We noticed that the number of neighbors influences the recommendation. Item-item collaborative filtering recommeded Sense and Sensibility (1995) with 30 neighbors but recommended Dead Presidents (1995) with 15 neighbors.
output <- data.frame(IBCF_30_5 = eval_1 , UBCF_30_5 = eval_3 ,IBCF_15_5 = eval_2, UBCF15_5 = eval_4)
knitr::kable(output)
| IBCF_30_5 | UBCF_30_5 | IBCF_15_5 | UBCF15_5 |
|---|---|---|---|
| Across the Sea of Time (1995) | Crooklyn (1994) | Dracula (Bram Stoker’s Dracula) (1992) | Crooklyn (1994) |
| Funeral, The (1996) | Crash (1996) | Designated Mourner, The (1997) | Toy Story (1995) |
| Cheech and Chong’s Up in Smoke (1978) | Wallace & Gromit: The Best of Aardman Animation (1996) | Palmetto (1998) | Raise the Red Lantern (Da hong deng long gao gao gua) (1991) |
| Austin Powers: International Man of Mystery (1997) | Toy Story (1995) | Roger & Me (1989) | Wallace & Gromit: The Best of Aardman Animation (1996) |
| Say Anything… (1989) | Black and White (1999) | Mummy, The (1959) | Amityville Horror, The (1979) |
Comparing the results of UBCF with IBCF helps in understanding the algorithm better. UBCF needs to access the initial data, so it is a lazy-learning model. Since it needs to keep the entire database in memory, it doesn’t work well in the presence of a big rating matrix.
Also, building the similarity matrix requires a lot of computing power and time. However, UBCF’s accuracy is proven to be slightly more accurate than IBCF, so it’s a good option if the dataset is not too big.
Also note that collaborative filtering has some limitations. When dealing with new users and/ or new items, the algorithm has these potential problems:
If the new user hasn’t seen any movie yet, neither the IBCF nor the UBCF is able to recommend any item. Unless the IBCF knows the items purchased by the new user, it can’t work. The UBCF needs to know which users have similar preferences to the new one, but we don’t know about its ratings.
If the new item hasn’t been purchased by anyone, it will never be recommended. IBCF matches items that have been purchased by the same users, so it won’t match the new item with any of the others. UBCF recommends to each user items purchased by similar users, and no one purchased the new item. So, the algorithm won’t recommend it to anyone.
`