# Build a recommendation engine that recommends movies to users.
# Item Based Collaborative Filter recommendation system
# Load necessary libraries
library(recommenderlab)
## Loading required package: Matrix
## Loading required package: arules
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
## Loading required package: proxy
##
## Attaching package: 'proxy'
## The following object is masked from 'package:Matrix':
##
## as.matrix
## The following objects are masked from 'package:stats':
##
## as.dist, dist
## The following object is masked from 'package:base':
##
## as.matrix
## Registered S3 methods overwritten by 'registry':
## method from
## print.registry_field proxy
## print.registry_entry proxy
library(ggplot2)
library(data.table)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following objects are masked from 'package:data.table':
##
## dcast, melt
## Retrieve and display data
setwd("/Users/sunny/Downloads/Movie-Recommendation-System-using-R_Project-main/IMDB-Dataset")
movie_data <- read.csv("movies.csv", stringsAsFactors = FALSE)
rating_data <- read.csv("ratings.csv")
str(movie_data)
## 'data.frame': 10329 obs. of 3 variables:
## $ movieId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ title : chr "Toy Story (1995)" "Jumanji (1995)" "Grumpier Old Men (1995)" "Waiting to Exhale (1995)" ...
## $ genres : chr "Adventure|Animation|Children|Comedy|Fantasy" "Adventure|Children|Fantasy" "Comedy|Romance" "Comedy|Drama|Romance" ...
### Overview the summary
summary(movie_data)
## movieId title genres
## Min. : 1 Length:10329 Length:10329
## 1st Qu.: 3240 Class :character Class :character
## Median : 7088 Mode :character Mode :character
## Mean : 31924
## 3rd Qu.: 59900
## Max. :149532
head(movie_data)
## movieId title
## 1 1 Toy Story (1995)
## 2 2 Jumanji (1995)
## 3 3 Grumpier Old Men (1995)
## 4 4 Waiting to Exhale (1995)
## 5 5 Father of the Bride Part II (1995)
## 6 6 Heat (1995)
## genres
## 1 Adventure|Animation|Children|Comedy|Fantasy
## 2 Adventure|Children|Fantasy
## 3 Comedy|Romance
## 4 Comedy|Drama|Romance
## 5 Comedy
## 6 Action|Crime|Thriller
summary(rating_data)
## userId movieId rating timestamp
## Min. : 1.0 Min. : 1 Min. :0.500 Min. :8.286e+08
## 1st Qu.:192.0 1st Qu.: 1073 1st Qu.:3.000 1st Qu.:9.711e+08
## Median :383.0 Median : 2497 Median :3.500 Median :1.115e+09
## Mean :364.9 Mean : 13381 Mean :3.517 Mean :1.130e+09
## 3rd Qu.:557.0 3rd Qu.: 5991 3rd Qu.:4.000 3rd Qu.:1.275e+09
## Max. :668.0 Max. :149532 Max. :5.000 Max. :1.452e+09
head(rating_data)
## userId movieId rating timestamp
## 1 1 16 4.0 1217897793
## 2 1 24 1.5 1217895807
## 3 1 32 4.0 1217896246
## 4 1 47 4.0 1217896556
## 5 1 50 4.0 1217896523
## 6 1 110 4.0 1217896150
## Data pre-processing
# Creating a one-hot encoding to create a matrix that comprises of corresponding genres for each of the films.
movie_genre <- as.data.frame(movie_data$genres, stringsAsFactors = FALSE)
library(data.table)
movie_genre2 <- as.data.frame(tstrsplit(movie_genre[, 1], '[|]',
type.convert = TRUE),
stringsAsFactors = FALSE)
colnames(movie_genre2) <- c(1:10)
list_genre <- c("Action", "Adventure", "Animation", "Children",
"Comedy", "Crime", "Documentary", "Drama", "Fantasy",
"Film-Noir", "Horror", "Musical", "Mystery", "Romance",
"Sci-Fi", "Thriller", "War", "Western")
genre_mat1 <- matrix(0, 10330, 18)
genre_mat1[1, ] <- list_genre
colnames(genre_mat1) <- list_genre
for (index in 1:nrow(movie_genre2)) {
for (col in 1:ncol(movie_genre2)) {
gen_col = which(genre_mat1[1, ] == movie_genre2[index, col])
genre_mat1[index + 1, gen_col] <- 1
}
}
genre_mat2 <- as.data.frame(genre_mat1[-1, ], stringsAsFactors = FALSE) # remove first row, which was the genre list
for (col in 1:ncol(genre_mat2)) {
genre_mat2[, col] <- as.integer(genre_mat2[, col]) # convert from characters to integers
}
str(genre_mat2)
## 'data.frame': 10329 obs. of 18 variables:
## $ Action : int 0 0 0 0 0 1 0 0 1 1 ...
## $ Adventure : int 1 1 0 0 0 0 0 1 0 1 ...
## $ Animation : int 1 0 0 0 0 0 0 0 0 0 ...
## $ Children : int 1 1 0 0 0 0 0 1 0 0 ...
## $ Comedy : int 1 0 1 1 1 0 1 0 0 0 ...
## $ Crime : int 0 0 0 0 0 1 0 0 0 0 ...
## $ Documentary: int 0 0 0 0 0 0 0 0 0 0 ...
## $ Drama : int 0 0 0 1 0 0 0 0 0 0 ...
## $ Fantasy : int 1 1 0 0 0 0 0 0 0 0 ...
## $ Film-Noir : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Horror : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Musical : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Mystery : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Romance : int 0 0 1 1 0 0 1 0 0 0 ...
## $ Sci-Fi : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Thriller : int 0 0 0 0 0 1 0 0 0 1 ...
## $ War : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Western : int 0 0 0 0 0 0 0 0 0 0 ...
# Creating a ‘search matrix’ - searching films by specifying the genre
SearchMatrix <- cbind(movie_data[, 1:2], genre_mat2[])
head(SearchMatrix)
## movieId title Action Adventure Animation
## 1 1 Toy Story (1995) 0 1 1
## 2 2 Jumanji (1995) 0 1 0
## 3 3 Grumpier Old Men (1995) 0 0 0
## 4 4 Waiting to Exhale (1995) 0 0 0
## 5 5 Father of the Bride Part II (1995) 0 0 0
## 6 6 Heat (1995) 1 0 0
## Children Comedy Crime Documentary Drama Fantasy Film-Noir Horror Musical
## 1 1 1 0 0 0 1 0 0 0
## 2 1 0 0 0 0 1 0 0 0
## 3 0 1 0 0 0 0 0 0 0
## 4 0 1 0 0 1 0 0 0 0
## 5 0 1 0 0 0 0 0 0 0
## 6 0 0 1 0 0 0 0 0 0
## Mystery Romance Sci-Fi Thriller War Western
## 1 0 0 0 0 0 0
## 2 0 0 0 0 0 0
## 3 0 1 0 0 0 0
## 4 0 1 0 0 0 0
## 5 0 0 0 0 0 0
## 6 0 0 0 1 0 0
ratingMatrix <- dcast(rating_data, userId ~ movieId, value.var = "rating", na.rm = FALSE)
ratingMatrix <- as.matrix(ratingMatrix[, -1]) # remove userIds
# Convert rating matrix into a recommenderlab sparse matrix
ratingMatrix <- as(ratingMatrix, "realRatingMatrix")
ratingMatrix
## 668 x 10325 rating matrix of class 'realRatingMatrix' with 105339 ratings.
## Overview some important parameters for building recommendation systems for movies
recommendation_model <- recommenderRegistry$get_entries(dataType = "realRatingMatrix")
names(recommendation_model)
## [1] "HYBRID_realRatingMatrix" "ALS_realRatingMatrix"
## [3] "ALS_implicit_realRatingMatrix" "IBCF_realRatingMatrix"
## [5] "LIBMF_realRatingMatrix" "POPULAR_realRatingMatrix"
## [7] "RANDOM_realRatingMatrix" "RERECOMMEND_realRatingMatrix"
## [9] "SVD_realRatingMatrix" "SVDF_realRatingMatrix"
## [11] "UBCF_realRatingMatrix"
lapply(recommendation_model, "[[", "description")
## $HYBRID_realRatingMatrix
## [1] "Hybrid recommender that aggegates several recommendation strategies using weighted averages."
##
## $ALS_realRatingMatrix
## [1] "Recommender for explicit ratings based on latent factors, calculated by alternating least squares algorithm."
##
## $ALS_implicit_realRatingMatrix
## [1] "Recommender for implicit data based on latent factors, calculated by alternating least squares algorithm."
##
## $IBCF_realRatingMatrix
## [1] "Recommender based on item-based collaborative filtering."
##
## $LIBMF_realRatingMatrix
## [1] "Matrix factorization with LIBMF via package recosystem (https://cran.r-project.org/web/packages/recosystem/vignettes/introduction.html)."
##
## $POPULAR_realRatingMatrix
## [1] "Recommender based on item popularity."
##
## $RANDOM_realRatingMatrix
## [1] "Produce random recommendations (real ratings)."
##
## $RERECOMMEND_realRatingMatrix
## [1] "Re-recommends highly rated items (real ratings)."
##
## $SVD_realRatingMatrix
## [1] "Recommender based on SVD approximation with column-mean imputation."
##
## $SVDF_realRatingMatrix
## [1] "Recommender based on Funk SVD with gradient descend (https://sifter.org/~simon/journal/20061211.html)."
##
## $UBCF_realRatingMatrix
## [1] "Recommender based on user-based collaborative filtering."
## Implementing a single model in the R project – Item Based Collaborative Filtering
recommendation_model$IBCF_realRatingMatrix$parameters
## $k
## [1] 30
##
## $method
## [1] "cosine"
##
## $normalize
## [1] "center"
##
## $normalize_sim_matrix
## [1] FALSE
##
## $alpha
## [1] 0.5
##
## $na_as_zero
## [1] FALSE
## Collaborative Filtering involves suggesting movies to the users that are based on collecting preferences from many other users.
## With the help of recommenderlab, we can compute similarities between users
similarity_mat <- similarity(ratingMatrix[1:4, ],
method = "cosine",
which = "users")
as.matrix(similarity_mat)
## 1 2 3 4
## 1 NA 0.9880430 0.9820862 0.9957199
## 2 0.9880430 NA 0.9962866 0.9687126
## 3 0.9820862 0.9962866 NA 0.9944484
## 4 0.9957199 0.9687126 0.9944484 NA
image(as.matrix(similarity_mat), main = "User's Similarities")

## Portray the similarity that is shared between the films
movie_similarity <- similarity(ratingMatrix[, 1:4], method = "cosine", which = "items")
as.matrix(movie_similarity)
## 1 2 3 4
## 1 NA 0.9834866 0.9779671 0.9550638
## 2 0.9834866 NA 0.9829378 0.9706208
## 3 0.9779671 0.9829378 NA 0.9932438
## 4 0.9550638 0.9706208 0.9932438 NA
image(as.matrix(movie_similarity), main = "Movies similarity")

rating_values <- as.vector(ratingMatrix@data)
unique(rating_values) # extracting unique ratings
## [1] 0.0 5.0 4.0 3.0 4.5 1.5 2.0 3.5 1.0 2.5 0.5
Table_of_Ratings <- table(rating_values) # creating a count of movie ratings
Table_of_Ratings
## rating_values
## 0 0.5 1 1.5 2 2.5 3 3.5 4 4.5
## 6791761 1198 3258 1567 7943 5484 21729 12237 28880 8187
## 5
## 14856
## Most viewed movies visualization
library(ggplot2)
movie_views <- colCounts(ratingMatrix) # count views for each movie
table_views <- data.frame(movie = names(movie_views),
views = movie_views) # create dataframe of views
table_views <- table_views[order(table_views$views,
decreasing = TRUE), ] # sort by number of views
table_views$title <- NA
for (index in 1:10325){
table_views[index, 3] <- as.character(subset(movie_data,
movie_data$movieId == table_views[index, 1])$title)
}
table_views[1:6, ]
## movie views title
## 296 296 325 Pulp Fiction (1994)
## 356 356 311 Forrest Gump (1994)
## 318 318 308 Shawshank Redemption, The (1994)
## 480 480 294 Jurassic Park (1993)
## 593 593 290 Silence of the Lambs, The (1991)
## 260 260 273 Star Wars: Episode IV - A New Hope (1977)
# Visualize a bar plot for the total number of views of the top films
## Plotting Total Views of the Top Films
ggplot(table_views[1:6, ], aes(x = title, y = views)) +
geom_bar(stat = "identity", fill = 'steelblue') +
geom_text(aes(label = views), vjust = -0.3, size = 3.5) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
ggtitle("Total Views of the Top Films")

# Heatmap of Movie Ratings
## Visualize a heatmap of the movie ratings
# Data Preparation
### Describing matrix of relevant users
#### Heatmap of the first 25 rows and 25 columns
image(ratingMatrix[1:20, 1:25], axes = FALSE, main = "Heatmap of the first 25 rows and 25 columns")

movie_ratings <- ratingMatrix[rowCounts(ratingMatrix) > 50,
colCounts(ratingMatrix) > 50]
movie_ratings
## 420 x 447 rating matrix of class 'realRatingMatrix' with 38341 ratings.
#### Heatmap of the top users and movies
minimum_movies <- quantile(rowCounts(movie_ratings), 0.98)
minimum_users <- quantile(colCounts(movie_ratings), 0.98)
image(movie_ratings[rowCounts(movie_ratings) > minimum_movies,
colCounts(movie_ratings) > minimum_users],
main = "Heatmap of the top users and movies")

## Visualizing the distribution of the average ratings per user
average_ratings <- rowMeans(movie_ratings)
qplot(average_ratings, fill = I("steelblue"), col = I("red")) +
ggtitle("Distribution of the average rating per user")
## Warning: `qplot()` was deprecated in ggplot2 3.4.0.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Data Normalization
normalized_ratings <- normalize(movie_ratings)
sum(rowMeans(normalized_ratings) > 0.00001)
## [1] 0
image(normalized_ratings[rowCounts(normalized_ratings) > minimum_movies,
colCounts(normalized_ratings) > minimum_users],
main = "Normalized Ratings of the Top Users")

# Data Binarization
binary_minimum_movies <- quantile(rowCounts(movie_ratings), 0.95)
binary_minimum_users <- quantile(colCounts(movie_ratings), 0.95)
good_rated_films <- binarize(movie_ratings, minRating = 3)
image(good_rated_films[rowCounts(movie_ratings) > binary_minimum_movies,
colCounts(movie_ratings) > binary_minimum_users],
main = "Heatmap of the top users and movies")

# Collaborative Filtering System
## Splitting the dataset into 80% training set and 20% test set
sampled_data <- sample(x = c(TRUE, FALSE),
size = nrow(movie_ratings),
replace = TRUE,
prob = c(0.8, 0.2))
training_data <- movie_ratings[sampled_data, ]
testing_data <- movie_ratings[!sampled_data, ]
## Building the Recommendation System
recommendation_system <- recommenderRegistry$get_entries(dataType = "realRatingMatrix")
recommendation_system$IBCF_realRatingMatrix$parameters
## $k
## [1] 30
##
## $method
## [1] "cosine"
##
## $normalize
## [1] "center"
##
## $normalize_sim_matrix
## [1] FALSE
##
## $alpha
## [1] 0.5
##
## $na_as_zero
## [1] FALSE
recommen_model <- Recommender(data = training_data,
method = "IBCF",
parameter = list(k = 30))
recommen_model
## Recommender of type 'IBCF' for 'realRatingMatrix'
## learned using 334 users.
class(recommen_model)
## [1] "Recommender"
## attr(,"package")
## [1] "recommenderlab"
## Exploring the data science recommendation system model
model_info <- getModel(recommen_model)
class(model_info$sim)
## [1] "dgCMatrix"
## attr(,"package")
## [1] "Matrix"
dim(model_info$sim)
## [1] 447 447
top_items <- 20
image(model_info$sim[1:top_items, 1:top_items],
main = "Heatmap of the first rows and columns")

## Visualize sum of rows and columns with the similarity of the objects above 0
sum_rows <- rowSums(model_info$sim > 0)
table(sum_rows)
## sum_rows
## 30
## 447
sum_cols <- colSums(model_info$sim > 0)
qplot(sum_cols, fill = I("steelblue"), col = I("red")) +
ggtitle("Distribution of the column count")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## the number of items to recommend to each user
top_recommendations <- 10
predicted_recommendations <- predict(object = recommen_model,
newdata = testing_data,
n = top_recommendations)
predicted_recommendations
## Recommendations as 'topNList' with n = 10 for 86 users.
## recommendation for the first user
user1 <- predicted_recommendations@items[[1]]
movies_user1 <- predicted_recommendations@itemLabels[user1]
movies_user2 <- movies_user1
for (index in 1:10){
movies_user2[index] <- as.character(subset(movie_data,
movie_data$movieId == movies_user1[index])$title)
}
movies_user2
## [1] "Leaving Las Vegas (1995)"
## [2] "Mr. Holland's Opus (1995)"
## [3] "In the Line of Fire (1993)"
## [4] "Philadelphia (1993)"
## [5] "Schindler's List (1993)"
## [6] "Searching for Bobby Fischer (1993)"
## [7] "Blade Runner (1982)"
## [8] "Dances with Wolves (1990)"
## [9] "Fargo (1996)"
## [10] "Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964)"
## matrix with the recommendations for each user
recommendation_matrix <- sapply(predicted_recommendations@items,
function(x){ as.integer(colnames(movie_ratings)[x]) })
## Distribution of the Number of Items for IBCF
number_of_items <- factor(table(recommendation_matrix))
chart_title <- "Distribution of the Number of Items for IBCF"
qplot(number_of_items, fill = I("steelblue"), col = I("red")) +
ggtitle(chart_title)

number_of_items_sorted <- sort(number_of_items, decreasing = TRUE)
number_of_items_top <- head(number_of_items_sorted, n = 4)
table_top <- data.frame(as.integer(names(number_of_items_top)),
number_of_items_top)
for(i in 1:4) {
table_top[i,1] <- as.character(subset(movie_data,
movie_data$movieId == table_top[i,1])$title)
}
colnames(table_top) <- c("Movie Title", "No. of Items")
head(table_top)
## Movie Title No. of Items
## 62 Mr. Holland's Opus (1995) 10
## 923 Citizen Kane (1941) 9
## 1207 To Kill a Mockingbird (1962) 9
## 21 Get Shorty (1995) 8