My final project is my attempt at creating a basic recommender system using the MovieLens dataset I found on Kaggle. This is my process of exploring the data, preprocessing the data, creating the rating matrix, building a model, and implementing the model to get movie recommendations for each user.
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.7
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(recommenderlab)
## Warning: package 'recommenderlab' was built under R version 4.1.3
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loading required package: arules
## Warning: package 'arules' was built under R version 4.1.3
##
## Attaching package: 'arules'
## The following object is masked from 'package:dplyr':
##
## recode
## The following objects are masked from 'package:base':
##
## abbreviate, write
## Loading required package: proxy
## Warning: package 'proxy' was built under R version 4.1.3
##
## Attaching package: 'proxy'
## The following object is masked from 'package:Matrix':
##
## as.matrix
## The following objects are masked from 'package:stats':
##
## as.dist, dist
## The following object is masked from 'package:base':
##
## as.matrix
## Loading required package: registry
## Registered S3 methods overwritten by 'registry':
## method from
## print.registry_field proxy
## print.registry_entry proxy
library(stringi)
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.1.3
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
## Load the MovieLens movie metadata and user ratings from the project's GitHub repository.
movies <- read.csv(
  "https://raw.githubusercontent.com/AldataSci/FinalProject/main/movies.csv",
  header = TRUE
)
ratings <- read.csv(
  "https://raw.githubusercontent.com/AldataSci/FinalProject/main/ratings.csv",
  header = TRUE
)
## List the distinct rating values that occur in the data.
## The values are de-duplicated first, so every count in the printed table is 1;
## the table only shows which values exist, not how often they were given.
vector_rating <- unique(ratings$rating)
table_rating <- table(vector_rating)
print(table_rating)
## vector_rating
## 0.5 1 1.5 2 2.5 3 3.5 4 4.5 5
## 1 1 1 1 1 1 1 1 1 1
## Bar chart of how often each rating value was given.
## The ratings are turned into a factor so each value gets its own bar.
factor_vector <- factor(ratings$rating)
qplot(factor_vector) +
  theme_bw() +
  ggtitle("Distribution of the Rating")
## Find which movies received the most ratings in this small dataset.
## count(movieId, sort = TRUE) gives one row per movie with its number of
## ratings `n`, most-rated first, and returns an ungrouped table, so later
## verbs are not silently applied per movie as they would be after group_by().
movie_Rating <- ratings %>%
  count(movieId, sort = TRUE)
## Inner join with the movie metadata to attach each title to its count;
## the genres column is not needed here.
Movie_rating <- movie_Rating %>%
  inner_join(movies, by = "movieId") %>%
  select(-genres)
## Horizontal bar chart of the ten most-rated movies.
## head() takes the first ten rows without creating NA rows when fewer exist,
## and reorder() sorts the bars by rating count instead of alphabetically.
ggplot(head(Movie_rating, 10), aes(x = reorder(title, n), y = n)) +
  geom_col() +
  ggtitle("Top Ten Movies That Were Given A Rating") +
  xlab("Movie") +
  ylab("Movie Rated By A User") +
  coord_flip()
## Average rating per movie, with the movie title attached.
avg_movie_rating <- ratings %>%
  select(userId, movieId, rating) %>%
  group_by(movieId) %>%
  summarise(avg_rating = mean(rating)) %>%
  ## genres is dropped before the join, so only the title is added
  inner_join(select(movies, -genres), by = "movieId")
## The distribution is left-skewed with some prominent peaks; most movies have
## an average rating of around 3 or 4. Removing the lowest averages (0.5 and 1)
## might give a better-behaved distribution.
ggplot(avg_movie_rating) +
  geom_histogram(
    aes(x = avg_rating),
    color = "darkblue",
    fill = "lightblue"
  ) +
  ggtitle("Distribution of the Average Movie Rating") +
  theme_classic()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Keep only movies whose average rating is greater than 2.5 and at most 4.5,
## to make the distribution less skewed.
## The bounds are numeric: comparing against the strings "2.5" and "4.5" would
## coerce avg_rating to character and compare it as text, not as a number.
avg_movie_rating_clean <- ratings %>%
  group_by(movieId) %>%
  summarise(avg_rating = mean(rating)) %>%
  filter(avg_rating > 2.5, avg_rating <= 4.5) %>%
  inner_join(movies, by = "movieId") %>%
  select(-genres)
## With the extremes removed the distribution looks reasonable, although it
## still has several sharp peaks.
## The extremes were dropped because those averages may be driven by only a
## few users' ratings.
ggplot(avg_movie_rating_clean) +
  geom_histogram(
    aes(x = avg_rating),
    color = "purple",
    fill = "lightblue"
  ) +
  ggtitle("Distribution of the Average Movie Rating With Low Rating Removed") +
  theme_classic()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## To build a rating matrix, reshape the ratings from long to wide format:
## one row per user, one column per movieId, and each cell holding the rating
## that user gave to that movie.
ratingmat <- dcast(
  ratings,
  userId ~ movieId,
  value.var = "rating",
  na.rm = FALSE
)
dim(ratingmat)
## [1] 610 9725
## Drop the userId column so that only the ratings remain, as a plain matrix.
ratingMat <- as.matrix(select(ratingmat, -userId))
## Convert the matrix to recommenderlab's realRatingMatrix class.
RatingMat <- as(ratingMat, "realRatingMatrix")
## Heatmap of the ratings of the first 50 users for the first 50 movies;
## it shows that the matrix is mainly sparse.
## The realRatingMatrix (RatingMat) is plotted rather than the raw base matrix
## (ratingMat) so that recommenderlab's image method is used, which draws users
## as rows and movies as columns with labelled axes.
image(RatingMat[1:50, 1:50], main = "Heatmap of the Rating Matrix")
## Split the users (rows) into a training set (about 80%) and a testing set
## (about 20%).
## The seed is fixed so that the split, and every recommendation derived from
## it, can be reproduced from run to run.
set.seed(123)
training <- sample(
  x = c(TRUE, FALSE),
  size = nrow(RatingMat),
  replace = TRUE,
  prob = c(0.8, 0.2)
)
recc_training_data <- RatingMat[training, ]
recc_testing_data <- RatingMat[!training, ]
## Fit a user-based collaborative filtering (UBCF) model on the training users,
## using cosine similarity between users.
recc_model <- Recommender(
  data = recc_training_data,
  method = "UBCF",
  parameter = list(method = "Cosine")
)
## Predict the top five recommendations for each user in the testing set.
Top_5_pred <- predict(recc_model, recc_testing_data, n = 5)
## Build a lookup table with one row per item of the prediction object, in the
## same order as its item labels, so that the positional indices stored in
## Top_5_pred@items select the matching title.
## Indexing `movies` directly gives wrong titles as soon as `movies` contains a
## movie that has no column in the rating matrix, because row positions and
## item positions then no longer line up.
## The item labels are the movieId column names of the rating matrix, so they
## are converted back to integers before joining on movieId.
## The title column is renamed to `title.x`, the name the lookups further down
## in this script use.
movie_Labels <- data.frame(movieId = as.integer(Top_5_pred@itemLabels)) %>%
  left_join(movies, by = "movieId") %>%
  dplyr::rename(title.x = title)
## Item indices recommended to the first user of the testing set.
recc_user_1 <- Top_5_pred@items[[1]]
print(recc_user_1)
## [1] 487 255 975 2960 2740
## Titles at those item indices.
movie_Labels[["title.x"]][recc_user_1]
## [1] "True Romance (1993)"
## [2] "Léon: The Professional (a.k.a. The Professional) (Léon) (1994)"
## [3] "Highlander (1986)"
## [4] "Billy Elliot (2000)"
## [5] "Baraka (1992)"
## The predictions as a plain list, one element per test user.
Top_5_List <- as(Top_5_pred, "list")
## Look up the recommended titles for every test user. When each user has the
## same number of recommendations, sapply() returns a character matrix with
## one column per user.
recc_Matrix <- sapply(Top_5_pred@items, function(item_idx) {
  movie_Labels$title.x[item_idx]
})
## Showing the Recommendation For the First 5 Users:
## The column range is capped at the number of users so that this also works
## when the testing set has fewer than five users; drop = FALSE keeps the
## result a matrix even when only one column is selected.
recc_Matrix[, seq_len(min(5, ncol(recc_Matrix))), drop = FALSE]
## [,1]
## [1,] "True Romance (1993)"
## [2,] "Léon: The Professional (a.k.a. The Professional) (Léon) (1994)"
## [3,] "Highlander (1986)"
## [4,] "Billy Elliot (2000)"
## [5,] "Baraka (1992)"
## [,2]
## [1,] "Cube 2: Hypercube (2002)"
## [2,] "City of Lost Children, The (Cité des enfants perdus, La) (1995)"
## [3,] "Burnt by the Sun (Utomlyonnye solntsem) (1994)"
## [4,] "Vanya on 42nd Street (1994)"
## [5,] "Doors, The (1991)"
## [,3]
## [1,] "Better Off Dead... (1985)"
## [2,] "Vampire in Venice (Nosferatu a Venezia) (Nosferatu in Venice) (1986)"
## [3,] "La Cérémonie (1995)"
## [4,] "There's Something About Mary (1998)"
## [5,] "Ballad of Narayama, The (Narayama bushiko) (1983)"
## [,4]
## [1,] "Paparazzi (2004)"
## [2,] "Catch a Fire (2006)"
## [3,] "Blood Creek (a.k.a. Town Creek) (2009)"
## [4,] "A-Team, The (2010)"
## [5,] "Heartbeats (Les amours imaginaires) (2010)"
## [,5]
## [1,] "Beverly Hills Cop III (1994)"
## [2,] "One Flew Over the Cuckoo's Nest (1975)"
## [3,] "Psycho (1960)"
## [4,] "Auntie Mame (1958)"
## [5,] "True Romance (1993)"
## Check whether some movies are much more likely to be recommended than others
## by looking at how many times each title was recommended.
## In the original run, most movies were recommended 10 times or fewer and a
## few were recommended between 20 and 24 times.
recc_Matrix_table <- table(recc_Matrix)
recc_Matrix_Num <- as.numeric(recc_Matrix_table)
## Bin the counts into (0,10], (10,20] and (20,max]. The top break is never
## below 20, and unique() drops it when it equals 20, because cut() stops with
## an error if two breaks are the same.
bin_recc <- cut(
  recc_Matrix_Num,
  breaks = unique(c(0, 10, 20, max(recc_Matrix_Num, 20)))
)
qplot(bin_recc) + ggtitle("Distribution of recomendations per Users")
1. Usuelli, Michele, and Suresh K. Gorakala. Building a Recommendation System with R. Packt Publishing Limited, 2015.
2. Pierson, Lillian. “How to Build a Recommendation Engine in R: Marketing Data Science!” Data, 22 Mar. 2021, https://www.data-mania.com/blog/how-to-build-a-recommendation-engine-in-r/.
3. Efficient Reshaping Using Data.tables, 22 Sept. 2021, https://cran.r-project.org/web/packages/data.table/vignettes/datatable-reshape.html.