Final Project

My final project is my attempt at creating a basic recommender system using the MovieLens dataset I found on Kaggle. This document walks through my process: exploring the data, preprocessing the data, creating the rating matrix, building a model, and implementing the model to get movie recommendations for each user.

library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.7
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.1     v forcats 0.5.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(recommenderlab)

## Warning: package 'recommenderlab' was built under R version 4.1.3

## Loading required package: Matrix

## 
## Attaching package: 'Matrix'

## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack

## Loading required package: arules

## Warning: package 'arules' was built under R version 4.1.3

## 
## Attaching package: 'arules'

## The following object is masked from 'package:dplyr':
## 
##     recode

## The following objects are masked from 'package:base':
## 
##     abbreviate, write

## Loading required package: proxy

## Warning: package 'proxy' was built under R version 4.1.3

## 
## Attaching package: 'proxy'

## The following object is masked from 'package:Matrix':
## 
##     as.matrix

## The following objects are masked from 'package:stats':
## 
##     as.dist, dist

## The following object is masked from 'package:base':
## 
##     as.matrix

## Loading required package: registry

## Registered S3 methods overwritten by 'registry':
##   method               from 
##   print.registry_field proxy
##   print.registry_entry proxy

library(stringi)
library(reshape2)

## Warning: package 'reshape2' was built under R version 4.1.3

## 
## Attaching package: 'reshape2'

## The following object is masked from 'package:tidyr':
## 
##     smiths

## insert the MovieLens Dataset: 

## Read the ratings table and the movies table straight from GitHub;
## both files carry a header row.
ratings <- read.csv(
  file = "https://raw.githubusercontent.com/AldataSci/FinalProject/main/ratings.csv",
  header = TRUE
)
movies <- read.csv(
  file = "https://raw.githubusercontent.com/AldataSci/FinalProject/main/movies.csv",
  header = TRUE
)

Data Exploration:

## Ratings range from 0.5 to 5 stars; list each distinct rating value.
## Keep one copy of every distinct rating value. Tabulating distinct values
## lists each value with a count of 1 (it is not a frequency table).
vector_rating <- unique(ratings$rating)
table_rating <- table(vector_rating)
print(table_rating)

## vector_rating
## 0.5   1 1.5   2 2.5   3 3.5   4 4.5   5 
##   1   1   1   1   1   1   1   1   1   1

## Looking at the distribution of the rating.. 
## Treat each rating value as a category so the plot draws one bar per value.
factor_vector <- factor(ratings$rating)
qplot(factor_vector) +
  labs(title = "Distribution of the Rating") +
  theme_bw()

## Exploring which movies have received the most ratings in this small dataset.

## arrange the movie ratings that have been rated the most
## Count how many ratings each movie received, most-rated movies first.
## count() is called directly on the column (no group_by()) so the result is
## a plain, ungrouped table instead of one left grouped by movieId with a
## single row per group. Columns: movieId, n.
movie_Rating <- ratings %>%
  count(movieId, sort = TRUE)

## Do an inner join with the movies table to see the titles of the movies that have been rated.
## Attach the movie metadata to each rating count (matched on movieId),
## then discard the genres column, which is not used here.
Movie_rating <- select(
  inner_join(movie_Rating, movies, by = "movieId"),
  -genres
)

## Horizontal bar chart of the first ten rows of Movie_rating
## (bar length = number of ratings, one bar per title).
ggplot(data = Movie_rating[1:10, ], mapping = aes(x = title, y = n)) +
  geom_col() +
  labs(
    title = "Top Ten Movies That Were Given A Rating",
    x = "Movie",
    y = "Movie Rated By A User"
  ) +
  coord_flip()

## Exploring Average Rating Per Movie: 

## Mean rating per movie, with the movie title attached.
## genres is dropped from the movies table before the join, so the result
## has the columns movieId, avg_rating and title.
avg_movie_rating <- ratings %>%
  select(userId, movieId, rating) %>%
  group_by(movieId) %>%
  summarise(avg_rating = mean(rating)) %>%
  inner_join(select(movies, -genres), by = "movieId")


## The distribution is left-skewed again with some prominent peaks, but we can see that most of the average ratings per movie are around 3 or 4. Maybe we should get rid of the 0.5 and 1 averages to get a better distribution of the data.

## Histogram of the per-movie average rating (default bin count).
ggplot(data = avg_movie_rating, mapping = aes(x = avg_rating)) +
  geom_histogram(colour = "darkblue", fill = "lightblue") +
  labs(title = "Distribution of the Average Movie Rating") +
  theme_classic()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Here I kept only the movies whose average rating is greater than 2.5 and at most 4.5, to make the data less skewed.
## Mean rating per movie, restricted to averages above 2.5 and at most 4.5,
## with the movie title attached (genres dropped).
## The bounds are numeric literals on purpose: comparing a numeric column with
## quoted values such as "2.5" makes R coerce the numbers to text and compare
## them by string collation, which is locale-dependent and not a numeric test.
avg_movie_rating_clean <- ratings %>%
  select("userId", "movieId", "rating") %>%
  group_by(movieId) %>%
  summarise(avg_rating = mean(rating)) %>%
  filter(avg_rating > 2.5 & avg_rating <= 4.5) %>%
  inner_join(movies, by = "movieId") %>%
  select(-genres)


## The distribution looks okay once the extremes are removed, but there are still too many dramatic peaks in the data.
## We removed the extremes since possibly only a few people accounted for those average ratings.
## Histogram of the filtered per-movie average rating (default bin count).
ggplot(data = avg_movie_rating_clean, mapping = aes(x = avg_rating)) +
  geom_histogram(colour = "purple", fill = "lightblue") +
  labs(title = "Distribution of the Average Movie Rating With Low Rating Removed") +
  theme_classic()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Data Preprocessing:

## In order to create a rating matrix, I had to reshape the ratings so that each row is a user, each column is a movieId, and each cell is the rating that user gave to that particular movie.

## Cast the long ratings table to wide form: one row per userId, one column
## per movieId, cells filled from the rating column.
ratingmat <- dcast(
  data = ratings,
  formula = userId ~ movieId,
  value.var = "rating",
  na.rm = FALSE
)

## Rows x columns of the wide table (the userId column is still included).
dim(ratingmat)

## [1]  610 9725

## remove the UserId:
## Drop the userId column so only the per-movie rating columns remain,
## and turn the data frame into a plain matrix.
ratingMat <- as.matrix(select(ratingmat, -userId))
## Coerce the plain matrix to the "realRatingMatrix" class.
RatingMat <- as(ratingMat, "realRatingMatrix")

## visualize the Heatmap and we can see that the matrix is mainly sparse.
## A rating matrix of the first 50 users and 50 movie ratings. 
image(
  ratingMat[seq_len(50), seq_len(50)],
  main = "Heatmap of the Rating Matrix"
)

Building A Model:

## Splitting the data into training and testing sets.
## Fix the random seed so the train/test split, and every recommendation
## derived from it, is the same on each run.
set.seed(2022)
## Draw one TRUE/FALSE flag per row of RatingMat: TRUE (training) with
## probability 0.8, FALSE (testing) with probability 0.2.
training <- sample(
  x = c(TRUE, FALSE),
  size = nrow(RatingMat),
  replace = TRUE,
  prob = c(0.8, 0.2)
)
## Rows flagged TRUE form the training set; the remaining rows the test set.
recc_training_data <- RatingMat[training, ]
recc_testing_data <- RatingMat[!training, ]

## Build a UBCF (user-based collaborative filtering) model, since we are basing the recommendations on users' ratings.
## Use the default similarity method, cosine.


## Fit the recommender on the training rows, using the "UBCF" method with
## the "Cosine" method parameter.
recc_model <- Recommender(
  data = recc_training_data,
  method = "UBCF",
  parameter = list(method = "Cosine")
)

## Use the predict function to make predictions.

## Ask the fitted model for 5 recommendations per row of the test set.
Top_5_pred <- predict(recc_model, newdata = recc_testing_data, n = 5)

## Define a data frame with a column of the ordered movie labels; the movie names are accessed through the itemLabels slot.
## Build the title lookup in the model's own item order.
## The indices stored in Top_5_pred@items are positions in
## Top_5_pred@itemLabels (the movieId column names of the rating matrix), so
## row i of this table must describe item i. Joining `movies` to itself
## instead would index titles by row position in movies.csv, which only lines
## up if every movie in that file has a rating column in the same position.
movie_Labels <- data.frame(
  movieId = as.integer(Top_5_pred@itemLabels)
) %>%
  left_join(movies, by = "movieId") %>%
  ## Keep the `title.x` column name that the rest of the script reads.
  rename(title.x = title)

Implementing The Model:

## look at the recommendation of the first user: 
## Item indices recommended for the first row of the test set.
recc_user_1 <- slot(Top_5_pred, "items")[[1]]
print(recc_user_1)

## [1]  487  255  975 2960 2740

## Look up the titles at those item indices.
movie_Labels[["title.x"]][recc_user_1]

## [1] "True Romance (1993)"                                             
## [2] "LÃ©on: The Professional (a.k.a. The Professional) (LÃ©on) (1994)"
## [3] "Highlander (1986)"                                               
## [4] "Billy Elliot (2000)"                                             
## [5] "Baraka (1992)"

## Plain-list form of the predictions.
Top_5_List <- as(Top_5_pred, "list")

## Create a matrix showing the recommended movie titles for each user (one column per user).
## For every user's vector of item indices, pull the matching titles.
recc_Matrix <- sapply(
  Top_5_pred@items,
  function(item_idx) movie_Labels$title.x[item_idx]
)

## Showing the Recommendations For the First 5 Users:

recc_Matrix[, seq_len(5)]

##      [,1]                                                              
## [1,] "True Romance (1993)"                                             
## [2,] "LÃ©on: The Professional (a.k.a. The Professional) (LÃ©on) (1994)"
## [3,] "Highlander (1986)"                                               
## [4,] "Billy Elliot (2000)"                                             
## [5,] "Baraka (1992)"                                                   
##      [,2]                                                              
## [1,] "Cube 2: Hypercube (2002)"                                        
## [2,] "City of Lost Children, The (CitÃ© des enfants perdus, La) (1995)"
## [3,] "Burnt by the Sun (Utomlyonnye solntsem) (1994)"                  
## [4,] "Vanya on 42nd Street (1994)"                                     
## [5,] "Doors, The (1991)"                                               
##      [,3]                                                                  
## [1,] "Better Off Dead... (1985)"                                           
## [2,] "Vampire in Venice (Nosferatu a Venezia) (Nosferatu in Venice) (1986)"
## [3,] "La CÃ©rÃ©monie (1995)"                                               
## [4,] "There's Something About Mary (1998)"                                 
## [5,] "Ballad of Narayama, The (Narayama bushiko) (1983)"                   
##      [,4]                                        
## [1,] "Paparazzi (2004)"                          
## [2,] "Catch a Fire (2006)"                       
## [3,] "Blood Creek (a.k.a. Town Creek) (2009)"    
## [4,] "A-Team, The (2010)"                        
## [5,] "Heartbeats (Les amours imaginaires) (2010)"
##      [,5]                                    
## [1,] "Beverly Hills Cop III (1994)"          
## [2,] "One Flew Over the Cuckoo's Nest (1975)"
## [3,] "Psycho (1960)"                         
## [4,] "Auntie Mame (1958)"                    
## [5,] "True Romance (1993)"

## We check whether some movies are much more likely to be recommended.
## Visualize the distribution of how often each movie is recommended:
## Some movies have been recommended 10 times or fewer, and a few have been recommended 20 or 24 times.
## Count how many times each title appears among all recommendations.
recc_Matrix_table <- table(recc_Matrix)
recc_Matrix_Num <- as.numeric(recc_Matrix_table)

## Bin the counts into (0,10], (10,20] and (20,max].
## The top break is clamped to at least 20 and duplicates are removed, so
## cut() still gets strictly increasing breaks when no title is recommended
## more than 20 times (a maximum of exactly 20 would otherwise raise a
## "'breaks' are not unique" error). Above 20 the breaks are unchanged.
bin_recc <- cut(
  recc_Matrix_Num,
  breaks = unique(c(0, 10, 20, max(20, recc_Matrix_Num)))
)

## Bar chart of how many titles fall into each bin.
qplot(bin_recc) + ggtitle("Distribution of recomendations per Users")

Sources/Acknowledgements:

1. Usuelli, Michele, and Suresh K. Gorakala. Building a Recommendation System with R. Packt Publishing Limited, 2015.

2. Pierson, Lillian. “How to Build a Recommendation Engine in R: Marketing Data Science!” Data, 22 Mar. 2021, https://www.data-mania.com/blog/how-to-build-a-recommendation-engine-in-r/.

3. Efficient Reshaping Using Data.tables, 22 Sept. 2021, https://cran.r-project.org/web/packages/data.table/vignettes/datatable-reshape.html.

Final Project

Al Haque

5/14/2022

Data Exploration:

Data Preprocessing:

Building A Model:

Implementing The Model:

Sources/Acknowledgements:

Thank you!