R Markdown

Here i load all the needed library

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(corrplot)
## corrplot 0.92 loaded
library(scatterplot3d) 
library(plotly)
## Loading required package: ggplot2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ tibble  3.1.6     ✓ purrr   0.3.4
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.1.1     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x plotly::filter() masks dplyr::filter(), stats::filter()
## x dplyr::lag()     masks stats::lag()
library(ggplot2)
library(patchwork)
library(data.table)
## 
## Attaching package: 'data.table'
## The following object is masked from 'package:purrr':
## 
##     transpose
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
library(recommenderlab)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loading required package: arules
## 
## Attaching package: 'arules'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following objects are masked from 'package:base':
## 
##     abbreviate, write
## Loading required package: proxy
## 
## Attaching package: 'proxy'
## The following object is masked from 'package:Matrix':
## 
##     as.matrix
## The following objects are masked from 'package:stats':
## 
##     as.dist, dist
## The following object is masked from 'package:base':
## 
##     as.matrix
## Loading required package: registry
## Registered S3 methods overwritten by 'registry':
##   method               from 
##   print.registry_field proxy
##   print.registry_entry proxy
df <- read.csv("Project.csv")
head(df)
##   userId movieId rating  timestamp
## 1      1      31    2.5 1260759144
## 2      1    1029    3.0 1260759179
## 3      1    1061    3.0 1260759182
## 4      1    1129    2.0 1260759185
## 5      1    1172    4.0 1260759205
## 6      1    1263    2.0 1260759151

Lets see summaries of data and Statistical stracture of the data

str(df)
## 'data.frame':    100004 obs. of  4 variables:
##  $ userId   : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ movieId  : int  31 1029 1061 1129 1172 1263 1287 1293 1339 1343 ...
##  $ rating   : num  2.5 3 3 2 4 2 2 2 3.5 2 ...
##  $ timestamp: int  1260759144 1260759179 1260759182 1260759185 1260759205 1260759151 1260759187 1260759148 1260759125 1260759131 ...
summary(df)
##      userId       movieId           rating        timestamp        
##  Min.   :  1   Min.   :     1   Min.   :0.500   Min.   :7.897e+08  
##  1st Qu.:182   1st Qu.:  1028   1st Qu.:3.000   1st Qu.:9.658e+08  
##  Median :367   Median :  2406   Median :4.000   Median :1.110e+09  
##  Mean   :347   Mean   : 12549   Mean   :3.544   Mean   :1.130e+09  
##  3rd Qu.:520   3rd Qu.:  5418   3rd Qu.:4.000   3rd Qu.:1.296e+09  
##  Max.   :671   Max.   :163949   Max.   :5.000   Max.   :1.477e+09

** Here is the rating Matrix and The heatmap of rating Matrix**

The melt and dcast functions for data.tables are for reshaping wide-to-long and long-to-wide, respectively; the implementations are specifically designed with large in-memory data (e.g. 10Gb)

ratingMatrix <- dcast(as.data.table(df), userId~movieId, value.var = "rating", na.rm=FALSE)
ratingMatrix <- as.matrix(ratingMatrix[,-1]) #remove userIds
#Convert rating matrix into a recommenderlab sparse matrix
ratingMatrix <- as(ratingMatrix, "realRatingMatrix")
image(ratingMatrix[1:20, 1:30], axes = FALSE, main = "Heatmap for first 20 users and 30 movies")

Bulding The model

recommendation_model <- recommenderRegistry$get_entries(dataType = "realRatingMatrix")
names(recommendation_model)
##  [1] "HYBRID_realRatingMatrix"       "ALS_realRatingMatrix"         
##  [3] "ALS_implicit_realRatingMatrix" "IBCF_realRatingMatrix"        
##  [5] "LIBMF_realRatingMatrix"        "POPULAR_realRatingMatrix"     
##  [7] "RANDOM_realRatingMatrix"       "RERECOMMEND_realRatingMatrix" 
##  [9] "SVD_realRatingMatrix"          "SVDF_realRatingMatrix"        
## [11] "UBCF_realRatingMatrix"
lapply(recommendation_model, "[[", "description")
## $HYBRID_realRatingMatrix
## [1] "Hybrid recommender that aggegates several recommendation strategies using weighted averages."
## 
## $ALS_realRatingMatrix
## [1] "Recommender for explicit ratings based on latent factors, calculated by alternating least squares algorithm."
## 
## $ALS_implicit_realRatingMatrix
## [1] "Recommender for implicit data based on latent factors, calculated by alternating least squares algorithm."
## 
## $IBCF_realRatingMatrix
## [1] "Recommender based on item-based collaborative filtering."
## 
## $LIBMF_realRatingMatrix
## [1] "Matrix factorization with LIBMF via package recosystem (https://cran.r-project.org/web/packages/recosystem/vignettes/introduction.html)."
## 
## $POPULAR_realRatingMatrix
## [1] "Recommender based on item popularity."
## 
## $RANDOM_realRatingMatrix
## [1] "Produce random recommendations (real ratings)."
## 
## $RERECOMMEND_realRatingMatrix
## [1] "Re-recommends highly rated items (real ratings)."
## 
## $SVD_realRatingMatrix
## [1] "Recommender based on SVD approximation with column-mean imputation."
## 
## $SVDF_realRatingMatrix
## [1] "Recommender based on Funk SVD with gradient descend (https://sifter.org/~simon/journal/20061211.html)."
## 
## $UBCF_realRatingMatrix
## [1] "Recommender based on user-based collaborative filtering."

Building the Recommendation System

recommendation_system <- recommenderRegistry$get_entries(dataType ="realRatingMatrix")
recommendation_system$IBCF_realRatingMatrix$parameters
## $k
## [1] 30
## 
## $method
## [1] "Cosine"
## 
## $normalize
## [1] "center"
## 
## $normalize_sim_matrix
## [1] FALSE
## 
## $alpha
## [1] 0.5
## 
## $na_as_zero
## [1] FALSE

Here I build this filtering system by splitting the dataset into 80% training set and 20% test set.

sampled_data<- sample(x = c(TRUE, FALSE),
                      size = nrow(ratingMatrix),
                      replace = TRUE,
                      prob = c(0.8, 0.2))
training_data <- ratingMatrix[sampled_data, ]
testing_data <- ratingMatrix[!sampled_data, ]

training_data
## 543 x 9066 rating matrix of class 'realRatingMatrix' with 83455 ratings.
testing_data
## 128 x 9066 rating matrix of class 'realRatingMatrix' with 16549 ratings.
recommen_model <- Recommender(data = training_data,
                              method = "IBCF",
                              parameter = list(k = 30))
recommen_model
## Recommender of type 'IBCF' for 'realRatingMatrix' 
## learned using 543 users.
class(recommen_model)
## [1] "Recommender"
## attr(,"package")
## [1] "recommenderlab"

Using the getModel() function, we will retrieve the recommen_model. We will then find the class and dimensions of our similarity matrix that is contained within model_info

model_info <- getModel(recommen_model)
class(model_info$sim)
## [1] "dgCMatrix"
## attr(,"package")
## [1] "Matrix"
dim(model_info$sim)
## [1] 9066 9066
top_items <-40 
image(model_info$sim[1:top_items, 1:top_items],
      main = "Heatmap of the first rows and columns")

top_recommendations <- 10 # the number of items to recommend to each user
predicted_recommendations <- predict(object = recommen_model,
                                     newdata = testing_data,
                                     n = top_recommendations)
predicted_recommendations
## Recommendations as 'topNList' with n = 10 for 128 users.
user1 <- predicted_recommendations@items[[1]] # recommendation for the first user
movies_user1 <- predicted_recommendations@itemLabels[user1]
print('top 10 recommended movies id for user 1')
## [1] "top 10 recommended movies id for user 1"
movies_user1
##  [1] "126548" "169"    "174"    "452"    "534"    "1190"   "1283"   "1328"  
##  [9] "1334"   "2106"