Here i load all the needed library
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(corrplot)
## corrplot 0.92 loaded
library(scatterplot3d)
library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ tibble 3.1.6 ✓ purrr 0.3.4
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.1.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x plotly::filter() masks dplyr::filter(), stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggplot2)
library(patchwork)
library(data.table)
##
## Attaching package: 'data.table'
## The following object is masked from 'package:purrr':
##
## transpose
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(recommenderlab)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loading required package: arules
##
## Attaching package: 'arules'
## The following object is masked from 'package:dplyr':
##
## recode
## The following objects are masked from 'package:base':
##
## abbreviate, write
## Loading required package: proxy
##
## Attaching package: 'proxy'
## The following object is masked from 'package:Matrix':
##
## as.matrix
## The following objects are masked from 'package:stats':
##
## as.dist, dist
## The following object is masked from 'package:base':
##
## as.matrix
## Loading required package: registry
## Registered S3 methods overwritten by 'registry':
## method from
## print.registry_field proxy
## print.registry_entry proxy
df <- read.csv("Project.csv")
head(df)
## userId movieId rating timestamp
## 1 1 31 2.5 1260759144
## 2 1 1029 3.0 1260759179
## 3 1 1061 3.0 1260759182
## 4 1 1129 2.0 1260759185
## 5 1 1172 4.0 1260759205
## 6 1 1263 2.0 1260759151
Lets see summaries of data and Statistical stracture of the data
str(df)
## 'data.frame': 100004 obs. of 4 variables:
## $ userId : int 1 1 1 1 1 1 1 1 1 1 ...
## $ movieId : int 31 1029 1061 1129 1172 1263 1287 1293 1339 1343 ...
## $ rating : num 2.5 3 3 2 4 2 2 2 3.5 2 ...
## $ timestamp: int 1260759144 1260759179 1260759182 1260759185 1260759205 1260759151 1260759187 1260759148 1260759125 1260759131 ...
summary(df)
## userId movieId rating timestamp
## Min. : 1 Min. : 1 Min. :0.500 Min. :7.897e+08
## 1st Qu.:182 1st Qu.: 1028 1st Qu.:3.000 1st Qu.:9.658e+08
## Median :367 Median : 2406 Median :4.000 Median :1.110e+09
## Mean :347 Mean : 12549 Mean :3.544 Mean :1.130e+09
## 3rd Qu.:520 3rd Qu.: 5418 3rd Qu.:4.000 3rd Qu.:1.296e+09
## Max. :671 Max. :163949 Max. :5.000 Max. :1.477e+09
** Here is the rating Matrix and The heatmap of rating Matrix**
The melt and dcast functions for data.tables are for reshaping wide-to-long and long-to-wide, respectively; the implementations are specifically designed with large in-memory data (e.g. 10Gb)
ratingMatrix <- dcast(as.data.table(df), userId~movieId, value.var = "rating", na.rm=FALSE)
ratingMatrix <- as.matrix(ratingMatrix[,-1]) #remove userIds
#Convert rating matrix into a recommenderlab sparse matrix
ratingMatrix <- as(ratingMatrix, "realRatingMatrix")
image(ratingMatrix[1:20, 1:30], axes = FALSE, main = "Heatmap for first 20 users and 30 movies")
Bulding The model
recommendation_model <- recommenderRegistry$get_entries(dataType = "realRatingMatrix")
names(recommendation_model)
## [1] "HYBRID_realRatingMatrix" "ALS_realRatingMatrix"
## [3] "ALS_implicit_realRatingMatrix" "IBCF_realRatingMatrix"
## [5] "LIBMF_realRatingMatrix" "POPULAR_realRatingMatrix"
## [7] "RANDOM_realRatingMatrix" "RERECOMMEND_realRatingMatrix"
## [9] "SVD_realRatingMatrix" "SVDF_realRatingMatrix"
## [11] "UBCF_realRatingMatrix"
lapply(recommendation_model, "[[", "description")
## $HYBRID_realRatingMatrix
## [1] "Hybrid recommender that aggegates several recommendation strategies using weighted averages."
##
## $ALS_realRatingMatrix
## [1] "Recommender for explicit ratings based on latent factors, calculated by alternating least squares algorithm."
##
## $ALS_implicit_realRatingMatrix
## [1] "Recommender for implicit data based on latent factors, calculated by alternating least squares algorithm."
##
## $IBCF_realRatingMatrix
## [1] "Recommender based on item-based collaborative filtering."
##
## $LIBMF_realRatingMatrix
## [1] "Matrix factorization with LIBMF via package recosystem (https://cran.r-project.org/web/packages/recosystem/vignettes/introduction.html)."
##
## $POPULAR_realRatingMatrix
## [1] "Recommender based on item popularity."
##
## $RANDOM_realRatingMatrix
## [1] "Produce random recommendations (real ratings)."
##
## $RERECOMMEND_realRatingMatrix
## [1] "Re-recommends highly rated items (real ratings)."
##
## $SVD_realRatingMatrix
## [1] "Recommender based on SVD approximation with column-mean imputation."
##
## $SVDF_realRatingMatrix
## [1] "Recommender based on Funk SVD with gradient descend (https://sifter.org/~simon/journal/20061211.html)."
##
## $UBCF_realRatingMatrix
## [1] "Recommender based on user-based collaborative filtering."
Building the Recommendation System
recommendation_system <- recommenderRegistry$get_entries(dataType ="realRatingMatrix")
recommendation_system$IBCF_realRatingMatrix$parameters
## $k
## [1] 30
##
## $method
## [1] "Cosine"
##
## $normalize
## [1] "center"
##
## $normalize_sim_matrix
## [1] FALSE
##
## $alpha
## [1] 0.5
##
## $na_as_zero
## [1] FALSE
Here I build this filtering system by splitting the dataset into 80% training set and 20% test set.
sampled_data<- sample(x = c(TRUE, FALSE),
size = nrow(ratingMatrix),
replace = TRUE,
prob = c(0.8, 0.2))
training_data <- ratingMatrix[sampled_data, ]
testing_data <- ratingMatrix[!sampled_data, ]
training_data
## 543 x 9066 rating matrix of class 'realRatingMatrix' with 83455 ratings.
testing_data
## 128 x 9066 rating matrix of class 'realRatingMatrix' with 16549 ratings.
recommen_model <- Recommender(data = training_data,
method = "IBCF",
parameter = list(k = 30))
recommen_model
## Recommender of type 'IBCF' for 'realRatingMatrix'
## learned using 543 users.
class(recommen_model)
## [1] "Recommender"
## attr(,"package")
## [1] "recommenderlab"
Using the getModel() function, we will retrieve the recommen_model. We will then find the class and dimensions of our similarity matrix that is contained within model_info
model_info <- getModel(recommen_model)
class(model_info$sim)
## [1] "dgCMatrix"
## attr(,"package")
## [1] "Matrix"
dim(model_info$sim)
## [1] 9066 9066
top_items <-40
image(model_info$sim[1:top_items, 1:top_items],
main = "Heatmap of the first rows and columns")
top_recommendations <- 10 # the number of items to recommend to each user
predicted_recommendations <- predict(object = recommen_model,
newdata = testing_data,
n = top_recommendations)
predicted_recommendations
## Recommendations as 'topNList' with n = 10 for 128 users.
user1 <- predicted_recommendations@items[[1]] # recommendation for the first user
movies_user1 <- predicted_recommendations@itemLabels[user1]
print('top 10 recommended movies id for user 1')
## [1] "top 10 recommended movies id for user 1"
movies_user1
## [1] "126548" "169" "174" "452" "534" "1190" "1283" "1328"
## [9] "1334" "2106"