This is an R Markdown document that analyzes the MovieLense data and recommends new / untried movies to users. We use the R package sparklyr to build the recommender and compare it with the conventional recommender systems built earlier.
knitr::opts_chunk$set(message = FALSE, echo = TRUE)
# Library for loading CSV data
library(RCurl)
# Library for data tidying
library(tidyr)
# Library for data structure operations
library(dplyr)
library(knitr)
# Library for plotting
library(ggplot2)
# Library for data display in tabular format
library(DT)
library(pander)
library(reshape2)
# Attach recommenderlab quietly: its startup warnings and messages are suppressed.
suppressWarnings(suppressMessages(library(recommenderlab)))
# We use the MovieLense ratings data that ships with the recommenderlab package.
data(MovieLense, package = "recommenderlab")
movielense <- MovieLense
class(movielense)
## [1] "realRatingMatrix"
## attr(,"package")
## [1] "recommenderlab"
# Check the dimensions of the rating matrix (recorded output: 943 x 1664).
nrow(movielense)
## [1] 943
ncol(movielense)
## [1] 1664
# MovieLenseMeta is the metadata table that is loaded together with the main
# dataset.
moviemeta <- MovieLenseMeta
# Check the dimensions of the metadata table (recorded output: 1664 x 22).
nrow(moviemeta)
## [1] 1664
ncol(moviemeta)
## [1] 22
pander(head(moviemeta), caption = "Sample Movie Meta Data")

| title | year |
|---|---|
| Toy Story (1995) | 1995 |
| GoldenEye (1995) | 1995 |
| Four Rooms (1995) | 1995 |
| Get Shorty (1995) | 1995 |
| Copycat (1995) | 1995 |
| Shanghai Triad (Yao a yao yao dao waipo qiao) (1995) | 1995 |
| Adventure | Animation | Children’s | Comedy | Crime | Documentary | Drama |
|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 1 | 0 | 0 | 0 |
| 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 0 | 0 | 0 | 1 | 0 | 0 | 1 |
| 0 | 0 | 0 | 0 | 1 | 0 | 1 |
| 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| Fantasy | Film-Noir | Horror | Musical | Mystery | Romance | Sci-Fi | Thriller |
|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| War | Western |
|---|---|
| 0 | 0 |
| 0 | 0 |
| 0 | 0 |
| 0 | 0 |
| 0 | 0 |
| 0 | 0 |
# Flatten the rating matrix into long form: one row per (user, item, rating).
movies <- as(movielense, "data.frame")
dim(movies)
## [1] 99392 3
# A numeric item key is needed later, so give every distinct movie title an
# integer code (its factor level index) and store it as `itemid`.
movies$itemid <- as.numeric(factor(movies$item))
names(movies) <- c("user", "item", "rating", "itemid")
dim(movies)
## [1] 99392 4
# View(movies)
# Lookup table from movie title to numeric movie id, kept for later reference.
# The long ratings table repeats each movie once per rating, so keep a single
# row per (item, itemid) pair.
moviename <- distinct(movies, item, itemid)
# Check the size of the lookup table
dim(moviename)
## [1] 1664 2
# Keep only the numeric item key; the movie title column is dropped here and
# remains available through the `moviename` lookup table.
moviesdata <- movies %>% select(-item)
# View(moviesdata) View(moviename)
# Spark, which is connected to later for the ALS factorization, requires the
# user and item columns to be numeric.
# `as.numeric()` applied directly to a factor returns the internal level codes,
# not the labels, so convert through character to keep the real user ids.
moviesdata$user <- local({
  user_ids <- suppressWarnings(as.numeric(as.character(moviesdata$user)))
  if (anyNA(user_ids)) {
    # Labels are not purely numeric: fall back to one integer code per user.
    user_ids <- as.numeric(factor(moviesdata$user))
  }
  user_ids
})
moviesdata$itemid <- as.numeric(moviesdata$itemid)
# Reshape to wide form: one row per user and one `rating.<itemid>` column per
# movie, sorted by user id.
moviesdatawide <- reshape(moviesdata, idvar = "user", timevar = "itemid", direction = "wide") %>%
  arrange(user)
dim(moviesdatawide)
## [1] 943 1665
# View(moviesdatawide)
# Store the user ids as row names for later use, then drop the redundant column
rownames(moviesdatawide) <- moviesdatawide$user
moviesdatawide <- moviesdatawide %>% select(-user)
# Store the movie ids as column names for later use by stripping the "rating."
# prefix that reshape() added. The pattern is anchored and the dot is escaped so
# that only a literal leading "rating." is removed.
library(stringr)
colnames(moviesdatawide) <- str_replace(colnames(moviesdatawide), "^rating\\.", "")
# We now have both rating tables: moviesdata in long form and moviesdatawide in
# wide form.
# The long format is what we need, but its columns are of mixed type, so the
# data is prepared so that each movie is identified by a numeric value. This is
# because the Spark library functions need the MovieLense data as a data frame
# with numeric user and item columns.
mometa <- as(moviemeta, "data.frame")

Spark was initially installed and tried with version 2.1.0, but due to configuration failures it was switched to version 2.0.2.
# Earlier attempt, kept for reference (not run):
# devtools::install_github('rstudio/sparklyr', force=TRUE) spark_install(version
# = '2.1.0') spark_install_dir()
# sparklyr was installed with the following command:
# install.packages('sparklyr',force='TRUE')
# Start the clock; the elapsed setup time is reported at the end of this block.
spset_start <- proc.time()
library(sparklyr)
# Install a local copy of Spark 2.0.2 (built for Hadoop 2.7)
spark_install(version = "2.0.2", hadoop_version = "2.7")
# Connect to a local Spark instance
spconn <- spark_connect(master = "local")
# Print the version of Spark behind the connection
spark_version(spconn)
## [1] '2.0.2'
# Copy the long-format ratings and the movie metadata into Spark, replacing any
# existing tables with the same names.
spmovies <- sdf_copy_to(spconn, moviesdata, "spmovies", overwrite = TRUE)
spmometa <- sdf_copy_to(spconn, mometa, "spmometa", overwrite = TRUE)
library(dplyr)
# List the tables registered on the connection
src_tbls(spconn)
## [1] "spmometa" "spmovies"
# Time taken by the Spark setup and data copy
spset_end <- proc.time()
spset_end - spset_start
## user system elapsed
## 1.49 0.17 18.35
# Fit the ALS matrix-factorization model on the Spark ratings table
model_als <- ml_als_factorization(
  spmovies,
  rating.column = "rating",
  user.column = "user",
  item.column = "itemid",
  iter.max = 7
)
summary(model_als)
## Length Class Mode
## item.factors 11 data.frame list
## user.factors 11 data.frame list
## data 2 spark_jobj environment
## ml.options 6 ml_options list
## model.parameters 2 -none- list
## .call 6 -none- call
## .model 2 spark_jobj environment
# Score the ratings table with the fitted model and pull the result into R.
# The table scored here, spmovies, is the same one the model was fitted on.
predictions <- collect(
  invoke(model_als$.model, "transform", spark_dataframe(spmovies))
)
class(predictions)
## [1] "tbl_df" "tbl" "data.frame"
head(data.frame(predictions), n = 10)
## user rating itemid prediction
## 1 857 4 12 3.275293
## 2 868 4 12 4.037489
## 3 822 1 12 1.645155
## 4 759 4 12 3.520603
## 5 141 4 13 3.596831
## 6 367 2 13 2.612070
## 7 173 4 13 3.764529
## 8 503 5 13 4.588961
## 9 17 5 14 4.588117
## 10 231 5 14 4.451607
# Join on the numeric movie id to bring the movie title onto every prediction
alluseritems <- merge(predictions, moviename, by = "itemid")
dim(alluseritems)
## [1] 99392 5
# View(alluseritems)
# Predicted user-by-movie rating matrix: one row per user (sorted by user) and
# one column per movie title holding the predicted rating.
alluseritemswide <- arrange(
  spread(
    select(alluseritems, user, prediction, item),
    key = item,
    value = prediction
  ),
  user
)
dim(alluseritemswide)
## [1] 943 1665
# View(alluseritemswide)
# Extract the user and item factor matrices from the model. As the recorded
# output below shows, the first column of the user matrix is the id and the
# remaining columns (V1..V10) are the latent factors.
# Variant without the id column (not used):
# userfactormat <- as.matrix(model_als$user.factors[,-1])
# itemfactormat <- as.matrix(model_als$item.factors[,-1])
userfactormat <- as.matrix(model_als$user.factors)
itemfactormat <- as.matrix(model_als$item.factors)
# Latent-factor row of one randomly chosen user. The draw is bounded by the
# matrix's own row count so the index is always valid.
sampleuser <- userfactormat[sample(seq_len(nrow(userfactormat)), 1), ]
sampleuser
## id V1 V2 V3 V4 V5
## 859.0000000 -0.9335458 0.4578500 -0.3457451 -0.5840850 -0.9588441
## V6 V7 V8 V9 V10
## 0.8232060 1.0859140 0.1410353 -1.1790280 -0.3657078
# Pick one prediction row at random; its `user` identifies the sampled user.
# Draw from every row of `predictions`: bounding the draw by the number of
# users (nrow(moviesdatawide)) would only ever reach the first rows of the
# table, not all rated (user, movie) pairs.
sampleuser1 <- predictions[sample(seq_len(nrow(predictions)), 1), ]
sampleuser1
## # A tibble: 1 x 4
## user rating itemid prediction
## <dbl> <dbl> <dbl> <dbl>
## 1 874 4 18 3.262407
class(sampleuser1$user)
## [1] "numeric"
# Actual and predicted ratings for the sampled user. Only the first 500 rows
# of the comparison are displayed because the data is large.
sampleuser1pred <- alluseritems[which(alluseritems$user == sampleuser1$user), ]
sampleuser1pred <- sampleuser1pred %>% select(user, item, rating, prediction)
datatable(head(sampleuser1pred, 500))
# Root mean squared error between observed and predicted ratings. `predictions`
# was produced by scoring spmovies, the same table the model was fitted on, so
# this is an in-sample (training) error.
model_als.RMSE <- sqrt(mean((predictions$rating - predictions$prediction)^2))
model_als.RMSE
## [1] 0.7778992
# Total time elapsed since the Spark setup started (spset_start), taken just
# before disconnecting from Spark
finaltime <- proc.time() - spset_start
finaltime
## user system elapsed
## 3.08 0.23 28.57
spark_disconnect(spconn)

Working with Spark was an experience rooted in trial and error, with many unfamiliar exceptions to handle and few clues for debugging them. Some aspects discovered along the way, such as ALS factorization accepting only numeric values, were perplexing and still are.
The RMSE for the ALS factorization performed in Spark was approximately 0.78, which seemed pretty good and better than IBCF (Item-Based Collaborative Filtering), with an RMSE of 1.268, and UBCF (User-Based Collaborative Filtering), with an RMSE of 1.017.
The execution time also appeared to be better (lower). Overall it was a good learning experience, and further exploration of textual content-based recommendation could be done in Spark with more reading on the subject.
- http://rpubs.com/DataDrivenMSDA/RSEval_MovieLens
- Building Recommender System in R
- https://rpubs.com/chezou/sparklyr-als