Recommender System : Movie Lens With SparklyR

Summary

This is an R Markdown document that analyses the MovieLense data and recommends new / untried movies to users. We use the R package sparklyr to build the recommender and compare it with the conventional recommender systems built earlier.

# Global knitr chunk options: suppress messages, keep the code visible
knitr::opts_chunk$set(message = FALSE, echo = TRUE)

# Library for loading CSV data
library(RCurl)
# Library for data tidying
library(tidyr)
# Library for data structure operations
library(dplyr)
library(knitr)
# Library for plotting
library(ggplot2)
# Library for data display in tabular format
library(DT)
library(pander)
library(reshape2)


suppressWarnings(suppressMessages(library(recommenderlab)))

Data Loading & Preparation

# Load the MovieLense ratings bundled with the recommenderlab package
data("MovieLense", package = "recommenderlab")

# Bind the data to a lower-case working name and confirm its class
movielense <- MovieLense
class(movielense)

## [1] "realRatingMatrix"
## attr(,"package")
## [1] "recommenderlab"

# Verifying records and variables
nrow(movielense)

## [1] 943

ncol(movielense)

## [1] 1664

# Loading the metadata that gets loaded with main dataset
moviemeta <- MovieLenseMeta

# Verifying records and variables
nrow(moviemeta)

## [1] 1664

ncol(moviemeta)

## [1] 22

pander(head(moviemeta), caption = "Sample Movie Meta Data")

Sample Movie Meta Data (continued below)
title	year
Toy Story (1995)	1995
GoldenEye (1995)	1995
Four Rooms (1995)	1995
Get Shorty (1995)	1995
Copycat (1995)	1995
Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)	1995

Table continues below
url	unknown	Action
http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)	0	0
http://us.imdb.com/M/title-exact?GoldenEye%20(1995)	0	1
http://us.imdb.com/M/title-exact?Four%20Rooms%20(1995)	0	0
http://us.imdb.com/M/title-exact?Get%20Shorty%20(1995)	0	1
http://us.imdb.com/M/title-exact?Copycat%20(1995)	0	0
http://us.imdb.com/Title?Yao+a+yao+yao+dao+waipo+qiao+(1995)	0	0

Table continues below
Adventure	Animation	Children’s	Comedy	Crime	Drama
0	1	1	1	0	0
1	0	0	0	0	0
0	0	0	0	0	0
0	0	0	1	0	1
0	0	0	0	1	1
0	0	0	0	0	1

Table continues below
Fantasy	Film-Noir	Horror	Musical	Mystery	Romance	Sci-Fi	Thriller
0	0	0	0	0	0	0	0
0	0	0	0	0	0	0	1
0	0	0	0	0	0	0	1
0	0	0	0	0	0	0	0
0	0	0	0	0	0	0	1
0	0	0	0	0	0	0	0

War	Western
0	0
0	0
0	0
0	0
0	0
0	0

# Coerce the rating matrix to a long data frame.
# (An earlier centered/scaled variant of this step was left disabled.)
movies <- methods::as(movielense, "data.frame")
dim(movies)

## [1] 99392     3

# The item key must be numeric later on, so derive one from the movie title:
# each distinct title gets the code of its factor level as `itemid`.
movies$itemid <- as.numeric(factor(movies$item))
names(movies) <- c("user", "item", "rating", "itemid")
dim(movies)

## [1] 99392     4

# View(movies)

# Lookup table mapping each movie title to its numeric itemid, for later
# reference. The long ratings table repeats a title once per rating, so only
# the distinct (item, itemid) pairs are kept.
moviename <- distinct(select(movies, item, itemid), item, itemid)
# Verify the size of the lookup table
dim(moviename)

## [1] 1664    2

# Keep only the numeric item key; the movie title column is dropped
moviesdata <- movies %>% select(-item)
# View(moviesdata) View(moviename)

# Spark's ALS factorization (connected to later) requires numeric user / item
# columns. Convert `user` via character: if it is a factor, as.numeric() alone
# would return the internal level codes rather than the user ids stored in the
# labels, so the ids would silently be renumbered.
moviesdata$user <- as.numeric(as.character(moviesdata$user))
moviesdata$itemid <- as.numeric(moviesdata$itemid)


# Reshape to wide format (one row per user, one rating column per itemid),
# ordered by user
moviesdatawide <- reshape(moviesdata, idvar = "user", timevar = "itemid", direction = "wide") %>% 
    arrange(user)
dim(moviesdatawide)

## [1]  943 1665

# View(moviesdatawide)


# Keep the user ids as row names for later use, then drop the user column
rownames(moviesdatawide) <- moviesdatawide$user
moviesdatawide <- moviesdatawide %>% select(-user)

# Strip the "rating." prefix from the rating columns so the column names are
# the bare movie ids. The pattern is anchored and the dot escaped so that only
# a literal leading "rating." is removed (an unescaped "." matches any
# character, anywhere in the name).
library(stringr)
colnames(moviesdatawide) <- str_replace(colnames(moviesdatawide), "^rating\\.", "")


# At this point the ratings exist in two shapes: moviesdata (long form) and
# moviesdatawide (wide form).

# The long form is the one needed for Spark. Its movie identifier was replaced
# by a numeric id because the Spark library functions used below need the data
# as a data frame with numeric user and item columns.



# Movie metadata as a plain data frame
mometa <- as(moviemeta, "data.frame")

Spark

Connection With sparklyr

Spark 2.1.0 was installed and tried first, but because of configuration failures we switched to Spark 2.0.2.

# devtools::install_github('rstudio/sparklyr', force=TRUE) spark_install(version
# = '2.1.0') spark_install_dir()


# Sparklyr was installed with the following command
# install.packages('sparklyr',force='TRUE')

# Start the timer for the Spark setup phase
spset_start <- proc.time()

library(sparklyr)

# Spark release used for both the install and the connection. Pinning it
# keeps the two in agreement when more than one Spark version is installed
# locally.
spark_release <- "2.0.2"

# Install a local copy of Spark
spark_install(version = spark_release, hadoop_version = "2.7")

# Connect to the local Spark instance, explicitly requesting that release
spconn <- spark_connect(master = "local", version = spark_release)


# Print the version of Spark we are connected to
spark_version(spconn)

## [1] '2.0.2'

Copy Data Tables To Spark

# Copy the long ratings table and the movie metadata into Spark, overwriting
# any tables already registered under those names
spmovies <- sdf_copy_to(sc = spconn, x = moviesdata, name = "spmovies", overwrite = TRUE)
spmometa <- sdf_copy_to(sc = spconn, x = mometa, name = "spmometa", overwrite = TRUE)

# List the tables visible through the Spark connection
library(dplyr)
src_tbls(spconn)

## [1] "spmometa" "spmovies"

spset_end <- proc.time()
spset_end - spset_start

##    user  system elapsed 
##    1.49    0.17   18.35

ALS Factorization Model

# Fit the ALS factorization model on the Spark ratings table, using the
# rating / user / itemid columns and iter.max = 7
model_als <- ml_als_factorization(spmovies, rating.column = "rating", user.column = "user", 
    item.column = "itemid", iter.max = 7)
summary(model_als)

##                  Length Class      Mode       
## item.factors     11     data.frame list       
## user.factors     11     data.frame list       
## data              2     spark_jobj environment
## ml.options        6     ml_options list       
## model.parameters  2     -none-     list       
## .call             6     -none-     call       
## .model            2     spark_jobj environment

# Score the ratings table with the fitted model and collect the result into R.
# Note: these are the same rows the model was fitted on.
predictions <- collect(invoke(model_als$.model, "transform", spark_dataframe(spmovies)))
class(predictions)

## [1] "tbl_df"     "tbl"        "data.frame"

head(data.frame(predictions), 10)

##    user rating itemid prediction
## 1   857      4     12   3.275293
## 2   868      4     12   4.037489
## 3   822      1     12   1.645155
## 4   759      4     12   3.520603
## 5   141      4     13   3.596831
## 6   367      2     13   2.612070
## 7   173      4     13   3.764529
## 8   503      5     13   4.588961
## 9    17      5     14   4.588117
## 10  231      5     14   4.451607

# Join on itemid to attach the movie title to every prediction row
alluseritems <- merge(x = predictions, y = moviename, by = "itemid")
dim(alluseritems)

## [1] 99392     5

# View(alluseritems)

# Build the predicted user-item matrix: one row per user (sorted by user),
# one column per movie title, cells holding the predicted rating
alluseritemswide <- arrange(
    spread(select(alluseritems, user, prediction, item), key = item, value = prediction),
    user
)

dim(alluseritemswide)

## [1]  943 1665

# View(alluseritemswide)

# Extract the user and item factor matrices from the model.
# (A variant that drops the first column with [, -1] was left disabled.)
userfactormat <- as.matrix(model_als$user.factors)
itemfactormat <- as.matrix(model_als$item.factors)





# Pick one random row of the user factor matrix. The index is drawn from the
# matrix's own row count (not from another table's) so it always addresses a
# valid row, and seq_len() stays safe if the matrix is empty.
sampleuser <- userfactormat[sample(seq_len(nrow(userfactormat)), 1), ]
sampleuser

##          id          V1          V2          V3          V4          V5 
## 859.0000000  -0.9335458   0.4578500  -0.3457451  -0.5840850  -0.9588441 
##          V6          V7          V8          V9         V10 
##   0.8232060   1.0859140   0.1410353  -1.1790280  -0.3657078

# Draw one prediction row at random; its `user` is the sampled user.
# The index must range over every row of `predictions`; drawing it from the
# number of users would only ever reach the first few hundred rows.
sampleuser1 <- predictions[sample(seq_len(nrow(predictions)), 1), ]
sampleuser1

## # A tibble: 1 x 4
##    user rating itemid prediction
##   <dbl>  <dbl>  <dbl>      <dbl>
## 1   874      4     18   3.262407

class(sampleuser1$user)

## [1] "numeric"

# Actual and predicted ratings for the sampled user.
# Only the first 500 rows are displayed because the data is large.
sampleuser1pred <- select(
    subset(alluseritems, user == sampleuser1$user),
    user, item, rating, prediction
)

datatable(head(sampleuser1pred, n = 500))

Calculate RMSE

# Root mean squared error between the actual and the predicted ratings
model_als.RMSE <- sqrt(mean(with(predictions, (rating - prediction)^2)))
model_als.RMSE

## [1] 0.7778992

Spark Disconnection

# Total time elapsed since the Spark setup timer (spset_start) was started
finaltime <- proc.time() - spset_start
finaltime

##    user  system elapsed 
##    3.08    0.23   28.57

spark_disconnect(spconn)

Summary Of Learnings

Working with Spark was an experience rooted in trial and error, with many unfamiliar exceptions to handle and few clues for debugging them. Some aspects discovered along the way, such as ALS factorization accepting only numeric user and item values, were perplexing and still are.

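The RMSE for the ALS factorization performed in Spark (RMSE = 0.77) seemed pretty good, and better than IBCF (Item Based Collaborative Filtering) with RMSE 1.268 and UBCF (User Based Collaborative Filtering) with RMSE 1.017. Note that the ALS RMSE above is computed on the same ratings the model was fitted on, so it is an in-sample figure and may not be directly comparable if the IBCF / UBCF values were measured on held-out data.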

The execution time also appeared better (lower). Overall it was a good learning experience, and textual content-based recommendation could be explored further in Spark with more reading on the subject.

Reference

http://rpubs.com/DataDrivenMSDA/RSEval_MovieLens Building Recommender System in R https://rpubs.com/chezou/sparklyr-als

Recommender System :MovieLens With SparklyR

Kumudini Bhave

July 5, 2017