Project 5

The goal of this project is to give you practice beginning to work with a distributed recommender system. It is sufficient for this assignment to build out your application on a single node.

Adapt one of your recommendation systems to work with Apache Spark and compare the performance with your previous iteration. Consider the efficiency of the system and the added complexity of using Spark. You may complete the assignment using PySpark (Python), SparkR (R), sparklyr (R), or Scala.

Please include in your conclusion: For your given recommender system’s data, algorithm(s), and (envisioned) implementation, at what point would you see moving to a distributed platform such as Spark becoming necessary? You may work on any platform of your choosing, including Databricks Community Edition or in local mode. You are encouraged but not required to work in a small group on this project.

# Import the libraries required for the project; I will be using the MovieLense dataset from the recommenderlab package.
library(recommenderlab)
## Loading required package: Matrix
## Loading required package: arules
## Warning: package 'arules' was built under R version 3.6.2
## 
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
## 
##     abbreviate, write
## Loading required package: proxy
## 
## Attaching package: 'proxy'
## The following object is masked from 'package:Matrix':
## 
##     as.matrix
## The following objects are masked from 'package:stats':
## 
##     as.dist, dist
## The following object is masked from 'package:base':
## 
##     as.matrix
## Loading required package: registry
## Registered S3 methods overwritten by 'registry':
##   method               from 
##   print.registry_field proxy
##   print.registry_entry proxy
library(sparklyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:arules':
## 
##     intersect, recode, setdiff, setequal, union
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
# Load the MovieLense data set and convert it to a plain data frame with
# user / item / rating columns, then preview the first rows.
data("MovieLense")
data <- as(object = MovieLense, Class = "data.frame")
head(data, n = 6L)
##     user                                                 item rating
## 1      1                                     Toy Story (1995)      5
## 453    1                                     GoldenEye (1995)      3
## 584    1                                    Four Rooms (1995)      4
## 674    1                                    Get Shorty (1995)      3
## 883    1                                       Copycat (1995)      3
## 969    1 Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)      5
# Preview the last rows of the ratings data frame
tail(data, n = 6L)
##       user                                 item rating
## 93643  943                  Multiplicity (1996)      2
## 94227  943                 Bottle Rocket (1996)      2
## 94450  943                 Reality Bites (1994)      4
## 96451  943                 Young Guns II (1990)      3
## 97147  943 Under Siege 2: Dark Territory (1995)      3
## 98112  943       An Unforgettable Summer (1994)      3
# Inspect the column types and a sample of values in the ratings data frame
data %>% glimpse()
## Rows: 99,392
## Columns: 3
## $ user   <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ item   <fct> "Toy Story (1995)", "GoldenEye (1995)", "Four Rooms (1995)", "…
## $ rating <dbl> 5, 3, 4, 3, 3, 5, 4, 1, 5, 3, 2, 5, 5, 5, 5, 5, 3, 4, 5, 4, 1,…
# Set the column names explicitly, then re-check the structure
names(data) <- c("user", "item", "rating")
data %>% glimpse()
## Rows: 99,392
## Columns: 3
## $ user   <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ item   <fct> "Toy Story (1995)", "GoldenEye (1995)", "Four Rooms (1995)", "…
## $ rating <dbl> 5, 3, 4, 3, 3, 5, 4, 1, 5, 3, 2, 5, 5, 5, 5, 5, 3, 4, 5, 4, 1,…
# Spark needs numeric user and item ids. as.numeric() on a factor returns the
# factor's integer level codes, so once the columns are overwritten the movie
# titles (and the original user labels) can no longer be recovered from `data`.
# Keep code -> label lookup tables first so predictions can be mapped back.
user_lookup <- data.frame(
  user = seq_along(levels(data$user)),
  label = levels(data$user),
  stringsAsFactors = FALSE
)
item_lookup <- data.frame(
  item = seq_along(levels(data$item)),
  title = levels(data$item),
  stringsAsFactors = FALSE
)

# convert user and item to numeric values (factor level codes) for Spark
data$user <- as.numeric(data$user)
data$item <- as.numeric(data$item)

head(data)
##     user item rating
## 1      1 1525      5
## 453    1  618      3
## 584    1  555      4
## 674    1  594      3
## 883    1  344      3
## 969    1 1318      5
# Open a local Spark connection and upload the ratings data frame as the
# Spark table "ratings" (overwriting a table of that name if it exists).
sc <- spark_connect(master = "local")

movies_ratings <- copy_to(
  dest = sc,
  df = data,
  name = "ratings",
  overwrite = TRUE
)

# List the tables registered on the connection
src_tbls(sc)
## [1] "ratings"
# Inspect the structure of the Spark-backed ratings table
movies_ratings %>% glimpse()
## Rows: ??
## Columns: 3
## Database: spark_connection
## $ user   <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ item   <dbl> 1525, 618, 555, 594, 344, 1318, 1545, 111, 391, 1240, 1303, 15…
## $ rating <dbl> 5, 3, 4, 3, 3, 5, 4, 1, 5, 3, 2, 5, 5, 5, 5, 5, 3, 4, 5, 4, 1,…
# Average rating per user: aggregate on the Spark table, then pull the
# (small) result into R as a local tibble.
user_avg_tbl <- movies_ratings %>%
  group_by(user) %>%
  summarise(avg = mean(rating, na.rm = TRUE))
user_summary <- collect(user_avg_tbl)
head(user_summary)
## # A tibble: 6 x 2
##    user   avg
##   <dbl> <dbl>
## 1   445  2.87
## 2    13  3.46
## 3    79  3.04
## 4    90  3.88
## 5   113  3.10
## 6   168  4.05
# Five-number summary (plus mean) of the per-user averages
user_summary %>% summary()
##       user            avg       
##  Min.   :  1.0   Min.   :1.497  
##  1st Qu.:236.5   1st Qu.:3.325  
##  Median :472.0   Median :3.619  
##  Mean   :472.0   Mean   :3.588  
##  3rd Qu.:707.5   3rd Qu.:3.871  
##  Max.   :943.0   Max.   :4.870
# Histogram of the average rating given by each user
ggplot(data = user_summary, mapping = aes(x = avg)) +
  geom_histogram() +
  ggtitle("Distribution of Average Rating per User") +
  xlab("Average Rating") +
  ylab("Users")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Average rating per movie: aggregate on the Spark table, then pull the
# (small) result into R as a local tibble.
movie_avg_tbl <- movies_ratings %>%
  group_by(item) %>%
  summarise(avg = mean(rating, na.rm = TRUE))
movie_summary <- collect(movie_avg_tbl)
head(movie_summary)
## # A tibble: 6 x 2
##    item   avg
##   <dbl> <dbl>
## 1  1318  3.58
## 2   111  4.00
## 3  1240  3.83
## 4   977  3.42
## 5  1169  3.97
## 6  1006  3.78
# Five-number summary (plus mean) of the per-movie averages
movie_summary %>% summary()
##       item             avg       
##  Min.   :   1.0   Min.   :1.000  
##  1st Qu.: 416.8   1st Qu.:2.665  
##  Median : 832.5   Median :3.162  
##  Mean   : 832.5   Mean   :3.077  
##  3rd Qu.:1248.2   3rd Qu.:3.653  
##  Max.   :1664.0   Max.   :5.000
# Histogram of the average rating received by each movie
ggplot(data = movie_summary, mapping = aes(x = avg)) +
  geom_histogram() +
  ggtitle("Distribution of average rating per Movie") +
  xlab("Average Rating") +
  ylab("Movies")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Split the ratings into training (80%) and test (20%) sets.
# A fixed seed makes the split, and every metric derived from it,
# reproducible between runs.
split_data <- movies_ratings %>%
  sdf_random_split(training = 0.8, test = 0.2, seed = 42)

# Use the exact element names given above: `split_data$train` only worked
# through `$` partial matching on lists, which breaks silently if another
# element starting with "train" is ever added.
split_movies_train <- split_data$training
split_movies_test <- split_data$test

head(split_movies_train)
## # Source: spark<?> [?? x 3]
##    user  item rating
##   <dbl> <dbl>  <dbl>
## 1     1     3      2
## 2     1     4      5
## 3     1     7      3
## 4     1     8      4
## 5     1    18      3
## 6     1    19      3
# Preview the first rows of the test split
split_movies_test %>% head()
## # Source: spark<?> [?? x 3]
##    user  item rating
##   <dbl> <dbl>  <dbl>
## 1     1    90      4
## 2     1    95      2
## 3     1    96      4
## 4     1   129      3
## 5     1   136      1
## 6     1   180      4
# Fit an ALS collaborative-filtering model on the training set.
# Column roles are spelled out rather than left to the defaults.
# cold_start_strategy = "drop" removes test rows whose user or item never
# appeared in the training split; with the default ("nan") those rows get a
# NaN prediction, which would turn any mean-based error metric into NaN.
als_model <- ml_als(
  split_movies_train,
  rating_col = "rating",
  user_col = "user",
  item_col = "item",
  cold_start_strategy = "drop"
)

# Score the held-out ratings and bring the result into R explicitly with
# collect() instead of relying on data.frame() to coerce the Spark table.
predictions <- ml_predict(als_model, split_movies_test) %>%
  collect() %>%
  as.data.frame()

# Per-rating error and squared error (inputs for MAE / MSE / RMSE)
predictions$difference <- predictions$rating - predictions$prediction
predictions$difference_square <- predictions$difference^2

head(predictions) %>%
  kable() %>%
  kable_styling()
user item rating prediction difference difference_square
857 12 4 0.7196801 3.280320 10.760499
868 12 4 0.8695859 3.130414 9.799493
759 12 4 0.7206417 3.279358 10.754191
503 13 5 3.1223922 1.877608 3.525411
17 14 5 3.0829089 1.917091 3.675238
231 14 5 3.9988732 1.001127 1.002255