Data 612 Project 5

library(sparklyr)
library(recommenderlab)
library(dplyr)
# Load the MovieLense ratings that ship with recommenderlab.
data(MovieLense, package = "recommenderlab")

# Open a local Spark session; `sc` is the connection handle used further down.
sc <- spark_connect(master = "local")

# Flatten the rating matrix into long form: one row per (user, item, rating).
movielense <- MovieLense
movies <- as(movielense, "data.frame")
head(movies)

##     user                                                 item rating
## 1      1                                     Toy Story (1995)      5
## 453    1                                     GoldenEye (1995)      3
## 584    1                                    Four Rooms (1995)      4
## 674    1                                    Get Shorty (1995)      3
## 883    1                                       Copycat (1995)      3
## 969    1 Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)      5

# Build the numeric user/item key columns that are passed to Spark ALS.
# `item` holds movie titles, so every distinct title is mapped to its factor
# level code; that code becomes `itemid`.
movies <- transform(movies, itemid = as.numeric(factor(item)))
# `user` can be a factor (string columns default to factors in R < 4.0) or a
# character vector. as.numeric() on a factor returns level codes, not labels,
# so convert through as.character() to keep the real user ids in both cases.
movies$user <- as.numeric(as.character(movies$user))
# The title column is no longer needed once `itemid` exists.
movies <- movies %>% select(-item)
head(movies)

##     user rating itemid
## 1      1      5   1525
## 453    1      3    618
## 584    1      4    555
## 674    1      3    594
## 883    1      3    344
## 969    1      5   1318

The next chunk reshapes the long ratings table into a wide user-by-item table, with one row per user and one rating column per movie.

# Pivot the long ratings table into a wide user-by-item layout: one row per
# user and one `rating.<itemid>` column per movie. Computed once; the result
# depends only on `movies`.
movies_wide <- reshape(
  movies,
  idvar = "user",
  timevar = "itemid",
  direction = "wide"
) %>%
  arrange(user)
# Keep the user id as the row name, then drop it as a column so that only the
# rating columns remain.
rownames(movies_wide) <- movies_wide$user
movies_wide <- movies_wide %>% select(-user)



# copy table to Spark
sp_movies <- sdf_copy_to(sc, movies, "spmovies", overwrite = TRUE)

# 70/30 train/test split. The fixed seed makes the split repeatable, so later
# evaluation runs against the same held-out rows every time.
partitions <- sp_movies %>%
  sdf_random_split(training = 0.7, test = 0.3, seed = 612)
sp_movies_training <- partitions$training
sp_movies_test <- partitions$test
head(sp_movies_training)

## # Source: spark<?> [?? x 3]
##    user rating itemid
##   <dbl>  <dbl>  <dbl>
## 1     1      1     33
## 2     1      1    111
## 3     1      1    133
## 4     1      1    136
## 5     1      1    231
## 6     1      1    371

# Fit an ALS recommender on the training ratings with rank 10.
model <- sp_movies_training %>%
  ml_als(
    rating_col = "rating",
    user_col = "user",
    item_col = "itemid",
    rank = 10
  )

# Score the held-out ratings with the fitted model.
predictions <- model %>% ml_predict(sp_movies_test)

# Convert the scored rows to a data frame, then append the residual
# (actual minus predicted rating) and its square for the error computation.
predictions <- data.frame(predictions) %>%
  mutate(
    difference = rating - prediction,
    difference_square = difference^2
  )

head(predictions)

##   user rating itemid prediction difference difference_square
## 1  857      4     12   3.070576  0.9294240         0.8638291
## 2  868      4     12   3.688334  0.3116665         0.0971360
## 3  822      1     12   3.185374 -2.1853735         4.7758575
## 4  141      4     13   2.659673  1.3403268         1.7964759
## 5  231      5     14   3.482283  1.5177174         2.3034660
## 6  219      5     14   3.933356  1.0666442         1.1377298

# Root-mean-square error over the scored test rows. na.rm = TRUE skips rows
# whose squared difference is missing (NA/NaN).
sqrt(mean(predictions$difference_square, na.rm = TRUE))

# `predictions` was converted with data.frame() above and nothing after this
# point uses `sc`, so close the Spark connection instead of leaving it open.
invisible(spark_disconnect(sc))

## [1] 0.9300253

Conclusion: I have used Spark with both Python and R, and it is a great distributed technology. I love the way it distributes the load and takes that difficulty away from the programmer. Speed and performance are much better, but they depend entirely on how you parallelize your job and structure your program. You could end up spending more time parallelizing if the implementation is bad. In this case, however, it did perform better.

Data 612 Project 5

Murali Kunissery

5/2/2020