library(sparklyr)
library(recommenderlab)
library(dplyr)
# Connect to a local Spark instance; `sc` is the connection handle used when
# the ratings are copied to Spark.
sc <- spark_connect(master = "local")

# Load the MovieLense ratings from recommenderlab and coerce them to a
# long-format data frame with user / item / rating columns (see output below).
data(MovieLense, package = "recommenderlab")
movielense <- MovieLense
movies <- as(movielense, "data.frame")
head(movies)
## user item rating
## 1 1 Toy Story (1995) 5
## 453 1 GoldenEye (1995) 3
## 584 1 Four Rooms (1995) 4
## 674 1 Get Shorty (1995) 3
## 883 1 Copycat (1995) 3
## 969 1 Shanghai Triad (Yao a yao yao dao waipo qiao) (1995) 5

# Give every distinct movie title a numeric id (the level code of the title
# when treated as a factor), since the titles themselves are not numeric.
movies <- transform(movies, itemid = as.numeric(factor(item)))

# Convert the user ids to numeric for Spark. Going through as.character()
# parses the id label itself and works for both factor and character columns;
# as.numeric() applied directly to a factor returns the internal level code,
# which matches the label only when the levels happen to be in numeric order.
# NOTE(review): assumes every user id is a numeric string -- confirm.
movies$user <- as.numeric(as.character(movies$user))

# The title column is redundant once itemid exists.
movies <- movies %>% select(-item)
head(movies)
## user rating itemid
## 1 1 5 1525
## 453 1 3 618
## 584 1 4 555
## 674 1 3 594
## 883 1 3 344
## 969 1 5 1318
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.
# Build a user x item rating table: one row per user and one column per
# itemid, ordered by user id. The reshape is done once only; the original
# repeated the identical pipeline a second time, recomputing the same result.
movies_wide <- reshape(
  movies,
  idvar = "user",
  timevar = "itemid",
  direction = "wide"
) %>%
  arrange(user)

# Keep the user id as the row name rather than as a data column.
rownames(movies_wide) <- movies_wide$user
movies_wide <- movies_wide %>% select(-user)
# Copy the long-format ratings table to Spark as "spmovies", replacing any
# table already registered under that name.
sp_movies <- sdf_copy_to(sc, movies, "spmovies", overwrite = TRUE)

# 70/30 train/test split. The seed is fixed so that the split -- and therefore
# the fitted model and the RMSE below -- is the same on every run.
partitions <- sp_movies %>%
  sdf_random_split(training = 0.7, test = 0.3, seed = 1234)
sp_movies_training <- partitions$training
sp_movies_test <- partitions$test
head(sp_movies_training)
# NOTE: the printed output shown in this section was captured from an earlier
# run without a fixed seed; exact values will differ.
## # Source: spark<?> [?? x 3]
## user rating itemid
## <dbl> <dbl> <dbl>
## 1 1 1 33
## 2 1 1 111
## 3 1 1 133
## 4 1 1 136
## 5 1 1 231
## 6 1 1 371

# Fit an ALS matrix-factorisation model with 10 latent factors on the
# training partition.
model <- ml_als(
  sp_movies_training,
  rating_col = "rating",
  user_col = "user",
  item_col = "itemid",
  rank = 10
)

# Score the held-out partition, then pull the result into a local data frame
# explicitly so the error columns below are computed in R.
predictions <- ml_predict(model, sp_movies_test) %>%
  collect() %>%
  as.data.frame()

# Per-row error and squared error.
predictions$difference <- predictions$rating - predictions$prediction
predictions$difference_square <- predictions$difference^2
head(predictions)
## user rating itemid prediction difference difference_square
## 1 857 4 12 3.070576 0.9294240 0.8638291
## 2 868 4 12 3.688334 0.3116665 0.0971360
## 3 822 1 12 3.185374 -2.1853735 4.7758575
## 4 141 4 13 2.659673 1.3403268 1.7964759
## 5 231 5 14 3.482283 1.5177174 2.3034660
## 6 219 5 14 3.933356 1.0666442 1.1377298

# Root-mean-square error on the test partition. na.rm = TRUE skips rows whose
# prediction is missing (presumably users or items absent from the training
# split -- verify against the ALS cold-start setting).
sqrt(mean(predictions$difference_square, na.rm = TRUE))
## [1] 0.9300253
Conclusion: I have used Spark with both Python and R, and it is a great distributed technology. I love the way it distributes the load and takes that difficulty off the programmer's hands. Speed and performance are much better, but that depends entirely on how you parallelize your job and structure your program. You could end up spending more time on parallelization if the implementation is bad. In this case, however, it did perform better.