library(tidyverse)
library(kableExtra)
library(magrittr)
library(sparklyr)

conf <- spark_config()
#conf$`sparklyr.cores.local` <- 1
#conf$spark.executor.instances <- 1
#conf$spark.dynamicAllocation.enabled <- "false"
sc <- spark_connect(master = "local", config = conf)
# source, https://stackoverflow.com/questions/50465390/gather-in-sparklyr
sdf_gather <- function(tbl, gather_cols) {
  other_cols <- colnames(tbl)[!colnames(tbl) %in% gather_cols]
  
  lapply(gather_cols, function(col_nm) {
    tbl %>%
      select(c(other_cols, col_nm)) %>%
      mutate(item = col_nm) %>%
      rename(ratingint = col_nm)
  }) %>%
    sdf_bind_rows() %>%
    select(c(other_cols, 'item', 'ratingint'))
}
ratings <-
  spark_read_csv(
    sc,
    name = 'ratings',
    path = 'ratings.csv',
    header = TRUE,
    infer_schema = TRUE,
    delimiter = ",",
    quote = "\"",
    escape = "\\",
    charset = "UTF-8",
    null_value = NULL,
    options = list(),
    repartition = 0,
    memory = TRUE,
    overwrite = TRUE
  ) %>%
  sdf_gather(c(
    "Hamburger",
    "Tacos" ,
    "Soup"    ,
    "Pizza",
    "Pasta"  ,
    "Salad"    ,
    "Sandwich"
  )) %>% 
  filter(ratingint != 'NA') %>% 
  sdf_random_split(training = 0.9, test = 0.1, seed = 1)
  
ratings$test %<>% 
  mutate(data_cat = "test")

ratings$training %<>% 
  mutate(data_cat = "training")
  
ratings <- sdf_bind_rows(ratings$test,ratings$training)
## Using your training data, calculate the raw average (mean) rating for every user-item combination.
training_avg <- ratings %>% 
  filter(data_cat == 'training') %>% 
  summarise(tmean = mean(ratingint, na.rm = TRUE)) %>% 
  pull()

training_avg
## Using your training data, calculate the raw average (mean) rating for every user-item combination.

user_avgs <- ratings %>% 
  filter(data_cat == 'training') %>% 
  group_by(user) %>% 
  summarise(user_avg = mean(ratingint, na.rm = TRUE)) %>% 
  mutate(user_bias = user_avg - training_avg)

user_avgs %>% 
  kable(digits = 2) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F)
item_avgs <- ratings %>% 
  filter(data_cat == 'training') %>% 
  group_by(item) %>% 
  summarise(item_avg = mean(ratingint, na.rm = TRUE)) %>% 
  mutate(item_bias = item_avg - training_avg)

item_avgs %>% 
  kable(digits = 2) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F)
## From the raw average, and the appropriate user and item biases, calculate the baseline predictors for every user-item combination.

bl_pred_df <- full_join(item_avgs, user_avgs, by = character()) %>%
  mutate(bl_predictor = item_bias + user_bias + training_avg) %>%
  mutate(bl_predictor = pmax(pmin(bl_predictor, 5), 0)) %>%
  select(item, user, bl_predictor)
rmse <- ratings %>% 
  left_join(bl_pred_df, by = c('user','item')) %>% 
  mutate(sq_err_bl_pred = (ratingint-bl_predictor)**2) %>% 
  mutate(sq_err_avg_pred = (ratingint-training_avg)**2)

rmse %>% 
  kable(col.names = c("User","Item","Rating","Category","Baseline","Baseline sq. error","Avg. sq. error")) %>% 
  kable_styling(bootstrap_options = c("striped", "hover"),fixed_thead = T, full_width = F)
rmse2 <- rmse %>%
  select(sq_err_bl_pred, sq_err_avg_pred, data_cat) %>%
  group_by(data_cat) %>% 
  summarise(
    rmse_bl = sqrt(mean(sq_err_bl_pred, na.rm = TRUE)), 
    rmse_avg = sqrt(mean(sq_err_avg_pred, na.rm = TRUE))) %>% 
  collect() %>% 
  gather(error_type, rmse, rmse_bl:rmse_avg) 
  
ggplot(rmse2, aes(x = error_type, y = rmse, fill = error_type)) +
  geom_bar(stat = "identity") +
  facet_grid( ~ data_cat) +
  scale_fill_brewer(palette = "Paired") +
  labs(title = "RMSE by data group and predictor type",
       subtitle = "The RMSE for both data groups is based on the avg. and bias values of the training data.",
       caption = "") +
  ylab("RMSE") +
  theme_minimal() +
  theme(legend.position = "none", axis.title.x = element_blank()) +
  geom_text(aes(label = round(rmse, 2)),
            vjust = 1.6,
            color = "white",
            size = 5) +
  scale_x_discrete(labels = c("Avg. Rating \n (Training Data)", "Baseline Predictor"))

Conclusions

Despite the excellent integration provided by SparklyR, several functions like gather and spread are not directly compatible which required a significant rework of the code.

On a small data set such as this, it’s unlikely that we’ll see a substantial difference in performance . Although it should be possible to select the number of cores and RAM available to Spark, those settings were not responsive, so it wasn’t possible to compare the run time with different core count and RAM settings.

If you expect that a recommender system will eventually process massive amounts of data that will require a distributed computing solution, then the development of the model should be implemented in Spark as early as possible.

LS0tDQp0aXRsZTogIlByb2plY3QgNTogUmVjb21tZW5kZXIgU3lzdGVtcyB3aXRoIFNwYXJrIg0Kc3VidGl0bGU6ICJEQVRBLTYxMiwgU3VtbWVyIDIwMTkiDQphdXRob3I6ICJGZXJuYW5kbyBGaWd1ZXJlcyBaZWxlZG9uIg0Kb3V0cHV0OiBodG1sX25vdGVib29rDQotLS0NCg0KYGBge3IgZWNobz1UUlVFLCBtZXNzYWdlPUZBTFNFLCB3YXJuaW5nPUZBTFNFfQ0KbGlicmFyeSh0aWR5dmVyc2UpDQpsaWJyYXJ5KGthYmxlRXh0cmEpDQpsaWJyYXJ5KG1hZ3JpdHRyKQ0KbGlicmFyeShzcGFya2x5cikNCg0KY29uZiA8LSBzcGFya19jb25maWcoKQ0KI2NvbmYkYHNwYXJrbHlyLmNvcmVzLmxvY2FsYCA8LSAxDQojY29uZiRzcGFyay5leGVjdXRvci5pbnN0YW5jZXMgPC0gMQ0KI2NvbmYkc3BhcmsuZHluYW1pY0FsbG9jYXRpb24uZW5hYmxlZCA8LSAiZmFsc2UiDQpzYyA8LSBzcGFya19jb25uZWN0KG1hc3RlciA9ICJsb2NhbCIsIGNvbmZpZyA9IGNvbmYpDQpgYGANCg0KYGBge3IgZWNobz1UUlVFfQ0KIyBzb3VyY2UsIGh0dHBzOi8vc3RhY2tvdmVyZmxvdy5jb20vcXVlc3Rpb25zLzUwNDY1MzkwL2dhdGhlci1pbi1zcGFya2x5cg0Kc2RmX2dhdGhlciA8LSBmdW5jdGlvbih0YmwsIGdhdGhlcl9jb2xzKSB7DQogIG90aGVyX2NvbHMgPC0gY29sbmFtZXModGJsKVshY29sbmFtZXModGJsKSAlaW4lIGdhdGhlcl9jb2xzXQ0KICANCiAgbGFwcGx5KGdhdGhlcl9jb2xzLCBmdW5jdGlvbihjb2xfbm0pIHsNCiAgICB0YmwgJT4lDQogICAgICBzZWxlY3QoYyhvdGhlcl9jb2xzLCBjb2xfbm0pKSAlPiUNCiAgICAgIG11dGF0ZShpdGVtID0gY29sX25tKSAlPiUNCiAgICAgIHJlbmFtZShyYXRpbmdpbnQgPSBjb2xfbm0pDQogIH0pICU+JQ0KICAgIHNkZl9iaW5kX3Jvd3MoKSAlPiUNCiAgICBzZWxlY3QoYyhvdGhlcl9jb2xzLCAnaXRlbScsICdyYXRpbmdpbnQnKSkNCn0NCmBgYA0KDQpgYGB7ciBlY2hvPVRSVUUgfQ0KcmF0aW5ncyA8LQ0KICBzcGFya19yZWFkX2NzdigNCiAgICBzYywNCiAgICBuYW1lID0gJ3JhdGluZ3MnLA0KICAgIHBhdGggPSAncmF0aW5ncy5jc3YnLA0KICAgIGhlYWRlciA9IFRSVUUsDQogICAgaW5mZXJfc2NoZW1hID0gVFJVRSwNCiAgICBkZWxpbWl0ZXIgPSAiLCIsDQogICAgcXVvdGUgPSAiXCIiLA0KICAgIGVzY2FwZSA9ICJcXCIsDQogICAgY2hhcnNldCA9ICJVVEYtOCIsDQogICAgbnVsbF92YWx1ZSA9IE5VTEwsDQogICAgb3B0aW9ucyA9IGxpc3QoKSwNCiAgICByZXBhcnRpdGlvbiA9IDAsDQogICAgbWVtb3J5ID0gVFJVRSwNCiAgICBvdmVyd3JpdGUgPSBUUlVFDQogICkgJT4lDQogIHNkZl9nYXRoZXIoYygNCiAgICAiSGFtYnVyZ2VyIiwNCiAgICAiVGFjb3MiICwNCiAgICAiU291cCIgICAgLA0KICAgICJQaXp6YSIsDQogICAgIlBhc3RhIiAgLA0KICAgICJTYWxhZCIgICAgLA0KICAgICJTYW5kd2ljaCINCiAgKSkgJT4lIA0KICBmaWx0ZXIocmF0aW5naW50ICE9ICdOQScpICU+JSANCiAgc2RmX3JhbmRvbV9zcGxpdCh0cmFpbmluZyA9IDAuOSwgdGVzdCA9IDAuMSwgc2VlZCA9IDEpDQogIA0KcmF0aW5ncyR0ZXN0ICU8PiUgDQogIG11dGF0ZShkYXRhX2NhdCA9ICJ0ZXN0IikNCg0KcmF0aW5ncyR0cmFpbmluZyAlPD4lIA0KICBtdXRhdGUoZGF0YV9jYXQgPSAidHJhaW5pbmciKQ0KICANCnJhdGluZ3MgPC0gc2RmX2JpbmRfcm93cyhyYXRpbmdzJHRlc3QscmF0aW5ncyR0cmFpbmluZykNCmBgYA0KDQpgYGB7ciBlY2hvPVRSVUUgfQ0KIyMgVXNpbmcgeW91ciB0cmFpbmluZyBkYXRhLCBjYWxjdWxhdGUgdGhlIHJhdyBhdmVyYWdlIChtZWFuKSByYXRpbmcgZm9yIGV2ZXJ5IHVzZXItaXRlbSBjb21iaW5hdGlvbi4NCnRyYWluaW5nX2F2ZyA8LSByYXRpbmdzICU+JSANCiAgZmlsdGVyKGRhdGFfY2F0ID09ICd0cmFpbmluZycpICU+JSANCiAgc3VtbWFyaXNlKHRtZWFuID0gbWVhbihyYXRpbmdpbnQsIG5hLnJtID0gVFJVRSkpICU+JSANCiAgcHVsbCgpDQoNCnRyYWluaW5nX2F2Zw0KYGBgDQoNCmBgYHtyIGVjaG89VFJVRSB9DQojIyBVc2luZyB5b3VyIHRyYWluaW5nIGRhdGEsIGNhbGN1bGF0ZSB0aGUgcmF3IGF2ZXJhZ2UgKG1lYW4pIHJhdGluZyBmb3IgZXZlcnkgdXNlci1pdGVtIGNvbWJpbmF0aW9uLg0KDQp1c2VyX2F2Z3MgPC0gcmF0aW5ncyAlPiUgDQogIGZpbHRlcihkYXRhX2NhdCA9PSAndHJhaW5pbmcnKSAlPiUgDQogIGdyb3VwX2J5KHVzZXIpICU+JSANCiAgc3VtbWFyaXNlKHVzZXJfYXZnID0gbWVhbihyYXRpbmdpbnQsIG5hLnJtID0gVFJVRSkpICU+JSANCiAgbXV0YXRlKHVzZXJfYmlhcyA9IHVzZXJfYXZnIC0gdHJhaW5pbmdfYXZnKQ0KDQp1c2VyX2F2Z3MgJT4lIA0KICBrYWJsZShkaWdpdHMgPSAyKSAlPiUNCiAga2FibGVfc3R5bGluZyhib290c3RyYXBfb3B0aW9ucyA9IGMoInN0cmlwZWQiLCAiaG92ZXIiKSwgZnVsbF93aWR0aCA9IEYpDQpgYGANCg0KYGBge3IgZWNobz1UUlVFIH0NCml0ZW1fYXZncyA8LSByYXRpbmdzICU+JSANCiAgZmlsdGVyKGRhdGFfY2F0ID09ICd0cmFpbmluZycpICU+JSANCiAgZ3JvdXBfYnkoaXRlbSkgJT4lIA0KICBzdW1tYXJpc2UoaXRlbV9hdmcgPSBtZWFuKHJhdGluZ2ludCwgbmEucm0gPSBUUlVFKSkgJT4lIA0KICBtdXRhdGUoaXRlbV9iaWFzID0gaXRlbV9hdmcgLSB0cmFpbmluZ19hdmcpDQoNCml0ZW1fYXZncyAlPiUgDQogIGthYmxlKGRpZ2l0cyA9IDIpICU+JQ0KICBrYWJsZV9zdHlsaW5nKGJvb3RzdHJhcF9vcHRpb25zID0gYygic3RyaXBlZCIsICJob3ZlciIpLCBmdWxsX3dpZHRoID0gRikNCmBgYA0KDQpgYGB7ciBlY2hvPVRSVUUgfQ0KIyMgRnJvbSB0aGUgcmF3IGF2ZXJhZ2UsIGFuZCB0aGUgYXBwcm9wcmlhdGUgdXNlciBhbmQgaXRlbSBiaWFzZXMsIGNhbGN1bGF0ZSB0aGUgYmFzZWxpbmUgcHJlZGljdG9ycyBmb3IgZXZlcnkgdXNlci1pdGVtIGNvbWJpbmF0aW9uLg0KDQpibF9wcmVkX2RmIDwtIGZ1bGxfam9pbihpdGVtX2F2Z3MsIHVzZXJfYXZncywgYnkgPSBjaGFyYWN0ZXIoKSkgJT4lDQogIG11dGF0ZShibF9wcmVkaWN0b3IgPSBpdGVtX2JpYXMgKyB1c2VyX2JpYXMgKyB0cmFpbmluZ19hdmcpICU+JQ0KICBtdXRhdGUoYmxfcHJlZGljdG9yID0gcG1heChwbWluKGJsX3ByZWRpY3RvciwgNSksIDApKSAlPiUNCiAgc2VsZWN0KGl0ZW0sIHVzZXIsIGJsX3ByZWRpY3RvcikNCg0KYGBgDQoNCmBgYHtyIGVjaG89VFJVRSB9DQpybXNlIDwtIHJhdGluZ3MgJT4lIA0KICBsZWZ0X2pvaW4oYmxfcHJlZF9kZiwgYnkgPSBjKCd1c2VyJywnaXRlbScpKSAlPiUgDQogIG11dGF0ZShzcV9lcnJfYmxfcHJlZCA9IChyYXRpbmdpbnQtYmxfcHJlZGljdG9yKSoqMikgJT4lIA0KICBtdXRhdGUoc3FfZXJyX2F2Z19wcmVkID0gKHJhdGluZ2ludC10cmFpbmluZ19hdmcpKioyKQ0KDQpybXNlICU+JSANCiAga2FibGUoY29sLm5hbWVzID0gYygiVXNlciIsIkl0ZW0iLCJSYXRpbmciLCJDYXRlZ29yeSIsIkJhc2VsaW5lIiwiQmFzZWxpbmUgc3EuIGVycm9yIiwiQXZnLiBzcS4gZXJyb3IiKSkgJT4lIA0KICBrYWJsZV9zdHlsaW5nKGJvb3RzdHJhcF9vcHRpb25zID0gYygic3RyaXBlZCIsICJob3ZlciIpLGZpeGVkX3RoZWFkID0gVCwgZnVsbF93aWR0aCA9IEYpDQpgYGANCg0KYGBge3IgZWNobz1UUlVFIH0NCnJtc2UyIDwtIHJtc2UgJT4lDQogIHNlbGVjdChzcV9lcnJfYmxfcHJlZCwgc3FfZXJyX2F2Z19wcmVkLCBkYXRhX2NhdCkgJT4lDQogIGdyb3VwX2J5KGRhdGFfY2F0KSAlPiUgDQogIHN1bW1hcmlzZSgNCiAgICBybXNlX2JsID0gc3FydChtZWFuKHNxX2Vycl9ibF9wcmVkLCBuYS5ybSA9IFRSVUUpKSwgDQogICAgcm1zZV9hdmcgPSBzcXJ0KG1lYW4oc3FfZXJyX2F2Z19wcmVkLCBuYS5ybSA9IFRSVUUpKSkgJT4lIA0KICBjb2xsZWN0KCkgJT4lIA0KICBnYXRoZXIoZXJyb3JfdHlwZSwgcm1zZSwgcm1zZV9ibDpybXNlX2F2ZykgDQogIA0KYGBgDQoNCg0KYGBge3IgZWNobz1UUlVFfQ0KZ2dwbG90KHJtc2UyLCBhZXMoeCA9IGVycm9yX3R5cGUsIHkgPSBybXNlLCBmaWxsID0gZXJyb3JfdHlwZSkpICsNCiAgZ2VvbV9iYXIoc3RhdCA9ICJpZGVudGl0eSIpICsNCiAgZmFjZXRfZ3JpZCggfiBkYXRhX2NhdCkgKw0KICBzY2FsZV9maWxsX2JyZXdlcihwYWxldHRlID0gIlBhaXJlZCIpICsNCiAgbGFicyh0aXRsZSA9ICJSTVNFIGJ5IGRhdGEgZ3JvdXAgYW5kIHByZWRpY3RvciB0eXBlIiwNCiAgICAgICBzdWJ0aXRsZSA9ICJUaGUgUk1TRSBmb3IgYm90aCBkYXRhIGdyb3VwcyBpcyBiYXNlZCBvbiB0aGUgYXZnLiBhbmQgYmlhcyB2YWx1ZXMgb2YgdGhlIHRyYWluaW5nIGRhdGEuIiwNCiAgICAgICBjYXB0aW9uID0gIiIpICsNCiAgeWxhYigiUk1TRSIpICsNCiAgdGhlbWVfbWluaW1hbCgpICsNCiAgdGhlbWUobGVnZW5kLnBvc2l0aW9uID0gIm5vbmUiLCBheGlzLnRpdGxlLnggPSBlbGVtZW50X2JsYW5rKCkpICsNCiAgZ2VvbV90ZXh0KGFlcyhsYWJlbCA9IHJvdW5kKHJtc2UsIDIpKSwNCiAgICAgICAgICAgIHZqdXN0ID0gMS42LA0KICAgICAgICAgICAgY29sb3IgPSAid2hpdGUiLA0KICAgICAgICAgICAgc2l6ZSA9IDUpICsNCiAgc2NhbGVfeF9kaXNjcmV0ZShsYWJlbHMgPSBjKCJBdmcuIFJhdGluZyBcbiAoVHJhaW5pbmcgRGF0YSkiLCAiQmFzZWxpbmUgUHJlZGljdG9yIikpDQpgYGANCg0KIyMgQ29uY2x1c2lvbnMNCg0KRGVzcGl0ZSB0aGUgZXhjZWxsZW50IGludGVncmF0aW9uIHByb3ZpZGVkIGJ5IFNwYXJrbHlSLCBzZXZlcmFsIGZ1bmN0aW9ucyBsaWtlIGBnYXRoZXJgIGFuZCBgc3ByZWFkYCBhcmUgbm90IGRpcmVjdGx5IGNvbXBhdGlibGUgd2hpY2ggcmVxdWlyZWQgYSBzaWduaWZpY2FudCByZXdvcmsgb2YgdGhlIGNvZGUuIA0KDQpPbiBhIHNtYWxsIGRhdGEgc2V0IHN1Y2ggYXMgdGhpcywgaXQncyB1bmxpa2VseSB0aGF0IHdlJ2xsIHNlZSBhIHN1YnN0YW50aWFsIGRpZmZlcmVuY2UgaW4gcGVyZm9ybWFuY2UgLiAgQWx0aG91Z2ggaXQgc2hvdWxkIGJlIHBvc3NpYmxlIHRvIHNlbGVjdCB0aGUgbnVtYmVyIG9mIGNvcmVzIGFuZCBSQU0gYXZhaWxhYmxlIHRvIFNwYXJrLCB0aG9zZSBzZXR0aW5ncyB3ZXJlIG5vdCByZXNwb25zaXZlLCBzbyBpdCB3YXNuJ3QgcG9zc2libGUgdG8gY29tcGFyZSB0aGUgcnVuIHRpbWUgd2l0aCBkaWZmZXJlbnQgY29yZSBjb3VudCBhbmQgUkFNIHNldHRpbmdzLg0KDQpJZiB5b3UgZXhwZWN0IHRoYXQgYSByZWNvbW1lbmRlciBzeXN0ZW0gd2lsbCBldmVudHVhbGx5IHByb2Nlc3MgbWFzc2l2ZSBhbW91bnRzIG9mIGRhdGEgdGhhdCB3aWxsIHJlcXVpcmUgYSBkaXN0cmlidXRlZCBjb21wdXRpbmcgc29sdXRpb24sIHRoZW4gdGhlIGRldmVsb3BtZW50IG9mIHRoZSBtb2RlbCBzaG91bGQgYmUgaW1wbGVtZW50ZWQgaW4gU3BhcmsgYXMgZWFybHkgYXMgcG9zc2libGUu