Initial Setup

library(sparklyr)
library(dplyr)
library(nycflights13)
# Open a local Spark 2.1.0 session and copy the nycflights13 `flights`
# table into it.
sc <- spark_connect(master = "local", version = "2.1.0")
f <- copy_to(sc, flights)

# Keep only rows with a known arrival delay, add indexed versions of the
# string columns `origin` and `dest` (`origin_ca`, `dest_ca`), then split
# the result 80/20 into `fs$train` and `fs$test`.
# A fixed seed makes the split identical on every run, so the timings
# recorded further down are always measured on the same partitions.
fs <- f %>%
  filter(!is.na(arr_delay)) %>%
  ft_string_indexer("origin", "origin_ca") %>%
  ft_string_indexer("dest", "dest_ca") %>%
  sdf_partition(train = 0.8, test = 0.2, seed = 1234L)
  

With ft_string_indexer

# Time a random-forest fit of arr_delay on the pre-indexed columns
# (origin_ca, dest_ca) followed by the sdf_predict() call on the test split.
with_cat <- system.time({
  fs_ml <- ml_random_forest(
    select(fs$train, origin_ca, dest_ca, arr_delay),
    arr_delay ~ .,
    max.bins = 200
  )
  fs_pr <- sdf_predict(fs_ml, fs$test)
})
* No rows dropped by 'na.omit' call
# Show the timings recorded for the indexed-column run.
print(with_cat)
   user  system elapsed 
  0.145   0.017   6.053 

Without ft_string_indexer

# Time the same random-forest fit and sdf_predict() call, this time feeding
# the raw string columns (origin, dest) instead of their indexed versions.
wo_cat <- system.time({
  fs_ml <- ml_random_forest(
    select(fs$train, origin, dest, arr_delay),
    arr_delay ~ .,
    max.bins = 200
  )
  fs_pr <- sdf_predict(fs_ml, fs$test)
})
* No rows dropped by 'na.omit' call
# Show the timings recorded for the raw-string-column run.
print(wo_cat)
   user  system elapsed 
  1.016   0.086  15.618 
# Close the Spark connection `sc` that was opened with spark_connect().
spark_disconnect(sc)
LS0tCnRpdGxlOiAiQWRkaW5nIGZ0X3N0cmluZ19pbmRleGVyIgpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sKLS0tCgojSW5pdGlhbCBTZXR1cAoKYGBge3J9CgpsaWJyYXJ5KHNwYXJrbHlyKQpsaWJyYXJ5KGRwbHlyKQpsaWJyYXJ5KG55Y2ZsaWdodHMxMykKCgpzYyA8LSBzcGFya19jb25uZWN0KG1hc3RlciA9ICJsb2NhbCIsIHZlcnNpb24gPSAiMi4xLjAiKQoKZiA8LSAgY29weV90byhzYywgZmxpZ2h0cykKYGBgCgoKCmBgYHtyfQoKZnMgPC0gZiAlPiUKICBmaWx0ZXIoIWlzLm5hKGFycl9kZWxheSkpICU+JQogIGZ0X3N0cmluZ19pbmRleGVyKCJvcmlnaW4iLCAib3JpZ2luX2NhIikgJT4lCiAgZnRfc3RyaW5nX2luZGV4ZXIoImRlc3QiLCAiZGVzdF9jYSIpICU+JQogIHNkZl9wYXJ0aXRpb24odHJhaW4gPSAwLjgsIHRlc3QgPSAwLjIpIAoKCiAgCmBgYAoKIyBXaXRoIGZ0X3N0cmluZ19pbmRleGVyCgpgYGB7cn0KCndpdGhfY2F0IDwtIHN5c3RlbS50aW1lKHsKICBmc19tbCA8LSBmcyR0cmFpbiAlPiUKICBzZWxlY3Qob3JpZ2luX2NhLAogICAgICAgICBkZXN0X2NhLAogICAgICAgICBhcnJfZGVsYXkpICU+JQogICAgbWxfcmFuZG9tX2ZvcmVzdChhcnJfZGVsYXl+LiwgbWF4LmJpbnMgPSAyMDApCiAgCiAgZnNfcHIgPC0gc2RmX3ByZWRpY3QoZnNfbWwsIGZzJHRlc3QpCiAgCgogIH0pCgoKd2l0aF9jYXQKCmBgYAoKCiMgV2l0aCBvdXQgZnRfc3RyaW5nX2luZGV4ZXIKCmBgYHtyfQp3b19jYXQgPC0gc3lzdGVtLnRpbWUoewogIAogIGZzX21sIDwtIGZzJHRyYWluICU+JQogIHNlbGVjdChvcmlnaW4sCiAgICAgICAgIGRlc3QsCiAgICAgICAgIGFycl9kZWxheSkgJT4lCiAgICBtbF9yYW5kb21fZm9yZXN0KGFycl9kZWxheX4uLCBtYXguYmlucyA9IDIwMCkKICAKIAogIGZzX3ByIDwtIHNkZl9wcmVkaWN0KGZzX21sLCBmcyR0ZXN0KQogIAogIAogIAogIAp9KQoKd29fY2F0CgoKYGBgCgpgYGB7cn0Kc3BhcmtfZGlzY29ubmVjdChzYykKYGBg