Initial Setup
library(sparklyr)
library(dplyr)
library(nycflights13)
# Start a local Spark 2.1.0 session and copy the nycflights13 `flights`
# data frame into it.
sc <- spark_connect(master = "local", version = "2.1.0")
f <- copy_to(sc, flights)

# Build the modelling data: drop rows with a missing arrival delay, add
# indexed versions of `origin` and `dest` (`origin_ca`, `dest_ca`), and split
# 80/20 into `fs$train` and `fs$test`.
# A fixed seed keeps the split identical across runs, so the timings below
# are measured on the same rows each time the notebook is executed.
fs <- f %>%
  filter(!is.na(arr_delay)) %>%
  ft_string_indexer("origin", "origin_ca") %>%
  ft_string_indexer("dest", "dest_ca") %>%
  sdf_partition(train = 0.8, test = 0.2, seed = 1234)
With ft_string_indexer
# Time a random-forest fit on the training split followed by a prediction
# call on the test split, using the indexed columns `origin_ca` and `dest_ca`
# as the predictors of `arr_delay`.
# NOTE(review): sdf_predict() may return a lazily evaluated table; confirm
# whether the prediction work itself is included in the elapsed time.
with_cat <- system.time({
  fs_ml <- fs[["train"]] %>%
    select(origin_ca, dest_ca, arr_delay) %>%
    ml_random_forest(arr_delay ~ ., max.bins = 200)

  fs_pr <- sdf_predict(fs_ml, fs[["test"]])
})
* No rows dropped by 'na.omit' call
with_cat
user system elapsed
0.145 0.017 6.053
Without ft_string_indexer
# Time the same random-forest fit and prediction call, but with the raw
# `origin` and `dest` columns as the predictors of `arr_delay` instead of
# their indexed counterparts.
# NOTE(review): sdf_predict() may return a lazily evaluated table; confirm
# whether the prediction work itself is included in the elapsed time.
wo_cat <- system.time({
  fs_ml <- fs[["train"]] %>%
    select(origin, dest, arr_delay) %>%
    ml_random_forest(arr_delay ~ ., max.bins = 200)

  fs_pr <- sdf_predict(fs_ml, fs[["test"]])
})
* No rows dropped by 'na.omit' call
wo_cat
user system elapsed
1.016 0.086 15.618
# Close the Spark connection `sc` that was opened with spark_connect().
spark_disconnect(sc)
LS0tCnRpdGxlOiAiQWRkaW5nIGZ0X3N0cmluZ19pbmRleGVyIgpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sKLS0tCgojSW5pdGlhbCBTZXR1cAoKYGBge3J9CgpsaWJyYXJ5KHNwYXJrbHlyKQpsaWJyYXJ5KGRwbHlyKQpsaWJyYXJ5KG55Y2ZsaWdodHMxMykKCgpzYyA8LSBzcGFya19jb25uZWN0KG1hc3RlciA9ICJsb2NhbCIsIHZlcnNpb24gPSAiMi4xLjAiKQoKZiA8LSAgY29weV90byhzYywgZmxpZ2h0cykKYGBgCgoKCmBgYHtyfQoKZnMgPC0gZiAlPiUKICBmaWx0ZXIoIWlzLm5hKGFycl9kZWxheSkpICU+JQogIGZ0X3N0cmluZ19pbmRleGVyKCJvcmlnaW4iLCAib3JpZ2luX2NhIikgJT4lCiAgZnRfc3RyaW5nX2luZGV4ZXIoImRlc3QiLCAiZGVzdF9jYSIpICU+JQogIHNkZl9wYXJ0aXRpb24odHJhaW4gPSAwLjgsIHRlc3QgPSAwLjIpIAoKCiAgCmBgYAoKIyBXaXRoIGZ0X3N0cmluZ19pbmRleGVyCgpgYGB7cn0KCndpdGhfY2F0IDwtIHN5c3RlbS50aW1lKHsKICBmc19tbCA8LSBmcyR0cmFpbiAlPiUKICBzZWxlY3Qob3JpZ2luX2NhLAogICAgICAgICBkZXN0X2NhLAogICAgICAgICBhcnJfZGVsYXkpICU+JQogICAgbWxfcmFuZG9tX2ZvcmVzdChhcnJfZGVsYXl+LiwgbWF4LmJpbnMgPSAyMDApCiAgCiAgZnNfcHIgPC0gc2RmX3ByZWRpY3QoZnNfbWwsIGZzJHRlc3QpCiAgCgogIH0pCgoKd2l0aF9jYXQKCmBgYAoKCiMgV2l0aCBvdXQgZnRfc3RyaW5nX2luZGV4ZXIKCmBgYHtyfQp3b19jYXQgPC0gc3lzdGVtLnRpbWUoewogIAogIGZzX21sIDwtIGZzJHRyYWluICU+JQogIHNlbGVjdChvcmlnaW4sCiAgICAgICAgIGRlc3QsCiAgICAgICAgIGFycl9kZWxheSkgJT4lCiAgICBtbF9yYW5kb21fZm9yZXN0KGFycl9kZWxheX4uLCBtYXguYmlucyA9IDIwMCkKICAKIAogIGZzX3ByIDwtIHNkZl9wcmVkaWN0KGZzX21sLCBmcyR0ZXN0KQogIAogIAogIAogIAp9KQoKd29fY2F0CgoKYGBgCgpgYGB7cn0Kc3BhcmtfZGlzY29ubmVjdChzYykKYGBg