Code
Environment variables
# Environment for the local Spark / Hadoop installation. These are
# machine-specific absolute paths; adjust them for your own setup.
Sys.setenv(
  SPARK_HOME = "/Users/olegbaydakov/Downloads/spark-2.2.0-bin-hadoop2.7/",
  SPARK_LOCAL_IP = "127.0.0.1"
)

# Hadoop and YARN client configuration are read from the same directory.
Sys.setenv(
  HADOOP_CONF_DIR = "/usr/local/Cellar/hadoop/2.8.1/libexec/etc/hadoop",
  YARN_CONF_DIR = "/usr/local/Cellar/hadoop/2.8.1/libexec/etc/hadoop"
)

# rsparkling options. The declared Sparkling Water version is kept in sync
# with the assembly jar referenced by the location option (2.2.2); it
# previously said "2.2.0", which did not match the jar.
options(
  rsparkling.sparklingwater.version = "2.2.2",
  rsparkling.sparklingwater.location = "/Users/olegbaydakov/Documents/Books/R/sparkling-water-2.2.2/assembly/build/libs/sparkling-water-assembly_2.11-2.2.2-all.jar"
)
Libraries
library(rsparkling)
library(sparklyr)
library(h2o)
Spark configuration
# Start from sparklyr's default configuration and override individual entries.
# Entries are assigned by name instead of being appended with c(): assignment
# replaces an existing entry of the same name and keeps the object returned by
# spark_config() intact, whereas c() appends and can leave duplicate names.
config <- spark_config()

config[["spark.executor.memory"]] <- "1G"
config[["spark.driver.memory"]] <- "1G"
config[["spark.executor.instances"]] <- "2"
# This was previously the empty string, which is not a valid core count.
# One core per executor is a conservative choice; adjust it to your cluster.
config[["spark.executor.cores"]] <- "1"
config[["sparklyr.log.console"]] <- TRUE
config[["spark.sql.tungsten.enabled"]] <- TRUE

# Optional settings, currently disabled:
# config[["spark.dynamicAllocation.enabled"]] <- TRUE
# config[["spark.shuffle.service.enabled"]] <- TRUE
# config[["spark.cores.max"]] <- "2"
# config[["spark.sql.warehouse.dir"]] <- "/Users/olegbaydakov/Downloads/spark-2.2.0-bin-hadoop2.7/tmp/hive/warehouse"
Connect to Spark master (Standalone cluster)
# Open the sparklyr connection `sc` against the standalone master listening
# on 127.0.0.1:7077, using the `config` object prepared in the previous
# section.
sc <- spark_connect(
  master = "spark://127.0.0.1:7077",
  app_name = "sparklyr4",
  version = "2.2.0",
  spark_home = "/Users/olegbaydakov/Downloads/spark-2.2.0-bin-hadoop2.7/",
  config = config
)
Connect to YARN (version 1)
# Reset `config` to sparklyr's defaults, then connect through YARN.
# NOTE(review): Spark 2.x appears to treat the "yarn-client" master string as
# deprecated in favour of master = "yarn" with client deploy mode — confirm
# before relying on this form.
config <- spark_config()

sc <- spark_connect(
  master = "yarn-client",
  app_name = "sparklyr4",
  version = "2.2.0",
  spark_home = "/Users/olegbaydakov/Downloads/spark-2.2.0-bin-hadoop2.7/",
  config = config
)
Connect to YARN (version 2)
# Connect through YARN in client deploy mode with explicit executor sizing.
#
# The settings are stored as top-level entries of the configuration object.
# They were previously nested inside a `default = list(...)` element; sparklyr
# configurations are flat named lists, so the nested Spark properties were
# most likely never passed on to Spark.
yarn_config <- spark_config()
yarn_config[["spark.submit.deployMode"]] <- "client"
yarn_config[["spark.executor.instances"]] <- 2
yarn_config[["spark.executor.memory"]] <- "2G"
yarn_config[["spark.executor.cores"]] <- 2
yarn_config[["spark.driver.memory"]] <- "1G"
yarn_config[["sparklyr.log.console"]] <- TRUE

sc <- spark_connect(master = "yarn", config = yarn_config)
Connect to H2O
# Obtain the H2O context for the Spark connection `sc`. The strict version
# check is switched off, so the call proceeds without enforcing that check.
h2o_context(
  sc,
  strict_version_check = FALSE
)
Data processing
library(dplyr)

# Copy the local mtcars data frame into Spark as table "mtcars".
# overwrite = TRUE lets this section be re-run against a live connection;
# without it the copy fails once the table already exists.
mtcars_tbl <- copy_to(sc, mtcars, "mtcars", overwrite = TRUE)

# List the tables visible through the connection.
src_tbls(sc)

# from Hive
# NOTE(review): "table_name" looks like a placeholder — replace it with the
# name of a table that actually exists before running this line.
data_tbl <- tbl(sc, "table_name")
data_tbl2 <- tbl(sc, sql("SELECT * from mtcars"))

# H2O
# Convert the Spark table into an H2O frame.
mtcars_h2o <- as_h2o_frame(sc, mtcars_tbl, strict_version_check = FALSE)

# Keep cars with hp >= 100, add a logical flag for 8-cylinder cars, and split
# the rows 50/50 into training and test sets with a fixed seed.
partitions <- mtcars_tbl %>%
  filter(hp >= 100) %>%
  mutate(cyl8 = cyl == 8) %>%
  sdf_partition(training = 0.5, test = 0.5, seed = 1099)
Train model
# Small helper: convert a Spark table into an H2O frame, with the strict
# version check disabled.
to_h2o <- function(spark_tbl) {
  as_h2o_frame(sc, spark_tbl, strict_version_check = FALSE)
}

training <- to_h2o(partitions$training)
test <- to_h2o(partitions$test)

# Fit a GLM on the training frame: mpg modelled from wt and cyl, with lambda
# search enabled.
glm_predictors <- c("wt", "cyl")
glm_response <- "mpg"

glm_model <- h2o.glm(
  x = glm_predictors,
  y = glm_response,
  training_frame = training,
  lambda_search = TRUE
)

# Print the fitted model summary.
glm_model
library(ggplot2)

# compute predicted values on our test dataset
pred <- h2o.predict(glm_model, newdata = test)

# convert from H2O Frame to Spark DataFrame
predicted <- as_spark_dataframe(sc, pred, strict_version_check = FALSE)

# Bring the predictions into R explicitly and take the single prediction
# column as a plain vector. Previously the Spark table was handed straight to
# data.frame(), which coerced it implicitly and kept the table's own column
# name, so the names had to be patched afterwards.
predicted_values <- collect(predicted)[[1]]

# extract the true 'mpg' values from our test dataset
actual <- partitions$test %>%
  select(mpg) %>%
  collect() %>%
  `[[`("mpg")

# produce a data.frame housing our predicted + actual 'mpg' values
data <- data.frame(
  predicted = predicted_values,
  actual = actual
)

# plot predicted vs. actual values; the dashed red line is y = x, i.e. where
# a perfect prediction would fall
ggplot(data, aes(x = actual, y = predicted)) +
  geom_abline(linetype = "dashed", colour = "red") +
  geom_point() +
  theme(plot.title = element_text(hjust = 0.5)) +
  coord_fixed(ratio = 1) +
  labs(
    x = "Actual Fuel Consumption",
    y = "Predicted Fuel Consumption",
    title = "Predicted vs. Actual Fuel Consumption"
  )
Spark disconnect
# Shut H2O down first, while the Spark application hosting it is still
# running, and only then close the Spark connection. prompt = FALSE skips the
# interactive confirmation so this also works when run non-interactively.
# A failed H2O shutdown is reported as a warning so that the Spark connection
# is still closed afterwards.
tryCatch(
  h2o.shutdown(prompt = FALSE),
  error = function(e) {
    warning("H2O shutdown failed: ", conditionMessage(e), call. = FALSE)
  }
)
spark_disconnect(sc)

# To find H2O processes that are still running (so they can be stopped
# manually), run this in a shell:
# ps -efww | grep h2o
LS0tCnRpdGxlOiAiU3BhcmtseXIiCnN1YnRpdGxlOiAiU3Bhcmt5bHIgKyBzcGFya2xpbmcgd2F0ZXIrIEgyTyIKYXV0aG9yOiAiT2xlZyBCYXlkYWtvdiIKZGF0ZTogIkRlY2VtYmVyIDAxLCAyMDE3IgpvdXRwdXQ6IAogIGh0bWxfZG9jdW1lbnQ6IAogICAgY29kZV9kb3dubG9hZDogdHJ1ZQogICAgY29kZV9mb2xkaW5nOiBoaWRlCiAgICBudW1iZXJfc2VjdGlvbnM6IHllcwogICAgdGhlbWU6IGpvdXJuYWwKICAgIGRmX3ByaW50OiBrYWJsZQogICAgdG9jOiBUUlVFCiAgICB0b2NfZmxvYXQ6IFRSVUUKLS0tCltNYXJrZG93bi1DaGVhdHNoZWV0XShodHRwczovL2dpdGh1Yi5jb20vYWRhbS1wL21hcmtkb3duLWhlcmUvd2lraS9NYXJrZG93bi1DaGVhdHNoZWV0KQoKW1NwYXJrbGluZyBXYXRlciAoSDJPKSBNYWNoaW5lIExlYXJuaW5nXShodHRwOi8vc3BhcmsucnN0dWRpby5jb20vZ3VpZGVzL2gyby8pCgpgYGB7ciBzZXR1cCwgaW5jbHVkZT1GQUxTRX0Ka25pdHI6Om9wdHNfY2h1bmskc2V0KGVjaG8gPSBUUlVFKQpgYGAKCiMgQ29kZQojIyBFbnZpcm9ubWVudGFsIHZhcmlhYmxlcwpgYGB7ciBldmFsPUZBTFNFLCBtZXNzYWdlPVRSVUUsIHdhcm5pbmc9VFJVRSwgLCBlY2hvPVRSVUUsIGluY2x1ZGU9VFJVRSwgcGFnZWQucHJpbnQ9VFJVRX0KU3lzLnNldGVudihTUEFSS19IT01FPScvVXNlcnMvb2xlZ2JheWRha292L0Rvd25sb2Fkcy9zcGFyay0yLjIuMC1iaW4taGFkb29wMi43LycpClN5cy5zZXRlbnYoU1BBUktfTE9DQUxfSVA9JzEyNy4wLjAuMScpClN5cy5zZXRlbnYoSEFET09QX0NPTkZfRElSPScvdXNyL2xvY2FsL0NlbGxhci9oYWRvb3AvMi44LjEvbGliZXhlYy9ldGMvaGFkb29wJykKU3lzLnNldGVudihZQVJOX0NPTkZfRElSID0gJy91c3IvbG9jYWwvQ2VsbGFyL2hhZG9vcC8yLjguMS9saWJleGVjL2V0Yy9oYWRvb3AnKQpvcHRpb25zKHJzcGFya2xpbmcuc3BhcmtsaW5nd2F0ZXIudmVyc2lvbiA9ICIyLjIuMCIpIApvcHRpb25zKHJzcGFya2xpbmcuc3BhcmtsaW5nd2F0ZXIubG9jYXRpb24gPSAiL1VzZXJzL29sZWdiYXlkYWtvdi9Eb2N1bWVudHMvQm9va3MvUi9zcGFya2xpbmctd2F0ZXItMi4yLjIvYXNzZW1ibHkvYnVpbGQvbGlicy9zcGFya2xpbmctd2F0ZXItYXNzZW1ibHlfMi4xMS0yLjIuMi1hbGwuamFyIikKCmBgYAojIyBMaWJyYXJpZXMKYGBge3IgZXZhbD1GQUxTRSwgbWVzc2FnZT1UUlVFLCB3YXJuaW5nPVRSVUUsICwgZWNobz1UUlVFLCBpbmNsdWRlPVRSVUUsIHBhZ2VkLnByaW50PVRSVUV9CmxpYnJhcnkocnNwYXJrbGluZykKbGlicmFyeShzcGFya2x5cikKbGlicmFyeShoMm8pCmBgYAojIyBTcGFyayBjb25maWd1cmF0aW9uCmBgYHtyIGV2YWw9RkFMU0UsIG1lc3NhZ2U9VFJVRSwgd2FybmluZz1UUlVFLCAsIGVjaG89VFJVRSwgaW5jbHVkZT1UUlVFLCBwYWdlZC5wcmludD1UUlVFfQpjb25maWc9c3BhcmtfY29uZmlnKCkKY29uZmlnPWMoY29uZmlnLGxpc3QoCiAgInNwYXJrLmV4ZWN1dG9y
Lm1lbW9yeSI9IjFHIiwKICAic3BhcmsuZHJpdmVyLm1lbW9yeSI9IjFHIiwKICAic3BhcmsuZXhlY3V0b3IuaW5zdGFuY2VzIj0iMiIsCiAgInNwYXJrLmV4ZWN1dG9yLmNvcmVzIiA9ICIiLAogICJzcGFya2x5ci5sb2cuY29uc29sZSIgPSBUUlVFLAogICMgICAgICJzcGFyay5keW5hbWljQWxsb2NhdGlvbi5lbmFibGVkIj1UUlVFLAogICMgICAgInNwYXJrLnNodWZmbGUuc2VydmljZS5lbmFibGVkIj1UUlVFLAogICJzcGFyay5zcWwudHVuZ3N0ZW4uZW5hYmxlZCI9IFRSVUUKICAjICAgInNwYXJrLmNvcmVzLm1heCIgPSAiMiIKICAjICAgICAgInNwYXJrLnNxbC53YXJlaG91c2UuZGlyIiA9ICIvVXNlcnMvb2xlZ2JheWRha292L0Rvd25sb2Fkcy9zcGFyay0yLjIuMC1iaW4taGFkb29wMi43L3RtcC9oaXZlL3dhcmVob3VzZSIKKSkKYGBgCiMjIENvbm5lY3QgdG8gU3BhcmsgbWFzdGVyIChTdGFuZGFsb25lIGNsdXN0ZXIpCmBgYHtyIGV2YWw9RkFMU0UsIG1lc3NhZ2U9VFJVRSwgd2FybmluZz1UUlVFLCAsIGVjaG89VFJVRSwgaW5jbHVkZT1UUlVFLCBwYWdlZC5wcmludD1UUlVFfQpzYyA8LSBzcGFya19jb25uZWN0KG1hc3RlciA9ICJzcGFyazovLzEyNy4wLjAuMTo3MDc3IiwgCiAgICAgICAgICAgICAgICAgICAgdmVyc2lvbiA9ICIyLjIuMCIsCiAgICAgICAgICAgICAgICAgICAgYXBwX25hbWUgPSAic3BhcmtseXI0IiwKICAgICAgICAgICAgICAgICAgICBzcGFya19ob21lID0gIi9Vc2Vycy9vbGVnYmF5ZGFrb3YvRG93bmxvYWRzL3NwYXJrLTIuMi4wLWJpbi1oYWRvb3AyLjcvIiwKICAgICAgICAgICAgICAgICAgICBjb25maWcgPSBjb25maWcpCmBgYAojIyBDb25uZWN0IHRvIFlBUk4gKHZlcnNpb24gMSkKYGBge3IgZXZhbD1GQUxTRSwgbWVzc2FnZT1UUlVFLCB3YXJuaW5nPVRSVUUsICwgZWNobz1UUlVFLCBpbmNsdWRlPVRSVUUsIHBhZ2VkLnByaW50PVRSVUV9CmNvbmZpZz1zcGFya19jb25maWcoKQpzYyA8LSBzcGFya19jb25uZWN0KG1hc3RlciA9ICJ5YXJuLWNsaWVudCIsIAogICAgICAgICAgICAgICAgICAgIHZlcnNpb24gPSAiMi4yLjAiLAogICAgICAgICAgICAgICAgICAgIGFwcF9uYW1lID0gInNwYXJrbHlyNCIsCiAgICAgICAgICAgICAgICAgICAgc3BhcmtfaG9tZSA9ICIvVXNlcnMvb2xlZ2JheWRha292L0Rvd25sb2Fkcy9zcGFyay0yLjIuMC1iaW4taGFkb29wMi43LyIsCiAgICAgICAgICAgICAgICAgICAgY29uZmlnID0gY29uZmlnKQpgYGAKIyMgQ29ubmVjdCB0byBZQVJOICh2ZXJzaW9uIDIpCmBgYHtyIGV2YWw9RkFMU0UsIG1lc3NhZ2U9VFJVRSwgd2FybmluZz1UUlVFLCAsIGVjaG89VFJVRSwgaW5jbHVkZT1UUlVFLCBwYWdlZC5wcmludD1UUlVFfQpzYyA8LSAKICBzcGFya19jb25uZWN0KG1hc3Rlcj0ieWFybiIsCiAgICAgICAgICAgICAgICBjb25maWcgPSBsaXN0KAogICAgICAgICAgICAgICAgICBkZWZhdWx0ID0gbGlzdCgKICAgICAgICAgICAgICAgICAgICBzcGFyay5zdWJtaXQu
ZGVwbG95TW9kZT0gImNsaWVudCIsCiAgICAgICAgICAgICAgICAgICAgc3BhcmsuZXhlY3V0b3IuaW5zdGFuY2VzPSAyLCAKICAgICAgICAgICAgICAgICAgICBzcGFyay5leGVjdXRvci5tZW1vcnk9ICIyRyIsCiAgICAgICAgICAgICAgICAgICAgc3BhcmsuZXhlY3V0b3IuY29yZXM9IDIsCiAgICAgICAgICAgICAgICAgICAgc3BhcmsuZHJpdmVyLm1lbW9yeT0gIjFHIiwKICAgICAgICAgICAgICAgICAgICAic3BhcmtseXIubG9nLmNvbnNvbGUiID0gVFJVRQogICAgICAgICAgICAgICAgICAgICkpKQoKYGBgCiMjIENvbm5lY3QgdG8gSDJPCmBgYHtyIGV2YWw9RkFMU0UsIG1lc3NhZ2U9VFJVRSwgd2FybmluZz1UUlVFLCAsIGVjaG89VFJVRSwgaW5jbHVkZT1UUlVFLCBwYWdlZC5wcmludD1UUlVFfQpoMm9fY29udGV4dChzYywgc3RyaWN0X3ZlcnNpb25fY2hlY2sgPSBGQUxTRSkKYGBgCiMjIERhdGEgcHJvY2Vzc2luZwpgYGB7ciBldmFsPUZBTFNFLCBtZXNzYWdlPVRSVUUsIHdhcm5pbmc9VFJVRSwgLCBlY2hvPVRSVUUsIGluY2x1ZGU9VFJVRSwgcGFnZWQucHJpbnQ9VFJVRX0KbGlicmFyeShkcGx5cikKbXRjYXJzX3RibCA8LSBjb3B5X3RvKHNjLCBtdGNhcnMsICJtdGNhcnMiKQpzcmNfdGJscyhzYykgCiMgZnJvbSBIaXZlCmRhdGFfdGJsIDwtIHRibChzYywgInRhYmxlX25hbWUiKSAKZGF0YV90YmwyIDwtIHRibChzYywgc3FsKCJTRUxFQ1QgKiBmcm9tIG10Y2FycyIpKQoKIyBIMk8KbXRjYXJzX2gybyA8LSBhc19oMm9fZnJhbWUoc2MsIG10Y2Fyc190YmwsIHN0cmljdF92ZXJzaW9uX2NoZWNrID0gRkFMU0UpCgpwYXJ0aXRpb25zIDwtIG10Y2Fyc190YmwgJT4lCiAgZmlsdGVyKGhwID49IDEwMCkgJT4lCiAgbXV0YXRlKGN5bDggPSBjeWwgPT0gOCkgJT4lCiAgc2RmX3BhcnRpdGlvbih0cmFpbmluZyA9IDAuNSwgdGVzdCA9IDAuNSwgc2VlZCA9IDEwOTkpCgpgYGAKIyMgVHJhaW4gbW9kZWwKYGBge3IgZXZhbD1GQUxTRSwgbWVzc2FnZT1UUlVFLCB3YXJuaW5nPVRSVUUsICwgZWNobz1UUlVFLCBpbmNsdWRlPVRSVUUsIHBhZ2VkLnByaW50PVRSVUV9CnRyYWluaW5nIDwtIGFzX2gyb19mcmFtZShzYywgcGFydGl0aW9ucyR0cmFpbmluZywgc3RyaWN0X3ZlcnNpb25fY2hlY2sgPSBGQUxTRSkKdGVzdCA8LSBhc19oMm9fZnJhbWUoc2MsIHBhcnRpdGlvbnMkdGVzdCwgc3RyaWN0X3ZlcnNpb25fY2hlY2sgPSBGQUxTRSkKCiMgZml0IGEgbGluZWFyIG1vZGVsIHRvIHRoZSB0cmFpbmluZyBkYXRhc2V0CmdsbV9tb2RlbCA8LSBoMm8uZ2xtKHggPSBjKCJ3dCIsICJjeWwiKSwgCiAgICAgICAgICAgICAgICAgICAgIHkgPSAibXBnIiwgCiAgICAgICAgICAgICAgICAgICAgIHRyYWluaW5nX2ZyYW1lID0gdHJhaW5pbmcsCiAgICAgICAgICAgICAgICAgICAgIGxhbWJkYV9zZWFyY2ggPSBUUlVFKQoKZ2xtX21vZGVsCgpsaWJyYXJ5KGdncGxvdDIpCgojIGNvbXB1dGUgcHJlZGljdGVkIHZhbHVlcyBvbiBvdXIgdGVzdCBkYXRh
c2V0CnByZWQgPC0gaDJvLnByZWRpY3QoZ2xtX21vZGVsLCBuZXdkYXRhID0gdGVzdCkKIyBjb252ZXJ0IGZyb20gSDJPIEZyYW1lIHRvIFNwYXJrIERhdGFGcmFtZQpwcmVkaWN0ZWQgPC0gYXNfc3BhcmtfZGF0YWZyYW1lKHNjLCBwcmVkLCBzdHJpY3RfdmVyc2lvbl9jaGVjayA9IEZBTFNFKQoKIyBleHRyYWN0IHRoZSB0cnVlICdtcGcnIHZhbHVlcyBmcm9tIG91ciB0ZXN0IGRhdGFzZXQKYWN0dWFsIDwtIHBhcnRpdGlvbnMkdGVzdCAlPiUKICBzZWxlY3QobXBnKSAlPiUKICBjb2xsZWN0KCkgJT4lCiAgYFtbYCgibXBnIikKCiMgcHJvZHVjZSBhIGRhdGEuZnJhbWUgaG91c2luZyBvdXIgcHJlZGljdGVkICsgYWN0dWFsICdtcGcnIHZhbHVlcwpkYXRhIDwtIGRhdGEuZnJhbWUoCiAgcHJlZGljdGVkID0gcHJlZGljdGVkLAogIGFjdHVhbCAgICA9IGFjdHVhbAopCiMgYSBidWcgaW4gZGF0YS5mcmFtZSBkb2VzIG5vdCBzZXQgY29sbmFtZXMgcHJvcGVybHk7IHJlc2V0IGhlcmUgCm5hbWVzKGRhdGEpIDwtIGMoInByZWRpY3RlZCIsICJhY3R1YWwiKQoKIyBwbG90IHByZWRpY3RlZCB2cy4gYWN0dWFsIHZhbHVlcwpnZ3Bsb3QoZGF0YSwgYWVzKHggPSBhY3R1YWwsIHkgPSBwcmVkaWN0ZWQpKSArCiAgZ2VvbV9hYmxpbmUobHR5ID0gImRhc2hlZCIsIGNvbCA9ICJyZWQiKSArCiAgZ2VvbV9wb2ludCgpICsKICB0aGVtZShwbG90LnRpdGxlID0gZWxlbWVudF90ZXh0KGhqdXN0ID0gMC41KSkgKwogIGNvb3JkX2ZpeGVkKHJhdGlvID0gMSkgKwogIGxhYnMoCiAgICB4ID0gIkFjdHVhbCBGdWVsIENvbnN1bXB0aW9uIiwKICAgIHkgPSAiUHJlZGljdGVkIEZ1ZWwgQ29uc3VtcHRpb24iLAogICAgdGl0bGUgPSAiUHJlZGljdGVkIHZzLiBBY3R1YWwgRnVlbCBDb25zdW1wdGlvbiIKICApCmBgYAojIyBTcGFyayBkaXNjb25uZWN0CmBgYHtyIGV2YWw9RkFMU0UsIG1lc3NhZ2U9VFJVRSwgd2FybmluZz1UUlVFLCAsIGVjaG89VFJVRSwgaW5jbHVkZT1UUlVFLCBwYWdlZC5wcmludD1UUlVFfQpzcGFya19kaXNjb25uZWN0KHNjKQpoMm8uc2h1dGRvd24oKQojIFRvIHN0b3AgYWxsIHJ1bm5pbmcgSDJPIHNlc3Npb25zCiMgcHMgLWVmd3cgfCBncmVwIGgybwoKYGBgCiMjIERvY2tlciBmaWxlCltEb2NrZXIgZmlsZV0oaHR0cHM6Ly9naXRodWIuY29tL2gyb2FpL2gyby0zL2Jsb2IvbWFzdGVyL2RvY2tlci9Eb2NrZXJmaWxlKQoKW0RvY2tlciBpbWFnZSBidWlsZCBhbmQgcnVuXShodHRwczovL2gyby1yZWxlYXNlLnMzLmFtYXpvbmF3cy5jb20vaDJvL3JlbC13ZWllcnN0cmFzcy83L2RvY3Mtd2Vic2l0ZS9oMm8tZG9jcy93ZWxjb21lLmh0bWwjZG9ja2VyLXVzZXJzKQoK