2:00-2:30 sparklyr: Recap and Updates
2:40-3:10 sparklyr: Architecture and Use Cases
S was designed at Bell Laboratories by John Chambers, where computing was done by calling Fortran subroutines.
“S is great but serious data analysis will always be done in Fortran” - Bell Labs Management
The R community is noted for its active package contributions: CRAN, R's package manager, hosts ~10K packages.
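sparklyr is distributed through that same channel; a minimal sketch of getting started from a plain R session, assuming the package name as published on CRAN:

install.packages("sparklyr")  # install the released version from CRAN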
library(sparklyr)                       # Load sparklyr
spark_install()                         # Install Apache Spark
sc <- spark_connect(master = "local")   # Connect to local instance

library(dplyr)                          # Data Manipulation Grammar
mtcars_tbl <- copy_to(sc, mtcars)       # Copy mtcars into Spark
count(mtcars_tbl)                       # Count records

ml_linear_regression(mtcars_tbl,        # Perform linear regression
  response = "mpg",                     # Response vector
  features = c("wt", "cyl"))            # Features for the model fit

library(DBI)                            # R Database Interface
dbGetQuery(sc, "SELECT * FROM mtcars")  # Run SQL query in Spark

invoke(spark_context(sc), "version")    # Run sc.version in Scala
compile_package_jars()                  # Compile Scala code
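The invoke() interface is not limited to the Spark context: any JVM method reachable from the connection can be called. A minimal sketch, using java.lang.Math purely as a self-contained illustration:

invoke_static(sc, "java.lang.Math", "hypot", 10, 20)  # calls Math.hypot(10, 20), returns 22.36068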
Cross-platform installer for Apache Spark.

library(sparkinstall)
spark_install(version = "1.6.2")  # Install Spark from R

from spark_install import *
spark_install(version = "1.6.2")  # Install Spark from Python

“This project provides a cross-platform installer for Apache Spark designed to use system resources efficiently under a common API. This initial version comes with support for R and Python that arose from a collaboration between RStudio and Microsoft” - github.com/rstudio/spark-install
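sparklyr can also connect through Apache Livy instead of a local spark-submit; over Livy, each method invocation is serialized and shipped to the service, which is what the generated snippet below shows. A minimal sketch, assuming an experimental Livy endpoint at localhost:8998:

library(sparklyr)
sc <- spark_connect(master = "http://localhost:8998",  # Livy service URL (assumed)
                    method = "livy")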
var sparklyrRetVar_0 = LivyUtils.invokeFromBase64(
"AAAAHm9yZy5hcGFjaGUuc3BhcmsuU3BhcmtDb250" +
"ZXh0AAAAAAEAAAAMZ2V0T3JDcmVhdGUAAAAAAA=="
)

delay <- flights_tbl %>%
  group_by(tailnum) %>%
  summarise(count = n(), dist = mean(distance), delay = mean(arr_delay)) %>%
  filter(count > 20, dist < 2000, !is.na(delay)) %>%
  collect()
# plot delays
library(ggplot2)
ggplot(delay, aes(dist, delay)) +
  geom_point(aes(size = count), alpha = 1/2) +
  geom_smooth() +
  scale_size_area(max_size = 2)

library(DBI)
dbGetQuery(sc, "SELECT * FROM flights LIMIT 100")

# transform our data set, and then partition into 'training', 'test'
partitions <- mtcars_tbl %>%
  filter(hp >= 100) %>%
  mutate(cyl8 = cyl == 8) %>%
  sdf_partition(training = 0.5, test = 0.5, seed = 1099)
# fit a linear model to the training dataset
partitions$training %>%
  ml_linear_regression(response = "mpg", features = c("wt", "cyl"))

library(rsparkling)
library(sparklyr)
library(dplyr)
library(h2o)

sc <- spark_connect(master = "local")

mtcars_h2o <- as_h2o_frame(sc, mtcars_tbl,
                           strict_version_check = FALSE)

h2o.glm(x = c("wt", "cyl"),
        y = "mpg",
        training_frame = mtcars_h2o,
        lambda_search = TRUE)

h2o_flow(sc, strict_version_check = FALSE)

spark_disconnect(sc)

library(sparklygraphs)
library(sparklyr)
library(dplyr)

sc <- spark_connect(master = "local", version = "2.1.0")
highschool_tbl <- copy_to(sc, ggraph::highschool, "highschool")
# create a table with unique vertices using dplyr
vertices_tbl <- sdf_bind_rows(
  highschool_tbl %>% distinct(from) %>% transmute(id = from),
  highschool_tbl %>% distinct(to) %>% transmute(id = to)
)
# create a table with <source, destination> edges
edges_tbl <- highschool_tbl %>% transmute(src = from, dst = to)
# calculate PageRank over the highschool dataset
gf_graphframe(vertices_tbl, edges_tbl) %>%
  gf_pagerank(reset_prob = 0.15, max_iter = 10L, source_id = "1")

# run an arbitrary R function over each partition of the Spark data frame
spark_apply(highschool_tbl, function(x) {
  x + rgamma(1, 2)
})

spark_disconnect(sc)

@javierluraschi