# R is an open-source programming language and software environment for statistical computing.
# RStudio is a free and open-source integrated development environment (IDE) for R.
# sparklyr is an R interface for Spark.
# sparklyr walkthrough: install the tooling, connect to a local Spark
# instance, run dplyr queries and ML models against Spark tables, plot a
# collected summary, and disconnect.

# Setup ----
# Install sparklyr only when it is missing, so re-running the script does not
# reinstall the package every time.
if (!requireNamespace("sparklyr", quietly = TRUE)) {
  install.packages("sparklyr") # Install sparklyr
}
library(sparklyr) # Load sparklyr
library(dplyr) # Data Manipulation Library
library(ggplot2) # Plotting

spark_install() # Install Apache Spark
sc <- spark_connect(master = "local") # Connect to local instance
spark_web(sc) # Launch Spark UI

# Copy data into Spark ----
mtcars_tbl <- copy_to(sc, mtcars) # Copy mtcars into Spark
count(mtcars_tbl) # Count records

# Models on the full table ----
# Perform linear regression
ml_linear_regression(
  mtcars_tbl,
  response = "mpg", # Response vector
  features = c("wt", "cyl") # Features for the model fit
)

# Perform kmeans clustering
ml_kmeans(
  mtcars_tbl,
  centers = 3,
  features = c("wt", "cyl") # Features for the model fit
)

# dplyr on a Spark table ----
flights_tbl <- copy_to(sc, nycflights13::flights, "flights")

# Per tail number: number of flights, mean distance and mean arrival delay.
# The result is filtered and then collected so it can be handed to ggplot2.
delay <- flights_tbl %>%
  group_by(tailnum) %>%
  summarise(count = n(), dist = mean(distance), delay = mean(arr_delay)) %>%
  filter(count > 20, dist < 2000, !is.na(delay)) %>%
  collect()

# plot delays
ggplot(delay, aes(dist, delay)) +
  geom_point(aes(size = count), alpha = 1 / 2) +
  geom_smooth() +
  scale_size_area(max_size = 2)

# partition into 'training' and 'test'
partitions <- mtcars_tbl %>%
  filter(hp >= 100) %>%
  mutate(cyl8 = cyl == 8) %>%
  sdf_partition(training = 0.5, test = 0.5, seed = 1099)

# fitting a linear model
partitions$training %>%
  ml_linear_regression(response = "mpg", features = c("wt", "cyl"))

# clustering using kmeans
# `centers` is passed as a number, matching the earlier ml_kmeans() call.
partitions$training %>%
  ml_kmeans(centers = 3, features = c("wt", "cyl"))

# Credit: @javierluraschi
spark_disconnect(sc)