#Step 1 install required packages
install.packages("sparklyr")
Error in install.packages : Updating loaded packages
install.packages("dplyr")
Error in install.packages : Updating loaded packages
install.packages("ggplot2")
Error in install.packages : Updating loaded packages
install.packages("babynames")
trying URL 'https://cran.rstudio.com/src/contrib/babynames_1.0.0.tar.gz'
Content type 'application/x-gzip' length 5508403 bytes (5.3 MB)
==================================================
downloaded 5.3 MB
* installing *source* package ‘babynames’ ...
** package ‘babynames’ successfully unpacked and MD5 sums checked
** R
** data
*** moving datasets to lazyload DB
** byte-compile and prepare package for lazy loading
** help
*** installing help indices
** building package indices
** testing if installed package can be loaded
* DONE (babynames)
The downloaded source packages are in
‘/tmp/Rtmp6sguoA/downloaded_packages’
Updating HTML index of packages in '.Library'
Making 'packages.html' ... done
install.packages("dygraphs")
trying URL 'https://cran.rstudio.com/src/contrib/dygraphs_1.1.1.6.tar.gz'
Content type 'application/x-gzip' length 318939 bytes (311 KB)
==================================================
downloaded 311 KB
* installing *source* package ‘dygraphs’ ...
** package ‘dygraphs’ successfully unpacked and MD5 sums checked
** R
** inst
** byte-compile and prepare package for lazy loading
** help
*** installing help indices
** building package indices
** testing if installed package can be loaded
* DONE (dygraphs)
The downloaded source packages are in
‘/tmp/Rtmp6sguoA/downloaded_packages’
Updating HTML index of packages in '.Library'
Making 'packages.html' ... done
install.packages("ggplot2")
trying URL 'https://cran.rstudio.com/src/contrib/ggplot2_3.2.0.tar.gz'
Content type 'application/x-gzip' length 3193995 bytes (3.0 MB)
==================================================
downloaded 3.0 MB
* installing *source* package ‘ggplot2’ ...
** package ‘ggplot2’ successfully unpacked and MD5 sums checked
** R
** data
*** moving datasets to lazyload DB
** inst
** byte-compile and prepare package for lazy loading
** help
*** installing help indices
*** copying figures
** building package indices
** installing vignettes
** testing if installed package can be loaded
* DONE (ggplot2)
The downloaded source packages are in
‘/tmp/Rtmp6sguoA/downloaded_packages’
Updating HTML index of packages in '.Library'
Making 'packages.html' ... done
install.packages("dplyr")
Error in install.packages : Updating loaded packages
install.packages("sparklyr")
Error in install.packages : Updating loaded packages
install.packages("rbokeh")
trying URL 'https://cran.rstudio.com/src/contrib/rbokeh_0.5.0.tar.gz'
Content type 'application/x-gzip' length 962899 bytes (940 KB)
==================================================
downloaded 940 KB
* installing *source* package ‘rbokeh’ ...
** package ‘rbokeh’ successfully unpacked and MD5 sums checked
** R
** data
*** moving datasets to lazyload DB
** inst
** byte-compile and prepare package for lazy loading
** help
*** installing help indices
** building package indices
** installing vignettes
** testing if installed package can be loaded
* DONE (rbokeh)
The downloaded source packages are in
‘/tmp/Rtmp6sguoA/downloaded_packages’
Updating HTML index of packages in '.Library'
Making 'packages.html' ... done
#Step 2 connect spark by using spark_connect
library(sparklyr)
library(dplyr)
Attaching package: ‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
library(ggplot2)
options(bitmapType = 'cairo')
Sys.setenv(JAVA_HOME="/opt/client172.16.4.35/JDK/jdk-8u201")
Sys.setenv(SPARK_HOME="/opt/client172.16.4.35/Spark2x/spark")
spark_version_from_home(Sys.getenv("SPARK_HOME"))
[1] "2.3.2"
Sys.setenv(SPARK_HOME_VERSION="2.3.2")
sc <- spark_connect(master = "yarn-client", version = "2.3.2", spark_home = "/opt/client172.16.4.35/Spark2x/spark")
#Step 3 Cache the tables into memory
#Use tbl_cache to load the flights table into memory. Caching tables will make analysis much faster. Create a dplyr reference to the Spark DataFrame.
# Cache flights Hive table into Spark
tbl_cache(sc, 'flights')
flights_tbl <- tbl(sc, 'flights')
# Cache airlines Hive table into Spark
tbl_cache(sc, 'airlines')
airlines_tbl <- tbl(sc, 'airlines')
# Cache airports Hive table into Spark
tbl_cache(sc, 'airports')
airports_tbl <- tbl(sc, 'airports')
#Step 4 Create a model data set
#Filter the data to contain only the records to be used in the fitted model. Join carrier descriptions for reference. Create a new variable called gain which represents the amount of time gained (or lost) in flight.
# Filter records and create target variable 'gain'
model_data <- flights_tbl %>%
filter(!is.na(arrdelay) & !is.na(depdelay) & !is.na(distance)) %>%
filter(depdelay > 15 & depdelay < 240) %>%
filter(arrdelay > -60 & arrdelay < 360) %>%
filter(year >= 2003 & year <= 2007) %>%
left_join(airlines_tbl, by = c("uniquecarrier" = "code")) %>%
mutate(gain = depdelay - arrdelay) %>%
select(year, month, arrdelay, depdelay, distance, uniquecarrier, description, gain)
# Summarize data by carrier
model_data %>%
group_by(uniquecarrier) %>%
summarize(description = min(description), gain=mean(gain),
distance=mean(distance), depdelay=mean(depdelay)) %>%
select(description, gain, distance, depdelay) %>%
arrange(gain)
Missing values are always removed in SQL.
Use `MIN(x, na.rm = TRUE)` to silence this warning
This warning is displayed only once per session.Missing values are always removed in SQL.
Use `mean(x, na.rm = TRUE)` to silence this warning
This warning is displayed only once per session.
#Step 5 Train a linear model
#Predict time gained or lost in flight as a function of distance, departure delay, and airline carrier.
# Partition the data into training and validation sets
model_partition <- model_data %>%
sdf_random_split(train = 0.8, valid = 0.2, seed = 5555)
# Fit a linear model
ml1 <- model_partition$train %>%
ml_linear_regression(gain ~ distance + depdelay + uniquecarrier)
# Summarize the linear model
summary(ml1)
Deviance Residuals (approximate):
Min 1Q Median 3Q Max
-283.413 -5.600 2.734 9.908 103.224
Coefficients:
(Intercept) distance depdelay uniquecarrier_WN uniquecarrier_AA uniquecarrier_MQ uniquecarrier_UA
-2.435372942 0.003087106 -0.013970133 5.398096950 -1.015123763 -0.385983271 0.599749767
uniquecarrier_OO uniquecarrier_US uniquecarrier_XE uniquecarrier_EV uniquecarrier_DL uniquecarrier_NW uniquecarrier_CO
0.779013095 0.649706859 0.031337739 2.873915433 -0.782354850 -2.415317047 0.053595272
uniquecarrier_YV uniquecarrier_FL uniquecarrier_OH uniquecarrier_B6 uniquecarrier_AS uniquecarrier_9E uniquecarrier_F9
4.341202518 0.173966539 -0.236840840 -1.500180801 1.261810937 1.169707134 0.137925375
uniquecarrier_AQ uniquecarrier_TZ
4.313009552 -6.093145034
R-Squared: 0.02301
Root Mean Squared Error: 17.83
#Step 6 Assess model performance
#Compare the model performance using the validation data.
# Calculate average gains by predicted decile
model_deciles <- lapply(model_partition, function(x) {
ml_predict(ml1, x) %>%
mutate(decile = ntile(desc(prediction), 10)) %>%
group_by(decile) %>%
summarize(gain = mean(gain)) %>%
select(decile, gain) %>%
collect()
})
# Create a summary dataset for plotting
deciles <- rbind(
data.frame(data = 'train', model_deciles$train),
data.frame(data = 'valid', model_deciles$valid),
make.row.names = FALSE
)
# Plot average gains by predicted decile
deciles %>%
ggplot(aes(factor(decile), gain, fill = data)) +
geom_bar(stat = 'identity', position = 'dodge') +
labs(title = 'Average gain by predicted decile', x = 'Decile', y = 'Minutes')

#Step 7 Visualize predictions
#Compare actual gains to predicted gains for an out of time sample.
# Select data from an out of time sample
data_2008 <- flights_tbl %>%
filter(!is.na(arrdelay) & !is.na(depdelay) & !is.na(distance)) %>%
filter(depdelay > 15 & depdelay < 240) %>%
filter(arrdelay > -60 & arrdelay < 360) %>%
filter(year == 2008) %>%
left_join(airlines_tbl, by = c("uniquecarrier" = "code")) %>%
mutate(gain = depdelay - arrdelay) %>%
select(year, month, arrdelay, depdelay, distance, uniquecarrier, description, gain, origin,dest)
# Summarize data by carrier
carrier <- ml_predict(ml1, data_2008) %>%
group_by(description) %>%
summarize(gain = mean(gain), prediction = mean(prediction), freq = n()) %>%
filter(freq > 10000) %>%
collect
# Plot actual gains and predicted gains by airline carrier
ggplot(carrier, aes(gain, prediction)) +
geom_point(alpha = 0.75, color = 'red', shape = 3) +
geom_abline(intercept = 0, slope = 1, alpha = 0.15, color = 'blue') +
geom_text(aes(label = substr(description, 1, 20)), size = 3, alpha = 0.75, vjust = -1) +
labs(title='Average Gains Forecast', x = 'Actual', y = 'Predicted')

LS0tCm91dHB1dDoKICBodG1sX25vdGVib29rOiBkZWZhdWx0CiAgaHRtbF9kb2N1bWVudDogZGVmYXVsdAotLS0KYGBge3J9CiNTdGVwIDEgaW5zdGFsbCByZXF1aXJlZCBwYWNrYWdlcwppbnN0YWxsLnBhY2thZ2VzKCJzcGFya2x5ciIpCmluc3RhbGwucGFja2FnZXMoImRwbHlyIikKaW5zdGFsbC5wYWNrYWdlcygiZ2dwbG90MiIpCmluc3RhbGwucGFja2FnZXMoImJhYnluYW1lcyIpCmluc3RhbGwucGFja2FnZXMoImR5Z3JhcGhzIikKaW5zdGFsbC5wYWNrYWdlcygicmJva2VoIikKYGBgCgpgYGB7cn0KI1N0ZXAgMiBjb25uZWN0IHNwYXJrIGJ5IHVzaW5nIHNwYXJrX2Nvbm5lY3QKbGlicmFyeShzcGFya2x5cikKbGlicmFyeShkcGx5cikKbGlicmFyeShnZ3Bsb3QyKQpvcHRpb25zKGJpdG1hcFR5cGUgPSAnY2Fpcm8nKQpTeXMuc2V0ZW52KEpBVkFfSE9NRT0iL29wdC9jbGllbnQxNzIuMTYuNC4zNS9KREsvamRrLTh1MjAxIikKU3lzLnNldGVudihTUEFSS19IT01FPSIvb3B0L2NsaWVudDE3Mi4xNi40LjM1L1NwYXJrMngvc3BhcmsiKQpzcGFya192ZXJzaW9uX2Zyb21faG9tZShTeXMuZ2V0ZW52KCJTUEFSS19IT01FIikpClN5cy5zZXRlbnYoU1BBUktfSE9NRV9WRVJTSU9OPSIyLjMuMiIpCnNjIDwtIHNwYXJrX2Nvbm5lY3QobWFzdGVyID0gICJ5YXJuLWNsaWVudCIsIHZlcnNpb24gPSAiMi4zLjIiLCBzcGFya19ob21lID0gIi9vcHQvY2xpZW50MTcyLjE2LjQuMzUvU3BhcmsyeC9zcGFyayIpCmBgYAoKYGBge3J9CiNTdGVwIDMgQ2FjaGUgdGhlIHRhYmxlcyBpbnRvIG1lbW9yeQoKI1VzZSB0YmxfY2FjaGUgdG8gbG9hZCB0aGUgZmxpZ2h0cyB0YWJsZSBpbnRvIG1lbW9yeS4gQ2FjaGluZyB0YWJsZXMgd2lsbCBtYWtlIGFuYWx5c2lzIG11Y2ggZmFzdGVyLiBDcmVhdGUgYSBkcGx5ciByZWZlcmVuY2UgdG8gdGhlIFNwYXJrIERhdGFGcmFtZS4KCiMgQ2FjaGUgZmxpZ2h0cyBIaXZlIHRhYmxlIGludG8gU3BhcmsKdGJsX2NhY2hlKHNjLCAnZmxpZ2h0cycpCmZsaWdodHNfdGJsIDwtIHRibChzYywgJ2ZsaWdodHMnKQoKIyBDYWNoZSBhaXJsaW5lcyBIaXZlIHRhYmxlIGludG8gU3BhcmsKdGJsX2NhY2hlKHNjLCAnYWlybGluZXMnKQphaXJsaW5lc190YmwgPC0gdGJsKHNjLCAnYWlybGluZXMnKQoKIyBDYWNoZSBhaXJwb3J0cyBIaXZlIHRhYmxlIGludG8gU3BhcmsKdGJsX2NhY2hlKHNjLCAnYWlycG9ydHMnKQphaXJwb3J0c190YmwgPC0gdGJsKHNjLCAnYWlycG9ydHMnKQoKYGBgCgpgYGB7cn0KI1N0ZXAgNCBDcmVhdGUgYSBtb2RlbCBkYXRhIHNldAoKI0ZpbHRlciB0aGUgZGF0YSB0byBjb250YWluIG9ubHkgdGhlIHJlY29yZHMgdG8gYmUgdXNlZCBpbiB0aGUgZml0dGVkIG1vZGVsLiBKb2luIGNhcnJpZXIgZGVzY3JpcHRpb25zIGZvciByZWZlcmVuY2UuIENyZWF0ZSBhIG5ldyB2YXJpYWJsZSBjYWxsZWQgZ2FpbiB3aGljaCByZXByZXNlbnRzIHRoZSBhbW91bnQgb2YgdGltZSBnYWluZWQgKG9yIGxvc3QpIGluIGZsaWdodC4KCiMgRmlsdGVyIHJlY29yZHMgYW5kIGNyZWF0ZSB0YXJnZXQgdmFyaWFibGUgJ2dhaW4nCm1vZGVsX2RhdGEgPC0gZmxpZ2h0c190YmwgJT4lCmZpbHRlcighaXMubmEoYXJyZGVsYXkpICYgIWlzLm5hKGRlcGRlbGF5KSAmICFpcy5uYShkaXN0YW5jZSkpICU+JQpmaWx0ZXIoZGVwZGVsYXkgPiAxNSAmIGRlcGRlbGF5IDwgMjQwKSAlPiUKZmlsdGVyKGFycmRlbGF5ID4gLTYwICYgYXJyZGVsYXkgPCAzNjApICU+JQpmaWx0ZXIoeWVhciA+PSAyMDAzICYgeWVhciA8PSAyMDA3KSAlPiUKbGVmdF9qb2luKGFpcmxpbmVzX3RibCwgYnkgPSBjKCJ1bmlxdWVjYXJyaWVyIiA9ICJjb2RlIikpICU+JQptdXRhdGUoZ2FpbiA9IGRlcGRlbGF5IC0gYXJyZGVsYXkpICU+JQpzZWxlY3QoeWVhciwgbW9udGgsIGFycmRlbGF5LCBkZXBkZWxheSwgZGlzdGFuY2UsIHVuaXF1ZWNhcnJpZXIsIGRlc2NyaXB0aW9uLCBnYWluKQoKIyBTdW1tYXJpemUgZGF0YSBieSBjYXJyaWVyCm1vZGVsX2RhdGEgJT4lCmdyb3VwX2J5KHVuaXF1ZWNhcnJpZXIpICU+JQpzdW1tYXJpemUoZGVzY3JpcHRpb24gPSBtaW4oZGVzY3JpcHRpb24pLCBnYWluPW1lYW4oZ2FpbiksCiAgICAgICAgICBkaXN0YW5jZT1tZWFuKGRpc3RhbmNlKSwgZGVwZGVsYXk9bWVhbihkZXBkZWxheSkpICU+JQpzZWxlY3QoZGVzY3JpcHRpb24sIGdhaW4sIGRpc3RhbmNlLCBkZXBkZWxheSkgJT4lCmFycmFuZ2UoZ2FpbikKYGBgCmBgYHtyfQojU3RlcCA1IFRyYWluIGEgbGluZWFyIG1vZGVsCgojUHJlZGljdCB0aW1lIGdhaW5lZCBvciBsb3N0IGluIGZsaWdodCBhcyBhIGZ1bmN0aW9uIG9mIGRpc3RhbmNlLCBkZXBhcnR1cmUgZGVsYXksIGFuZCBhaXJsaW5lIGNhcnJpZXIuCgojIFBhcnRpdGlvbiB0aGUgZGF0YSBpbnRvIHRyYWluaW5nIGFuZCB2YWxpZGF0aW9uIHNldHMKbW9kZWxfcGFydGl0aW9uIDwtIG1vZGVsX2RhdGEgJT4lCnNkZl9yYW5kb21fc3BsaXQodHJhaW4gPSAwLjgsIHZhbGlkID0gMC4yLCBzZWVkID0gNTU1NSkKCiMgRml0IGEgbGluZWFyIG1vZGVsCm1sMSA8LSBtb2RlbF9wYXJ0aXRpb24kdHJhaW4gJT4lCm1sX2xpbmVhcl9yZWdyZXNzaW9uKGdhaW4gfiBkaXN0YW5jZSArIGRlcGRlbGF5ICsgdW5pcXVlY2FycmllcikKCiMgU3VtbWFyaXplIHRoZSBsaW5lYXIgbW9kZWwKc3VtbWFyeShtbDEpCgpgYGAKCmBgYHtyfQojU3RlcCA2IEFzc2VzcyBtb2RlbCBwZXJmb3JtYW5jZQoKI0NvbXBhcmUgdGhlIG1vZGVsIHBlcmZvcm1hbmNlIHVzaW5nIHRoZSB2YWxpZGF0aW9uIGRhdGEuCgojIENhbGN1bGF0ZSBhdmVyYWdlIGdhaW5zIGJ5IHByZWRpY3RlZCBkZWNpbGUKbW9kZWxfZGVjaWxlcyA8LSBsYXBwbHkobW9kZWxfcGFydGl0aW9uLCBmdW5jdGlvbih4KSB7CiAgbWxfcHJlZGljdChtbDEsIHgpICU+JQogICAgbXV0YXRlKGRlY2lsZSA9IG50aWxlKGRlc2MocHJlZGljdGlvbiksIDEwKSkgJT4lCiAgICBncm91cF9ieShkZWNpbGUpICU+JQogICAgc3VtbWFyaXplKGdhaW4gPSBtZWFuKGdhaW4pKSAlPiUKICAgIHNlbGVjdChkZWNpbGUsIGdhaW4pICU+JQogICAgY29sbGVjdCgpCn0pCgojIENyZWF0ZSBhIHN1bW1hcnkgZGF0YXNldCBmb3IgcGxvdHRpbmcKZGVjaWxlcyA8LSByYmluZCgKICBkYXRhLmZyYW1lKGRhdGEgPSAndHJhaW4nLCBtb2RlbF9kZWNpbGVzJHRyYWluKSwKICBkYXRhLmZyYW1lKGRhdGEgPSAndmFsaWQnLCBtb2RlbF9kZWNpbGVzJHZhbGlkKSwKICBtYWtlLnJvdy5uYW1lcyA9IEZBTFNFCikKCiMgUGxvdCBhdmVyYWdlIGdhaW5zIGJ5IHByZWRpY3RlZCBkZWNpbGUKZGVjaWxlcyAlPiUKICBnZ3Bsb3QoYWVzKGZhY3RvcihkZWNpbGUpLCBnYWluLCBmaWxsID0gZGF0YSkpICsKICBnZW9tX2JhcihzdGF0ID0gJ2lkZW50aXR5JywgcG9zaXRpb24gPSAnZG9kZ2UnKSArCiAgbGFicyh0aXRsZSA9ICdBdmVyYWdlIGdhaW4gYnkgcHJlZGljdGVkIGRlY2lsZScsIHggPSAnRGVjaWxlJywgeSA9ICdNaW51dGVzJykKCgpgYGAKCmBgYHtyfQojU3RlcCA3ICBWaXN1YWxpemUgcHJlZGljdGlvbnMKCiNDb21wYXJlIGFjdHVhbCBnYWlucyB0byBwcmVkaWN0ZWQgZ2FpbnMgZm9yIGFuIG91dCBvZiB0aW1lIHNhbXBsZS4KCiMgU2VsZWN0IGRhdGEgZnJvbSBhbiBvdXQgb2YgdGltZSBzYW1wbGUKZGF0YV8yMDA4IDwtIGZsaWdodHNfdGJsICU+JQogIGZpbHRlcighaXMubmEoYXJyZGVsYXkpICYgIWlzLm5hKGRlcGRlbGF5KSAmICFpcy5uYShkaXN0YW5jZSkpICU+JQogIGZpbHRlcihkZXBkZWxheSA+IDE1ICYgZGVwZGVsYXkgPCAyNDApICU+JQogIGZpbHRlcihhcnJkZWxheSA+IC02MCAmIGFycmRlbGF5IDwgMzYwKSAlPiUKICBmaWx0ZXIoeWVhciA9PSAyMDA4KSAlPiUKICBsZWZ0X2pvaW4oYWlybGluZXNfdGJsLCBieSA9IGMoInVuaXF1ZWNhcnJpZXIiID0gImNvZGUiKSkgJT4lCiAgbXV0YXRlKGdhaW4gPSBkZXBkZWxheSAtIGFycmRlbGF5KSAlPiUKICBzZWxlY3QoeWVhciwgbW9udGgsIGFycmRlbGF5LCBkZXBkZWxheSwgZGlzdGFuY2UsIHVuaXF1ZWNhcnJpZXIsIGRlc2NyaXB0aW9uLCBnYWluLCBvcmlnaW4sZGVzdCkKCiMgU3VtbWFyaXplIGRhdGEgYnkgY2FycmllcgpjYXJyaWVyIDwtIG1sX3ByZWRpY3QobWwxLCBkYXRhXzIwMDgpICU+JQogIGdyb3VwX2J5KGRlc2NyaXB0aW9uKSAlPiUKICBzdW1tYXJpemUoZ2FpbiA9IG1lYW4oZ2FpbiksIHByZWRpY3Rpb24gPSBtZWFuKHByZWRpY3Rpb24pLCBmcmVxID0gbigpKSAlPiUKICBmaWx0ZXIoZnJlcSA+IDEwMDAwKSAlPiUKICBjb2xsZWN0CgojIFBsb3QgYWN0dWFsIGdhaW5zIGFuZCBwcmVkaWN0ZWQgZ2FpbnMgYnkgYWlybGluZSBjYXJyaWVyCmdncGxvdChjYXJyaWVyLCBhZXMoZ2FpbiwgcHJlZGljdGlvbikpICsKICBnZW9tX3BvaW50KGFscGhhID0gMC43NSwgY29sb3IgPSAncmVkJywgc2hhcGUgPSAzKSArCiAgZ2VvbV9hYmxpbmUoaW50ZXJjZXB0ID0gMCwgc2xvcGUgPSAxLCBhbHBoYSA9IDAuMTUsIGNvbG9yID0gJ2JsdWUnKSArCiAgZ2VvbV90ZXh0KGFlcyhsYWJlbCA9IHN1YnN0cihkZXNjcmlwdGlvbiwgMSwgMjApKSwgc2l6ZSA9IDMsIGFscGhhID0gMC43NSwgdmp1c3QgPSAtMSkgKwogIGxhYnModGl0bGU9J0F2ZXJhZ2UgR2FpbnMgRm9yZWNhc3QnLCB4ID0gJ0FjdHVhbCcsIHkgPSAnUHJlZGljdGVkJykKYGBgCgo=