#Description, Summary, and Page URL
NYC Flight Delays. This dataset contains information about flights departing from NYC airports (EWR, JFK, and LGA) in 2013. The data contains 336,776 rows and 19 columns: year, month, day, dep_time, sched_dep_time, dep_delay, arr_time, sched_arr_time, arr_delay, carrier, flight, tailnum, origin, dest, air_time, distance, hour, minute, and time_hour.
Link to dataset https://www.kaggle.com/lampubhutia/nyc-flight-delay
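The same data also ships with the nycflights13 R package, so an alternative to the Kaggle CSV is loading it directly:
# install.packages("nycflights13")
library(nycflights13)  # provides the identical flights table
dim(flights)           # 336776 rows, 19 columns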
##Load Data and Show Summary Statistics
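The chunks below assume the following packages are attached; this setup is inferred from the functions used throughout the report:
library(tidyverse) # readr, dplyr, ggplot2, tidyr, purrr, stringr, tibble, forcats
library(lubridate) # ymd(), wday()
library(skimr)     # skim_to_list()
library(knitr)     # kable()
library(recipes)   # recipe(), step_*(), prep(), bake()
library(h2o)       # deep learning, grid search, AutoML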
flights <- read_csv("flight_data.csv")
## Parsed with column specification:
## cols(
## year = col_double(),
## month = col_double(),
## day = col_double(),
## dep_time = col_double(),
## sched_dep_time = col_double(),
## dep_delay = col_double(),
## arr_time = col_double(),
## sched_arr_time = col_double(),
## arr_delay = col_double(),
## carrier = col_character(),
## flight = col_double(),
## tailnum = col_character(),
## origin = col_character(),
## dest = col_character(),
## air_time = col_double(),
## distance = col_double(),
## hour = col_double(),
## minute = col_double(),
## time_hour = col_character()
## )
summary(flights)
## year month day dep_time
## Min. :2013 Min. : 1.000 Min. : 1.00 Min. : 1
## 1st Qu.:2013 1st Qu.: 4.000 1st Qu.: 8.00 1st Qu.: 907
## Median :2013 Median : 7.000 Median :16.00 Median :1401
## Mean :2013 Mean : 6.549 Mean :15.71 Mean :1349
## 3rd Qu.:2013 3rd Qu.:10.000 3rd Qu.:23.00 3rd Qu.:1744
## Max. :2013 Max. :12.000 Max. :31.00 Max. :2400
## NA's :8255
## sched_dep_time dep_delay arr_time sched_arr_time
## Min. : 106 Min. : -43.00 Min. : 1 Min. : 1
## 1st Qu.: 906 1st Qu.: -5.00 1st Qu.:1104 1st Qu.:1124
## Median :1359 Median : -2.00 Median :1535 Median :1556
## Mean :1344 Mean : 12.64 Mean :1502 Mean :1536
## 3rd Qu.:1729 3rd Qu.: 11.00 3rd Qu.:1940 3rd Qu.:1945
## Max. :2359 Max. :1301.00 Max. :2400 Max. :2359
## NA's :8255 NA's :8713
## arr_delay carrier flight tailnum
## Min. : -86.000 Length:336776 Min. : 1 Length:336776
## 1st Qu.: -17.000 Class :character 1st Qu.: 553 Class :character
## Median : -5.000 Mode :character Median :1496 Mode :character
## Mean : 6.895 Mean :1972
## 3rd Qu.: 14.000 3rd Qu.:3465
## Max. :1272.000 Max. :8500
## NA's :9430
## origin dest air_time distance
## Length:336776 Length:336776 Min. : 20.0 Min. : 17
## Class :character Class :character 1st Qu.: 82.0 1st Qu.: 502
## Mode :character Mode :character Median :129.0 Median : 872
## Mean :150.7 Mean :1040
## 3rd Qu.:192.0 3rd Qu.:1389
## Max. :695.0 Max. :4983
## NA's :9430
## hour minute time_hour
## Min. : 1.00 Min. : 0.00 Length:336776
## 1st Qu.: 9.00 1st Qu.: 8.00 Class :character
## Median :13.00 Median :29.00 Mode :character
## Mean :13.18 Mean :26.23
## 3rd Qu.:17.00 3rd Qu.:44.00
## Max. :23.00 Max. :59.00
##
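Before visualizing, note the NA counts in the summary: dep_time is missing for 8,255 flights (departures that never happened, i.e. likely cancellations) and arr_delay for 9,430. The na.omit() call below drops these rows. The counts can be reproduced directly:
# Missing values per column (matches the NA's rows in the summary above)
flights %>% summarise_all(~ sum(is.na(.)))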
#Visualize
Plot1 - Number of flight delays
data <- flights %>%
na.omit() %>%
sample_n(10000)
plot1 <- ggplot(data) +
geom_freqpoly(aes(x = dep_delay), binwidth = 2.5) +
labs(x = "departure delay (minutes)", y = "number of delays")
plot1
Plot2 - Flight delays (minutes) by distance
plot2 <- ggplot(data, aes(x=distance, y=dep_delay)) +
geom_point()
plot2
Plot3 - Number of flight delays by departure hour
plot3 <- ggplot(data, aes(x=hour)) +
geom_bar() +
labs(x = "departure hour", y= "number of delays" )
plot3
Plot4 - Number of delays by carrier and origin
plot4 <- ggplot(data, aes(origin)) +
geom_bar() +
facet_wrap(~carrier) +
labs(x = "origin", y= "number of delays" )
plot4
#Summarize and discuss the patterns
A random subset of the dataset (10,000 rows, after dropping rows with missing values) was used for the visualizations. Plot1 shows that the majority of flights from NYC airports departed on time in 2013. Plot2 plots dep_delay against distance for all three airports; flights over shorter distances experienced delays similar to those over longer distances, suggesting that distance has a limited impact on flight delays. Plot3 shows that flights were scheduled to depart between 5 a.m. and midnight, and that the majority of flight delays occurred in the morning between 5 a.m. and 9 a.m. Plot4 shows that carriers EV, UA, and B6 experienced the largest number of delays, with those delayed flights originating out of EWR and JFK.
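These readings of the plots can be cross-checked numerically; a minimal sketch using the same `data` sample:
# Verify the Plot4 reading: delayed departures counted by carrier and origin
data %>%
  filter(dep_delay > 0) %>%
  count(carrier, origin, sort = TRUE) %>%
  head()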
#Preprocess Data
#add target variable
flights_df <- flights %>%
add_column(delayed = ifelse(flights$arr_delay > 0, 1, 0)) %>%
na.omit()
#calculate day of week
date <- ymd(str_c(flights_df$year, "-",flights_df$month, "-", flights_df$day))
day_of_week <- as.character(wday(date, label=TRUE))
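As an aside, lubridate can construct the same dates without string assembly; make_date() is equivalent to the ymd(str_c(...)) call above:
# Equivalent, more direct date construction
date <- make_date(flights_df$year, flights_df$month, flights_df$day)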
#add day of week column and select predictors
flights_df <- flights_df %>%
add_column(day_of_week) %>%
select(day_of_week, hour, carrier, origin, dest, delayed)
summary(flights_df)
## day_of_week hour carrier origin
## Length:327346 Min. : 5.00 Length:327346 Length:327346
## Class :character 1st Qu.: 9.00 Class :character Class :character
## Mode :character Median :13.00 Mode :character Mode :character
## Mean :13.14
## 3rd Qu.:17.00
## Max. :23.00
## dest delayed
## Length:327346 Min. :0.0000
## Class :character 1st Qu.:0.0000
## Mode :character Median :0.0000
## Mean :0.4063
## 3rd Qu.:1.0000
## Max. :1.0000
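The mean of the 0/1 target doubles as the class balance: about 41% of the retained flights arrived late, so the classes are imbalanced, though not severely (this is revisited in the Evaluation):
# Share of flights labeled delayed (arr_delay > 0)
mean(flights_df$delayed)  # 0.4063, matching the summary above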
##Split data into train (60%) and test (40%)
# partition the data
train.index <- sample(c(1:dim(flights_df)[1]), dim(flights_df)[1]*0.6)
train.df <- flights_df[train.index, ]
test.df <- flights_df[-train.index, ]
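One caveat: sample() is not seeded here, so the 60/40 partition changes on every knit. A reproducible variant (the seed value is arbitrary):
set.seed(2013)  # arbitrary seed; makes the partition repeatable
train.index <- sample(seq_len(nrow(flights_df)), nrow(flights_df) * 0.6)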
# Training data: Separate into x and y tibbles
x_train_tbl <- train.df %>% select(-delayed)
y_train_tbl <- train.df %>% select(delayed)
# Testing data:
x_test_tbl <- test.df
# Remove the original data to save memory
rm(train.df)
rm(test.df)
##Inspect Data
x_train_tbl_skim <- skim_to_list(x_train_tbl)
names(x_train_tbl_skim)
## [1] "character" "numeric"
kable(x_train_tbl_skim$character)
| variable | missing | complete | n | min | max | empty | n_unique |
|---|---|---|---|---|---|---|---|
| carrier | 0 | 196407 | 196407 | 2 | 2 | 0 | 16 |
| day_of_week | 0 | 196407 | 196407 | 3 | 3 | 0 | 7 |
| dest | 0 | 196407 | 196407 | 3 | 3 | 0 | 104 |
| origin | 0 | 196407 | 196407 | 3 | 3 | 0 | 3 |
kable(x_train_tbl_skim$numeric)
| variable | missing | complete | n | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|---|
| hour | 0 | 196407 | 196407 | 13.13 | 4.66 | 5 | 9 | 13 | 17 | 23 | ▇▆▅▇▆▆▅▂ |
##Convert Data
string_2_factor_names <- x_train_tbl %>%
select_if(is.character) %>%
names()
kable(string_2_factor_names)
| x |
|---|
| day_of_week |
| carrier |
| origin |
| dest |
unique_numeric_values_tbl <- x_train_tbl %>%
select_if(is.numeric) %>%
purrr::map_df(~ unique(.) %>%
length()) %>%
gather() %>%
arrange(value) %>%
mutate(key = as_factor(key))
factor_limit <- 7 # if a numeric column has fewer than 7 distinct values,
# we treat it as a factor
num_2_factor_names <- unique_numeric_values_tbl %>%
filter(value < factor_limit) %>%
arrange(desc(value)) %>%
pull(key) %>%
as.character()
kable(num_2_factor_names)
| x |
|---|
(The table is empty: hour, the only numeric predictor, has 19 distinct values, so no numeric columns are converted to factors.)
##Data Transformation
rec_obj <- recipe(~ ., data = x_train_tbl) %>%
step_string2factor(string_2_factor_names) %>%
step_meanimpute(all_numeric()) %>%
step_modeimpute(all_nominal()) %>%
prep(stringsAsFactors = FALSE)
x_train_processed_tbl <- bake(rec_obj, x_train_tbl)
x_test_processed_tbl <- bake(rec_obj, x_test_tbl)
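A quick sanity check that the imputation steps behaved as expected on the baked training data (every column should report zero missing values):
# All zeros if step_meanimpute/step_modeimpute filled every NA
colSums(is.na(x_train_processed_tbl))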
x_train_tbl %>%
select(1:5) %>%
glimpse()
## Observations: 196,407
## Variables: 5
## $ day_of_week <chr> "Tue", "Sat", "Mon", "Mon", "Sat", "Fri", "Fri", "Th…
## $ hour <dbl> 18, 13, 15, 12, 16, 7, 6, 18, 15, 7, 12, 8, 15, 15, …
## $ carrier <chr> "MQ", "B6", "B6", "WN", "B6", "AA", "UA", "EV", "AA"…
## $ origin <chr> "LGA", "LGA", "JFK", "EWR", "JFK", "LGA", "LGA", "EW…
## $ dest <chr> "CLE", "SRQ", "PIT", "STL", "BOS", "MIA", "DEN", "MS…
x_train_processed_tbl %>%
select(1:5) %>%
glimpse()
## Observations: 196,407
## Variables: 5
## $ day_of_week <fct> Tue, Sat, Mon, Mon, Sat, Fri, Fri, Thu, Fri, Tue, Su…
## $ hour <dbl> 18, 13, 15, 12, 16, 7, 6, 18, 15, 7, 12, 8, 15, 15, …
## $ carrier <fct> MQ, B6, B6, WN, B6, AA, UA, EV, AA, UA, EV, EV, EV, …
## $ origin <fct> LGA, LGA, JFK, EWR, JFK, LGA, LGA, EWR, JFK, EWR, EW…
## $ dest <fct> CLE, SRQ, PIT, STL, BOS, MIA, DEN, MSP, SJU, BOS, CM…
rec_obj_for_y <- recipe(~ ., data = y_train_tbl) %>%
step_num2factor("delayed") %>%
prep(stringsAsFactors = FALSE)
y_train_processed_tbl <- bake(rec_obj_for_y, y_train_tbl)
kable(head(y_train_tbl))
| delayed |
|---|
| 0 |
| 1 |
| 1 |
| 0 |
| 0 |
| 0 |
kable(head(y_train_processed_tbl))
| delayed |
|---|
| 0 |
| 1 |
| 1 |
| 0 |
| 0 |
| 0 |
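The two kable outputs print identically, but the recipe changed the column's type, which is what lets h2o treat delayed as a classification target rather than a regression one:
class(y_train_tbl$delayed)            # "numeric"
class(y_train_processed_tbl$delayed)  # "factor"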
#Analysis of Data
h2o.init(nthreads = -1) #-1 for using all cores
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 39 minutes 56 seconds
## H2O cluster timezone: America/New_York
## H2O data parsing timezone: UTC
## H2O cluster version: 3.26.0.2
## H2O cluster version age: 3 months
## H2O cluster name: H2O_started_from_R_catresa_dhm254
## H2O cluster total nodes: 1
## H2O cluster total memory: 1.61 GB
## H2O cluster total cores: 4
## H2O cluster allowed cores: 4
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: Amazon S3, XGBoost, Algos, AutoML, Core V3, Core V4
## R Version: R version 3.6.1 (2019-07-05)
h2o.removeAll() ## clean slate - just in case the cluster was already running
##Push data into h2o
# push data into h2o
data_h2o <- as.h2o(
bind_cols(y_train_processed_tbl, x_train_processed_tbl),
destination_frame="train.hex" #destination_frame is optional
)
new_data_h2o <- as.h2o(
x_test_processed_tbl,
destination_frame= "test.hex" #destination_frame is optional
)
h2o.ls()
## key
## 1 test.hex
## 2 train.hex
##Split Data
# Partition the data into training, validation and test sets
splits <- h2o.splitFrame(data = data_h2o,
ratios = c(0.7, 0.15), # 70/15/15 split
seed = 1234)
train_h2o <- splits[[1]]
valid_h2o <- splits[[2]]
test_h2o <- splits[[3]]
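h2o.splitFrame splits approximately (it avoids a full shuffle for speed), so the realized row counts only come close to 70/15/15; a quick check:
# Realized split sizes; ratios are approximate by design
sapply(list(train = train_h2o, valid = valid_h2o, test = test_h2o), h2o.nrow)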
##Modeling - Deep Learning
y <- "delayed"
x <- setdiff(names(train_h2o), y)
m1 <- h2o.deeplearning(x = x, y = y, training_frame = train_h2o,
model_id = "dl_model_first",
validation_frame = valid_h2o
#activation="Rectifier",
#hidden=c(200,200),
#epochs = 1
)
##Summarize Model
summary(m1)
## Model Details:
## ==============
##
## H2OBinomialModel: deeplearning
## Model Key: dl_model_first
## Status of Neuron Layers: predicting delayed, 2-class classification, bernoulli distribution, CrossEntropy loss, 67,802 weights/biases, 804.4 KB, 1,381,266 training samples, mini-batch size 1
## layer units type dropout l1 l2 mean_rate rate_rms
## 1 1 135 Input 0.00 % NA NA NA NA
## 2 2 200 Rectifier 0.00 % 0.000000 0.000000 0.112652 0.299247
## 3 3 200 Rectifier 0.00 % 0.000000 0.000000 0.229585 0.254219
## 4 4 2 Softmax NA 0.000000 0.000000 0.025146 0.011515
## momentum mean_weight weight_rms mean_bias bias_rms
## 1 NA NA NA NA NA
## 2 0.000000 -0.002481 0.096833 -0.378921 0.193539
## 3 0.000000 -0.012218 0.081928 -0.289252 0.236503
## 4 0.000000 -0.017251 0.273693 -0.493510 0.076352
##
## H2OBinomialMetrics: deeplearning
## ** Reported on training data. **
## ** Metrics reported on temporary training frame with 10021 samples **
##
## MSE: 0.225033
## RMSE: 0.4743764
## LogLoss: 0.6409102
## Mean Per-Class Error: 0.4204997
## AUC: 0.6492381
## pr_auc: 0.5488434
## Gini: 0.2984762
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## 0 1 Error Rate
## 0 1790 4167 0.699513 =4167/5957
## 1 575 3489 0.141486 =575/4064
## Totals 2365 7656 0.473206 =4742/10021
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.303759 0.595392 281
## 2 max f2 0.160827 0.774653 375
## 3 max f0point5 0.430315 0.530325 174
## 4 max accuracy 0.502418 0.634667 113
## 5 max precision 0.754658 1.000000 0
## 6 max recall 0.117588 1.000000 390
## 7 max specificity 0.754658 1.000000 0
## 8 max absolute_mcc 0.393585 0.212327 204
## 9 max min_per_class_accuracy 0.402834 0.605807 196
## 10 max mean_per_class_accuracy 0.393585 0.608105 204
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: deeplearning
## ** Reported on validation data. **
## ** Metrics reported on full validation frame **
##
## MSE: 0.2272802
## RMSE: 0.4767391
## LogLoss: 0.6459797
## Mean Per-Class Error: 0.4406615
## AUC: 0.6386074
## pr_auc: 0.5348459
## Gini: 0.2772148
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## 0 1 Error Rate
## 0 3808 13679 0.782238 =13679/17487
## 1 1180 10729 0.099085 =1180/11909
## Totals 4988 24408 0.505477 =14859/29396
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.277653 0.590853 303
## 2 max f2 0.138852 0.773437 384
## 3 max f0point5 0.430016 0.522867 175
## 4 max accuracy 0.484459 0.627875 130
## 5 max precision 0.685256 0.741935 15
## 6 max recall 0.064302 1.000000 397
## 7 max specificity 0.761105 0.999943 0
## 8 max absolute_mcc 0.461792 0.198078 151
## 9 max min_per_class_accuracy 0.402032 0.597027 198
## 10 max mean_per_class_accuracy 0.430016 0.599065 175
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
##
##
## Scoring History:
## timestamp duration training_speed epochs iterations
## 1 2019-10-27 20:32:01 0.000 sec NA 0.00000 0
## 2 2019-10-27 20:32:06 6.234 sec 5449 obs/sec 0.17904 1
## 3 2019-10-27 20:32:23 23.525 sec 18443 obs/sec 2.68711 15
## 4 2019-10-27 20:32:41 42.019 sec 22086 obs/sec 5.91555 33
## 5 2019-10-27 20:32:58 59.005 sec 23638 obs/sec 8.96540 50
## 6 2019-10-27 20:33:05 1 min 5.966 sec 24041 obs/sec 10.04667 56
## 7 2019-10-27 20:33:07 1 min 7.571 sec 24039 obs/sec 10.04667 56
## samples training_rmse training_logloss training_r2 training_auc
## 1 0.000000 NA NA NA NA
## 2 24616.000000 0.55346 0.87444 -0.27060 0.60805
## 3 369437.000000 0.47755 0.64763 0.05402 0.63909
## 4 813300.000000 0.47642 0.64508 0.05851 0.64351
## 5 1232608.000000 0.47438 0.64091 0.06656 0.64924
## 6 1381266.000000 0.47555 0.64339 0.06192 0.64688
## 7 1381266.000000 0.47438 0.64091 0.06656 0.64924
## training_pr_auc training_lift training_classification_error
## 1 NA NA NA
## 2 0.49968 1.58690 0.53647
## 3 0.54098 1.85545 0.50075
## 4 0.54615 1.97752 0.48528
## 5 0.54884 1.95311 0.47321
## 6 0.55014 2.00193 0.49336
## 7 0.54884 1.95311 0.47321
## validation_rmse validation_logloss validation_r2 validation_auc
## 1 NA NA NA NA
## 2 0.55333 0.87476 -0.27043 0.60462
## 3 0.47841 0.64934 0.05029 0.63558
## 4 0.47770 0.64784 0.05313 0.63869
## 5 0.47674 0.64598 0.05692 0.63861
## 6 0.47785 0.64818 0.05254 0.63739
## 7 0.47674 0.64598 0.05692 0.63861
## validation_pr_auc validation_lift validation_classification_error
## 1 NA NA NA
## 2 0.49940 1.66511 0.54518
## 3 0.53268 1.77992 0.48786
## 4 0.53474 1.80511 0.49146
## 5 0.53485 1.83030 0.50548
## 6 0.53272 1.77153 0.50881
## 7 0.53485 1.83030 0.50548
##
## Variable Importances: (Extract with `h2o.varimp`)
## =================================================
##
## Variable Importances:
## variable relative_importance scaled_importance percentage
## 1 origin.LGA 1.000000 1.000000 0.011345
## 2 origin.JFK 0.972324 0.972324 0.011031
## 3 day_of_week.Sun 0.955083 0.955083 0.010835
## 4 hour 0.944375 0.944375 0.010714
## 5 carrier.EV 0.941654 0.941654 0.010683
##
## ---
## variable relative_importance scaled_importance
## 130 dest.BGR 0.573316 0.573316
## 131 dest.TVC 0.565988 0.565988
## 132 dest.missing(NA) 0.000000 0.000000
## 133 carrier.missing(NA) 0.000000 0.000000
## 134 day_of_week.missing(NA) 0.000000 0.000000
## 135 origin.missing(NA) 0.000000 0.000000
## percentage
## 130 0.006504
## 131 0.006421
## 132 0.000000
## 133 0.000000
## 134 0.000000
## 135 0.000000
##Save Model
h2o.saveModel(object=m1, # the model you want to save
path=getwd(), # the folder to save
force=TRUE) # whether to overwrite an existing file
## [1] "/Users/catresa/gsu/CIS8392/assignments/dl_model_first"
model_filepath <- str_c(getwd(), "/dl_model_first") # dl_model_first is the model_id
##Load Deep Learning Model
m1 <- h2o.loadModel(model_filepath)
##Random Hyper-Parameter Search
hyper_params2 <- list(
activation = c("Rectifier", "Tanh", "Maxout", "RectifierWithDropout",
"TanhWithDropout", "MaxoutWithDropout"),
hidden = list( c(20,20), c(50,50), c(30,30,30), c(25,25,25,25)),
input_dropout_ratio = c(0, 0.05),
l1 = seq(0, 1e-4, 1e-6),
l2 = seq(0, 1e-4, 1e-6)
)
search_criteria = list(
strategy = "RandomDiscrete",
seed=1234567,
stopping_metric = "AUTO",
stopping_rounds=5,
stopping_tolerance=1e-2,
max_runtime_secs = 360,
max_models = 100
)
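For a sense of scale, the full grid implied by hyper_params2 is far too large to search exhaustively, which is why RandomDiscrete with a time and model budget is used:
# Total combinations: activation x hidden x input_dropout x l1 x l2
6 * 4 * 2 * length(seq(0, 1e-4, 1e-6))^2  # 489,648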
grid2 <- h2o.grid(
algorithm = "deeplearning",
grid_id = "dl_grid_random",
x = x,
y = y,
training_frame = train_h2o,
validation_frame = valid_h2o,
epochs=1,
stopping_metric="logloss",
stopping_tolerance=1e-2,
stopping_rounds=2,
score_validation_samples=10000,
score_duty_cycle=0.025,
max_w2=10,
hyper_params = hyper_params2,
search_criteria = search_criteria
)
grid2 <- h2o.getGrid("dl_grid_random",sort_by="logloss",decreasing=FALSE)
dl_grid_random_summary_table <- grid2@summary_table
dl_grid_random_best_model <- h2o.getModel(dl_grid_random_summary_table$model_ids[1])
summary(dl_grid_random_best_model)
## Model Details:
## ==============
##
## H2OBinomialModel: deeplearning
## Model Key: dl_grid_random_model_11
## Status of Neuron Layers: predicting delayed, 2-class classification, bernoulli distribution, CrossEntropy loss, 18,802 weights/biases, 227.8 KB, 145,762 training samples, mini-batch size 1
## layer units type dropout l1 l2 mean_rate rate_rms
## 1 1 135 Input 5.00 % NA NA NA NA
## 2 2 50 Maxout 0.00 % 0.000096 0.000043 0.056559 0.215065
## 3 3 50 Maxout 0.00 % 0.000096 0.000043 0.468695 0.414333
## 4 4 2 Softmax NA 0.000096 0.000043 0.567191 0.459009
## momentum mean_weight weight_rms mean_bias bias_rms
## 1 NA NA NA NA NA
## 2 0.000000 -0.003088 0.103493 -0.080908 0.114518
## 3 0.000000 -0.003346 0.057780 0.126889 0.376191
## 4 0.000000 -0.007976 0.148262 0.003802 0.016888
##
## H2OBinomialMetrics: deeplearning
## ** Reported on training data. **
## ** Metrics reported on temporary training frame with 9997 samples **
##
## MSE: 0.2281126
## RMSE: 0.4776114
## LogLoss: 0.6478013
## Mean Per-Class Error: 0.4516055
## AUC: 0.6350463
## pr_auc: 0.5313808
## Gini: 0.2700927
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## 0 1 Error Rate
## 0 1068 4930 0.821941 =4930/5998
## 1 325 3674 0.081270 =325/3999
## Totals 1393 8604 0.525658 =5255/9997
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.308051 0.583036 312
## 2 max f2 0.205236 0.770286 378
## 3 max f0point5 0.476035 0.518197 153
## 4 max accuracy 0.522235 0.632890 107
## 5 max precision 0.682973 0.800000 4
## 6 max recall 0.135374 1.000000 397
## 7 max specificity 0.713773 0.999833 0
## 8 max absolute_mcc 0.482666 0.199649 146
## 9 max min_per_class_accuracy 0.443339 0.594698 187
## 10 max mean_per_class_accuracy 0.471787 0.597720 158
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: deeplearning
## ** Reported on validation data. **
## ** Metrics reported on temporary validation frame with 10107 samples **
##
## MSE: 0.2303182
## RMSE: 0.4799148
## LogLoss: 0.6523661
## Mean Per-Class Error: 0.4464909
## AUC: 0.6252233
## pr_auc: 0.5185189
## Gini: 0.2504466
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## 0 1 Error Rate
## 0 1193 4825 0.801761 =4825/6018
## 1 373 3716 0.091220 =373/4089
## Totals 1566 8541 0.514297 =5198/10107
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.312601 0.588440 311
## 2 max f2 0.170509 0.773087 392
## 3 max f0point5 0.436810 0.512493 197
## 4 max accuracy 0.554562 0.617493 81
## 5 max precision 0.719815 1.000000 0
## 6 max recall 0.142208 1.000000 397
## 7 max specificity 0.719815 1.000000 0
## 8 max absolute_mcc 0.424781 0.180994 209
## 9 max min_per_class_accuracy 0.443210 0.588408 191
## 10 max mean_per_class_accuracy 0.424781 0.591880 209
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
##
##
## Scoring History:
## timestamp duration training_speed epochs iterations
## 1 2019-10-27 20:33:51 0.000 sec NA 0.00000 0
## 2 2019-10-27 20:33:52 41.931 sec 17375 obs/sec 0.09795 1
## 3 2019-10-27 20:34:00 49.773 sec 17867 obs/sec 1.06020 11
## samples training_rmse training_logloss training_r2 training_auc
## 1 0.000000 NA NA NA NA
## 2 13466.000000 0.48308 0.66089 0.02765 0.61563
## 3 145762.000000 0.47761 0.64780 0.04955 0.63505
## training_pr_auc training_lift training_classification_error
## 1 NA NA NA
## 2 0.50819 1.70784 0.54456
## 3 0.53138 1.82491 0.52566
## validation_rmse validation_logloss validation_r2 validation_auc
## 1 NA NA NA NA
## 2 0.48571 0.66645 0.02068 0.61109
## 3 0.47991 0.65237 0.04390 0.62522
## validation_pr_auc validation_lift validation_classification_error
## 1 NA NA NA
## 2 0.49839 1.48785 0.56565
## 3 0.51852 1.76900 0.51430
##
## Variable Importances: (Extract with `h2o.varimp`)
## =================================================
##
## Variable Importances:
## variable relative_importance scaled_importance percentage
## 1 dest.MKE 1.000000 1.000000 0.010026
## 2 carrier.OO 0.937829 0.937829 0.009403
## 3 dest.SAV 0.925839 0.925839 0.009283
## 4 dest.IAH 0.917458 0.917458 0.009198
## 5 dest.BHM 0.898708 0.898708 0.009010
##
## ---
## variable relative_importance scaled_importance
## 130 dest.MHT 0.610644 0.610644
## 131 dest.TUL 0.583213 0.583213
## 132 dest.missing(NA) 0.000000 0.000000
## 133 carrier.missing(NA) 0.000000 0.000000
## 134 day_of_week.missing(NA) 0.000000 0.000000
## 135 origin.missing(NA) 0.000000 0.000000
## percentage
## 130 0.006122
## 131 0.005847
## 132 0.000000
## 133 0.000000
## 134 0.000000
## 135 0.000000
dl_grid_random_best_model_params <- dl_grid_random_best_model@allparameters
prediction_h2o_dl <- h2o.predict(dl_grid_random_best_model,
newdata = new_data_h2o)
prediction_dl_tbl <- tibble(
id = rownames(x_test_processed_tbl),
delayed = as.vector(prediction_h2o_dl$p1)
)
kable(head(prediction_h2o_dl))
| predict | p0 | p1 |
|---|---|---|
| 0 | 0.7193305 | 0.2806695 |
| 1 | 0.6846256 | 0.3153744 |
| 1 | 0.6027893 | 0.3972107 |
| 1 | 0.6466139 | 0.3533861 |
| 0 | 0.7428760 | 0.2571240 |
| 1 | 0.6331484 | 0.3668516 |
kable(head(prediction_dl_tbl))
| id | delayed |
|---|---|
| 1 | 0.2806695 |
| 2 | 0.3153744 |
| 3 | 0.3972107 |
| 4 | 0.3533861 |
| 5 | 0.2571240 |
| 6 | 0.3668516 |
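If hard class labels were needed, the probabilities in prediction_dl_tbl could be thresholded; note that the predict column h2o emits uses the model's F1-optimal threshold rather than 0.5. A minimal sketch:
# Hard labels at a fixed 0.5 cutoff (illustrative)
prediction_dl_tbl %>% mutate(pred_class = as.integer(delayed > 0.5))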
##AutoML
automl_models_h2o <- h2o.automl(
x = x,
y = y,
training_frame = train_h2o,
validation_frame = valid_h2o,
leaderboard_frame = test_h2o,
max_runtime_secs = 300
)
automl_leaderboard <- automl_models_h2o@leaderboard
automl_leaderboard # H2O was only able to try 5 models in the 300-second budget
## model_id auc logloss
## 1 StackedEnsemble_AllModels_AutoML_20191027_203408 0.6527708 0.6377297
## 2 StackedEnsemble_BestOfFamily_AutoML_20191027_203408 0.6521472 0.6380238
## 3 XGBoost_2_AutoML_20191027_203408 0.6521442 0.6382390
## 4 XGBoost_1_AutoML_20191027_203408 0.6510998 0.6386256
## 5 XGBoost_3_AutoML_20191027_203408 0.6346739 0.6606840
## mean_per_class_error rmse mse
## 1 0.4200061 0.4727385 0.2234817
## 2 0.4218049 0.4728797 0.2236152
## 3 0.4202507 0.4729855 0.2237153
## 4 0.4305753 0.4731853 0.2239043
## 5 0.4431210 0.4837041 0.2339696
##
## [5 rows x 6 columns]
automl_leader <- automl_models_h2o@leader
performance_h2o <- h2o.performance(automl_leader, newdata = test_h2o)
performance_h2o %>%
h2o.confusionMatrix()
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.301223640067689:
## 0 1 Error Rate
## 0 5422 12257 0.693308 =12257/17679
## 1 1738 10109 0.146704 =1738/11847
## Totals 7160 22366 0.473989 =13995/29526
performance_h2o %>%
h2o.auc()
## [1] 0.6527708
prediction_h2o_automl <- h2o.predict(automl_leader,
newdata = new_data_h2o)
prediction_automl_tbl <- tibble(
id = rownames(x_test_processed_tbl),
delayed = as.vector(prediction_h2o_automl$p1)
)
kable(head(prediction_automl_tbl))
| id | delayed |
|---|---|
| 1 | 0.2390822 |
| 2 | 0.2897834 |
| 3 | 0.3012791 |
| 4 | 0.3628371 |
| 5 | 0.2069308 |
| 6 | 0.3160515 |
#Evaluation
The deep learning algorithm (h2o.deeplearning) was used to build the dl_model_first model, and the more efficient random hyper-parameter search was used to build dl_grid_random_model. The maximum accuracy on the validation sets for both models was approximately 62-63%. Because most flights departed on time, relatively few records represent delayed flights; this underrepresented class makes training the model more difficult.
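The 62-63% figure refers to the maximum-accuracy threshold; the confusion matrices printed above use the F1-optimal threshold, which trades accuracy for recall, so their error rates look worse. A sketch of how to pull the figure, assuming h2o's per-threshold accuracy table:
perf_valid <- h2o.performance(m1, valid = TRUE)
max(h2o.accuracy(perf_valid)$accuracy)  # ~0.628 for dl_model_first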
AutoML - Of the 5 models on the leaderboard, the best performer was StackedEnsemble_AllModels_AutoML with an AUC of approximately 0.65.