#Description, Summary, and Page URL

NYC Flight Delays: This dataset contains information about flights departing from NYC airports (EWR, JFK, and LGA) in 2013. It has 336,776 rows and 19 columns: year, month, day, dep_time, sched_dep_time, dep_delay, arr_time, sched_arr_time, arr_delay, carrier, flight, tailnum, origin, dest, air_time, distance, hour, minute, and time_hour.

Link to dataset: https://www.kaggle.com/lampubhutia/nyc-flight-delay

##Load Data and Show Summary Statistics
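This report uses functions from several packages. The setup chunk was not shown in the original, so the list below is an assumed reconstruction of the libraries needed by the code that follows:

library(tidyverse)  # read_csv(), dplyr verbs, ggplot2, str_c()
library(lubridate)  # ymd(), wday()
library(skimr)      # skim_to_list()
library(knitr)      # kable()
library(recipes)    # recipe(), step_*() functions, prep(), bake()
library(h2o)        # h2o.init(), h2o.deeplearning(), h2o.automl()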

flights <- read_csv("flight_data.csv")
## Parsed with column specification:
## cols(
##   year = col_double(),
##   month = col_double(),
##   day = col_double(),
##   dep_time = col_double(),
##   sched_dep_time = col_double(),
##   dep_delay = col_double(),
##   arr_time = col_double(),
##   sched_arr_time = col_double(),
##   arr_delay = col_double(),
##   carrier = col_character(),
##   flight = col_double(),
##   tailnum = col_character(),
##   origin = col_character(),
##   dest = col_character(),
##   air_time = col_double(),
##   distance = col_double(),
##   hour = col_double(),
##   minute = col_double(),
##   time_hour = col_character()
## )
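Note that read_csv parses time_hour as a character column by default. If a datetime were preferred, the column type could be overridden (a sketch; the rest of the analysis does not depend on it):

flights <- read_csv("flight_data.csv",
                    col_types = cols(time_hour = col_datetime()))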
summary(flights)
##       year          month             day           dep_time   
##  Min.   :2013   Min.   : 1.000   Min.   : 1.00   Min.   :   1  
##  1st Qu.:2013   1st Qu.: 4.000   1st Qu.: 8.00   1st Qu.: 907  
##  Median :2013   Median : 7.000   Median :16.00   Median :1401  
##  Mean   :2013   Mean   : 6.549   Mean   :15.71   Mean   :1349  
##  3rd Qu.:2013   3rd Qu.:10.000   3rd Qu.:23.00   3rd Qu.:1744  
##  Max.   :2013   Max.   :12.000   Max.   :31.00   Max.   :2400  
##                                                  NA's   :8255  
##  sched_dep_time   dep_delay          arr_time    sched_arr_time
##  Min.   : 106   Min.   : -43.00   Min.   :   1   Min.   :   1  
##  1st Qu.: 906   1st Qu.:  -5.00   1st Qu.:1104   1st Qu.:1124  
##  Median :1359   Median :  -2.00   Median :1535   Median :1556  
##  Mean   :1344   Mean   :  12.64   Mean   :1502   Mean   :1536  
##  3rd Qu.:1729   3rd Qu.:  11.00   3rd Qu.:1940   3rd Qu.:1945  
##  Max.   :2359   Max.   :1301.00   Max.   :2400   Max.   :2359  
##                 NA's   :8255      NA's   :8713                 
##    arr_delay          carrier              flight       tailnum         
##  Min.   : -86.000   Length:336776      Min.   :   1   Length:336776     
##  1st Qu.: -17.000   Class :character   1st Qu.: 553   Class :character  
##  Median :  -5.000   Mode  :character   Median :1496   Mode  :character  
##  Mean   :   6.895                      Mean   :1972                     
##  3rd Qu.:  14.000                      3rd Qu.:3465                     
##  Max.   :1272.000                      Max.   :8500                     
##  NA's   :9430                                                           
##     origin              dest              air_time        distance   
##  Length:336776      Length:336776      Min.   : 20.0   Min.   :  17  
##  Class :character   Class :character   1st Qu.: 82.0   1st Qu.: 502  
##  Mode  :character   Mode  :character   Median :129.0   Median : 872  
##                                        Mean   :150.7   Mean   :1040  
##                                        3rd Qu.:192.0   3rd Qu.:1389  
##                                        Max.   :695.0   Max.   :4983  
##                                        NA's   :9430                  
##       hour           minute       time_hour        
##  Min.   : 1.00   Min.   : 0.00   Length:336776     
##  1st Qu.: 9.00   1st Qu.: 8.00   Class :character  
##  Median :13.00   Median :29.00   Mode  :character  
##  Mean   :13.18   Mean   :26.23                     
##  3rd Qu.:17.00   3rd Qu.:44.00                     
##  Max.   :23.00   Max.   :59.00                     
## 

#Visualize

##Plot1 - Number of flight delays

data <- flights %>%
  na.omit() %>%     # drop rows with missing values
  sample_n(10000)   # random sample of 10,000 flights for plotting

plot1 <- ggplot(data) +
  geom_freqpoly(aes(x = dep_delay), binwidth = 2.5) +
  labs(x = "departure delay (minutes)", y = "number of delays")

plot1

##Plot2 - Flight delays (minutes) by distance

plot2 <- ggplot(data, aes(x = distance, y = dep_delay)) + 
  geom_point()

plot2

##Plot3 - Number of flight delays by departure hour

plot3 <- ggplot(data, aes(x = hour)) +
  geom_bar() +
  labs(x = "departure hour", y = "number of delays")

plot3

##Plot4 - Number of delays by carrier and origin

plot4 <- ggplot(data, aes(origin)) +
  geom_bar() + 
  facet_wrap(~carrier) +
  labs(x = "origin", y = "number of delays")

plot4

#Summarize and Discuss the Patterns

A random subset of the dataset (10k rows) was used for the visualizations. Plot1 shows that the majority of flights from NYC airports departed on time in 2013. Plot2 plots dep_delay by distance for all three airports. According to Plot2, shorter distances experienced delays similar to longer distances, which suggests that distance has a limited impact on flight delays. Plot3 shows that flights were scheduled to depart between 5am (hour 5) and midnight, and that the majority of flight delays occurred in the morning between 5am and 9am. Plot4 shows that carriers EV, UA, and B6 experienced the largest number of delays, with those delayed flights originating mostly out of EWR and JFK.
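The Plot4 pattern can be checked numerically by tallying the delayed flights in the sample by carrier and origin (a quick sketch on the same data sample; output omitted):

# count delayed departures per carrier/origin pair, largest first
data %>%
  filter(dep_delay > 0) %>%
  count(carrier, origin, sort = TRUE) %>%
  head()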

#Preprocess Data

#add target variable
flights_df <- flights %>%
  add_column(delayed = ifelse(flights$arr_delay > 0, 1, 0)) %>%
  na.omit()

#calculate day of week
date <- ymd(str_c(flights_df$year, "-", flights_df$month, "-", flights_df$day))
day_of_week <- as.character(wday(date, label=TRUE))
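Equivalently, lubridate's make_date() builds the date directly from the numeric components and avoids the string round-trip (a sketch):

# same result as ymd(str_c(...)) above
date <- make_date(flights_df$year, flights_df$month, flights_df$day)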

#add day of week column and select predictors
flights_df <- flights_df %>%
  add_column(day_of_week) %>%
  select(day_of_week, hour, carrier, origin, dest, delayed)

summary(flights_df)
##  day_of_week             hour         carrier             origin         
##  Length:327346      Min.   : 5.00   Length:327346      Length:327346     
##  Class :character   1st Qu.: 9.00   Class :character   Class :character  
##  Mode  :character   Median :13.00   Mode  :character   Mode  :character  
##                     Mean   :13.14                                        
##                     3rd Qu.:17.00                                        
##                     Max.   :23.00                                        
##      dest              delayed      
##  Length:327346      Min.   :0.0000  
##  Class :character   1st Qu.:0.0000  
##  Mode  :character   Median :0.0000  
##                     Mean   :0.4063  
##                     3rd Qu.:1.0000  
##                     Max.   :1.0000

##Split Data into Train (60%) and Test (40%)

# partition the data

train.index <- sample(seq_len(nrow(flights_df)), 0.6 * nrow(flights_df))
train.df <- flights_df[train.index, ]
test.df <- flights_df[-train.index, ]
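Because sample() is random, the exact partition (and the model metrics below) will vary between runs. For a reproducible split, a seed could be set before partitioning; the value here is arbitrary:

set.seed(123) # arbitrary seed, for reproducibility only
train.index <- sample(seq_len(nrow(flights_df)), 0.6 * nrow(flights_df))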

# Training data: Separate into x and y tibbles
x_train_tbl <- train.df %>% select(-delayed) 
y_train_tbl <- train.df %>% select(delayed)
# Testing data: 
x_test_tbl <- test.df

# Remove the original data to save memory
rm(train.df) 
rm(test.df)

##Inspect Data

x_train_tbl_skim <- skim_to_list(x_train_tbl) 
names(x_train_tbl_skim)
## [1] "character" "numeric"
kable(x_train_tbl_skim$character)

|variable    | missing| complete|      n| min| max| empty| n_unique|
|:-----------|-------:|--------:|------:|---:|---:|-----:|--------:|
|carrier     |       0|   196407| 196407|   2|   2|     0|       16|
|day_of_week |       0|   196407| 196407|   3|   3|     0|        7|
|dest        |       0|   196407| 196407|   3|   3|     0|      104|
|origin      |       0|   196407| 196407|   3|   3|     0|        3|

kable(x_train_tbl_skim$numeric)

|variable | missing| complete|      n|  mean|   sd| p0| p25| p50| p75| p100|hist     |
|:--------|-------:|--------:|------:|-----:|----:|--:|---:|---:|---:|----:|:--------|
|hour     |       0|   196407| 196407| 13.13| 4.66|  5|   9|  13|  17|   23|▇▆▅▇▆▆▅▂ |

##Convert Data

string_2_factor_names <- x_train_tbl %>%
  select_if(is.character) %>% 
  names() 

kable(string_2_factor_names)

|x           |
|:-----------|
|day_of_week |
|carrier     |
|origin      |
|dest        |

unique_numeric_values_tbl <- x_train_tbl %>% 
  select_if(is.numeric) %>%
  purrr::map_df(~ unique(.) %>% length()) %>% 
  gather() %>% 
  arrange(value) %>% 
  mutate(key = as_factor(key))


factor_limit <- 7 # if a numeric column has fewer than 7 distinct values,
                  # we treat it as a factor
num_2_factor_names <- unique_numeric_values_tbl %>% 
  filter(value < factor_limit) %>% 
  arrange(desc(value)) %>%
  pull(key) %>%
  as.character() 

kable(num_2_factor_names)

|x  |
|:--|

The table is empty: hour is the only numeric predictor, and it has more than 7 distinct values, so no numeric columns are converted to factors.

##Data Transformation

rec_obj <- recipe(~ ., data = x_train_tbl) %>%
  step_string2factor(string_2_factor_names) %>% # convert character columns to factors
  step_meanimpute(all_numeric()) %>%            # impute missing numerics with the mean
  step_modeimpute(all_nominal()) %>%            # impute missing nominals with the mode
  prep(stringsAsFactors = FALSE)
x_train_processed_tbl <- bake(rec_obj, x_train_tbl) 
x_test_processed_tbl <- bake(rec_obj, x_test_tbl)

x_train_tbl %>%
  select(1:5) %>%
  glimpse()
## Observations: 196,407
## Variables: 5
## $ day_of_week <chr> "Tue", "Sat", "Mon", "Mon", "Sat", "Fri", "Fri", "Th…
## $ hour        <dbl> 18, 13, 15, 12, 16, 7, 6, 18, 15, 7, 12, 8, 15, 15, …
## $ carrier     <chr> "MQ", "B6", "B6", "WN", "B6", "AA", "UA", "EV", "AA"…
## $ origin      <chr> "LGA", "LGA", "JFK", "EWR", "JFK", "LGA", "LGA", "EW…
## $ dest        <chr> "CLE", "SRQ", "PIT", "STL", "BOS", "MIA", "DEN", "MS…
x_train_processed_tbl %>% 
  select(1:5) %>% 
  glimpse()
## Observations: 196,407
## Variables: 5
## $ day_of_week <fct> Tue, Sat, Mon, Mon, Sat, Fri, Fri, Thu, Fri, Tue, Su…
## $ hour        <dbl> 18, 13, 15, 12, 16, 7, 6, 18, 15, 7, 12, 8, 15, 15, …
## $ carrier     <fct> MQ, B6, B6, WN, B6, AA, UA, EV, AA, UA, EV, EV, EV, …
## $ origin      <fct> LGA, LGA, JFK, EWR, JFK, LGA, LGA, EWR, JFK, EWR, EW…
## $ dest        <fct> CLE, SRQ, PIT, STL, BOS, MIA, DEN, MSP, SJU, BOS, CM…
rec_obj_for_y <- recipe(~ ., data = y_train_tbl) %>% 
  step_num2factor("delayed") %>% # convert the 0/1 target to a factor for classification
  prep(stringsAsFactors = FALSE)

y_train_processed_tbl <- bake(rec_obj_for_y, y_train_tbl)

kable(head(y_train_tbl))

| delayed|
|-------:|
|       0|
|       1|
|       1|
|       0|
|       0|
|       0|

kable(head(y_train_processed_tbl))

|delayed |
|:-------|
|0       |
|1       |
|1       |
|0       |
|0       |
|0       |
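As a sanity check, the baked training predictors should contain no missing values after the imputation steps (a sketch; output omitted):

# every count should be zero after mean/mode imputation
colSums(is.na(x_train_processed_tbl))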

#Analysis of Data

h2o.init(nthreads = -1) #-1 for using all cores
##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         39 minutes 56 seconds 
##     H2O cluster timezone:       America/New_York 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.26.0.2 
##     H2O cluster version age:    3 months  
##     H2O cluster name:           H2O_started_from_R_catresa_dhm254 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   1.61 GB 
##     H2O cluster total cores:    4 
##     H2O cluster allowed cores:  4 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         Amazon S3, XGBoost, Algos, AutoML, Core V3, Core V4 
##     R Version:                  R version 3.6.1 (2019-07-05)
h2o.removeAll() ## clean slate - just in case the cluster was already running

##Push Data into H2O

# push data into h2o
data_h2o <- as.h2o(
  bind_cols(y_train_processed_tbl, x_train_processed_tbl), 
  destination_frame="train.hex" #destination_frame is optional
)
new_data_h2o <- as.h2o(
  x_test_processed_tbl,
  destination_frame= "test.hex" #destination_frame is optional 
)
h2o.ls()
##         key
## 1  test.hex
## 2 train.hex

##Split Data

# Partition the data into training, validation and test sets
splits <- h2o.splitFrame(data = data_h2o,
                         ratios = c(0.7, 0.15), # 70/15/15 split
                         seed = 1234)
              
train_h2o <- splits[[1]]
valid_h2o <- splits[[2]]
test_h2o <- splits[[3]]
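The resulting frame sizes can be checked against the requested 70/15/15 ratios (a sketch; output omitted):

# row counts of the train/validation/test frames
sapply(list(train_h2o, valid_h2o, test_h2o), h2o.nrow)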

##Modeling - Deep Learning

y <- "delayed"
x <- setdiff(names(train_h2o), y)

m1 <- h2o.deeplearning(x = x, y = y, training_frame = train_h2o, 
                       model_id = "dl_model_first", 
                       validation_frame = valid_h2o
                       #activation="Rectifier", 
                       #hidden=c(200,200),
                       #epochs = 1
                       )

##Summarize Model

summary(m1)
## Model Details:
## ==============
## 
## H2OBinomialModel: deeplearning
## Model Key:  dl_model_first 
## Status of Neuron Layers: predicting delayed, 2-class classification, bernoulli distribution, CrossEntropy loss, 67,802 weights/biases, 804.4 KB, 1,381,266 training samples, mini-batch size 1
##   layer units      type dropout       l1       l2 mean_rate rate_rms
## 1     1   135     Input  0.00 %       NA       NA        NA       NA
## 2     2   200 Rectifier  0.00 % 0.000000 0.000000  0.112652 0.299247
## 3     3   200 Rectifier  0.00 % 0.000000 0.000000  0.229585 0.254219
## 4     4     2   Softmax      NA 0.000000 0.000000  0.025146 0.011515
##   momentum mean_weight weight_rms mean_bias bias_rms
## 1       NA          NA         NA        NA       NA
## 2 0.000000   -0.002481   0.096833 -0.378921 0.193539
## 3 0.000000   -0.012218   0.081928 -0.289252 0.236503
## 4 0.000000   -0.017251   0.273693 -0.493510 0.076352
## 
## H2OBinomialMetrics: deeplearning
## ** Reported on training data. **
## ** Metrics reported on temporary training frame with 10021 samples **
## 
## MSE:  0.225033
## RMSE:  0.4743764
## LogLoss:  0.6409102
## Mean Per-Class Error:  0.4204997
## AUC:  0.6492381
## pr_auc:  0.5488434
## Gini:  0.2984762
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##           0    1    Error         Rate
## 0      1790 4167 0.699513   =4167/5957
## 1       575 3489 0.141486    =575/4064
## Totals 2365 7656 0.473206  =4742/10021
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold    value idx
## 1                       max f1  0.303759 0.595392 281
## 2                       max f2  0.160827 0.774653 375
## 3                 max f0point5  0.430315 0.530325 174
## 4                 max accuracy  0.502418 0.634667 113
## 5                max precision  0.754658 1.000000   0
## 6                   max recall  0.117588 1.000000 390
## 7              max specificity  0.754658 1.000000   0
## 8             max absolute_mcc  0.393585 0.212327 204
## 9   max min_per_class_accuracy  0.402834 0.605807 196
## 10 max mean_per_class_accuracy  0.393585 0.608105 204
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: deeplearning
## ** Reported on validation data. **
## ** Metrics reported on full validation frame **
## 
## MSE:  0.2272802
## RMSE:  0.4767391
## LogLoss:  0.6459797
## Mean Per-Class Error:  0.4406615
## AUC:  0.6386074
## pr_auc:  0.5348459
## Gini:  0.2772148
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##           0     1    Error          Rate
## 0      3808 13679 0.782238  =13679/17487
## 1      1180 10729 0.099085   =1180/11909
## Totals 4988 24408 0.505477  =14859/29396
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold    value idx
## 1                       max f1  0.277653 0.590853 303
## 2                       max f2  0.138852 0.773437 384
## 3                 max f0point5  0.430016 0.522867 175
## 4                 max accuracy  0.484459 0.627875 130
## 5                max precision  0.685256 0.741935  15
## 6                   max recall  0.064302 1.000000 397
## 7              max specificity  0.761105 0.999943   0
## 8             max absolute_mcc  0.461792 0.198078 151
## 9   max min_per_class_accuracy  0.402032 0.597027 198
## 10 max mean_per_class_accuracy  0.430016 0.599065 175
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## 
## 
## Scoring History: 
##             timestamp          duration training_speed   epochs iterations
## 1 2019-10-27 20:32:01         0.000 sec             NA  0.00000          0
## 2 2019-10-27 20:32:06         6.234 sec   5449 obs/sec  0.17904          1
## 3 2019-10-27 20:32:23        23.525 sec  18443 obs/sec  2.68711         15
## 4 2019-10-27 20:32:41        42.019 sec  22086 obs/sec  5.91555         33
## 5 2019-10-27 20:32:58        59.005 sec  23638 obs/sec  8.96540         50
## 6 2019-10-27 20:33:05  1 min  5.966 sec  24041 obs/sec 10.04667         56
## 7 2019-10-27 20:33:07  1 min  7.571 sec  24039 obs/sec 10.04667         56
##          samples training_rmse training_logloss training_r2 training_auc
## 1       0.000000            NA               NA          NA           NA
## 2   24616.000000       0.55346          0.87444    -0.27060      0.60805
## 3  369437.000000       0.47755          0.64763     0.05402      0.63909
## 4  813300.000000       0.47642          0.64508     0.05851      0.64351
## 5 1232608.000000       0.47438          0.64091     0.06656      0.64924
## 6 1381266.000000       0.47555          0.64339     0.06192      0.64688
## 7 1381266.000000       0.47438          0.64091     0.06656      0.64924
##   training_pr_auc training_lift training_classification_error
## 1              NA            NA                            NA
## 2         0.49968       1.58690                       0.53647
## 3         0.54098       1.85545                       0.50075
## 4         0.54615       1.97752                       0.48528
## 5         0.54884       1.95311                       0.47321
## 6         0.55014       2.00193                       0.49336
## 7         0.54884       1.95311                       0.47321
##   validation_rmse validation_logloss validation_r2 validation_auc
## 1              NA                 NA            NA             NA
## 2         0.55333            0.87476      -0.27043        0.60462
## 3         0.47841            0.64934       0.05029        0.63558
## 4         0.47770            0.64784       0.05313        0.63869
## 5         0.47674            0.64598       0.05692        0.63861
## 6         0.47785            0.64818       0.05254        0.63739
## 7         0.47674            0.64598       0.05692        0.63861
##   validation_pr_auc validation_lift validation_classification_error
## 1                NA              NA                              NA
## 2           0.49940         1.66511                         0.54518
## 3           0.53268         1.77992                         0.48786
## 4           0.53474         1.80511                         0.49146
## 5           0.53485         1.83030                         0.50548
## 6           0.53272         1.77153                         0.50881
## 7           0.53485         1.83030                         0.50548
## 
## Variable Importances: (Extract with `h2o.varimp`) 
## =================================================
## 
## Variable Importances: 
##          variable relative_importance scaled_importance percentage
## 1      origin.LGA            1.000000          1.000000   0.011345
## 2      origin.JFK            0.972324          0.972324   0.011031
## 3 day_of_week.Sun            0.955083          0.955083   0.010835
## 4            hour            0.944375          0.944375   0.010714
## 5      carrier.EV            0.941654          0.941654   0.010683
## 
## ---
##                    variable relative_importance scaled_importance
## 130                dest.BGR            0.573316          0.573316
## 131                dest.TVC            0.565988          0.565988
## 132        dest.missing(NA)            0.000000          0.000000
## 133     carrier.missing(NA)            0.000000          0.000000
## 134 day_of_week.missing(NA)            0.000000          0.000000
## 135      origin.missing(NA)            0.000000          0.000000
##     percentage
## 130   0.006504
## 131   0.006421
## 132   0.000000
## 133   0.000000
## 134   0.000000
## 135   0.000000

##Save Model

h2o.saveModel(object = m1,    # the model to save
              path = getwd(), # the folder to save it in
              force = TRUE)   # overwrite an existing file
## [1] "/Users/catresa/gsu/CIS8392/assignments/dl_model_first"
model_filepath <- str_c(getwd(), "/dl_model_first") # dl_model_first is the model_id

##Load Saved Model

m1 <- h2o.loadModel(model_filepath)

##Random Hyper-Parameter Search

hyper_params2 <- list(
  activation = c("Rectifier", "Tanh", "Maxout", "RectifierWithDropout",
                 "TanhWithDropout", "MaxoutWithDropout"),
  hidden = list(c(20,20), c(50,50), c(30,30,30), c(25,25,25,25)), 
  input_dropout_ratio = c(0, 0.05),
  l1 = seq(0, 1e-4, 1e-6),
  l2 = seq(0, 1e-4, 1e-6)
)
search_criteria <- list(
  strategy = "RandomDiscrete", 
  seed = 1234567,
  stopping_metric = "AUTO", 
  stopping_rounds = 5,
  stopping_tolerance = 1e-2,
  max_runtime_secs = 360, 
  max_models = 100
)
grid2 <- h2o.grid(
  algorithm = "deeplearning", 
  grid_id = "dl_grid_random",
  
  x = x, 
  y = y,
  
  training_frame = train_h2o, 
  validation_frame = valid_h2o,
  
  epochs=1, 
  stopping_metric="logloss", 
  stopping_tolerance=1e-2, 
  stopping_rounds=2, 
  score_validation_samples=10000,
  score_duty_cycle=0.025, 
  max_w2=10,
  hyper_params = hyper_params2, 
  search_criteria = search_criteria
)
grid2 <- h2o.getGrid("dl_grid_random", sort_by = "logloss", decreasing = FALSE)

dl_grid_random_summary_table <- grid2@summary_table

dl_grid_random_best_model <- h2o.getModel(dl_grid_random_summary_table$model_ids[1])
summary(dl_grid_random_best_model)
## Model Details:
## ==============
## 
## H2OBinomialModel: deeplearning
## Model Key:  dl_grid_random_model_11 
## Status of Neuron Layers: predicting delayed, 2-class classification, bernoulli distribution, CrossEntropy loss, 18,802 weights/biases, 227.8 KB, 145,762 training samples, mini-batch size 1
##   layer units    type dropout       l1       l2 mean_rate rate_rms
## 1     1   135   Input  5.00 %       NA       NA        NA       NA
## 2     2    50  Maxout  0.00 % 0.000096 0.000043  0.056559 0.215065
## 3     3    50  Maxout  0.00 % 0.000096 0.000043  0.468695 0.414333
## 4     4     2 Softmax      NA 0.000096 0.000043  0.567191 0.459009
##   momentum mean_weight weight_rms mean_bias bias_rms
## 1       NA          NA         NA        NA       NA
## 2 0.000000   -0.003088   0.103493 -0.080908 0.114518
## 3 0.000000   -0.003346   0.057780  0.126889 0.376191
## 4 0.000000   -0.007976   0.148262  0.003802 0.016888
## 
## H2OBinomialMetrics: deeplearning
## ** Reported on training data. **
## ** Metrics reported on temporary training frame with 9997 samples **
## 
## MSE:  0.2281126
## RMSE:  0.4776114
## LogLoss:  0.6478013
## Mean Per-Class Error:  0.4516055
## AUC:  0.6350463
## pr_auc:  0.5313808
## Gini:  0.2700927
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##           0    1    Error        Rate
## 0      1068 4930 0.821941  =4930/5998
## 1       325 3674 0.081270   =325/3999
## Totals 1393 8604 0.525658  =5255/9997
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold    value idx
## 1                       max f1  0.308051 0.583036 312
## 2                       max f2  0.205236 0.770286 378
## 3                 max f0point5  0.476035 0.518197 153
## 4                 max accuracy  0.522235 0.632890 107
## 5                max precision  0.682973 0.800000   4
## 6                   max recall  0.135374 1.000000 397
## 7              max specificity  0.713773 0.999833   0
## 8             max absolute_mcc  0.482666 0.199649 146
## 9   max min_per_class_accuracy  0.443339 0.594698 187
## 10 max mean_per_class_accuracy  0.471787 0.597720 158
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: deeplearning
## ** Reported on validation data. **
## ** Metrics reported on temporary validation frame with 10107 samples **
## 
## MSE:  0.2303182
## RMSE:  0.4799148
## LogLoss:  0.6523661
## Mean Per-Class Error:  0.4464909
## AUC:  0.6252233
## pr_auc:  0.5185189
## Gini:  0.2504466
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##           0    1    Error         Rate
## 0      1193 4825 0.801761   =4825/6018
## 1       373 3716 0.091220    =373/4089
## Totals 1566 8541 0.514297  =5198/10107
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold    value idx
## 1                       max f1  0.312601 0.588440 311
## 2                       max f2  0.170509 0.773087 392
## 3                 max f0point5  0.436810 0.512493 197
## 4                 max accuracy  0.554562 0.617493  81
## 5                max precision  0.719815 1.000000   0
## 6                   max recall  0.142208 1.000000 397
## 7              max specificity  0.719815 1.000000   0
## 8             max absolute_mcc  0.424781 0.180994 209
## 9   max min_per_class_accuracy  0.443210 0.588408 191
## 10 max mean_per_class_accuracy  0.424781 0.591880 209
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## 
## 
## Scoring History: 
##             timestamp   duration training_speed  epochs iterations
## 1 2019-10-27 20:33:51  0.000 sec             NA 0.00000          0
## 2 2019-10-27 20:33:52 41.931 sec  17375 obs/sec 0.09795          1
## 3 2019-10-27 20:34:00 49.773 sec  17867 obs/sec 1.06020         11
##         samples training_rmse training_logloss training_r2 training_auc
## 1      0.000000            NA               NA          NA           NA
## 2  13466.000000       0.48308          0.66089     0.02765      0.61563
## 3 145762.000000       0.47761          0.64780     0.04955      0.63505
##   training_pr_auc training_lift training_classification_error
## 1              NA            NA                            NA
## 2         0.50819       1.70784                       0.54456
## 3         0.53138       1.82491                       0.52566
##   validation_rmse validation_logloss validation_r2 validation_auc
## 1              NA                 NA            NA             NA
## 2         0.48571            0.66645       0.02068        0.61109
## 3         0.47991            0.65237       0.04390        0.62522
##   validation_pr_auc validation_lift validation_classification_error
## 1                NA              NA                              NA
## 2           0.49839         1.48785                         0.56565
## 3           0.51852         1.76900                         0.51430
## 
## Variable Importances: (Extract with `h2o.varimp`) 
## =================================================
## 
## Variable Importances: 
##     variable relative_importance scaled_importance percentage
## 1   dest.MKE            1.000000          1.000000   0.010026
## 2 carrier.OO            0.937829          0.937829   0.009403
## 3   dest.SAV            0.925839          0.925839   0.009283
## 4   dest.IAH            0.917458          0.917458   0.009198
## 5   dest.BHM            0.898708          0.898708   0.009010
## 
## ---
##                    variable relative_importance scaled_importance
## 130                dest.MHT            0.610644          0.610644
## 131                dest.TUL            0.583213          0.583213
## 132        dest.missing(NA)            0.000000          0.000000
## 133     carrier.missing(NA)            0.000000          0.000000
## 134 day_of_week.missing(NA)            0.000000          0.000000
## 135      origin.missing(NA)            0.000000          0.000000
##     percentage
## 130   0.006122
## 131   0.005847
## 132   0.000000
## 133   0.000000
## 134   0.000000
## 135   0.000000

dl_grid_random_best_model_params <- dl_grid_random_best_model@allparameters 
prediction_h2o_dl <- h2o.predict(dl_grid_random_best_model,
                                 newdata = new_data_h2o)
prediction_dl_tbl <- tibble(
  id = rownames(x_test_processed_tbl), 
  delayed = as.vector(prediction_h2o_dl$p1)
)

kable(head(prediction_h2o_dl))

| predict|        p0|        p1|
|-------:|---------:|---------:|
|       0| 0.7193305| 0.2806695|
|       1| 0.6846256| 0.3153744|
|       1| 0.6027893| 0.3972107|
|       1| 0.6466139| 0.3533861|
|       0| 0.7428760| 0.2571240|
|       1| 0.6331484| 0.3668516|

kable(head(prediction_dl_tbl))

|id |   delayed|
|:--|---------:|
|1  | 0.2806695|
|2  | 0.3153744|
|3  | 0.3972107|
|4  | 0.3533861|
|5  | 0.2571240|
|6  | 0.3668516|
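The p1 probabilities can be converted to hard class labels with a cutoff. The 0.5 cutoff below is purely illustrative; the F1-optimal thresholds H2O reports above are closer to 0.3:

# hypothetical 0.5 cutoff: classify as delayed when p1 > 0.5
prediction_dl_tbl %>%
  mutate(predicted_class = ifelse(delayed > 0.5, 1, 0)) %>%
  head()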

##AutoML

automl_models_h2o <- h2o.automl(
  x = x,
  y = y,
  training_frame = train_h2o, 
  validation_frame = valid_h2o, 
  leaderboard_frame = test_h2o, 
  max_runtime_secs = 300
)
automl_leaderboard <- automl_models_h2o@leaderboard 
automl_leaderboard # within the 300-second limit, H2O trained 5 models
##                                              model_id       auc   logloss
## 1    StackedEnsemble_AllModels_AutoML_20191027_203408 0.6527708 0.6377297
## 2 StackedEnsemble_BestOfFamily_AutoML_20191027_203408 0.6521472 0.6380238
## 3                    XGBoost_2_AutoML_20191027_203408 0.6521442 0.6382390
## 4                    XGBoost_1_AutoML_20191027_203408 0.6510998 0.6386256
## 5                    XGBoost_3_AutoML_20191027_203408 0.6346739 0.6606840
##   mean_per_class_error      rmse       mse
## 1            0.4200061 0.4727385 0.2234817
## 2            0.4218049 0.4728797 0.2236152
## 3            0.4202507 0.4729855 0.2237153
## 4            0.4305753 0.4731853 0.2239043
## 5            0.4431210 0.4837041 0.2339696
## 
## [5 rows x 6 columns]
automl_leader <- automl_models_h2o@leader

performance_h2o <- h2o.performance(automl_leader, newdata = test_h2o)

performance_h2o %>% 
  h2o.confusionMatrix()
## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.301223640067689:
##           0     1    Error          Rate
## 0      5422 12257 0.693308  =12257/17679
## 1      1738 10109 0.146704   =1738/11847
## Totals 7160 22366 0.473989  =13995/29526
performance_h2o %>% 
  h2o.auc()
## [1] 0.6527708
prediction_h2o_automl <- h2o.predict(automl_leader,
                                     newdata = new_data_h2o)
prediction_automl_tbl <- tibble(
  id = rownames(x_test_processed_tbl), 
  delayed = as.vector(prediction_h2o_automl$p1)
) 

kable(head(prediction_automl_tbl))

|id |   delayed|
|:--|---------:|
|1  | 0.2390822|
|2  | 0.2897834|
|3  | 0.3012791|
|4  | 0.3628371|
|5  | 0.2069308|
|6  | 0.3160515|

#Evaluation
The deep learning algorithm was used to build the dl_model_first model. The more efficient random hyper-parameter search was used to build the grid models, of which dl_grid_random_model_11 performed best. The accuracy on the validation sets for dl_model_first and the best grid model was approximately 62-63%. Because most flights departed on time, delayed flights make up only about 41% of the records; this class imbalance makes training the model more difficult.
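One way to address the imbalance (a sketch, not run here) is H2O's balance_classes option, which oversamples the minority class during training:

# hypothetical re-fit with class balancing enabled
m_balanced <- h2o.deeplearning(x = x, y = y,
                               training_frame = train_h2o,
                               validation_frame = valid_h2o,
                               balance_classes = TRUE) # oversample the minority class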

AutoML - Of the 5 models on the leaderboard, the best performer was StackedEnsemble_AllModels_AutoML with an AUC of approximately 0.65; its accuracy at the F1-optimal threshold on the held-out test set was about 53% (see the confusion matrix above).