Predicting Arrival Delay for Selected US Airports-Q1 2018

Part-1 Data loading and processing
Part-2 Data summarization and preparation
- Data preparation
- md_pred_final dataset
Part-3 Unique Carrier delays
Part-4 Data Imputation
- Imputing missing values using Random Forest
- Data summarization of imputed dataset
Part-5 Inferential Regression
- Finding outliers in md_pred_final dataset
- Removing the outliers
Part-6 Predictive Modeling

Part-1 Data loading and processing

master_data <- read_excel("~/master_data.xlsx")

More info on dataset

df_status(master_data)

##               variable q_zeros p_zeros   q_na  p_na q_inf p_inf      type
## 1              fl_date       0    0.00      0  0.00     0     0 character
## 2    op_unique_carrier       0    0.00      0  0.00     0     0 character
## 3             tail_num       0    0.00    397  0.24     0     0 character
## 4    op_carrier_fl_num       0    0.00      0  0.00     0     0   numeric
## 5               origin       0    0.00      0  0.00     0     0 character
## 6                 dest       0    0.00      0  0.00     0     0 character
## 7             dep_time       0    0.00   3597  2.16     0     0   numeric
## 8            dep_delay    7494    4.51   3769  2.27     0     0   numeric
## 9             taxi_out       0    0.00   3662  2.20     0     0   numeric
## 10             taxi_in       0    0.00   3674  2.21     0     0   numeric
## 11            arr_time       0    0.00   3674  2.21     0     0   numeric
## 12           arr_delay    2711    1.63   3943  2.37     0     0   numeric
## 13           cancelled  162527   97.79      0  0.00     0     0   numeric
## 14 actual_elapsed_time       0    0.00   3849  2.32     0     0   numeric
## 15            air_time       0    0.00   3849  2.32     0     0   numeric
## 16            distance       0    0.00      0  0.00     0     0   numeric
## 17       carrier_delay   15073    9.07 140602 84.60     0     0   numeric
## 18       weather_delay   23853   14.35 140602 84.60     0     0   numeric
## 19           nas_delay   10806    6.50 140602 84.60     0     0   numeric
## 20      security_delay   25539   15.37 140602 84.60     0     0   numeric
## 21 late_aircraft_delay   12707    7.65 140602 84.60     0     0   numeric
##    unique
## 1      89
## 2      17
## 3    4712
## 4    3818
## 5     195
## 6       3
## 7    1297
## 8     698
## 9     152
## 10    109
## 11   1406
## 12    734
## 13      2
## 14    434
## 15    413
## 16    287
## 17    558
## 18    290
## 19    321
## 20     32
## 21    393

Missing values

plot_missing(master_data)

Last 5 variables has over 85% missing values which should be removed as per the plot

Converting the date in year-month-date format

master_data <- master_data%>%mutate(fl_date=ymd(fl_date))

Generating day of week column

master_data <- master_data%>%mutate(dow=wday(fl_date,label=T))
ggplot(master_data,aes(x=master_data$dow))+geom_histogram(stat='count')

## Warning: Ignoring unknown parameters: binwidth, bins, pad

Filtering on ORD and ATL destination airports

md_pred <- master_data%>%filter(dest %in% c("ORD", "ATL"))

We create a new dataframe

Removing negative arrival and departure delay

md_pred <- md_pred%>%filter(arr_delay> 0 & dep_delay>0)
dim(md_pred)

## [1] 31820    22

As negative values in both arrival and departure delay means before time arrival and departure, thus removing these values will make more sense. After removal of these values, only 31820 rows are left

Frequency of origin airports

origin <- md_pred%>%group_by(origin)%>%summarise(cnt=n())%>%arrange(desc(cnt))

kable(head(origin,194),'html') %>%
 kable_styling() %>%
 scroll_box(width = "400px", height = "700px")

origin	cnt
LGA	1195
DFW	784
MCO	673
LAX	637
DTW	627
MSP	597
MIA	581
BOS	578
FLL	569
DCA	560
DEN	526
EWR	520
CLT	516
IAH	511
TPA	457
PHL	453
BWI	444
RDU	439
SFO	434
CLE	418
ORD	408
CMH	407
IND	405
PHX	380
STL	368
MSY	361
BNA	360
ATL	354
CVG	348
SLC	335
LAS	327
PBI	313
MCI	296
GRR	289
AUS	274
JFK	271
JAX	264
MKE	263
PIT	258
RIC	256
RSW	239
MDW	236
SEA	236
MEM	223
FWA	219
DAL	207
HOU	207
SAT	202
BHM	200
SDF	198
AGS	192
CID	192
DSM	192
IAD	192
XNA	190
DAY	185
ORF	185
SAN	181
SBN	181
ASE	178
CHA	176
AVL	170
GSP	170
BUF	165
MGM	165
MSN	163
OMA	159
SGF	159
SYR	157
ICT	154
CHS	153
EYW	152
BDL	147
GNV	144
OKC	139
TUL	139
CHO	135
MOB	135
FAR	134
ALB	131
CAE	130
TYS	130
MLI	129
LIT	128
ROA	128
BMI	121
BTR	121
PIA	121
RST	121
LEX	120
SAV	115
FAY	112
MDT	109
ROC	109
GRB	105
HSV	103
HPN	100
ILM	100
SNA	100
SRQ	99
FSD	97
CRW	96
MLU	96
GSO	95
SHV	95
CAK	94
FNT	94
DHN	90
TRI	89
EVV	88
TLH	87
CMI	83
SJU	81
TVC	78
ABE	76
LSE	76
PNS	76
BZN	72
COS	69
PDX	66
ABY	65
CSG	65
MBS	65
SJC	65
ATW	64
ELP	64
PVD	64
SPI	64
VLD	64
EGE	63
JAN	63
TUS	63
VPS	62
BTV	60
DAB	60
ECP	60
AEX	59
LFT	57
DBQ	53
MLB	51
PHF	51
HDN	50
CWA	47
SMF	47
ABQ	46
BOI	46
COU	44
DLH	44
MYR	44
GTR	43
OAJ	43
TOL	43
CMX	40
GPT	39
UIN	39
PSP	38
SUX	38
PAH	37
ALO	35
BQK	35
JAC	34
MKG	33
AZO	32
EAU	32
AVP	30
FSM	30
LAN	30
RNO	29
MEI	27
EWN	26
MHK	26
BIS	24
MQT	23
HNL	21
LNK	21
MHT	21
OGG	21
PWM	21
CKB	19
OAK	18
RAP	16
CGI	15
STX	9
GEG	8
STT	7
MTJ	6
ANC	5
FCA	5
GRK	5
TTN	4
TWF	3
ELM	1
SUN	1

Binning origin airports using case statement

origin <- origin%>%filter(cnt>=10)%>%mutate(origin_freq=case_when(cnt >=175 & cnt<300 ~ 'Low' ,
                                      cnt>=300 & cnt<500 ~ 'Moderate',
                                      cnt>=500 ~ 'High',
                                      TRUE ~ 'Very Low'))

We create a new column based on the frequency of the flights from the origin airport

kable(head(origin,25),'html') %>%
 kable_styling() %>%
 scroll_box(width = "400px", height = "700px")

origin	cnt	origin_freq
LGA	1195	High
DFW	784	High
MCO	673	High
LAX	637	High
DTW	627	High
MSP	597	High
MIA	581	High
BOS	578	High
FLL	569	High
DCA	560	High
DEN	526	High
EWR	520	High
CLT	516	High
IAH	511	High
TPA	457	Moderate
PHL	453	Moderate
BWI	444	Moderate
RDU	439	Moderate
SFO	434	Moderate
CLE	418	Moderate
ORD	408	Moderate
CMH	407	Moderate
IND	405	Moderate
PHX	380	Moderate
STL	368	Moderate

Joining the two datasets to add a new origin frequency column

md_pred <- left_join(md_pred,origin,by='origin')
md_pred <- md_pred%>%filter(!is.na(origin_freq))

We joined the md_pred with origin dataset using left join and on origin column

Adding month column

md_pred <- md_pred%>%mutate(Month=lubridate::month(ymd(fl_date),label=T))

Adding levels to categorical columns

md_pred <- md_pred %>% 
  mutate_at(vars(op_unique_carrier,dest,origin_freq, Month,dow), funs(as.factor))

## Warning: funs() is soft deprecated as of dplyr 0.8.0
## please use list() instead
## 
## # Before:
## funs(name = f(.)
## 
## # After: 
## list(name = ~f(.))
## This warning is displayed once per session.

md_pred$Month <- factor(md_pred$Month, levels = c("Jan","Feb","Mar"), labels =c(1,2,3),
                        ordered = F )
md_pred$dow <- factor(md_pred$dow, levels = c("Mon","Tue","Wed","Thu","Fri","Sat","Sun"), 
                        labels =c(1,2,3,4,5,6,7),ordered = F )
md_pred$origin_freq <- factor(md_pred$origin_freq, levels = c("Very Low","Low","Moderate","High"), 
                                labels =c(1,2,3,4) )

md_pred$dest <- factor(md_pred$dest, levels = c("ORD","ATL"), 
                                labels =c(1,2) )

md_pred$op_unique_carrier <- factor(md_pred$op_unique_carrier, 
                                    levels = c("F9","EV","MQ","NK","OH","OO","VX","WN","YV","YX","AA", "AS", "DL", "9E", "B6", "UA"), 
                                    labels =c(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16) )

df_status(md_pred)

##               variable q_zeros p_zeros  q_na  p_na q_inf p_inf      type
## 1              fl_date       0    0.00     0  0.00     0     0      Date
## 2    op_unique_carrier       0    0.00     0  0.00     0     0    factor
## 3             tail_num       0    0.00     0  0.00     0     0 character
## 4    op_carrier_fl_num       0    0.00     0  0.00     0     0   numeric
## 5               origin       0    0.00     0  0.00     0     0 character
## 6                 dest       0    0.00     0  0.00     0     0    factor
## 7             dep_time       0    0.00     0  0.00     0     0   numeric
## 8            dep_delay       0    0.00     0  0.00     0     0   numeric
## 9             taxi_out       0    0.00     0  0.00     0     0   numeric
## 10             taxi_in       0    0.00     0  0.00     0     0   numeric
## 11            arr_time       0    0.00     0  0.00     0     0   numeric
## 12           arr_delay       0    0.00     0  0.00     0     0   numeric
## 13           cancelled   31766  100.00     0  0.00     0     0   numeric
## 14 actual_elapsed_time       0    0.00     0  0.00     0     0   numeric
## 15            air_time       0    0.00     0  0.00     0     0   numeric
## 16            distance       0    0.00     0  0.00     0     0   numeric
## 17       carrier_delay   10871   34.22 10411 32.77     0     0   numeric
## 18       weather_delay   19618   61.76 10411 32.77     0     0   numeric
## 19           nas_delay   10773   33.91 10411 32.77     0     0   numeric
## 20      security_delay   21303   67.06 10411 32.77     0     0   numeric
## 21 late_aircraft_delay    8496   26.75 10411 32.77     0     0   numeric
## 22                 dow       0    0.00     0  0.00     0     0    factor
## 23                 cnt       0    0.00     0  0.00     0     0   integer
## 24         origin_freq       0    0.00     0  0.00     0     0    factor
## 25               Month       0    0.00     0  0.00     0     0    factor
##    unique
## 1      89
## 2      16
## 3    4230
## 4    3103
## 5     182
## 6       2
## 7    1263
## 8     660
## 9     147
## 10    100
## 11   1331
## 12    668
## 13      1
## 14    331
## 15    308
## 16    275
## 17    557
## 18    289
## 19    321
## 20     32
## 21    393
## 22      7
## 23    128
## 24      4
## 25      3

Converted op_unique_carrier, origin_freq, Month, dow and dest columns to factors

Selecting final columns

md_pred <-  md_pred%>%select(2,6,8,12,14,16:22,24,25)

Selecting appropriate columns for predicting arrival delay

Part-2 Data summarization and preparation

skimr::skim(md_pred)

## Skim summary statistics
##  n obs: 31766 
##  n variables: 14 
## 
## ── Variable type:factor ───────────────────────────────────────────────────
##           variable missing complete     n n_unique
##               dest       0    31766 31766        2
##                dow       0    31766 31766        7
##              Month       0    31766 31766        3
##  op_unique_carrier       0    31766 31766       16
##        origin_freq       0    31766 31766        4
##                            top_counts ordered
##             2: 16957, 1: 14809, NA: 0   FALSE
##    1: 5878, 3: 5038, 4: 4810, 2: 4567   FALSE
##     1: 12062, 3: 9934, 2: 9770, NA: 0   FALSE
##  13: 9467, 6: 5874, 11: 3297, 3: 2943   FALSE
##    1: 9499, 4: 8874, 3: 7011, 2: 6382   FALSE
## 
## ── Variable type:numeric ──────────────────────────────────────────────────
##             variable missing complete     n    mean     sd p0 p25 p50 p75
##  actual_elapsed_time       0    31766 31766 128.61   52.69 37  90 117 156
##            arr_delay       0    31766 31766  57.21   95.45  1  11  27  65
##        carrier_delay   10411    21355 31766  26.41   88.33  0   0   0  16
##            dep_delay       0    31766 31766  58.46   95.01  1  13  29  66
##             distance       0    31766 31766 660.87  463.9  67 334 584 763
##  late_aircraft_delay   10411    21355 31766  33.47   58     0   0  14  42
##            nas_delay   10411    21355 31766  16.47   42.98  0   0   0  16
##       security_delay   10411    21355 31766   0.059   2.24  0   0   0   0
##        weather_delay   10411    21355 31766   5.31   39.04  0   0   0   0
##  p100     hist
##   539 ▆▇▃▁▁▁▁▁
##  1449 ▇▁▁▁▁▁▁▁
##  1390 ▇▁▁▁▁▁▁▁
##  1439 ▇▁▁▁▁▁▁▁
##  4502 ▇▃▁▁▁▁▁▁
##  1431 ▇▁▁▁▁▁▁▁
##  1161 ▇▁▁▁▁▁▁▁
##   198 ▇▁▁▁▁▁▁▁
##  1352 ▇▁▁▁▁▁▁▁

We see there are 5 factor columns and 9 numerical columns mentioned with extensive information about both the variable types. Many variables are right skewed along with arrival delay. The skewness needs to be addressed.

Data preparation

recipe_obj <- recipe(arr_delay ~ ., data = md_pred) %>%
    step_zv(all_predictors()) %>%
    step_YeoJohnson(nas_delay,late_aircraft_delay,security_delay,weather_delay,carrier_delay,arr_delay)%>%
    prep()

We solved the problem of zero variance columns which has no information and transformed the highly skewed varibles including arrival delay column using Yeo Johnson transformation

md_pred_final dataset

md_pred_final <- bake(recipe_obj, new_data = md_pred)
skimr::skim(md_pred_final)

## Skim summary statistics
##  n obs: 31766 
##  n variables: 14 
## 
## ── Variable type:factor ───────────────────────────────────────────────────
##           variable missing complete     n n_unique
##               dest       0    31766 31766        2
##                dow       0    31766 31766        7
##              Month       0    31766 31766        3
##  op_unique_carrier       0    31766 31766       16
##        origin_freq       0    31766 31766        4
##                            top_counts ordered
##             2: 16957, 1: 14809, NA: 0   FALSE
##    1: 5878, 3: 5038, 4: 4810, 2: 4567   FALSE
##     1: 12062, 3: 9934, 2: 9770, NA: 0   FALSE
##  13: 9467, 6: 5874, 11: 3297, 3: 2943   FALSE
##    1: 9499, 4: 8874, 3: 7011, 2: 6382   FALSE
## 
## ── Variable type:numeric ──────────────────────────────────────────────────
##             variable missing complete     n    mean      sd    p0    p25
##  actual_elapsed_time       0    31766 31766 128.61   52.69  37     90   
##            arr_delay       0    31766 31766   3.27    1.22   0.69   2.46
##        carrier_delay   10411    21355 31766   0.85    0.92   0      0   
##            dep_delay       0    31766 31766  58.46   95.01   1     13   
##             distance       0    31766 31766 660.87  463.9   67    334   
##  late_aircraft_delay   10411    21355 31766   2.06    1.83   0      0   
##            nas_delay   10411    21355 31766   0.87    0.94   0      0   
##       security_delay   10411    21355 31766   0.059   2.24   0      0   
##        weather_delay   10411    21355 31766   0.022   0.073  0      0   
##     p50    p75    p100     hist
##  117    156     539    ▆▇▃▁▁▁▁▁
##    3.29   4.13    7.1  ▂▅▇▇▆▃▁▁
##    0      1.78    2.58 ▇▁▁▁▂▂▂▁
##   29     66    1439    ▇▁▁▁▁▁▁▁
##  584    763    4502    ▇▃▁▁▁▁▁▁
##    2.64   3.63    6.78 ▇▁▁▅▃▂▁▁
##    0      1.83    2.71 ▇▁▁▁▂▂▂▁
##    0      0     198    ▇▁▁▁▁▁▁▁
##    0      0       0.27 ▇▁▁▁▁▁▁▁

Creating a new dataset md_pred_final for final predictive modeling. We used reciepe obj on md_pred_final dataset. We can observe arrival delay skewness is cured. The missing values of the five delay columns came down from 85% to 33%

Part-3 Unique Carrier delays

Most frequent carriers on monthly basis

ggplot(data = md_pred_final) +
  aes(x = op_unique_carrier) +
  geom_bar(color = "black", fill = "lightblue") + facet_grid(Month~.)

Delta airlines (13) has most flights per month followed by Skywest airlines(6)

Average delay time per month for unique carrier

a <- md_pred_final%>%group_by(Month,op_unique_carrier)%>%summarise(avg=mean(arr_delay))
md_pred_final%>%group_by(Month)%>%summarise(avg=mean(arr_delay))

## # A tibble: 3 x 2
##   Month   avg
##   <fct> <dbl>
## 1 1      3.40
## 2 2      3.35
## 3 3      3.03

p <- ggplot(data = a,aes(x = op_unique_carrier,y=avg) )+
  geom_bar(stat = 'identity',fill='#807dba') + facet_grid(Month~.)

p + geom_hline(aes(yintercept = avg), data.frame(Month = c(1, 2, 3), avg = c(3.40,3.35, 3.03)))

The average delay time for month of January is highest in the first quarter which makes sense as people are on holidays and are often travelling. The delay reduces gradually. The horizontal black line is the reference line for average arrival delay for that month. Delta airlines has average arrival delay time for the month of January and it reduces slightly below average for rest of the months. Skywest airline has more than average delay time in all three months. Southwest airline has significant below average arrival delays for all the three months and has decent frequency.

Carrier frequency on a daily basis

ggplot(data = md_pred_final) +
  aes(x = op_unique_carrier) +
  geom_bar(color = "black", fill = "lightblue") + facet_grid(dow~.)

Average delay time per day for 16 unique carriers

b <- md_pred_final%>%group_by(dow,op_unique_carrier)%>%summarise(avg=mean(arr_delay))
md_pred_final%>%group_by(dow)%>%summarise(avg=mean(arr_delay))

## # A tibble: 7 x 2
##   dow     avg
##   <fct> <dbl>
## 1 1      3.30
## 2 2      3.24
## 3 3      3.47
## 4 4      3.14
## 5 5      3.25
## 6 6      3.07
## 7 7      3.31

p <- ggplot(data = b,aes(x = op_unique_carrier,y=avg) )+
  geom_bar(stat = 'identity',fill='#807dba') + facet_grid(dow~.)

p + geom_hline(aes(yintercept = avg), data.frame(dow = c(1, 2, 3,4,5,6,7), avg = c(3.30, 3.24, 3.47,3.14, 3.25,3.07,3.31)))

Wednesdays (3rd grid) has highest average arrival delays

Part-4 Data Imputation

Imputing missing values using Random Forest

class(md_pred)

## [1] "tbl_df"     "tbl"        "data.frame"

#md_df <- data.frame(md_pred_final)
#md_pred_imp<- missForest(md_df,maxiter = 2)

As five columns namely carrier_delay, late_aircraft_delay, nas_delay, security_delay and weather_delay has 33% missing values, we need to impute them. Random forest is used for the imputation

#md_pred_impute <- md_pred_imp$ximp
#kable(head(md_pred_impute,50),'html') %>%
 #kable_styling() %>%
 #scroll_box(width = "800px", height = "700px")

Data summarization of imputed dataset

#skimr::skim(md_pred_impute)

Negative values are imputed in security delay and weather_delay. I would thus not like to use this dataset as negative values in delays column don’t make much sense.

Part-5 Inferential Regression

reg<- lm(arr_delay~.,data=md_pred_final,na.action = na.omit)
summary(reg)

## 
## Call:
## lm(formula = arr_delay ~ ., data = md_pred_final, na.action = na.omit)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.3199 -0.3088  0.0219  0.3156  1.0437 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          2.837e+00  3.453e-02  82.164  < 2e-16 ***
## op_unique_carrier2  -5.799e-02  3.327e-02  -1.743 0.081368 .  
## op_unique_carrier3  -1.143e-01  3.145e-02  -3.635 0.000279 ***
## op_unique_carrier4   2.585e-02  3.451e-02   0.749 0.453913    
## op_unique_carrier5  -6.535e-02  4.688e-02  -1.394 0.163349    
## op_unique_carrier6  -4.504e-02  3.021e-02  -1.491 0.136077    
## op_unique_carrier7   8.454e-02  8.402e-02   1.006 0.314313    
## op_unique_carrier8  -1.847e-01  3.135e-02  -5.893 3.84e-09 ***
## op_unique_carrier9   1.598e-02  6.108e-02   0.262 0.793657    
## op_unique_carrier10 -7.684e-02  3.336e-02  -2.303 0.021295 *  
## op_unique_carrier11 -9.279e-02  3.015e-02  -3.077 0.002091 ** 
## op_unique_carrier12  5.850e-02  7.505e-02   0.779 0.435696    
## op_unique_carrier13 -5.592e-02  2.938e-02  -1.903 0.057008 .  
## op_unique_carrier14 -7.100e-02  3.322e-02  -2.137 0.032613 *  
## op_unique_carrier15 -1.134e-01  4.031e-02  -2.814 0.004901 ** 
## op_unique_carrier16 -2.104e-02  3.070e-02  -0.685 0.493151    
## dest2                9.927e-04  9.699e-03   0.102 0.918475    
## dep_delay            5.597e-03  2.981e-05 187.757  < 2e-16 ***
## actual_elapsed_time  4.168e-03  1.901e-04  21.921  < 2e-16 ***
## distance            -4.791e-04  2.149e-05 -22.293  < 2e-16 ***
## carrier_delay        1.427e-01  4.321e-03  33.018  < 2e-16 ***
## weather_delay        1.040e+00  4.616e-02  22.532  < 2e-16 ***
## nas_delay            1.643e-01  4.784e-03  34.335  < 2e-16 ***
## security_delay       6.181e-03  1.361e-03   4.540 5.64e-06 ***
## late_aircraft_delay  1.163e-01  2.112e-03  55.042  < 2e-16 ***
## dow2                -7.493e-03  1.072e-02  -0.699 0.484555    
## dow3                 4.629e-02  1.025e-02   4.517 6.29e-06 ***
## dow4                -3.093e-02  1.078e-02  -2.868 0.004129 ** 
## dow5                -4.480e-03  1.084e-02  -0.413 0.679430    
## dow6                -1.205e-02  1.299e-02  -0.927 0.353870    
## dow7                 1.305e-02  1.100e-02   1.187 0.235247    
## origin_freq2        -1.824e-02  9.170e-03  -1.989 0.046736 *  
## origin_freq3        -5.297e-02  9.573e-03  -5.533 3.18e-08 ***
## origin_freq4        -6.609e-02  9.876e-03  -6.692 2.26e-11 ***
## Month2               1.581e-02  7.376e-03   2.143 0.032125 *  
## Month3              -6.641e-02  7.674e-03  -8.654  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4456 on 21319 degrees of freedom
##   (10411 observations deleted due to missingness)
## Multiple R-squared:  0.7064, Adjusted R-squared:  0.7059 
## F-statistic:  1465 on 35 and 21319 DF,  p-value: < 2.2e-16

Significant variables can be seen from the multiple linear regression model. All the delay variables are highly significant along with wednesday

plot(reg)

QQ plot- Residuals are left skewed. From other 3 graphs we see there are some outliers.

Finding outliers in md_pred_final dataset

 bp<- ggplot(data = md_pred_final) +
  aes(x = arr_delay, y = arr_delay) +
  geom_boxplot(fill = '#9e9ac8') +
  labs(title = 'Box Plot of Arrival Delay') +
  theme_linedraw()

histo <- ggplot(data = md_pred_final) +
  aes(x = arr_delay) +
  geom_histogram(bins = 12, fill = '#9e9ac8') +
  labs(title = 'Histogram of Arrival Delay') +
  theme_linedraw()

plot_grid(bp,histo)

## Warning: Continuous x aesthetic -- did you forget aes(group=...)?

From both the plots we can see that there are some outliers for more than 6.9 arr_delay value approx

Removing the outliers

 md_pred_final <- md_pred_final%>%filter(arr_delay<6.9)

We removed arr_delay values of more than 6.9

Part-6 Predictive Modeling

Splitting the data

train <- md_pred_final[sample(1:25411),]
colnames(train)

##  [1] "op_unique_carrier"   "dest"                "dep_delay"          
##  [4] "arr_delay"           "actual_elapsed_time" "distance"           
##  [7] "carrier_delay"       "weather_delay"       "nas_delay"          
## [10] "security_delay"      "late_aircraft_delay" "dow"                
## [13] "origin_freq"         "Month"

test <- md_pred_final[sample(25411:nrow(md_pred_final)),]
colnames(test)

##  [1] "op_unique_carrier"   "dest"                "dep_delay"          
##  [4] "arr_delay"           "actual_elapsed_time" "distance"           
##  [7] "carrier_delay"       "weather_delay"       "nas_delay"          
## [10] "security_delay"      "late_aircraft_delay" "dow"                
## [13] "origin_freq"         "Month"

write.csv(train,file="train.csv")
write.csv(test,file="test.csv")

Using H2O AutoML

h2o.init()

##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         19 hours 59 minutes 
##     H2O cluster timezone:       America/New_York 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.24.0.2 
##     H2O cluster version age:    18 days  
##     H2O cluster name:           H2O_started_from_R_saurabhyelne_lbz280 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   1.96 GB 
##     H2O cluster total cores:    4 
##     H2O cluster allowed cores:  4 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         Amazon S3, XGBoost, Algos, AutoML, Core V3, Core V4 
##     R Version:                  R version 3.4.2 (2017-09-28)

train <- h2o.importFile("train.csv")

## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |========                                                         |  12%
  |                                                                       
  |=================================================================| 100%

test <- h2o.importFile("test.csv")

## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%

split_h2o <- h2o.splitFrame(as.h2o(train), ratios = c(0.85))

train_h2o <- split_h2o[[1]]
valid_h2o <- split_h2o[[2]]
test_h2o  <- as.h2o(test)

Here I am using H2O which is one of the advanced machine learning library and has several state of the art models. We divided data into three sets for training, validation and testing

Model generation

y <- 'arr_delay'
x <- setdiff(names(train),y)

#automl_models_h2o <- h2o.automl(
 # x = x,
  #y = y,
#  training_frame = train_h2o,
 # validation_frame = valid_h2o,
 # leaderboard_frame = test_h2o,
  #max_runtime_secs = 300,
  #nfolds = 5
#)

We provided training, validation and test dataset to the automl function. We are using 5 fold cross validation with maximum runtime of 100 seconds.

Best performing models

#automl_models_h2o@leaderboard

#print(paste0("The best performing model is ",automl_models_h2o@leader@model_id))

The best model is GBM_1_AutoML_20190504_073500

Description about the best model - GBM_1_AutoML_20190504_073500

#automl_models_h2o@leader

Saving the top performing model to working directory

#h2o.getModel("GBM_1_AutoML_20190504_073500") %>%
#  h2o.saveModel(path = "04_Modeling/h2o_models/")

#h2o_gbm <- h2o.loadModel("04_Modeling/h2o_models/GBM_1_AutoML_20190504_073500")

Prediction by Gradient Boosting Machine model

#predictions <- h2o.predict(h2o_gbm, newdata = as.h2o(test))

#predictions_tbl <- predictions %>% as.tibble()
#predictions_tbl

Performance of the top model

#performance_h2o_gbm <- h2o.performance(h2o_gbm, newdata = as.h2o(test))
#typeof(performance_h2o_gbm)
#performance_h2o_gbm %>% slotNames()
#performance_h2o@metrics
#train_rmse_gbm  <- h2o.rmse(gbm_h2o, train = TRUE)
#test_rmse_gbm   <- h2o.rmse(gbm_h2o, valid = TRUE)
#hold_rmse_gbm   <- h2o.rmse(object = performance_h2o_gbm)
#print(paste0("GBM rmse TRAIN = ", train_rmse_gbm, ", rmse valid = ", test_rmse_gbm, ", rmse HOLDOUT = ",
#             hold_rmse_gbm))

#h2o.varimp_plot(gbm_h2o, num_of_features = NULL)

[1] “GBM rmse TRAIN = 8.34806208521452, rmse valid = 17.6673196637401, rmse HOLDOUT = 6.83885108336663”

The plot shows important variables for predicting arrival delay. Carrier delay is the most important factor

                                             **Thank You**