This is a continuation of the work on the NYC flight data for the year 2013. The aim is to create a model that can predict whether or not a flight will be 30 minutes late, given a set of predictors.
library(tidymodels)
library(tidyverse)
library(skimr)
library(nycflights13)
library(timeDate)
library(lubridate)
library(maps)
library(patchwork)
library(corrplot)
library(GGally)
library(rpart.plot)
library(vip)
theme_set(theme_bw())
full_flights=readRDS("Cleaned_Full_Flights.rds")
new_full_flights =
full_flights %>%
select(-tailnum, -flight, -time_hour) #These could have been kept as ID columns to investigate misclassified cases
skim(new_full_flights)
| | |
|---|---|
| Name | new_full_flights |
| Number of rows | 232412 |
| Number of columns | 15 |
| Column type frequency: | |
| Date | 1 |
| factor | 6 |
| numeric | 8 |
| Group variables | None |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| dep_date | 0 | 1 | 2013-01-01 | 2013-12-30 | 2013-07-06 | 364 |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| carrier | 0 | 1 | FALSE | 16 | UA: 46705, B6: 43035, EV: 42735, DL: 39866 |
| origin | 0 | 1 | FALSE | 3 | EWR: 91968, JFK: 77973, LGA: 62471 |
| is_late | 0 | 1 | FALSE | 2 | 0: 199198, 1: 33214 |
| mnfr_year | 0 | 1 | FALSE | 7 | 200: 85189, 200: 56473, 199: 38295, 199: 21498 |
| engine_type | 0 | 1 | FALSE | 3 | Tur: 197038, Tur: 33878, Oth: 1496 |
| tzone | 0 | 1 | FALSE | 6 | Ame: 134986, Ame: 47286, Ame: 37241, Ame: 8455 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| dep_delay | 0 | 1 | 11.62 | 37.99 | -43.00 | -5.00 | -2.00 | 10.00 | 1301.00 | ▇▁▁▁▁ |
| temp | 0 | 1 | 56.99 | 18.06 | 10.94 | 42.08 | 57.02 | 71.96 | 100.04 | ▁▇▇▇▂ |
| precip | 0 | 1 | 0.00 | 0.01 | 0.00 | 0.00 | 0.00 | 0.00 | 0.53 | ▇▁▁▁▁ |
| visib | 0 | 1 | 9.58 | 1.46 | 0.00 | 10.00 | 10.00 | 10.00 | 10.00 | ▁▁▁▁▇ |
| seats | 0 | 1 | 136.82 | 71.96 | 2.00 | 55.00 | 149.00 | 189.00 | 400.00 | ▆▅▇▁▁ |
| lat | 0 | 1 | 36.02 | 5.76 | 21.32 | 32.73 | 36.10 | 41.41 | 61.17 | ▃▆▇▁▁ |
| lon | 0 | 1 | -90.15 | 15.70 | -157.92 | -95.34 | -84.22 | -80.15 | -68.83 | ▁▁▂▃▇ |
| alt | 0 | 1 | 590.66 | 995.14 | 3.00 | 26.00 | 313.00 | 748.00 | 6602.00 | ▇▁▁▁▁ |
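The outcome is imbalanced: roughly one flight in seven is late (33214 of 232412, about 14%), which is why the split below is stratified on is_late. A quick check of the class balance (a minimal dplyr sketch):
new_full_flights %>%
  count(is_late) %>%
  mutate(prop = n / sum(n)) #proportion of not-late vs late flights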
set.seed(4321)
flights_split = initial_split(new_full_flights, prop = 0.7, strata = is_late)
flights_train = training(flights_split)
flights_test = testing(flights_split)
flights_rec =
recipe(is_late ~ .,data=flights_train ) %>%
#step_knnimpute(all_predictors()) %>%
#update_role(flight, tailnum, time_hour, new_role="ID")%>%
step_date(dep_date, features = c("dow", "month")) %>%
step_holiday( dep_date, holidays = timeDate::listHolidays() %>% str_subset("(^US)|(Easter)") ) %>%
step_rm(dep_date) %>%
step_dummy(all_nominal(),-all_outcomes()) %>%
step_zv(all_predictors())
tree_rec =
recipe(is_late ~.,data=flights_train) %>%
#update_role(flight, tailnum, time_hour, new_role ="ID") %>%
step_zv(all_predictors())
logist_mod = logistic_reg() %>%
set_engine("glm")
tree_mod = decision_tree(cost_complexity = tune(),
tree_depth = tune()
) %>%
set_engine("rpart") %>%
set_mode("classification")
set.seed(512)
tree_params = grid_max_entropy(cost_complexity(),tree_depth(range = c(3,5)),size=9)
set.seed(512)
grid_max_entropy(cost_complexity(),tree_depth(range = c(3,5)),size=9) %>%
ggplot(aes(tree_depth,log(cost_complexity)))+
geom_point()+
theme_bw()
Now we tune the decision tree using cross-validation resamples.
doParallel::registerDoParallel()
Start = Sys.time()
set.seed(234)
flights_folds = vfold_cv(flights_train, v=5, strata = is_late)
set.seed(345)
tree_wf = workflow() %>%
add_model(tree_mod) %>%
add_recipe(tree_rec)
tree_res = tree_wf %>%
tune_grid( resamples = flights_folds,
grid=tree_params)
End = Sys.time()
doParallel::stopImplicitCluster()
paste("Time Taken:",End-Start)
## [1] "Time Taken: 1.09096333185832"
tree_res %>%
collect_metrics() %>%
mutate(tree_depth = factor(tree_depth)) %>%
ggplot(aes(x=cost_complexity,y=mean,color=tree_depth))+
geom_line(size=1.5,alpha=0.6)+
geom_point(size=2)+
facet_wrap(~.metric, scales="free",nrow=2)+
scale_x_log10(labels=label_number())+
scale_color_viridis_d(option="C",begin=.9,end=0)+
theme_bw()
best_tree = tree_res %>%
select_best("roc_auc")
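Before finalizing, it can also be useful to inspect the top few candidates numerically rather than only selecting the single best one; a minimal sketch using tune's show_best():
tree_res %>%
  show_best("roc_auc", n = 5) #top 5 cost_complexity / tree_depth combinations by ROC AUC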
Now we fit the logistic regression model.
logt_wf = workflow() %>%
add_model(logist_mod) %>%
add_recipe(flights_rec)
logt_wf
## == Workflow ================================================================================================================================================================
## Preprocessor: Recipe
## Model: logistic_reg()
##
## -- Preprocessor ------------------------------------------------------------------------------------------------------------------------------------------------------------
## 5 Recipe Steps
##
## * step_date()
## * step_holiday()
## * step_rm()
## * step_dummy()
## * step_zv()
##
## -- Model -------------------------------------------------------------------------------------------------------------------------------------------------------------------
## Logistic Regression Model Specification (classification)
##
## Computational engine: glm
logt_fit = fit(logt_wf, data = flights_train)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
expo_coefs = logt_fit %>%
pull_workflow_fit() %>%
tidy()
test_expo_coefs =
logt_fit %>%
pull_workflow_fit()%>%
tidy(exponentiate=TRUE) %>%
mutate(conf_low = exp(log(estimate) - 1.96*std.error), #std.error is on the log-odds scale, so build the CI there and exponentiate
conf_high = exp(log(estimate) + 1.96*std.error),
#Significant = if_else(p.value < 0.05 ,"True","False")
)
# tidy(conf.int=TRUE)
expo_coefs %>%
rmarkdown::paged_table()
We can also look at the odds ratios, which tell us the multiplicative change in the odds of a flight being late associated with a one-unit change in a numeric predictor or a change in factor level.
test_expo_coefs %>%
select(term,estimate) %>%
rename(Odds_Ratio = estimate)%>%
rmarkdown::paged_table()
We can interpret this as: a one-minute increase in dep_delay is associated with a 10.8% increase in the odds of a flight being late. For factor variables, the odds ratio describes the change relative to the base level. If a coefficient is not significant, there is not enough evidence that that level changes the odds of a flight being late relative to the base level.
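As a concrete check of that interpretation, the percentage change can be recovered directly from the log-odds coefficient (a minimal sketch; it assumes the term is named exactly "dep_delay" in expo_coefs):
expo_coefs %>%
  filter(term == "dep_delay") %>%
  transmute(term,
    log_odds = estimate, #coefficient on the log-odds scale
    odds_ratio = exp(estimate), #multiplicative change in the odds per extra minute of delay
    pct_change = 100*(exp(estimate) - 1)) #percentage change in the odds per extra minute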
expo_coefs %>%
filter( !str_starts(term,"dep|carrier|engine|tzone|origin|mnfr_year")) %>%
filter( !term == "(Intercept)") %>%
mutate( Significant = if_else(p.value < 0.05 ,"True","False"))%>%
ggplot(aes(y=term,x=estimate,col=Significant))+
geom_point()+
geom_vline(xintercept = 0,col="grey80",lty=2,size=2)+
geom_errorbar(aes(xmin=estimate-1.96*std.error,
xmax=estimate+1.96*std.error),width=0.5)+
labs(title="Log Odds Ratio",col="Significant at\n5% level")+
theme_bw()
expo_coefs %>%
filter( str_starts(term,"carrier")) %>%
mutate( Significant = if_else(p.value < 0.05 ,"True","False"))%>%
ggplot(aes(y=term,x=estimate,col=Significant))+
geom_vline(xintercept = 0,col="grey80",lty=2,size=2)+
geom_point()+
geom_errorbar(aes(xmin=estimate-1.96*std.error,
xmax=estimate+1.96*std.error),width=0.5)+
labs(title="Log Odds of Carrier")+
theme_bw()
test_expo_coefs %>%
filter( str_starts(term,"carrier")) %>%
mutate( Significant = if_else(p.value < 0.05 ,"True","False"))%>%
ggplot(aes(y=term,x=estimate,col=Significant))+
geom_vline(xintercept = 1 ,col="grey80",lty=2,size=2)+
geom_point()+
geom_errorbar(aes(xmin=conf_low,
xmax=conf_high),width=0.5)+
labs(title="Log Odds Ratio of Carrier")+
theme_bw()
expo_coefs %>%
filter( str_starts(term,"dep_date_month")) %>%
mutate( Significant = if_else(p.value < 0.05 ,"True","False"))%>%
mutate( term = str_replace(term,"dep_date_month_","")) %>%
mutate( term = factor(term,
levels=c("Feb","Mar","Apr","May","Jun",
"Jul","Aug","Sep","Oct","Nov","Dec"))) %>%
ggplot(aes(y=term,x=estimate,col=Significant))+
geom_vline(xintercept = 0,col="grey80",lty=2,size=2)+
geom_point()+
geom_errorbar(aes(xmin=estimate-1.96*std.error,
xmax=estimate+1.96*std.error),width=0.5)+
coord_flip()+
labs(title="Log Odds Ratio by Month")+
theme_bw()
test_expo_coefs %>%
filter( str_starts(term,"dep_date_month")) %>%
mutate( Significant = if_else(p.value < 0.05 ,"True","False"))%>%
mutate( term = str_replace(term,"dep_date_month_","")) %>%
mutate( term = factor(term,
levels=c("Feb","Mar","Apr","May","Jun",
"Jul","Aug","Sep","Oct","Nov","Dec"))) %>%
ggplot(aes(y=term,x=estimate,col=Significant))+
geom_vline(xintercept = 1,col="grey80",lty=2,size=2)+
geom_point()+
geom_errorbar(aes(xmin=exp(log(estimate)-1.96*(std.error)),
xmax=exp(log(estimate)+1.96*(std.error))),width=0.5)+
coord_flip()+
labs(title="Odds Ratio by Month")+
theme_bw()
expo_coefs %>%
filter( str_starts(term,"dep_date_dow")) %>%
mutate( Significant = if_else(p.value < 0.05 ,"True","False"))%>%
mutate( term = str_replace(term,"dep_date_dow_","")) %>%
mutate( term = factor(term,
levels=c("Mon","Tue","Wed","Thu","Fri","Sat"))) %>%
ggplot(aes(y=term,x=estimate,col=Significant))+
geom_vline(xintercept = 0,col="grey80",lty=2,size=2)+
geom_point()+
geom_errorbar(aes(xmin=estimate-1.96*std.error,
xmax=estimate+1.96*std.error),width=0.5)+
coord_flip()+
labs(title="Log Odds Ratio by Day of Week")+
theme_bw()
expo_coefs %>%
filter( str_starts(term,"mnfr_year")) %>%
mutate( Significant = if_else(p.value < 0.05 ,"True","False"))%>%
mutate( term = str_replace(term,"mnfr_year_","")) %>%
ggplot(aes(y=term,x=estimate,col=Significant))+
geom_vline(xintercept = 0,col="grey80",lty=2,size=2)+
geom_point()+
geom_errorbar(aes(xmin=estimate-1.96*std.error,
xmax=estimate+1.96*std.error),width=0.5)+
labs(title="Log Odds Ratio by Engine Manufacturing Year")+
theme_bw()
expo_coefs %>%
filter( str_starts(term,"engine_type")) %>%
mutate( Significant = if_else(p.value < 0.05 ,"True","False"))%>%
mutate( term = str_replace(term,"engine_type_","")) %>%
ggplot(aes(y=term,x=estimate,col=Significant))+
geom_vline(xintercept = 0,col="grey80",lty=2,size=2)+
geom_point()+
geom_errorbar(aes(xmin=estimate-1.96*std.error,
xmax=estimate+1.96*std.error),width=0.5)+
labs(title="Log Odds Ratio of Engine Type")+
theme_bw()
expo_coefs %>%
filter( str_starts(term,"tzone")) %>%
mutate( Significant = if_else(p.value < 0.05 ,"True","False"))%>%
mutate( term = str_replace(term,"engine_type_","")) %>%
ggplot(aes(y=term,x=estimate,col=Significant))+
geom_vline(xintercept = 0,col="grey80",lty=2,size=2)+
geom_point()+
geom_errorbar(aes(xmin=estimate-1.96*std.error,
xmax=estimate+1.96*std.error),width=0.5)+
labs(title="Log Odds Ratio of Timezones")+
theme_bw()
expo_coefs %>%
filter( str_starts(term,"dep_date_US|dep_date_Eas")) %>%
mutate( Significant = if_else(p.value < 0.05 ,"True","False"))%>%
mutate( term = str_replace(term,"dep_date_","")) %>%
ggplot(aes(y=term,x=exp(estimate),col=Significant))+
geom_vline(xintercept = 1,col="grey80",lty=2,size=2)+
geom_point()+
geom_errorbar(aes(xmin=exp(estimate-1.96*std.error),
xmax=exp(estimate+1.96*std.error)),width=0.5)+
labs(title="Odds Ratio of Holiday")+
theme_bw()
## Warning: Removed 1 rows containing missing values (geom_point).
final_tree_wf =
tree_wf %>%
finalize_workflow(best_tree)
final_tree_wf
## == Workflow ================================================================================================================================================================
## Preprocessor: Recipe
## Model: decision_tree()
##
## -- Preprocessor ------------------------------------------------------------------------------------------------------------------------------------------------------------
## 1 Recipe Step
##
## * step_zv()
##
## -- Model -------------------------------------------------------------------------------------------------------------------------------------------------------------------
## Decision Tree Model Specification (classification)
##
## Main Arguments:
## cost_complexity = 4.61113817239148e-07
## tree_depth = 5
##
## Computational engine: rpart
final_tree = final_tree_wf %>% fit(data=flights_train)
final_tree
## == Workflow [trained] ======================================================================================================================================================
## Preprocessor: Recipe
## Model: decision_tree()
##
## -- Preprocessor ------------------------------------------------------------------------------------------------------------------------------------------------------------
## 1 Recipe Step
##
## * step_zv()
##
## -- Model -------------------------------------------------------------------------------------------------------------------------------------------------------------------
## n= 162689
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 162689 23250 0 (0.85708929 0.14291071)
## 2) dep_delay< 36.5 143273 6128 0 (0.95722851 0.04277149)
## 4) dep_delay< 18.5 131910 3266 0 (0.97524069 0.02475931) *
## 5) dep_delay>=18.5 11363 2862 0 (0.74812990 0.25187010)
## 10) dep_delay< 25.5 5392 931 0 (0.82733680 0.17266320)
## 20) visib>=7.5 4988 805 0 (0.83861267 0.16138733) *
## 21) visib< 7.5 404 126 0 (0.68811881 0.31188119)
## 42) temp>=33.53 364 102 0 (0.71978022 0.28021978) *
## 43) temp< 33.53 40 16 1 (0.40000000 0.60000000) *
## 11) dep_delay>=25.5 5971 1931 0 (0.67660358 0.32339642)
## 22) dep_delay< 32.5 4078 1172 0 (0.71260422 0.28739578) *
## 23) dep_delay>=32.5 1893 759 0 (0.59904913 0.40095087)
## 46) carrier=9E,AA,AS,DL,EV,FL,MQ,UA,VX,WN,YV 1401 509 0 (0.63668808 0.36331192) *
## 47) carrier=B6,F9,US 492 242 1 (0.49186992 0.50813008) *
## 3) dep_delay>=36.5 19416 2294 1 (0.11814998 0.88185002)
## 6) dep_delay< 48.5 4361 1733 1 (0.39738592 0.60261408)
## 12) carrier=9E,AA,AS,DL,UA,VX 1863 920 0 (0.50617284 0.49382716)
## 24) dep_delay< 44.5 1309 590 0 (0.54927426 0.45072574) *
## 25) dep_delay>=44.5 554 224 1 (0.40433213 0.59566787)
## 50) lat>=44.26406 29 12 0 (0.58620690 0.41379310) *
## 51) lat< 44.26406 525 207 1 (0.39428571 0.60571429) *
## 13) carrier=B6,EV,F9,FL,HA,MQ,US,WN,YV 2498 790 1 (0.31625300 0.68374700) *
## 7) dep_delay>=48.5 15055 561 1 (0.03726337 0.96273663) *
library(rpart.plot)
pull_final_tree= final_tree %>%
pull_workflow_fit()
rpart.plot(pull_final_tree$fit,roundint=FALSE)
library(vip)
pull_final_tree %>%
vip()+
theme_bw()
flights_pred <-
predict(final_tree,flights_test,type="prob") %>%
bind_cols( flights_test %>% select(is_late) )
flights_pred %>%
roc_curve(truth=is_late, .pred_0)%>%
autoplot()
flights_pred %>%
roc_auc(truth = is_late, .pred_0) %>%
bind_rows(
predict(final_tree,flights_test) %>%
bind_cols( bind_cols( flights_test %>% select(is_late) ))%>%
accuracy(truth=is_late,.pred_class)
)
## # A tibble: 2 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.922
## 2 accuracy binary 0.951
predict(final_tree,flights_test) %>%
bind_cols( bind_cols( flights_test %>% select(is_late) ))%>%
conf_mat(truth=is_late,estimate=.pred_class)
## Truth
## Prediction 0 1
## 0 58987 2613
## 1 772 7351
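Because the classes are imbalanced, accuracy alone can be misleading, so it is also worth deriving sensitivity, specificity and balanced accuracy from the same confusion matrix (a sketch using yardstick's summary() method for conf_mat objects):
predict(final_tree,flights_test) %>%
  bind_cols( flights_test %>% select(is_late) ) %>%
  conf_mat(truth=is_late,estimate=.pred_class) %>%
  summary() %>% #a tibble of metrics derived from the confusion matrix
  filter(.metric %in% c("sens", "spec", "bal_accuracy"))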
logist_pred <-
predict(logt_fit,new_data = flights_test ,type = "prob") %>%
bind_cols( flights_test %>% select(is_late) )
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
logist_pred %>%
roc_curve(truth=is_late, .pred_0)%>%
autoplot()
logist_pred %>%
roc_auc(truth = is_late, .pred_0) %>%
bind_rows(
predict(logt_fit,flights_test) %>%
bind_cols( bind_cols( flights_test %>% select(is_late) ))%>%
accuracy(truth=is_late,.pred_class)
)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## # A tibble: 2 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.956
## 2 accuracy binary 0.952
predict(logt_fit,flights_test) %>%
bind_cols( bind_cols( flights_test %>% select(is_late) ))%>%
conf_mat(truth=is_late,estimate=.pred_class)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Truth
## Prediction 0 1
## 0 58838 2454
## 1 921 7510
Always predicting that a plane will not be late would lead to an accuracy of about 86%.
(58838+921)/69723*100
## [1] 85.70916
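Equivalently, this baseline (the no-information rate) is just the proportion of not-late flights in the test set, which can be computed directly rather than from hand-typed counts (assuming is_late uses the "0"/"1" levels shown in the skim above):
mean(flights_test$is_late == "0") * 100 #accuracy (%) of always predicting "not late"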
predict(final_tree, flights_test, type = "prob") %>%
bind_cols(flights_test %>% select(is_late)) %>%
roc_curve(truth = is_late, .pred_0) %>%
mutate(model = "Decision Tree") %>%
bind_rows(
predict(logt_fit, flights_test, type = "prob") %>%
bind_cols(flights_test %>% select(is_late)) %>%
roc_curve(truth = is_late, .pred_0) %>%
mutate(model = "Logistic")
) %>%
ggplot(aes(x = 1 - sensitivity,y = specificity,col = model)) +
geom_path(size = 1.5) +
geom_abline(slope = 1,lty = 2,col = "grey60",size = 2) +
coord_equal()+
theme_bw()
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
flights_pred %>%
roc_auc(truth = is_late, .pred_0) %>%
bind_rows(
predict(final_tree,flights_test) %>%
bind_cols( bind_cols( flights_test %>% select(is_late) ))%>%
accuracy(truth=is_late,.pred_class)
) %>% bind_cols(Model ="Decision Tree") %>%
bind_rows(
logist_pred %>%
roc_auc(truth = is_late, .pred_0) %>%
bind_rows(
predict(logt_fit,flights_test) %>%
bind_cols( bind_cols( flights_test %>% select(is_late) ))%>%
accuracy(truth=is_late,.pred_class)
) %>% bind_cols(Model="Logistic")
) %>%
pivot_wider(names_from = .metric, values_from=.estimate) %>%
select(-.estimator)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## # A tibble: 2 x 3
## Model roc_auc accuracy
## <chr> <dbl> <dbl>
## 1 Decision Tree 0.922 0.951
## 2 Logistic 0.956 0.952
We can see what type of predictions the models make by generating fake data over a grid of predictor values.
#Fake Date of the Top 3 Most Visited Locations On a Specific Day With Normal Weather Conditions
fake_data = crossing(
dep_delay=seq(0,75),
carrier=c("UA","B6","EV","DL","US","9E"),
origin=c("EWR","JFK","LGA"),
temp=65,
precip = 0,
visib = 0,
mnfr_year = ("2000"),
seats = 200,
engine_type = c("Turbo-fan", "Turbo-jet", "Other"),
lat = c(33.94254,33.63672,42.36435),
lon = c(-118.40807,-84.42807,-71.00518),
alt = c(126,1026,19),
tzone=c("America/Los_Angeles","America/New_York","America/New_York"),
dep_date=date("2013-09-16")
)
fake_data %>%
mutate(mnfr_year = factor(mnfr_year),
engine_type = factor(engine_type),
tzone= factor(tzone),
carrier=factor(carrier))
## # A tibble: 221,616 x 14
## dep_delay carrier origin temp precip visib mnfr_year seats engine_type lat
## <int> <fct> <chr> <dbl> <dbl> <dbl> <fct> <dbl> <fct> <dbl>
## 1 0 9E EWR 65 0 0 2000 200 Other 33.6
## 2 0 9E EWR 65 0 0 2000 200 Other 33.6
## 3 0 9E EWR 65 0 0 2000 200 Other 33.6
## 4 0 9E EWR 65 0 0 2000 200 Other 33.6
## 5 0 9E EWR 65 0 0 2000 200 Other 33.6
## 6 0 9E EWR 65 0 0 2000 200 Other 33.6
## 7 0 9E EWR 65 0 0 2000 200 Other 33.6
## 8 0 9E EWR 65 0 0 2000 200 Other 33.6
## 9 0 9E EWR 65 0 0 2000 200 Other 33.6
## 10 0 9E EWR 65 0 0 2000 200 Other 33.6
## # ... with 221,606 more rows, and 4 more variables: lon <dbl>, alt <dbl>,
## # tzone <fct>, dep_date <date>
fake_prediction = predict(logt_fit,new_data = fake_data,type="prob")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
fake_prediction %>%
bind_cols( fake_data ) %>%
filter(lat == 33.94254 & lon == -118.40807 & alt == 126 & tzone == "America/Los_Angeles") %>% #the LAX combination
#filter(carrier %in% c("US","UA")) %>%
ggplot(aes(x=dep_delay,y=.pred_1,col=carrier))+
#geom_line(size=1.5)+
geom_path()+
facet_grid(origin~engine_type)+
labs(title="Logistic Model Probabilities of Flight being Late",subtitle="Variation Between Engine Type, Carrier and Origin")+
theme_bw()
fake_prediction = predict(final_tree,new_data=fake_data,type="prob")
fake_prediction %>%
bind_cols( fake_data ) %>%
filter(lat == 33.94254 & lon == -118.40807 & alt == 126 & tzone == "America/Los_Angeles") %>% #the LAX combination
#filter(carrier %in% c("US","UA")) %>%
ggplot(aes(x=dep_delay,y=.pred_1,col=carrier))+
geom_line(size=1)+
facet_grid(origin~engine_type)+
labs(title="Decision Tree Probabilities of Flight being Late",subtitle="Variation Between Engine Type, Carrier and Origin")+
theme_bw()
We observe that the departure delay is extremely important in both models, and that different carriers have different probabilities of being late. We also notice very little difference across flight origins or aircraft engine types.
We could further examine how abnormal weather events affect arrival time, or even how the day-by-day probabilities change when everything else is held fixed.
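For instance, a minimal sketch of the day-by-day idea, reusing the fitted logistic workflow logt_fit and holding every other predictor fixed (dep_delay = 30 and full visibility are assumptions; the remaining values are taken from the fake-data grid above):
#Day-by-day probability of being late with everything else held fixed
daily_fake = tibble(
  dep_date = seq(as.Date("2013-01-01"), as.Date("2013-12-30"), by = "day"),
  dep_delay = 30,
  carrier = "UA",
  origin = "JFK",
  temp = 65,
  precip = 0,
  visib = 10,
  mnfr_year = "2000",
  seats = 200,
  engine_type = "Turbo-fan",
  lat = 33.94254,
  lon = -118.40807,
  alt = 126,
  tzone = "America/Los_Angeles"
)
predict(logt_fit, new_data = daily_fake, type = "prob") %>%
  bind_cols(daily_fake) %>%
  ggplot(aes(x = dep_date, y = .pred_1)) +
  geom_line() +
  labs(title = "Logistic Model Probability of Being Late by Day", subtitle = "All other predictors held fixed") +
  theme_bw()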
We could also investigate where the models went wrong, try to identify the causes, and use that information to build better models in the future.
Overall, it seems that the most important feature for determining whether a flight will arrive 30 minutes late is how long it was delayed on departure. We have also observed some interesting effects of day, month, and weather events.
Some information was removed earlier, such as the time of day of the flight, even though we have seen that time of day helps in determining whether a flight is late. It would be worth adding this back.
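One way to bring that information back would be to derive an hour-of-day feature before dropping time_hour (a sketch; it assumes the cleaned full_flights object still contains the time_hour column removed at the start of this post):
new_full_flights_hr = full_flights %>%
  mutate(dep_hour = lubridate::hour(time_hour)) %>% #scheduled hour of departure
  select(-tailnum, -flight, -time_hour)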
Another way to improve the model would be to lump together carriers that appear only a small number of times. This would help make the model simpler.
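A sketch of what that could look like with recipes::step_other(), which lumps infrequent factor levels into an "other" category during preprocessing (the 1% threshold is an arbitrary choice, not tuned):
lumped_rec = recipe(is_late ~ ., data = flights_train) %>%
  step_other(carrier, threshold = 0.01) %>% #carriers under ~1% of flights are pooled
  step_date(dep_date, features = c("dow", "month")) %>%
  step_holiday(dep_date, holidays = timeDate::listHolidays() %>% str_subset("(^US)|(Easter)")) %>%
  step_rm(dep_date) %>%
  step_dummy(all_nominal(), -all_outcomes()) %>%
  step_zv(all_predictors())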
In contrast, we could make the model more complex by adding non-linear transformations, such as splines, to continuous variables like lon, lat, precip, or temp. This should improve the model's predictive performance.
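For example, natural splines could be appended to the existing recipe with recipes::step_ns() (a sketch; deg_free = 4 is an arbitrary choice and would ideally be tuned):
spline_rec = flights_rec %>%
  step_ns(temp, lat, lon, deg_free = 4) #natural splines for a few continuous predictors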
In reality it may be difficult to know in advance how delayed a flight will be at departure, so another approach would be to remove the departure delay feature. This may not improve predictive performance, but it would more accurately represent real-world conditions and be more useful for predictions made a few days ahead.
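A sketch of that variant, removing dep_delay from the existing recipe and refitting the logistic workflow:
no_delay_rec = flights_rec %>%
  step_rm(dep_delay)
no_delay_fit = workflow() %>%
  add_model(logist_mod) %>%
  add_recipe(no_delay_rec) %>%
  fit(data = flights_train)
Comparing this reduced model against the full one on the test set would show how much of the predictive power comes from dep_delay alone.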