flightnyc 2012

library(nycflights13)

set.seed(1234)
flight_data <- flights %>% 
  mutate(
    # Convert the arrival delay to a factor
    arr_delay = ifelse(arr_delay >= 30, "late", "on_time"),
    arr_delay = factor(arr_delay),
    
    # We will use the date (not date-time) in the recipe below
    date = lubridate::as_date(time_hour)
    ) %>% 
  # Include the weather data
  inner_join(weather, by = c("origin", "time_hour")) %>% 
  
  # Only retain the specific columns we will use
  select(
    dep_time, flight, origin, dest, air_time, distance, carrier, date,
    arr_delay, time_hour
    ) %>% 
  na.omit() %>% # Exclude missing data
  mutate_if(is.character, as.factor)

For creating models, it is better to have qualitative columns encoded as factors (instead of character strings)

flight_data %>% head(1) %>% t() %>% kable()
dep_time 517
flight 1545
origin EWR
dest IAH
air_time 227
distance 1400
carrier UA
date 2013-01-01
arr_delay on_time
time_hour 2013-01-01 05:00:00
flight_data %>% 
  count(arr_delay) %>% 
  mutate(
    prop = n / sum(n)
    )
## # A tibble: 2 x 3
##   arr_delay      n  prop
##   <fct>      <int> <dbl>
## 1 late       52540 0.161
## 2 on_time   273279 0.839
flight_data %>% 
  skimr::skim(dest, carrier)
Data summary
Name Piped data
Number of rows 325819
Number of columns 10
_______________________
Column type frequency:
factor 2
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
dest 0 1 FALSE 104 ATL: 16771, ORD: 16507, LAX: 15942, BOS: 14948
carrier 0 1 FALSE 16 UA: 57489, B6: 53715, EV: 50868, DL: 47465

Partition Data

Create Test / Train Split

set.seed(1234)
# Put 3/4 of the data into the training set 
data_split <- initial_split(flight_data, prop = 0.75)
# Create data frames for the two sets:
train_data <- training(data_split)
test_data  <- testing(data_split)

Create Recipes and Roles

flights_recipe <- recipe(
    arr_delay ~ .,
    data = train_data
    ) %>%
  update_role(flight, time_hour, new_role = "ID")
summary(flights_recipe)
## # A tibble: 10 x 4
##    variable  type      role      source  
##    <chr>     <list>    <chr>     <chr>   
##  1 dep_time  <chr [2]> predictor original
##  2 flight    <chr [2]> ID        original
##  3 origin    <chr [3]> predictor original
##  4 dest      <chr [3]> predictor original
##  5 air_time  <chr [2]> predictor original
##  6 distance  <chr [2]> predictor original
##  7 carrier   <chr [3]> predictor original
##  8 date      <chr [1]> predictor original
##  9 time_hour <chr [1]> ID        original
## 10 arr_delay <chr [3]> outcome   original

We now want to add some additional fields

flights_recipe <- recipe(
    arr_delay ~ .,
    data = train_data
    ) %>%
  update_role(flight, time_hour, new_role = "ID") %>%
  step_date(date, features = c("dow", "month")) %>%               
  step_holiday(
    date,
    holidays = timeDate::listHolidays("US"),
    keep_original_cols = FALSE
    ) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_zv(all_predictors())

Fit Model with Recipe

lr_mod <- logistic_reg() %>% 
  set_engine("glm")
flights_wflow <- workflow() %>% 
  add_model(lr_mod) %>% 
  add_recipe(flights_recipe)
flights_wflow %>% print()
## == Workflow ====================================================================
## Preprocessor: Recipe
## Model: logistic_reg()
## 
## -- Preprocessor ----------------------------------------------------------------
## 4 Recipe Steps
## 
## * step_date()
## * step_holiday()
## * step_dummy()
## * step_zv()
## 
## -- Model -----------------------------------------------------------------------
## Logistic Regression Model Specification (classification)
## 
## Computational engine: glm
flights_fit <- flights_wflow %>% 
  fit(data = train_data)
flights_fit %>% print()
## == Workflow [trained] ==========================================================
## Preprocessor: Recipe
## Model: logistic_reg()
## 
## -- Preprocessor ----------------------------------------------------------------
## 4 Recipe Steps
## 
## * step_date()
## * step_holiday()
## * step_dummy()
## * step_zv()
## 
## -- Model -----------------------------------------------------------------------
## 
## Call:  stats::glm(formula = ..y ~ ., family = stats::binomial, data = data)
## 
## Coefficients:
##                              (Intercept)  
##                                6.7643640  
##                                 dep_time  
##                               -0.0016673  
##                                 air_time  
##                               -0.0437019  
##                                 distance  
##                                0.0051165  
##                      date_USChristmasDay  
##                                1.3475572  
##                       date_USColumbusDay  
##                                0.9165029  
##                 date_USCPulaskisBirthday  
##                                0.8432919  
##             date_USDecorationMemorialDay  
##                                0.5328861  
##                       date_USElectionDay  
##                                0.9278291  
##                        date_USGoodFriday  
##                                1.4938197  
##                   date_USInaugurationDay  
##                                0.2542875  
##                   date_USIndependenceDay  
##                                2.1585043  
## date_USJuneteenthNationalIndependenceDay  
##                                0.7569327  
##                          date_USLaborDay  
##                               -1.8896959  
##                  date_USLincolnsBirthday  
##                                0.8076493  
##                       date_USMemorialDay  
##                                1.4220511  
##                   date_USMLKingsBirthday  
##                                0.5462148  
##                       date_USNewYearsDay  
##                                0.2447815  
##                     date_USPresidentsDay  
##                                0.7229128  
##                   date_USThanksgivingDay  
##                                0.1476563  
##                       date_USVeteransDay  
##                                0.8043864  
##               date_USWashingtonsBirthday  
##                                0.0779113  
##                               origin_JFK  
##                                0.1222052  
## 
## ...
## and 274 more lines.
flights_fit %>% 
  extract_fit_parsnip() %>% 
  tidy()
## # A tibble: 158 x 5
##    term                         estimate std.error statistic  p.value
##    <chr>                           <dbl>     <dbl>     <dbl>    <dbl>
##  1 (Intercept)                   6.76    2.72           2.49 1.28e- 2
##  2 dep_time                     -0.00167 0.0000141   -118.   0       
##  3 air_time                     -0.0437  0.000563     -77.6  0       
##  4 distance                      0.00512 0.00150        3.42 6.32e- 4
##  5 date_USChristmasDay           1.35    0.175          7.68 1.54e-14
##  6 date_USColumbusDay            0.917   0.186          4.92 8.56e- 7
##  7 date_USCPulaskisBirthday      0.843   0.141          5.97 2.43e- 9
##  8 date_USDecorationMemorialDay  0.533   0.120          4.44 9.11e- 6
##  9 date_USElectionDay            0.928   0.182          5.10 3.48e- 7
## 10 date_USGoodFriday             1.49    0.180          8.30 1.03e-16
## # ... with 148 more rows

Predict with Workflow

flights_fit %>% predict(test_data)
## # A tibble: 81,455 x 1
##    .pred_class
##    <fct>      
##  1 on_time    
##  2 on_time    
##  3 on_time    
##  4 on_time    
##  5 on_time    
##  6 on_time    
##  7 on_time    
##  8 on_time    
##  9 on_time    
## 10 on_time    
## # ... with 81,445 more rows
flights_aug <- augment(flights_fit, test_data)
# The data look like: 
flights_aug %>%
  select(arr_delay, time_hour, flight, .pred_class, .pred_on_time)
## # A tibble: 81,455 x 5
##    arr_delay time_hour           flight .pred_class .pred_on_time
##    <fct>     <dttm>               <int> <fct>               <dbl>
##  1 late      2013-01-01 05:00:00   1141 on_time             0.983
##  2 on_time   2013-01-01 05:00:00    725 on_time             0.992
##  3 on_time   2013-01-01 05:00:00   1696 on_time             0.918
##  4 on_time   2013-01-01 06:00:00   5708 on_time             0.961
##  5 on_time   2013-01-01 06:00:00     49 on_time             0.972
##  6 on_time   2013-01-01 05:00:00   1806 on_time             0.981
##  7 on_time   2013-01-01 06:00:00    371 on_time             0.973
##  8 on_time   2013-01-01 06:00:00    135 on_time             0.961
##  9 on_time   2013-01-01 06:00:00   4626 on_time             0.823
## 10 on_time   2013-01-01 06:00:00     27 on_time             0.934
## # ... with 81,445 more rows
flights_aug %>% 
  roc_curve(truth = arr_delay, .pred_late) %>% 
  autoplot()
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## i Please use `reframe()` instead.
## i When switching from `summarise()` to `reframe()`, remember that `reframe()`
##   always returns an ungrouped data frame and adjust accordingly.
## i The deprecated feature was likely used in the yardstick package.
##   Please report the issue at <https://github.com/tidymodels/yardstick/issues>.

flights_aug %>% 
  roc_auc(truth = arr_delay, .pred_late)
## # A tibble: 1 x 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 roc_auc binary         0.764

Check how useful the recipe has been.

workflow() %>% 
  add_model(lr_mod) %>% 
  add_formula(arr_delay ~ .) %>%
  fit(data = train_data %>% select(-flight, -time_hour)) %>%
  augment(test_data) %>%
  roc_auc(truth = arr_delay, .pred_late)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## # A tibble: 1 x 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 roc_auc binary         0.735