library(nycflights13)
set.seed(1234)
flight_data <- flights %>%
mutate(
# Convert the arrival delay to a factor
arr_delay = ifelse(arr_delay >= 30, "late", "on_time"),
arr_delay = factor(arr_delay),
# We will use the date (not date-time) in the recipe below
date = lubridate::as_date(time_hour)
) %>%
# Include the weather data
inner_join(weather, by = c("origin", "time_hour")) %>%
# Only retain the specific columns we will use
select(
dep_time, flight, origin, dest, air_time, distance, carrier, date,
arr_delay, time_hour
) %>%
na.omit() %>% # Exclude missing data
mutate_if(is.character, as.factor)
For creating models, it is better to have qualitative columns encoded as factors (instead of character strings)
flight_data %>% head(1) %>% t() %>% kable()
dep_time | 517 |
flight | 1545 |
origin | EWR |
dest | IAH |
air_time | 227 |
distance | 1400 |
carrier | UA |
date | 2013-01-01 |
arr_delay | on_time |
time_hour | 2013-01-01 05:00:00 |
flight_data %>%
count(arr_delay) %>%
mutate(
prop = n / sum(n)
)
## # A tibble: 2 x 3
## arr_delay n prop
## <fct> <int> <dbl>
## 1 late 52540 0.161
## 2 on_time 273279 0.839
flight_data %>%
skimr::skim(dest, carrier)
Name | Piped data |
Number of rows | 325819 |
Number of columns | 10 |
_______________________ | |
Column type frequency: | |
factor | 2 |
________________________ | |
Group variables | None |
Variable type: factor
skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
---|---|---|---|---|---|
dest | 0 | 1 | FALSE | 104 | ATL: 16771, ORD: 16507, LAX: 15942, BOS: 14948 |
carrier | 0 | 1 | FALSE | 16 | UA: 57489, B6: 53715, EV: 50868, DL: 47465 |
Create Test / Train Split
set.seed(1234)
# Put 3/4 of the data into the training set
data_split <- initial_split(flight_data, prop = 0.75)
# Create data frames for the two sets:
train_data <- training(data_split)
test_data <- testing(data_split)
flights_recipe <- recipe(
arr_delay ~ .,
data = train_data
) %>%
update_role(flight, time_hour, new_role = "ID")
summary(flights_recipe)
## # A tibble: 10 x 4
## variable type role source
## <chr> <list> <chr> <chr>
## 1 dep_time <chr [2]> predictor original
## 2 flight <chr [2]> ID original
## 3 origin <chr [3]> predictor original
## 4 dest <chr [3]> predictor original
## 5 air_time <chr [2]> predictor original
## 6 distance <chr [2]> predictor original
## 7 carrier <chr [3]> predictor original
## 8 date <chr [1]> predictor original
## 9 time_hour <chr [1]> ID original
## 10 arr_delay <chr [3]> outcome original
We now want to add some additional fields
flights_recipe <- recipe(
arr_delay ~ .,
data = train_data
) %>%
update_role(flight, time_hour, new_role = "ID") %>%
step_date(date, features = c("dow", "month")) %>%
step_holiday(
date,
holidays = timeDate::listHolidays("US"),
keep_original_cols = FALSE
) %>%
step_dummy(all_nominal_predictors()) %>%
step_zv(all_predictors())
lr_mod <- logistic_reg() %>%
set_engine("glm")
flights_wflow <- workflow() %>%
add_model(lr_mod) %>%
add_recipe(flights_recipe)
flights_wflow %>% print()
## == Workflow ====================================================================
## Preprocessor: Recipe
## Model: logistic_reg()
##
## -- Preprocessor ----------------------------------------------------------------
## 4 Recipe Steps
##
## * step_date()
## * step_holiday()
## * step_dummy()
## * step_zv()
##
## -- Model -----------------------------------------------------------------------
## Logistic Regression Model Specification (classification)
##
## Computational engine: glm
flights_fit <- flights_wflow %>%
fit(data = train_data)
flights_fit %>% print()
## == Workflow [trained] ==========================================================
## Preprocessor: Recipe
## Model: logistic_reg()
##
## -- Preprocessor ----------------------------------------------------------------
## 4 Recipe Steps
##
## * step_date()
## * step_holiday()
## * step_dummy()
## * step_zv()
##
## -- Model -----------------------------------------------------------------------
##
## Call: stats::glm(formula = ..y ~ ., family = stats::binomial, data = data)
##
## Coefficients:
## (Intercept)
## 6.7643640
## dep_time
## -0.0016673
## air_time
## -0.0437019
## distance
## 0.0051165
## date_USChristmasDay
## 1.3475572
## date_USColumbusDay
## 0.9165029
## date_USCPulaskisBirthday
## 0.8432919
## date_USDecorationMemorialDay
## 0.5328861
## date_USElectionDay
## 0.9278291
## date_USGoodFriday
## 1.4938197
## date_USInaugurationDay
## 0.2542875
## date_USIndependenceDay
## 2.1585043
## date_USJuneteenthNationalIndependenceDay
## 0.7569327
## date_USLaborDay
## -1.8896959
## date_USLincolnsBirthday
## 0.8076493
## date_USMemorialDay
## 1.4220511
## date_USMLKingsBirthday
## 0.5462148
## date_USNewYearsDay
## 0.2447815
## date_USPresidentsDay
## 0.7229128
## date_USThanksgivingDay
## 0.1476563
## date_USVeteransDay
## 0.8043864
## date_USWashingtonsBirthday
## 0.0779113
## origin_JFK
## 0.1222052
##
## ...
## and 274 more lines.
flights_fit %>%
extract_fit_parsnip() %>%
tidy()
## # A tibble: 158 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 6.76 2.72 2.49 1.28e- 2
## 2 dep_time -0.00167 0.0000141 -118. 0
## 3 air_time -0.0437 0.000563 -77.6 0
## 4 distance 0.00512 0.00150 3.42 6.32e- 4
## 5 date_USChristmasDay 1.35 0.175 7.68 1.54e-14
## 6 date_USColumbusDay 0.917 0.186 4.92 8.56e- 7
## 7 date_USCPulaskisBirthday 0.843 0.141 5.97 2.43e- 9
## 8 date_USDecorationMemorialDay 0.533 0.120 4.44 9.11e- 6
## 9 date_USElectionDay 0.928 0.182 5.10 3.48e- 7
## 10 date_USGoodFriday 1.49 0.180 8.30 1.03e-16
## # ... with 148 more rows
flights_fit %>% predict(test_data)
## # A tibble: 81,455 x 1
## .pred_class
## <fct>
## 1 on_time
## 2 on_time
## 3 on_time
## 4 on_time
## 5 on_time
## 6 on_time
## 7 on_time
## 8 on_time
## 9 on_time
## 10 on_time
## # ... with 81,445 more rows
flights_aug <- augment(flights_fit, test_data)
# The data look like:
flights_aug %>%
select(arr_delay, time_hour, flight, .pred_class, .pred_on_time)
## # A tibble: 81,455 x 5
## arr_delay time_hour flight .pred_class .pred_on_time
## <fct> <dttm> <int> <fct> <dbl>
## 1 late 2013-01-01 05:00:00 1141 on_time 0.983
## 2 on_time 2013-01-01 05:00:00 725 on_time 0.992
## 3 on_time 2013-01-01 05:00:00 1696 on_time 0.918
## 4 on_time 2013-01-01 06:00:00 5708 on_time 0.961
## 5 on_time 2013-01-01 06:00:00 49 on_time 0.972
## 6 on_time 2013-01-01 05:00:00 1806 on_time 0.981
## 7 on_time 2013-01-01 06:00:00 371 on_time 0.973
## 8 on_time 2013-01-01 06:00:00 135 on_time 0.961
## 9 on_time 2013-01-01 06:00:00 4626 on_time 0.823
## 10 on_time 2013-01-01 06:00:00 27 on_time 0.934
## # ... with 81,445 more rows
flights_aug %>%
roc_curve(truth = arr_delay, .pred_late) %>%
autoplot()
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## i Please use `reframe()` instead.
## i When switching from `summarise()` to `reframe()`, remember that `reframe()`
## always returns an ungrouped data frame and adjust accordingly.
## i The deprecated feature was likely used in the yardstick package.
## Please report the issue at <https://github.com/tidymodels/yardstick/issues>.
flights_aug %>%
roc_auc(truth = arr_delay, .pred_late)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.764
Check how useful the recipe has been.
workflow() %>%
add_model(lr_mod) %>%
add_formula(arr_delay ~ .) %>%
fit(data = train_data %>% select(-flight, -time_hour)) %>%
augment(test_data) %>%
roc_auc(truth = arr_delay, .pred_late)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.735