Exploring NYC Flights 2013 Datasets

Setup

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Pull the flights table from the nycflights13 package once and expose it
# under both names used in this analysis, then preview the first rows.
flights <- nycflights13::flights
df <- flights
head(df, n = 6)
## # A tibble: 6 × 19
##    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
## 1  2013     1     1      517            515         2      830            819
## 2  2013     1     1      533            529         4      850            830
## 3  2013     1     1      542            540         2      923            850
## 4  2013     1     1      544            545        -1     1004           1022
## 5  2013     1     1      554            600        -6      812            837
## 6  2013     1     1      554            558        -4      740            728
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

Clean

Add a complete date column (combining year, month, and day).

# Build a proper Date column directly from the integer year/month/day parts.
# make_date() avoids pasting the parts into a "Y-M-D" string and re-parsing it.
# `.before = year` places `date` ahead of the original columns, which are kept.
df <- df |>
  mutate(date = make_date(year, month, day), .before = year)

Analysis

Aggregate Data

# Per-month averages of departure delay, distance and arrival delay.
# Missing values are dropped before averaging; each output column is named
# "<source column>_mean".
month_aggregate <- df |>
  group_by(month) |>
  summarise(
    across(
      c(dep_delay, distance, arr_delay),
      \(x) mean(x, na.rm = TRUE),
      .names = "{.col}_mean"
    )
  )

month_aggregate
## # A tibble: 12 × 4
##    month dep_delay_mean distance_mean arr_delay_mean
##    <int>          <dbl>         <dbl>          <dbl>
##  1     1          10.0          1007.          6.13 
##  2     2          10.8          1001.          5.61 
##  3     3          13.2          1012.          5.81 
##  4     4          13.9          1039.         11.2  
##  5     5          13.0          1041.          3.52 
##  6     6          20.8          1057.         16.5  
##  7     7          21.7          1059.         16.7  
##  8     8          12.6          1062.          6.04 
##  9     9           6.72         1041.         -4.02 
## 10    10           6.24         1039.         -0.167
## 11    11           5.44         1050.          0.461
## 12    12          16.6          1065.         14.9
# Mean arrival delay by calendar month.
# `month` is an integer 1-12, so the x breaks are pinned to whole months and
# labelled with month abbreviations; the default continuous breaks can fall on
# fractional values that do not correspond to any month.
month_aggregate |>
  ggplot(aes(x = month, y = arr_delay_mean)) +
  geom_line() +
  scale_x_continuous(breaks = 1:12, labels = month.abb)

Correlation

dep_delay and arr_delay

# Keep only the two delay columns for the correlation analysis.
df_delay <- select(df, dep_delay, arr_delay)

# Show the rows with the largest departure delays first.
arrange(df_delay, desc(dep_delay))
## # A tibble: 336,776 × 2
##    dep_delay arr_delay
##        <dbl>     <dbl>
##  1      1301      1272
##  2      1137      1127
##  3      1126      1109
##  4      1014      1007
##  5      1005       989
##  6       960       931
##  7       911       915
##  8       899       850
##  9       898       895
## 10       896       878
## # ℹ 336,766 more rows
# Distribution of departure delays in six bins, with the count axis shown as
# whole numbers.
ggplot(df_delay, aes(x = dep_delay)) +
  geom_histogram(bins = 6) +
  scale_y_continuous(
    labels = scales::number_format(accuracy = 1)
  )
## Warning: Removed 8255 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Correlation matrix of the two delay columns, computed for each pair of
# columns from the rows where both values are present.
df_delay |>
  cor(use = "pairwise.complete.obs")
##           dep_delay arr_delay
## dep_delay 1.0000000 0.9148028
## arr_delay 0.9148028 1.0000000
# Scatter plot of arrival delay against departure delay.
ggplot(df_delay, aes(x = dep_delay, y = arr_delay)) +
  geom_point()
## Warning: Removed 9430 rows containing missing values or values outside the scale range
## (`geom_point()`).

date and dep_delay

# Express each flight date as a numeric day count relative to 1970-01-01 so it
# can be used as a plain numeric variable.
df <- df |>
  mutate(days_after_epoch = as.numeric(date - ymd("1970-01-01")))

# Histogram of flights over that numeric date axis (default binning).
df |>
  ggplot(aes(x = days_after_epoch)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# List the flights whose departure delay is missing.
df |>
  filter(is.na(dep_delay))
## # A tibble: 8,255 × 21
##    date        year month   day dep_time sched_dep_time dep_delay arr_time
##    <date>     <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1 2013-01-01  2013     1     1       NA           1630        NA       NA
##  2 2013-01-01  2013     1     1       NA           1935        NA       NA
##  3 2013-01-01  2013     1     1       NA           1500        NA       NA
##  4 2013-01-01  2013     1     1       NA            600        NA       NA
##  5 2013-01-02  2013     1     2       NA           1540        NA       NA
##  6 2013-01-02  2013     1     2       NA           1620        NA       NA
##  7 2013-01-02  2013     1     2       NA           1355        NA       NA
##  8 2013-01-02  2013     1     2       NA           1420        NA       NA
##  9 2013-01-02  2013     1     2       NA           1321        NA       NA
## 10 2013-01-02  2013     1     2       NA           1545        NA       NA
## # ℹ 8,245 more rows
## # ℹ 13 more variables: sched_arr_time <int>, arr_delay <dbl>, carrier <chr>,
## #   flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## #   distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>,
## #   days_after_epoch <dbl>
# Correlation between the numeric flight date and the departure delay.
# `dep_delay` contains missing values, and cor()'s default handling returns NA
# as soon as any value is missing, so restrict the computation to rows where
# both columns are present.
df |>
  select(days_after_epoch, dep_delay) |>
  cor(use = "pairwise.complete.obs")
##                  days_after_epoch dep_delay
## days_after_epoch                1        NA
## dep_delay                      NA         1
# Smoothed trend of departure delay over the year, fitted on a random sample
# of 10,000 flights to keep the fit fast.
# The seed is fixed so the sample, and therefore the plot, is reproducible.
set.seed(2013)
df |>
  slice_sample(n = 10000) |>  # replaces the superseded sample_n()
  ggplot(aes(x = date, y = dep_delay)) +
  # A single GAM layer: with this many points the default geom_smooth() method
  # is also "gam" with the same formula, so a second default layer would only
  # redraw the identical curve.
  geom_smooth(method = "gam")
## `geom_smooth()` using formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 227 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 227 rows containing non-finite outside the scale range
## (`stat_smooth()`).