# Load libraries
library(tidyverse)

# Read the raw delay records from disk under their original object name,
# then expose them under the short alias `df` used by the rest of the analysis.
Airline_Delay_Post_COVID_2021_2023 <- read.csv(
  file = "Airline_Delay_Post_COVID_2021_2023.csv"
)
df <- Airline_Delay_Post_COVID_2021_2023

25% Subsample Analysis

# Fraction of the rows of `df` drawn for each sample, and how many samples
# to draw.
sample_frac <- 0.25
n_samples <- 3

# Build `df_samples`: `n_samples` independent draws from `df`, each holding
# `sample_frac` of its rows and sampled WITH replacement, stacked into one
# tibble and tagged with the draw index in `sample_num`.
#
# * slice_sample(prop = ...) replaces the superseded sample_n() and rounds the
#   row count down itself, so no fractional `size` is passed around.
# * seq_len() yields an empty sequence when n_samples is 0 (1:0 would run
#   the body twice).
# * The draws are collected in a list and bound once, instead of re-copying a
#   growing tibble on every iteration.
df_samples <- seq_len(n_samples) |>
  map(\(sample_i) {
    df |>
      slice_sample(prop = sample_frac, replace = TRUE) |>
      mutate(sample_num = sample_i)
  }) |>
  bind_rows() |>
  as_tibble()

Using three 25% samples drawn with replacement, I observed moderate variation across the simulated samples. While average arrival delays were similar (roughly 3,880 to 4,118), the maximum delay differed noticeably (305,694 in two samples versus 337,375 in the third). This suggests that conclusions drawn from a single moderate-sized sample may be influenced by whether rare but extreme delay events happen to be included.

Overall Behaviour

# Per-sample overview: mean, median and maximum of `arr_delay` (missing values
# ignored), plus the total of `arr_flights`.
df_samples |>
  group_by(sample_num) |>
  summarise(
    across(
      arr_delay,
      list(
        mean = \(x) mean(x, na.rm = TRUE),
        median = \(x) median(x, na.rm = TRUE),
        max = \(x) max(x, na.rm = TRUE)
      ),
      .names = "{.fn}_{.col}"
    ),
    total_flights = sum(arr_flights, na.rm = TRUE)
  )
## # A tibble: 3 × 5
##   sample_num mean_arr_delay median_arr_delay max_arr_delay total_flights
##        <int>          <dbl>            <dbl>         <dbl>         <dbl>
## 1          1          3947.             934.        305694       3464544
## 2          2          3880.             939         305694       3478460
## 3          3          4118.             936.        337375       3628608

Anomaly Metrics

# Largest weather, carrier and late-aircraft delay value seen in each sample
# (missing values ignored); output columns are named max_<column>.
df_samples |>
  group_by(sample_num) |>
  summarise(
    across(
      c(weather_delay, carrier_delay, late_aircraft_delay),
      \(x) max(x, na.rm = TRUE),
      .names = "max_{.col}"
    )
  )
## # A tibble: 3 × 4
##   sample_num max_weather_delay max_carrier_delay max_late_aircraft_delay
##        <int>             <dbl>             <dbl>                   <dbl>
## 1          1             21063            109056                  130753
## 2          2             26428            105806                  130753
## 3          3             21362            118554                  155262

Extreme delay values, particularly those related to weather and late aircraft delays, appeared inconsistently across subsamples. A delay that would appear anomalous in one subsample was absent or less severe in another. This highlights how small numbers of extreme events can disproportionately affect interpretations when sample sizes are limited.

Consistency Across Samples

# Mean of each delay-cause column within every sample (missing values
# ignored); output columns are named avg_<column>.
df_samples |>
  group_by(sample_num) |>
  summarise(
    across(
      c(carrier_delay, weather_delay, nas_delay, late_aircraft_delay),
      \(x) mean(x, na.rm = TRUE),
      .names = "avg_{.col}"
    )
  )
## # A tibble: 3 × 5
##   sample_num avg_carrier_delay avg_weather_delay avg_nas_delay
##        <int>             <dbl>             <dbl>         <dbl>
## 1          1             1557.              236.          687.
## 2          2             1552.              236.          668.
## 3          3             1614.              254.          728.
## # ℹ 1 more variable: avg_late_aircraft_delay <dbl>

Despite variability in extreme values, the relative contribution of different delay causes remained consistent across subsamples. Carrier-related and late-aircraft delays were consistently larger than weather or NAS delays, suggesting that these operational factors were persistent contributors to airline delays during the post-COVID period.

Airline-level comparison

# Mean `arr_delay` for every (sample, carrier) pair, listed within each sample
# from the highest average to the lowest.
df_samples |>
  group_by(sample_num, carrier_name) |>
  summarise(
    across(
      arr_delay,
      \(x) mean(x, na.rm = TRUE),
      .names = "avg_{.col}"
    ),
    .groups = "drop"
  ) |>
  arrange(sample_num, desc(avg_arr_delay))
## # A tibble: 51 × 3
##    sample_num carrier_name           avg_arr_delay
##         <int> <chr>                          <dbl>
##  1          1 Southwest Airlines Co.        10972.
##  2          1 American Airlines Inc.         8903.
##  3          1 JetBlue Airways                7334.
##  4          1 United Air Lines Inc.          5464.
##  5          1 Spirit Air Lines               5322.
##  6          1 Delta Air Lines Inc.           5032.
##  7          1 SkyWest Airlines Inc.          3375.
##  8          1 Republic Airline               3230.
##  9          1 Frontier Airlines Inc.         3099.
## 10          1 PSA Airlines Inc.              1999.
## # ℹ 41 more rows

10% subsampling

# Fraction of the rows of `df` drawn for each sample in this section.
sample_frac <- 0.10

# Rebuild `df_samples` at the new fraction: `n_samples` independent draws from
# `df`, sampled WITH replacement, stacked into one tibble and tagged with the
# draw index in `sample_num`.
#
# * slice_sample(prop = ...) replaces the superseded sample_n() and rounds the
#   row count down itself, so no fractional `size` is passed around.
# * seq_len() yields an empty sequence when n_samples is 0 (1:0 would run
#   the body twice).
# * The draws are collected in a list and bound once, instead of re-copying a
#   growing tibble on every iteration.
df_samples <- seq_len(n_samples) |>
  map(\(sample_i) {
    df |>
      slice_sample(prop = sample_frac, replace = TRUE) |>
      mutate(sample_num = sample_i)
  }) |>
  bind_rows() |>
  as_tibble()
# Mean and maximum of `arr_delay` within each sample (missing values ignored).
df_samples |>
  group_by(sample_num) |>
  summarise(
    across(
      arr_delay,
      list(
        mean = \(x) mean(x, na.rm = TRUE),
        max = \(x) max(x, na.rm = TRUE)
      ),
      .names = "{.fn}_{.col}"
    )
  )
## # A tibble: 3 × 3
##   sample_num mean_arr_delay max_arr_delay
##        <int>          <dbl>         <dbl>
## 1          1          3800.        215233
## 2          2          4376.        337375
## 3          3          3880.        281392

10% Subsample Comparison

When the subsample size was reduced to 10%, variability increased substantially. Mean arrival delays and maximum delay values differed more dramatically between samples, indicating that small samples are highly sensitive to random variation and rare events. Conclusions based on such small samples would therefore be less reliable.

75% subsampling

# Fraction of the rows of `df` drawn for each sample in this section.
sample_frac <- 0.75

# Rebuild `df_samples` at the new fraction: `n_samples` independent draws from
# `df`, sampled WITH replacement, stacked into one tibble and tagged with the
# draw index in `sample_num`.
#
# * slice_sample(prop = ...) replaces the superseded sample_n() and rounds the
#   row count down itself, so no fractional `size` is passed around.
# * seq_len() yields an empty sequence when n_samples is 0 (1:0 would run
#   the body twice).
# * The draws are collected in a list and bound once, instead of re-copying a
#   growing tibble on every iteration.
df_samples <- seq_len(n_samples) |>
  map(\(sample_i) {
    df |>
      slice_sample(prop = sample_frac, replace = TRUE) |>
      mutate(sample_num = sample_i)
  }) |>
  bind_rows() |>
  as_tibble()
# Mean `arr_delay` for every (sample, year) pair (missing values ignored),
# returned as an ungrouped table.
df_samples |>
  group_by(sample_num, year) |>
  summarise(
    across(
      arr_delay,
      \(x) mean(x, na.rm = TRUE),
      .names = "avg_{.col}"
    ),
    .groups = "drop"
  )
## # A tibble: 9 × 3
##   sample_num  year avg_arr_delay
##        <int> <int>         <dbl>
## 1          1  2021         3393.
## 2          1  2022         4674.
## 3          1  2023         4585.
## 4          2  2021         3346.
## 5          2  2022         4594.
## 6          2  2023         5001.
## 7          3  2021         3289.
## 8          3  2022         4473.
## 9          3  2023         4725.

75% Subsample Comparison

With a 75% subsample, results became much more stable across samples. Average arrival delays by year were consistent, and differences between subsamples were minimal. This demonstrates how larger samples reduce uncertainty and provide more reliable insights into underlying delay patterns.

Conclusion

This investigation demonstrates how sampling variability can affect conclusions drawn from airline delay data. Smaller subsamples exaggerated anomalies and produced less stable results, while larger subsamples yielded more consistent and reliable patterns. In future analyses, I would be cautious about drawing strong conclusions from limited samples and would prioritize larger datasets or repeated sampling. These findings raise further questions about whether extreme delay events are driven disproportionately by specific airports, seasons, or carriers, and whether these factors should be modeled separately rather than treated as random noise.