nycflights13

一)准备工作

一)1.加载库

library(nycflights13)
library(autoReg)
library(naniar)
library(patchwork)
library(tidyverse)
library(skimr)
library(showtext)
font_add("simhei", "C:/Users/speak/AppData/Local/Microsoft/Windows/Fonts/simhei.ttf") 
showtext_auto()

一)2.整体数据探索性分析

一)2.1.预处理部分缺失值

本研究基于nycflights13包提供的2013年纽约航班数据集(包含336,776条航班记录和19个变量),通过系统化的数据清洗流程确保分析可靠性。原始数据经distinct()去重后,采用随机抽样(2,000行)和缺失值可视化分析发现,飞机尾号(tailnum)缺失记录(2,512行,占比0.75%)与其他运行指标(如起降时间、延误时长)的缺失呈现完全共现模式,这与航班取消的数据特征高度吻合。基于实际执飞航班分析的研究目标,本研究选择列删法(listwise deletion)移除tailnum缺失行,并通过colSums(is.na()) 验证其余变量无系统性缺失。该方法在保证数据质量的同时(仅损失0.75%样本),有效避免了非随机缺失导致的估计偏差,为后续分析提供了清洁可靠的数据基础。如需研究航班取消规律,建议单独分析被删除的2,512条取消记录以获取完整业务洞察。

flights <- nycflights13::flights #主要表
flights <- flights |> 
  distinct()  # 删除所有列完全相同的重复行 
glimpse(flights)
Rows: 336,776
Columns: 19
$ year           <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
$ month          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ day            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ dep_time       <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, …
$ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, …
$ dep_delay      <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1…
$ arr_time       <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,…
$ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,…
$ arr_delay      <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1…
$ carrier        <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "…
$ flight         <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 4…
$ tailnum        <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N394…
$ origin         <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA",…
$ dest           <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",…
$ air_time       <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 1…
$ distance       <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, …
$ hour           <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6…
$ minute         <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0…
$ time_hour      <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0…
#* 发现有19个(列)变量, 336766行
gaze(flights)
—————————————————————————————————————————————————————
      name             levels             stats      
—————————————————————————————————————————————————————
year            Mean ± SD                2013.0 ± 0.0 
month           Mean ± SD                   6.5 ± 3.4 
day             Mean ± SD                  15.7 ± 8.8 
dep_time        Mean ± SD              1349.1 ± 488.3 
sched_dep_time  Mean ± SD              1344.3 ± 467.3 
dep_delay       Mean ± SD                 12.6 ± 40.2 
arr_time        Mean ± SD              1502.1 ± 533.3 
sched_arr_time  Mean ± SD              1536.4 ± 497.5 
arr_delay       Mean ± SD                  6.9 ± 44.6 
carrier         9E                       18460 (5.5%) 
                AA                       32729 (9.7%) 
                AS                         714 (0.2%) 
                B6                      54635 (16.2%) 
                DL                      48110 (14.3%) 
                EV                      54173 (16.1%) 
                F9                         685 (0.2%) 
                FL                        3260 (1.0%) 
                HA                         342 (0.1%) 
                MQ                       26397 (7.8%) 
                OO                          32 (0.0%) 
                UA                      58665 (17.4%) 
                US                       20536 (6.1%) 
                VX                        5162 (1.5%) 
                WN                       12275 (3.6%) 
                YV                         601 (0.2%) 
flight          Mean ± SD             1971.9 ± 1632.5 
tailnum         4044 unique values                    
origin          EWR                    120835 (35.9%) 
                JFK                    111279 (33.0%) 
                LGA                    104662 (31.1%) 
dest            105 unique values                     
air_time        Mean ± SD                150.7 ± 93.7 
distance        Mean ± SD              1039.9 ± 733.2 
hour            Mean ± SD                  13.2 ± 4.7 
minute          Mean ± SD                 26.2 ± 19.3 
time_hour       6936 unique values                    
—————————————————————————————————————————————————————
flights_sample <- slice_sample(flights, n = 2000) # 随机抽取n行 
vis_miss(flights_sample) # 可视化检查缺失值

#* 发现都比较分散,tailnum比较少

# 筛选出tailnum为NA且同一行中还有其他变量也为NA的所有行 
except_tailnum_NA_rows <- flights %>% 
  filter(is.na(tailnum))  %>%  # 筛选 tailnum 为 NA 的行 
  filter(if_any(c(dep_time, arr_time, dep_delay, arr_delay, air_time), is.na)) # 检查指定列是否有 NA 
glimpse(except_tailnum_NA_rows)
Rows: 2,512
Columns: 19
$ year           <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
$ month          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ day            <int> 2, 2, 3, 3, 4, 4, 5, 7, 8, 9, 9, 10, 10, 11, 12, 12, 13…
$ dep_time       <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ sched_dep_time <int> 1545, 1601, 857, 645, 845, 1830, 840, 820, 1645, 755, 1…
$ dep_delay      <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ arr_time       <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ sched_arr_time <int> 1910, 1735, 1209, 952, 1015, 2044, 1001, 958, 1838, 101…
$ arr_delay      <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ carrier        <chr> "AA", "UA", "UA", "UA", "9E", "9E", "9E", "9E", "US", "…
$ flight         <int> 133, 623, 714, 719, 3405, 3716, 3422, 3317, 123, 4023, …
$ tailnum        <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ origin         <chr> "JFK", "EWR", "EWR", "EWR", "JFK", "EWR", "JFK", "JFK",…
$ dest           <chr> "LAX", "ORD", "MIA", "DFW", "DCA", "DTW", "BOS", "BUF",…
$ air_time       <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ distance       <dbl> 2475, 719, 1085, 1372, 213, 488, 187, 301, 529, 569, 14…
$ hour           <dbl> 15, 16, 8, 6, 8, 18, 8, 8, 16, 7, 12, 15, 7, 17, 9, 6, …
$ minute         <dbl> 45, 1, 57, 45, 45, 30, 40, 20, 45, 55, 51, 0, 0, 0, 0, …
$ time_hour      <dttm> 2013-01-02 15:00:00, 2013-01-02 16:00:00, 2013-01-03 0…
#* 发现和缺失tailnum的行数2512完全一样,看起来大多数缺失值都捆绑在一起,可以推测这些航班是被取消了

#1.直接删除 tailnum 含有缺失值的行(如果要分析航班取消的规律则保留)-------------------------------
flights <- flights %>% 
  filter(!is.na(tailnum)) 
colSums(is.na(flights)) # 每列的缺失值数量
          year          month            day       dep_time sched_dep_time 
             0              0              0           5743              0 
     dep_delay       arr_time sched_arr_time      arr_delay        carrier 
          5743           6201              0           6918              0 
        flight        tailnum         origin           dest       air_time 
             0              0              0              0           6918 
      distance           hour         minute      time_hour 
             0              0              0              0 
#* 并且验证了之前

一)2.2.含缺失值的变量分析与可视化

本研究利用R语言中的nycflights13数据集,对2013年纽约机场航班数据进行了系统的缺失值分析。通过数据探查发现,该数据集中的dep_time(起飞时间)、dep_delay(起飞延误)、arr_time(到达时间)、arr_delay(到达延误)和air_time(飞行时间)五个关键变量存在缺失值,但整体缺失率均低于3%,表明数据质量良好。

深入分析揭示了这些缺失值之间存在明确的因果关系:起飞延误(dep_delay)的缺失完全由起飞时间(dep_time)的缺失导致,因为dep_delay是通过dep_time计算得出的;同样地,到达延误(arr_delay)和飞行时间(air_time)的缺失均源于到达时间(arr_time)的缺失。研究还发现,飞行时间(air_time)作为起降时间的差值变量,其缺失可以通过air_time = arr_time - dep_time的公式进行部分推算。

采用横向柱状图直观展示了各变量的缺失比例分布,所有含缺失值的变量其缺失比例均分布在较低水平(0.5%-2.5%)。这些发现为后续的数据清洗和缺失值插补提供了重要依据,建议可以利用变量间的计算关系进行合理的缺失值推算,以提高数据完整性。该分析方法也可推广应用于其他航空数据的质量评估。

#筛选出含NA的变量
NA_variables <- flights %>% 
  summarise(across(everything(), ~ any(is.na(.))))  %>%  # 检查每个变量是否含NA 
  pivot_longer(everything(), names_to = "variable", values_to = "has_na") %>%  # 转换为长格式 
  filter(has_na) %>%  
  select(variable)   
NA_variables <- flights %>%
  select(all_of(NA_variables$variable))  # 选择含 NA 的变量
summary(NA_variables)
    dep_time      dep_delay          arr_time      arr_delay       
 Min.   :   1   Min.   : -43.00   Min.   :   1   Min.   : -86.000  
 1st Qu.: 907   1st Qu.:  -5.00   1st Qu.:1104   1st Qu.: -17.000  
 Median :1401   Median :  -2.00   Median :1535   Median :  -5.000  
 Mean   :1349   Mean   :  12.64   Mean   :1502   Mean   :   6.895  
 3rd Qu.:1744   3rd Qu.:  11.00   3rd Qu.:1940   3rd Qu.:  14.000  
 Max.   :2400   Max.   :1301.00   Max.   :2400   Max.   :1272.000  
 NA's   :5743   NA's   :5743      NA's   :6201   NA's   :6918      
    air_time    
 Min.   : 20.0  
 1st Qu.: 82.0  
 Median :129.0  
 Mean   :150.7  
 3rd Qu.:192.0  
 Max.   :695.0  
 NA's   :6918   
# 计算含NA的变量的NA比例 
NA_variables_proportion <- NA_variables %>%
  summarise(across(everything(), ~ mean(is.na(.))))  %>%
  pivot_longer(everything(), names_to = "variable", values_to = "NA_percent")
# 绘制NA比例柱状图 
ggplot(NA_variables_proportion, aes(x = variable, y = NA_percent)) +
  geom_bar(stat = "identity", width = 0.7, fill = "slategrey", color = "#555555", alpha = 0.8) +  # 绘制柱状图 
  coord_flip() +  # 将坐标轴翻转为横图 
  labs(
    title = "变量缺失值比例", 
    x = "变量", 
    y = "缺失值比例", 
    fill = "变量"
  ) +
  scale_x_discrete(labels = c(
    "dep_time" = "起飞时间 (dep_time)",
    "dep_delay" = "起飞延误 (dep_delay)",
    "arr_time" = "到达时间 (arr_time)",
    "arr_delay" = "到达延误 (arr_delay)",
    "air_time" = "飞行时间 (air_time)")
    ) +  # 将变量名替换为中文 
  scale_y_continuous(limits = c(0, 0.03), breaks = seq(0, 0.3, by = 0.005)) +
  theme(
    plot.title  = element_text(face = "bold", size = 30),
    axis.title  = element_text(face = "bold", size = 20),
    axis.text.x  = element_text(hjust = 0.5, size = 20, face = "bold"),
    axis.text.y  = element_text(size = 15, face = "bold"),
    legend.position  = "none"  # 移除图例 
  )

Tip

删去取消航班仍有NA的变量: dep_time dep_delay arr_time arr_delay air_time
dep_time和 dep_delay的缺失值一样多,根本原因是 dep_time缺失

解释:dep_delay = dep_time - sched_dep_time

arr_delay和 air_time的缺失值一样多,根本原因是 arr_time缺失

解释:arr_delay = arr_time - sched_arr_time,air_time = arr_time - dep_time

air_time ≈ distance/飞机速度speed

可通过 air_time = arr_time - dep_time,求出部分 arr_time

二)探究起飞延迟与到达延迟的关系

二)1.探索性分析与可视化

首先使用slice_sample()进行无偏抽样,确保分析的代表性。随后通过ggplot2绘制散点图,以起飞延误为横轴、到达延误为纵轴,图中特别添加红线(y=x参考线),直观对比实际延误与理论大约1:1关系(即起飞延误约等于到达延误)。

分析结果表明,多数数据点分布在参考线附近,但存在显著离群值(如起飞延误短而到达延误极长的情况),暗示除起飞延误外,航程中天气、航线调整等因素可能进一步影响最终延误。建议后续研究结合飞行距离变量进行分层回归,以量化不同航段延误传递效率的差异。

# arr_delay ~ dep_delay
set.seed(1042)
flights_sample <- slice_sample(flights, n = 5000) # 随机抽取n行 
# arr_delay ~ dep_delay
ggplot(flights_sample, aes(x = dep_delay, y = arr_delay)) +
  geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dotdash", linewidth = 0.8) +  # 添加 y = x的参考线
  geom_point(alpha = 0.2, color = "steelblue") +  # 设置点的透明度和颜色  
  labs(
    title = "到达延误 (arr_delay) ≈ 起飞延误 (dep_delay)", y = "到达延误 (arr_delay)",x = "起飞延误 (dep_delay)",
    subtitle = "y = x"
  ) +
  theme_minimal() +
  theme(
    plot.title  = element_text(face = "bold", size = 30),
    axis.title  = element_text(face = "bold", size = 20),
    axis.text.x  = element_text(hjust = 0.5, size = 20, face = "bold"),
    axis.text.y  = element_text(size = 20, face = "bold.italic"),
    plot.subtitle = element_text(hjust = 0.8, vjust = -20, size = 20, color = "red")  # 坐标轴标题加粗 
  )

二)2.进一步分析与可视化

对航班到达延误与起飞延误的偏离特征进行了系统性分析。通过计算arr_delay与dep_delay的差值(arr_delay_dep_delay_deviation = arr_delay - dep_delay),研究发现该偏离值呈现显著的左偏分布特征(负偏态),其众数为-9分钟,中位数和均值分别为-7分钟和-5.66分钟,标准差达到18.04分钟。分组分析显示,随着起飞延误时间的增加,偏离值的众数和均值均呈现先降后升的非线性趋势。

# 计算偏离程度 
flights <- flights %>%
  mutate(arr_delay_dep_delay_deviation = arr_delay - dep_delay)

# 统计偏离程度 
mode <- function(x) {  #构造函数计算众数
  as.numeric(names(which.max(table(x))) )
}
deviation_stats <- flights %>%
  summarise(
    mean_deviation = mean(arr_delay_dep_delay_deviation, na.rm  = TRUE),
    sd_deviation = sd(arr_delay_dep_delay_deviation, na.rm  = TRUE),
    median_deviation = median(arr_delay_dep_delay_deviation, na.rm  = TRUE),
    mode_deviation = mode(arr_delay_dep_delay_deviation)
  )
deviation_stats 
# A tibble: 1 × 4
  mean_deviation sd_deviation median_deviation mode_deviation
           <dbl>        <dbl>            <dbl>          <dbl>
1          -5.66         18.0               -7             -9
#* 左偏分布(负偏态),mode < median < mean < 0,数据集中在右侧,左侧有长尾

# 按dep_delay分组并计算arr_delay_dep_delay_deviation的均值 
arr_delay_dep_delay_deviation_by_dep_delay <- flights %>%
  group_by(dep_delay) %>%
  reframe(mode_deviation_by_dep_delay = mode(arr_delay_dep_delay_deviation),
          mean_deviation_by_dep_delay = mean(arr_delay_dep_delay_deviation))  # 自动取消分组 

# mode_deviation_by_dep_delay ~ dep_delay
a1 <- ggplot(arr_delay_dep_delay_deviation_by_dep_delay, aes(y = mode_deviation_by_dep_delay, x = dep_delay)) +
  geom_hline(yintercept = 0, color = "black", linetype = "solid", linewidth = 0.5) +
  geom_vline(xintercept = 0, color = "black", linetype = "solid", linewidth = 0.5) +
  geom_point(
    alpha = 0.6, 
    color = "steelblue", 
    size = 2
  ) +
  geom_smooth(
    method = "loess", 
    color = "darkorange", 
    linewidth = 1.5, 
    se = TRUE,  # 置信区间 
    fill = "orange"
  ) +
  labs(
    x = "起飞延误 (dep_delay)", 
    y = "到达延误-起飞延误差值的 众数"
  ) +
  theme_minimal() +
  theme(
    plot.title  = element_text(face = "bold", size = 25),
    axis.title  = element_text(face = "bold", size = 15),
    axis.text.x  = element_text(size = 20, face = "bold"),
    axis.text.y  = element_text(size = 20, face = "bold.italic"), 
    plot.subtitle  = element_text(
      hjust = 0.5,  # 居中 
      size = 20, 
      color = "red"
    )
  )
# mean_deviation_by_dep_delay ~ dep_delay
a2 <- ggplot(arr_delay_dep_delay_deviation_by_dep_delay, aes(y = mean_deviation_by_dep_delay, x = dep_delay)) +
  geom_hline(yintercept = 0, color = "black", linetype = "solid", linewidth = 0.5) +
  geom_vline(xintercept = 0, color = "black", linetype = "solid", linewidth = 0.5) +
  geom_point(
    alpha = 0.6, 
    color = "steelblue", 
    size = 2
  ) +
  geom_smooth(
    method = "loess", 
    color = "darkorange", 
    linewidth = 1.5, 
    se = TRUE,  # 置信区间 
    fill = "orange"
  ) +
  labs(
    x = "起飞延误 (dep_delay)", 
    y = "到达延误-起飞延误差值的 均值"
  ) +
  theme_minimal() +
  theme(
    plot.title  = element_text(face = "bold", size = 25),
    axis.title  = element_text(face = "bold", size = 15),
    axis.text.x  = element_text(size = 20, face = "bold"),
    axis.text.y  = element_text(size = 20, face = "bold.italic"), 
    plot.subtitle  = element_text(
      hjust = 0.5,  # 居中 
      size = 20, 
      color = "red"
    )
  )

wrap_plots(a1, a2, ncol = 2) +
  plot_annotation(
    title = "起飞延误 与 到达延误-起飞延误差值 的关系",
    theme = theme(
      plot.title  = element_text(
        hjust = 0.5,
        face = "bold",           # 加粗 
        size = 26
        )
      )
    )

# 可视化偏离分布 
ggplot(flights, aes(x = arr_delay_dep_delay_deviation)) +
  geom_vline(xintercept = deviation_stats$mode_deviation, color = "red", linetype = "dotdash", linewidth = 0.8) +
  geom_histogram(alpha = 0.9, binwidth = 1, fill = "steelblue") +
  labs(title = "到达延误-起飞延误差值 分布", x = "差值 (deviation = arr_delay - dep_delay)", y = "频数",
       subtitle = "x = - 9") +
  theme_minimal() +
  theme(
    plot.title  = element_text(face = "bold", size = 25),
    axis.title  = element_text(face = "bold", size = 20),
    axis.text.x  = element_text(hjust = 1, size = 20, face = "bold"),
    axis.text.y  = element_text(size = 20, face = "bold.italic"),
    plot.subtitle  = element_text(hjust = 0.38, vjust = -3, size = 20, color = "red"))  # 副标题居中

# 将 arr_delay 换为 dep_delay - mean_deviation -----------------------------------------
flights <- flights %>%
  mutate(arr_delay = ifelse(is.na(arr_delay)  & !is.na(dep_delay),  dep_delay + deviation_stats$mean_deviation, arr_delay))
colSums(is.na(flights)) # 每列的缺失值数量
                         year                         month 
                            0                             0 
                          day                      dep_time 
                            0                          5743 
               sched_dep_time                     dep_delay 
                            0                          5743 
                     arr_time                sched_arr_time 
                         6201                             0 
                    arr_delay                       carrier 
                         5743                             0 
                       flight                       tailnum 
                            0                             0 
                       origin                          dest 
                            0                             0 
                     air_time                      distance 
                         6918                             0 
                         hour                        minute 
                            0                             0 
                    time_hour arr_delay_dep_delay_deviation 
                            0                          6918 

三)探究飞行速度与其他变量的关系

三)1.探究飞行速度与飞行距离的关系与可视化

三)1.1探究飞行速度与飞行距离的关系与可视化

在航空运输领域,飞行速度与飞行距离的关系研究对于优化航线规划、提升运营效率具有重要意义。基于 nycflights13 数据集,本研究通过公式speed = (distance / air_time) * 60将航班飞行距离(distance)和飞行时间(air_time)转化为平均速度(单位:mph)。为降低数据维度、提升分析效率,从处理后的航班数据中随机抽取 5000 条记录构建分析样本,具体代码如下:

# 计算平均速度 
flights <- flights %>%
  mutate(speed = (distance / air_time) * 60)  # 速度单位:mph 
# speed ~ distance---------------------------------------------------------------------
flights_sample <- slice_sample(flights, n = 5000) # 随机抽取n行 
# speed ~ distance
ggplot(flights_sample, aes(x = distance, y = speed)) +
  geom_point(alpha = 0.3, color = "steelblue") +  # 设置点的透明度和颜色 
  stat_smooth(method = "lm", formula = y ~ log(x), color = "red3", fill = "red") +  # 对数回归曲线 
  labs(
    title = " 平均速度 (mph) ~ 飞行距离 (miles)", 
    y = "平均速度 (mph)",
    x = "飞行距离 (miles)"
  ) +
  theme_minimal() +
  theme(
    plot.title  = element_text(face = "bold", size = 30),
    axis.title  = element_text(face = "bold", size = 20),
    axis.text.x  = element_text(hjust = 0.5, size = 20, face = "bold"),
    axis.text.y  = element_text(size = 20, face = "bold.italic"),
  )

分析结果表明飞行速度与距离间存在非线性关系,为后续航线优化提供数据支撑。 ## 三)2.探究飞行速度与机型的关系与可视化

三)1.2进一步探究飞行速度与飞行距离的关系与可视化

下方为速度水平与距离等级之间的流动关系丧基图图,展示了飞行速度与飞行距离两个类别间的数量流动关系;线条宽度可以直观地表示不同类别间的数量差异。

library(ggalluvial)

flights2 <- flights |> 
  mutate(across(c(speed), 
                ~ cut(.x, 
                      breaks = c(-Inf, 
                                 quantile(.x, 0.25, na.rm = TRUE),
                                 quantile(.x, 0.50, na.rm = TRUE), 
                                 quantile(.x, 0.75, na.rm = TRUE), 
                                 Inf), 
                      labels = c("lower", "low", "high", "higher")), 
                .names = "{.col}_level")) |> 
  mutate(across(c(distance), 
                ~ cut(.x, 
                      breaks = c(-Inf, 
                                 quantile(.x, 0.25, na.rm = TRUE), 
                                 quantile(.x, 0.50, na.rm = TRUE),
                                 quantile(.x, 0.75, na.rm = TRUE),
                                 Inf), 
                      labels = c("closer", "close", "far", "farther")), 
                .names = "{.col}_level")) |> 
  relocate(speed, speed_level, distance, distance_level)

sanky_data <- flights2 |> 
  dplyr::select(distance_level, speed_level) |> 
  pivot_longer(cols = c(distance_level),
               names_to = "distance_level",
               values_to = "level1"
  ) |> 
  pivot_longer(cols = c(speed_level),
               names_to = "speed_level",
               values_to = "level2"
  )

sanky_data1 <- sanky_data |> 
  dplyr::group_by(level1, level2) |>  # 按变量自由组合分组
  dplyr::summarise(count = n(), .groups = 'drop') # 计算每组频数

sanky_data1 <- sanky_data1 |> 
  filter(level2 != "NA")

ggplot(sanky_data1, aes(y = count,
                        axis1 = level1,
                        axis2 = level2)) +
  geom_alluvium(aes(fill = level1, alpha = count)) +
  geom_stratum(alpha = 0.4, width = 1/3, size = 1) +
  geom_text(stat = "stratum", aes(label = after_stat(stratum))) +
  scale_x_discrete(limits = c("distance", "speed")) +
  scale_alpha_continuous(range = c(0.4, 1)) +
  coord_flip() +
  theme_light() +
  theme(
    plot.title   = element_text(face = "bold", size = 30),  # 标题加粗 
    axis.title   = element_text(face = "bold", size = 20),  # 坐标轴标题加粗 
    axis.text.x   = element_text(hjust = 0.5, vjust = 6, size = 15, face = "bold"),
    axis.text.y   = element_text(size = 20, face = "bold.italic"), 
    legend.position  = "none"  # 移除图例 
  )

ggplot(sanky_data1, aes(y = count,
                        axis1 = level1,
                        axis2 = level2)) +
  geom_alluvium(aes(fill = level2, alpha = count)) +
  geom_stratum(alpha = 0.4, width = 1/3, size = 1) +
  geom_text(stat = "stratum", aes(label = after_stat(stratum))) +
  scale_x_discrete(limits = c("distance", "speed")) +
  scale_alpha_continuous(range = c(0.4, 1)) +
  scale_fill_manual(values = c("forestgreen", "blue", "tomato", "yellow3")) +
  coord_flip() +
  theme_light() +
  theme(
    plot.title   = element_text(face = "bold", size = 30),  # 标题加粗 
    axis.title   = element_text(face = "bold", size = 20),  # 坐标轴标题加粗 
    axis.text.x   = element_text(hjust = 0.5, vjust = 6, size = 15, face = "bold"),
    axis.text.y   = element_text(size = 20, face = "bold.italic"), 
    legend.position  = "none"  # 移除图例 
  )

三)2.连接flights, planes数据表并检查缺失值

三)2.1.连接flights, planes数据表并检查缺失值

为探究机型对飞行速度的影响,本研究整合航班运营数据(flights 表)与飞机基础信息数据(planes 表)。将 planes 表中year字段重命名为manufacture_year,并删除冗余的speed列,消除数据干扰。通过tailnum字段执行左连接操作,形成包含航班运行与机型特征的综合数据集flights_planes

planes <- nycflights13::planes %>% # 重命名 planes 表中的 year 变量为 manufacture_year 
  rename(manufacture_year = year)
plane_sample <- slice_sample(planes, n = 5000) # 随机抽取n行 
vis_miss(plane_sample) # 可视化检查缺失值

planes <- planes %>% # 删除本来的speed
  select(-speed)

# 合并 flights 与 planes(通过 tailnum)
flights_planes <- flights %>%
  left_join(planes, by = "tailnum")
colSums(is.na(flights_planes)) # 每列的缺失值数量 
                         year                         month 
                            0                             0 
                          day                      dep_time 
                            0                          5743 
               sched_dep_time                     dep_delay 
                            0                          5743 
                     arr_time                sched_arr_time 
                         6201                             0 
                    arr_delay                       carrier 
                         5743                             0 
                       flight                       tailnum 
                            0                             0 
                       origin                          dest 
                            0                             0 
                     air_time                      distance 
                         6918                             0 
                         hour                        minute 
                            0                             0 
                    time_hour arr_delay_dep_delay_deviation 
                            0                          6918 
                        speed              manufacture_year 
                         6918                         55400 
                         type                  manufacturer 
                        50094                         50094 
                        model                       engines 
                        50094                         50094 
                        seats                        engine 
                        50094                         50094 
flights_planes_sample <- slice_sample(flights_planes, n = 5000) # 随机抽取n行 
vis_miss(flights_planes_sample) # 可视化检查缺失值

对整合后的数据进行缺失值分析,先后对 planes 表、合并后的flights_planes表进行随机抽样(样本量 5000),并使用vis_miss函数可视化缺失模式。分析显示,整合过程未引入系统性缺失问题,数据完整性满足后续分析需求。

三)2.2.分析可视化

基于flights_planes数据集,按机型分组计算平均飞行速度,构建model_speed数据集。采用水平柱状图可视化不同机型的速度分布,柱状图以机型为横轴、平均速度为纵轴,填充色区分机型类别。图表经坐标翻转、网格线优化等处理,突出不同机型间的速度差异。运用单因素方差分析(ANOVA)评估机型对飞行速度的影响

# model_mean_speed ~ model---------------------------------------------------------------------
# 按 model 分组计算平均速度 
model_speed <- flights_planes %>%
  group_by(model) %>%
  summarise(model_mean_speed = mean(speed, na.rm = TRUE))
# model_mean_speed ~ model
ggplot(model_speed, aes(x = reorder(model, model_mean_speed), y = model_mean_speed, fill = model)) +
  geom_bar(width = 0.6, stat = "identity") +
  coord_flip() +  # 水平柱状图 
  labs(
    title = "各机型平均速度分布图",
    y = "平均速度 (mph)",
    x = "机型 (model)"
  ) +
  theme_minimal() +
  theme(
    plot.title  = element_text(face = "bold", size = 30),
    axis.title  = element_text(face = "bold", size = 20),
    axis.text.x  = element_text(hjust = 0.5, size = 20, face = "bold"),
    axis.text.y  = element_text(size = 8, face = "bold.italic"),
    legend.position  = "none",
    panel.grid.major.y = element_blank(),  # 去除主要水平网格线
    panel.grid.minor.y = element_blank()   # 去除次要水平网格线
  )

anova_speed_model_result <- aov(speed ~ model, data = flights_planes)
summary(anova_speed_model_result)
                Df    Sum Sq Mean Sq F value Pr(>F)    
model          126 435397631 3455537    1554 <2e-16 ***
Residuals   278890 620244978    2224                   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
55247 observations deleted due to missingness

结果表明机型变量对飞行速度存在显著影响(anova_speed_model_result),为航空公司机型选型与航线适配提供量化依据。

三)3.探究飞行速度与天气的关系

三)3.1. 连接flights, weather数据表并检查缺失值

气象条件对飞行速度的影响是航空运行研究的重要方向。本研究将航班数据集(flights)与气象数据集(weather)通过机场代码(origin)和时间戳(time_hour)进行关联,构建包含航班运行与气象信息的flights_weather数据集。对 weather 表执行数据清洗,删除wind_gust列,并通过随机抽样与缺失值可视化,验证合并后数据的质量稳定性

# 合并 flights 与 weather (通过 origin, time_hour)
weather <- nycflights13::weather
weather_sample <- slice_sample(weather, n = 5000) # 随机抽取n行 
vis_miss(weather_sample) # 可视化检查缺失值

weather <- weather %>% # 删除wind_gust
  select(-wind_gust)

flights_weather <- flights %>%
  left_join(weather, by = c("origin", "time_hour"))
colSums(is.na(flights_weather)) # 每列的缺失值数量 
                       year.x                       month.x 
                            0                             0 
                        day.x                      dep_time 
                            0                          5743 
               sched_dep_time                     dep_delay 
                            0                          5743 
                     arr_time                sched_arr_time 
                         6201                             0 
                    arr_delay                       carrier 
                         5743                             0 
                       flight                       tailnum 
                            0                             0 
                       origin                          dest 
                            0                             0 
                     air_time                      distance 
                         6918                             0 
                       hour.x                        minute 
                            0                             0 
                    time_hour arr_delay_dep_delay_deviation 
                            0                          6918 
                        speed                        year.y 
                         6918                          1542 
                      month.y                         day.y 
                         1542                          1542 
                       hour.y                          temp 
                         1542                          1559 
                         dewp                         humid 
                         1559                          1559 
                     wind_dir                    wind_speed 
                         9743                          1620 
                       precip                      pressure 
                         1542                         38079 
                        visib 
                         1542 
flights_weather_sample <- slice_sample(flights_weather, n = 5000) # 随机抽取n行 
vis_miss(flights_weather_sample) # 可视化检查缺失值

三)3.2. 探究飞行速度与天气的关系与可视化

提取 weather 表中除时间、地点相关字段外的气象变量,通过循环遍历构建热力图矩阵,展示各气象要素与飞行速度的二维分布关系。热力图采用六边形分箱(stat_binhex),以颜色梯度表征数据密度,直观呈现变量间的相关性模式

# 遍历 weather 表中的所有变量
weather_vars <- names(weather) %>% 
  setdiff(c("origin", "time_hour", "year", "month", "day", "hour")) # 排除不需要的列 
# 创建一个空列表,用于存储所有热力图 
speed_weather_plot_list <- list() #创建列表
# 生成所有图形并存储到列表中 
for (var in weather_vars) {
  p <- ggplot(flights_weather_sample, aes(x = .data[[var]], y = speed)) +
    stat_binhex(bins = 30) +  
    scale_fill_gradient(low = "lightblue", high = "red") +
    labs(
      title = paste("平均速度 ~", var),
      y = "平均速度 (mph)",
      x = paste(var, "(单位根据变量而定)")
    ) +
    theme_minimal()
  
  speed_weather_plot_list[[var]] <- p +
    theme(plot.background  = element_rect(fill = "#CCCCCC", color = "white", size = 1),
          legend.key.width  = unit(0.1, "cm"))
}
# 使用 patchwork 将所有图形组合并显示 
wrap_plots(speed_weather_plot_list, ncol = 3)  # 每行显示n个图形 

三)3.3. 进一步探究飞行速度与天气的关系与可视化

为进一步揭示气象变量与飞行速度的关系,针对每个气象变量实施动态分箱策略:离散变量按唯一值分箱,连续变量采用分位数划分。在此基础上进行分层抽样,确保各分箱区间样本均衡。重新构建热力图并添加局部加权回归曲线(loess),深入刻画变量间的非线性关系

# 遍历所有天气变量 
for (var in weather_vars) {
  # --- 动态分箱策略 --- 
  unique_values <- unique(flights_weather[[var]])
  if (length(unique_values) <= 5) {
    breaks <- unique_values  # 离散变量直接分箱(如雾:0/1)
  } else {
    breaks <- unique(quantile(flights_weather[[var]], probs = seq(0, 1, 0.2), na.rm  = TRUE))  # 连续变量分位数分箱 
  }
  
  # --- 分层抽样 --- 
  flights_weather_sample <- flights_weather %>%
    mutate(var_bin = cut(.data[[var]], breaks, include.lowest  = TRUE)) %>%
    group_by(var_bin) %>%
    sample_n(100, replace = TRUE) %>%  # 允许重复抽样(小样本时必需)
    ungroup()
  
  # --- 生成热力图(速度分析) --- 
  p <- ggplot(flights_weather_sample, aes(x = .data[[var]], y = speed)) +
    stat_binhex(bins = 30) +  # 六边形热力图 
    scale_fill_gradient(low = "lightblue", high = "red") +  # 颜色渐变 
    geom_smooth(method = "loess", color = "black", linewidth = 0.5, se = FALSE) +  # 添加拟合线 
    labs(
      title = paste("平均速度 ~", var),  # 动态标题 
      y = "平均速度 (mph)",  # Y轴标签 
      x = paste(var, "(单位根据变量而定)")  # X轴标签 
    ) +
    theme_minimal() +  # 简约主题 
    theme(
      plot.background  = element_rect(fill = "#CCCCCC", color = "white", size = 1),  # 灰色背景+白色边框 
      legend.key.width  = unit(0.1, "cm")  # 调整图例宽度 
    )
  
  # 存储图形 
  speed_weather_plot_list[[var]] <- p
}

# 使用 patchwork 将所有图形组合并显示 
wrap_plots(speed_weather_plot_list, ncol = 3)  # 每行显示3个图形 

#* 没啥波动,应该极端天气本来就不会起飞

分析结果显示,在常规气象条件下,飞行速度波动较小,侧面印证极端天气对航班运行的显著影响。

三)用众数填充model缺失值

针对flights_planes数据集中机型(model)变量的缺失值,采用众数插补策略。通过计算机型变量的频数分布,获取出现频率最高的机型作为众数,并将缺失值替换为该众数。此方法在保留数据分布特征的同时,提升数据集完整性,为后续基于机型的深度分析奠定基础

#2.用众数填补 model 含有缺失值的数据(?)-------------------------------------------
mode_model <- names(sort(table(flights_planes$model), decreasing = TRUE))[1]
flights_planes <- flights_planes %>%
  mutate(model = ifelse(is.na(model),  mode_model, model))

四)拟合模型预测填补speed缺失值

围绕航班数据缺失值处理展开了严谨且系统的操作。先是将数据划分为训练集与预测集,针对 `speed` 缺失值,采用线性回归模型,根据前面已知`distance`与速度的大概分布,以 `log(distance)` 和 `model` 为自变量进行拟合,并绘制 QQ 图检验正态性,确保预测合理。接着用拟合模型预测 `speed` 缺失值并填充速度,合并数据集。

#3.预测 speed 含有缺失值的数据-----------------------------------
# 训练集:已知 speed 
train_data <- flights_planes %>%
  filter(!is.na(speed)) 
# 预测集:speed 缺失 
predict_data <- flights_planes %>%
  filter(is.na(speed)) 
# 拟合模型 
model_speed <- lm(speed ~ log(distance) + model, data = train_data) #考虑到航空公司 (carrier)是间接影响,不纳入
#*会自动将分类变量转换为虚拟变量

plot(model_speed, which = 2)  # QQ 图(正态性检验)

# 预测 speed 缺失值 
predicted_speed <- predict(model_speed, newdata = predict_data)
# 将预测值填充到原数据中 
predict_data <- predict_data %>%
  mutate(speed = predicted_speed)
# 合并预测结果 
flights_planes <- bind_rows(train_data, predict_data)

colSums(is.na(flights_planes)) # 每列的缺失值数量 
                         year                         month 
                            0                             0 
                          day                      dep_time 
                            0                          5743 
               sched_dep_time                     dep_delay 
                            0                          5743 
                     arr_time                sched_arr_time 
                         6201                             0 
                    arr_delay                       carrier 
                         5743                             0 
                       flight                       tailnum 
                            0                             0 
                       origin                          dest 
                            0                             0 
                     air_time                      distance 
                         6918                             0 
                         hour                        minute 
                            0                             0 
                    time_hour arr_delay_dep_delay_deviation 
                            0                          6918 
                        speed              manufacture_year 
                            0                         55400 
                         type                  manufacturer 
                        50094                         50094 
                        model                       engines 
                            0                         50094 
                        seats                        engine 
                        50094                         50094 

填充 `air_time` 和 `arr_time`,依据 `air_time` 与 `distance`、`speed_minute` 以及 `arr_time` 与 `dep_time`、`air_time` 的关系,分别计算并填充 `air_time` 和 `arr_time`。

flights_planes <- flights_planes %>%  # 将速度转换为英里/分钟 
  mutate(speed_minute = speed / 60)
#4.计算 air_time 含有缺失值的数据-----------------------------------
#* air_time ≈ distance/speed_minute
# 计算缺失的 arr_time 
flights_planes <- flights_planes %>%
  mutate(air_time = ifelse(is.na(air_time),  distance / speed_minute, air_time))
#5.计算 arr_time 含有缺失值的数据-----------------------------------
#*可通过 air_time = arr_time - dep_time,求出部分 arr_time
# 计算缺失的 arr_time 
flights_planes <- flights_planes %>%
  mutate(arr_time = ifelse(is.na(arr_time),  dep_time + (distance / speed_minute), arr_time))

colSums(is.na(flights_planes)) # 每列的缺失值数量 
                         year                         month 
                            0                             0 
                          day                      dep_time 
                            0                          5743 
               sched_dep_time                     dep_delay 
                            0                          5743 
                     arr_time                sched_arr_time 
                         5743                             0 
                    arr_delay                       carrier 
                         5743                             0 
                       flight                       tailnum 
                            0                             0 
                       origin                          dest 
                            0                             0 
                     air_time                      distance 
                            0                             0 
                         hour                        minute 
                            0                             0 
                    time_hour arr_delay_dep_delay_deviation 
                            0                          6918 
                        speed              manufacture_year 
                            0                         55400 
                         type                  manufacturer 
                        50094                         50094 
                        model                       engines 
                            0                         50094 
                        seats                        engine 
                        50094                         50094 
                 speed_minute 
                            0 

五)探究起飞延误与各变量的关系

五)1.探索性分析与可视化

通过绘制起飞延误时间的箱线图与直方图并组合展示,对航班起飞延误时间开展了深入可视化分析。箱线图呈现数据分布范围、中位数、四分位数等信息,凸显离散程度与异常值;直方图以频数分布展现不同延误时间区间的航班数量。组合图形全面呈现出起飞延误时间呈大数据长尾分布,绝大多数航班延误时间短,少部分航班超长延迟。此分析为深入理解航班延误规律、后续数据分析及决策提供了有力支撑。

# 处理dep_delay异常值------------------------------------------
# dep_delay分布箱线图 
dep_delay_boxplot_plot <- ggplot(flights, aes(y = dep_delay)) +
  geom_boxplot(fill = "steelblue", color = "black") +
  labs(title = "起飞延误时间箱线图", y = "延误时间(分钟)") +
  theme_minimal() +
  theme(
    axis.title.x  = element_blank(),  # 去除 x 轴标题 
    axis.text.x  = element_blank(),   # 去除 x 轴刻度标签 
    axis.ticks.x  = element_blank(),   # 去除 x 轴刻度线 
    axis.text.y  = element_text(size = 10, face = "bold.italic"),
    plot.title  = element_text(face = "bold", size = 15),
    axis.title  = element_text(face = "bold", size = 15)
  )
# dep_delay分布直方图
dep_delay_histogram_plot <- ggplot(flights, aes(x = dep_delay)) +
  geom_histogram(binwidth = 20, fill = "steelblue") +
  labs(title = "起飞延误时间分布图", x = "延误时间(分钟)", y = "频数") +
  theme_minimal() +
  theme(
    plot.title  = element_text(face = "bold", size = 15),
    axis.title  = element_text(face = "bold", size = 15),
    axis.text.x  = element_text(hjust = 0.5, size = 10, face = "bold"),
    axis.text.y  = element_text(size = 10, face = "bold.italic")
  )

dep_delay_plot <- dep_delay_boxplot_plot + dep_delay_histogram_plot 
dep_delay_plot 

#* 发现大数据长尾分布,绝大多数是没什么延迟,有少部分是超长延迟

五)2.探究起飞延误与机场的关系

对 `flights` 数据按出发地机场分组,计算各机场平均起飞延误时间,展现出不同机场航班延误的差异。结果显示 EWR 机场平均延误最高,LGA 最低,为了解机场运营、航班调度效率提供了关键依据,有助于探究延误因素并制定改进策略。

# dep_delay ~ origin--------------------------
flights %>%
  group_by(origin) %>%
  summarise(mean_dep_delay = mean(dep_delay, na.rm  = TRUE))
# A tibble: 3 × 2
  origin mean_dep_delay
  <chr>           <dbl>
1 EWR              15.1
2 JFK              12.1
3 LGA              10.3
#* EWR机场平均延误最高,LGA最低

五)3.探究起飞延误与一天中起飞时间的关系与可视化

五)3.1.探究起飞延误与一天中起飞时间的关系与可视化

先从 `flights` 数据中提取计划起飞小时,计算各小时平均起飞延误,接着绘制峰曲线图展示不同计划起飞小时的平均延误变化。图中折线和点图结合,点的颜色随平均延误值渐变,清晰呈现出傍晚平均延误高、早上凌晨延误低的趋势,为理解航班起飞延误与计划起飞时间的关系提供有力支持,助力探究延误规律与制定合理调度策略。

# dep_delay ~ hour-------------------------
# 计算每小时平均延误 
hour_dep_delay <- flights %>%
  mutate(hour = sched_dep_time %/% 100) %>%  # 提取计划起飞小时 
  group_by(hour) %>%
  summarise(avg_delay = mean(dep_delay, na.rm  = TRUE))

# 绘制峰曲线图 
ggplot(hour_dep_delay, aes(x = hour, y = avg_delay)) +
  geom_line(color = "steelblue", linewidth = 0.8) +  
  geom_point(aes(fill = avg_delay), shape = 21, color = "steelblue", size = 3, alpha = 0.95) + 
  scale_x_continuous(breaks = 0:23) + 
  scale_fill_gradient(low = "blue4", high = "red") +
  labs(
    title = "不同计划起飞小时的平均起飞延误",
    x = "计划起飞小时", 
    y = "平均起飞延误 (dep_delay)"
  ) +
  theme_minimal() +
  theme(
    plot.title  = element_text(face = "bold", size = 30),
    axis.title  = element_text(face = "bold", size = 20),
    axis.text.x  = element_text(hjust = 0.5, size = 20, face = "bold"),
    axis.text.y  = element_text(size = 20, face = "bold.italic"), 
    plot.subtitle  = element_text(hjust = 0.8, vjust = -20, size = 20, color = "red"),
    panel.grid.minor  = element_blank()   # 去除次网格线 
  )

#* 傍晚平均延误较高,早上凌晨延误很低

五)3.2.进一步探究起飞延误与一天中起飞时间的关系与可视化

围绕起飞延误与天气的关系展开分析,先从`flights_weather`数据集随机抽取4000行样本,筛选出关键天气变量,随后针对每个变量在样本数据上绘制热力图,通过六边形分箱展示起飞延误与天气变量的分布关系,以颜色渐变呈现数据密集程度,并动态设置标题与标签,应用简洁主题并添加浅蓝色背景。最后用`patchwork`将热力图按3列布局组合展示。但由于数据分布不均,好天气与低延误样本占比过高,导致热力图视觉上趋于集中,难以有效挖掘天气变量与起飞延误间的潜在关联,凸显出优化数据处理策略的必要性。

# dep_delay ~ weather----------------------------------------------------------
# 随机抽取 4000 行数据 
flights_weather_sample <- slice_sample(flights_weather, n = 4000)

# 遍历 weather 表中的所有变量,排除不需要的列 
weather_vars <- names(weather) %>% 
  setdiff(c("origin", "time_hour", "year", "month", "day", "hour"))

# 创建一个空列表,用于存储所有热力图 
dep_delay_weather_plot_list <- list()

# 生成所有图形并存储到列表中 
for (var in weather_vars) {
  p <- ggplot(flights_weather_sample, aes(x = .data[[var]], y = dep_delay)) +
    stat_binhex(bins = 30) +  # 使用热力图 
    scale_fill_gradient(low = "lightyellow", high = "#154415") +  # 设置颜色渐变 
    labs(
      title = paste("起飞延误 ~", var),  # 动态设置标题 
      y = "起飞延误 (分钟)",
      x = paste(var, "(单位根据变量而定)")
    ) +
    theme_minimal()
  
  dep_delay_weather_plot_list[[var]] <- p +
    theme(plot.background  = element_rect(fill = "lightblue", color = "white", size = 1))
}

# 使用 patchwork 将所有图形组合并显示 
wrap_plots(dep_delay_weather_plot_list, ncol = 3)

#* 但是数据分布不均衡(好天气样本多、延误低的样本占主导),图都成一团
#* 这会导致热力图无法有效揭示天气变量与延误之间的潜在关联了

五)4.进一步探究起飞延误与天气的关系与可视化

通过一系列专业步骤深入分析起飞延误与天气变量的关系。首先针对每个天气变量进行动态分箱,根据唯一值数量确定合适分箱边界,再基于分箱进行分层抽样,从各分箱中抽取等量数据以解决数据分布不均问题。然后利用抽样后的数据绘制热力图,添加拟合线展示变量间趋势,以颜色渐变体现数据密集程度。最后将所有热力图按3列布局组合展示,形成一套逻辑紧密、全面可靠的分析流程,为研究天气对航班延误的影响提供有力支持。

# 遍历所有天气变量
for (var in weather_vars) {
  # --- 第一步:动态分箱策略 --- 
  # 检查当前变量的唯一值数量(例如:降水可能只有0和5两种值)
  unique_values <- unique(flights_weather[[var]])
  
  
  if (length(unique_values) <= 5) {
    breaks <- unique_values # 如果变量值种类≤5种,直接按实际值分箱 
  } else {
    # 变量值种类多时,按20%分位数切分5个区间 
    # 例如:温度可能分为[低温, 中低温, 中温, 中高温, 高温]
    # unique()用于防止分位数重复(如80%分位和100%分位值相同)
    breaks <- unique(quantile(flights_weather[[var]], 
                              probs = seq(0, 1, 0.2), 
                              na.rm  = TRUE))  # 忽略缺失值 
  }
  
  # --- 第二步:分层抽样 --- 
  flights_weather_sample <- flights_weather %>%
    # 创建分箱标签列(var_bin)
    mutate(
      var_bin = cut(
        .data[[var]], 
        breaks = breaks,  # 使用上一步计算的分箱边界 
        include.lowest  = TRUE  # 包含最小值所在的区间 
      )
    ) %>%
    # 按分箱结果分组,每组抽取100条数据 
    group_by(var_bin) %>%
    sample_n(500, replace = TRUE) %>%  # replace = TRUE 可重复抽样
    ungroup()  # 取消分组状态 
  
  # --- 第三步:生成热力图 --- 
  p <- ggplot(flights_weather_sample, aes(x = .data[[var]], y = dep_delay)) +
    stat_binhex(bins = 30) +  # 六边形热力图
    scale_fill_gradient(low = "lightyellow", high = "#154415") + 
    geom_smooth(method = "loess", color = "red", linewidth = 0.5, se = FALSE) +  # 添加拟合线 
    labs(
      title = paste("起飞延误 ~", var), 
      y = "起飞延误 (dep_delay)",  
      x = paste(var, "(单位根据变量而定)") 
    ) +
    theme_minimal() + 
    theme(plot.background  = element_rect(fill = "lightblue", color = "white", size = 1))
  
  # 存储图形到列表(按变量名作为键)
  dep_delay_weather_plot_list[[var]] <- p 
}

# 使用 patchwork 将所有图形组合并显示 
wrap_plots(dep_delay_weather_plot_list, ncol = 3)

六)探究起飞延迟与机型,航空公司,月份的关系

六)1.探究机型与航空公司的关系与可视化

围绕航空公司(`carrier`)与机型(`model`)的关系进行数据处理与可视化。先对 `flights_planes` 数据按 `carrier` 和 `model` 分组统计各机型在各航空公司的出现数量,接着计算每个航空公司使用的所有机型总数,筛选出使用频率前三的机型并计算其在对应航空公司中的占比。绘制堆叠柱状图将各航空公司使用频率前三的机型占比以柱子堆叠形式展示,柱子上标注机型名称,方便直观比较各航空公司对不同机型的偏好。

# model ~ carrier---------------------------------------------------------------
#1.汇总数据:统计每个 carrier 对应的 model 数量 
carrier_model_count <- flights_planes %>%
  group_by(carrier, model) %>%
  summarise(count = n(), .groups = 'drop')
#2.计算每个 carrier 最多使用的 model 数量及其比例 
carrier_total <- carrier_model_count %>% #提取出各机型在各航空公司的数量
  group_by(carrier) %>%
  summarise(total = sum(count), .groups = 'drop')
carrier_most_model <- carrier_model_count %>% #提取出前三频率的
  group_by(carrier) %>%
  slice_max(order_by = count, n = 3) %>% 
  ungroup()
carrier_most_model <- carrier_most_model %>% #合并
  left_join(carrier_total, by = "carrier") %>%
  mutate(carrier_model_ratio = count / total)
#3. model ~ carrier
ggplot(carrier_most_model, aes(x = carrier, y = carrier_model_ratio, fill = model)) +
  geom_bar(stat = "identity", position = "stack", color = "#777777") +  # 绘制堆叠柱状图 
  geom_text(aes(label = model), 
            position = position_stack(vjust = 0.5), size = 4, angle = 30, color = "black", alpha = 0.8) +  # 仅添加机型名称 
  labs(
    title = "航空公司 (carrier) 和其前三频率使用 机型 (model) 的关系图",
    x = "航空公司 (carrier)",
    y = "占对应公司使用机型比例",
  ) +
  theme_minimal() +  # 使用简洁主题 
  theme(
    plot.title  = element_text(face = "bold", size = 20),  # 标题加粗 
    axis.title  = element_text(face = "bold", size = 20),  # 坐标轴标题加粗 
    axis.text.x  = element_text(hjust = 0.5, size = 20, face = "bold"),
    axis.text.y  = element_text(size = 20, face = "bold.italic"),
    legend.position  = "none"  # 移除图例 
  ) +
  scale_y_continuous(labels = scales::percent)  # 将 y 轴标签显示为百分比 

六)2.探究起飞延误与航空公司的关系与可视化

六)2.1.探究起飞延误与航空公司的关系与可视化

通过系统的数据处理与可视化分析流程,深入探究航空公司航班延误情况。运用`count`函数对`flights`数据按`carrier`分组,精准统计各航空公司航班数量;借助`summarise`函数计算平均出发延误时间及标准差,量化延误特征。利用`ggplot`绘制的柱状图,以`carrier`为分类轴、平均延误时间为度量轴,通过简洁专业的可视化呈现,直观展现各航空公司平均延误差异。配合`aov`函数进行方差分析,从统计学层面检验不同航空公司间出发延误时间的显著性差异。整套分析流程完整且严谨,将数据统计、可视化呈现与假设检验相结合,为航空公司运营状况评估提供了科学的数据支撑与分析视角。

# carrier_mean_dep_delay ~ carrier carrier----------------------------------------------------------------
carrier_counts <- flights %>%
  count(carrier, name = "count")
carrier_counts
# A tibble: 16 × 2
   carrier count
   <chr>   <int>
 1 9E      17416
 2 AA      32645
 3 AS        714
 4 B6      54635
 5 DL      48110
 6 EV      54173
 7 F9        682
 8 FL       3260
 9 HA        342
10 MQ      26395
11 OO         32
12 UA      57979
13 US      19873
14 VX       5162
15 WN      12245
16 YV        601
# 按 carrier 分组计算平均延误 
carrier_dep_delay <- flights %>%
  group_by(carrier) %>%
  summarise(carrier_mean_dep_delay = mean(dep_delay, na.rm  = TRUE),
            carrier_sd_dep_delay = sd(dep_delay, na.rm  = TRUE))
# carrier_mean_dep_delay ~ carrier
ggplot(carrier_dep_delay, aes(x = carrier, y = carrier_mean_dep_delay, fill = carrier)) +
  geom_bar(stat = "identity", color = "black") +  # 绘制柱状图 
  labs(
    title = "各航空公司飞机平均延误分布图",
    y = "平均延误 (minute)",
    x = "航空公司 (carrier)"
  ) +
  theme_minimal() +
  theme(
    plot.title  = element_text(face = "bold", size = 30),  # 标题加粗 
    axis.title  = element_text(face = "bold", size = 20),  # 坐标轴标题加粗 
    axis.text.x  = element_text(hjust = 0.5, vjust = 6, size = 20, face = "bold"),
    axis.text.y  = element_text(size = 20, face = "bold.italic"),
    legend.position  = "none"  # 移除图例 
  )

#*相差很大!
anova_dep_delay_carrier_result <- aov(dep_delay ~ carrier, data = flights)
summary(anova_dep_delay_carrier_result)
                Df    Sum Sq Mean Sq F value Pr(>F)    
carrier         15   6348034  423202   264.9 <2e-16 ***
Residuals   328505 524819199    1598                   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
5743 observations deleted due to missingness
#* F 值:反映组间差异相对于组内差异的大小,F = 组间变异 / 组内变异
#* p 值:判断组间差异是否具有统计学意义,通常 p < 0.05 可以拒绝原假设,认为组间差异显著

六)2.2.进一步探究起飞延误与航空公司的关系与可视化

将 dep_delay 按 [-60, 0, 60, 360, 720, 1080] 分组创建 bin 变量,过滤掉为缺失值的数据。

统计 carrier 的比例,绘制堆叠柱状图。

flights |> 
  mutate(bin = cut(dep_delay, breaks = c(-60, 0, 60, 360, 720, 1080))) |> 
  filter(!is.na(bin)) |>
  ggplot(aes(x = bin, fill = as.factor(carrier))) +
  geom_bar(position = "fill", color = "black", alpha = 1) +
  scale_y_continuous(labels = scales::percent) +
  labs(x = "dep_delay", y = "Proportion") +
  theme_minimal()  +
  theme(
    plot.title   = element_text(face = "bold", size = 30),  # 标题加粗 
    axis.title   = element_text(face = "bold", size = 20),  # 坐标轴标题加粗 
    axis.text.x   = element_text(hjust = 0.5, vjust = 6, size = 20, face = "bold"),
    axis.text.y   = element_text(size = 20, face = "bold.italic"), 
    legend.position  = "none"  # 移除图例 
  )

可以发现延误在 [720, 1080] 区间的航空公司只有9E, AA, DL, F9, MQ,没有其他航空公司

六)3.探究起飞延误与月份的关系与可视化

为深入剖析了航班出发延误时间与月份的关联。先按月份对航班数据分组,算出各月平均起飞延误时间与延误标准差,为后续分析铺就基石。随后绘制的柱状图,清晰直观地展示出各月平均延误的差异,让人能快速洞察不同月份延误情况的区别。最后进行的方差分析,从统计学层面检验不同月份出发延误时间是否存在显著差异,把数据处理、可视化呈现与统计检验巧妙融合,为深入探究航班延误在不同月份的特征提供了系统且有力的分析支撑。

# dep_delay ~ month----------------------------------------------------------------
# 按 month 分组计算平均起飞延误 
month_dep_delay <- flights %>%
  group_by(month) %>%
  summarise(month_mean_dep_delay = mean(dep_delay, na.rm  = TRUE),
            month_sd_dep_delay = sd(dep_delay, na.rm  = TRUE))

# mean_dep_delay ~ month 
ggplot(month_dep_delay, aes(x = month, y = month_mean_dep_delay, fill = factor(month))) +
  geom_bar(stat = "identity", color = "black") +  # 绘制柱状图 
  scale_x_continuous(breaks = 1:12, labels = month.abb)  +  # 将月份显示为缩写
  labs(
    title = "各月飞机平均延误时间",
    y = "平均延误 (minute)",
    x = "月份"
  ) +
  theme_minimal() +
  theme(
    plot.title   = element_text(face = "bold", size = 30),  # 标题加粗 
    axis.title   = element_text(face = "bold", size = 20),  # 坐标轴标题加粗 
    axis.text.x   = element_text(hjust = 0.5, vjust = 6, size = 20, face = "bold"),
    axis.text.y   = element_text(size = 20, face = "bold.italic"), 
    legend.position  = "none"  # 移除图例 
  )

# 方差分析:dep_delay ~ month 
anova_dep_delay_month_result <- aov(dep_delay ~ factor(month), data = flights)
summary(anova_dep_delay_month_result)
                  Df    Sum Sq Mean Sq F value Pr(>F)    
factor(month)     11   8449792  768163   482.8 <2e-16 ***
Residuals     328509 522717440    1591                   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
5743 observations deleted due to missingness

六)4.探究航空公司与月份的关系与可视化

以严谨的逻辑深入剖析了各月份与航空公司航班占比的关系。先按月份和航空公司分组统计航班数量,进而算出各月总航班数,再合并数据得出各航空公司在对应月份的航班占比。从分布均匀的结果来看,有理由推测航班延误或许确实和季节相关,整体分析为进一步研究航班分布影响因素提供了有力支撑。

# carrier ~ month----------------------------------------------------------------
month_carrier_count <- flights %>%
  group_by(month, carrier) %>%
  summarise(count = n(), .groups = 'drop')
# 提取出各月份的总航班数 
month_total <- month_carrier_count %>%
  group_by(month) %>%
  summarise(total = sum(count), .groups = 'drop')
# 合并数据,计算比例 
month_carrier <- month_carrier_count %>%
  left_join(month_total, by = "month") %>%
  mutate(month_carrier_ratio = count / total)
# carrier ~ month
ggplot(month_carrier, aes(x = month, y = month_carrier_ratio, fill = carrier)) +
  geom_bar(stat = "identity", position = "stack", color = "#777777") +  # 绘制堆叠柱状图 
  geom_text(aes(label = carrier), 
            position = position_stack(vjust = 0.5), size = 5, color = "black", alpha = 0.8) +  # 添加航空公司名称 
  labs(
    title = "各月份 (month) 航空公司 (carrier) 所占航班比例的关系图",
    x = "月份 (month)",
    y = "占对应月份使用航空公司比例",
  ) +
  theme_minimal() +  # 使用简洁主题 
  theme(
    plot.title   = element_text(face = "bold", size = 20),  # 标题加粗 
    axis.title   = element_text(face = "bold", size = 20),  # 坐标轴标题加粗 
    axis.text.x   = element_text(hjust = 0.5, size = 20, face = "bold"),
    axis.text.y   = element_text(size = 20, face = "bold.italic"), 
    legend.position   = "none"  # 移除图例 
  ) +
  scale_y_continuous(labels = scales::percent) +  # 将 y 轴标签显示为百分比 
  scale_x_continuous(breaks = 1:12, labels = month.abb)   # 将月份显示为缩写(Jan, Feb 等)

#*分布得十分均匀,说明延误确实和季节有关系