nycflights13

一)准备工作

一)1.加载库

library(nycflights13)
library(autoReg)
library(naniar)
library(patchwork)
library(tidyverse)
library(skimr)
library(showtext)
font_add("simhei", "C:/Users/21565/AppData/Local/Microsoft/Windows/Fonts/simhei.ttf") 
showtext_auto()

一)2.整体数据探索性分析

一)2.1.预处理部分缺失值

flights <- nycflights13::flights #主要表
glimpse(flights)
Rows: 336,776
Columns: 19
$ year           <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
$ month          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ day            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ dep_time       <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, …
$ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, …
$ dep_delay      <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1…
$ arr_time       <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,…
$ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,…
$ arr_delay      <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1…
$ carrier        <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "…
$ flight         <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 4…
$ tailnum        <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N394…
$ origin         <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA",…
$ dest           <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",…
$ air_time       <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 1…
$ distance       <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, …
$ hour           <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6…
$ minute         <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0…
$ time_hour      <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0…
#* 发现有19个(列)变量, 336766行
summary(flights)  # 每列的统计 
      year          month             day           dep_time    sched_dep_time
 Min.   :2013   Min.   : 1.000   Min.   : 1.00   Min.   :   1   Min.   : 106  
 1st Qu.:2013   1st Qu.: 4.000   1st Qu.: 8.00   1st Qu.: 907   1st Qu.: 906  
 Median :2013   Median : 7.000   Median :16.00   Median :1401   Median :1359  
 Mean   :2013   Mean   : 6.549   Mean   :15.71   Mean   :1349   Mean   :1344  
 3rd Qu.:2013   3rd Qu.:10.000   3rd Qu.:23.00   3rd Qu.:1744   3rd Qu.:1729  
 Max.   :2013   Max.   :12.000   Max.   :31.00   Max.   :2400   Max.   :2359  
                                                 NA's   :8255                 
   dep_delay          arr_time    sched_arr_time   arr_delay       
 Min.   : -43.00   Min.   :   1   Min.   :   1   Min.   : -86.000  
 1st Qu.:  -5.00   1st Qu.:1104   1st Qu.:1124   1st Qu.: -17.000  
 Median :  -2.00   Median :1535   Median :1556   Median :  -5.000  
 Mean   :  12.64   Mean   :1502   Mean   :1536   Mean   :   6.895  
 3rd Qu.:  11.00   3rd Qu.:1940   3rd Qu.:1945   3rd Qu.:  14.000  
 Max.   :1301.00   Max.   :2400   Max.   :2359   Max.   :1272.000  
 NA's   :8255      NA's   :8713                  NA's   :9430      
   carrier              flight       tailnum             origin         
 Length:336776      Min.   :   1   Length:336776      Length:336776     
 Class :character   1st Qu.: 553   Class :character   Class :character  
 Mode  :character   Median :1496   Mode  :character   Mode  :character  
                    Mean   :1972                                        
                    3rd Qu.:3465                                        
                    Max.   :8500                                        
                                                                        
     dest              air_time        distance         hour      
 Length:336776      Min.   : 20.0   Min.   :  17   Min.   : 1.00  
 Class :character   1st Qu.: 82.0   1st Qu.: 502   1st Qu.: 9.00  
 Mode  :character   Median :129.0   Median : 872   Median :13.00  
                    Mean   :150.7   Mean   :1040   Mean   :13.18  
                    3rd Qu.:192.0   3rd Qu.:1389   3rd Qu.:17.00  
                    Max.   :695.0   Max.   :4983   Max.   :23.00  
                    NA's   :9430                                  
     minute        time_hour                     
 Min.   : 0.00   Min.   :2013-01-01 05:00:00.00  
 1st Qu.: 8.00   1st Qu.:2013-04-04 13:00:00.00  
 Median :29.00   Median :2013-07-03 10:00:00.00  
 Mean   :26.23   Mean   :2013-07-03 05:22:54.64  
 3rd Qu.:44.00   3rd Qu.:2013-10-01 07:00:00.00  
 Max.   :59.00   Max.   :2013-12-31 23:00:00.00  
                                                 
myft(gaze(flights))

name

levels

stats

year

Mean ± SD

2013.0 ± 0.0

month

Mean ± SD

6.5 ± 3.4

day

Mean ± SD

15.7 ± 8.8

dep_time

Mean ± SD

1349.1 ± 488.3

sched_dep_time

Mean ± SD

1344.3 ± 467.3

dep_delay

Mean ± SD

12.6 ± 40.2

arr_time

Mean ± SD

1502.1 ± 533.3

sched_arr_time

Mean ± SD

1536.4 ± 497.5

arr_delay

Mean ± SD

6.9 ± 44.6

carrier

9E

18460 (5.5%)

AA

32729 (9.7%)

AS

714 (0.2%)

B6

54635 (16.2%)

DL

48110 (14.3%)

EV

54173 (16.1%)

F9

685 (0.2%)

FL

3260 (1.0%)

HA

342 (0.1%)

MQ

26397 (7.8%)

OO

32 (0.0%)

UA

58665 (17.4%)

US

20536 (6.1%)

VX

5162 (1.5%)

WN

12275 (3.6%)

YV

601 (0.2%)

flight

Mean ± SD

1971.9 ± 1632.5

tailnum

4044 unique values

origin

EWR

120835 (35.9%)

JFK

111279 (33.0%)

LGA

104662 (31.1%)

dest

105 unique values

air_time

Mean ± SD

150.7 ± 93.7

distance

Mean ± SD

1039.9 ± 733.2

hour

Mean ± SD

13.2 ± 4.7

minute

Mean ± SD

26.2 ± 19.3

time_hour

6936 unique values

skim(flights)
Data summary
Name flights
Number of rows 336776
Number of columns 19
_______________________
Column type frequency:
character 4
numeric 14
POSIXct 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
carrier 0 1.00 2 2 0 16 0
tailnum 2512 0.99 5 6 0 4043 0
origin 0 1.00 3 3 0 3 0
dest 0 1.00 3 3 0 105 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
year 0 1.00 2013.00 0.00 2013 2013 2013 2013 2013 ▁▁▇▁▁
month 0 1.00 6.55 3.41 1 4 7 10 12 ▇▆▆▆▇
day 0 1.00 15.71 8.77 1 8 16 23 31 ▇▇▇▇▆
dep_time 8255 0.98 1349.11 488.28 1 907 1401 1744 2400 ▁▇▆▇▃
sched_dep_time 0 1.00 1344.25 467.34 106 906 1359 1729 2359 ▁▇▇▇▃
dep_delay 8255 0.98 12.64 40.21 -43 -5 -2 11 1301 ▇▁▁▁▁
arr_time 8713 0.97 1502.05 533.26 1 1104 1535 1940 2400 ▁▃▇▇▇
sched_arr_time 0 1.00 1536.38 497.46 1 1124 1556 1945 2359 ▁▃▇▇▇
arr_delay 9430 0.97 6.90 44.63 -86 -17 -5 14 1272 ▇▁▁▁▁
flight 0 1.00 1971.92 1632.47 1 553 1496 3465 8500 ▇▃▃▁▁
air_time 9430 0.97 150.69 93.69 20 82 129 192 695 ▇▂▂▁▁
distance 0 1.00 1039.91 733.23 17 502 872 1389 4983 ▇▃▂▁▁
hour 0 1.00 13.18 4.66 1 9 13 17 23 ▁▇▇▇▅
minute 0 1.00 26.23 19.30 0 8 29 44 59 ▇▃▆▃▅

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
time_hour 0 1 2013-01-01 05:00:00 2013-12-31 23:00:00 2013-07-03 10:00:00 6936
colSums(is.na(flights)) # 每列的缺失值数量 
          year          month            day       dep_time sched_dep_time 
             0              0              0           8255              0 
     dep_delay       arr_time sched_arr_time      arr_delay        carrier 
          8255           8713              0           9430              0 
        flight        tailnum         origin           dest       air_time 
             0           2512              0              0           9430 
      distance           hour         minute      time_hour 
             0              0              0              0 
flights_sample <- slice_sample(flights, n = 2000) # 随机抽取n行 
vis_miss(flights_sample) # 可视化检查缺失值

#* 发现都比较分散,tailnum比较少

# 筛选出tailnum为NA且同一行中还有其他变量也为NA的所有行 
except_tailnum_NA_rows <- flights %>% 
  filter(is.na(tailnum))  %>%  # 筛选 tailnum 为 NA 的行 
  filter(if_any(c(dep_time, arr_time, dep_delay, arr_delay, air_time), is.na)) # 检查指定列是否有 NA 
glimpse(except_tailnum_NA_rows)
Rows: 2,512
Columns: 19
$ year           <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
$ month          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ day            <int> 2, 2, 3, 3, 4, 4, 5, 7, 8, 9, 9, 10, 10, 11, 12, 12, 13…
$ dep_time       <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ sched_dep_time <int> 1545, 1601, 857, 645, 845, 1830, 840, 820, 1645, 755, 1…
$ dep_delay      <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ arr_time       <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ sched_arr_time <int> 1910, 1735, 1209, 952, 1015, 2044, 1001, 958, 1838, 101…
$ arr_delay      <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ carrier        <chr> "AA", "UA", "UA", "UA", "9E", "9E", "9E", "9E", "US", "…
$ flight         <int> 133, 623, 714, 719, 3405, 3716, 3422, 3317, 123, 4023, …
$ tailnum        <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ origin         <chr> "JFK", "EWR", "EWR", "EWR", "JFK", "EWR", "JFK", "JFK",…
$ dest           <chr> "LAX", "ORD", "MIA", "DFW", "DCA", "DTW", "BOS", "BUF",…
$ air_time       <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ distance       <dbl> 2475, 719, 1085, 1372, 213, 488, 187, 301, 529, 569, 14…
$ hour           <dbl> 15, 16, 8, 6, 8, 18, 8, 8, 16, 7, 12, 15, 7, 17, 9, 6, …
$ minute         <dbl> 45, 1, 57, 45, 45, 30, 40, 20, 45, 55, 51, 0, 0, 0, 0, …
$ time_hour      <dttm> 2013-01-02 15:00:00, 2013-01-02 16:00:00, 2013-01-03 0…
#* 发现和缺失tailnum的行数2512完全一样,看起来大多数缺失值都捆绑在一起,可以推测这些航班是被取消了

#1.直接删除 tailnum 含有缺失值的行(如果要分析航班取消的规律则保留)-------------------------------
flights <- flights %>% 
  filter(!is.na(tailnum)) 
colSums(is.na(flights)) # 每列的缺失值数量
          year          month            day       dep_time sched_dep_time 
             0              0              0           5743              0 
     dep_delay       arr_time sched_arr_time      arr_delay        carrier 
          5743           6201              0           6918              0 
        flight        tailnum         origin           dest       air_time 
             0              0              0              0           6918 
      distance           hour         minute      time_hour 
             0              0              0              0 
#* 并且验证了之前

一)2.2.含缺失值的变量分析与可视化

# 筛选出含NA的变量 
NA_variables <- flights %>% 
  summarise(across(everything(), ~ any(is.na(.))))  %>%  # 检查每个变量是否含NA 
  pivot_longer(everything(), names_to = "variable", values_to = "has_na") %>%  # 转换为长格式 
  filter(has_na) %>%  
  select(variable)   
NA_variables <- flights %>%
  select(all_of(NA_variables$variable))  # 选择含 NA 的变量
summary(NA_variables)
    dep_time      dep_delay          arr_time      arr_delay       
 Min.   :   1   Min.   : -43.00   Min.   :   1   Min.   : -86.000  
 1st Qu.: 907   1st Qu.:  -5.00   1st Qu.:1104   1st Qu.: -17.000  
 Median :1401   Median :  -2.00   Median :1535   Median :  -5.000  
 Mean   :1349   Mean   :  12.64   Mean   :1502   Mean   :   6.895  
 3rd Qu.:1744   3rd Qu.:  11.00   3rd Qu.:1940   3rd Qu.:  14.000  
 Max.   :2400   Max.   :1301.00   Max.   :2400   Max.   :1272.000  
 NA's   :5743   NA's   :5743      NA's   :6201   NA's   :6918      
    air_time    
 Min.   : 20.0  
 1st Qu.: 82.0  
 Median :129.0  
 Mean   :150.7  
 3rd Qu.:192.0  
 Max.   :695.0  
 NA's   :6918   
# 计算含NA的变量的NA比例 
NA_variables_proportion <- NA_variables %>%
  summarise(across(everything(), ~ mean(is.na(.))))  %>%
  pivot_longer(everything(), names_to = "variable", values_to = "NA_percent")
# 绘制NA比例柱状图 
ggplot(NA_variables_proportion, aes(x = variable, y = NA_percent)) +
  geom_bar(stat = "identity", width = 0.7, fill = "slategrey", color = "#555555", alpha = 0.8) +  # 绘制柱状图 
  coord_flip() +  # 将坐标轴翻转为横图 
  labs(
    title = "变量缺失值比例", 
    x = "变量", 
    y = "缺失值比例", 
    fill = "变量"
  ) +
  scale_x_discrete(labels = c(
    "dep_time" = "起飞时间 (dep_time)",
    "dep_delay" = "起飞延误 (dep_delay)",
    "arr_time" = "到达时间 (arr_time)",
    "arr_delay" = "到达延误 (arr_delay)",
    "air_time" = "飞行时间 (air_time)")
    ) +  # 将变量名替换为中文 
  scale_y_continuous(limits = c(0, 0.03), breaks = seq(0, 0.3, by = 0.005)) +
  theme(
    plot.title  = element_text(face = "bold", size = 30),
    axis.title  = element_text(face = "bold", size = 20),
    axis.text.x  = element_text(hjust = 0.5, size = 20, face = "bold"),
    axis.text.y  = element_text(size = 15, face = "bold"),
    legend.position  = "none"  # 移除图例 
  )

#*删去取消航班仍有NA的变量:   dep_time    dep_delay    arr_time    arr_delay    air_time 
#*                             起飞时间    起飞延误     到达时间    到达延误     飞行时间 
#*            
#* dep_time和 dep_delay的缺失值一样多,根本原因是 dep_time缺失
#*解释:dep_delay = dep_time - sched_dep_time
#* arr_delay和 air_time的缺失值一样多,根本原因是 arr_time缺失
#*解释:arr_delay = arr_time - sched_arr_time,air_time = arr_time - dep_time
#* air_time ≈ distance/飞机速度speed
#*可通过 air_time = arr_time - dep_time,求出部分 arr_time
#*

二)探究起飞延迟与到达延迟的关系

二)1.探索性分析与可视化

# arr_delay ~ dep_delay
set.seed(1042)
flights_sample <- slice_sample(flights, n = 5000) # 随机抽取n行 
# arr_delay ~ dep_delay
ggplot(flights_sample, aes(x = dep_delay, y = arr_delay)) +
  geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dotdash", linewidth = 0.8) +  # 添加 y = x的参考线
  geom_point(alpha = 0.2, color = "steelblue") +  # 设置点的透明度和颜色  
  labs(
    title = "到达延误 (arr_delay) ≈ 起飞延误 (dep_delay)", y = "到达延误 (arr_delay)",x = "起飞延误 (dep_delay)",
    subtitle = "y = x"
  ) +
  theme_minimal() +
  theme(
    plot.title  = element_text(face = "bold", size = 30),
    axis.title  = element_text(face = "bold", size = 20),
    axis.text.x  = element_text(hjust = 0.5, size = 20, face = "bold"),
    axis.text.y  = element_text(size = 20, face = "bold.italic"),
    plot.subtitle = element_text(hjust = 0.8, vjust = -20, size = 20, color = "red")  # 坐标轴标题加粗 
  )

二)2.进一步分析与可视化

# 计算偏离程度 
flights <- flights %>%
  mutate(arr_delay_dep_delay_deviation = arr_delay - dep_delay)

# 统计偏离程度 
mode <- function(x) {  #构造函数计算众数
  as.numeric(names(which.max(table(x))) )
}
deviation_stats <- flights %>%
  summarise(
    mean_deviation = mean(arr_delay_dep_delay_deviation, na.rm  = TRUE),
    sd_deviation = sd(arr_delay_dep_delay_deviation, na.rm  = TRUE),
    median_deviation = median(arr_delay_dep_delay_deviation, na.rm  = TRUE),
    mode_deviation = mode(arr_delay_dep_delay_deviation)
  )
deviation_stats 
# A tibble: 1 × 4
  mean_deviation sd_deviation median_deviation mode_deviation
           <dbl>        <dbl>            <dbl>          <dbl>
1          -5.66         18.0               -7             -9
#* 左偏分布(负偏态),mode < median < mean < 0,数据集中在右侧,左侧有长尾

# 按dep_delay分组并计算arr_delay_dep_delay_deviation的均值 
arr_delay_dep_delay_deviation_by_dep_delay <- flights %>%
  group_by(dep_delay) %>%
  reframe(mode_deviation_by_dep_delay = mode(arr_delay_dep_delay_deviation),
          mean_deviation_by_dep_delay = mean(arr_delay_dep_delay_deviation))  # 自动取消分组 

# mode_deviation_by_dep_delay ~ dep_delay
a1 <- ggplot(arr_delay_dep_delay_deviation_by_dep_delay, aes(y = mode_deviation_by_dep_delay, x = dep_delay)) +
  geom_hline(yintercept = 0, color = "black", linetype = "solid", linewidth = 0.5) +
  geom_vline(xintercept = 0, color = "black", linetype = "solid", linewidth = 0.5) +
  geom_point(
    alpha = 0.6, 
    color = "steelblue", 
    size = 2
  ) +
  geom_smooth(
    method = "loess", 
    color = "darkorange", 
    linewidth = 1.5, 
    se = TRUE,  # 置信区间 
    fill = "orange"
  ) +
  labs(
    x = "起飞延误 (dep_delay)", 
    y = "到达延误-起飞延误差值的 众数"
  ) +
  theme_minimal() +
  theme(
    plot.title  = element_text(face = "bold", size = 25),
    axis.title  = element_text(face = "bold", size = 15),
    axis.text.x  = element_text(size = 20, face = "bold"),
    axis.text.y  = element_text(size = 20, face = "bold.italic"), 
    plot.subtitle  = element_text(
      hjust = 0.5,  # 居中 
      size = 20, 
      color = "red"
    )
  )
# mean_deviation_by_dep_delay ~ dep_delay
a2 <- ggplot(arr_delay_dep_delay_deviation_by_dep_delay, aes(y = mean_deviation_by_dep_delay, x = dep_delay)) +
  geom_hline(yintercept = 0, color = "black", linetype = "solid", linewidth = 0.5) +
  geom_vline(xintercept = 0, color = "black", linetype = "solid", linewidth = 0.5) +
  geom_point(
    alpha = 0.6, 
    color = "steelblue", 
    size = 2
  ) +
  geom_smooth(
    method = "loess", 
    color = "darkorange", 
    linewidth = 1.5, 
    se = TRUE,  # 置信区间 
    fill = "orange"
  ) +
  labs(
    x = "起飞延误 (dep_delay)", 
    y = "到达延误-起飞延误差值的 均值"
  ) +
  theme_minimal() +
  theme(
    plot.title  = element_text(face = "bold", size = 25),
    axis.title  = element_text(face = "bold", size = 15),
    axis.text.x  = element_text(size = 20, face = "bold"),
    axis.text.y  = element_text(size = 20, face = "bold.italic"), 
    plot.subtitle  = element_text(
      hjust = 0.5,  # 居中 
      size = 20, 
      color = "red"
    )
  )

wrap_plots(a1, a2, ncol = 2) +
  plot_annotation(
    title = "起飞延误 与 到达延误-起飞延误差值 的关系",
    theme = theme(
      plot.title  = element_text(
        hjust = 0.5,
        face = "bold",           # 加粗 
        size = 26
        )
      )
    )

# 可视化偏离分布 
ggplot(flights, aes(x = arr_delay_dep_delay_deviation)) +
  geom_vline(xintercept = deviation_stats$mode_deviation, color = "red", linetype = "dotdash", linewidth = 0.8) +
  geom_histogram(alpha = 0.9, binwidth = 1, fill = "steelblue") +
  labs(title = "到达延误-起飞延误差值 分布", x = "差值 (deviation = arr_delay - dep_delay)", y = "频数",
       subtitle = "x = - 9") +
  theme_minimal() +
  theme(
    plot.title  = element_text(face = "bold", size = 25),
    axis.title  = element_text(face = "bold", size = 20),
    axis.text.x  = element_text(hjust = 1, size = 20, face = "bold"),
    axis.text.y  = element_text(size = 20, face = "bold.italic"),
    plot.subtitle  = element_text(hjust = 0.38, vjust = -3, size = 20, color = "red"))  # 副标题居中

# 将 arr_delay 换为 dep_delay - mean_deviation -----------------------------------------
flights <- flights %>%
  mutate(arr_delay = ifelse(is.na(arr_delay)  & !is.na(dep_delay),  dep_delay + deviation_stats$mean_deviation, arr_delay))
colSums(is.na(flights)) # 每列的缺失值数量
                         year                         month 
                            0                             0 
                          day                      dep_time 
                            0                          5743 
               sched_dep_time                     dep_delay 
                            0                          5743 
                     arr_time                sched_arr_time 
                         6201                             0 
                    arr_delay                       carrier 
                         5743                             0 
                       flight                       tailnum 
                            0                             0 
                       origin                          dest 
                            0                             0 
                     air_time                      distance 
                         6918                             0 
                         hour                        minute 
                            0                             0 
                    time_hour arr_delay_dep_delay_deviation 
                            0                          6918 

三)探究飞行速度与其他变量的关系

三)1.探究飞行速度与飞行距离的关系与可视化

# 计算平均速度 
flights <- flights %>%
  mutate(speed = (distance / air_time) * 60)  # 速度单位:mph 
# speed ~ distance---------------------------------------------------------------------
flights_sample <- slice_sample(flights, n = 5000) # 随机抽取n行 
# speed ~ distance
ggplot(flights_sample, aes(x = distance, y = speed)) +
  geom_point(alpha = 0.3, color = "steelblue") +  # 设置点的透明度和颜色 
  stat_smooth(method = "lm", formula = y ~ log(x), color = "red3", fill = "red") +  # 对数回归曲线 
  labs(
    title = " 平均速度 (mph) ~ 飞行距离 (miles)", 
    y = "平均速度 (mph)",
    x = "飞行距离 (miles)"
  ) +
  theme_minimal() +
  theme(
    plot.title  = element_text(face = "bold", size = 30),
    axis.title  = element_text(face = "bold", size = 20),
    axis.text.x  = element_text(hjust = 0.5, size = 20, face = "bold"),
    axis.text.y  = element_text(size = 20, face = "bold.italic"),
  )

三)2.探究飞行速度与机型的关系与可视化

三)2.1.连接flights, planes数据表并检查缺失值

planes <- nycflights13::planes %>% # 重命名 planes 表中的 year 变量为 manufacture_year 
  rename(manufacture_year = year)
plane_sample <- slice_sample(planes, n = 5000) # 随机抽取n行 
vis_miss(plane_sample) # 可视化检查缺失值

planes <- planes %>% # 删除本来的speed
  select(-speed)

# 合并 flights 与 planes(通过 tailnum)
flights_planes <- flights %>%
  left_join(planes, by = "tailnum")
colSums(is.na(flights_planes)) # 每列的缺失值数量 
                         year                         month 
                            0                             0 
                          day                      dep_time 
                            0                          5743 
               sched_dep_time                     dep_delay 
                            0                          5743 
                     arr_time                sched_arr_time 
                         6201                             0 
                    arr_delay                       carrier 
                         5743                             0 
                       flight                       tailnum 
                            0                             0 
                       origin                          dest 
                            0                             0 
                     air_time                      distance 
                         6918                             0 
                         hour                        minute 
                            0                             0 
                    time_hour arr_delay_dep_delay_deviation 
                            0                          6918 
                        speed              manufacture_year 
                         6918                         55400 
                         type                  manufacturer 
                        50094                         50094 
                        model                       engines 
                        50094                         50094 
                        seats                        engine 
                        50094                         50094 
flights_planes_sample <- slice_sample(flights_planes, n = 5000) # 随机抽取n行 
vis_miss(flights_planes_sample) # 可视化检查缺失值

三)2.2.分析可视化

# model_mean_speed ~ model---------------------------------------------------------------------
# 按 model 分组计算平均速度 
model_speed <- flights_planes %>%
  group_by(model) %>%
  summarise(model_mean_speed = mean(speed, na.rm = TRUE))
# model_mean_speed ~ model
ggplot(model_speed, aes(x = reorder(model, model_mean_speed), y = model_mean_speed, fill = model)) +
  geom_bar(width = 0.6, stat = "identity") +
  coord_flip() +  # 水平柱状图 
  labs(
    title = "各机型平均速度分布图",
    y = "平均速度 (mph)",
    x = "机型 (model)"
  ) +
  theme_minimal() +
  theme(
    plot.title  = element_text(face = "bold", size = 30),
    axis.title  = element_text(face = "bold", size = 20),
    axis.text.x  = element_text(hjust = 0.5, size = 20, face = "bold"),
    axis.text.y  = element_text(size = 8, face = "bold.italic"),
    legend.position  = "none",
    panel.grid.major.y = element_blank(),  # 去除主要水平网格线
    panel.grid.minor.y = element_blank()   # 去除次要水平网格线
  )

anova_speed_model_result <- aov(speed ~ model, data = flights_planes)
summary(anova_speed_model_result)
                Df    Sum Sq Mean Sq F value Pr(>F)    
model          126 435397631 3455537    1554 <2e-16 ***
Residuals   278890 620244978    2224                   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
55247 observations deleted due to missingness

三)3.探究飞行速度与天气的关系

三)3.1. 连接flights, weather数据表并检查缺失值

# 合并 flights 与 weather (通过 origin, time_hour)
weather <- nycflights13::weather
weather_sample <- slice_sample(weather, n = 5000) # 随机抽取n行 
vis_miss(weather_sample) # 可视化检查缺失值

weather <- weather %>% # 删除wind_gust
  select(-wind_gust)

flights_weather <- flights %>%
  left_join(weather, by = c("origin", "time_hour"))
colSums(is.na(flights_weather)) # 每列的缺失值数量 
                       year.x                       month.x 
                            0                             0 
                        day.x                      dep_time 
                            0                          5743 
               sched_dep_time                     dep_delay 
                            0                          5743 
                     arr_time                sched_arr_time 
                         6201                             0 
                    arr_delay                       carrier 
                         5743                             0 
                       flight                       tailnum 
                            0                             0 
                       origin                          dest 
                            0                             0 
                     air_time                      distance 
                         6918                             0 
                       hour.x                        minute 
                            0                             0 
                    time_hour arr_delay_dep_delay_deviation 
                            0                          6918 
                        speed                        year.y 
                         6918                          1542 
                      month.y                         day.y 
                         1542                          1542 
                       hour.y                          temp 
                         1542                          1559 
                         dewp                         humid 
                         1559                          1559 
                     wind_dir                    wind_speed 
                         9743                          1620 
                       precip                      pressure 
                         1542                         38079 
                        visib 
                         1542 
flights_weather_sample <- slice_sample(flights_weather, n = 5000) # 随机抽取n行 
vis_miss(flights_weather_sample) # 可视化检查缺失值

三)3.2. 探究飞行速度与天气的关系与可视化

# 遍历 weather 表中的所有变量
weather_vars <- names(weather) %>% 
  setdiff(c("origin", "time_hour", "year", "month", "day", "hour")) # 排除不需要的列 
# 创建一个空列表,用于存储所有热力图 
speed_weather_plot_list <- list() #创建列表
# 生成所有图形并存储到列表中 
for (var in weather_vars) {
  p <- ggplot(flights_weather_sample, aes(x = .data[[var]], y = speed)) +
    stat_binhex(bins = 30) +  
    scale_fill_gradient(low = "lightblue", high = "red") +
    labs(
      title = paste("平均速度 ~", var),
      y = "平均速度 (mph)",
      x = paste(var, "(单位根据变量而定)")
    ) +
    theme_minimal()
  
  speed_weather_plot_list[[var]] <- p +
    theme(plot.background  = element_rect(fill = "#CCCCCC", color = "white", size = 1),
          legend.key.width  = unit(0.1, "cm"))
}
# 使用 patchwork 将所有图形组合并显示 
wrap_plots(speed_weather_plot_list, ncol = 3)  # 每行显示n个图形 

三)3.3. 进一步探究飞行速度与天气的关系与可视化

# 遍历所有天气变量 
for (var in weather_vars) {
  # --- 动态分箱策略 --- 
  unique_values <- unique(flights_weather[[var]])
  if (length(unique_values) <= 5) {
    breaks <- unique_values  # 离散变量直接分箱(如雾:0/1)
  } else {
    breaks <- unique(quantile(flights_weather[[var]], probs = seq(0, 1, 0.2), na.rm  = TRUE))  # 连续变量分位数分箱 
  }
  
  # --- 分层抽样 --- 
  flights_weather_sample <- flights_weather %>%
    mutate(var_bin = cut(.data[[var]], breaks, include.lowest  = TRUE)) %>%
    group_by(var_bin) %>%
    sample_n(100, replace = TRUE) %>%  # 允许重复抽样(小样本时必需)
    ungroup()
  
  # --- 生成热力图(速度分析) --- 
  p <- ggplot(flights_weather_sample, aes(x = .data[[var]], y = speed)) +
    stat_binhex(bins = 30) +  # 六边形热力图 
    scale_fill_gradient(low = "lightblue", high = "red") +  # 颜色渐变 
    geom_smooth(method = "loess", color = "black", linewidth = 0.5, se = FALSE) +  # 添加拟合线 
    labs(
      title = paste("平均速度 ~", var),  # 动态标题 
      y = "平均速度 (mph)",  # Y轴标签 
      x = paste(var, "(单位根据变量而定)")  # X轴标签 
    ) +
    theme_minimal() +  # 简约主题 
    theme(
      plot.background  = element_rect(fill = "#CCCCCC", color = "white", size = 1),  # 灰色背景+白色边框 
      legend.key.width  = unit(0.1, "cm")  # 调整图例宽度 
    )
  
  # 存储图形 
  speed_weather_plot_list[[var]] <- p
}

# 使用 patchwork 将所有图形组合并显示 
wrap_plots(speed_weather_plot_list, ncol = 3)  # 每行显示3个图形 

#* 没啥波动,应该极端天气本来就不会起飞

三)?.

#2.用众数填补 model 含有缺失值的数据(?)-------------------------------------------
mode_model <- names(sort(table(flights_planes$model), decreasing = TRUE))[1]
flights_planes <- flights_planes %>%
  mutate(model = ifelse(is.na(model),  mode_model, model))

四)拟合模型预测填补speed缺失值

#3.预测 speed 含有缺失值的数据-----------------------------------
# 训练集:已知 speed 
train_data <- flights_planes %>%
  filter(!is.na(speed)) 
# 预测集:speed 缺失 
predict_data <- flights_planes %>%
  filter(is.na(speed)) 
# 拟合模型 
model_speed <- lm(speed ~ log(distance) + model, data = train_data) #考虑到航空公司 (carrier)是间接影响,不纳入
#*会自动将分类变量转换为虚拟变量

plot(model_speed, which = 2)  # QQ 图(正态性检验)

# 预测 speed 缺失值 
predicted_speed <- predict(model_speed, newdata = predict_data)
# 将预测值填充到原数据中 
predict_data <- predict_data %>%
  mutate(speed = predicted_speed)
# 合并预测结果 
flights_planes <- bind_rows(train_data, predict_data)

colSums(is.na(flights_planes)) # 每列的缺失值数量 
                         year                         month 
                            0                             0 
                          day                      dep_time 
                            0                          5743 
               sched_dep_time                     dep_delay 
                            0                          5743 
                     arr_time                sched_arr_time 
                         6201                             0 
                    arr_delay                       carrier 
                         5743                             0 
                       flight                       tailnum 
                            0                             0 
                       origin                          dest 
                            0                             0 
                     air_time                      distance 
                         6918                             0 
                         hour                        minute 
                            0                             0 
                    time_hour arr_delay_dep_delay_deviation 
                            0                          6918 
                        speed              manufacture_year 
                            0                         55400 
                         type                  manufacturer 
                        50094                         50094 
                        model                       engines 
                            0                         50094 
                        seats                        engine 
                        50094                         50094 
flights_planes <- flights_planes %>%  # 将速度转换为英里/分钟 
  mutate(speed_minute = speed / 60)
#4.计算 air_time 含有缺失值的数据-----------------------------------
#* air_time ≈ distance/speed_minute
# 计算缺失的 arr_time 
flights_planes <- flights_planes %>%
  mutate(air_time = ifelse(is.na(air_time),  distance / speed_minute, air_time))
#5.计算 arr_time 含有缺失值的数据-----------------------------------
#*可通过 air_time = arr_time - dep_time,求出部分 arr_time
# 计算缺失的 arr_time 
flights_planes <- flights_planes %>%
  mutate(arr_time = ifelse(is.na(arr_time),  dep_time + (distance / speed_minute), arr_time))

colSums(is.na(flights_planes)) # 每列的缺失值数量 
                         year                         month 
                            0                             0 
                          day                      dep_time 
                            0                          5743 
               sched_dep_time                     dep_delay 
                            0                          5743 
                     arr_time                sched_arr_time 
                         5743                             0 
                    arr_delay                       carrier 
                         5743                             0 
                       flight                       tailnum 
                            0                             0 
                       origin                          dest 
                            0                             0 
                     air_time                      distance 
                            0                             0 
                         hour                        minute 
                            0                             0 
                    time_hour arr_delay_dep_delay_deviation 
                            0                          6918 
                        speed              manufacture_year 
                            0                         55400 
                         type                  manufacturer 
                        50094                         50094 
                        model                       engines 
                            0                         50094 
                        seats                        engine 
                        50094                         50094 
                 speed_minute 
                            0 

#5.探究起飞延误与各变量的关系

四)1.探索性分析与可视化

# 处理dep_delay异常值------------------------------------------
# dep_delay分布箱线图 
dep_delay_boxplot_plot <- ggplot(flights, aes(y = dep_delay)) +
  geom_boxplot(fill = "steelblue", color = "black") +
  labs(title = "起飞延误时间箱线图", y = "延误时间(分钟)") +
  theme_minimal() +
  theme(
    axis.title.x  = element_blank(),  # 去除 x 轴标题 
    axis.text.x  = element_blank(),   # 去除 x 轴刻度标签 
    axis.ticks.x  = element_blank(),   # 去除 x 轴刻度线 
    axis.text.y  = element_text(size = 10, face = "bold.italic"),
    plot.title  = element_text(face = "bold", size = 15),
    axis.title  = element_text(face = "bold", size = 15)
  )
# dep_delay分布直方图
dep_delay_histogram_plot <- ggplot(flights, aes(x = dep_delay)) +
  geom_histogram(binwidth = 20, fill = "steelblue") +
  labs(title = "起飞延误时间分布图", x = "延误时间(分钟)", y = "频数") +
  theme_minimal() +
  theme(
    plot.title  = element_text(face = "bold", size = 15),
    axis.title  = element_text(face = "bold", size = 15),
    axis.text.x  = element_text(hjust = 0.5, size = 10, face = "bold"),
    axis.text.y  = element_text(size = 10, face = "bold.italic")
  )

dep_delay_plot <- dep_delay_boxplot_plot + dep_delay_histogram_plot 
dep_delay_plot 

#* 发现大数据长尾分布,绝大多数是没什么延迟,有少部分是超长延迟

四)2.探究起飞延误与机场的关系

# dep_delay ~ origin--------------------------
flights %>%
  group_by(origin) %>%
  summarise(mean_dep_delay = mean(dep_delay, na.rm  = TRUE))
# A tibble: 3 × 2
  origin mean_dep_delay
  <chr>           <dbl>
1 EWR              15.1
2 JFK              12.1
3 LGA              10.3
#* EWR机场平均延误最高,LGA最低

四)3.探究起飞延误与一天中起飞时间的关系与可视化

# dep_delay ~ hour-------------------------
# 计算每小时平均延误 
hour_dep_delay <- flights %>%
  mutate(hour = sched_dep_time %/% 100) %>%  # 提取计划起飞小时 
  group_by(hour) %>%
  summarise(avg_delay = mean(dep_delay, na.rm  = TRUE))

# 绘制峰曲线图 
ggplot(hour_dep_delay, aes(x = hour, y = avg_delay)) +
  geom_line(color = "steelblue", linewidth = 0.8) +  
  geom_point(aes(fill = avg_delay), shape = 21, color = "steelblue", size = 3, alpha = 0.95) + 
  scale_x_continuous(breaks = 0:23) + 
  scale_fill_gradient(low = "blue4", high = "red") +
  labs(
    title = "不同计划起飞小时的平均起飞延误",
    x = "计划起飞小时", 
    y = "平均起飞延误 (dep_delay)"
  ) +
  theme_minimal() +
  theme(
    plot.title  = element_text(face = "bold", size = 30),
    axis.title  = element_text(face = "bold", size = 20),
    axis.text.x  = element_text(hjust = 0.5, size = 20, face = "bold"),
    axis.text.y  = element_text(size = 20, face = "bold.italic"), 
    plot.subtitle  = element_text(hjust = 0.8, vjust = -20, size = 20, color = "red"),
    panel.grid.minor  = element_blank()   # 去除次网格线 
  )

#* 傍晚平均延误较高,早上凌晨延误很低

四)4.探究起飞延误与天气的关系与可视化

# dep_delay ~ weather----------------------------------------------------------
# 随机抽取 4000 行数据 
flights_weather_sample <- slice_sample(flights_weather, n = 4000)

# 遍历 weather 表中的所有变量,排除不需要的列 
weather_vars <- names(weather) %>% 
  setdiff(c("origin", "time_hour", "year", "month", "day", "hour"))

# 创建一个空列表,用于存储所有热力图 
dep_delay_weather_plot_list <- list()

# 生成所有图形并存储到列表中 
for (var in weather_vars) {
  p <- ggplot(flights_weather_sample, aes(x = .data[[var]], y = dep_delay)) +
    stat_binhex(bins = 30) +  # 使用热力图 
    scale_fill_gradient(low = "lightyellow", high = "#154415") +  # 设置颜色渐变 
    labs(
      title = paste("起飞延误 ~", var),  # 动态设置标题 
      y = "起飞延误 (分钟)",
      x = paste(var, "(单位根据变量而定)")
    ) +
    theme_minimal()
  
  dep_delay_weather_plot_list[[var]] <- p +
    theme(plot.background  = element_rect(fill = "lightblue", color = "white", size = 1))
}

# 使用 patchwork 将所有图形组合并显示 
wrap_plots(dep_delay_weather_plot_list, ncol = 3)

#* 但是数据分布不均衡(好天气样本多、延误低的样本占主导),图都成一团
#* 这会导致热力图无法有效揭示天气变量与延误之间的潜在关联了

四)5.进一步探究起飞延误与天气的关系与可视化

# 遍历所有天气变量
for (var in weather_vars) {
  # --- 第一步:动态分箱策略 --- 
  # 检查当前变量的唯一值数量(例如:降水可能只有0和5两种值)
  unique_values <- unique(flights_weather[[var]])
  
  
  if (length(unique_values) <= 5) {
    breaks <- unique_values # 如果变量值种类≤5种,直接按实际值分箱 
  } else {
    # 变量值种类多时,按20%分位数切分5个区间 
    # 例如:温度可能分为[低温, 中低温, 中温, 中高温, 高温]
    # unique()用于防止分位数重复(如80%分位和100%分位值相同)
    breaks <- unique(quantile(flights_weather[[var]], 
                              probs = seq(0, 1, 0.2), 
                              na.rm  = TRUE))  # 忽略缺失值 
  }
  
  # --- 第二步:分层抽样 --- 
  flights_weather_sample <- flights_weather %>%
    # 创建分箱标签列(var_bin)
    mutate(
      var_bin = cut(
        .data[[var]], 
        breaks = breaks,  # 使用上一步计算的分箱边界 
        include.lowest  = TRUE  # 包含最小值所在的区间 
      )
    ) %>%
    # 按分箱结果分组,每组抽取100条数据 
    group_by(var_bin) %>%
    sample_n(500, replace = TRUE) %>%  # replace = TRUE 可重复抽样
    ungroup()  # 取消分组状态 
  
  # --- 第三步:生成热力图 --- 
  p <- ggplot(flights_weather_sample, aes(x = .data[[var]], y = dep_delay)) +
    stat_binhex(bins = 30) +  # 六边形热力图
    scale_fill_gradient(low = "lightyellow", high = "#154415") + 
    geom_smooth(method = "loess", color = "red", linewidth = 0.5, se = FALSE) +  # 添加拟合线 
    labs(
      title = paste("起飞延误 ~", var), 
      y = "起飞延误 (dep_delay)",  
      x = paste(var, "(单位根据变量而定)") 
    ) +
    theme_minimal() + 
    theme(plot.background  = element_rect(fill = "lightblue", color = "white", size = 1))
  
  # 存储图形到列表(按变量名作为键)
  dep_delay_weather_plot_list[[var]] <- p 
}

# 使用 patchwork 将所有图形组合并显示 
wrap_plots(dep_delay_weather_plot_list, ncol = 3)

五)探究起飞延迟与机型,航空公司,月份的关系

五)1.探究机型与航空公司的关系与可视化

# model ~ carrier---------------------------------------------------------------
#1.汇总数据:统计每个 carrier 对应的 model 数量 
carrier_model_count <- flights_planes %>%
  group_by(carrier, model) %>%
  summarise(count = n(), .groups = 'drop')
#2.计算每个 carrier 最多使用的 model 数量及其比例 
carrier_total <- carrier_model_count %>% #提取出各机型在各航空公司的数量
  group_by(carrier) %>%
  summarise(total = sum(count), .groups = 'drop')
carrier_most_model <- carrier_model_count %>% #提取出前三频率的
  group_by(carrier) %>%
  slice_max(order_by = count, n = 3) %>% 
  ungroup()
carrier_most_model <- carrier_most_model %>% #合并
  left_join(carrier_total, by = "carrier") %>%
  mutate(carrier_model_ratio = count / total)
#3. model ~ carrier
ggplot(carrier_most_model, aes(x = carrier, y = carrier_model_ratio, fill = model)) +
  geom_bar(stat = "identity", position = "stack", color = "#777777") +  # 绘制堆叠柱状图 
  geom_text(aes(label = model), 
            position = position_stack(vjust = 0.5), size = 4, angle = 30, color = "black", alpha = 0.8) +  # 仅添加机型名称 
  labs(
    title = "航空公司 (carrier) 和其前三频率使用 机型 (model) 的关系图",
    x = "航空公司 (carrier)",
    y = "占对应公司使用机型比例",
  ) +
  theme_minimal() +  # 使用简洁主题 
  theme(
    plot.title  = element_text(face = "bold", size = 20),  # 标题加粗 
    axis.title  = element_text(face = "bold", size = 20),  # 坐标轴标题加粗 
    axis.text.x  = element_text(hjust = 0.5, size = 20, face = "bold"),
    axis.text.y  = element_text(size = 20, face = "bold.italic"),
    legend.position  = "none"  # 移除图例 
  ) +
  scale_y_continuous(labels = scales::percent)  # 将 y 轴标签显示为百分比 

五)2.探究起飞延误与航空公司的关系与可视化

# carrier_mean_dep_delay ~ carrier carrier----------------------------------------------------------------
carrier_counts <- flights %>%
  count(carrier, name = "count")
carrier_counts
# A tibble: 16 × 2
   carrier count
   <chr>   <int>
 1 9E      17416
 2 AA      32645
 3 AS        714
 4 B6      54635
 5 DL      48110
 6 EV      54173
 7 F9        682
 8 FL       3260
 9 HA        342
10 MQ      26395
11 OO         32
12 UA      57979
13 US      19873
14 VX       5162
15 WN      12245
16 YV        601
# 按 carrier 分组计算平均延误 
carrier_dep_delay <- flights %>%
  group_by(carrier) %>%
  summarise(carrier_mean_dep_delay = mean(dep_delay, na.rm  = TRUE),
            carrier_sd_dep_delay = sd(dep_delay, na.rm  = TRUE))
# carrier_mean_dep_delay ~ carrier
ggplot(carrier_dep_delay, aes(x = carrier, y = carrier_mean_dep_delay, fill = carrier)) +
  geom_bar(stat = "identity", color = "black") +  # 绘制柱状图 
  labs(
    title = "各航空公司飞机平均延误分布图",
    y = "平均延误 (minute)",
    x = "航空公司 (carrier)"
  ) +
  theme_minimal() +
  theme(
    plot.title  = element_text(face = "bold", size = 30),  # 标题加粗 
    axis.title  = element_text(face = "bold", size = 20),  # 坐标轴标题加粗 
    axis.text.x  = element_text(hjust = 0.5, vjust = 6, size = 20, face = "bold"),
    axis.text.y  = element_text(size = 20, face = "bold.italic"),
    legend.position  = "none"  # 移除图例 
  )

#*相差很大!
anova_dep_delay_carrier_result <- aov(dep_delay ~ carrier, data = flights)
summary(anova_dep_delay_carrier_result)
                Df    Sum Sq Mean Sq F value Pr(>F)    
carrier         15   6348034  423202   264.9 <2e-16 ***
Residuals   328505 524819199    1598                   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
5743 observations deleted due to missingness
#* F 值:反映组间差异相对于组内差异的大小,F = 组间变异 / 组内变异
#* p 值:判断组间差异是否具有统计学意义,通常 p < 0.05 可以拒绝原假设,认为组间差异显著

五)2.探究起飞延误与月份的关系与可视化

# dep_delay ~ month----------------------------------------------------------------
# 按 month 分组计算平均起飞延误 
month_dep_delay <- flights %>%
  group_by(month) %>%
  summarise(month_mean_dep_delay = mean(dep_delay, na.rm  = TRUE),
            month_sd_dep_delay = sd(dep_delay, na.rm  = TRUE))

# mean_dep_delay ~ month 
ggplot(month_dep_delay, aes(x = month, y = month_mean_dep_delay, fill = factor(month))) +
  geom_bar(stat = "identity", color = "black") +  # 绘制柱状图 
  scale_x_continuous(breaks = 1:12, labels = month.abb)  +  # 将月份显示为缩写
  labs(
    title = "各月飞机平均延误时间",
    y = "平均延误 (minute)",
    x = "月份"
  ) +
  theme_minimal() +
  theme(
    plot.title   = element_text(face = "bold", size = 30),  # 标题加粗 
    axis.title   = element_text(face = "bold", size = 20),  # 坐标轴标题加粗 
    axis.text.x   = element_text(hjust = 0.5, vjust = 6, size = 20, face = "bold"),
    axis.text.y   = element_text(size = 20, face = "bold.italic"), 
    legend.position  = "none"  # 移除图例 
  )

# 方差分析:dep_delay ~ month 
anova_dep_delay_month_result <- aov(dep_delay ~ factor(month), data = flights)
summary(anova_dep_delay_month_result)
                  Df    Sum Sq Mean Sq F value Pr(>F)    
factor(month)     11   8449792  768163   482.8 <2e-16 ***
Residuals     328509 522717440    1591                   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
5743 observations deleted due to missingness

五)2.探究航空公司与月份的关系与可视化

# carrier ~ month----------------------------------------------------------------
month_carrier_count <- flights %>%
  group_by(month, carrier) %>%
  summarise(count = n(), .groups = 'drop')
# 提取出各月份的总航班数 
month_total <- month_carrier_count %>%
  group_by(month) %>%
  summarise(total = sum(count), .groups = 'drop')
# 合并数据,计算比例 
month_carrier <- month_carrier_count %>%
  left_join(month_total, by = "month") %>%
  mutate(month_carrier_ratio = count / total)
# carrier ~ month
ggplot(month_carrier, aes(x = month, y = month_carrier_ratio, fill = carrier)) +
  geom_bar(stat = "identity", position = "stack", color = "#777777") +  # 绘制堆叠柱状图 
  geom_text(aes(label = carrier), 
            position = position_stack(vjust = 0.5), size = 5, color = "black", alpha = 0.8) +  # 添加航空公司名称 
  labs(
    title = "各月份 (month) 航空公司 (carrier) 所占航班比例的关系图",
    x = "月份 (month)",
    y = "占对应月份使用航空公司比例",
  ) +
  theme_minimal() +  # 使用简洁主题 
  theme(
    plot.title   = element_text(face = "bold", size = 20),  # 标题加粗 
    axis.title   = element_text(face = "bold", size = 20),  # 坐标轴标题加粗 
    axis.text.x   = element_text(hjust = 0.5, size = 20, face = "bold"),
    axis.text.y   = element_text(size = 20, face = "bold.italic"), 
    legend.position   = "none"  # 移除图例 
  ) +
  scale_y_continuous(labels = scales::percent) +  # 将 y 轴标签显示为百分比 
  scale_x_continuous(breaks = 1:12, labels = month.abb)   # 将月份显示为缩写(Jan, Feb 等)

#*分布得十分均匀,说明延误确实和季节有关系