Rows: 336,776
Columns: 19
$ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
$ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ dep_time <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, …
$ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, …
$ dep_delay <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1…
$ arr_time <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,…
$ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,…
$ arr_delay <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1…
$ carrier <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "…
$ flight <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 4…
$ tailnum <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N394…
$ origin <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA",…
$ dest <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",…
$ air_time <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 1…
$ distance <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, …
$ hour <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6…
$ minute <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0…
$ time_hour <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0…
year month day dep_time sched_dep_time
Min. :2013 Min. : 1.000 Min. : 1.00 Min. : 1 Min. : 106
1st Qu.:2013 1st Qu.: 4.000 1st Qu.: 8.00 1st Qu.: 907 1st Qu.: 906
Median :2013 Median : 7.000 Median :16.00 Median :1401 Median :1359
Mean :2013 Mean : 6.549 Mean :15.71 Mean :1349 Mean :1344
3rd Qu.:2013 3rd Qu.:10.000 3rd Qu.:23.00 3rd Qu.:1744 3rd Qu.:1729
Max. :2013 Max. :12.000 Max. :31.00 Max. :2400 Max. :2359
NA's :8255
dep_delay arr_time sched_arr_time arr_delay
Min. : -43.00 Min. : 1 Min. : 1 Min. : -86.000
1st Qu.: -5.00 1st Qu.:1104 1st Qu.:1124 1st Qu.: -17.000
Median : -2.00 Median :1535 Median :1556 Median : -5.000
Mean : 12.64 Mean :1502 Mean :1536 Mean : 6.895
3rd Qu.: 11.00 3rd Qu.:1940 3rd Qu.:1945 3rd Qu.: 14.000
Max. :1301.00 Max. :2400 Max. :2359 Max. :1272.000
NA's :8255 NA's :8713 NA's :9430
carrier flight tailnum origin
Length:336776 Min. : 1 Length:336776 Length:336776
Class :character 1st Qu.: 553 Class :character Class :character
Mode :character Median :1496 Mode :character Mode :character
Mean :1972
3rd Qu.:3465
Max. :8500
dest air_time distance hour
Length:336776 Min. : 20.0 Min. : 17 Min. : 1.00
Class :character 1st Qu.: 82.0 1st Qu.: 502 1st Qu.: 9.00
Mode :character Median :129.0 Median : 872 Median :13.00
Mean :150.7 Mean :1040 Mean :13.18
3rd Qu.:192.0 3rd Qu.:1389 3rd Qu.:17.00
Max. :695.0 Max. :4983 Max. :23.00
NA's :9430
minute time_hour
Min. : 0.00 Min. :2013-01-01 05:00:00.00
1st Qu.: 8.00 1st Qu.:2013-04-04 13:00:00.00
Median :29.00 Median :2013-07-03 10:00:00.00
Mean :26.23 Mean :2013-07-03 05:22:54.64
3rd Qu.:44.00 3rd Qu.:2013-10-01 07:00:00.00
Max. :59.00 Max. :2013-12-31 23:00:00.00
name | levels | stats |
|---|---|---|
year | Mean ± SD | 2013.0 ± 0.0 |
month | Mean ± SD | 6.5 ± 3.4 |
day | Mean ± SD | 15.7 ± 8.8 |
dep_time | Mean ± SD | 1349.1 ± 488.3 |
sched_dep_time | Mean ± SD | 1344.3 ± 467.3 |
dep_delay | Mean ± SD | 12.6 ± 40.2 |
arr_time | Mean ± SD | 1502.1 ± 533.3 |
sched_arr_time | Mean ± SD | 1536.4 ± 497.5 |
arr_delay | Mean ± SD | 6.9 ± 44.6 |
carrier | 9E | 18460 (5.5%) |
AA | 32729 (9.7%) | |
AS | 714 (0.2%) | |
B6 | 54635 (16.2%) | |
DL | 48110 (14.3%) | |
EV | 54173 (16.1%) | |
F9 | 685 (0.2%) | |
FL | 3260 (1.0%) | |
HA | 342 (0.1%) | |
MQ | 26397 (7.8%) | |
OO | 32 (0.0%) | |
UA | 58665 (17.4%) | |
US | 20536 (6.1%) | |
VX | 5162 (1.5%) | |
WN | 12275 (3.6%) | |
YV | 601 (0.2%) | |
flight | Mean ± SD | 1971.9 ± 1632.5 |
tailnum | 4044 unique values | |
origin | EWR | 120835 (35.9%) |
JFK | 111279 (33.0%) | |
LGA | 104662 (31.1%) | |
dest | 105 unique values | |
air_time | Mean ± SD | 150.7 ± 93.7 |
distance | Mean ± SD | 1039.9 ± 733.2 |
hour | Mean ± SD | 13.2 ± 4.7 |
minute | Mean ± SD | 26.2 ± 19.3 |
time_hour | 6936 unique values |
| Name | flights |
| Number of rows | 336776 |
| Number of columns | 19 |
| _______________________ | |
| Column type frequency: | |
| character | 4 |
| numeric | 14 |
| POSIXct | 1 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| carrier | 0 | 1.00 | 2 | 2 | 0 | 16 | 0 |
| tailnum | 2512 | 0.99 | 5 | 6 | 0 | 4043 | 0 |
| origin | 0 | 1.00 | 3 | 3 | 0 | 3 | 0 |
| dest | 0 | 1.00 | 3 | 3 | 0 | 105 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| year | 0 | 1.00 | 2013.00 | 0.00 | 2013 | 2013 | 2013 | 2013 | 2013 | ▁▁▇▁▁ |
| month | 0 | 1.00 | 6.55 | 3.41 | 1 | 4 | 7 | 10 | 12 | ▇▆▆▆▇ |
| day | 0 | 1.00 | 15.71 | 8.77 | 1 | 8 | 16 | 23 | 31 | ▇▇▇▇▆ |
| dep_time | 8255 | 0.98 | 1349.11 | 488.28 | 1 | 907 | 1401 | 1744 | 2400 | ▁▇▆▇▃ |
| sched_dep_time | 0 | 1.00 | 1344.25 | 467.34 | 106 | 906 | 1359 | 1729 | 2359 | ▁▇▇▇▃ |
| dep_delay | 8255 | 0.98 | 12.64 | 40.21 | -43 | -5 | -2 | 11 | 1301 | ▇▁▁▁▁ |
| arr_time | 8713 | 0.97 | 1502.05 | 533.26 | 1 | 1104 | 1535 | 1940 | 2400 | ▁▃▇▇▇ |
| sched_arr_time | 0 | 1.00 | 1536.38 | 497.46 | 1 | 1124 | 1556 | 1945 | 2359 | ▁▃▇▇▇ |
| arr_delay | 9430 | 0.97 | 6.90 | 44.63 | -86 | -17 | -5 | 14 | 1272 | ▇▁▁▁▁ |
| flight | 0 | 1.00 | 1971.92 | 1632.47 | 1 | 553 | 1496 | 3465 | 8500 | ▇▃▃▁▁ |
| air_time | 9430 | 0.97 | 150.69 | 93.69 | 20 | 82 | 129 | 192 | 695 | ▇▂▂▁▁ |
| distance | 0 | 1.00 | 1039.91 | 733.23 | 17 | 502 | 872 | 1389 | 4983 | ▇▃▂▁▁ |
| hour | 0 | 1.00 | 13.18 | 4.66 | 1 | 9 | 13 | 17 | 23 | ▁▇▇▇▅ |
| minute | 0 | 1.00 | 26.23 | 19.30 | 0 | 8 | 29 | 44 | 59 | ▇▃▆▃▅ |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| time_hour | 0 | 1 | 2013-01-01 05:00:00 | 2013-12-31 23:00:00 | 2013-07-03 10:00:00 | 6936 |
year month day dep_time sched_dep_time
0 0 0 8255 0
dep_delay arr_time sched_arr_time arr_delay carrier
8255 8713 0 9430 0
flight tailnum origin dest air_time
0 2512 0 0 9430
distance hour minute time_hour
0 0 0 0
#* 发现都比较分散,tailnum比较少
# 筛选出tailnum为NA且同一行中还有其他变量也为NA的所有行
except_tailnum_NA_rows <- flights %>%
filter(is.na(tailnum)) %>% # 筛选 tailnum 为 NA 的行
filter(if_any(c(dep_time, arr_time, dep_delay, arr_delay, air_time), is.na)) # 检查指定列是否有 NA
glimpse(except_tailnum_NA_rows)Rows: 2,512
Columns: 19
$ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
$ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ day <int> 2, 2, 3, 3, 4, 4, 5, 7, 8, 9, 9, 10, 10, 11, 12, 12, 13…
$ dep_time <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ sched_dep_time <int> 1545, 1601, 857, 645, 845, 1830, 840, 820, 1645, 755, 1…
$ dep_delay <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ arr_time <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ sched_arr_time <int> 1910, 1735, 1209, 952, 1015, 2044, 1001, 958, 1838, 101…
$ arr_delay <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ carrier <chr> "AA", "UA", "UA", "UA", "9E", "9E", "9E", "9E", "US", "…
$ flight <int> 133, 623, 714, 719, 3405, 3716, 3422, 3317, 123, 4023, …
$ tailnum <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ origin <chr> "JFK", "EWR", "EWR", "EWR", "JFK", "EWR", "JFK", "JFK",…
$ dest <chr> "LAX", "ORD", "MIA", "DFW", "DCA", "DTW", "BOS", "BUF",…
$ air_time <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ distance <dbl> 2475, 719, 1085, 1372, 213, 488, 187, 301, 529, 569, 14…
$ hour <dbl> 15, 16, 8, 6, 8, 18, 8, 8, 16, 7, 12, 15, 7, 17, 9, 6, …
$ minute <dbl> 45, 1, 57, 45, 45, 30, 40, 20, 45, 55, 51, 0, 0, 0, 0, …
$ time_hour <dttm> 2013-01-02 15:00:00, 2013-01-02 16:00:00, 2013-01-03 0…
#* 发现和缺失tailnum的行数2512完全一样,看起来大多数缺失值都捆绑在一起,可以推测这些航班是被取消了
#1.直接删除 tailnum 含有缺失值的行(如果要分析航班取消的规律则保留)-------------------------------
flights <- flights %>%
filter(!is.na(tailnum))
colSums(is.na(flights)) # 每列的缺失值数量 year month day dep_time sched_dep_time
0 0 0 5743 0
dep_delay arr_time sched_arr_time arr_delay carrier
5743 6201 0 6918 0
flight tailnum origin dest air_time
0 0 0 0 6918
distance hour minute time_hour
0 0 0 0
# 筛选出含NA的变量
NA_variables <- flights %>%
summarise(across(everything(), ~ any(is.na(.)))) %>% # 检查每个变量是否含NA
pivot_longer(everything(), names_to = "variable", values_to = "has_na") %>% # 转换为长格式
filter(has_na) %>%
select(variable)
NA_variables <- flights %>%
select(all_of(NA_variables$variable)) # 选择含 NA 的变量
summary(NA_variables) dep_time dep_delay arr_time arr_delay
Min. : 1 Min. : -43.00 Min. : 1 Min. : -86.000
1st Qu.: 907 1st Qu.: -5.00 1st Qu.:1104 1st Qu.: -17.000
Median :1401 Median : -2.00 Median :1535 Median : -5.000
Mean :1349 Mean : 12.64 Mean :1502 Mean : 6.895
3rd Qu.:1744 3rd Qu.: 11.00 3rd Qu.:1940 3rd Qu.: 14.000
Max. :2400 Max. :1301.00 Max. :2400 Max. :1272.000
NA's :5743 NA's :5743 NA's :6201 NA's :6918
air_time
Min. : 20.0
1st Qu.: 82.0
Median :129.0
Mean :150.7
3rd Qu.:192.0
Max. :695.0
NA's :6918
# 计算含NA的变量的NA比例
NA_variables_proportion <- NA_variables %>%
summarise(across(everything(), ~ mean(is.na(.)))) %>%
pivot_longer(everything(), names_to = "variable", values_to = "NA_percent")
# 绘制NA比例柱状图
ggplot(NA_variables_proportion, aes(x = variable, y = NA_percent)) +
geom_bar(stat = "identity", width = 0.7, fill = "slategrey", color = "#555555", alpha = 0.8) + # 绘制柱状图
coord_flip() + # 将坐标轴翻转为横图
labs(
title = "变量缺失值比例",
x = "变量",
y = "缺失值比例",
fill = "变量"
) +
scale_x_discrete(labels = c(
"dep_time" = "起飞时间 (dep_time)",
"dep_delay" = "起飞延误 (dep_delay)",
"arr_time" = "到达时间 (arr_time)",
"arr_delay" = "到达延误 (arr_delay)",
"air_time" = "飞行时间 (air_time)")
) + # 将变量名替换为中文
scale_y_continuous(limits = c(0, 0.03), breaks = seq(0, 0.3, by = 0.005)) +
theme(
plot.title = element_text(face = "bold", size = 30),
axis.title = element_text(face = "bold", size = 20),
axis.text.x = element_text(hjust = 0.5, size = 20, face = "bold"),
axis.text.y = element_text(size = 15, face = "bold"),
legend.position = "none" # 移除图例
)#*删去取消航班仍有NA的变量: dep_time dep_delay arr_time arr_delay air_time
#* 起飞时间 起飞延误 到达时间 到达延误 飞行时间
#*
#* dep_time和 dep_delay的缺失值一样多,根本原因是 dep_time缺失
#*解释:dep_delay = dep_time - sched_dep_time
#* arr_delay和 air_time的缺失值一样多,根本原因是 arr_time缺失
#*解释:arr_delay = arr_time - sched_arr_time,air_time = arr_time - dep_time
#* air_time ≈ distance/飞机速度speed
#*可通过 air_time = arr_time - dep_time,求出部分 arr_time
#*# arr_delay ~ dep_delay
set.seed(1042)
flights_sample <- slice_sample(flights, n = 5000) # 随机抽取n行
# arr_delay ~ dep_delay
ggplot(flights_sample, aes(x = dep_delay, y = arr_delay)) +
geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dotdash", linewidth = 0.8) + # 添加 y = x的参考线
geom_point(alpha = 0.2, color = "steelblue") + # 设置点的透明度和颜色
labs(
title = "到达延误 (arr_delay) ≈ 起飞延误 (dep_delay)", y = "到达延误 (arr_delay)",x = "起飞延误 (dep_delay)",
subtitle = "y = x"
) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 30),
axis.title = element_text(face = "bold", size = 20),
axis.text.x = element_text(hjust = 0.5, size = 20, face = "bold"),
axis.text.y = element_text(size = 20, face = "bold.italic"),
plot.subtitle = element_text(hjust = 0.8, vjust = -20, size = 20, color = "red") # 坐标轴标题加粗
)# 计算偏离程度
flights <- flights %>%
mutate(arr_delay_dep_delay_deviation = arr_delay - dep_delay)
# 统计偏离程度
mode <- function(x) { #构造函数计算众数
as.numeric(names(which.max(table(x))) )
}
deviation_stats <- flights %>%
summarise(
mean_deviation = mean(arr_delay_dep_delay_deviation, na.rm = TRUE),
sd_deviation = sd(arr_delay_dep_delay_deviation, na.rm = TRUE),
median_deviation = median(arr_delay_dep_delay_deviation, na.rm = TRUE),
mode_deviation = mode(arr_delay_dep_delay_deviation)
)
deviation_stats # A tibble: 1 × 4
mean_deviation sd_deviation median_deviation mode_deviation
<dbl> <dbl> <dbl> <dbl>
1 -5.66 18.0 -7 -9
#* 左偏分布(负偏态),mode < median < mean < 0,数据集中在右侧,左侧有长尾
# 按dep_delay分组并计算arr_delay_dep_delay_deviation的均值
arr_delay_dep_delay_deviation_by_dep_delay <- flights %>%
group_by(dep_delay) %>%
reframe(mode_deviation_by_dep_delay = mode(arr_delay_dep_delay_deviation),
mean_deviation_by_dep_delay = mean(arr_delay_dep_delay_deviation)) # 自动取消分组
# mode_deviation_by_dep_delay ~ dep_delay
a1 <- ggplot(arr_delay_dep_delay_deviation_by_dep_delay, aes(y = mode_deviation_by_dep_delay, x = dep_delay)) +
geom_hline(yintercept = 0, color = "black", linetype = "solid", linewidth = 0.5) +
geom_vline(xintercept = 0, color = "black", linetype = "solid", linewidth = 0.5) +
geom_point(
alpha = 0.6,
color = "steelblue",
size = 2
) +
geom_smooth(
method = "loess",
color = "darkorange",
linewidth = 1.5,
se = TRUE, # 置信区间
fill = "orange"
) +
labs(
x = "起飞延误 (dep_delay)",
y = "到达延误-起飞延误差值的 众数"
) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 25),
axis.title = element_text(face = "bold", size = 15),
axis.text.x = element_text(size = 20, face = "bold"),
axis.text.y = element_text(size = 20, face = "bold.italic"),
plot.subtitle = element_text(
hjust = 0.5, # 居中
size = 20,
color = "red"
)
)
# mean_deviation_by_dep_delay ~ dep_delay
a2 <- ggplot(arr_delay_dep_delay_deviation_by_dep_delay, aes(y = mean_deviation_by_dep_delay, x = dep_delay)) +
geom_hline(yintercept = 0, color = "black", linetype = "solid", linewidth = 0.5) +
geom_vline(xintercept = 0, color = "black", linetype = "solid", linewidth = 0.5) +
geom_point(
alpha = 0.6,
color = "steelblue",
size = 2
) +
geom_smooth(
method = "loess",
color = "darkorange",
linewidth = 1.5,
se = TRUE, # 置信区间
fill = "orange"
) +
labs(
x = "起飞延误 (dep_delay)",
y = "到达延误-起飞延误差值的 均值"
) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 25),
axis.title = element_text(face = "bold", size = 15),
axis.text.x = element_text(size = 20, face = "bold"),
axis.text.y = element_text(size = 20, face = "bold.italic"),
plot.subtitle = element_text(
hjust = 0.5, # 居中
size = 20,
color = "red"
)
)
wrap_plots(a1, a2, ncol = 2) +
plot_annotation(
title = "起飞延误 与 到达延误-起飞延误差值 的关系",
theme = theme(
plot.title = element_text(
hjust = 0.5,
face = "bold", # 加粗
size = 26
)
)
)# 可视化偏离分布
ggplot(flights, aes(x = arr_delay_dep_delay_deviation)) +
geom_vline(xintercept = deviation_stats$mode_deviation, color = "red", linetype = "dotdash", linewidth = 0.8) +
geom_histogram(alpha = 0.9, binwidth = 1, fill = "steelblue") +
labs(title = "到达延误-起飞延误差值 分布", x = "差值 (deviation = arr_delay - dep_delay)", y = "频数",
subtitle = "x = - 9") +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 25),
axis.title = element_text(face = "bold", size = 20),
axis.text.x = element_text(hjust = 1, size = 20, face = "bold"),
axis.text.y = element_text(size = 20, face = "bold.italic"),
plot.subtitle = element_text(hjust = 0.38, vjust = -3, size = 20, color = "red")) # 副标题居中# 将 arr_delay 换为 dep_delay - mean_deviation -----------------------------------------
flights <- flights %>%
mutate(arr_delay = ifelse(is.na(arr_delay) & !is.na(dep_delay), dep_delay + deviation_stats$mean_deviation, arr_delay))
colSums(is.na(flights)) # 每列的缺失值数量 year month
0 0
day dep_time
0 5743
sched_dep_time dep_delay
0 5743
arr_time sched_arr_time
6201 0
arr_delay carrier
5743 0
flight tailnum
0 0
origin dest
0 0
air_time distance
6918 0
hour minute
0 0
time_hour arr_delay_dep_delay_deviation
0 6918
# 计算平均速度
flights <- flights %>%
mutate(speed = (distance / air_time) * 60) # 速度单位:mph
# speed ~ distance---------------------------------------------------------------------
flights_sample <- slice_sample(flights, n = 5000) # 随机抽取n行
# speed ~ distance
ggplot(flights_sample, aes(x = distance, y = speed)) +
geom_point(alpha = 0.3, color = "steelblue") + # 设置点的透明度和颜色
stat_smooth(method = "lm", formula = y ~ log(x), color = "red3", fill = "red") + # 对数回归曲线
labs(
title = " 平均速度 (mph) ~ 飞行距离 (miles)",
y = "平均速度 (mph)",
x = "飞行距离 (miles)"
) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 30),
axis.title = element_text(face = "bold", size = 20),
axis.text.x = element_text(hjust = 0.5, size = 20, face = "bold"),
axis.text.y = element_text(size = 20, face = "bold.italic"),
)planes <- planes %>% # 删除本来的speed
select(-speed)
# 合并 flights 与 planes(通过 tailnum)
flights_planes <- flights %>%
left_join(planes, by = "tailnum")
colSums(is.na(flights_planes)) # 每列的缺失值数量 year month
0 0
day dep_time
0 5743
sched_dep_time dep_delay
0 5743
arr_time sched_arr_time
6201 0
arr_delay carrier
5743 0
flight tailnum
0 0
origin dest
0 0
air_time distance
6918 0
hour minute
0 0
time_hour arr_delay_dep_delay_deviation
0 6918
speed manufacture_year
6918 55400
type manufacturer
50094 50094
model engines
50094 50094
seats engine
50094 50094
flights_planes_sample <- slice_sample(flights_planes, n = 5000) # 随机抽取n行
vis_miss(flights_planes_sample) # 可视化检查缺失值# model_mean_speed ~ model---------------------------------------------------------------------
# 按 model 分组计算平均速度
model_speed <- flights_planes %>%
group_by(model) %>%
summarise(model_mean_speed = mean(speed, na.rm = TRUE))
# model_mean_speed ~ model
ggplot(model_speed, aes(x = reorder(model, model_mean_speed), y = model_mean_speed, fill = model)) +
geom_bar(width = 0.6, stat = "identity") +
coord_flip() + # 水平柱状图
labs(
title = "各机型平均速度分布图",
y = "平均速度 (mph)",
x = "机型 (model)"
) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 30),
axis.title = element_text(face = "bold", size = 20),
axis.text.x = element_text(hjust = 0.5, size = 20, face = "bold"),
axis.text.y = element_text(size = 8, face = "bold.italic"),
legend.position = "none",
panel.grid.major.y = element_blank(), # 去除主要水平网格线
panel.grid.minor.y = element_blank() # 去除次要水平网格线
)anova_speed_model_result <- aov(speed ~ model, data = flights_planes)
summary(anova_speed_model_result) Df Sum Sq Mean Sq F value Pr(>F)
model 126 435397631 3455537 1554 <2e-16 ***
Residuals 278890 620244978 2224
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
55247 observations deleted due to missingness
weather <- weather %>% # 删除wind_gust
select(-wind_gust)
flights_weather <- flights %>%
left_join(weather, by = c("origin", "time_hour"))
colSums(is.na(flights_weather)) # 每列的缺失值数量 year.x month.x
0 0
day.x dep_time
0 5743
sched_dep_time dep_delay
0 5743
arr_time sched_arr_time
6201 0
arr_delay carrier
5743 0
flight tailnum
0 0
origin dest
0 0
air_time distance
6918 0
hour.x minute
0 0
time_hour arr_delay_dep_delay_deviation
0 6918
speed year.y
6918 1542
month.y day.y
1542 1542
hour.y temp
1542 1559
dewp humid
1559 1559
wind_dir wind_speed
9743 1620
precip pressure
1542 38079
visib
1542
flights_weather_sample <- slice_sample(flights_weather, n = 5000) # 随机抽取n行
vis_miss(flights_weather_sample) # 可视化检查缺失值# 遍历 weather 表中的所有变量
weather_vars <- names(weather) %>%
setdiff(c("origin", "time_hour", "year", "month", "day", "hour")) # 排除不需要的列
# 创建一个空列表,用于存储所有热力图
speed_weather_plot_list <- list() #创建列表
# 生成所有图形并存储到列表中
for (var in weather_vars) {
p <- ggplot(flights_weather_sample, aes(x = .data[[var]], y = speed)) +
stat_binhex(bins = 30) +
scale_fill_gradient(low = "lightblue", high = "red") +
labs(
title = paste("平均速度 ~", var),
y = "平均速度 (mph)",
x = paste(var, "(单位根据变量而定)")
) +
theme_minimal()
speed_weather_plot_list[[var]] <- p +
theme(plot.background = element_rect(fill = "#CCCCCC", color = "white", size = 1),
legend.key.width = unit(0.1, "cm"))
}
# 使用 patchwork 将所有图形组合并显示
wrap_plots(speed_weather_plot_list, ncol = 3) # 每行显示n个图形 # 遍历所有天气变量
for (var in weather_vars) {
# --- 动态分箱策略 ---
unique_values <- unique(flights_weather[[var]])
if (length(unique_values) <= 5) {
breaks <- unique_values # 离散变量直接分箱(如雾:0/1)
} else {
breaks <- unique(quantile(flights_weather[[var]], probs = seq(0, 1, 0.2), na.rm = TRUE)) # 连续变量分位数分箱
}
# --- 分层抽样 ---
flights_weather_sample <- flights_weather %>%
mutate(var_bin = cut(.data[[var]], breaks, include.lowest = TRUE)) %>%
group_by(var_bin) %>%
sample_n(100, replace = TRUE) %>% # 允许重复抽样(小样本时必需)
ungroup()
# --- 生成热力图(速度分析) ---
p <- ggplot(flights_weather_sample, aes(x = .data[[var]], y = speed)) +
stat_binhex(bins = 30) + # 六边形热力图
scale_fill_gradient(low = "lightblue", high = "red") + # 颜色渐变
geom_smooth(method = "loess", color = "black", linewidth = 0.5, se = FALSE) + # 添加拟合线
labs(
title = paste("平均速度 ~", var), # 动态标题
y = "平均速度 (mph)", # Y轴标签
x = paste(var, "(单位根据变量而定)") # X轴标签
) +
theme_minimal() + # 简约主题
theme(
plot.background = element_rect(fill = "#CCCCCC", color = "white", size = 1), # 灰色背景+白色边框
legend.key.width = unit(0.1, "cm") # 调整图例宽度
)
# 存储图形
speed_weather_plot_list[[var]] <- p
}
# 使用 patchwork 将所有图形组合并显示
wrap_plots(speed_weather_plot_list, ncol = 3) # 每行显示3个图形 #3.预测 speed 含有缺失值的数据-----------------------------------
# 训练集:已知 speed
train_data <- flights_planes %>%
filter(!is.na(speed))
# 预测集:speed 缺失
predict_data <- flights_planes %>%
filter(is.na(speed))
# 拟合模型
model_speed <- lm(speed ~ log(distance) + model, data = train_data) #考虑到航空公司 (carrier)是间接影响,不纳入
#*会自动将分类变量转换为虚拟变量
plot(model_speed, which = 2) # QQ 图(正态性检验)# 预测 speed 缺失值
predicted_speed <- predict(model_speed, newdata = predict_data)
# 将预测值填充到原数据中
predict_data <- predict_data %>%
mutate(speed = predicted_speed)
# 合并预测结果
flights_planes <- bind_rows(train_data, predict_data)
colSums(is.na(flights_planes)) # 每列的缺失值数量 year month
0 0
day dep_time
0 5743
sched_dep_time dep_delay
0 5743
arr_time sched_arr_time
6201 0
arr_delay carrier
5743 0
flight tailnum
0 0
origin dest
0 0
air_time distance
6918 0
hour minute
0 0
time_hour arr_delay_dep_delay_deviation
0 6918
speed manufacture_year
0 55400
type manufacturer
50094 50094
model engines
0 50094
seats engine
50094 50094
flights_planes <- flights_planes %>% # 将速度转换为英里/分钟
mutate(speed_minute = speed / 60)
#4.计算 air_time 含有缺失值的数据-----------------------------------
#* air_time ≈ distance/speed_minute
# 计算缺失的 arr_time
flights_planes <- flights_planes %>%
mutate(air_time = ifelse(is.na(air_time), distance / speed_minute, air_time))
#5.计算 arr_time 含有缺失值的数据-----------------------------------
#*可通过 air_time = arr_time - dep_time,求出部分 arr_time
# 计算缺失的 arr_time
flights_planes <- flights_planes %>%
mutate(arr_time = ifelse(is.na(arr_time), dep_time + (distance / speed_minute), arr_time))
colSums(is.na(flights_planes)) # 每列的缺失值数量 year month
0 0
day dep_time
0 5743
sched_dep_time dep_delay
0 5743
arr_time sched_arr_time
5743 0
arr_delay carrier
5743 0
flight tailnum
0 0
origin dest
0 0
air_time distance
0 0
hour minute
0 0
time_hour arr_delay_dep_delay_deviation
0 6918
speed manufacture_year
0 55400
type manufacturer
50094 50094
model engines
0 50094
seats engine
50094 50094
speed_minute
0
#5.探究起飞延误与各变量的关系
# 处理dep_delay异常值------------------------------------------
# dep_delay分布箱线图
dep_delay_boxplot_plot <- ggplot(flights, aes(y = dep_delay)) +
geom_boxplot(fill = "steelblue", color = "black") +
labs(title = "起飞延误时间箱线图", y = "延误时间(分钟)") +
theme_minimal() +
theme(
axis.title.x = element_blank(), # 去除 x 轴标题
axis.text.x = element_blank(), # 去除 x 轴刻度标签
axis.ticks.x = element_blank(), # 去除 x 轴刻度线
axis.text.y = element_text(size = 10, face = "bold.italic"),
plot.title = element_text(face = "bold", size = 15),
axis.title = element_text(face = "bold", size = 15)
)
# dep_delay分布直方图
dep_delay_histogram_plot <- ggplot(flights, aes(x = dep_delay)) +
geom_histogram(binwidth = 20, fill = "steelblue") +
labs(title = "起飞延误时间分布图", x = "延误时间(分钟)", y = "频数") +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 15),
axis.title = element_text(face = "bold", size = 15),
axis.text.x = element_text(hjust = 0.5, size = 10, face = "bold"),
axis.text.y = element_text(size = 10, face = "bold.italic")
)
dep_delay_plot <- dep_delay_boxplot_plot + dep_delay_histogram_plot
dep_delay_plot # dep_delay ~ hour-------------------------
# 计算每小时平均延误
hour_dep_delay <- flights %>%
mutate(hour = sched_dep_time %/% 100) %>% # 提取计划起飞小时
group_by(hour) %>%
summarise(avg_delay = mean(dep_delay, na.rm = TRUE))
# 绘制峰曲线图
ggplot(hour_dep_delay, aes(x = hour, y = avg_delay)) +
geom_line(color = "steelblue", linewidth = 0.8) +
geom_point(aes(fill = avg_delay), shape = 21, color = "steelblue", size = 3, alpha = 0.95) +
scale_x_continuous(breaks = 0:23) +
scale_fill_gradient(low = "blue4", high = "red") +
labs(
title = "不同计划起飞小时的平均起飞延误",
x = "计划起飞小时",
y = "平均起飞延误 (dep_delay)"
) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 30),
axis.title = element_text(face = "bold", size = 20),
axis.text.x = element_text(hjust = 0.5, size = 20, face = "bold"),
axis.text.y = element_text(size = 20, face = "bold.italic"),
plot.subtitle = element_text(hjust = 0.8, vjust = -20, size = 20, color = "red"),
panel.grid.minor = element_blank() # 去除次网格线
)# dep_delay ~ weather----------------------------------------------------------
# 随机抽取 4000 行数据
flights_weather_sample <- slice_sample(flights_weather, n = 4000)
# 遍历 weather 表中的所有变量,排除不需要的列
weather_vars <- names(weather) %>%
setdiff(c("origin", "time_hour", "year", "month", "day", "hour"))
# 创建一个空列表,用于存储所有热力图
dep_delay_weather_plot_list <- list()
# 生成所有图形并存储到列表中
for (var in weather_vars) {
p <- ggplot(flights_weather_sample, aes(x = .data[[var]], y = dep_delay)) +
stat_binhex(bins = 30) + # 使用热力图
scale_fill_gradient(low = "lightyellow", high = "#154415") + # 设置颜色渐变
labs(
title = paste("起飞延误 ~", var), # 动态设置标题
y = "起飞延误 (分钟)",
x = paste(var, "(单位根据变量而定)")
) +
theme_minimal()
dep_delay_weather_plot_list[[var]] <- p +
theme(plot.background = element_rect(fill = "lightblue", color = "white", size = 1))
}
# 使用 patchwork 将所有图形组合并显示
wrap_plots(dep_delay_weather_plot_list, ncol = 3)# 遍历所有天气变量
for (var in weather_vars) {
# --- 第一步:动态分箱策略 ---
# 检查当前变量的唯一值数量(例如:降水可能只有0和5两种值)
unique_values <- unique(flights_weather[[var]])
if (length(unique_values) <= 5) {
breaks <- unique_values # 如果变量值种类≤5种,直接按实际值分箱
} else {
# 变量值种类多时,按20%分位数切分5个区间
# 例如:温度可能分为[低温, 中低温, 中温, 中高温, 高温]
# unique()用于防止分位数重复(如80%分位和100%分位值相同)
breaks <- unique(quantile(flights_weather[[var]],
probs = seq(0, 1, 0.2),
na.rm = TRUE)) # 忽略缺失值
}
# --- 第二步:分层抽样 ---
flights_weather_sample <- flights_weather %>%
# 创建分箱标签列(var_bin)
mutate(
var_bin = cut(
.data[[var]],
breaks = breaks, # 使用上一步计算的分箱边界
include.lowest = TRUE # 包含最小值所在的区间
)
) %>%
# 按分箱结果分组,每组抽取100条数据
group_by(var_bin) %>%
sample_n(500, replace = TRUE) %>% # replace = TRUE 可重复抽样
ungroup() # 取消分组状态
# --- 第三步:生成热力图 ---
p <- ggplot(flights_weather_sample, aes(x = .data[[var]], y = dep_delay)) +
stat_binhex(bins = 30) + # 六边形热力图
scale_fill_gradient(low = "lightyellow", high = "#154415") +
geom_smooth(method = "loess", color = "red", linewidth = 0.5, se = FALSE) + # 添加拟合线
labs(
title = paste("起飞延误 ~", var),
y = "起飞延误 (dep_delay)",
x = paste(var, "(单位根据变量而定)")
) +
theme_minimal() +
theme(plot.background = element_rect(fill = "lightblue", color = "white", size = 1))
# 存储图形到列表(按变量名作为键)
dep_delay_weather_plot_list[[var]] <- p
}
# 使用 patchwork 将所有图形组合并显示
wrap_plots(dep_delay_weather_plot_list, ncol = 3)# model ~ carrier---------------------------------------------------------------
#1.汇总数据:统计每个 carrier 对应的 model 数量
carrier_model_count <- flights_planes %>%
group_by(carrier, model) %>%
summarise(count = n(), .groups = 'drop')
#2.计算每个 carrier 最多使用的 model 数量及其比例
carrier_total <- carrier_model_count %>% #提取出各机型在各航空公司的数量
group_by(carrier) %>%
summarise(total = sum(count), .groups = 'drop')
carrier_most_model <- carrier_model_count %>% #提取出前三频率的
group_by(carrier) %>%
slice_max(order_by = count, n = 3) %>%
ungroup()
carrier_most_model <- carrier_most_model %>% #合并
left_join(carrier_total, by = "carrier") %>%
mutate(carrier_model_ratio = count / total)
#3. model ~ carrier
ggplot(carrier_most_model, aes(x = carrier, y = carrier_model_ratio, fill = model)) +
geom_bar(stat = "identity", position = "stack", color = "#777777") + # 绘制堆叠柱状图
geom_text(aes(label = model),
position = position_stack(vjust = 0.5), size = 4, angle = 30, color = "black", alpha = 0.8) + # 仅添加机型名称
labs(
title = "航空公司 (carrier) 和其前三频率使用 机型 (model) 的关系图",
x = "航空公司 (carrier)",
y = "占对应公司使用机型比例",
) +
theme_minimal() + # 使用简洁主题
theme(
plot.title = element_text(face = "bold", size = 20), # 标题加粗
axis.title = element_text(face = "bold", size = 20), # 坐标轴标题加粗
axis.text.x = element_text(hjust = 0.5, size = 20, face = "bold"),
axis.text.y = element_text(size = 20, face = "bold.italic"),
legend.position = "none" # 移除图例
) +
scale_y_continuous(labels = scales::percent) # 将 y 轴标签显示为百分比 # A tibble: 16 × 2
carrier count
<chr> <int>
1 9E 17416
2 AA 32645
3 AS 714
4 B6 54635
5 DL 48110
6 EV 54173
7 F9 682
8 FL 3260
9 HA 342
10 MQ 26395
11 OO 32
12 UA 57979
13 US 19873
14 VX 5162
15 WN 12245
16 YV 601
# 按 carrier 分组计算平均延误
carrier_dep_delay <- flights %>%
group_by(carrier) %>%
summarise(carrier_mean_dep_delay = mean(dep_delay, na.rm = TRUE),
carrier_sd_dep_delay = sd(dep_delay, na.rm = TRUE))
# carrier_mean_dep_delay ~ carrier
ggplot(carrier_dep_delay, aes(x = carrier, y = carrier_mean_dep_delay, fill = carrier)) +
geom_bar(stat = "identity", color = "black") + # 绘制柱状图
labs(
title = "各航空公司飞机平均延误分布图",
y = "平均延误 (minute)",
x = "航空公司 (carrier)"
) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 30), # 标题加粗
axis.title = element_text(face = "bold", size = 20), # 坐标轴标题加粗
axis.text.x = element_text(hjust = 0.5, vjust = 6, size = 20, face = "bold"),
axis.text.y = element_text(size = 20, face = "bold.italic"),
legend.position = "none" # 移除图例
)#*相差很大!
anova_dep_delay_carrier_result <- aov(dep_delay ~ carrier, data = flights)
summary(anova_dep_delay_carrier_result) Df Sum Sq Mean Sq F value Pr(>F)
carrier 15 6348034 423202 264.9 <2e-16 ***
Residuals 328505 524819199 1598
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
5743 observations deleted due to missingness
# dep_delay ~ month----------------------------------------------------------------
# 按 month 分组计算平均起飞延误
month_dep_delay <- flights %>%
group_by(month) %>%
summarise(month_mean_dep_delay = mean(dep_delay, na.rm = TRUE),
month_sd_dep_delay = sd(dep_delay, na.rm = TRUE))
# mean_dep_delay ~ month
ggplot(month_dep_delay, aes(x = month, y = month_mean_dep_delay, fill = factor(month))) +
geom_bar(stat = "identity", color = "black") + # 绘制柱状图
scale_x_continuous(breaks = 1:12, labels = month.abb) + # 将月份显示为缩写
labs(
title = "各月飞机平均延误时间",
y = "平均延误 (minute)",
x = "月份"
) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 30), # 标题加粗
axis.title = element_text(face = "bold", size = 20), # 坐标轴标题加粗
axis.text.x = element_text(hjust = 0.5, vjust = 6, size = 20, face = "bold"),
axis.text.y = element_text(size = 20, face = "bold.italic"),
legend.position = "none" # 移除图例
)# 方差分析:dep_delay ~ month
anova_dep_delay_month_result <- aov(dep_delay ~ factor(month), data = flights)
summary(anova_dep_delay_month_result) Df Sum Sq Mean Sq F value Pr(>F)
factor(month) 11 8449792 768163 482.8 <2e-16 ***
Residuals 328509 522717440 1591
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
5743 observations deleted due to missingness
# carrier ~ month----------------------------------------------------------------
month_carrier_count <- flights %>%
group_by(month, carrier) %>%
summarise(count = n(), .groups = 'drop')
# 提取出各月份的总航班数
month_total <- month_carrier_count %>%
group_by(month) %>%
summarise(total = sum(count), .groups = 'drop')
# 合并数据,计算比例
month_carrier <- month_carrier_count %>%
left_join(month_total, by = "month") %>%
mutate(month_carrier_ratio = count / total)
# carrier ~ month
ggplot(month_carrier, aes(x = month, y = month_carrier_ratio, fill = carrier)) +
geom_bar(stat = "identity", position = "stack", color = "#777777") + # 绘制堆叠柱状图
geom_text(aes(label = carrier),
position = position_stack(vjust = 0.5), size = 5, color = "black", alpha = 0.8) + # 添加航空公司名称
labs(
title = "各月份 (month) 航空公司 (carrier) 所占航班比例的关系图",
x = "月份 (month)",
y = "占对应月份使用航空公司比例",
) +
theme_minimal() + # 使用简洁主题
theme(
plot.title = element_text(face = "bold", size = 20), # 标题加粗
axis.title = element_text(face = "bold", size = 20), # 坐标轴标题加粗
axis.text.x = element_text(hjust = 0.5, size = 20, face = "bold"),
axis.text.y = element_text(size = 20, face = "bold.italic"),
legend.position = "none" # 移除图例
) +
scale_y_continuous(labels = scales::percent) + # 将 y 轴标签显示为百分比
scale_x_continuous(breaks = 1:12, labels = month.abb) # 将月份显示为缩写(Jan, Feb 等)