library(data.table)
library(ggplot2)
## Warning: 程序包'ggplot2'是用R版本4.4.3 来建造的

第一题

利用ggplot2::economics数据完成下题:

dat1 = as.data.table(ggplot2::economics)

library(dplyr)     
## Warning: 程序包'dplyr'是用R版本4.4.3 来建造的
## 
## 载入程序包:'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(lubridate) 
## Warning: 程序包'lubridate'是用R版本4.4.3 来建造的
## 
## 载入程序包:'lubridate'
## The following objects are masked from 'package:data.table':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
#筛选出2000-2010年但不包括2008年的数据
data <- dat1 %>%
  filter(year(date) >= 2000 & year(date) <= 2010 & year(date) != 2008)
#计算失业率
data <- data %>%
  mutate(unemployment_rate = (unemploy * 1000) / (pop * 1000000))

#将失业率划分为低、中、高三等
data <- data %>%
  mutate(unemployment_level = cut(unemployment_rate, 
                                  breaks = quantile(unemployment_rate, probs = c(0, 1/3, 2/3, 1)), 
                                  labels = c("低", "中", "高"),
                                  include.lowest = TRUE))
# 按年份统计低、中、高失业率月份的数量
yearly_summary <- data %>%
  group_by(year = year(date)) %>%
  summarise(
    low = sum(unemployment_level == "低"),
    medium = sum(unemployment_level == "中"),
    high = sum(unemployment_level == "高")
  )

# 标记年份的失业率水平
yearly_summary <- yearly_summary %>%
  mutate(year_unemployment_level = case_when(
    low > medium & low > high ~ "低",  # 低失业率月份最多
    medium > low & medium > high ~ "中",  # 中失业率月份最多
    high > low & high > medium ~ "高",  # 高失业率月份最多
    low == medium & low > high ~ "中",  # 低和中失业率月份相同且最多
    low == high & low > medium ~ "中",  # 低和高失业率月份相同且最多
    medium == high & medium > low ~ "中",  # 中和高失业率月份相同且最多
    TRUE ~ "中"  # 其他情况(如低、中、高月份数量相同),默认标记为“中”
  ))

# 添加 year 列
data <- data %>%
  mutate(year = year(date))

# 合并年份标记到原始数据
data <- data %>%
  left_join(yearly_summary %>% select(year,year_unemployment_level), by = c("year" = "year"))
# 画图
ggplot(data, aes(x = psavert, y = pce, color = year_unemployment_level)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "个人消费支出随个人储蓄率的变化",
    x = "个人储蓄率 (%)",
    y = "个人消费支出",
    color = "失业率水平"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

第二题

利用 nycflights13 package 的各数据完成下题(连接数据可使用data.table::merge函数):

library(nycflights13)
## Warning: 程序包'nycflights13'是用R版本4.4.3 来建造的
airline_flights <- flights %>%
  group_by(carrier) %>%  # 按航空公司代码分组
  summarise(flights_count = n(),.groups = 'drop') %>%  # 计算每家航空公司的航班数量
  arrange(desc(flights_count))  # 按航班数量降序排列

# 连接 airlines 数据,获取航空公司名称
airline_flights <- airline_flights %>%
  left_join(airlines, by = "carrier")

# 显示结果
print(airline_flights)
## # A tibble: 16 × 3
##    carrier flights_count name                       
##    <chr>           <int> <chr>                      
##  1 UA              58665 United Air Lines Inc.      
##  2 B6              54635 JetBlue Airways            
##  3 EV              54173 ExpressJet Airlines Inc.   
##  4 DL              48110 Delta Air Lines Inc.       
##  5 AA              32729 American Airlines Inc.     
##  6 MQ              26397 Envoy Air                  
##  7 US              20536 US Airways Inc.            
##  8 9E              18460 Endeavor Air Inc.          
##  9 WN              12275 Southwest Airlines Co.     
## 10 VX               5162 Virgin America             
## 11 FL               3260 AirTran Airways Corporation
## 12 AS                714 Alaska Airlines Inc.       
## 13 F9                685 Frontier Airlines Inc.     
## 14 YV                601 Mesa Airlines Inc.         
## 15 HA                342 Hawaiian Airlines Inc.     
## 16 OO                 32 SkyWest Airlines Inc.
# 仅考虑抵达延误无缺失的航班
arr_delay_summary <- flights %>%
  filter(!is.na(arr_delay)) %>%
  group_by(carrier) %>%
  summarise(mean_arr_delay = mean(arr_delay, na.rm = TRUE)) %>%
  arrange(desc(mean_arr_delay))

# 连接 airlines 数据,获取航空公司名称
arr_delay_summary <- arr_delay_summary %>%
  left_join(airlines, by = "carrier")

max_delay_airline <- arr_delay_summary[1, ]
min_delay_airline <- arr_delay_summary[nrow(arr_delay_summary), ]
# 显示结果
print(max_delay_airline)  # 平均延误最高的航空公司
## # A tibble: 1 × 3
##   carrier mean_arr_delay name                  
##   <chr>            <dbl> <chr>                 
## 1 F9                21.9 Frontier Airlines Inc.
print(min_delay_airline)  # 平均延误最低的航空公司
## # A tibble: 1 × 3
##   carrier mean_arr_delay name                
##   <chr>            <dbl> <chr>               
## 1 AS               -9.93 Alaska Airlines Inc.
# 找到出发延误最长的那一天
longest_dep_delay_day <- flights %>%
  group_by(date = as.Date(time_hour), origin) %>%  # 按日期和机场分组
  summarise(max_dep_delay = max(dep_delay, na.rm = TRUE), .groups = 'drop') %>%
  arrange(desc(max_dep_delay)) %>%
  slice(1)  # 取出发延误最长的那一天

# 提取日期和机场信息
max_date <- longest_dep_delay_day$date
max_origin <- longest_dep_delay_day$origin

# 获取当天的平均温度
temp_on_max_delay_day <- weather %>%
  filter(origin == max_origin & as.Date(time_hour) == max_date) %>%
  summarise(avg_temp = mean(temp, na.rm = TRUE))

# 合并结果
result <- longest_dep_delay_day %>%
  mutate(avg_temp = temp_on_max_delay_day$avg_temp) %>%
  select(origin, date, max_dep_delay, avg_temp)

# 显示结果
print(result)
## # A tibble: 1 × 4
##   origin date       max_dep_delay avg_temp
##   <chr>  <date>             <dbl>    <dbl>
## 1 JFK    2013-01-09          1301     40.8
# 找到飞行时间最长的航班
longest_flight <- flights %>%
  arrange(desc(air_time)) %>%
  slice(1) %>%  # 取飞行时间最长的航班
  select(air_time, origin, dest, tailnum)  # 选择需要的列

# 连接 planes 数据,获取座位数量
longest_flight_details <- longest_flight %>%
  left_join(planes %>% select(tailnum, seats), by = "tailnum") %>%
  select(air_time, seats, origin, dest)  # 整理结果

# 显示结果
print(longest_flight_details)
## # A tibble: 1 × 4
##   air_time seats origin dest 
##      <dbl> <int> <chr>  <chr>
## 1      695   292 EWR    HNL
# 5. 绘制 noDepDelay 影响到达延误的箱线图
flights_filtered <- flights %>%
  mutate(noDepDelay = dep_delay <= 0) %>%
  filter(month == 5 & dest %in% c("MIA", "BOS"))

ggplot(flights_filtered, aes(x = factor(dest), y = arr_delay, fill = noDepDelay)) +
  geom_boxplot() +
  facet_grid(origin ~ dest) +
  labs(title = "Arrival Delay by Departure Delay Status", x = "Destination", y = "Arrival Delay (min)") +
  theme_minimal()
## Warning: Removed 36 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

daily_delays <- flights %>%
  group_by(date = as.Date(time_hour), origin) %>%
  summarise(avg_dep_delay = mean(dep_delay, na.rm = TRUE), .groups = 'drop')
daily_visibility <- weather %>%
  group_by(date = as.Date(time_hour), origin) %>%
  summarise(avg_visibility = mean(visib, na.rm = TRUE), .groups = 'drop')
delay_visibility <- merge(daily_delays, daily_visibility, by = c("date", "origin"), all = TRUE)
ggplot(delay_visibility, aes(x = avg_visibility, y = avg_dep_delay, color = origin)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(title = "Average Departure Delay vs Visibility", x = "Average Visibility (miles)", y = "Average Departure Delay (min)") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 6 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 6 rows containing missing values or values outside the scale range
## (`geom_point()`).