library(data.table)
library(ggplot2)
## Warning: 程序包'ggplot2'是用R版本4.4.3 来建造的
利用
ggplot2::economics数据完成下题:
year函数)。unemploy)占总人口(pop)的比例,注意unemploy是以千人为单位,pop是以百万人为单位。factor
函数)。dat1 = as.data.table(ggplot2::economics)
library(dplyr)
## Warning: 程序包'dplyr'是用R版本4.4.3 来建造的
##
## 载入程序包:'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lubridate)
## Warning: 程序包'lubridate'是用R版本4.4.3 来建造的
##
## 载入程序包:'lubridate'
## The following objects are masked from 'package:data.table':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
#筛选出2000-2010年但不包括2008年的数据
data <- dat1 %>%
filter(year(date) >= 2000 & year(date) <= 2010 & year(date) != 2008)
#计算失业率
data <- data %>%
mutate(unemployment_rate = (unemploy * 1000) / (pop * 1000000))
#将失业率划分为低、中、高三等
data <- data %>%
mutate(unemployment_level = cut(unemployment_rate,
breaks = quantile(unemployment_rate, probs = c(0, 1/3, 2/3, 1)),
labels = c("低", "中", "高"),
include.lowest = TRUE))
# 按年份统计低、中、高失业率月份的数量
yearly_summary <- data %>%
group_by(year = year(date)) %>%
summarise(
low = sum(unemployment_level == "低"),
medium = sum(unemployment_level == "中"),
high = sum(unemployment_level == "高")
)
# 标记年份的失业率水平
yearly_summary <- yearly_summary %>%
mutate(year_unemployment_level = case_when(
low > medium & low > high ~ "低", # 低失业率月份最多
medium > low & medium > high ~ "中", # 中失业率月份最多
high > low & high > medium ~ "高", # 高失业率月份最多
low == medium & low > high ~ "中", # 低和中失业率月份相同且最多
low == high & low > medium ~ "中", # 低和高失业率月份相同且最多
medium == high & medium > low ~ "中", # 中和高失业率月份相同且最多
TRUE ~ "中" # 其他情况(如低、中、高月份数量相同),默认标记为“中”
))
# 添加 year 列
data <- data %>%
mutate(year = year(date))
# 合并年份标记到原始数据
data <- data %>%
left_join(yearly_summary %>% select(year,year_unemployment_level), by = c("year" = "year"))
pce)如何随个人储蓄率(psavert)的变化而变化。# 画图
ggplot(data, aes(x = psavert, y = pce, color = year_unemployment_level)) +
geom_point(alpha = 0.6) +
geom_smooth(method = "lm", se = FALSE) +
labs(
title = "个人消费支出随个人储蓄率的变化",
x = "个人储蓄率 (%)",
y = "个人消费支出",
color = "失业率水平"
) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
回答:在低、中、高三种失业率水平年份中,哪种失业率水平年份随着个人储蓄率的增加,个人消费支出呈现出相对比较平缓的下降趋势。
通过观察图中的趋势线,可以得出结论: 低失业率年份:随着个人储蓄率的增加,个人消费支出呈现出相对比较平缓的下降趋势。 中和高失业率年份的下降趋势相对更陡峭。 因此,低失业率年份的个人消费支出随个人储蓄率增加的下降趋势最为平缓。
利用
nycflights13package 的各数据完成下题(连接数据可使用data.table::merge函数):
library(nycflights13)
## Warning: 程序包'nycflights13'是用R版本4.4.3 来建造的
airline_flights <- flights %>%
group_by(carrier) %>% # 按航空公司代码分组
summarise(flights_count = n(),.groups = 'drop') %>% # 计算每家航空公司的航班数量
arrange(desc(flights_count)) # 按航班数量降序排列
# 连接 airlines 数据,获取航空公司名称
airline_flights <- airline_flights %>%
left_join(airlines, by = "carrier")
# 显示结果
print(airline_flights)
## # A tibble: 16 × 3
## carrier flights_count name
## <chr> <int> <chr>
## 1 UA 58665 United Air Lines Inc.
## 2 B6 54635 JetBlue Airways
## 3 EV 54173 ExpressJet Airlines Inc.
## 4 DL 48110 Delta Air Lines Inc.
## 5 AA 32729 American Airlines Inc.
## 6 MQ 26397 Envoy Air
## 7 US 20536 US Airways Inc.
## 8 9E 18460 Endeavor Air Inc.
## 9 WN 12275 Southwest Airlines Co.
## 10 VX 5162 Virgin America
## 11 FL 3260 AirTran Airways Corporation
## 12 AS 714 Alaska Airlines Inc.
## 13 F9 685 Frontier Airlines Inc.
## 14 YV 601 Mesa Airlines Inc.
## 15 HA 342 Hawaiian Airlines Inc.
## 16 OO 32 SkyWest Airlines Inc.
# 仅考虑抵达延误无缺失的航班
arr_delay_summary <- flights %>%
filter(!is.na(arr_delay)) %>%
group_by(carrier) %>%
summarise(mean_arr_delay = mean(arr_delay, na.rm = TRUE)) %>%
arrange(desc(mean_arr_delay))
# 连接 airlines 数据,获取航空公司名称
arr_delay_summary <- arr_delay_summary %>%
left_join(airlines, by = "carrier")
max_delay_airline <- arr_delay_summary[1, ]
min_delay_airline <- arr_delay_summary[nrow(arr_delay_summary), ]
# 显示结果
print(max_delay_airline) # 平均延误最高的航空公司
## # A tibble: 1 × 3
## carrier mean_arr_delay name
## <chr> <dbl> <chr>
## 1 F9 21.9 Frontier Airlines Inc.
print(min_delay_airline) # 平均延误最低的航空公司
## # A tibble: 1 × 3
## carrier mean_arr_delay name
## <chr> <dbl> <chr>
## 1 AS -9.93 Alaska Airlines Inc.
# 找到出发延误最长的那一天
longest_dep_delay_day <- flights %>%
group_by(date = as.Date(time_hour), origin) %>% # 按日期和机场分组
summarise(max_dep_delay = max(dep_delay, na.rm = TRUE), .groups = 'drop') %>%
arrange(desc(max_dep_delay)) %>%
slice(1) # 取出发延误最长的那一天
# 提取日期和机场信息
max_date <- longest_dep_delay_day$date
max_origin <- longest_dep_delay_day$origin
# 获取当天的平均温度
temp_on_max_delay_day <- weather %>%
filter(origin == max_origin & as.Date(time_hour) == max_date) %>%
summarise(avg_temp = mean(temp, na.rm = TRUE))
# 合并结果
result <- longest_dep_delay_day %>%
mutate(avg_temp = temp_on_max_delay_day$avg_temp) %>%
select(origin, date, max_dep_delay, avg_temp)
# 显示结果
print(result)
## # A tibble: 1 × 4
## origin date max_dep_delay avg_temp
## <chr> <date> <dbl> <dbl>
## 1 JFK 2013-01-09 1301 40.8
# 找到飞行时间最长的航班
longest_flight <- flights %>%
arrange(desc(air_time)) %>%
slice(1) %>% # 取飞行时间最长的航班
select(air_time, origin, dest, tailnum) # 选择需要的列
# 连接 planes 数据,获取座位数量
longest_flight_details <- longest_flight %>%
left_join(planes %>% select(tailnum, seats), by = "tailnum") %>%
select(air_time, seats, origin, dest) # 整理结果
# 显示结果
print(longest_flight_details)
## # A tibble: 1 × 4
## air_time seats origin dest
## <dbl> <int> <chr> <chr>
## 1 695 292 EWR HNL
# 5. 绘制 noDepDelay 影响到达延误的箱线图
flights_filtered <- flights %>%
mutate(noDepDelay = dep_delay <= 0) %>%
filter(month == 5 & dest %in% c("MIA", "BOS"))
ggplot(flights_filtered, aes(x = factor(dest), y = arr_delay, fill = noDepDelay)) +
geom_boxplot() +
facet_grid(origin ~ dest) +
labs(title = "Arrival Delay by Departure Delay Status", x = "Destination", y = "Arrival Delay (min)") +
theme_minimal()
## Warning: Removed 36 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
daily_delays <- flights %>%
group_by(date = as.Date(time_hour), origin) %>%
summarise(avg_dep_delay = mean(dep_delay, na.rm = TRUE), .groups = 'drop')
daily_visibility <- weather %>%
group_by(date = as.Date(time_hour), origin) %>%
summarise(avg_visibility = mean(visib, na.rm = TRUE), .groups = 'drop')
delay_visibility <- merge(daily_delays, daily_visibility, by = c("date", "origin"), all = TRUE)
ggplot(delay_visibility, aes(x = avg_visibility, y = avg_dep_delay, color = origin)) +
geom_point(alpha = 0.6) +
geom_smooth(method = "lm", se = FALSE) +
labs(title = "Average Departure Delay vs Visibility", x = "Average Visibility (miles)", y = "Average Departure Delay (min)") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 6 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 6 rows containing missing values or values outside the scale range
## (`geom_point()`).