---
title: "Work"
format:
html:
theme: cosmo
css: styles.css
toc: true
toc-title: "Contents"
toc-location: left
code-fold: show
code-tools: true
css: styles.css
---
Author: Haochun Qi
# 1.准备工作
## 1.1导入package
```{r, message=FALSE, warning=FALSE}
library(tidyverse)
library(openxlsx)
library(RColorBrewer)
library(data.table)
library(DT)
```
## 1.2 读取数据集
```{r}
getwd() # 获取工作空间位置
```
```{r}
data <- read.csv("C:/Users/TST/Desktop/统计学_学习/周末作业数据集/nobel_winners.csv") %>%
setDT() #读取数据集并转为数据框
```
# 2.数据探查
## 2.1 数据类型及基础信息
```{r}
# 数据类型
glimpse(data)
# 数据集
datatable(data,
options = list(
scrollX = TRUE,
scrollY = '200px',
pageLength = 5,
autoWidth = TRUE,
columnDefs = list(
list(width = '100px', targets = "_all")
)
))
```
## 2.2 缺失值
```{r, warning=FALSE}
# 总行数
total_row <- nrow(data)
# 缺失值及比例
missing_infor <- data %>%
summarise(across(everything(), list(
miss_count = ~ sum(is.na(.x)),
miss_pct = ~ sum(is.na(.x)) / total_row
))) %>%
pivot_longer(everything(), names_to = c("Column", ".value"), names_pattern = "(.*)_(miss_count|miss_pct)") %>%
arrange(desc(miss_pct))
knitr::kable(missing_infor)
# 最大缺失值
max_missing <- missing_infor %>%
slice(which.max(miss_pct)) %>%
print()
# 最小缺失值
min_missint <- missing_infor %>%
slice(which.min(miss_pct)) %>%
print()
```
## 2.3 重复值
```{r}
# 是否存在重复数据(整体列)
has_duplicates <- anyDuplicated(data) > 0
if (has_duplicates) {
print("存在重复数据")
} else {
print("没有重复数据")
}
```
### 2.3.1 同一id是否对应多个 个人信息
```{r, warning=FALSE}
check_columns <- c("laureate_type", "full_name", "birth_date", "birth_city", "birth_country", "gender")
dup_check <- function(data, check_col, col_list) {
dup_data <- data %>%
group_by({{check_col}}) %>%
summarise(across(all_of(col_list), ~ n_distinct(.x) > 1, .names = "diff_{col}")) %>%
filter(if_any(starts_with("diff_"), ~ .x)) %>%
return(dup_data)
}
dup_id_data <- dup_check(data, laureate_id, check_columns)
nrow(dup_id_data)
```
### 2.3.2 是否存在重复数据
```{r}
dup_data <- data %>%
group_by(prize_year, category, prize, laureate_id) %>%
filter(n() > 1) %>%
arrange(laureate_id)
print(head(dup_data))
```
### 2.3.3 数据去重
```{r}
fina_data <- data %>%
distinct(prize_year, category, prize, laureate_id, .keep_all = TRUE) %>%
arrange(prize_year)
nrow(fina_data)
datatable(fina_data,
options = list(
scrollX = TRUE,
scrollY = '200px',
pageLength = 5,
autoWidth = TRUE,
columnDefs = list(
list(width = '100px', targets = "_all")
)
))
```
### 2.3.4 重新确定缺失比例
```{r, warning=FALSE}
total_row_new <- nrow(fina_data)
missing_infor_new <- fina_data %>%
summarise(across(everything(), list(
miss_count = ~ sum(is.na(.x)),
miss_pct = ~ sum(is.na(.x)) / total_row_new
))) %>%
pivot_longer(everything(), names_to = c("Column", ".value"), names_pattern = "(.*)_(miss_count|miss_pct)") %>%
arrange(desc(miss_pct))
missing_infor_new %>%
filter(miss_count == 0)
knitr::kable(missing_infor_new)
```
```{r, warning=FALSE}
# 最大缺失值
max_missing_infor <- missing_infor_new %>%
slice(which.max(miss_pct)) %>%
print()
```
```{r, warning=FALSE}
# 最小缺失值
min_missing_infor <- missing_infor_new %>%
slice(which.min(miss_pct)) %>%
print()
```
### 2.3.5 基础信息
```{r}
# 总体奖项类别
table(fina_data$category)
# 获奖的时间跨度
quantile(fina_data$prize_year)
```
# 3.探索性分析
## 3.1 每个学科颁奖次数
```{r}
awards_count <- data %>%
group_by(category) %>%
summarise(award_count = n(), .groups = 'drop')
ggplot(awards_count, aes(x = reorder(category, -award_count), y = award_count, fill = category)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = c("#003f5c", "#444e86", "#955196", "#dd5182", "#ff6e54", "#ffa600")) + # 自定义颜色
labs(title = "诺贝尔奖各学科颁奖次数",
x = "类别 (Category)",
y = "颁奖次数 (Award Count)") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1), # 调整x轴标签角度
plot.title = element_text(hjust = 0.5)) +
geom_text(aes(label = award_count), vjust = -0.5, color = "black", size = 4, position = position_dodge(width = 0.9))
```
## 3.2 获奖人的出生年代分布
```{r}
fina_data$birth_date <- as.Date(fina_data$birth_date)
fina_data$prize_year <- as.numeric(fina_data$prize_year)
fina_data <- fina_data %>%
mutate(birth_year = year(birth_date)) %>%
mutate(decade = floor(birth_year / 10) * 10) %>%
mutate(award_age = as.numeric(prize_year - birth_year))
decade_count <- fina_data %>%
filter(!is.na(birth_year)) %>%
group_by(decade) %>%
summarise(num_count = n(), .groups = 'drop')
decade_count$decade <- as.character(decade_count$decade)
ggplot(decade_count, aes(x = decade, y = num_count, fill = decade)) +
geom_bar(stat = "identity") +
labs(title = "获奖者年代人数统计",
x = "年代 (Decade)",
y = "人数 (Number)") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1), # 调整x轴标签角度
plot.title = element_text(hjust = 0.5)) +
geom_text(aes(label = num_count), vjust = -0.5, color = "black", size = 4, position = position_dodge(width = 0.9))
```
## 3.3 性别比例
```{r}
gender_count <- fina_data %>%
filter(laureate_type == "Individual" & !is.na(gender)) %>%
group_by(prize_year, gender) %>%
summarise(num_count = n(), .groups = 'drop')
total_per_year <- fina_data %>%
filter(laureate_type == "Individual" & !is.na(gender)) %>%
group_by(prize_year) %>%
summarise(p_c_cnt = n(), .groups = 'drop')
gender_ratio <- gender_count %>%
left_join(total_per_year, by = "prize_year") %>%
mutate(ratio = num_count / p_c_cnt)
# 创建堆积条形图
ggplot(gender_ratio, aes(x = factor(prize_year), y = ratio, fill = gender)) +
geom_bar(stat = "identity", position = "fill") + # 使用"fill"使得每一年的比例总和为1
scale_y_continuous(labels = scales::percent) + # 将y轴标签格式化为百分比
labs(title = "不同年份性别占比",
x = "年份",
y = "比例",
fill = "性别") +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5),
axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)
) +
scale_x_discrete(breaks = seq(min(as.numeric(gender_ratio$prize_year)), max(as.numeric(gender_ratio$prize_year)), by = 3))
```
## 3.4 平均年龄
```{r}
fina_data$prize_year <- as.numeric(fina_data$prize_year)
fina_data <- fina_data %>%
mutate(award_decade = floor(prize_year / 10) * 10)
fina_data$award_decade <- as.character(fina_data$award_decade)
average_age <- fina_data %>%
filter(!is.na(award_decade)) %>%
group_by(award_decade, category) %>%
summarise(avg_age = mean(award_age, na.rm = TRUE), .groups = 'drop')
ggplot(average_age, aes(x = factor(award_decade), y = avg_age, color = category, group = category)) +
geom_line() + # 添加折线
geom_point() + # 添加点以突出每个数据点
scale_x_discrete() + # 确保x轴按离散值处理
labs(title = "获奖者平均年龄",
x = "获奖年代",
y = "平均年龄",
color = "类别") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1), # 调整x轴标签角度
plot.title = element_text(hjust = 0.5))
```
## 3.5 最年轻的诺贝尔获奖者
```{r}
min_age_records <- fina_data %>%
filter(laureate_type == "Individual") %>%
slice(which.min(award_age))
knitr::kable(min_age_records)
```
## 3.6 中国获得诺贝尔奖的有哪些
```{r, echo=TRUE}
China_awarders <- fina_data %>%
filter(birth_country == "China") %>%
select(full_name, prize_year, category, prize, award_age)
knitr::kable(China_awarders)
```