1 背景

数据分析思路学习

2 数据描述

2.1 依赖宏包

library(readr)
library(tidyverse)
library(lubridate)
library(dplyr)
library(ggplot2)

2.2 读取数据

df <- read_csv("test.csv")

## Rows: 969 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (16): category, prize, motivation, prize_share, laureate_type, full_name...
## dbl  (2): prize_year, laureate_id
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

df

总共18个字段，969条数据，列信息如下：

prize_year ,category ,prize ,motivation ,prize_share ,laureate_id ,laureate_type ,full_name ,birth_date ,birth_city ,birth_country ,gender ,organization_name ,organization_city ,organization_country ,death_date ,death_city ,death_country

年份 ,学科 ,奖品名称 ,获奖原因 ,获奖时间 ,获奖人id ,获奖人类型（个人、团体） ,获奖人名称 ,出生日期 ,出生城市 ,出生国家 ,性别 ,组织名称 ,组织城市 ,组织国家 ,死亡日期 ,死亡城市 ,死亡国家

3 数据清洗

3.1 数据缺失

df %>%
  summarise(across(everything(), ~ sum(is.na(.)))) %>%
  pivot_longer(everything(), names_to = "variable", values_to = "missing_count") %>%
  mutate(total_count = nrow(df),
         missing_ratio = missing_count / total_count)

可以看到获奖原因缺失88条，出生日期、出生城市、出生国家、性别、组织名称、组织城市、组织国家、死亡时间、死亡城市、死亡国家皆有缺失。

性别缺失怎么造成的？

df %>% count(laureate_type)

组织没有性别….，死亡时间为空的等也是同理。

3.2 数据是否重复

# 将每一行转换为字符向量并计算其频率
row_counts <- table(do.call(paste, df))

# 找出所有出现次数大于1的行
duplicate_rows_info <- row_counts[row_counts > 1]
duplicate_rows_info

## named integer(0)

可以看到无重复数据

4 探索性研究

4.1 每年的获奖数量趋势

df_prize_cnt <- df %>%
  group_by(prize_year) %>%
  summarise(prize_cnt = n(), .groups = 'drop')


ggplot(df_prize_cnt, aes(x = prize_year, y = prize_cnt)) +
  geom_line(color = "blue") +  # 设置线条颜色和宽度
  geom_point(color = "red") +  # 设置点的颜色和大小
  labs(
    title = "每年的获奖数量",
    x = "年份",
    y = "奖品数量"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5),  # 居中标题
  )

总体呈现逐年增长的趋势

建立年份和获奖人数的回归模型，并预测2017年的获奖人数

# 计算每年总获奖数量
df_prize_cnt <- df %>%
  group_by(prize_year) %>%
  summarise(prize_cnt = n(), .groups = 'drop')

# 构建线性回归模型
award_model <- lm(prize_cnt ~ prize_year, data = df_prize_cnt)

# 查看模型摘要
summary(award_model)

## 
## Call:
## lm(formula = prize_cnt ~ prize_year, data = df_prize_cnt)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.2975 -1.2581  0.1866  1.5978  5.1729 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.579e+02  1.092e+01  -14.46   <2e-16 ***
## prize_year   8.498e-02  5.573e-03   15.25   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.003 on 111 degrees of freedom
## Multiple R-squared:  0.6768, Adjusted R-squared:  0.6739 
## F-statistic: 232.5 on 1 and 111 DF,  p-value: < 2.2e-16

# 创建一个用于预测的数据框
new_data <- data.frame(prize_year = 2017)

# 使用模型进行预测
predicted_2017 <- predict(award_model, newdata = new_data)

cat("预测2017年的获奖数量为:", round(predicted_2017), "\n")

## 预测2017年的获奖数量为: 14

# 绘制实际数据点和回归线
regression_plot <- ggplot(df_prize_cnt, aes(x = prize_year, y = prize_cnt)) +
  geom_point() + # 绘制散点图
  geom_smooth(method = "lm", col = "blue") + # 添加线性回归线
  labs(
    title = "年份与获奖人数的线性回归",
    x = "年份",
    y = "总获奖人数"
  ) +
  theme_minimal()

# 添加2017年的预测点
regression_plot <- regression_plot +
  geom_point(data = new_data, aes(x = prize_year, y = predicted_2017), color = "red", size = 3) +
  geom_text(data = new_data, aes(x = prize_year, y = predicted_2017, label = paste("预测: ", round(predicted_2017))), vjust = -1, color = "red") +
  theme(
    plot.title = element_text(hjust = 0.5),  # 居中标题
  )
regression_plot

## `geom_smooth()` using formula = 'y ~ x'

一个诺贝尔奖可以同时授予最多三位获奖者，一年6个奖项，最多18个获奖者。

4.2 各学科诺贝尔奖数量占比

# 按学科分组并计算每个学科的奖品数量
df_category_cnt <- df %>%
  group_by(category) %>%
  summarise(category_cnt = n(), .groups = 'drop')


ggplot(df_category_cnt, aes(x = "", y = category_cnt, fill = category)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar("y", start = 0) +
  scale_fill_brewer(palette = "Set3") +  # 使用不同的颜色方案
  labs(
    title = "各学科获奖数量分布",
    fill = "学科"
  ) +
  theme_void() +
  geom_text(aes(label = paste(round((category_cnt/sum(category_cnt)*100), 1), "%")), 
            position = position_stack(vjust = 0.5)) + # 添加百分比标签
  theme(
  plot.title = element_text(hjust = 0.5, size = 14, face = "bold", color = "black")
)

医学、化学、物理学是三巨头，占了66.3%，经济学占比最少（8.6%）

医学、化学和物理学只所以占大头，因为它们本身的基础性和应用性，也是由于诺贝尔奖设立的历史背景和个人意愿。同时，随着科技的发展，这些学科之间的相互联系愈发紧密，进一步巩固了它们在诺贝尔奖中的主导地位。

经济学奖并非最初设立的五个奖项之一。它是1969年开始颁发。相比其他诺贝尔奖项自1901年就已设立，经济学奖的历史较短，因此在总获奖次数上自然会少一些。

4.3 个人、团体获奖数量占比

df_laureate_type_cnt <- df %>%
  group_by(laureate_type) %>%
  summarise(laureate_type_cnt = n(), .groups = 'drop')


ggplot(df_laureate_type_cnt, aes(x = "", y = laureate_type_cnt, fill = laureate_type)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar("y", start = 0) +
  scale_fill_brewer(palette = "Set3") +  # 使用不同的颜色方案
  labs(
    title = "个人、团体获奖数量占比",
    fill = "学科"
  ) +
  theme_void() +
  geom_text(aes(label = paste(round((laureate_type_cnt/sum(laureate_type_cnt)*100), 1), "%")), 
            position = position_stack(vjust = 0.5)) + # 添加百分比标签
  theme(
  plot.title = element_text(hjust = 0.5, size = 14, face = "bold", color = "black")
)

4.4 各学科按获奖人类型划分的获奖次数

# 计算各学科、各获奖人类型的获奖次数
df_award_count_cnt <- df %>%
  group_by(category, laureate_type) %>%
  summarise(award_count_cnt = n(), .groups = 'drop')


# 创建堆叠条形图
ggplot(df_award_count_cnt, aes(x = category, y = award_count_cnt, fill = laureate_type)) +
  geom_bar(stat = "identity") + # 使用堆叠条形图
  labs(
    title = "各学科按获奖人类型划分的获奖次数",
    x = "学科",
    y = "获奖次数",
    fill = "获奖人类型"
  ) +
  theme_minimal() +
  scale_fill_brewer(palette = "Set1") # 使用预定义的颜色调色板

除了诺贝尔和平奖，其他的获奖者都是个人，这是由于诺贝尔委员会在评估提名时，通常会寻找那些具有明确原创性和创新性的成果。而个人研究者往往更容易被认定为某一发现或理论的主要创始人。

值得一提的是，诺贝尔和平奖是一个特例，它可以颁发给组织或团体，并且没有三人限制。

4.5 出生国家的获奖数量

df_birth_country_cnt <- df %>%
  filter(!is.na(birth_country) & birth_country != "") %>%  
  group_by(birth_country) %>%
  summarise(birth_country_cnt = n(), .groups = 'drop') %>% 
  arrange(desc(birth_country_cnt)) %>%  # 按获奖数量降序排列
  top_n(10)  # 取出获奖数量最多的前10个国家

## Selecting by birth_country_cnt

ggplot(df_birth_country_cnt, aes(x = reorder(birth_country, birth_country_cnt), y = birth_country_cnt)) +
  geom_bar(stat = "identity", fill = "steelblue") +  # 使用条形图表示获奖数量
  labs(
    title = "获奖数量最多的前10个出生国家",
    x = "出生国家",
    y = "获奖数量"
  ) +
  theme_minimal() +  # 使用简约主题
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  # 旋转x轴标签以适应长名称
    plot.title = element_text(hjust = 0.5)  # 居中标题
  ) +
  coord_flip()  # 翻转坐标轴，使国家名称垂直排列，更易阅读

美、英、德三国是20年代老牌发达归家，这些国家在科研投入、教育体系以及学术自由等方面，有着强大实力和支持。我泱泱大国当自强！

4.6 中国各城市获奖数量

df_China_cnt <- df %>%
  filter(str_trim(birth_country) == "China") %>%
  filter(!is.na(birth_city) & birth_city != "") %>%  
  group_by(birth_city) %>%
  summarise(birth_city_cnt = n(), .groups = 'drop') %>% 
  arrange(desc(birth_city_cnt))

ggplot(df_China_cnt, aes(x = reorder(birth_city, birth_city_cnt), y = birth_city_cnt)) +
  geom_bar(stat = "identity", fill = "steelblue") +  # 使用条形图表示获奖数量
  labs(
    title = "中国各城市获奖数量",
    x = "出生城市",
    y = "获奖数量"
  ) +
  theme_minimal() +  # 使用简约主题
  theme(
    plot.title = element_text(hjust = 0.5)  # 居中标题
  ) +
  coord_flip()  # 翻转坐标轴，使国家名称垂直排列，更易阅读

可以看到中国获诺奖数量排名第一个的城市是上海，大都市机会多多…..

4.7 获奖人性别占比

df_gender_cnt <- df %>%
  filter(!is.na(gender) & gender != "") %>%  
  group_by(gender) %>%
  summarise(gender_cnt = n(), .groups = 'drop')

# 创建饼图
ggplot(df_gender_cnt, aes(x = "", y = gender_cnt, fill = gender)) +
  geom_bar(stat = "identity", width = 1) + 
  coord_polar("y", start = 0) + 
  theme_void() +
  labs(
    title = "各性别的获奖比例",
    fill = "性别"
  ) +
  scale_fill_brewer(palette = "Set3") +
  geom_text(aes(label = paste0(round((gender_cnt/sum(gender_cnt))*100, 2), "%")), position = position_stack(vjust = 0.5))

大部分都男性。

描述性数学分析-诺奖获得者

Yonghao Liu

2024-10-28