Work

Author: Haochun Qi

1.准备工作

1.1导入package

Code

library(tidyverse)
library(openxlsx)
library(RColorBrewer)
library(data.table)
library(DT)

1.2 读取数据集

Code

getwd() # 获取工作空间位置

[1] "D:/R_program/nober_winners_new"

Code

data <- read.csv("C:/Users/TST/Desktop/统计学_学习/周末作业数据集/nobel_winners.csv") %>%
  setDT() #读取数据集并转为数据框

2.数据探查

2.1 数据类型及基础信息

Code

# 数据类型
glimpse(data)

Rows: 969
Columns: 18
$ prize_year           <int> 1901, 1901, 1901, 1901, 1901, 1901, 1902, 1902, 1…
$ category             <chr> "Chemistry", "Literature", "Medicine", "Peace", "…
$ prize                <chr> "The Nobel Prize in Chemistry 1901", "The Nobel P…
$ motivation           <chr> "\"in recognition of the extraordinary services h…
$ prize_share          <chr> "1/1", "1/1", "1/1", "1/2", "1/2", "1/1", "1/1", …
$ laureate_id          <int> 160, 569, 293, 462, 463, 1, 161, 571, 294, 464, 4…
$ laureate_type        <chr> "Individual", "Individual", "Individual", "Indivi…
$ full_name            <chr> "Jacobus Henricus van 't Hoff", "Sully Prudhomme"…
$ birth_date           <chr> "1852-08-30", "1839-03-16", "1854-03-15", "1828-0…
$ birth_city           <chr> "Rotterdam", "Paris", "Hansdorf (Lawice)", "Genev…
$ birth_country        <chr> "Netherlands", "France", "Prussia (Poland)", "Swi…
$ gender               <chr> "Male", "Male", "Male", "Male", "Male", "Male", "…
$ organization_name    <chr> "Berlin University", NA, "Marburg University", NA…
$ organization_city    <chr> "Berlin", NA, "Marburg", NA, NA, "Munich", "Berli…
$ organization_country <chr> "Germany", NA, "Germany", NA, NA, "Germany", "Ger…
$ death_date           <chr> "1911-03-01", "1907-09-07", "1917-03-31", "1910-1…
$ death_city           <chr> "Berlin", "Châtenay", "Marburg", "Heiden", "Paris…
$ death_country        <chr> "Germany", "France", "Germany", "Switzerland", "F…

Code

# 数据集
datatable(data, 
          options = list(
            scrollX = TRUE,
            scrollY = '200px',
            pageLength = 5,
            autoWidth = TRUE,
            columnDefs = list(
              list(width = '100px', targets = "_all")
          )
          ))

2.2 缺失值

Code

 # 总行数
  total_row <- nrow(data)
 # 缺失值及比例
  missing_infor <- data %>%
    summarise(across(everything(), list(
      miss_count = ~ sum(is.na(.x)),
      miss_pct = ~ sum(is.na(.x)) / total_row
    ))) %>%
    pivot_longer(everything(), names_to = c("Column", ".value"), names_pattern = "(.*)_(miss_count|miss_pct)") %>%
    arrange(desc(miss_pct))
  
  knitr::kable(missing_infor)

Column	miss_count	miss_pct
death_city	370	0.3818369
death_country	364	0.3756450
death_date	352	0.3632611
organization_city	253	0.2610939
organization_country	253	0.2610939
organization_name	247	0.2549020
motivation	88	0.0908153
birth_date	31	0.0319917
birth_city	28	0.0288958
birth_country	26	0.0268318
gender	26	0.0268318
prize_year	0	0.0000000
category	0	0.0000000
prize	0	0.0000000
prize_share	0	0.0000000
laureate_id	0	0.0000000
laureate_type	0	0.0000000
full_name	0	0.0000000

Code

  # 最大缺失值
  max_missing <- missing_infor %>%
   slice(which.max(miss_pct)) %>%
   print()

# A tibble: 1 × 3
  Column     miss_count miss_pct
  <chr>           <int>    <dbl>
1 death_city        370    0.382

Code

  # 最小缺失值  
  min_missint <- missing_infor %>%
   slice(which.min(miss_pct)) %>%
   print()

# A tibble: 1 × 3
  Column     miss_count miss_pct
  <chr>           <int>    <dbl>
1 prize_year          0        0

2.3 重复值

Code

# 是否存在重复数据（整体列）
  has_duplicates <- anyDuplicated(data) > 0
  
  if (has_duplicates) {
    print("存在重复数据")
  } else {
    print("没有重复数据")
  }

[1] "没有重复数据"

2.3.1 同一id是否对应多个个人信息

Code

 check_columns <- c("laureate_type", "full_name", "birth_date", "birth_city", "birth_country", "gender") 
  
  dup_check <- function(data, check_col, col_list) {
    
    dup_data <- data %>%
      group_by({{check_col}}) %>%
      summarise(across(all_of(col_list), ~ n_distinct(.x) > 1, .names = "diff_{col}")) %>%
      filter(if_any(starts_with("diff_"), ~ .x)) %>%
    return(dup_data)
  }
  
  dup_id_data <- dup_check(data, laureate_id, check_columns)
  nrow(dup_id_data)

[1] 0

2.3.2 是否存在重复数据

Code

  dup_data <- data %>%
    group_by(prize_year, category, prize, laureate_id) %>%
    filter(n() > 1) %>%
    arrange(laureate_id)

  print(head(dup_data))

# A tibble: 6 × 18
# Groups:   prize_year, category, prize, laureate_id [3]
  prize_year category prize     motivation prize_share laureate_id laureate_type
       <int> <chr>    <chr>     <chr>      <chr>             <int> <chr>        
1       1949 Physics  The Nobe… "\"for hi… 1/1                  54 Individual   
2       1949 Physics  The Nobe… "\"for hi… 1/1                  54 Individual   
3       1954 Physics  The Nobe… "\"for th… 1/2                  62 Individual   
4       1954 Physics  The Nobe… "\"for th… 1/2                  62 Individual   
5       1958 Physics  The Nobe… "\"for th… 1/3                  71 Individual   
6       1958 Physics  The Nobe… "\"for th… 1/3                  71 Individual   
# ℹ 11 more variables: full_name <chr>, birth_date <chr>, birth_city <chr>,
#   birth_country <chr>, gender <chr>, organization_name <chr>,
#   organization_city <chr>, organization_country <chr>, death_date <chr>,
#   death_city <chr>, death_country <chr>

2.3.3 数据去重

Code

  fina_data <- data %>%
    distinct(prize_year, category, prize, laureate_id, .keep_all = TRUE) %>%
    arrange(prize_year)

  nrow(fina_data)

[1] 911

Code

  datatable(fina_data, 
          options = list(
            scrollX = TRUE,
            scrollY = '200px',
            pageLength = 5,
            autoWidth = TRUE,
            columnDefs = list(
              list(width = '100px', targets = "_all")
          )
          ))

2.3.4 重新确定缺失比例

Code

  total_row_new <- nrow(fina_data) 
 
  missing_infor_new <- fina_data %>%
    summarise(across(everything(), list(
      miss_count = ~ sum(is.na(.x)),
      miss_pct = ~ sum(is.na(.x)) / total_row_new
    ))) %>%
    pivot_longer(everything(), names_to = c("Column", ".value"), names_pattern = "(.*)_(miss_count|miss_pct)") %>%
    arrange(desc(miss_pct))
  
  missing_infor_new %>%
    filter(miss_count == 0)

# A tibble: 7 × 3
  Column        miss_count miss_pct
  <chr>              <int>    <dbl>
1 prize_year             0        0
2 category               0        0
3 prize                  0        0
4 prize_share            0        0
5 laureate_id            0        0
6 laureate_type          0        0
7 full_name              0        0

Code

  knitr::kable(missing_infor_new)

Column	miss_count	miss_pct
death_city	335	0.3677278
death_country	329	0.3611416
death_date	318	0.3490670
organization_name	246	0.2700329
organization_city	244	0.2678375
organization_country	244	0.2678375
motivation	88	0.0965971
birth_date	30	0.0329308
birth_city	28	0.0307355
birth_country	26	0.0285401
gender	26	0.0285401
prize_year	0	0.0000000
category	0	0.0000000
prize	0	0.0000000
prize_share	0	0.0000000
laureate_id	0	0.0000000
laureate_type	0	0.0000000
full_name	0	0.0000000

Code

  # 最大缺失值
  max_missing_infor <- missing_infor_new %>%
   slice(which.max(miss_pct)) %>%
   print()

# A tibble: 1 × 3
  Column     miss_count miss_pct
  <chr>           <int>    <dbl>
1 death_city        335    0.368

Code

  # 最小缺失值  
  min_missing_infor <- missing_infor_new %>%
   slice(which.min(miss_pct)) %>%
   print()

# A tibble: 1 × 3
  Column     miss_count miss_pct
  <chr>           <int>    <dbl>
1 prize_year          0        0

2.3.5 基础信息

Code

  # 总体奖项类别
  table(fina_data$category)


 Chemistry  Economics Literature   Medicine      Peace    Physics 
       175         78        113        211        130        204

Code

  # 获奖的时间跨度
  quantile(fina_data$prize_year)

  0%  25%  50%  75% 100% 
1901 1946 1975 1997 2016

3.探索性分析

3.1 每个学科颁奖次数

Code

 awards_count <- data %>%
  group_by(category) %>%
    summarise(award_count = n(), .groups = 'drop')
  
  ggplot(awards_count, aes(x = reorder(category, -award_count), y = award_count, fill = category)) +
    geom_bar(stat = "identity") +
    scale_fill_manual(values = c("#003f5c", "#444e86", "#955196", "#dd5182", "#ff6e54", "#ffa600")) +  # 自定义颜色
    labs(title = "诺贝尔奖各学科颁奖次数",
         x = "类别 (Category)",
         y = "颁奖次数 (Award Count)") +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1), # 调整x轴标签角度
          plot.title = element_text(hjust = 0.5)) +  
    geom_text(aes(label = award_count), vjust = -0.5, color = "black", size = 4, position = position_dodge(width = 0.9))

3.2 获奖人的出生年代分布

Code

 fina_data$birth_date <- as.Date(fina_data$birth_date)
  fina_data$prize_year <- as.numeric(fina_data$prize_year)
  
  fina_data <- fina_data %>%
    mutate(birth_year = year(birth_date)) %>%
    mutate(decade = floor(birth_year / 10) * 10) %>%
    mutate(award_age = as.numeric(prize_year - birth_year))
  
  decade_count <- fina_data %>%
    filter(!is.na(birth_year)) %>% 
    group_by(decade) %>%
    summarise(num_count = n(), .groups = 'drop') 
    
  decade_count$decade <- as.character(decade_count$decade)
  
  ggplot(decade_count, aes(x = decade, y = num_count, fill = decade)) +
    geom_bar(stat = "identity") +
    labs(title = "获奖者年代人数统计",
         x = "年代 (Decade)",
         y = "人数 (Number)") +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1), # 调整x轴标签角度
          plot.title = element_text(hjust = 0.5)) +  
    geom_text(aes(label = num_count), vjust = -0.5, color = "black", size = 4, position = position_dodge(width = 0.9))

3.3 性别比例

Code

  gender_count <- fina_data %>% 
    filter(laureate_type == "Individual" & !is.na(gender)) %>% 
    group_by(prize_year, gender) %>%
    summarise(num_count = n(), .groups = 'drop')
  
  total_per_year <- fina_data %>%
   filter(laureate_type == "Individual" & !is.na(gender)) %>% 
   group_by(prize_year) %>%
   summarise(p_c_cnt = n(), .groups = 'drop')
  
  gender_ratio <- gender_count %>%
   left_join(total_per_year, by = "prize_year") %>%
   mutate(ratio = num_count / p_c_cnt)
  
  # 创建堆积条形图
  ggplot(gender_ratio, aes(x = factor(prize_year), y = ratio, fill = gender)) +
   geom_bar(stat = "identity", position = "fill") + # 使用"fill"使得每一年的比例总和为1
   scale_y_continuous(labels = scales::percent) + # 将y轴标签格式化为百分比
   labs(title = "不同年份性别占比",
        x = "年份",
        y = "比例",
        fill = "性别") +
   theme_minimal() +
   theme(
     plot.title = element_text(hjust = 0.5),
     axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)
   ) +
   scale_x_discrete(breaks = seq(min(as.numeric(gender_ratio$prize_year)), max(as.numeric(gender_ratio$prize_year)), by = 3))

3.4 平均年龄

Code

  fina_data$prize_year <- as.numeric(fina_data$prize_year)
     
  fina_data <- fina_data %>%
   mutate(award_decade = floor(prize_year / 10) * 10)
     
     
  fina_data$award_decade <- as.character(fina_data$award_decade)
     
  average_age <- fina_data %>%
   filter(!is.na(award_decade)) %>%
   group_by(award_decade, category) %>%
   summarise(avg_age = mean(award_age, na.rm = TRUE), .groups = 'drop')
     
  ggplot(average_age, aes(x = factor(award_decade), y = avg_age, color = category, group = category)) +
   geom_line() +  # 添加折线
   geom_point() + # 添加点以突出每个数据点
   scale_x_discrete() + # 确保x轴按离散值处理
   labs(title = "获奖者平均年龄",
        x = "获奖年代",
        y = "平均年龄",
        color = "类别") +
   theme_minimal() +
   theme(axis.text.x = element_text(angle = 45, hjust = 1), # 调整x轴标签角度
                          plot.title = element_text(hjust = 0.5))

3.5 最年轻的诺贝尔获奖者

Code

  min_age_records <- fina_data %>%
   filter(laureate_type == "Individual") %>%
   slice(which.min(award_age))
  
  knitr::kable(min_age_records)

prize_year	category	prize	motivation	prize_share	laureate_id	laureate_type	full_name	birth_date	birth_city	birth_country	gender	organization_name	organization_city	organization_country	death_date	death_city	death_country	birth_year	decade	award_age	award_decade
2014	Peace	The Nobel Peace Prize 2014	“for their struggle against the suppression of children and young people and for the right of all children to education”	1/2	914	Individual	Malala Yousafzai	1997-07-12	Mingora	Pakistan	Female	NA	NA	NA	NA	NA	NA	1997	1990	17	2010

3.6 中国获得诺贝尔奖的有哪些

Code

  China_awarders <- fina_data %>%
   filter(birth_country == "China") %>%
   select(full_name, prize_year, category, prize, award_age)

  knitr::kable(China_awarders)

full_name	prize_year	category	prize	award_age
Walter Houser Brattain	1956	Physics	The Nobel Prize in Physics 1956	54
Chen Ning Yang	1957	Physics	The Nobel Prize in Physics 1957	35
Tsung-Dao (T.D.) Lee	1957	Physics	The Nobel Prize in Physics 1957	31
Edmond H. Fischer	1992	Medicine	The Nobel Prize in Physiology or Medicine 1992	72
Daniel C. Tsui	1998	Physics	The Nobel Prize in Physics 1998	59
Gao Xingjian	2000	Literature	The Nobel Prize in Literature 2000	60
Charles Kuen Kao	2009	Physics	The Nobel Prize in Physics 2009	76
Ei-ichi Negishi	2010	Chemistry	The Nobel Prize in Chemistry 2010	75
Liu Xiaobo	2010	Peace	The Nobel Peace Prize 2010	55
Mo Yan	2012	Literature	The Nobel Prize in Literature 2012	57
Youyou Tu	2015	Medicine	The Nobel Prize in Physiology or Medicine 2015	85

1.准备工作

1.1导入package

1.2 读取数据集

2.数据探查

2.1 数据类型及基础信息

2.2 缺失值

2.3 重复值

2.3.1 同一id是否对应多个 个人信息

2.3.2 是否存在重复数据

2.3.3 数据去重

2.3.4 重新确定缺失比例

2.3.5 基础信息

3.探索性分析

3.1 每个学科颁奖次数

3.2 获奖人的出生年代分布

3.3 性别比例

3.4 平均年龄

3.5 最年轻的诺贝尔获奖者

3.6 中国获得诺贝尔奖的有哪些

2.3.1 同一id是否对应多个个人信息