Author: Haochun Qi

1.准备工作

1.1导入package

Code
library(tidyverse)
library(openxlsx)
library(RColorBrewer)
library(data.table)
library(DT)

1.2 读取数据集

Code
getwd() # 获取工作空间位置
[1] "D:/R_program/nober_winners_new"
Code
data <- read.csv("C:/Users/TST/Desktop/统计学_学习/周末作业数据集/nobel_winners.csv") %>%
  setDT() #读取数据集并转为数据框

2.数据探查

2.1 数据类型及基础信息

Code
# 数据类型
glimpse(data)
Rows: 969
Columns: 18
$ prize_year           <int> 1901, 1901, 1901, 1901, 1901, 1901, 1902, 1902, 1…
$ category             <chr> "Chemistry", "Literature", "Medicine", "Peace", "…
$ prize                <chr> "The Nobel Prize in Chemistry 1901", "The Nobel P…
$ motivation           <chr> "\"in recognition of the extraordinary services h…
$ prize_share          <chr> "1/1", "1/1", "1/1", "1/2", "1/2", "1/1", "1/1", …
$ laureate_id          <int> 160, 569, 293, 462, 463, 1, 161, 571, 294, 464, 4…
$ laureate_type        <chr> "Individual", "Individual", "Individual", "Indivi…
$ full_name            <chr> "Jacobus Henricus van 't Hoff", "Sully Prudhomme"…
$ birth_date           <chr> "1852-08-30", "1839-03-16", "1854-03-15", "1828-0…
$ birth_city           <chr> "Rotterdam", "Paris", "Hansdorf (Lawice)", "Genev…
$ birth_country        <chr> "Netherlands", "France", "Prussia (Poland)", "Swi…
$ gender               <chr> "Male", "Male", "Male", "Male", "Male", "Male", "…
$ organization_name    <chr> "Berlin University", NA, "Marburg University", NA…
$ organization_city    <chr> "Berlin", NA, "Marburg", NA, NA, "Munich", "Berli…
$ organization_country <chr> "Germany", NA, "Germany", NA, NA, "Germany", "Ger…
$ death_date           <chr> "1911-03-01", "1907-09-07", "1917-03-31", "1910-1…
$ death_city           <chr> "Berlin", "Châtenay", "Marburg", "Heiden", "Paris…
$ death_country        <chr> "Germany", "France", "Germany", "Switzerland", "F…
Code
# 数据集
datatable(data, 
          options = list(
            scrollX = TRUE,
            scrollY = '200px',
            pageLength = 5,
            autoWidth = TRUE,
            columnDefs = list(
              list(width = '100px', targets = "_all")
          )
          ))

2.2 缺失值

Code
 # 总行数
  total_row <- nrow(data)
 # 缺失值及比例
  missing_infor <- data %>%
    summarise(across(everything(), list(
      miss_count = ~ sum(is.na(.x)),
      miss_pct = ~ sum(is.na(.x)) / total_row
    ))) %>%
    pivot_longer(everything(), names_to = c("Column", ".value"), names_pattern = "(.*)_(miss_count|miss_pct)") %>%
    arrange(desc(miss_pct))
  
  knitr::kable(missing_infor)
Column miss_count miss_pct
death_city 370 0.3818369
death_country 364 0.3756450
death_date 352 0.3632611
organization_city 253 0.2610939
organization_country 253 0.2610939
organization_name 247 0.2549020
motivation 88 0.0908153
birth_date 31 0.0319917
birth_city 28 0.0288958
birth_country 26 0.0268318
gender 26 0.0268318
prize_year 0 0.0000000
category 0 0.0000000
prize 0 0.0000000
prize_share 0 0.0000000
laureate_id 0 0.0000000
laureate_type 0 0.0000000
full_name 0 0.0000000
Code
  # 最大缺失值
  max_missing <- missing_infor %>%
   slice(which.max(miss_pct)) %>%
   print()
# A tibble: 1 × 3
  Column     miss_count miss_pct
  <chr>           <int>    <dbl>
1 death_city        370    0.382
Code
  # 最小缺失值  
  min_missint <- missing_infor %>%
   slice(which.min(miss_pct)) %>%
   print()
# A tibble: 1 × 3
  Column     miss_count miss_pct
  <chr>           <int>    <dbl>
1 prize_year          0        0

2.3 重复值

Code
# 是否存在重复数据(整体列)
  has_duplicates <- anyDuplicated(data) > 0
  
  if (has_duplicates) {
    print("存在重复数据")
  } else {
    print("没有重复数据")
  }
[1] "没有重复数据"

2.3.1 同一id是否对应多个 个人信息

Code
 check_columns <- c("laureate_type", "full_name", "birth_date", "birth_city", "birth_country", "gender") 
  
  dup_check <- function(data, check_col, col_list) {
    
    dup_data <- data %>%
      group_by({{check_col}}) %>%
      summarise(across(all_of(col_list), ~ n_distinct(.x) > 1, .names = "diff_{col}")) %>%
      filter(if_any(starts_with("diff_"), ~ .x)) %>%
    return(dup_data)
  }
  
  dup_id_data <- dup_check(data, laureate_id, check_columns)
  nrow(dup_id_data) 
[1] 0

2.3.2 是否存在重复数据

Code
  dup_data <- data %>%
    group_by(prize_year, category, prize, laureate_id) %>%
    filter(n() > 1) %>%
    arrange(laureate_id)

  print(head(dup_data))
# A tibble: 6 × 18
# Groups:   prize_year, category, prize, laureate_id [3]
  prize_year category prize     motivation prize_share laureate_id laureate_type
       <int> <chr>    <chr>     <chr>      <chr>             <int> <chr>        
1       1949 Physics  The Nobe… "\"for hi… 1/1                  54 Individual   
2       1949 Physics  The Nobe… "\"for hi… 1/1                  54 Individual   
3       1954 Physics  The Nobe… "\"for th… 1/2                  62 Individual   
4       1954 Physics  The Nobe… "\"for th… 1/2                  62 Individual   
5       1958 Physics  The Nobe… "\"for th… 1/3                  71 Individual   
6       1958 Physics  The Nobe… "\"for th… 1/3                  71 Individual   
# ℹ 11 more variables: full_name <chr>, birth_date <chr>, birth_city <chr>,
#   birth_country <chr>, gender <chr>, organization_name <chr>,
#   organization_city <chr>, organization_country <chr>, death_date <chr>,
#   death_city <chr>, death_country <chr>

2.3.3 数据去重

Code
  fina_data <- data %>%
    distinct(prize_year, category, prize, laureate_id, .keep_all = TRUE) %>%
    arrange(prize_year)

  nrow(fina_data)
[1] 911
Code
  datatable(fina_data, 
          options = list(
            scrollX = TRUE,
            scrollY = '200px',
            pageLength = 5,
            autoWidth = TRUE,
            columnDefs = list(
              list(width = '100px', targets = "_all")
          )
          ))

2.3.4 重新确定缺失比例

Code
  total_row_new <- nrow(fina_data) 
 
  missing_infor_new <- fina_data %>%
    summarise(across(everything(), list(
      miss_count = ~ sum(is.na(.x)),
      miss_pct = ~ sum(is.na(.x)) / total_row_new
    ))) %>%
    pivot_longer(everything(), names_to = c("Column", ".value"), names_pattern = "(.*)_(miss_count|miss_pct)") %>%
    arrange(desc(miss_pct))
  
  missing_infor_new %>%
    filter(miss_count == 0)
# A tibble: 7 × 3
  Column        miss_count miss_pct
  <chr>              <int>    <dbl>
1 prize_year             0        0
2 category               0        0
3 prize                  0        0
4 prize_share            0        0
5 laureate_id            0        0
6 laureate_type          0        0
7 full_name              0        0
Code
  knitr::kable(missing_infor_new)
Column miss_count miss_pct
death_city 335 0.3677278
death_country 329 0.3611416
death_date 318 0.3490670
organization_name 246 0.2700329
organization_city 244 0.2678375
organization_country 244 0.2678375
motivation 88 0.0965971
birth_date 30 0.0329308
birth_city 28 0.0307355
birth_country 26 0.0285401
gender 26 0.0285401
prize_year 0 0.0000000
category 0 0.0000000
prize 0 0.0000000
prize_share 0 0.0000000
laureate_id 0 0.0000000
laureate_type 0 0.0000000
full_name 0 0.0000000
Code
  # 最大缺失值
  max_missing_infor <- missing_infor_new %>%
   slice(which.max(miss_pct)) %>%
   print()
# A tibble: 1 × 3
  Column     miss_count miss_pct
  <chr>           <int>    <dbl>
1 death_city        335    0.368
Code
  # 最小缺失值  
  min_missing_infor <- missing_infor_new %>%
   slice(which.min(miss_pct)) %>%
   print()
# A tibble: 1 × 3
  Column     miss_count miss_pct
  <chr>           <int>    <dbl>
1 prize_year          0        0

2.3.5 基础信息

Code
  # 总体奖项类别
  table(fina_data$category)

 Chemistry  Economics Literature   Medicine      Peace    Physics 
       175         78        113        211        130        204 
Code
  # 获奖的时间跨度
  quantile(fina_data$prize_year)
  0%  25%  50%  75% 100% 
1901 1946 1975 1997 2016 

3.探索性分析

3.1 每个学科颁奖次数

Code
 awards_count <- data %>%
  group_by(category) %>%
    summarise(award_count = n(), .groups = 'drop')
  
  ggplot(awards_count, aes(x = reorder(category, -award_count), y = award_count, fill = category)) +
    geom_bar(stat = "identity") +
    scale_fill_manual(values = c("#003f5c", "#444e86", "#955196", "#dd5182", "#ff6e54", "#ffa600")) +  # 自定义颜色
    labs(title = "诺贝尔奖各学科颁奖次数",
         x = "类别 (Category)",
         y = "颁奖次数 (Award Count)") +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1), # 调整x轴标签角度
          plot.title = element_text(hjust = 0.5)) +  
    geom_text(aes(label = award_count), vjust = -0.5, color = "black", size = 4, position = position_dodge(width = 0.9))

3.2 获奖人的出生年代分布

Code
 fina_data$birth_date <- as.Date(fina_data$birth_date)
  fina_data$prize_year <- as.numeric(fina_data$prize_year)
  
  fina_data <- fina_data %>%
    mutate(birth_year = year(birth_date)) %>%
    mutate(decade = floor(birth_year / 10) * 10) %>%
    mutate(award_age = as.numeric(prize_year - birth_year))
  
  decade_count <- fina_data %>%
    filter(!is.na(birth_year)) %>% 
    group_by(decade) %>%
    summarise(num_count = n(), .groups = 'drop') 
    
  decade_count$decade <- as.character(decade_count$decade)
  
  ggplot(decade_count, aes(x = decade, y = num_count, fill = decade)) +
    geom_bar(stat = "identity") +
    labs(title = "获奖者年代人数统计",
         x = "年代 (Decade)",
         y = "人数 (Number)") +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1), # 调整x轴标签角度
          plot.title = element_text(hjust = 0.5)) +  
    geom_text(aes(label = num_count), vjust = -0.5, color = "black", size = 4, position = position_dodge(width = 0.9))

3.3 性别比例

Code
  gender_count <- fina_data %>% 
    filter(laureate_type == "Individual" & !is.na(gender)) %>% 
    group_by(prize_year, gender) %>%
    summarise(num_count = n(), .groups = 'drop')
  
  total_per_year <- fina_data %>%
   filter(laureate_type == "Individual" & !is.na(gender)) %>% 
   group_by(prize_year) %>%
   summarise(p_c_cnt = n(), .groups = 'drop')
  
  gender_ratio <- gender_count %>%
   left_join(total_per_year, by = "prize_year") %>%
   mutate(ratio = num_count / p_c_cnt)
  
  # 创建堆积条形图
  ggplot(gender_ratio, aes(x = factor(prize_year), y = ratio, fill = gender)) +
   geom_bar(stat = "identity", position = "fill") + # 使用"fill"使得每一年的比例总和为1
   scale_y_continuous(labels = scales::percent) + # 将y轴标签格式化为百分比
   labs(title = "不同年份性别占比",
        x = "年份",
        y = "比例",
        fill = "性别") +
   theme_minimal() +
   theme(
     plot.title = element_text(hjust = 0.5),
     axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)
   ) +
   scale_x_discrete(breaks = seq(min(as.numeric(gender_ratio$prize_year)), max(as.numeric(gender_ratio$prize_year)), by = 3))  

3.4 平均年龄

Code
  fina_data$prize_year <- as.numeric(fina_data$prize_year)
     
  fina_data <- fina_data %>%
   mutate(award_decade = floor(prize_year / 10) * 10)
     
     
  fina_data$award_decade <- as.character(fina_data$award_decade)
     
  average_age <- fina_data %>%
   filter(!is.na(award_decade)) %>%
   group_by(award_decade, category) %>%
   summarise(avg_age = mean(award_age, na.rm = TRUE), .groups = 'drop')
     
  ggplot(average_age, aes(x = factor(award_decade), y = avg_age, color = category, group = category)) +
   geom_line() +  # 添加折线
   geom_point() + # 添加点以突出每个数据点
   scale_x_discrete() + # 确保x轴按离散值处理
   labs(title = "获奖者平均年龄",
        x = "获奖年代",
        y = "平均年龄",
        color = "类别") +
   theme_minimal() +
   theme(axis.text.x = element_text(angle = 45, hjust = 1), # 调整x轴标签角度
                          plot.title = element_text(hjust = 0.5)) 

3.5 最年轻的诺贝尔获奖者

Code
  min_age_records <- fina_data %>%
   filter(laureate_type == "Individual") %>%
   slice(which.min(award_age))
  
  knitr::kable(min_age_records)
prize_year category prize motivation prize_share laureate_id laureate_type full_name birth_date birth_city birth_country gender organization_name organization_city organization_country death_date death_city death_country birth_year decade award_age award_decade
2014 Peace The Nobel Peace Prize 2014 “for their struggle against the suppression of children and young people and for the right of all children to education” 1/2 914 Individual Malala Yousafzai 1997-07-12 Mingora Pakistan Female NA NA NA NA NA NA 1997 1990 17 2010

3.6 中国获得诺贝尔奖的有哪些

Code
  China_awarders <- fina_data %>%
   filter(birth_country == "China") %>%
   select(full_name, prize_year, category, prize, award_age)

  knitr::kable(China_awarders)
full_name prize_year category prize award_age
Walter Houser Brattain 1956 Physics The Nobel Prize in Physics 1956 54
Chen Ning Yang 1957 Physics The Nobel Prize in Physics 1957 35
Tsung-Dao (T.D.) Lee 1957 Physics The Nobel Prize in Physics 1957 31
Edmond H. Fischer 1992 Medicine The Nobel Prize in Physiology or Medicine 1992 72
Daniel C. Tsui 1998 Physics The Nobel Prize in Physics 1998 59
Gao Xingjian 2000 Literature The Nobel Prize in Literature 2000 60
Charles Kuen Kao 2009 Physics The Nobel Prize in Physics 2009 76
Ei-ichi Negishi 2010 Chemistry The Nobel Prize in Chemistry 2010 75
Liu Xiaobo 2010 Peace The Nobel Peace Prize 2010 55
Mo Yan 2012 Literature The Nobel Prize in Literature 2012 57
Youyou Tu 2015 Medicine The Nobel Prize in Physiology or Medicine 2015 85