library(tidyverse)
library(palmerpenguins)
library(lubridate)
<- read.csv("D:/R/task1_nobel_winners/data/nobel_winners.csv") nobel_winner
练手 1 :探索性数据分析-诺奖获得者
练手项目来源:第78章 探索性数据分析-诺奖获得者
项目数据集:nobel_winners.csv
1.1 导入数据
导入csv格式的数据集,数据集内容与诺贝尔奖获得者的信息有关。
1.2 数据结构
1.2.1 数据类型
str(nobel_winner)
'data.frame': 969 obs. of 18 variables:
$ prize_year : int 1901 1901 1901 1901 1901 1901 1902 1902 1902 1902 ...
$ category : chr "Chemistry" "Literature" "Medicine" "Peace" ...
$ prize : chr "The Nobel Prize in Chemistry 1901" "The Nobel Prize in Literature 1901" "The Nobel Prize in Physiology or Medicine 1901" "The Nobel Peace Prize 1901" ...
$ motivation : chr "\"in recognition of the extraordinary services he has rendered by the discovery of the laws of chemical dynamic"| __truncated__ "\"in special recognition of his poetic composition, which gives evidence of lofty idealism, artistic perfection"| __truncated__ "\"for his work on serum therapy, especially its application against diphtheria, by which he has opened a new ro"| __truncated__ NA ...
$ prize_share : chr "1/1" "1/1" "1/1" "1/2" ...
$ laureate_id : int 160 569 293 462 463 1 161 571 294 464 ...
$ laureate_type : chr "Individual" "Individual" "Individual" "Individual" ...
$ full_name : chr "Jacobus Henricus van 't Hoff" "Sully Prudhomme" "Emil Adolf von Behring" "Jean Henry Dunant" ...
$ birth_date : chr "1852-08-30" "1839-03-16" "1854-03-15" "1828-05-08" ...
$ birth_city : chr "Rotterdam" "Paris" "Hansdorf (Lawice)" "Geneva" ...
$ birth_country : chr "Netherlands" "France" "Prussia (Poland)" "Switzerland" ...
$ gender : chr "Male" "Male" "Male" "Male" ...
$ organization_name : chr "Berlin University" NA "Marburg University" NA ...
$ organization_city : chr "Berlin" NA "Marburg" NA ...
$ organization_country: chr "Germany" NA "Germany" NA ...
$ death_date : chr "1911-03-01" "1907-09-07" "1917-03-31" "1910-10-30" ...
$ death_city : chr "Berlin" "Châtenay" "Marburg" "Heiden" ...
$ death_country : chr "Germany" "France" "Germany" "Switzerland" ...
变量名称 | 变量类型 | 变量含义 | 变量名称 | 变量类型 | 变量含义 |
---|---|---|---|---|---|
prize_year | int | 获奖年份 | category | chr | 奖项类别(学科) |
prize | chr | 奖项名称 | motivation | chr | 获奖理由 |
prize_share | chr | 获奖份额(是否共享奖项) | laureate_id | int | 获奖者id |
laureate_type | chr | 获奖者类型 | full_name | chr | 获奖者姓名 |
birth_date | chr | 出生日期 | birth_city | chr | 出生城市 |
birth_country | chr | 出生国家 | gender | chr | 性别 |
organization_name | chr | 组织名称 | organization_city | chr | 组织所属城市 |
organization_country | chr | 组织所属国家 | death_date | chr | 逝世日期 |
death_city | chr | 逝世城市 | death_country | chr | 逝世国家 |
1.2.2 缺失值
<- colSums(is.na(nobel_winner))
col_missing_counts col_missing_counts
prize_year category prize
0 0 0
motivation prize_share laureate_id
88 0 0
laureate_type full_name birth_date
0 0 31
birth_city birth_country gender
28 26 26
organization_name organization_city organization_country
247 253 253
death_date death_city death_country
352 370 364
%>%
nobel_winner group_by(laureate_type) %>%
summarise(num = n())
# A tibble: 2 × 2
laureate_type num
<chr> <int>
1 Individual 939
2 Organization 30
%>%
nobel_winner filter(laureate_type == "Organization") %>%
group_by(full_name) %>%
summarise(n = n(), gender = max(gender)) %>%
arrange(gender, desc(n))
# A tibble: 27 × 3
full_name n gender
<chr> <int> <chr>
1 Aung San Suu Kyi 1 Female
2 Mother Teresa 1 Female
3 Le Duc Tho 1 Male
4 The 14th Dalai Lama (Tenzin Gyatso) 1 Male
5 Comité international de la Croix Rouge (International Committee… 3 <NA>
6 Office of the United Nations High Commissioner for Refugees (UN… 2 <NA>
7 American Friends Service Committee (The Quakers) 1 <NA>
8 Amnesty International 1 <NA>
9 Bureau international permanent de la Paix (Permanent Internatio… 1 <NA>
10 European Union (EU) 1 <NA>
# ℹ 17 more rows
性别的缺失来源于组织不会有性别,但奇怪的是,获奖者为组织的30条数据27个组织中,有四个“组织”是有性别值的,核查后也确实是四名个人,无法确定是数据集内容差错,还是个人作为组织领取(后续处理时,若有对应的性别/出生信息/工作信息,会计为个人)。
1.2.3 数据唯一性
%>%
nobel_winner group_by(full_name, prize) %>%
summarise(n = n()) %>%
arrange(desc(n)) %>%
filter(n >= 2)
# A tibble: 57 × 3
# Groups: full_name [57]
full_name prize n
<chr> <chr> <int>
1 Jack W. Szostak The Nobel Prize in Physiolo… 3
2 Abdus Salam The Nobel Prize in Physics … 2
3 Adam G. Riess The Nobel Prize in Physics … 2
4 Adolf Friedrich Johann Butenandt The Nobel Prize in Chemistr… 2
5 Albert Fert The Nobel Prize in Physics … 2
6 Alvin E. Roth The Sveriges Riksbank Prize… 2
7 Antonio Caetano de Abreu Freire Egas Moniz The Nobel Prize in Physiolo… 2
8 Barry J. Marshall The Nobel Prize in Physiolo… 2
9 Bruce A. Beutler The Nobel Prize in Physiolo… 2
10 Carl Bosch The Nobel Prize in Chemistr… 2
# ℹ 47 more rows
%>%
nobel_winner inner_join(nobel_winner %>%
group_by(full_name, prize) %>%
summarise(n = n()) %>%
filter(n >= 2), by = c("full_name", "prize")) %>%
select(c(full_name, prize_year, organization_name, organization_country)) %>%
head(10)
full_name prize_year
1 Paul Ehrlich 1908
2 Paul Ehrlich 1908
3 Carl Bosch 1931
4 Carl Bosch 1931
5 Friedrich Bergius 1931
6 Friedrich Bergius 1931
7 William Parry Murphy 1934
8 William Parry Murphy 1934
9 Petrus (Peter) Josephus Wilhelmus Debye 1936
10 Petrus (Peter) Josephus Wilhelmus Debye 1936
organization_name
1 Goettingen University
2 Königliches Institut für experimentelle Therapie (Royal Institute for Experimental Therapy)
3 University of Heidelberg
4 I.G. Farbenindustrie A.G.
5 University of Heidelberg
6 I.G. Farbenindustrie A.G.
7 Harvard University
8 Peter Brent Brigham Hospital
9 Berlin University
10 Kaiser-Wilhelm-Institut (now Max-Planck-Institut) für Physik
organization_country
1 Germany
2 Germany
3 Germany
4 Germany
5 Germany
6 Germany
7 United States of America
8 United States of America
9 Germany
10 Germany
获奖者所属多个机构时数据会重复,即一次获奖会对应多条数据。
<- nobel_winner %>%
nobel_winner_once mutate(once = paste0(full_name," ",prize)) %>%
group_by(once) %>%
summarise(n = n())
length(nobel_winner_once$once)
[1] 911
当一人/一组织获奖一次记为1时,累计获奖次数为911次。
1.3 探索性内容
- 每个学科颁过多少次奖?
- 这些大神都是哪个年代的人?
- 性别比例
- 平均年龄和获奖数量
- 最年轻的诺奖获得者是谁?
- 中国诺奖获得者有哪些?
- 得奖的时候多大年龄?
- 获奖者所在国家的经济情况?
- 有大神多次获得诺贝尔奖,而且在不同科学领域获奖?
- 出生地分布?工作地分布?迁移模式?
- GDP经济与诺奖模型?
- 诺奖分享情况?
1.3.1 各学科颁奖次数
%>%
nobel_winner group_by(category) %>%
summarise(num = n_distinct(paste0(full_name,prize)))
# A tibble: 6 × 2
category num
<chr> <int>
1 Chemistry 175
2 Economics 78
3 Literature 113
4 Medicine 211
5 Peace 130
6 Physics 204
library(ggplot2)
library(forcats)
%>%
nobel_winner group_by(category) %>%
summarise(num = n_distinct(paste0(full_name,prize))) %>%
arrange(num) %>%
ggplot(aes(x = fct_reorder(category, num), y = num, fill = category))+
geom_col()+
scale_fill_brewer(palette = "Blues")+
geom_text(aes(label = num), vjust = -0.5, color = "black", size = 3) +
labs(x = "category", y = "number", title = "Number of Nobel Prize presented in each discipline")+
theme_minimal()
1.3.2 获奖者年代分布
<- nobel_winner %>%
nobel_winner_birth_era filter(!is.na(birth_date)) %>%
group_by(full_name) %>%
summarise(birth_date = max(birth_date)) %>%
mutate(birth_era = paste0(as.numeric(substr(birth_date, start = 1, stop = 3))*10, "s"))
%>% summarise(min = min(birth_date, na.rm = T),
nobel_winner_birth_era max = max(birth_date, na.rm = T))
# A tibble: 1 × 2
min max
<chr> <chr>
1 1817-11-30 1997-07-12
length(nobel_winner_birth_era$full_name)
[1] 877
共计877位有出生日期记录的获奖者,他们的出生日期分布在1810s-1990s之间。
<- data.frame(era = paste0(seq(from = 1810, to = 1990, by = 10), "s"))
eras <- eras %>%
eras_plot left_join(nobel_winner_birth_era %>% select(c(birth_date, birth_era)), by = c("era" = "birth_era"))
<- length(unique(eras$era))
colourCount library(RColorBrewer)
%>%
eras_plot group_by(era) %>%
summarise(num = sum(!is.na(birth_date))) %>%
arrange(era) %>%
ggplot(aes(x = era, y = num, fill = era))+
geom_col()+
scale_x_discrete(labels = function(x) ifelse(seq_along(x) %% 2 == 1, x, ""))+
scale_fill_manual(values = colorRampPalette(brewer.pal(8, "Blues"))(colourCount))+
geom_text(aes(label = num), vjust = -0.5, color = "black", size = 3) +
labs(x = "birth_era", y = "number", title = "Distribution of Nobel laureates by birth era")+
theme_minimal()
1.3.3 获奖者的性别比例
%>% filter(!is.na(gender)) %>% group_by(gender) %>% summarise(n=n_distinct(full_name)) nobel_winner
# A tibble: 2 × 2
gender n
<chr> <int>
1 Female 48
2 Male 833
library(scales)
%>%
nobel_winner filter(!is.na(gender)) %>%
group_by(category, gender) %>%
summarise(n=n_distinct(full_name)) %>%
left_join(nobel_winner %>% filter(!is.na(gender)) %>% group_by(category) %>% summarise(total=n_distinct(full_name)), by = "category") %>%
mutate(pro = n / total) %>%
ggplot(aes(x=category,y=gender))+
geom_tile(aes(fill=pro),color="grey50",height=0.4,width=0.8)+
geom_text(aes(label=percent(pro)))+
labs(x = "category", y = "gender", title = "Gender ratio of Nobel laureates")+
scale_fill_distiller(direction=1,palette = "Blues")+
theme_minimal()
差距悬殊哎,在非学科类的文学奖与和平奖上,差距略小一点。
1.3.4 获奖年龄分布 平均获奖年龄
<- nobel_winner %>%
nobel_winner_age mutate(age = prize_year - year(birth_date)) %>%
filter(!is.na(age))
<- nobel_winner_age %>%
nobel_winner_age_distinct mutate(once = paste0(full_name," ",prize)) %>%
group_by(once, category, full_name) %>%
summarise(age = max(age))
summary(nobel_winner_age_distinct$age)
Min. 1st Qu. Median Mean 3rd Qu. Max.
17.00 50.00 60.00 59.45 69.00 90.00
%>%
nobel_winner_age_distinct group_by(category) %>%
summarise(num = n(),
mean_value = round(mean(age, na.rm = TRUE), 2),
min_value = round(min(age, na.rm = TRUE), 2),
Q1_value = round(quantile(age, 0.25, na.rm = TRUE), 2),
median_value = round(median(age, na.rm = TRUE), 2),
Q3_value = round(quantile(age, 0.75, na.rm = TRUE), 2),
max_value = round(max(age , na.rm = TRUE), 2))
# A tibble: 6 × 8
category num mean_value min_value Q1_value median_value Q3_value max_value
<chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Chemistry 174 58.0 35 50 57 67 85
2 Economics 77 67.2 51 62 67 73 90
3 Literature 113 64.7 42 57 67 73 88
4 Medicine 211 58.0 32 49 57 65 87
5 Peace 103 61.4 17 54 62 71 87
6 Physics 203 55.4 25 45 54 64 88
诺奖成果基本上需要你在行业内进行大量的积累才能得到产出,可以看见不管是哪类学科奖项,平均年龄都基本在60上下波动。
%>%
nobel_winner_age_distinct group_by(age) %>%
summarise(num = n()) %>%
arrange(age) %>%
ggplot(aes(x = age, y = num, fill = age))+
geom_col()+
scale_fill_gradient(low = "#87CEFA", high = "#27408B", name = "Age")+
labs(x = "age", y = "number", title = "Age of Nobel laureates")+
theme_minimal()
1.3.5 谁是最年轻的诺奖获得者
%>%
nobel_winner inner_join(nobel_winner_age %>%
slice_min(age, n = 1) %>%
select(c(full_name, age)),
by = "full_name")
prize_year category prize
1 2014 Peace The Nobel Peace Prize 2014
motivation
1 "for their struggle against the suppression of children and young people and for the right of all children to education"
prize_share laureate_id laureate_type full_name birth_date birth_city
1 1/2 914 Individual Malala Yousafzai 1997-07-12 Mingora
birth_country gender organization_name organization_city organization_country
1 Pakistan Female <NA> <NA> <NA>
death_date death_city death_country age
1 <NA> <NA> <NA> 17
1.3.6 中国的诺奖获得者
%>%
nobel_winner_age filter(grepl("China", birth_country, ignore.case = TRUE) | grepl("Taiwan", birth_country, ignore.case = TRUE)) %>%
select(c(prize_year, category, prize_share, full_name, birth_date, birth_country, gender, organization_name, organization_country, age))
prize_year category prize_share full_name
1 1956 Physics 1/3 Walter Houser Brattain
2 1957 Physics 1/2 Chen Ning Yang
3 1957 Physics 1/2 Tsung-Dao (T.D.) Lee
4 1986 Chemistry 1/3 Yuan T. Lee
5 1989 Peace 1/1 The 14th Dalai Lama (Tenzin Gyatso)
6 1992 Medicine 1/2 Edmond H. Fischer
7 1998 Physics 1/3 Daniel C. Tsui
8 2000 Literature 1/1 Gao Xingjian
9 2009 Physics 1/2 Charles Kuen Kao
10 2009 Physics 1/2 Charles Kuen Kao
11 2010 Chemistry 1/3 Ei-ichi Negishi
12 2010 Peace 1/1 Liu Xiaobo
13 2012 Literature 1/1 Mo Yan
14 2015 Medicine 1/2 Youyou Tu
birth_date birth_country gender
1 1902-02-10 China Male
2 1922-09-22 China Male
3 1926-11-24 China Male
4 1936-11-19 Taiwan Male
5 1935-07-06 Tibet (People's Republic of China) Male
6 1920-04-06 China Male
7 1939-02-28 China Male
8 1940-01-04 China Male
9 1933-11-04 China Male
10 1933-11-04 China Male
11 1935-07-14 China Male
12 1955-12-28 China Male
13 1955-02-02 China Male
14 1930-12-30 China Female
organization_name organization_country age
1 Bell Telephone Laboratories United States of America 54
2 Institute for Advanced Study United States of America 35
3 Columbia University United States of America 31
4 University of California United States of America 50
5 <NA> <NA> 54
6 University of Washington United States of America 72
7 Princeton University United States of America 59
8 <NA> <NA> 60
9 Standard Telecommunication Laboratories United Kingdom 76
10 Chinese University of Hong Kong China 76
11 Purdue University United States of America 75
12 <NA> <NA> 55
13 <NA> <NA> 57
14 China Academy of Traditional Chinese Medicine China 85
除去其中因多机构重复的Charles Kuen Kao,共有13名出生于中国的人员获得了诺奖(其中外交部指出:两次诺贝尔和平奖的授予违背了奖项宗旨,是对和平奖的亵渎),多数集中在物理学奖,人数比例超过50%(6/11)。
1.3 7 获奖者所在国家的经济情况(GDP经济与诺奖模型)
# library(WDI)
# gdp_data <- WDI(indicator = "NY.GDP.PCAP.CD",
# start = 1960,
# end = Sys.Date() %>% as.numeric(format = "%Y"))
好吧,直接调接口失败了,从世界银行网站下载了数据。
总gdp与总人口数据源:世界银行-世界发展指标(编码:GDP (current US$)-NY.GDP.MKTP.CD和Population,total-SP.POP.TOTL)
library(readxl)
<- read_excel("D:/R/task1_nobel_winners/data/gdp.xls")
gdp_data <- gdp_data %>%
gdp_data select(-c("Country Name", "Indicator Name", "Indicator Code")) %>%
pivot_longer(cols = matches("^19[0-9][0-9]$|^20[0-9][0-9]$"),
names_to = "year",
values_to = "gdp")
<- read_excel("D:/R/task1_nobel_winners/data/pop.xls")
pop_data <- pop_data %>%
pop_data select(-c("Country Name", "Indicator Name", "Indicator Code")) %>%
pivot_longer(cols = matches("^19[0-9][0-9]$|^20[0-9][0-9]$"),
names_to = "year",
values_to = "pop")
<- gdp_data %>%
all_data inner_join(pop_data, by = c("Country Code", "year")) %>%
filter(!is.na(gdp) & !is.na(pop)) %>%
mutate(gdp_pop = gdp / pop) %>%
mutate(year = as.integer(year)) %>%
group_by(year) %>%
mutate(gdp_rank = dense_rank(desc(gdp_pop))) %>%
ungroup()
library(countrycode)
<- nobel_winner %>%
nobel_winner_rank select(c(full_name, organization_country, death_country, prize_year, category)) %>%
mutate(organization_country = case_when(is.na(organization_country) ~ death_country,
TRUE ~ organization_country)) %>%
mutate(organization_country = ifelse(grepl("\\(", organization_country),
str_extract(organization_country,"(?<=\\().+?(?=\\))"),
%>%
organization_country)) mutate(organization_country = case_when(organization_country == "Taiwan" ~ "China",
== "People's Republic of China" ~ "China",
organization_country == "Czechoslovakia" ~ "Czechia",
organization_country == "Scotland" ~ "United Kingdom",
organization_country grepl("Germany", organization_country, ignore.case = TRUE) ~ "Germany",
TRUE ~ organization_country)) %>%
mutate(organization_country_code = countrycode(organization_country,
origin = "country.name",
destination = "iso3c")) %>%
inner_join(all_data %>% select(c("year","Country Code","gdp_rank")), by = c("prize_year" = "year", "organization_country_code" = "Country Code")) %>%
group_by(full_name, prize_year,category) %>%
summarise(max_rank = max(gdp_rank)) %>%
group_by(category,max_rank) %>%
summarise(num=n()) %>% arrange(max_rank)
%>% ggplot(aes(x=max_rank,fill = category,color=category))+
nobel_winner_rank geom_density()+
facet_wrap(~ category) +
scale_fill_brewer(palette = "Blues")+
scale_color_brewer(palette = "Blues")+
labs(x = "rank", y = "number", title = "GDP rank of Nobel laureate countries")+
theme_minimal()
物理学奖、医学奖、化学奖、经济学奖获奖人员工作国家获奖当年GDP排名高度集中在前30名。
1.3.8 多次获奖情况 多领域获奖情况
%>%
nobel_winner_age inner_join(nobel_winner_age_distinct %>%
group_by(full_name) %>%
summarise(n = n()) %>%
filter(n >= 2), by = "full_name") %>%
select(c(prize_year, category, prize_share, full_name, birth_date, birth_country, gender, organization_name, organization_country, age)) %>%
arrange(full_name, prize_year)
prize_year category prize_share full_name birth_date
1 1958 Chemistry 1/1 Frederick Sanger 1918-08-13
2 1980 Chemistry 1/4 Frederick Sanger 1918-08-13
3 1956 Physics 1/3 John Bardeen 1908-05-23
4 1972 Physics 1/3 John Bardeen 1908-05-23
5 1954 Chemistry 1/1 Linus Carl Pauling 1901-02-28
6 1962 Peace 1/1 Linus Carl Pauling 1901-02-28
7 1903 Physics 1/4 Marie Curie, née Sklodowska 1867-11-07
8 1911 Chemistry 1/1 Marie Curie, née Sklodowska 1867-11-07
birth_country gender organization_name
1 United Kingdom Male University of Cambridge
2 United Kingdom Male MRC Laboratory of Molecular Biology
3 United States of America Male University of Illinois
4 United States of America Male University of Illinois
5 United States of America Male California Institute of Technology (Caltech)
6 United States of America Male California Institute of Technology (Caltech)
7 Russian Empire (Poland) Female <NA>
8 Russian Empire (Poland) Female Sorbonne University
organization_country age
1 United Kingdom 40
2 United Kingdom 62
3 United States of America 48
4 United States of America 64
5 United States of America 53
6 United States of America 61
7 <NA> 36
8 France 44
共计4人获得诺奖二次得主的荣誉,其中Frederick Sanger和John Bardeen两位是两次在同一领域获奖,Linus Carl Pauling和Marie Curie, née Sklodowska为跨领域获奖。
1.3.9 出生地分布 工作地分布 迁移模式
数据集中国家名称比较混乱:
- 提取括号中的内容作为国家名
- 对同一国家多种国家名进行统一
%>% group_by(full_name) %>%
nobel_winner slice_head(n = 1) %>%
mutate(birth_country = ifelse(grepl("\\(", birth_country),
str_extract(birth_country,"(?<=\\().+?(?=\\))"), # 零宽度断言(前后预查)
%>%
birth_country)) mutate(birth_country = case_when(birth_country == "Taiwan" ~ "China",
== "People's Republic of China" ~ "China",
birth_country TRUE ~ birth_country)) %>%
group_by(birth_country) %>%
summarise(num = n()) %>%
arrange(desc(num))
# A tibble: 78 × 2
birth_country num
<chr> <int>
1 United States of America 257
2 United Kingdom 84
3 Germany 80
4 France 54
5 Sweden 29
6 Russia 26
7 Poland 25
8 Japan 24
9 <NA> 23
10 Italy 19
# ℹ 68 more rows
工作地以组织国家作为基础值,若组织国家为空将用逝世国家填充。
%>% group_by(full_name) %>%
nobel_winner slice_head(n = 1) %>%
mutate(organization_country = case_when(is.na(organization_country) ~ death_country,
TRUE ~ organization_country)) %>%
mutate(organization_country = ifelse(grepl("\\(", organization_country),
str_extract(organization_country,"(?<=\\().+?(?=\\))"),
%>%
organization_country)) mutate(organization_country = case_when(organization_country == "Taiwan" ~ "China",
== "People's Republic of China" ~ "China",
organization_country grepl("Germany", organization_country, ignore.case = TRUE) ~ "Germany",
TRUE ~ organization_country)) %>%
group_by(organization_country) %>%
summarise(num = n()) %>%
arrange(desc(num))
# A tibble: 43 × 2
organization_country num
<chr> <int>
1 United States of America 361
2 United Kingdom 104
3 Germany 77
4 <NA> 75
5 France 60
6 Switzerland 31
7 Sweden 30
8 Japan 18
9 Italy 14
10 Netherlands 12
# ℹ 33 more rows
迁移模式:
<-
nobel_winner_change %>%
nobel_winner group_by(full_name) %>%
slice_head(n = 1) %>%
mutate(organization_country = case_when(is.na(organization_country) ~ death_country,
TRUE ~ organization_country)) %>%
mutate(across(c(birth_country, organization_country), ~ ifelse(grepl("\\(", .),
str_extract(.,"(?<=\\().+?(?=\\))"),.))) %>%
mutate(across(c(birth_country, organization_country), ~ case_when(. == "Taiwan" ~ "China",
== "People's Republic of China" ~ "China",
. grepl("Germany", ., ignore.case = TRUE) ~ "Germany",
TRUE ~ .))) %>%
select(c(full_name, birth_country, organization_country)) %>%
filter(!is.na(birth_country) & !is.na(organization_country)) %>%
mutate(is_same = case_when(birth_country == organization_country ~ "未迁移",
!= organization_country ~ "有迁移"))
birth_country %>%
nobel_winner_change group_by(is_same) %>%
summarise(num = n())
# A tibble: 2 × 2
is_same num
<chr> <int>
1 有迁移 269
2 未迁移 560
library(ggalluvial)
%>%
nobel_winner_change filter(is_same == "有迁移") %>%
group_by(birth_country, organization_country) %>%
summarise(n = n()) %>%
arrange(birth_country, desc(n)) %>%
ggplot(aes(axis1 = birth_country,
axis2 = organization_country,
y = n)) +
geom_alluvium(aes(fill = birth_country)) +
geom_stratum() +
geom_text(stat = "stratum",
aes(label = after_stat(stratum)),
size = 3) +
scale_fill_viridis_d() +
theme_void()+
theme(legend.position = "none")
轴坐标标签的设置有点糟,找了一个优化方法(根据n值筛选显示标签,但很奇怪都不显示了),图例也被迫舍弃了。
# library(ggalluvial)
# filtered_data <- nobel_winner_change %>%
# filter(is_same == "有迁移") %>%
# group_by(birth_country, organization_country) %>%
# summarise(n = n(), .groups = 'drop') %>%
# filter(n > 1)
# label_countries <- unique(c(filtered_data$birth_country, filtered_data$organization_country))
# nobel_winner_change %>%
# filter(is_same == "有迁移") %>%
# group_by(birth_country, organization_country) %>%
# summarise(n=n()) %>%
# arrange(birth_country, desc(n)) %>%
# ggplot(aes(axis1 = birth_country,
# axis2 = organization_country,
# y = n)) +
# geom_alluvium(aes(fill = birth_country)) +
# geom_stratum() +
# scale_x_discrete(breaks = unique(c(filtered_data$birth_country, filtered_data$organization_country)),
# labels = unique(c(filtered_data$birth_country, filtered_data$organization_country)))+
# scale_fill_viridis_d() +
# theme_void()
1.3.10 共享诺奖情况
%>%
nobel_winner mutate(share_num = substr(prize_share, start = 3, stop = 3)) %>%
group_by(full_name, prize_year,category) %>%
summarise(share_num = max(share_num)) %>%
group_by(category,share_num) %>%
summarise(num = n()) %>%
pivot_wider(names_from = "share_num",
values_from = "num") %>%
arrange(category)
# A tibble: 6 × 5
# Groups: category [6]
category `1` `2` `3` `4`
<chr> <int> <int> <int> <int>
1 Chemistry 63 53 45 14
2 Economics 24 36 18 NA
3 Literature 105 8 NA NA
4 Medicine 39 72 84 16
5 Peace 66 58 6 NA
6 Physics 47 79 48 30
1.4 复盘
- 应该把nobel_winner数据框中后续可能需要用到的列先处理好,比如:国家的名称与编码的统一,年龄以及出生日期,人员去重(但这个需要保留一份未去重数据 与工作组织相关)等。
- 可以从单一变量延伸到多变量,问某某的比例,首先给出总体的比例后,可以去拆不同维度中比例是否有所变化。
- 有时候箱线图比柱状图更易于理解分布。
- 项目源8.13的绘图在做咩,颁奖数量和平均年龄就算能做回归也很像被别的因素干扰了。
- 后续可以试试geom_sf与动图library(gganimate)(动图更有时间序列的感觉)地图绘图