Main Sources:
-
Extra sources will be given
Extra sources will be given
library(tidyverse)
library(lubridate)
library(ggrepel)
library(WDI)
library(countrycode)
theme_set(theme_light())
# load data 1 - David Robinson
nobel_winners <- read_csv(
paste0(
'https://raw.githubusercontent.com/rfordatascience/tidytuesday/',
'master/data/2019/2019-05-14/nobel_winners.csv'
)
)
# load data 1 - David Robinson
nobel_winners <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-05-14/nobel_winners.csv") %>%
distinct(full_name, prize_year, category, .keep_all = TRUE) %>%
mutate(decade = 10 * (prize_year %/% 10),
age = prize_year - year(birth_date))
# load data 2 - David Robinson
nobel_winner_all_pubs <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-05-14/nobel_winner_all_pubs.csv") %>%
mutate(prize_decade = 10 * (prize_year %/% 10))
### Exploring
#### 1
nobel_winners %>%
count(category,
decade = 10*(prize_year%/%10), sort = T) %>% # transform year to decades
ggplot(aes(decade, n, fill = category))+
geom_col()
#### 2
nobel_winners %>%
group_by(category,
decade = 10*(prize_year%/%10), sort = T) %>% # transform year to decade s
summarise(winners = n(),
winners_per_year = winners/n_distinct(prize_year)) %>%
ggplot(aes(decade, winners_per_year,color= category))+
geom_line()+
expand_limits(y=0)
#### 3
nobel_winners %>%
add_count(full_name) %>%
arrange(desc(n)) %>%
select(prize_year, category, full_name)
#### 4
nobel_winners %>%
filter(category == "Medicine", full_name == "Jack W. Szostak")
#### 4
nobel_winners %>%
distinct(full_name, prize_year, category) %>%
add_count(full_name) %>%
arrange(desc(n))
#### 5
nobel_winners %>%
distinct(full_name, prize_year, category) %>%
group_by(full_name) %>%
mutate(prizes = n(),
distinct_prizes = n_distinct(category)) %>%
arrange(desc(prizes))
#### 6 - gender
nobel_winners %>%
count(decade,
category,
gender = coalesce(gender, laureate_type)) %>% #coalesce
group_by(decade, category) %>%
mutate(percent = n/sum(n)) %>%
ggplot(aes(decade, percent, fill = gender))+
geom_col()+
facet_wrap(~ category)
#### 6 - gender 2
nobel_winners %>%
count(decade,
category,
gender = coalesce(gender, laureate_type)) %>% #coalesce
group_by(decade, category) %>%
mutate(percent = n/sum(n)) %>%
ggplot(aes(decade, n, fill = gender))+
geom_col()+
facet_wrap(~ category)+
labs(x= "Decades",
y = "# of nobel prize winners",
fill = "Gender",
title = "Nobel Prize gender distribution")
#### 7 - age
nobel_winners %>%
mutate(category = fct_reorder(category, age, median, na.rm = T)) %>%
ggplot(aes(category, age))+
geom_boxplot()+
coord_flip()
#### 8 - age
nobel_winners %>%
filter(!is.na(age)) %>%
group_by(decade, category) %>%
summarise(average_age = mean(age),
median_age = median(age)) %>%
ggplot(aes(decade, average_age, color = category))+
geom_line()
#### 9
nobel_winners %>%
filter(full_name == "Albert Einstein") %>%
pull(motivation)
nobel_winners %>%
group_by(category, decade) %>%
summarize(winners = n(),
winners_per_year = winners / n_distinct(prize_year)) %>%
ggplot(aes(decade, winners_per_year, color = category)) +
geom_line() +
expand_limits(y = 0)
nobel_winners %>%
distinct(full_name, prize_year, category) %>%
group_by(full_name) %>%
mutate(prizes = n(),
distinct_prizes = n_distinct(category)) %>%
arrange(desc(prizes), full_name)
## # A tibble: 911 x 5
## # Groups: full_name [904]
## full_name prize_year category prizes distinct_prizes
## <chr> <dbl> <chr> <int> <int>
## 1 Comité international de la C~ 1917 Peace 3 1
## 2 Comité international de la C~ 1944 Peace 3 1
## 3 Comité international de la C~ 1963 Peace 3 1
## 4 Frederick Sanger 1958 Chemist~ 2 1
## 5 Frederick Sanger 1980 Chemist~ 2 1
## 6 John Bardeen 1956 Physics 2 1
## 7 John Bardeen 1972 Physics 2 1
## 8 Linus Carl Pauling 1954 Chemist~ 2 2
## 9 Linus Carl Pauling 1962 Peace 2 2
## 10 Marie Curie, née Sklodowska 1903 Physics 2 2
## # ... with 901 more rows
nobel_winners %>%
count(decade,
category,
gender = coalesce(gender, laureate_type)) %>%
group_by(decade, category) %>%
mutate(percent = n / sum(n)) %>%
ggplot(aes(decade, n, fill = gender)) +
geom_col() +
facet_wrap(~ category) +
labs(x = "Decade",
y = "# of nobel prize winners",
fill = "Gender",
title = "Nobel Prize gender distribution over time")
nobel_winners %>%
mutate(category = fct_reorder(category, age, median, na.rm = TRUE)) %>%
ggplot(aes(category, age)) +
geom_boxplot() +
coord_flip()
nobel_winners %>%
filter(!is.na(age)) %>%
group_by(decade, category) %>%
summarize(average_age = mean(age),
median_age = median(age)) %>%
ggplot(aes(decade, average_age, color = category)) +
geom_line()
nobel_winners %>%
filter(prize_year >= 2010, category == "Peace") %>%
select(full_name, age, prize)
## # A tibble: 10 x 3
## full_name age prize
## <chr> <dbl> <chr>
## 1 Liu Xiaobo 55 The Nobel Peace Pri~
## 2 Ellen Johnson Sirleaf 73 The Nobel Peace Pri~
## 3 Leymah Gbowee 39 The Nobel Peace Pri~
## 4 Tawakkol Karman 32 The Nobel Peace Pri~
## 5 European Union (EU) NA The Nobel Peace Pri~
## 6 Organisation for the Prohibition of Chemical~ NA The Nobel Peace Pri~
## 7 Kailash Satyarthi 60 The Nobel Peace Pri~
## 8 Malala Yousafzai 17 The Nobel Peace Pri~
## 9 National Dialogue Quartet NA The Nobel Peace Pri~
## 10 Juan Manuel Santos 65 The Nobel Peace Pri~
nobel_winners %>%
filter(!is.na(birth_country)) %>%
count(birth_country = fct_lump(birth_country, 9),
category,
sort = TRUE) %>%
mutate(birth_country = fct_reorder(birth_country, n)) %>%
ggplot(aes(birth_country, n, fill = category)) +
geom_col() +
facet_wrap(~ category) +
coord_flip()
indicators_raw <- WDI(indicator = "NY.GDP.PCAP.CD",
start = 2016, end = 2016, extra = TRUE) %>%
tbl_df() %>%
select(country,
country_code = iso2c,
income,
gdp_per_capita = NY.GDP.PCAP.CD)
nobel_winners_countries <- nobel_winners %>%
mutate(country_code = countrycode(birth_country, "country.name", "iso2c")) %>%
inner_join(indicators_raw, by = "country_code") %>%
mutate(income = fct_relevel(income, c("Low income", "Lower middle income", "Upper middle income", "High income")))
nobel_winners_countries %>%
filter(!is.na(income)) %>%
count(category, income) %>%
group_by(category) %>%
mutate(percent = n / sum(n)) %>%
ggplot(aes(income, percent)) +
geom_col() +
facet_wrap(~ category) +
coord_flip() +
labs(x = "Current income level of birth country",
y = "% of this category's prizes",
title = "Where do Nobel Prize winners come from?")
winners_summarized <- nobel_winner_all_pubs %>%
filter(pub_year <= prize_year) %>%
group_by(laureate_id,
laureate_name,
category,
prize_year,
prize_decade) %>%
summarize(papers_before_prize = n(),
papers_before_prize_5_years = sum(pub_year >= prize_year - 5),
average_paper_age = mean(prize_year - pub_year),
winning_paper_age = mean((prize_year - pub_year)[is_prize_winning_paper == "YES"]))
# What fraction had retired based on their papers in last five years?
winners_summarized %>%
group_by(category, prize_decade) %>%
summarize(average_papers = mean(papers_before_prize),
average_paper_age = mean(average_paper_age),
average_winning_paper_age = mean(winning_paper_age)) %>%
ggplot(aes(prize_decade, average_winning_paper_age, color = category)) +
geom_line() +
labs(x = "Prize decade",
y = "Time between when paper was published and won prize",
title = "Scientists have to wait longer for a Nobel Prize than ever",
color = "Category") +
expand_limits(y = 0)
pubs_enriched <- nobel_winner_all_pubs %>%
group_by(laureate_id, category, prize_year) %>%
mutate(papers_before = rank(pub_year, ties.method = "first") - 1,
total_papers = n(),
position_in_career = papers_before / total_papers,
first_pub_year = min(pub_year)) %>%
ungroup()
nobel_winners %>%
filter(!is.na(age),
category %in% c("Chemistry", "Medicine", "Physics")) %>%
group_by(decade, category) %>%
summarize(average_age = mean(age),
median_age = median(age)) %>%
ggplot(aes(decade, average_age, color = category)) +
geom_line()
pubs_enriched %>%
filter(is_prize_winning_paper == "YES") %>%
group_by(prize_decade, category) %>%
summarize(average_position_in_career = mean(position_in_career)) %>%
ggplot(aes(prize_decade, average_position_in_career, color = category)) +
geom_line()
The average recent Nobel Prize winner in Chemistry, Medicine or Physics is in their late 60s and is winning for work published about 25 years ago, about a fifth of the way through their career.
pubs_enriched %>%
filter(is_prize_winning_paper == "YES") %>%
group_by(prize_decade, category) %>%
summarize(average_position_in_career = mean(position_in_career)) %>%
ggplot(aes(prize_decade, average_position_in_career, color = category)) +
geom_line() +
scale_y_continuous(labels = scales::percent_format())
pubs_enriched %>%
filter(pub_year - first_pub_year < 75,
prize_year >= 1910,
prize_year <= 2000) %>%
ggplot(aes(pub_year - first_pub_year, fill = is_prize_winning_paper)) +
geom_density(alpha = .5) +
facet_wrap(~ category) +
labs(title = "Typical arc of a Nobel Prize winner's career",
subtitle = "For people who won between 1910 and 2000",
x = "Years into their publishing career")
dead_nobel_winners <- nobel_winners %>%
filter(!is.na(death_date)) %>%
mutate(years_until_death = year(death_date) - prize_year) %>%
distinct(category, full_name, years_until_death)
avg_years_until_death <- dead_nobel_winners %>%
group_by(category) %>%
summarize(avg_years_until_death = mean(years_until_death))
max_years_until_death <- dead_nobel_winners %>%
group_by(category) %>%
top_n(1, years_until_death)
min_years_until_death <- dead_nobel_winners %>%
group_by(category) %>%
top_n(1, -years_until_death)
dead_nobel_winners %>%
left_join(avg_years_until_death, by = 'category') %>%
anti_join(min_years_until_death) %>%
anti_join(max_years_until_death) %>%
mutate(category = fct_reorder(category, avg_years_until_death)) %>%
ggplot() +
geom_vline(aes(xintercept = 0)) +
geom_jitter(
aes(x = -years_until_death, y = category),
color = 'red', alpha = 1/3, height = 0.15, width = 0
) +
geom_line(
data = avg_years_until_death,
aes(x = -avg_years_until_death, y = category, group = 1)
) +
geom_point(
data = avg_years_until_death,
aes(x = -avg_years_until_death, y = category)
) +
geom_point(
data = max_years_until_death,
aes(x = -years_until_death, y = category),
color = 'red', alpha = 1/3
) +
geom_label_repel(
data = max_years_until_death,
aes(x = -years_until_death, y = category, label = full_name),
hjust = 1, size = 2.25, nudge_y = -0.4, nudge_x = 0.8,
segment.size = 0.3, segment.alpha = 0.9, label.size = 0.25
) +
geom_point(
data = min_years_until_death,
aes(x = -years_until_death, y = category),
color = 'red', alpha = 1/3
) +
geom_label_repel(
data = min_years_until_death,
aes(x = -years_until_death, y = category, label = full_name),
hjust = 1, size = 2.25, nudge_y = -0.4, nudge_x = -0.8,
segment.size = 0.3, segment.alpha = 0.9, label.size = 0.25
) +
scale_x_continuous(
breaks = seq(-60, 0, by = 5), labels = function(l) {-l}
) +
labs(
x = 'Number of Years',
y = 'Category',
title = 'Number of Years between Nobel Prize and Death',
subtitle = paste(
'Distribution and Average',
'#tidytuesday 20|2019',
sep = ' • '
),
caption = '© 2019 spren9er'
)
library(scico)
library(emojifont)
library(showtext)
#library(patchwork)
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.