Working in progress

Personal project to reproduce code from #TidyTuesday Project

Main Sources:

Nobel winners

Libraries

library(tidyverse)
library(lubridate)
library(ggrepel)
library(WDI)
library(countrycode)
theme_set(theme_light())

Load data

# load data 1 - David Robinson 
nobel_winners <- read_csv(
  paste0(
    'https://raw.githubusercontent.com/rfordatascience/tidytuesday/',
    'master/data/2019/2019-05-14/nobel_winners.csv'
  )
)

# load data 1 - David Robinson
nobel_winners <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-05-14/nobel_winners.csv") %>%
  distinct(full_name, prize_year, category, .keep_all = TRUE) %>%
  mutate(decade = 10 * (prize_year %/% 10),
         age = prize_year - year(birth_date))

# load data 2 - David Robinson
nobel_winner_all_pubs <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-05-14/nobel_winner_all_pubs.csv") %>%
  mutate(prize_decade = 10 * (prize_year %/% 10))

Code David Robinson & Github

Some exploration #Note Run

### Exploring
#### 1
nobel_winners %>% 
  count(category, 
        decade = 10*(prize_year%/%10), sort = T) %>% # transform year to decades
  ggplot(aes(decade, n, fill = category))+
  geom_col()
#### 2 
nobel_winners %>% 
  group_by(category, 
        decade = 10*(prize_year%/%10), sort = T) %>% # transform year to decade s
  summarise(winners = n(),
            winners_per_year = winners/n_distinct(prize_year)) %>% 
  ggplot(aes(decade, winners_per_year,color= category))+
  geom_line()+
  expand_limits(y=0)
#### 3
nobel_winners %>% 
  add_count(full_name) %>% 
  arrange(desc(n)) %>% 
  select(prize_year, category, full_name) 
#### 4
nobel_winners %>% 
  filter(category == "Medicine", full_name == "Jack W. Szostak")
#### 4
nobel_winners %>% 
  distinct(full_name, prize_year, category) %>% 
  add_count(full_name) %>% 
  arrange(desc(n)) 
#### 5
nobel_winners %>% 
  distinct(full_name, prize_year, category) %>% 
  group_by(full_name) %>% 
  mutate(prizes = n(),
         distinct_prizes = n_distinct(category)) %>% 
  arrange(desc(prizes)) 
#### 6 - gender 
nobel_winners %>% 
  count(decade,
        category,
        gender = coalesce(gender, laureate_type)) %>% #coalesce
  group_by(decade, category) %>% 
  mutate(percent = n/sum(n)) %>% 
  ggplot(aes(decade, percent, fill = gender))+
  geom_col()+
  facet_wrap(~ category)
#### 6 - gender 2
nobel_winners %>% 
  count(decade,
        category,
        gender = coalesce(gender, laureate_type)) %>% #coalesce
  group_by(decade, category) %>% 
  mutate(percent = n/sum(n)) %>% 
  ggplot(aes(decade, n, fill = gender))+
  geom_col()+
  facet_wrap(~ category)+
  labs(x= "Decades",
       y = "# of nobel prize winners",
       fill = "Gender",
       title = "Nobel Prize gender distribution")
#### 7 - age
nobel_winners %>% 
  mutate(category = fct_reorder(category, age, median, na.rm = T)) %>% 
  ggplot(aes(category, age))+
  geom_boxplot()+
  coord_flip()
#### 8 - age
nobel_winners %>% 
  filter(!is.na(age)) %>% 
  group_by(decade, category) %>% 
  summarise(average_age = mean(age),
            median_age = median(age)) %>% 
  ggplot(aes(decade, average_age, color = category))+
  geom_line()
#### 9
nobel_winners %>% 
  filter(full_name == "Albert Einstein") %>% 
  pull(motivation)

Final code - David

nobel_winners %>%
  group_by(category, decade) %>%
  summarize(winners = n(),
            winners_per_year = winners / n_distinct(prize_year)) %>%
  ggplot(aes(decade, winners_per_year, color = category)) +
  geom_line() +
  expand_limits(y = 0)

nobel_winners %>%
  distinct(full_name, prize_year, category) %>%
  group_by(full_name) %>%
  mutate(prizes = n(),
         distinct_prizes = n_distinct(category)) %>%
  arrange(desc(prizes), full_name)
## # A tibble: 911 x 5
## # Groups:   full_name [904]
##    full_name                     prize_year category prizes distinct_prizes
##    <chr>                              <dbl> <chr>     <int>           <int>
##  1 Comité international de la C~       1917 Peace         3               1
##  2 Comité international de la C~       1944 Peace         3               1
##  3 Comité international de la C~       1963 Peace         3               1
##  4 Frederick Sanger                    1958 Chemist~      2               1
##  5 Frederick Sanger                    1980 Chemist~      2               1
##  6 John Bardeen                        1956 Physics       2               1
##  7 John Bardeen                        1972 Physics       2               1
##  8 Linus Carl Pauling                  1954 Chemist~      2               2
##  9 Linus Carl Pauling                  1962 Peace         2               2
## 10 Marie Curie, née Sklodowska         1903 Physics       2               2
## # ... with 901 more rows
nobel_winners %>%
  count(decade,
        category,
        gender = coalesce(gender, laureate_type)) %>%
  group_by(decade, category) %>%
  mutate(percent = n / sum(n)) %>%
  ggplot(aes(decade, n, fill = gender)) +
  geom_col() +
  facet_wrap(~ category) +
  labs(x = "Decade",
       y = "# of nobel prize winners",
       fill = "Gender",
       title = "Nobel Prize gender distribution over time")

nobel_winners %>%
  mutate(category = fct_reorder(category, age, median, na.rm = TRUE)) %>%
  ggplot(aes(category, age)) +
  geom_boxplot() +
  coord_flip()

nobel_winners %>%
  filter(!is.na(age)) %>%
  group_by(decade, category) %>%
  summarize(average_age = mean(age),
            median_age = median(age)) %>%
  ggplot(aes(decade, average_age, color = category)) +
  geom_line()

nobel_winners %>%
  filter(prize_year >= 2010, category == "Peace") %>%
  select(full_name, age, prize)
## # A tibble: 10 x 3
##    full_name                                       age prize               
##    <chr>                                         <dbl> <chr>               
##  1 Liu Xiaobo                                       55 The Nobel Peace Pri~
##  2 Ellen Johnson Sirleaf                            73 The Nobel Peace Pri~
##  3 Leymah Gbowee                                    39 The Nobel Peace Pri~
##  4 Tawakkol Karman                                  32 The Nobel Peace Pri~
##  5 European Union (EU)                              NA The Nobel Peace Pri~
##  6 Organisation for the Prohibition of Chemical~    NA The Nobel Peace Pri~
##  7 Kailash Satyarthi                                60 The Nobel Peace Pri~
##  8 Malala Yousafzai                                 17 The Nobel Peace Pri~
##  9 National Dialogue Quartet                        NA The Nobel Peace Pri~
## 10 Juan Manuel Santos                               65 The Nobel Peace Pri~
nobel_winners %>%
  filter(!is.na(birth_country)) %>%
  count(birth_country = fct_lump(birth_country, 9),
        category,
        sort = TRUE) %>%
  mutate(birth_country = fct_reorder(birth_country, n)) %>%
  ggplot(aes(birth_country, n, fill = category)) +
  geom_col() +
  facet_wrap(~ category) +
  coord_flip()

indicators_raw <- WDI(indicator = "NY.GDP.PCAP.CD",
                      start = 2016, end = 2016, extra = TRUE) %>%
  tbl_df() %>%
  select(country,
         country_code = iso2c,
         income,
         gdp_per_capita = NY.GDP.PCAP.CD)
nobel_winners_countries <- nobel_winners %>%
  mutate(country_code = countrycode(birth_country, "country.name", "iso2c")) %>%
  inner_join(indicators_raw, by = "country_code") %>%
  mutate(income = fct_relevel(income, c("Low income", "Lower middle income", "Upper middle income", "High income")))
nobel_winners_countries %>%
  filter(!is.na(income)) %>%
  count(category, income) %>%
  group_by(category) %>%
  mutate(percent = n / sum(n)) %>%
  ggplot(aes(income, percent)) +
  geom_col() +
  facet_wrap(~ category) +
  coord_flip() +
  labs(x = "Current income level of birth country",
       y = "% of this category's prizes",
       title = "Where do Nobel Prize winners come from?")

winners_summarized <- nobel_winner_all_pubs %>%
  filter(pub_year <= prize_year) %>%
  group_by(laureate_id,
           laureate_name,
           category,
           prize_year,
           prize_decade) %>%
  summarize(papers_before_prize = n(),
            papers_before_prize_5_years = sum(pub_year >= prize_year - 5),
            average_paper_age = mean(prize_year - pub_year),
            winning_paper_age = mean((prize_year - pub_year)[is_prize_winning_paper == "YES"]))
# What fraction had retired based on their papers in last five years?
winners_summarized %>%
  group_by(category, prize_decade) %>%
  summarize(average_papers = mean(papers_before_prize),
            average_paper_age = mean(average_paper_age),
            average_winning_paper_age = mean(winning_paper_age)) %>%
  ggplot(aes(prize_decade, average_winning_paper_age, color = category)) +
  geom_line() +
  labs(x = "Prize decade",
       y = "Time between when paper was published and won prize",
       title = "Scientists have to wait longer for a Nobel Prize than ever",
       color = "Category") +
  expand_limits(y = 0)

pubs_enriched <- nobel_winner_all_pubs %>%
  group_by(laureate_id, category, prize_year) %>%
  mutate(papers_before = rank(pub_year, ties.method = "first") - 1,
         total_papers = n(),
         position_in_career =  papers_before / total_papers,
         first_pub_year = min(pub_year)) %>%
  ungroup()
nobel_winners %>%
  filter(!is.na(age),
         category %in% c("Chemistry", "Medicine", "Physics")) %>%
  group_by(decade, category) %>%
  summarize(average_age = mean(age),
            median_age = median(age)) %>%
  ggplot(aes(decade, average_age, color = category)) +
  geom_line()

pubs_enriched %>%
  filter(is_prize_winning_paper == "YES") %>%
  group_by(prize_decade, category) %>%
  summarize(average_position_in_career = mean(position_in_career)) %>%
  ggplot(aes(prize_decade, average_position_in_career, color = category)) +
  geom_line()

The average recent Nobel Prize winner in Chemistry, Medicine or Physics is in their late 60s and is winning for work published about 25 years ago, about a fifth of the way through their career.

pubs_enriched %>%
  filter(is_prize_winning_paper == "YES") %>%
  group_by(prize_decade, category) %>%
  summarize(average_position_in_career = mean(position_in_career)) %>%
  ggplot(aes(prize_decade, average_position_in_career, color = category)) +
  geom_line() +
  scale_y_continuous(labels = scales::percent_format())

pubs_enriched %>%
  filter(pub_year - first_pub_year < 75,
         prize_year >= 1910,
         prize_year <= 2000) %>%
  ggplot(aes(pub_year - first_pub_year, fill = is_prize_winning_paper)) +
  geom_density(alpha = .5) +
  facet_wrap(~ category) +
  labs(title = "Typical arc of a Nobel Prize winner's career",
       subtitle = "For people who won between 1910 and 2000",
       x = "Years into their publishing career")

  • Most Nobel-prize winning papers are in the first 20 years of someone’s career
  • A winner’s publishing productivity peaks about 30 years after they publish their first paper

Code spren9er

dead_nobel_winners <- nobel_winners %>%
  filter(!is.na(death_date)) %>%
  mutate(years_until_death = year(death_date) - prize_year) %>%
  distinct(category, full_name, years_until_death)

avg_years_until_death <- dead_nobel_winners %>%
  group_by(category) %>%
  summarize(avg_years_until_death = mean(years_until_death))

max_years_until_death <- dead_nobel_winners %>%
  group_by(category) %>%
  top_n(1, years_until_death)

min_years_until_death <- dead_nobel_winners %>%
  group_by(category) %>%
  top_n(1, -years_until_death)

dead_nobel_winners %>%
  left_join(avg_years_until_death, by = 'category') %>%
  anti_join(min_years_until_death) %>%
  anti_join(max_years_until_death) %>%
  mutate(category = fct_reorder(category, avg_years_until_death)) %>%
  ggplot() +
    geom_vline(aes(xintercept = 0)) +
    geom_jitter(
      aes(x = -years_until_death, y = category),
      color = 'red', alpha = 1/3, height = 0.15, width = 0
    ) +
    geom_line(
      data = avg_years_until_death,
      aes(x = -avg_years_until_death, y = category, group = 1)
    ) +
    geom_point(
      data = avg_years_until_death,
      aes(x = -avg_years_until_death, y = category)
    ) +
    geom_point(
      data = max_years_until_death,
      aes(x = -years_until_death, y = category),
      color = 'red', alpha = 1/3
    ) +
    geom_label_repel(
      data = max_years_until_death,
      aes(x = -years_until_death, y = category, label = full_name),
      hjust = 1, size = 2.25, nudge_y = -0.4, nudge_x = 0.8,
      segment.size = 0.3, segment.alpha = 0.9, label.size = 0.25
    ) +
    geom_point(
      data = min_years_until_death,
      aes(x = -years_until_death, y = category),
      color = 'red', alpha = 1/3
    ) +
    geom_label_repel(
      data = min_years_until_death,
      aes(x = -years_until_death, y = category, label = full_name),
      hjust = 1, size = 2.25, nudge_y = -0.4, nudge_x = -0.8,
      segment.size = 0.3, segment.alpha = 0.9, label.size = 0.25
    ) +
    scale_x_continuous(
      breaks = seq(-60, 0, by = 5), labels = function(l) {-l}
    ) +
    labs(
      x = 'Number of Years',
      y = 'Category',
      title = 'Number of Years between Nobel Prize and Death',
      subtitle = paste(
        'Distribution and Average',
        '#tidytuesday 20|2019',
        sep = ' • '
      ),
      caption = '© 2019 spren9er'
    )

Code Z3tt

library(scico)
library(emojifont)
library(showtext)
#library(patchwork)

Other page

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.