1 Overview

This R Markdown pulls JSON from the Nobel Prize API and answers four questions(2 explorations):

  1. Which birth countries “lost” the most laureates? (born in one country, affiliated in another at the time of award)
  2. Comparison of Age at award by category (and a chart grouping into 20‑year periods for comparison over time)
  3. Which institutions have the most laureates?
  4. Comparison of Awards by gender by birth city
library(tidyverse)
library(httr2)
library(jsonlite)
library(lubridate)
library(janitor)

2 Fetch all pages

base_url <- "https://api.nobelprize.org/2.1"

fetch_all <- function(endpoint, query = list(), page_size = 200, limit = 1000) {
  out <- list(); offset <- 0; total <- Inf
  repeat {
    req <- request(file.path(base_url, endpoint)) |>
      req_url_query(!!!query, offset = offset, limit = page_size) |>
      req_user_agent("MSDS/Assignment10B")
    resp <- req_perform(req)
    js   <- resp_body_json(resp, simplifyVector = FALSE)
    items <- js$laureates %||% js$nobelPrizes %||% list()
    if (length(items) == 0) break
    out <- c(out, items)
    total <- js$meta$count %||% length(out)
    offset <- offset + length(items)
    if (offset >= total || length(out) >= limit) break
  }
  out
}

pluck_chr <- function(x, ...) {
  val <- purrr::pluck(x, ..., .default = NA_character_)
  if (is.null(val)) NA_character_ else as.character(val)
}

3 Get and tidy laureates

laureates_raw <- fetch_all("laureates")

laureates_tbl <- map_dfr(laureates_raw, function(l) {
  id         <- pluck_chr(l, "id")
  known      <- pluck_chr(l, "knownName", "en") %||% pluck_chr(l, "orgName", "en")
  full_name  <- pluck_chr(l, "fullName", "en")
  gender     <- pluck_chr(l, "gender")
  birth_date <- pluck_chr(l, "birth", "date")
  birth_ctry <- pluck_chr(l, "birth", "place", "country", "en")
  birth_city <- pluck_chr(l, "birth", "place", "city", "en")
  awards     <- l$nobelPrizes %||% list()

  if (length(awards) == 0) {
    tibble(
      id, known, full_name, gender, birth_date, birth_ctry, birth_city,
      award_year = NA_character_, category = NA_character_,
      affiliation_name = NA_character_, affiliation_city = NA_character_,
      affiliation_ctry = NA_character_
    )
  } else {
    map_dfr(awards, function(a) {
      affs <- a$affiliations %||% list()
      aff  <- if (length(affs) > 0) affs[[1]] else list()
      tibble(
        id, known, full_name, gender, birth_date, birth_ctry, birth_city,
        award_year = pluck_chr(a, "awardYear"),
        category   = pluck_chr(a, "category", "en"),
        affiliation_name = pluck_chr(aff, "name", "en"),
        affiliation_city = pluck_chr(aff, "city", "en"),
        affiliation_ctry = pluck_chr(aff, "country", "en")
      )
    })
  }
}) |>
  clean_names() |>
  mutate(
    birth_date = suppressWarnings(as_date(birth_date)),
    award_year = suppressWarnings(as.integer(award_year))
  )

# Humans only for person-focused analyses
people <- laureates_tbl |>
  filter(!is.na(gender) & gender %in% c("male", "female"))

4 Q1. Which birth countries “lost” the most laureates?

Definition Count a “loss” when the affiliation country (at award) differs from birth country.

loss_tbl <- people |>
  filter(!is.na(birth_ctry), !is.na(affiliation_ctry)) |>
  mutate(lost = birth_ctry != affiliation_ctry) |>
  filter(lost) |>
  count(birth_ctry, sort = TRUE, name = "lost_count")

head(loss_tbl, 15)
## # A tibble: 15 × 2
##    birth_ctry      lost_count
##    <chr>                <int>
##  1 Germany                 26
##  2 United Kingdom          24
##  3 Canada                  15
##  4 France                  12
##  5 Austria-Hungary         11
##  6 Prussia                 11
##  7 Russia                  10
##  8 Russian Empire          10
##  9 Scotland                 9
## 10 the Netherlands          9
## 11 Italy                    8
## 12 China                    7
## 13 Hungary                  7
## 14 India                    7
## 15 Australia                6
loss_tbl %>%
  slice_head(n = 15) %>%
  ggplot(aes(x = reorder(birth_ctry, lost_count), y = lost_count)) +
  geom_col() +
  coord_flip() +
  labs(x = NULL, y = "Lost laureates", title = "Birth countries ‘losing’ laureates (affiliation abroad at award)")
Top birth countries by number of ‘lost’ laureates.

Top birth countries by number of ‘lost’ laureates.

It looks like Germany tops out the list followed by the U.K. and Canada for lost award recipients.

5 Q2. Copmaring Age at award by category (20‑year periods)

Age estimate = award_year − birth year. Then we group by category and 20‑year period.

age_data <- people |>
  mutate(
    birth_year = year(birth_date),
    age_at_award = if_else(!is.na(birth_year) & !is.na(award_year),
                           award_year - birth_year, as.numeric(NA)),
    period20 = cut(award_year,
                   breaks = seq(1890, year(Sys.Date()) + 10, by = 20),
                   right = FALSE, include.lowest = TRUE, dig.lab = 4)
  ) |>
  filter(!is.na(age_at_award), age_at_award > 0, age_at_award < 120)

age_summary <- age_data |>
  group_by(category, period20) |>
  summarize(n = n(), avg_age = mean(age_at_award), med_age = median(age_at_award), .groups = "drop")

head(age_summary, 12)
## # A tibble: 12 × 5
##    category          period20        n avg_age med_age
##    <chr>             <fct>       <int>   <dbl>   <dbl>
##  1 Chemistry         [1890,1910)     9    51      50  
##  2 Chemistry         [1910,1930)    18    50.8    50  
##  3 Chemistry         [1930,1950)    22    49.5    50.5
##  4 Chemistry         [1950,1970)    29    54.4    52  
##  5 Chemistry         [1970,1990)    35    58.9    58  
##  6 Chemistry         [1990,2010)    41    64.1    63  
##  7 Chemistry         [2010,2030]    37    68.7    70  
##  8 Economic Sciences [1950,1970)     2    70      70  
##  9 Economic Sciences [1970,1990)    25    67.4    67  
## 10 Economic Sciences [1990,2010)    36    66.4    64.5
## 11 Economic Sciences [2010,2030]    30    67.7    68.5
## 12 Literature        [1890,1910)    10    64.9    66.5
ggplot(age_summary, aes(x = period20, y = avg_age, group = category)) +
  geom_line() +
  geom_point() +
  facet_wrap(~ category, scales = "free_y") +
  labs(x = "20‑year period", y = "Average age", title = "Age at award by category (20‑year bins)") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
Average age at award by category in 20‑year periods.

Average age at award by category in 20‑year periods.

This I was very curious about as some things seem to lean towards an age. Very interesting to see physics and Medicine age has steadily went up while the peace prize age has gone down. Not sure if this trend will reverse as it suggests a lacking interest or focus on hard sciences in research.

6 Q3. Which institutions have the most laureates?

inst_counts <- people |>
  filter(!is.na(affiliation_name)) |>
  mutate(inst_norm = str_squish(str_to_lower(affiliation_name))) |>
  count(inst_norm, sort = TRUE, name = "awards")

head(inst_counts, 25)
## # A tibble: 25 × 2
##    inst_norm                                    awards
##    <chr>                                         <int>
##  1 university of california                         38
##  2 harvard university                               28
##  3 massachusetts institute of technology (mit)      23
##  4 stanford university                              22
##  5 university of chicago                            20
##  6 columbia university                              18
##  7 california institute of technology (caltech)     17
##  8 princeton university                             17
##  9 university of cambridge                          17
## 10 rockefeller university                           13
## # ℹ 15 more rows
inst_counts %>%
  slice_head(n = 15) %>%
  ggplot(aes(x = reorder(inst_norm, awards), y = awards)) +
  geom_col() +
  coord_flip() +
  labs(x = NULL, y = "Awards", title = "Institutions with the most Nobel‑affiliated awards")
Top institutions by affiliated awards.

Top institutions by affiliated awards.

Unsurprisingly UC(LA,Berkeley), Harvard,MIT, and Stanford top out the list but it is interesting to see how high up the University of Chicago is on this list. # Q4. Comparing our Awards by gender by birth city

city_gender <- people |>
  filter(!is.na(birth_city), !is.na(gender), !is.na(award_year)) |>
  count(birth_city, gender, name = "awards")

top_cities <- city_gender |>
  group_by(birth_city) |>
  summarize(total = sum(awards), .groups = "drop") |>
  arrange(desc(total)) |>
  slice_head(n = 20) |>
  pull(birth_city)

city_gender_top <- city_gender |>
  filter(birth_city %in% top_cities)

head(city_gender_top, 12)
## # A tibble: 12 × 3
##    birth_city            gender awards
##    <chr>                 <chr>   <int>
##  1 Berlin                female      1
##  2 Berlin                male       10
##  3 Boston, MA            female      1
##  4 Boston, MA            male        8
##  5 Brooklyn, NY          male        8
##  6 Budapest              male        8
##  7 Chicago, IL           male       14
##  8 Copenhagen            male        4
##  9 Frankfurt-on-the-Main male        6
## 10 Hamburg               male        7
## 11 Kyoto                 male        4
## 12 London                male       21
ggplot(city_gender_top, aes(x = reorder(birth_city, -awards), y = awards, fill = gender)) +
  geom_col() +
  coord_flip() +
  labs(x = "Birth city(Top 20)", y = "Awards", title = "Awards by gender by birth city ") +
  guides(fill = guide_legend(title = "Gender"))
Awards by gender for the top 20 birth cities.

Awards by gender for the top 20 birth cities.

This chart was largely inconclusive to me but it was interesting to see that Prague probably has the highest percentage of female recipients. I thought the Nordic countries would have had a more substantial grouping here.