This R Markdown pulls JSON from the Nobel Prize API and answers four questions(2 explorations):
library(tidyverse)
library(httr2)
library(jsonlite)
library(lubridate)
library(janitor)
base_url <- "https://api.nobelprize.org/2.1"
fetch_all <- function(endpoint, query = list(), page_size = 200, limit = 1000) {
out <- list(); offset <- 0; total <- Inf
repeat {
req <- request(file.path(base_url, endpoint)) |>
req_url_query(!!!query, offset = offset, limit = page_size) |>
req_user_agent("MSDS/Assignment10B")
resp <- req_perform(req)
js <- resp_body_json(resp, simplifyVector = FALSE)
items <- js$laureates %||% js$nobelPrizes %||% list()
if (length(items) == 0) break
out <- c(out, items)
total <- js$meta$count %||% length(out)
offset <- offset + length(items)
if (offset >= total || length(out) >= limit) break
}
out
}
pluck_chr <- function(x, ...) {
val <- purrr::pluck(x, ..., .default = NA_character_)
if (is.null(val)) NA_character_ else as.character(val)
}
laureates_raw <- fetch_all("laureates")
laureates_tbl <- map_dfr(laureates_raw, function(l) {
id <- pluck_chr(l, "id")
known <- pluck_chr(l, "knownName", "en") %||% pluck_chr(l, "orgName", "en")
full_name <- pluck_chr(l, "fullName", "en")
gender <- pluck_chr(l, "gender")
birth_date <- pluck_chr(l, "birth", "date")
birth_ctry <- pluck_chr(l, "birth", "place", "country", "en")
birth_city <- pluck_chr(l, "birth", "place", "city", "en")
awards <- l$nobelPrizes %||% list()
if (length(awards) == 0) {
tibble(
id, known, full_name, gender, birth_date, birth_ctry, birth_city,
award_year = NA_character_, category = NA_character_,
affiliation_name = NA_character_, affiliation_city = NA_character_,
affiliation_ctry = NA_character_
)
} else {
map_dfr(awards, function(a) {
affs <- a$affiliations %||% list()
aff <- if (length(affs) > 0) affs[[1]] else list()
tibble(
id, known, full_name, gender, birth_date, birth_ctry, birth_city,
award_year = pluck_chr(a, "awardYear"),
category = pluck_chr(a, "category", "en"),
affiliation_name = pluck_chr(aff, "name", "en"),
affiliation_city = pluck_chr(aff, "city", "en"),
affiliation_ctry = pluck_chr(aff, "country", "en")
)
})
}
}) |>
clean_names() |>
mutate(
birth_date = suppressWarnings(as_date(birth_date)),
award_year = suppressWarnings(as.integer(award_year))
)
# Humans only for person-focused analyses
people <- laureates_tbl |>
filter(!is.na(gender) & gender %in% c("male", "female"))
Definition Count a “loss” when the affiliation country (at award) differs from birth country.
loss_tbl <- people |>
filter(!is.na(birth_ctry), !is.na(affiliation_ctry)) |>
mutate(lost = birth_ctry != affiliation_ctry) |>
filter(lost) |>
count(birth_ctry, sort = TRUE, name = "lost_count")
head(loss_tbl, 15)
## # A tibble: 15 × 2
## birth_ctry lost_count
## <chr> <int>
## 1 Germany 26
## 2 United Kingdom 24
## 3 Canada 15
## 4 France 12
## 5 Austria-Hungary 11
## 6 Prussia 11
## 7 Russia 10
## 8 Russian Empire 10
## 9 Scotland 9
## 10 the Netherlands 9
## 11 Italy 8
## 12 China 7
## 13 Hungary 7
## 14 India 7
## 15 Australia 6
loss_tbl %>%
slice_head(n = 15) %>%
ggplot(aes(x = reorder(birth_ctry, lost_count), y = lost_count)) +
geom_col() +
coord_flip() +
labs(x = NULL, y = "Lost laureates", title = "Birth countries ‘losing’ laureates (affiliation abroad at award)")
Top birth countries by number of ‘lost’ laureates.
It looks like Germany tops out the list followed by the U.K. and Canada for lost award recipients.
Age estimate = award_year − birth year. Then we group by
category and 20‑year period.
age_data <- people |>
mutate(
birth_year = year(birth_date),
age_at_award = if_else(!is.na(birth_year) & !is.na(award_year),
award_year - birth_year, as.numeric(NA)),
period20 = cut(award_year,
breaks = seq(1890, year(Sys.Date()) + 10, by = 20),
right = FALSE, include.lowest = TRUE, dig.lab = 4)
) |>
filter(!is.na(age_at_award), age_at_award > 0, age_at_award < 120)
age_summary <- age_data |>
group_by(category, period20) |>
summarize(n = n(), avg_age = mean(age_at_award), med_age = median(age_at_award), .groups = "drop")
head(age_summary, 12)
## # A tibble: 12 × 5
## category period20 n avg_age med_age
## <chr> <fct> <int> <dbl> <dbl>
## 1 Chemistry [1890,1910) 9 51 50
## 2 Chemistry [1910,1930) 18 50.8 50
## 3 Chemistry [1930,1950) 22 49.5 50.5
## 4 Chemistry [1950,1970) 29 54.4 52
## 5 Chemistry [1970,1990) 35 58.9 58
## 6 Chemistry [1990,2010) 41 64.1 63
## 7 Chemistry [2010,2030] 37 68.7 70
## 8 Economic Sciences [1950,1970) 2 70 70
## 9 Economic Sciences [1970,1990) 25 67.4 67
## 10 Economic Sciences [1990,2010) 36 66.4 64.5
## 11 Economic Sciences [2010,2030] 30 67.7 68.5
## 12 Literature [1890,1910) 10 64.9 66.5
ggplot(age_summary, aes(x = period20, y = avg_age, group = category)) +
geom_line() +
geom_point() +
facet_wrap(~ category, scales = "free_y") +
labs(x = "20‑year period", y = "Average age", title = "Age at award by category (20‑year bins)") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Average age at award by category in 20‑year periods.
This I was very curious about as some things seem to lean towards an age. Very interesting to see physics and Medicine age has steadily went up while the peace prize age has gone down. Not sure if this trend will reverse as it suggests a lacking interest or focus on hard sciences in research.
inst_counts <- people |>
filter(!is.na(affiliation_name)) |>
mutate(inst_norm = str_squish(str_to_lower(affiliation_name))) |>
count(inst_norm, sort = TRUE, name = "awards")
head(inst_counts, 25)
## # A tibble: 25 × 2
## inst_norm awards
## <chr> <int>
## 1 university of california 38
## 2 harvard university 28
## 3 massachusetts institute of technology (mit) 23
## 4 stanford university 22
## 5 university of chicago 20
## 6 columbia university 18
## 7 california institute of technology (caltech) 17
## 8 princeton university 17
## 9 university of cambridge 17
## 10 rockefeller university 13
## # ℹ 15 more rows
inst_counts %>%
slice_head(n = 15) %>%
ggplot(aes(x = reorder(inst_norm, awards), y = awards)) +
geom_col() +
coord_flip() +
labs(x = NULL, y = "Awards", title = "Institutions with the most Nobel‑affiliated awards")
Top institutions by affiliated awards.
Unsurprisingly UC(LA,Berkeley), Harvard,MIT, and Stanford top out the list but it is interesting to see how high up the University of Chicago is on this list. # Q4. Comparing our Awards by gender by birth city
city_gender <- people |>
filter(!is.na(birth_city), !is.na(gender), !is.na(award_year)) |>
count(birth_city, gender, name = "awards")
top_cities <- city_gender |>
group_by(birth_city) |>
summarize(total = sum(awards), .groups = "drop") |>
arrange(desc(total)) |>
slice_head(n = 20) |>
pull(birth_city)
city_gender_top <- city_gender |>
filter(birth_city %in% top_cities)
head(city_gender_top, 12)
## # A tibble: 12 × 3
## birth_city gender awards
## <chr> <chr> <int>
## 1 Berlin female 1
## 2 Berlin male 10
## 3 Boston, MA female 1
## 4 Boston, MA male 8
## 5 Brooklyn, NY male 8
## 6 Budapest male 8
## 7 Chicago, IL male 14
## 8 Copenhagen male 4
## 9 Frankfurt-on-the-Main male 6
## 10 Hamburg male 7
## 11 Kyoto male 4
## 12 London male 21
ggplot(city_gender_top, aes(x = reorder(birth_city, -awards), y = awards, fill = gender)) +
geom_col() +
coord_flip() +
labs(x = "Birth city(Top 20)", y = "Awards", title = "Awards by gender by birth city ") +
guides(fill = guide_legend(title = "Gender"))
Awards by gender for the top 20 birth cities.
This chart was largely inconclusive to me but it was interesting to see that Prague probably has the highest percentage of female recipients. I thought the Nordic countries would have had a more substantial grouping here.