This report explores Nobel Prize data using the public JSON API provided by NobelPrize.org. The goal is to practice working with real-world JSON data in R while performing meaningful exploratory analysis. After retrieving and cleaning data I address four key questions: (1) Which countries have produced the most Nobel laureates by birth? (2) Which countries have “lost” the most laureates—those born there but affiliated elsewhere when awarded? (3) Which universities or research institutions most frequently appear as laureate affiliations? (4) How has the age of Nobel Prize recipients evolved over time by discipline and decade? Together, these analyses illustrate how JSON APIs can be leveraged to uncover insights into global scientific and cultural trends.
laureates_id <- laureates_raw %>%
transmute(
id = id,
knownName = coalesce(knownName.en, fullName.en, orgName.en),
givenName = givenName.en,
familyName = familyName.en,
gender = gender,
birth_date = suppressWarnings(ymd(birth.date)),
birth_country = coalesce(birth.place.country.en, birth.place.countryNow.en),
death_date = suppressWarnings(ymd(death.date)),
is_organization = if_else(!is.na(orgName.en), TRUE, FALSE)
)
prizes_per_laureate <- laureates_raw %>%
select(id, nobelPrizes) %>%
tidyr::unnest(nobelPrizes, keep_empty = TRUE) %>%
transmute(
id = id,
awardYear = suppressWarnings(as.integer(awardYear)),
category = category.en,
prizeStatus = prizeStatus,
prizeAmount = prizeAmount,
prizeAmountAdjusted = prizeAmountAdjusted,
affiliations = affiliations
)
affils <- prizes_per_laureate %>%
mutate(row_id = dplyr::row_number()) %>%
unnest(affiliations, keep_empty = TRUE) %>%
transmute(
row_id,
affiliation_name = name.en,
affiliation_city = city.en,
affiliation_country = coalesce(country.en, countryNow.en)
)
prizes_per_laureate <- prizes_per_laureate %>%
mutate(row_id = dplyr::row_number()) %>%
left_join(affils, by = "row_id") %>%
select(-row_id)
# Combined table for analysis
laureate_prizes <- prizes_per_laureate %>%
left_join(laureates_id, by = "id")
q1 <- laureates_id %>%
filter(is_organization == FALSE, !is.na(birth_country)) %>%
distinct(id, birth_country) %>%
count(birth_country, sort = TRUE) %>%
slice_head(n = 15) %>%
mutate(birth_country = fct_reorder(birth_country, n))
ggplot(q1, aes(n, birth_country)) +
geom_col() +
labs(
title = "Top birth countries of Nobel laureates (persons)",
x = "Number of laureates",
y = NULL
)
person_affil <- laureate_prizes %>%
filter(is_organization == FALSE) %>%
filter(!is.na(affiliation_country), !is.na(birth_country))
lost_flags <- person_affil %>%
mutate(lost = affiliation_country != birth_country) %>%
group_by(id, birth_country) %>%
summarise(lost_any = any(lost, na.rm = TRUE), .groups = "drop")
q2 <- lost_flags %>%
filter(lost_any) %>%
count(birth_country, sort = TRUE) %>%
slice_head(n = 15) %>%
mutate(birth_country = fct_reorder(birth_country, n))
ggplot(q2, aes(n, birth_country)) +
geom_col() +
labs(
title = "Which birth countries 'lost' the most laureates?",
subtitle = "Born in one country, awarded while affiliated in another",
x = "Number of 'lost' laureates (persons)",
y = NULL
)
top_affiliations <- laureate_prizes %>%
filter(is_organization == FALSE) %>%
filter(!is.na(affiliation_name)) %>%
mutate(affiliation_name = str_squish(affiliation_name)) %>%
filter(affiliation_name != "", affiliation_name != "—") %>%
count(affiliation_name, sort = TRUE) %>%
slice_head(n = 20) %>%
mutate(affiliation_name = fct_reorder(affiliation_name, n))
ggplot(top_affiliations, aes(n, affiliation_name)) +
geom_col() +
labs(
title = "Most frequent affiliations (universities/institutions)",
subtitle = "Based on affiliation recorded for the prize",
x = "Count of laureate–prize records",
y = NULL
)
age_df <- laureate_prizes %>%
filter(is_organization == FALSE, !is.na(birth_date), !is.na(awardYear)) %>%
mutate(
award_date = suppressWarnings(ymd(paste0(awardYear, "-06-01"))),
age_years = as.numeric(difftime(award_date, birth_date, units = "days")) / 365.25,
decade = (awardYear %/% 10) * 10
) %>%
filter(is.finite(age_years), age_years > 0, age_years < 120)
summary(age_df$age_years)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 16.89 51.47 60.56 60.41 69.57 96.85
age_summary <- age_df %>%
group_by(decade, category) %>%
summarise(
n = n(),
median_age = median(age_years, na.rm = TRUE),
iqr = IQR(age_years, na.rm = TRUE),
.groups = "drop"
)
ggplot(age_summary, aes(decade, median_age, color = category)) +
geom_line(linewidth = 1) +
geom_point() +
labs(
title = "Median age at award by category and decade",
x = "Decade",
y = "Median age (years)"
)
The analysis reveal distinct patterns in the Nobel landscape. Historically, most Nobel laureates were born in a few number of countries countries, reflecting global concentrations of research and education infrastructure and financial capacity. Many laureates received their awards while affiliated with institutions outside their birth countries. Prestigious universities such as Harvard, Cambridge, and the University of California system dominate affiliation counts, reinforcing their central role in global research excellence. Finally, the age analysis shows that Nobel recognition often comes later in life, with the median age at award steadily increasing across decades in most categories. Overall, this assignment demonstrates how structured API data can be transformed and visualized in R to answer real analytical questions about human achievement and global scientific collaboration.