We are prompted to work with JSON data available through the Nobel Prize API.
We are answering the questions:
#Pull prizes
prizes_url <- "https://api.nobelprize.org/2.1/nobelPrizes?limit=1000"
prizes_json <- read_json(prizes_url) # , simplifyVector = TRUE
prizes <- tibble(prizes_json$nobelPrizes)
prizes <- prizes %>%
unnest_wider("prizes_json$nobelPrizes") %>%
unnest_wider("category", names_sep = "_") %>%
unnest_wider("categoryFullName", names_sep = "_") %>%
unnest_longer("laureates") %>%
unnest_wider("laureates", names_sep = "_") %>%
select(awardYear, category_en, categoryFullName_en, dateAwarded, prizeAmount, prizeAmountAdjusted,
laureates_id, laureates_knownName, laureates_fullName, laureates_portion, laureates_sortOrder, laureates_portion)
#Pull Laureates
laureates_url <- "https://api.nobelprize.org/2.1/laureates?limit=1000"
laureates_json <- read_json(laureates_url) # , simplifyVector = TRUE
laureates <- tibble(laureates_json$laureates)
laureates <- laureates %>%
unnest_wider("laureates_json$laureates") %>%
unnest_wider("knownName", names_sep = "_") %>%
unnest_wider("givenName", names_sep = "_") %>%
unnest_wider("fullName", names_sep = "_") %>%
unnest_wider("birth", names_sep = "_") %>%
unnest_wider("birth_place", names_sep = "_") %>%
unnest_wider("birth_place_city", names_sep = "_") %>%
unnest_wider("birth_place_country", names_sep = "_") %>%
unnest_wider("birth_place_continent", names_sep = "_") %>%
unnest_wider("death", names_sep = "_")%>%
unnest_wider("death_place", names_sep = "_") %>%
unnest_wider("death_place_city", names_sep = "_") %>%
unnest_wider("death_place_country", names_sep = "_") %>%
unnest_wider("death_place_continent", names_sep = "_") %>%
unnest_longer("nobelPrizes") %>%
unnest_wider("nobelPrizes", names_sep = "_") %>%
unnest_longer("nobelPrizes_affiliations") %>% # expand affiliations (if any)
unnest_wider("nobelPrizes_affiliations", names_sep = "_") %>%
unnest_wider("nobelPrizes_affiliations_city", names_sep = "_") %>%
unnest_wider("nobelPrizes_affiliations_country", names_sep = "_") %>%
unnest_wider("nobelPrizes_affiliations_continent", names_sep = "_") %>%
unnest_wider("nobelPrizes_affiliations_name", names_sep = "_") %>%
unnest_wider("nobelPrizes_category", names_sep = "_") %>%
select(id, knownName_en, givenName_en, fullName_en, gender,
birth_date, birth_place_city_en, birth_place_country_en, birth_place_continent_en,
death_date, death_place_city_en, death_place_country_en, death_place_continent_en,
nobelPrizes_awardYear, nobelPrizes_category_en, nobelPrizes_sortOrder, nobelPrizes_portion,
nobelPrizes_dateAwarded, nobelPrizes_prizeStatus, nobelPrizes_prizeAmount,
nobelPrizes_affiliations_name_en,
nobelPrizes_affiliations_city_en, nobelPrizes_affiliations_country_en, nobelPrizes_affiliations_continent_en
)
Which country “lost” the most nobel laureates (who were born there but received their Nobel prize in a different country)? (This was a suggested question.)
lost_laureates <- laureates %>%
filter(!is.na(birth_place_country_en),
!is.na(nobelPrizes_affiliations_country_en),
birth_place_country_en != nobelPrizes_affiliations_country_en)
lost_counts <- lost_laureates %>%
count(birth_place_country_en, sort = TRUE)
head(lost_counts, 10)
## # A tibble: 10 × 2
## birth_place_country_en n
## <chr> <int>
## 1 Germany 33
## 2 United Kingdom 27
## 3 Canada 15
## 4 France 15
## 5 Russia 14
## 6 Austria-Hungary 12
## 7 Prussia 12
## 8 the Netherlands 11
## 9 Russian Empire 10
## 10 Hungary 9
From this I see that Germany has lost the most Nobel laureates — people born in Germany, but who were affiliated with another country when receiving their Nobel Prize.
Which country “gained” the most nobel laureates (who were not born there but received their Nobel prize there)?
gained_counts <- lost_laureates %>%
count(nobelPrizes_affiliations_country_en, sort = TRUE)
head(gained_counts, 10)
## # A tibble: 10 × 2
## nobelPrizes_affiliations_country_en n
## <chr> <int>
## 1 USA 149
## 2 United Kingdom 39
## 3 Germany 37
## 4 Switzerland 13
## 5 France 12
## 6 USSR 11
## 7 Canada 6
## 8 Israel 6
## 9 Sweden 4
## 10 Denmark 3
By far, the USA “gained” the most nobel laureates.
How has the distribution of gender of Nobel laureates changed over time?
gender_by_year <- laureates %>%
filter(!is.na(gender), !is.na(nobelPrizes_awardYear)) %>%
mutate(year = as.numeric(nobelPrizes_awardYear)) %>%
# had to deal with the duplicates from multiple awards per person
group_by(id, year) %>%
summarise(gender = first(gender), .groups = "drop") %>%
# Business as usual
count(year, gender, name = "n_laureates") %>%
group_by(year) %>%
mutate(share = n_laureates / sum(n_laureates)) %>%
ungroup()
# Plot - opted for stacked bar
# Need summary by decade
gender_decade <- gender_by_year %>%
mutate(decade = floor(year / 10) * 10) %>% # collapse to decades
filter(gender %in% c("male", "female")) %>% # exclude orgs for clarity
group_by(decade, gender) %>%
summarise(avg_share = mean(share, na.rm = TRUE), .groups = "drop") %>%
group_by(decade) %>%
mutate(total = sum(avg_share),
prop = avg_share / total) # ensure normalized shares per decade
# the ral plot
ggplot(gender_decade, aes(x = factor(decade), y = prop, fill = gender)) +
geom_col(position = "stack") +
scale_y_continuous(labels = percent_format(accuracy = 1), limits = c(0, 1)) +
scale_fill_manual(values = c("male" = "#1f77b4", "female" = "#e377c2")) +
labs(
title = "Distribution of Nobel Laureates by Gender Over Time",
x = "Decade",
y = "Share of Laureates",
fill = "Gender"
) +
theme_minimal(base_size = 13) +
theme(
plot.title = element_text(face = "bold", size = 14),
axis.text.x = element_text(angle = 45, hjust = 1)
)
Clearly, women have only made up a small fraction of nobel laureate winners regardless of the decade. in the 1900s and 1950s, they made up no fraction at all.
Which countries have the highest number of laureates by field?
# Grouping the data
country_field <- laureates %>%
filter(!is.na(nobelPrizes_affiliations_country_en),
!is.na(nobelPrizes_category_en)) %>%
mutate(country = nobelPrizes_affiliations_country_en,
field = nobelPrizes_category_en) %>%
distinct(id, field, country, nobelPrizes_awardYear) %>%
count(country, field, name = "n_laureates") %>%
arrange(desc(n_laureates))
# top countries
top_countries <- country_field %>%
group_by(country) %>%
summarise(total = sum(n_laureates)) %>%
slice_max(total, n = 10) %>%
pull(country)
# Plot sticking with stacked bars
country_field %>%
filter(country %in% top_countries) %>%
ggplot(aes(x = reorder(country, n_laureates), y = n_laureates, fill = field)) +
geom_col(position = "stack") +
coord_flip() +
labs(
title = "Top 10 Countries by Number of Nobel Laureates (by Field)",
x = "Country of Affiliation at Award",
y = "Number of Laureates",
fill = "Nobel Category"
) +
theme_minimal(base_size = 13) +
theme(
plot.title = element_text(face = "bold", size = 14),
axis.text.x = element_text(size = 10),
legend.position = "bottom"
)
The U.S. is leading by a substantial margin in each field.