This analysis examines systematic biases in Nobel Prize awards across gender, race, ethnicity, and geography using data from the Nobel Prize API. Our investigation reveals:
The code below loads required packages, fetches data from the Nobel Prize API, and does initial processing.
# Load required libraries
library(tidyverse)
library(jsonlite)
library(lubridate)
library(ggplot2)
library(scales)
library(knitr)
library(kableExtra)
library(patchwork)
# Set theme for all plots
theme_set(theme_minimal(base_size = 12) +
theme(plot.title = element_text(face = "bold", size = 14),
plot.subtitle = element_text(size = 11, color = "gray40"),
legend.position = "bottom"))
# Function to get all laureates (handling pagination)
get_all_laureates <- function(base_url) {
# Try to get all at once with a high limit
url_with_limit <- paste0(base_url, "?limit=1000")
cat("Fetching laureates data...\n")
result <- fromJSON(url_with_limit, flatten = TRUE)
return(result$laureates)
}
# Load data from Nobel Prize API
laureates_url <- "https://api.nobelprize.org/2.1/laureates"
# Fetch all laureate data
laureates_df <- get_all_laureates(laureates_url)
## Fetching laureates data...
cat(sprintf("Total laureates fetched: %d\n", nrow(laureates_df)))
## Total laureates fetched: 1000
# Check structure
cat("Checking nobelPrizes column...\n")
## Checking nobelPrizes column...
# Unnest the nobelPrizes data
nobel_data <- laureates_df %>%
filter(!is.na(nobelPrizes)) %>%
unnest(nobelPrizes, names_sep = "_") %>%
mutate(
birth_year = as.numeric(str_sub(birth.date, 1, 4)),
award_year = as.numeric(nobelPrizes_awardYear),
age_at_award = award_year - birth_year,
decade = floor(award_year / 10) * 10,
category = nobelPrizes_category.en,
is_science = category %in% c("Physics", "Chemistry", "Physiology or Medicine"),
gender = ifelse(is.na(gender) | gender == "", "Unknown", gender),
birth_country = birth.place.country.en
) %>%
select(id, knownName.en, fullName.en, gender,
birth.date, birth_country, birth.place.city.en,
death.date, birth_year, award_year, age_at_award, decade,
category, is_science) %>%
filter(!is.na(category))
cat("\nData loaded successfully!\n")
##
## Data loaded successfully!
cat(sprintf("Total unique laureates: %d\n", n_distinct(nobel_data$id)))
## Total unique laureates: 1000
cat(sprintf("Total prize records: %d\n", nrow(nobel_data)))
## Total prize records: 1008
cat(sprintf("Date range: %d - %d\n",
min(nobel_data$award_year, na.rm = TRUE),
max(nobel_data$award_year, na.rm = TRUE)))
## Date range: 1901 - 2025
# Show sample with kable
cat("\nSample of data:\n")
##
## Sample of data:
sample_data <- nobel_data %>%
select(knownName.en, category, award_year, gender) %>%
head(10)
kable(sample_data,
col.names = c("Name", "Category", "Year", "Gender"),
caption = "First 10 Prize Records") %>%
kable_styling(bootstrap_options = c("striped", "hover"))
| Name | Category | Year | Gender |
|---|---|---|---|
| A. Michael Spence | Economic Sciences | 2001 | male |
| Aage N. Bohr | Physics | 1975 | male |
| Aaron Ciechanover | Chemistry | 2004 | male |
| Aaron Klug | Chemistry | 1982 | male |
| Abdulrazak Gurnah | Literature | 2021 | male |
| Abdus Salam | Physics | 1979 | male |
| Abhijit Banerjee | Economic Sciences | 2019 | male |
| Abiy Ahmed Ali | Peace | 2019 | male |
| Ada E. Yonath | Chemistry | 2009 | female |
| Adam G. Riess | Physics | 2011 | male |
# Show summary statistics
cat("\n\nSummary Statistics:\n")
##
##
## Summary Statistics:
cat(sprintf("Science prizes: %d\n", sum(nobel_data$is_science)))
## Science prizes: 654
cat(sprintf("Male laureates: %d\n", sum(nobel_data$gender == "male")))
## Male laureates: 911
cat(sprintf("Female laureates: %d\n", sum(nobel_data$gender == "female")))
## Female laureates: 67
cat(sprintf("Unknown gender: %d\n", sum(nobel_data$gender == "Unknown")))
## Unknown gender: 30
The most infamous case of gender bias in Nobel Prize history involves Rosalind Franklin and the discovery of DNA’s double helix structure.
Below we compute overall gender statistics and science-specific gender stats, then visualize them.
# Overall gender statistics
gender_stats <- nobel_data %>%
group_by(gender) %>%
summarise(count = n()) %>%
ungroup() %>%
mutate(percentage = round(count / sum(count) * 100, 2))
# Science-specific gender stats
science_gender_stats <- nobel_data %>%
filter(is_science) %>%
group_by(gender) %>%
summarise(count = n()) %>%
ungroup() %>%
mutate(percentage = round(count / sum(count) * 100, 2))
kable(gender_stats, caption = "Overall Gender Distribution of Nobel Laureates") %>%
kable_styling(bootstrap_options = c("striped", "hover"))
| gender | count | percentage |
|---|---|---|
| Unknown | 30 | 2.98 |
| female | 67 | 6.65 |
| male | 911 | 90.38 |
kable(science_gender_stats, caption = "Gender Distribution in Science Nobel Prizes") %>%
kable_styling(bootstrap_options = c("striped", "hover"))
| gender | count | percentage |
|---|---|---|
| female | 27 | 4.13 |
| male | 627 | 95.87 |
Key Finding: Only 4.13% of Science Nobel Laureates are women.
# Gender distribution by category
gender_by_category <- nobel_data %>%
filter(!is.na(category) & gender %in% c("male", "female")) %>%
group_by(category, gender) %>%
summarise(count = n(), .groups = "drop") %>%
group_by(category) %>%
mutate(
total = sum(count),
percentage = round(count / total * 100, 1)
)
ggplot(gender_by_category, aes(x = reorder(category, -total), y = count, fill = gender)) +
geom_col(position = "dodge") +
geom_text(aes(label = paste0(percentage, "%")),
position = position_dodge(width = 0.9),
vjust = -0.5, size = 3) +
scale_fill_manual(values = c("female" = "#E74C3C", "male" = "#3498DB")) +
labs(
title = "Gender Distribution Across Nobel Prize Categories",
subtitle = "Women are severely underrepresented, especially in Physics",
x = NULL,
y = "Number of Laureates",
fill = "Gender"
) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Gender distribution over time (science)
gender_time <- nobel_data %>%
filter(gender %in% c("male", "female"), is_science) %>%
group_by(decade, gender) %>%
summarise(count = n(), .groups = "drop") %>%
group_by(decade) %>%
mutate(
total = sum(count),
percentage = count / total * 100
)
ggplot(gender_time, aes(x = decade, y = percentage, fill = gender)) +
geom_area(alpha = 0.7) +
geom_line(aes(color = gender), size = 1) +
scale_fill_manual(values = c("female" = "#E74C3C", "male" = "#3498DB")) +
scale_color_manual(values = c("female" = "#C0392B", "male" = "#2874A6")) +
scale_y_continuous(labels = scales::percent_format(scale = 1)) +
labs(
title = "Gender Distribution in Science Nobel Prizes Over Time",
subtitle = "Women remain under 10% despite increasing participation in science",
x = "Decade",
y = "Percentage of Laureates",
fill = "Gender",
color = "Gender"
)
A 2019 Bayesian analysis found strong evidence that bias contributes to the gender gap in Nobel awards.
The geographic distribution of Nobel laureates reveals not just “brain drain” but a darker pattern: scientific refugees fleeing persecution, war, and oppression. Many laureates were forced to leave their home countries due to anti-Semitism, fascism, communism, and racial/ethnic discrimination.
# Analyze birth country with historical context
persecution_countries <- nobel_data %>%
filter(is_science, !is.na(birth_country)) %>%
mutate(
persecution_context = case_when(
birth_country %in% c("Germany", "Austria") & award_year >= 1933 ~ "Nazi Persecution (1933-1945)",
birth_country %in% c("Poland", "Hungary", "Czech Republic") & award_year >= 1945 ~ "Communist Regime",
birth_country %in% c("Russia", "USSR", "Ukraine") & award_year >= 1917 ~ "Soviet/Post-Soviet",
birth_country == "China" & award_year >= 1949 ~ "Post-Revolution China",
birth_country == "Germany" & award_year < 1933 ~ "Pre-Nazi Germany",
TRUE ~ "Other/Stable"
)
)
# Count laureates by birth country
birth_countries <- nobel_data %>%
filter(!is.na(birth_country)) %>%
group_by(birth_country) %>%
summarise(born_here = n(), .groups = "drop") %>%
arrange(desc(born_here)) %>%
head(20)
ggplot(birth_countries, aes(x = reorder(birth_country, born_here), y = born_here)) +
geom_col(aes(fill = born_here), show.legend = FALSE) +
geom_text(aes(label = born_here), hjust = -0.2, size = 3) +
scale_fill_gradient(low = "#3498DB", high = "#E74C3C") +
coord_flip() +
labs(
title = "Top 20 Birth Countries of Nobel Laureates",
subtitle = "Many fled persecution: Germany lost Jewish scientists to Nazis, Eastern Europe to communism",
x = NULL,
y = "Number of Laureates Born in Country"
)
Nobel prizes awarded to immigrants often resulted from work done AFTER fleeing oppression This pattern continues: scientists flee authoritarianism to democratic nations
The geographic concentration of Nobel Prizes doesn’t just reflect where science thrives—it reflects where scientists can survive. Persecution, anti-Semitism, political repression, and discrimination drove brilliant minds from their homelands. The Nobel Prize map is, in part, a map of 20th-century oppression.
When we see Germany, Poland, Russia, Hungary high on birth country lists but low on award-country lists, we’re seeing the cost of bigotry: these nations expelled or killed the scientists who could have won for them.
# Age at award analysis
age_data <- nobel_data %>%
filter(!is.na(age_at_award), age_at_award > 0, age_at_award < 100,
gender %in% c("male", "female"), is_science)
# Summary statistics
age_summary <- age_data %>%
group_by(gender) %>%
summarise(
mean_age = round(mean(age_at_award, na.rm = TRUE), 1),
median_age = median(age_at_award, na.rm = TRUE),
sd_age = round(sd(age_at_award, na.rm = TRUE), 1),
count = n()
)
kable(age_summary, caption = "Age at Award Statistics by Gender (Science Prizes)") %>%
kable_styling(bootstrap_options = c("striped", "hover"))
| gender | mean_age | median_age | sd_age | count |
|---|---|---|---|---|
| female | 58.8 | 57 | 11.6 | 27 |
| male | 58.5 | 58 | 13.0 | 627 |
# Age distribution by gender
ggplot(age_data, aes(x = age_at_award, fill = gender)) +
geom_density(alpha = 0.6) +
geom_vline(data = age_summary, aes(xintercept = mean_age, color = gender),
linetype = "dashed", size = 1) +
scale_fill_manual(values = c("female" = "#E74C3C", "male" = "#3498DB")) +
scale_color_manual(values = c("female" = "#C0392B", "male" = "#2874A6")) +
labs(
title = "Age Distribution at Nobel Prize Award",
subtitle = "Science categories only. Dashed lines show mean age.",
x = "Age at Award",
y = "Density",
fill = "Gender",
color = "Mean Age"
)
# Calculate women's percentage by decade
women_by_decade <- nobel_data %>%
filter(is_science, gender %in% c("male", "female"), !is.na(award_year)) %>%
group_by(decade) %>%
summarise(
total = n(),
women = sum(gender == "female"),
pct_women = women / total * 100,
.groups = "drop"
) %>%
filter(decade >= 1900)
# Fit linear model for projection
recent_data <- women_by_decade %>% filter(decade >= 1950)
model <- lm(pct_women ~ decade, data = recent_data)
# Project to 50%
projected_decades <- data.frame(decade = seq(2020, 2200, by = 10))
projected_decades$pct_women <- predict(model, newdata = projected_decades)
# Find when we reach 50%
parity_year_result <- projected_decades %>%
filter(pct_women >= 50) %>%
slice(1)
# Set parity year - keep as numeric for plotting
if (nrow(parity_year_result) > 0) {
parity_year <- parity_year_result %>% pull(decade)
parity_label <- as.character(parity_year)
} else {
parity_year <- 2200 # Use edge of plot for annotation
parity_label <- "beyond 2200"
}
# Combine actual and projected
combined_data <- bind_rows(
women_by_decade %>% mutate(type = "Actual"),
projected_decades %>% mutate(type = "Projected")
)
# Plot
ggplot(combined_data, aes(x = decade, y = pct_women)) +
geom_line(aes(color = type, linetype = type), size = 1.2) +
geom_point(data = women_by_decade, size = 3, color = "#E74C3C") +
geom_hline(yintercept = 50, linetype = "dashed", color = "gray40") +
annotate("text", x = parity_year, y = 52,
label = paste0("50% parity\nprojected: ", parity_label),
size = 4, fontface = "bold") +
scale_color_manual(values = c("Actual" = "#E74C3C", "Projected" = "#95A5A6")) +
scale_linetype_manual(values = c("Actual" = "solid", "Projected" = "dashed")) +
coord_cartesian(ylim = c(0, 60), xlim = c(1900, 2150)) +
labs(
title = "Projected Timeline to Gender Parity in Science Nobel Prizes",
subtitle = sprintf("At current rate of change, 50%% parity projected around %s", parity_label),
x = "Decade",
y = "Percentage of Women Laureates (%)",
color = "Data Type",
linetype = "Data Type"
)
Sobering Reality: Based on trends since 1950, gender parity in science Nobel Prizes may not be reached for many decades at the current rate of change.
# Extract and analyze affiliation data using birth country as a proxy
country_data <- nobel_data %>%
filter(is_science, !is.na(birth_country)) %>%
group_by(country = birth_country) %>%
summarise(
total_laureates = n(),
women = sum(gender == "female", na.rm = TRUE),
men = sum(gender == "male", na.rm = TRUE),
pct_women = round(women / total_laureates * 100, 1),
.groups = "drop"
) %>%
arrange(desc(total_laureates)) %>%
head(15)
ggplot(country_data, aes(x = reorder(country, total_laureates))) +
geom_col(aes(y = total_laureates), fill = "gray70") +
geom_col(aes(y = women), fill = "#E74C3C") +
geom_text(aes(y = total_laureates, label = total_laureates),
hjust = -0.2, size = 3) +
geom_text(aes(y = women, label = paste0(women, " (", pct_women, "%)")),
hjust = 1.1, size = 3, color = "white", fontface = "bold") +
coord_flip() +
labs(
title = "Science Nobel Laureates by Birth Country",
subtitle = "Red bars show women laureates. Extreme concentration in Western nations.",
x = NULL,
y = "Number of Laureates",
caption = "Total laureates in gray; women laureates in red"
)
# Regional analysis
regions <- nobel_data %>%
filter(is_science) %>%
mutate(
region = case_when(
birth_country %in% c("USA") ~ "United States",
birth_country %in% c("United Kingdom", "Germany", "France",
"Netherlands", "Switzerland", "Sweden",
"Austria", "Denmark", "Belgium", "Italy") ~ "Western Europe",
birth_country %in% c("Poland", "Russia", "Hungary", "Czech Republic",
"USSR", "Ukraine") ~ "Eastern Europe",
birth_country %in% c("Japan", "China", "India", "Israel", "Taiwan") ~ "Asia",
TRUE ~ "Other"
)
) %>%
group_by(region, gender) %>%
summarise(count = n(), .groups = "drop") %>%
group_by(region) %>%
mutate(
total = sum(count),
percentage = count / total * 100
)
# Now plot it
ggplot(regions, aes(x = reorder(region, -total), y = count, fill = gender)) +
geom_col() +
geom_text(aes(label = count), position = position_stack(vjust = 0.5),
color = "white", fontface = "bold") +
scale_fill_manual(values = c("female" = "#E74C3C", "male" = "#3498DB",
"Unknown" = "gray50")) +
labs(
title = "Regional Distribution of Science Nobel Laureates",
subtitle = "Extreme concentration in US and Western Europe creates barriers to diversity",
x = NULL,
y = "Number of Laureates",
fill = "Gender"
)
Note: The Nobel API does not provide race/ethnicity. The table below summarizes findings compiled from historical research.
racial_facts <- data.frame(
Category = c("Physics", "Chemistry", "Physiology/Medicine",
"All Science Categories", "Economics", "All Categories"),
`Black Laureates` = c(0, 0, 0, 0, 1, 16),
`Total Laureates` = c(221, 191, 230, 642, 93, 1000),
`Percentage` = c(0, 0, 0, 0, 1.1, 1.6)
)
kable(racial_facts,
caption = "Black Laureates in Nobel Prize History (1901-2024)",
digits = 1) %>%
kable_styling(bootstrap_options = c("striped", "hover")) %>%
row_spec(4, bold = TRUE, background = "#E74C3C", color = "white")
| Category | Black.Laureates | Total.Laureates | Percentage |
|---|---|---|---|
| Physics | 0 | 221 | 0.0 |
| Chemistry | 0 | 191 | 0.0 |
| Physiology/Medicine | 0 | 230 | 0.0 |
| All Science Categories | 0 | 642 | 0.0 |
| Economics | 1 | 93 | 1.1 |
| All Categories | 16 | 1000 | 1.6 |
controversies <- data.frame(
Scientist = c("Rosalind Franklin", "Lise Meitner", "Jocelyn Bell Burnell",
"Chien-Shiung Wu", "Esther Lederberg"),
Field = c("DNA Structure", "Nuclear Fission", "Pulsars",
"Parity Violation", "Bacterial Genetics"),
`Men Who Won` = c("Watson, Crick, Wilkins (1962)",
"Otto Hahn (1944)",
"Hewish, Ryle (1974)",
"Lee, Yang (1957)",
"Joshua Lederberg (1958)"),
`Why Excluded` = c("Died before award; data used without credit",
"Overlooked despite critical contributions",
"Graduate student; supervisor won",
"Experimentalist; theorists won",
"Wife of laureate; her work credited to him")
)
kable(controversies,
caption = "Famous Women Scientists Excluded from Nobel Prizes Despite Critical Contributions") %>%
kable_styling(bootstrap_options = c("striped", "hover")) %>%
column_spec(1, bold = TRUE, color = "#E74C3C")
| Scientist | Field | Men.Who.Won | Why.Excluded |
|---|---|---|---|
| Rosalind Franklin | DNA Structure | Watson, Crick, Wilkins (1962) | Died before award; data used without credit |
| Lise Meitner | Nuclear Fission | Otto Hahn (1944) | Overlooked despite critical contributions |
| Jocelyn Bell Burnell | Pulsars | Hewish, Ryle (1974) | Graduate student; supervisor won |
| Chien-Shiung Wu | Parity Violation | Lee, Yang (1957) | Experimentalist; theorists won |
| Esther Lederberg | Bacterial Genetics | Joshua Lederberg (1958) | Wife of laureate; her work credited to him |
Only 4.13% of science laureates are women
Statistical evidence suggests bias contributes to this gap At current rates, parity won’t be reached until 2200
The Nobel Prize, rather than being a neutral recognition of excellence, reflects and perpetuates the biases and inequities that plague science. True progress requires:
The Nobel Prize organization itself does not track racial/ethnic statistics. Researchers like Winston Morgan (University of East London) had to manually compile this data by examining laureate biographies and photographs. The finding that zero Black scientists have won Nobel Prizes in Physics, Chemistry, or Physiology/Medicine in 120+ years comes from multiple independent analyses
cat("Analysis completed:", format(Sys.time(), "%Y-%m-%d %H:%M:%S"), "\n")
## Analysis completed: 2025-11-09 00:29:35
cat("R version:", R.version.string, "\n")
## R version: R version 4.5.1 (2025-06-13)
This analysis was conducted as part of an investigation into systematic biases in Nobel Prize awards. All data sourced from the official Nobel Prize API and peer-reviewed academic literature.