1 Import libraries and packages

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter()  masks stats::filter()
## ✖ purrr::flatten() masks jsonlite::flatten()
## ✖ dplyr::lag()     masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

2 Fetch data from API

API_URL <- "https://api.nobelprize.org/2.1/laureates"

# Initialize
all_laureates <- list()
offset <- 0
limit <- 100  # Fetch 100 at a time

repeat {
  # Build URL with pagination
  url <- paste0(API_URL, "?offset=", offset, "&limit=", limit)
  
  response <- GET(url)
  
  if (status_code(response) != 200) {
    stop(paste("API Error: Status code", status_code(response)))
  }
  
  data <- content(response, as = "text", encoding = "UTF-8")
  laureates_json <- fromJSON(data, flatten = TRUE)
  
  # Get laureates from this batch
  batch <- laureates_json$laureates
  
  if (is.null(batch) || nrow(batch) == 0) {
    break  # No more data
  }
  
  all_laureates[[length(all_laureates) + 1]] <- batch
  
  print(paste("Fetched", nrow(batch), "laureates. Total so far:", 
              sum(sapply(all_laureates, nrow))))
  
  # Check if we got less than limit (last page)
  if (nrow(batch) < limit) {
    break
  }
  
  offset <- offset + limit
}
## [1] "Fetched 100 laureates. Total so far: 100"
## [1] "Fetched 100 laureates. Total so far: 200"
## [1] "Fetched 100 laureates. Total so far: 300"
## [1] "Fetched 100 laureates. Total so far: 400"
## [1] "Fetched 100 laureates. Total so far: 500"
## [1] "Fetched 100 laureates. Total so far: 600"
## [1] "Fetched 100 laureates. Total so far: 700"
## [1] "Fetched 100 laureates. Total so far: 800"
## [1] "Fetched 100 laureates. Total so far: 900"
## [1] "Fetched 100 laureates. Total so far: 1000"
## [1] "Fetched 18 laureates. Total so far: 1018"
# Combine all batches
laureates <- bind_rows(all_laureates)

print(paste(" Total laureates fetched:", nrow(laureates)))
## [1] " Total laureates fetched: 1018"

3 Data exploration

# View structure
#glimpse(laureates)

# Check column names
colnames(laureates)
##   [1] "id"                               "fileName"                        
##   [3] "gender"                           "sameAs"                          
##   [5] "links"                            "nobelPrizes"                     
##   [7] "acronym"                          "nativeName"                      
##   [9] "penName"                          "knownName.en"                    
##  [11] "knownName.se"                     "knownName.no"                    
##  [13] "givenName.en"                     "givenName.se"                    
##  [15] "givenName.no"                     "familyName.en"                   
##  [17] "familyName.se"                    "familyName.no"                   
##  [19] "fullName.en"                      "fullName.se"                     
##  [21] "fullName.no"                      "birth.date"                      
##  [23] "birth.place.city.en"              "birth.place.city.no"             
##  [25] "birth.place.city.se"              "birth.place.country.en"          
##  [27] "birth.place.country.no"           "birth.place.country.se"          
##  [29] "birth.place.cityNow.en"           "birth.place.cityNow.no"          
##  [31] "birth.place.cityNow.se"           "birth.place.cityNow.sameAs"      
##  [33] "birth.place.cityNow.latitude"     "birth.place.cityNow.longitude"   
##  [35] "birth.place.countryNow.en"        "birth.place.countryNow.no"       
##  [37] "birth.place.countryNow.se"        "birth.place.countryNow.sameAs"   
##  [39] "birth.place.countryNow.latitude"  "birth.place.countryNow.longitude"
##  [41] "birth.place.continent.en"         "birth.place.continent.no"        
##  [43] "birth.place.continent.se"         "birth.place.locationString.en"   
##  [45] "birth.place.locationString.no"    "birth.place.locationString.se"   
##  [47] "wikipedia.slug"                   "wikipedia.english"               
##  [49] "wikidata.id"                      "wikidata.url"                    
##  [51] "death.date"                       "death.place.city.en"             
##  [53] "death.place.city.no"              "death.place.city.se"             
##  [55] "death.place.country.en"           "death.place.country.no"          
##  [57] "death.place.country.se"           "death.place.country.sameAs"      
##  [59] "death.place.cityNow.en"           "death.place.cityNow.no"          
##  [61] "death.place.cityNow.se"           "death.place.cityNow.sameAs"      
##  [63] "death.place.cityNow.latitude"     "death.place.cityNow.longitude"   
##  [65] "death.place.countryNow.en"        "death.place.countryNow.no"       
##  [67] "death.place.countryNow.se"        "death.place.countryNow.sameAs"   
##  [69] "death.place.countryNow.latitude"  "death.place.countryNow.longitude"
##  [71] "death.place.continent.en"         "death.place.continent.no"        
##  [73] "death.place.continent.se"         "death.place.locationString.en"   
##  [75] "death.place.locationString.no"    "death.place.locationString.se"   
##  [77] "orgName.en"                       "orgName.no"                      
##  [79] "orgName.se"                       "founded.date"                    
##  [81] "founded.place.city.en"            "founded.place.city.no"           
##  [83] "founded.place.city.se"            "founded.place.country.en"        
##  [85] "founded.place.country.no"         "founded.place.country.se"        
##  [87] "founded.place.country.sameAs"     "founded.place.cityNow.en"        
##  [89] "founded.place.cityNow.no"         "founded.place.cityNow.se"        
##  [91] "founded.place.cityNow.sameAs"     "founded.place.countryNow.en"     
##  [93] "founded.place.countryNow.no"      "founded.place.countryNow.se"     
##  [95] "founded.place.countryNow.sameAs"  "founded.place.continent.en"      
##  [97] "founded.place.continent.no"       "founded.place.continent.se"      
##  [99] "founded.place.locationString.en"  "founded.place.locationString.no" 
## [101] "founded.place.locationString.se"  "penNameOf.fullName"              
## [103] "foundedCountry.en"                "foundedCountry.no"               
## [105] "foundedCountry.se"                "foundedCountryNow.en"            
## [107] "foundedCountryNow.no"             "foundedCountryNow.se"            
## [109] "foundedContinent.en"
# View first laureate's name
laureates$knownName.en[1]
## [1] "A. Michael Spence"
# Nobel prizes are nested - let's look at one
laureates$nobelPrizes[[1]]
##   awardYear sortOrder portion dateAwarded prizeStatus prizeAmount
## 1      2001         2     1/3  2001-10-10    received    10000000
##   prizeAmountAdjusted
## 1            15547541
##                                                                                                                                                                                                                                                                                                                                                                                                                                                 affiliations
## 1 Stanford University, Stanford University, Stanford University, Stanford University, Stanford, CA, Stanford, CA, Stanford, CA, USA, USA, USA, Stanford, CA, Stanford, CA, Stanford, CA, https://www.wikidata.org/wiki/Q173813, https://www.wikipedia.org/wiki/Stanford,_California, 37.424734, -122.163858, USA, USA, USA, https://www.wikidata.org/wiki/Q30, 39.828175, -98.579500, North America, Stanford, CA, USA, Stanford, CA, USA, Stanford, CA, USA
##                                                                                                                                                                                                                                                                                                                                                                                                                               links
## 1 nobelPrize, external, external, https://api.nobelprize.org/2/nobelPrize/eco/2001, https://www.nobelprize.org/prizes/economic-sciences/2001/spence/facts/, https://www.nobelprize.org/prizes/economic-sciences/2001/summary/, GET, GET, GET, application/json, text/html, text/html, NA, A. Michael Spence - Facts, The Sveriges Riksbank Prize in Economic Sciences in Memory of Alfred Nobel 2001, laureate facts, prize summary
##         category.en category.no category.se
## 1 Economic Sciences     Økonomi     Ekonomi
##                                                          categoryFullName.en
## 1 The Sveriges Riksbank Prize in Economic Sciences in Memory of Alfred Nobel
##                                                       categoryFullName.no
## 1 Sveriges Riksbanks pris i økonomisk vitenskap til minne om Alfred Nobel
##                                                      categoryFullName.se
## 1 Sveriges Riksbanks pris i ekonomisk vetenskap till Alfred Nobels minne
##                                               motivation.en
## 1 for their analyses of markets with asymmetric information
##                                                motivation.se
## 1 för deras analys av marknader med assymetrisk informations

4 Analytical questions for the Nobel Peace Prize Portfolio

  • How has the distribution of Nobel Prize winners shifted across continents over the past 50 years?
  • How has the Median Age of Laureates Changed in the Past Fifty Years?
  • What percentage of prizes are shared among 2-3 winners versus awarded to individuals?
  • Which universities and research institutions have produced the most Nobel laureates in the last 30 years?
  • Who were the youngest Nobel Prize winners from each major geographic region at the time of their award?
  • Which Nobel Prize categories have awarded the most prize?

5 Question 1: Continental distribution of Nobel Prize winners (1975-2025)

## [1] "Total laureate-prize records: 1026"
## [1] "Records with missing continent: 23"
## [1] "Continental Distribution by Decade:"
## # A tibble: 26 × 4
##    decade    birth_continent count percentage
##    <chr>     <chr>           <int>      <dbl>
##  1 1975-1984 Europe             49      49.5 
##  2 1975-1984 North America      39      39.4 
##  3 1975-1984 South America       4       4.04
##  4 1975-1984 Africa              3       3.03
##  5 1975-1984 Asia                3       3.03
##  6 1975-1984 Oceania             1       1.01
##  7 1985-1994 North America      46      47.9 
##  8 1985-1994 Europe             35      36.5 
##  9 1985-1994 Asia                8       8.33
## 10 1985-1994 Africa              7       7.29
## # ℹ 16 more rows

6 Continental distribution summary table

## [1] "\nSummary Table (Wide Format):"
## # A tibble: 6 × 6
##   birth_continent `1975-1984` `1985-1994` `1995-2004` `2005-2014` `2015-2024`
##   <chr>                 <int>       <int>       <int>       <int>       <int>
## 1 Africa                    3           7           6           5           3
## 2 Asia                      3           8          11          23          19
## 3 Europe                   49          35          40          34          57
## 4 North America            39          46          55          48          46
## 5 Oceania                   1           0           4           3           0
## 6 South America             4           0           0           1           2

7 Continental distribution stacked bar chart

8 Continental winners trend line

viz3 <- ggplot(continental_distribution, 
               aes(x = decade, y = percentage, color = birth_continent, 
                   group = birth_continent)) +
  geom_line(linewidth = 1.2, alpha = 0.9) +
  geom_point(size = 3, alpha = 0.9) +
  scale_color_manual(values = pastel_colors) +
  labs(
    title = "Continental Trends in Nobel Prize Winners (1975-2024)",
    subtitle = "Asia shows strongest upward trajectory while Europe declines proportionally",
    x = "Decade",
    y = "Percentage of Laureates (%)",
    color = "Continent of Birth",
    caption = "Source: Nobel Prize API"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(size = 15, face = "bold", hjust = 0),
    plot.subtitle = element_text(size = 11, color = "gray40", hjust = 0),
    plot.caption = element_text(size = 9, color = "gray50", hjust = 1),
    axis.text.x = element_text(angle = 45, hjust = 1, size = 10),
    legend.position = "right",
    legend.title = element_text(size = 11, face = "bold"),
    panel.grid.minor = element_blank()
  )

print(viz3)

# Save the plot
ggsave("nobel_continental_trend_lines.png", viz3, width = 12, height = 7, dpi = 300)

9 Analysis of the distribition of Nobel Prize wnners across continents over the past 50 years

  • Europe’s Decline: Europe’s share decreased from 49.5% (1975-1984) to approximately 35-40% in recent decades, showing a relative decline in dominance.
  • Asia’s Rise: Asia’s representation increased dramatically from just 3% (1975-1984) to over 18% (2015-2024), representing a more than 6-fold increase.
  • North America’s Stability: North America maintained relatively stable representation at 39-48% across all decades, showing consistent scientific output.
  • Underrepresented Regions: Africa, South America, and Oceania continue to have minimal representation (under 5% combined), indicating persistent geographic inequality in Nobel
    Prize recognition.

10 Question 2: How has the median age of laureates changed in the past 50 years?

### Median age at award by category and decade

# Use the more precise prize date when available; otherwise default to Dec 10 of the award year
age_by_category_decade <- laureates_expanded %>%
  filter(!is.na(nobelPrizes_awardYear)) %>%
  mutate(
    award_year = as.integer(nobelPrizes_awardYear),
    # Prefer exact date if present (e.g., "YYYY-MM-DD"); else assume Nobel ceremony date (Dec 10)
    award_date_exact = suppressWarnings(lubridate::ymd(nobelPrizes_dateAwarded)),
    award_date_fallback = lubridate::ymd(paste0(award_year, "-12-10")),
    award_date = dplyr::coalesce(award_date_exact, award_date_fallback),
    birth_date = suppressWarnings(lubridate::ymd(birth.date)),
    age_at_award = as.numeric(lubridate::interval(birth_date, award_date) / lubridate::years(1)),
    category = nobelPrizes_category.en,
    decade = dplyr::case_when(
      award_year >= 1975 & award_year < 1985 ~ "1975-1984",
      award_year >= 1985 & award_year < 1995 ~ "1985-1994",
      award_year >= 1995 & award_year < 2005 ~ "1995-2004",
      award_year >= 2005 & award_year < 2015 ~ "2005-2014",
      award_year >= 2015 & award_year <= 2025 ~ "2015-2024",
      TRUE ~ NA_character_
    )
  ) %>%
  # Keep sensible records
  filter(!is.na(birth_date), !is.na(award_date), !is.na(age_at_award),
         age_at_award > 0, !is.na(category), !is.na(decade))

11 Median age by nobel prize category

# Summary table: median age by category & decade (+ count for context)
median_age_tbl <- age_by_category_decade %>%
  group_by(category, decade) %>%
  summarise(
    median_age = round(stats::median(age_at_award, na.rm = TRUE), 1),
    n = dplyr::n(),
    .groups = "drop"
  ) %>%
  arrange(category, decade)

print(median_age_tbl)
## # A tibble: 30 × 4
##    category          decade    median_age     n
##    <chr>             <chr>          <dbl> <int>
##  1 Chemistry         1975-1984       60.7    15
##  2 Chemistry         1985-1994       57.7    19
##  3 Chemistry         1995-2004       63.6    26
##  4 Chemistry         2005-2014       69.3    22
##  5 Chemistry         2015-2024       71.7    25
##  6 Economic Sciences 1975-1984       64.7    13
##  7 Economic Sciences 1985-1994       67      15
##  8 Economic Sciences 1995-2004       61.3    17
##  9 Economic Sciences 2005-2014       69.7    20
## 10 Economic Sciences 2015-2024       67.5    19
## # ℹ 20 more rows
# Optional: wide-format table (categories as rows, decades as columns)
median_age_wide <- median_age_tbl %>%
  select(category, decade, median_age) %>%
  tidyr::pivot_wider(names_from = decade, values_from = median_age) %>%
  arrange(category)

print(median_age_wide)
## # A tibble: 6 × 6
##   category           `1975-1984` `1985-1994` `1995-2004` `2005-2014` `2015-2024`
##   <chr>                    <dbl>       <dbl>       <dbl>       <dbl>       <dbl>
## 1 Chemistry                 60.7        57.7        63.6        69.3        71.7
## 2 Economic Sciences         64.7        67          61.3        69.7        67.5
## 3 Literature                73.2        65.3        70.3        71.9        69.6
## 4 Peace                     54.4        58          61.7        59.5        59  
## 5 Physics                   53.8        61          64.1        68.5        77.1
## 6 Physiology or Med…        55.6        58.8        60.5        62.9        70.2

12 Visualization of faceted lines of median age by category across decades

# Visualization: faceted lines by category across decades
viz_median_age <- ggplot(median_age_tbl,
                         aes(x = decade, y = median_age, group = category)) +
  geom_line(linewidth = 1.1) +
  geom_point(size = 3) +
  labs(
    title = "Median Age at Nobel Prize Award by Category and Decade",
    subtitle = "Based on Nobel Prize API data; exact award date used when available",
    x = "Decade",
    y = "Median age (years)",
    caption = "Source: Nobel Prize API (laureates endpoint)"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(size = 15, face = "bold", hjust = 0),
    plot.subtitle = element_text(size = 11, color = "gray40", hjust = 0),
    axis.text.x = element_text(angle = 45, hjust = 1),
    panel.grid.minor = element_blank()
  ) +
  facet_wrap(~ category, scales = "free_y")

print(viz_median_age)

ggsave("median_age_by_category_decade.png", viz_median_age,
       width = 12, height = 8, dpi = 300)

13 Analysis of the median age of laureates in the past fifty 50 years?

  • Scientific categories show pronounced upward trajectories, with Physics experiencing the steepest increase. This suggests growing complexity in scientific validation and the
    extended time required for breakthrough discoveries to gain consensus recognition.
  • Literature & Economics: Moderate increases with some volatility, suggesting field-specific recognition dynamics
  • The data suggests that peak scientific recognition now occurs 10-20 years later than in the 1970s,

14 Question 3: What percentage of prizes are shared among 2-3 winners versus awarded to individuals?

## ANALYSIS: How often are prizes shared vs. individual?

# Build prize-level rows by counting distinct laureates per prize
prize_level <- laureates_expanded %>%
  mutate(
    award_year = as.integer(nobelPrizes_awardYear),
    category   = nobelPrizes_category.en
  ) %>%
  filter(!is.na(award_year), award_year >= 1975, award_year <= 2025,
         !is.na(category), !is.na(id)) %>%
  group_by(award_year, category) %>%
  summarise(
    recipients = n_distinct(id),  # number of laureates attached to that prize
    .groups = "drop"
  ) %>%
  mutate(
    share_type = case_when(
      recipients == 1 ~ "Individual (1)",
      recipients %in% 2:3 ~ "Shared (2–3)",
      recipients >= 4 ~ "Shared (4+)"
    ),
    decade = case_when(
      award_year >= 1975 & award_year < 1985 ~ "1975-1984",
      award_year >= 1985 & award_year < 1995 ~ "1985-1994",
      award_year >= 1995 & award_year < 2005 ~ "1995-2004",
      award_year >= 2005 & award_year < 2015 ~ "2005-2014",
      award_year >= 2015 & award_year <= 2025 ~ "2015-2024",
      TRUE ~ NA_character_
    )
  ) %>%
  filter(!is.na(decade))

# Overall percentages (1975–2024)
sharing_overall <- prize_level %>%
  count(share_type, name = "prize_count") %>%
  mutate(percentage = round(100 * prize_count / sum(prize_count), 1)) %>%
  arrange(desc(percentage))

print(sharing_overall)
## # A tibble: 2 × 3
##   share_type     prize_count percentage
##   <chr>                <int>      <dbl>
## 1 Shared (2–3)           174       56.9
## 2 Individual (1)         132       43.1
# Percentages by decade
sharing_by_decade <- prize_level %>%
  count(decade, share_type, name = "prize_count") %>%
  group_by(decade) %>%
  mutate(percentage = 100 * prize_count / sum(prize_count)) %>%
  ungroup()

15 Data visualization showing the percentage of prizes are shared among 2-3 winners versus awarded to individuals?

library(ggplot2)
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
viz_share_decade <- ggplot(sharing_by_decade,
                           aes(x = decade, y = percentage, fill = share_type)) +
  geom_col(width = 0.72) +
  scale_y_continuous(labels = label_percent(scale = 1), expand = expansion(mult = c(0, 0.02))) +
  labs(
    title = "How Often Nobel Prizes Are Shared vs. Individual (1975–2024)",
    subtitle = "Share of prizes per decade",
    x = "Decade", y = "Percentage of prizes",
    fill = "Prize type"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold", size = 15, hjust = 0),
    plot.subtitle = element_text(color = "gray40", hjust = 0),
    axis.text.x = element_text(angle = 45, hjust = 1),
    panel.grid.minor = element_blank(),
    legend.position = "right"
  )

print(viz_share_decade)

ggsave("nobel_prize_sharing_by_decade.png", viz_share_decade, width = 11, height = 6.5, dpi = 300)

16 Analysis of the distribution of prizes shared among 2-3 winners versus awarded to Individuals

  • Shared prizes begin to outnumber individual prizes beginning in 1995 and the distribution of shared prizes continued to outnumber the amount of individual prizes awarded.

17 Question 4: Which universities and research institutions have produced the most nobel laureates in the last 50 years?

# ----------------------------------------------------------------------------
# IMPORTANT NOTE ABOUT METHODOLOGY
# ----------------------------------------------------------------------------
# This analysis counts institutional affiliations at time of award (where 
# laureates were working when they received the prize), NOT where they:
# - Earned their PhD
# - Conducted their prize-winning research
# - Were educated
# 
# Different methodologies will produce different rankings. Internet sources
# may use different criteria (e.g., counting by PhD institution or research 
# institution rather than affiliation at award time).
# ----------------------------------------------------------------------------

library(tidyverse)

# ----------------------------------------------------------------------------
# Extract Institution Data
# ----------------------------------------------------------------------------

# Get institutions from affiliations
institutions_simple <- laureates_expanded %>%
  filter(!is.na(nobelPrizes_awardYear)) %>%
  mutate(award_year = as.numeric(nobelPrizes_awardYear)) %>%
  filter(award_year >= 1975 & award_year <= 2025) %>%
  unnest(nobelPrizes_affiliations, names_sep = "_", keep_empty = FALSE) %>%
  filter(!is.na(nobelPrizes_affiliations_name.en)) %>%
  select(
    laureate_id = id,
    award_year,
    institution = nobelPrizes_affiliations_name.en,
    country = nobelPrizes_affiliations_country.en
  )

# Count laureates per institution
top_institutions <- institutions_simple %>%
  group_by(institution, country) %>%
  summarise(laureates = n_distinct(laureate_id), .groups = "drop") %>%
  arrange(desc(laureates)) %>%
  head(15)

print("Top 15 Institutions:")
## [1] "Top 15 Institutions:"
print(top_institutions)
## # A tibble: 15 × 3
##    institution                                  country        laureates
##    <chr>                                        <chr>              <int>
##  1 University of California                     USA                   32
##  2 Massachusetts Institute of Technology (MIT)  USA                   22
##  3 Stanford University                          USA                   17
##  4 University of Chicago                        USA                   17
##  5 Howard Hughes Medical Institute              USA                   16
##  6 Princeton University                         USA                   16
##  7 Harvard University                           USA                   14
##  8 Columbia University                          USA                   11
##  9 California Institute of Technology (Caltech) USA                   10
## 10 Yale University                              USA                    8
## 11 MRC Laboratory of Molecular Biology          United Kingdom         7
## 12 Rockefeller University                       USA                    7
## 13 Harvard Medical School                       USA                    6
## 14 University of Cambridge                      United Kingdom         6
## 15 Bell Laboratories                            USA                    5
# ----------------------------------------------------------------------------
# Bar chart
# ----------------------------------------------------------------------------

viz_simple <- ggplot(top_institutions, 
                     aes(x = reorder(institution, laureates),  
                         y = laureates)) +
  geom_col(fill = "#FF5F1F", alpha = 0.85, width = 0.7) +
  geom_text(aes(label = laureates), 
            hjust = 1.2,
            size = 4, 
            fontface = "bold",
            color = "navy") +
  coord_flip() +
  labs(
    title = "Top 15 Institutions \n Producing Nobel Laureates\n (1975-2024)",
    x = NULL,
    y = "Number of Nobel Laureates",
    caption = "Source: Nobel Prize API"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(size = 15, face = "bold"),
    axis.text.y = element_text(size = 10, face = "bold"),  # Smaller text for long names
    panel.grid.major.y = element_blank(),
    plot.margin = margin(10, 10, 10, 80)  # Extra left margin for long names
  )

print(viz_simple)

ggsave("nobel_institutions_simple.png", viz_simple, 
       width = 14, height = 8, dpi = 300)  # Wider to accommodate long names

# Quick summary
cat("\nQuick Summary:\n")
## 
## Quick Summary:
cat("Top Institution:", top_institutions$institution[1], 
    "with", top_institutions$laureates[1], "laureates\n")
## Top Institution: University of California with 32 laureates
cat("US institutions in top 15:", sum(top_institutions$country == "USA", na.rm = TRUE), "\n")
## US institutions in top 15: 13

18 Question 5: Who are the youngest nobel prize winners from each major geographic region at the time of their award?

library(tidyverse)

# ============================================================================
# Question 5: Youngest Nobel Prize Winners by Region (1985-2025)
# ============================================================================

# Calculate age at award
youngest_by_region <- nobel_geographic %>%
  filter(award_year >= 1985 & award_year <= 2025) %>%
  filter(!is.na(birth_continent) & !is.na(birth_country)) %>%
  left_join(
    laureates %>% select(id, birth.date),
    by = c("laureate_id" = "id")
  ) %>%
  mutate(
    birth_year = as.numeric(str_sub(birth.date, 1, 4)),
    age_at_award = award_year - birth_year
  ) %>%
  filter(!is.na(age_at_award) & age_at_award > 0) %>%
  group_by(birth_continent) %>%
  arrange(age_at_award) %>%
  slice(1) %>%
  ungroup() %>%
  arrange(age_at_award) %>%
  select(name, birth_continent, birth_country, category, award_year, age_at_award)

# Print results
cat("\n", strrep("=", 80), "\n")
## 
##  ================================================================================
cat("YOUNGEST NOBEL PRIZE WINNERS BY REGION AT TIME OF AWARD (1985-2025)\n")
## YOUNGEST NOBEL PRIZE WINNERS BY REGION AT TIME OF AWARD (1985-2025)
cat(strrep("=", 80), "\n\n")
## ================================================================================
for(i in 1:nrow(youngest_by_region)) {
  cat(sprintf("%s: %s (%s) - Age %d - %s %d\n",
              youngest_by_region$birth_continent[i],
              youngest_by_region$name[i],
              youngest_by_region$birth_country[i],
              youngest_by_region$age_at_award[i],
              youngest_by_region$category[i],
              youngest_by_region$award_year[i]))
}
## Asia: Malala Yousafzai (Pakistan) - Age 17 - Peace 2014
## North America: Rigoberta Menchú Tum (Guatemala) - Age 33 - Peace 1992
## Europe: Konstantin Novoselov (Russia) - Age 36 - Physics 2010
## Africa: Leymah Gbowee (Liberia) - Age 39 - Peace 2011
## Oceania: José Ramos-Horta (East Timor) - Age 47 - Peace 1996
## South America: Maria Corina Machado (Venezuela) - Age 58 - Peace 2025
cat("\n", strrep("=", 80), "\n")
## 
##  ================================================================================
print(youngest_by_region)
## # A tibble: 6 × 6
##   name            birth_continent birth_country category award_year age_at_award
##   <chr>           <chr>           <chr>         <chr>         <dbl>        <dbl>
## 1 Malala Yousafz… Asia            Pakistan      Peace          2014           17
## 2 Rigoberta Menc… North America   Guatemala     Peace          1992           33
## 3 Konstantin Nov… Europe          Russia        Physics        2010           36
## 4 Leymah Gbowee   Africa          Liberia       Peace          2011           39
## 5 José Ramos-Hor… Oceania         East Timor    Peace          1996           47
## 6 Maria Corina M… South America   Venezuela     Peace          2025           58

19 Analysis of the Results of the youngest Nobel Prize winners from each major geographic region at the time of their award?

  • Among the youngest laureates from each continent, Malala Yousafzai from Pakistan (Asia) stands out as the youngest overall at just 17 years old when she received the Peace Prize in 2014, making her an extraordinary outlier in Nobel Prize history.
  • The Peace Prize dominates this list, with five of the six youngest regional winners awarded in this category, reflecting that humanitarian achievements can be recognized earlier in life compared to scientific contributions that typically require decades of research.
  • Age disparities across regions are notable: while Asia’s youngest winner was 17, South America’s youngest laureate, María Corina Machado from Venezuela, was 58 years old when
    awarded the Peace Prize in 2025—highlighting significant variation in the age at which different regions produce Nobel-recognized contributions.

20 Which Nobel Prize Categories has Awarded the Most OPrizes?

library(tidyverse)


# Count prizes by category and decade
category_by_decade <- nobel_geographic %>%
  filter(award_year >= 1975 & award_year <= 2025) %>%
  filter(!is.na(category)) %>%
  mutate(
    decade = case_when(
      award_year >= 1975 & award_year < 1985 ~ "1975-1984",
      award_year >= 1985 & award_year < 1995 ~ "1985-1994",
      award_year >= 1995 & award_year < 2005 ~ "1995-2004",
      award_year >= 2005 & award_year < 2015 ~ "2005-2014",
      award_year >= 2015 & award_year <= 2025 ~ "2015-2024",
      TRUE ~ "Other"
    )
  ) %>%
  filter(decade != "Other") %>%
  group_by(decade, category) %>%
  summarise(prizes = n(), .groups = "drop")

# Overall totals by category
category_totals <- category_by_decade %>%
  group_by(category) %>%
  summarise(total_prizes = sum(prizes), .groups = "drop") %>%
  arrange(desc(total_prizes))

# Print results
cat("\n", strrep("=", 80), "\n")
## 
##  ================================================================================
cat("NOBEL PRIZES BY CATEGORY (1975-2025)\n")
## NOBEL PRIZES BY CATEGORY (1975-2025)
cat(strrep("=", 80), "\n\n")
## ================================================================================
cat("Overall Totals:\n")
## Overall Totals:
for(i in 1:nrow(category_totals)) {
  cat(sprintf("%d. %s: %d prizes\n",
              i,
              category_totals$category[i],
              category_totals$total_prizes[i]))
}
## 1. Physics: 129 prizes
## 2. Physiology or Medicine: 119 prizes
## 3. Chemistry: 114 prizes
## 4. Economic Sciences: 90 prizes
## 5. Peace: 72 prizes
## 6. Literature: 51 prizes
cat("\n\nBreakdown by Decade:\n")
## 
## 
## Breakdown by Decade:
print(category_by_decade %>% 
        pivot_wider(names_from = decade, values_from = prizes, values_fill = 0))
## # A tibble: 6 × 6
##   category           `1975-1984` `1985-1994` `1995-2004` `2005-2014` `2015-2024`
##   <chr>                    <int>       <int>       <int>       <int>       <int>
## 1 Chemistry                   15          19          26          23          31
## 2 Economic Sciences           13          15          18          20          24
## 3 Literature                  10          10          10          10          11
## 4 Peace                       13          13          15          16          15
## 5 Physics                     24          21          28          25          31
## 6 Physiology or Med…          26          20          23          25          25
# ============================================================================
# Visualization: Stacked Bar Chart by Decade
# ============================================================================

# Pastel colors for categories
category_colors <- c(
  "Physics" = "#B4A7D6",              # Pastel purple
  "Chemistry" = "#A8D8EA",            # Pastel blue
  "Physiology or Medicine" = "#CD5B45", # Pastel coral
  "Literature" = "#014D4E",           # Pastel teal
  "Peace" = "#F7B7A3",                # Pastel peach
  "Economic Sciences" = "#FFF1A8"     # Pastel yellow

)

viz_categories <- ggplot(category_by_decade, 
                         aes(x = decade, y = prizes, fill = category)) +
  geom_col(alpha = 0.85, width = 0.7) +
  geom_text(aes(label = prizes), 
            position = position_stack(vjust = 0.5),
            size = 3.5, 
            fontface = "bold",
            color = "white") +
  scale_fill_manual(values = category_colors) +
  labs(
    title = "Nobel Prize Distribution by Category\nAcross Five Decades (1975-2024)",
    x = "Decade",
    y = "Number of Prizes Awarded",
    fill = "Category",
    caption = "Source: Nobel Prize API"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(size = 15, face = "bold", hjust = 0.5),
    axis.text.x = element_text(angle = 45, hjust = 1, size = 10, face = "bold"),
    axis.text.y = element_text(size = 10),
    legend.position = "bottom",
    legend.title = element_text(face = "bold"),
    panel.grid.major.x = element_blank()
  ) +
  guides(fill = guide_legend(nrow = 2))

print(viz_categories)

ggsave("nobel_categories_by_decade.png", viz_categories, 
       width = 12, height = 8, dpi = 300)

21 Nobel Prize Distribution Across Decades

Overall Prize Distribution:

  • Physics leads with 129 total prizes, showing the most consistent output across all five decades (20-31 prizes per decade)
  • Physiology or Medicine ranks second with 119 prizes, maintaining steady distribution (20-26 prizes per decade)
  • Chemistry follows with 114 prizes, displaying gradual growth from 15 (1975-1984) to 31 (2015-2024)
  • Economic Sciences awarded 90 prizes, showing stable growth as a relatively newer category (added 1969)
  • Peace distributed 72 prizes with consistent patterns (13-16 prizes per decade)
  • Literature awarded the fewest with only 51 prizes (10-11 per decade)

Decade-by-Decade Trends:

  • 1975-1984: Scientific categories dominated, with Physics (24) and Medicine (26) leading
  • 1985-1994: Most balanced distribution across categories; total prizes slightly declined
  • 1995-2004: Peak decade with 116 total prizes; Chemistry and Physics both surged
  • 2005-2014: Slight decline in overall prizes; categories remained relatively stable
  • 2015-2024: Strongest decade for Chemistry (31 prizes) and Physics (31 prizes); notable growth in scientific recognition