Assignment 10B: More JSON

library(jsonlite)
library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.6
✔ forcats   1.0.1     ✔ stringr   1.6.0
✔ ggplot2   4.0.1     ✔ tibble    3.3.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.2
✔ purrr     1.2.1     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter()  masks stats::filter()
✖ purrr::flatten() masks jsonlite::flatten()
✖ dplyr::lag()     masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

laureates_url <- "https://api.nobelprize.org/2.1/laureates?limit=1018"
laureates_raw <- fromJSON(laureates_url, flatten = TRUE)
laureates_df <- as_tibble(laureates_raw$laureates)

Looking at the column names

prizes_df <- laureates_df %>%
  select(id, gender, `birth.year`, nobelPrizes) %>%
  unnest(nobelPrizes, names_sep = "p_")   # affiliations is now p_affiliations

cat("Columns after first unnest:\n")

Columns after first unnest:

print(names(prizes_df))

 [1] "id"                               "gender"                          
 [3] "birth.year"                       "nobelPrizesp_awardYear"          
 [5] "nobelPrizesp_sortOrder"           "nobelPrizesp_portion"            
 [7] "nobelPrizesp_dateAwarded"         "nobelPrizesp_prizeStatus"        
 [9] "nobelPrizesp_prizeAmount"         "nobelPrizesp_prizeAmountAdjusted"
[11] "nobelPrizesp_affiliations"        "nobelPrizesp_links"              
[13] "nobelPrizesp_category.en"         "nobelPrizesp_category.no"        
[15] "nobelPrizesp_category.se"         "nobelPrizesp_categoryFullName.en"
[17] "nobelPrizesp_categoryFullName.no" "nobelPrizesp_categoryFullName.se"
[19] "nobelPrizesp_motivation.en"       "nobelPrizesp_motivation.se"      
[21] "nobelPrizesp_motivation.no"       "nobelPrizesp_residences"         
[23] "nobelPrizesp_topMotivation.en"    "nobelPrizesp_topMotivation.se"

Fixing the dataframe to be usable for analysis

prizes_df <- laureates_df %>%
  select(id, gender, `birth.year`, nobelPrizes) %>%
  unnest(nobelPrizes, names_sep = "_")

affiliations_df <- prizes_df %>%
  unnest(nobelPrizes_affiliations, names_sep = "_", keep_empty = TRUE)

Which university produce the most amount of winners

q1 <- affiliations_df %>%
  filter(!is.na(nobelPrizes_affiliations_name.en)) %>%
  group_by(nobelPrizes_affiliations_name.en) %>%
  summarise(laureate_count = n_distinct(id), .groups = "drop") %>%
  arrange(desc(laureate_count))

cat("=== Q1: Top Universities ===\n")

=== Q1: Top Universities ===

print(head(q1, 10))

# A tibble: 10 × 2
   nobelPrizes_affiliations_name.en             laureate_count
   <chr>                                                 <int>
 1 University of California                                 43
 2 Harvard University                                       29
 3 Massachusetts Institute of Technology (MIT)              25
 4 Stanford University                                      22
 5 University of Chicago                                    20
 6 California Institute of Technology (Caltech)             19
 7 Columbia University                                      18
 8 Princeton University                                     18
 9 University of Cambridge                                  18
10 Howard Hughes Medical Institute                          16

Average age at prize per category

q2 <- prizes_df %>%
  filter(!is.na(birth.year), !is.na(nobelPrizes_awardYear)) %>%
  mutate(
    age_at_prize = as.integer(nobelPrizes_awardYear) - as.integer(birth.year)
  ) %>%
  group_by(nobelPrizes_category.en) %>%
  summarise(avg_age = round(mean(age_at_prize, na.rm = TRUE), 1), .groups = "drop") %>%
  arrange(avg_age)

cat("\n=== Q2: Average Age at Prize by Category ===\n")


=== Q2: Average Age at Prize by Category ===

print(q2)

# A tibble: 6 × 2
  nobelPrizes_category.en avg_age
  <chr>                     <dbl>
1 Physics                    57.7
2 Physiology or Medicine     58.9
3 Chemistry                  59.2
4 Peace                      60.8
5 Literature                 65  
6 Economic Sciences          67

q3 <- prizes_df %>%
  filter(!is.na(gender)) %>%
  group_by(nobelPrizes_category.en, gender) %>%
  summarise(count = n(), .groups = "drop") %>%
  pivot_wider(names_from = gender, values_from = count, values_fill = 0)

cat("\n=== Q3: Gender Breakdown by Category ===\n")


=== Q3: Gender Breakdown by Category ===

print(q3)

# A tibble: 6 × 3
  nobelPrizes_category.en female  male
  <chr>                    <int> <int>
1 Chemistry                    8   192
2 Economic Sciences            3    96
3 Literature                  18   104
4 Peace                       20    92
5 Physics                      5   225
6 Physiology or Medicine      14   218

q4 <- prizes_df %>%
  filter(!is.na(gender)) %>%
  group_by(nobelPrizes_category.en) %>%
  summarise(
    total      = n(),
    female     = sum(gender == "female"),
    pct_female = round(100 * female / total, 1),
    .groups    = "drop"
  ) %>%
  arrange(desc(pct_female))

cat("\n=== Q4: % Female Laureates by Category ===\n")


=== Q4: % Female Laureates by Category ===

print(q4)

# A tibble: 6 × 4
  nobelPrizes_category.en total female pct_female
  <chr>                   <int>  <int>      <dbl>
1 Peace                     112     20       17.9
2 Literature                122     18       14.8
3 Physiology or Medicine    232     14        6  
4 Chemistry                 200      8        4  
5 Economic Sciences          99      3        3  
6 Physics                   230      5        2.2