Package

library(jsonlite)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)
library(purrr)

## 
## Attaching package: 'purrr'

## The following object is masked from 'package:jsonlite':
## 
##     flatten

library(readr)

R Markdown

# ---- paths & urls ----
# Local cache directory for the downloaded JSON. showWarnings = FALSE keeps
# reruns quiet when the directory already exists.
dir.create("data", showWarnings = FALSE)

# Remote sources: v2.1 endpoints (note: 'nobelPrizes', not 'prizes')
prizes_url    <- "https://api.nobelprize.org/2.1/nobelPrizes?limit=1000"
laureates_url <- "https://api.nobelprize.org/2.1/laureates?limit=1000"

# Where each payload is cached on disk
prizes_path    <- file.path("data", "nobelPrizes.json")
laureates_path <- file.path("data", "laureates.json")

# Download `url` to `path`, retrying on failure.
#
# Arguments:
#   url     - address to fetch.
#   path    - destination file. It is only written once a download has passed
#             the sanity check, so a failed attempt never leaves an unusable
#             file behind (which would otherwise defeat a later
#             file.exists() guard).
#   retries - maximum number of attempts.
#   sleep   - seconds to wait between attempts.
#
# Returns TRUE if a JSON-looking payload was saved to `path`, FALSE otherwise.
safe_download <- function(url, path, retries = 3, sleep = 1) {
  # Stage the download next to the destination so the final move stays on one
  # file system; the staging file is always cleaned up on exit.
  tmp <- tempfile(pattern = "download-", tmpdir = dirname(path))
  on.exit(unlink(tmp), add = TRUE)

  for (i in seq_len(retries)) {
    looks_like_json <- tryCatch(
      {
        download.file(url, tmp, mode = "wb", quiet = TRUE)
        # quick sanity check that we didn't save an HTML error page
        txt <- readChar(tmp, nchars = 200, useBytes = TRUE)
        length(txt) == 1L &&
          grepl("\\{", txt) &&
          !grepl("<!DOCTYPE html>", txt, ignore.case = TRUE)
      },
      error = function(e) {
        # Report the failure instead of discarding it, then let the loop retry.
        message(
          "Download attempt ", i, " of ", retries, " failed: ",
          conditionMessage(e)
        )
        FALSE
      }
    )

    if (isTRUE(looks_like_json)) {
      # Prefer a rename; fall back to a copy if the rename is refused.
      moved <- file.rename(tmp, path) ||
        file.copy(tmp, path, overwrite = TRUE)
      return(isTRUE(moved))
    }

    # Only pause when another attempt will follow.
    if (i < retries) {
      Sys.sleep(sleep)
    }
  }

  FALSE
}

# Fetch each payload only when it is not already cached on disk, and abort
# the run if a needed download cannot be completed. `&&` short-circuits, so
# safe_download() is never called for a file that already exists.
if (!file.exists(laureates_path) &&
    !safe_download(laureates_url, laureates_path)) {
  stop("Could not download laureates data. Check your connection and try again.")
}
if (!file.exists(prizes_path) &&
    !safe_download(prizes_url, prizes_path)) {
  stop("Could not download nobelPrizes data. Check your connection and try again.")
}

# Report which cache files are now present (named logical vector).
stats::setNames(
  file.exists(c(laureates_path, prizes_path)),
  c("laureates_exists", "nobelPrizes_exists")
)

##   laureates_exists nobelPrizes_exists 
##               TRUE               TRUE

📥 Chunk 2 — load JSON (prefer local; fall back to URL)

library(jsonlite)

# Parse each payload from the local cache first. If that read raises an
# error, parse the same document straight from the API URL instead.
# flatten = TRUE turns nested objects into dotted column names.
laureates_raw <- tryCatch(
  expr = jsonlite::fromJSON(txt = laureates_path, flatten = TRUE),
  error = function(cond) {
    jsonlite::fromJSON(txt = laureates_url, flatten = TRUE)
  }
)

prizes_raw <- tryCatch(
  expr = jsonlite::fromJSON(txt = prizes_path, flatten = TRUE),
  error = function(cond) {
    jsonlite::fromJSON(txt = prizes_url, flatten = TRUE)
  }
)

# Confirm top-level keys so we know we loaded the right shapes
names(laureates_raw)

## [1] "laureates" "meta"      "links"

names(prizes_raw)

## [1] "nobelPrizes" "meta"        "links"

# Typical keys are:
# laureates_raw$laureates
# prizes_raw$nobelPrizes
# NOTE(review): laureates_raw$laureates is used as a data frame further down
# (it is piped into dplyr verbs and glimpsed with 1,018 rows), and length() on
# a data frame returns the number of COLUMNS. The values printed here are
# therefore column counts, not record counts; use nrow() for the number of
# records. Presumably the same holds for prizes_raw$nobelPrizes -- confirm.
length(laureates_raw$laureates)

## [1] 109

length(prizes_raw$nobelPrizes)

## [1] 14

---- clean-laureates ----

# Laureates table as parsed from the JSON payload.
laurs <- laureates_raw$laureates

# Slim per-laureate table with friendlier column names.
laureates_core <- laurs %>%
  dplyr::transmute(
    laureate_id = id,
    # pick the best available display name among person/org fields
    name = dplyr::coalesce(`knownName.en`, `fullName.en`, `orgName.en`),
    # Classify by the presence of an organization name rather than by a
    # missing gender, so a person record that lacks a gender value is not
    # mislabelled as an organization.
    person_vs_org = dplyr::if_else(
      !is.na(`orgName.en`), "organization", "person"
    ),
    gender = gender,
    # Dates are kept as character: some values are partial (e.g. "1943-00-00")
    # and would not parse as Date.
    birth_date = `birth.date`,
    birth_country = `birth.place.country.en`,
    death_date = `death.date`
  )

# quick peek
dplyr::glimpse(laureates_core)

## Rows: 1,018
## Columns: 7
## $ laureate_id   <chr> "745", "102", "779", "259", "1004", "114", "982", "981",…
## $ name          <chr> "A. Michael Spence", "Aage N. Bohr", "Aaron Ciechanover"…
## $ person_vs_org <chr> "person", "person", "person", "person", "person", "perso…
## $ gender        <chr> "male", "male", "male", "male", "male", "male", "male", …
## $ birth_date    <chr> "1943-00-00", "1922-06-19", "1947-10-01", "1926-08-11", …
## $ birth_country <chr> "USA", "Denmark", "British Protectorate of Palestine", "…
## $ death_date    <chr> NA, "2009-09-08", NA, "2018-11-20", NA, "1996-11-21", NA…

# Tally how many laureates fall into each person/organization class.
laureates_core %>%
  dplyr::count(person_vs_org)

##   person_vs_org   n
## 1  organization  28
## 2        person 990

---- clean laureates table ----

# Narrow the raw laureates table to the identifying columns, plus a single
# display name. The nobelPrizes list-column is carried along untouched so the
# awards can be unnested later. Note that this reassigns `laurs`.
laurs <- laureates_raw$laureates %>%
  dplyr::transmute(
    id,
    display_name = dplyr::coalesce(`knownName.en`, `fullName.en`, `orgName.en`),
    gender,
    `birth.place.country.en`,
    `birth.date`,
    `death.date`,
    wikipedia.english,
    wikidata.id,
    nobelPrizes        # <-- list-column, one nested data frame per laureate
  )

dplyr::glimpse(laurs)

## Rows: 1,018
## Columns: 9
## $ id                     <chr> "745", "102", "779", "259", "1004", "114", "982…
## $ display_name           <chr> "A. Michael Spence", "Aage N. Bohr", "Aaron Cie…
## $ gender                 <chr> "male", "male", "male", "male", "male", "male",…
## $ birth.place.country.en <chr> "USA", "Denmark", "British Protectorate of Pale…
## $ birth.date             <chr> "1943-00-00", "1922-06-19", "1947-10-01", "1926…
## $ death.date             <chr> NA, "2009-09-08", NA, "2018-11-20", NA, "1996-1…
## $ wikipedia.english      <chr> "https://en.wikipedia.org/wiki/Michael_Spence",…
## $ wikidata.id            <chr> "Q157245", "Q103854", "Q233205", "Q190626", "Q3…
## $ nobelPrizes            <list> [<data.frame[1 x 17]>], [<data.frame[1 x 17]>]…

---- awards-from-laureates ----

# Flatten the laureates list into an "awards" table: one row per laureate per
# prize. Because the JSON was read with fromJSON(..., flatten = TRUE), nested
# fields such as category.en and motivation.en are already plain columns of
# each nested data frame once it has been unnested.

awards <- laureates_raw$laureates %>%
  dplyr::select(
    dplyr::all_of(c(
      "id", "gender", "birth.place.country.en",
      "knownName.en", "fullName.en", "orgName.en",
      "nobelPrizes"
    ))
  ) %>%
  # keep_empty = TRUE retains laureates whose nobelPrizes entry is empty
  tidyr::unnest(cols = nobelPrizes, keep_empty = TRUE) %>%
  dplyr::mutate(
    display_name  = dplyr::coalesce(`knownName.en`, `fullName.en`, `orgName.en`),
    birth_country = `birth.place.country.en`,
    awardYear     = readr::parse_integer(awardYear),
    category_en   = `category.en`,
    motivation_en = `motivation.en`
  ) %>%
  dplyr::select(
    id, display_name, gender, birth_country,
    awardYear, category_en, motivation_en,
    portion, sortOrder
  )

dplyr::glimpse(awards, width = 80)

## Rows: 1,026
## Columns: 9
## $ id            <chr> "745", "102", "779", "259", "1004", "114", "982", "981",…
## $ display_name  <chr> "A. Michael Spence", "Aage N. Bohr", "Aaron Ciechanover"…
## $ gender        <chr> "male", "male", "male", "male", "male", "male", "male", …
## $ birth_country <chr> "USA", "Denmark", "British Protectorate of Palestine", "…
## $ awardYear     <int> 2001, 1975, 2004, 1982, 2021, 1979, 2019, 2019, 2009, 20…
## $ category_en   <chr> "Economic Sciences", "Physics", "Chemistry", "Chemistry"…
## $ motivation_en <chr> "for their analyses of markets with asymmetric informati…
## $ portion       <chr> "1/3", "1/3", "1/3", "1", "1", "1/3", "1/3", "1", "1/3",…
## $ sortOrder     <chr> "2", "1", "1", "1", "1", "2", "1", "1", "3", "3", "1", "…

library(dplyr)

# Summary tables built from `awards`, which holds one row per laureate per
# prize. Every count below is therefore a number of award rows.

# 1) Award rows per category, largest first
by_category <- count(awards, category_en, sort = TRUE)

# 2) Ten most common birth countries (missing or blank countries excluded).
#    The `laureates` column counts award rows, so a laureate with two prizes
#    is counted twice.
by_country <- awards %>%
  filter(!is.na(birth_country), birth_country != "") %>%
  count(birth_country, name = "laureates", sort = TRUE) %>%
  slice_head(n = 10)

# 3) Award rows per decade of the award year (1900, 1910, ...)
by_decade <- awards %>%
  mutate(decade = awardYear - awardYear %% 10) %>%
  count(decade)

# Show the three small tables
by_category

## # A tibble: 6 × 2
##   category_en                n
##   <chr>                  <int>
## 1 Physiology or Medicine   232
## 2 Physics                  230
## 3 Chemistry                200
## 4 Peace                    143
## 5 Literature               122
## 6 Economic Sciences         99

# Top 10 birth countries; counts are award rows, blank/missing countries excluded
by_country

## # A tibble: 10 × 2
##    birth_country   laureates
##    <chr>               <int>
##  1 USA                   298
##  2 United Kingdom         95
##  3 Germany                80
##  4 France                 60
##  5 Japan                  30
##  6 Sweden                 30
##  7 Canada                 22
##  8 the Netherlands        20
##  9 Switzerland            19
## 10 Italy                  18

# Award rows per decade, in chronological order
by_decade

## # A tibble: 13 × 2
##    decade     n
##     <dbl> <int>
##  1   1900    57
##  2   1910    40
##  3   1920    54
##  4   1930    56
##  5   1940    43
##  6   1950    72
##  7   1960    79
##  8   1970   104
##  9   1980    97
## 10   1990   104
## 11   2000   123
## 12   2010   121
## 13   2020    76

# Make types clean, then produce a few small summaries.
# Rows without a category or award year are dropped so every summary below
# works on complete award records.
awards_clean <- awards %>%
  dplyr::mutate(
    category_en   = as.character(category_en),
    awardYear     = as.integer(awardYear),
    sortOrder     = readr::parse_integer(sortOrder),
    # treat blank country strings as missing
    birth_country = dplyr::na_if(birth_country, "")
  ) %>%
  dplyr::filter(!is.na(category_en), !is.na(awardYear))

# 1) Count of award rows by category (all years)
by_category <- awards_clean %>%
  dplyr::count(category_en, name = "n", sort = TRUE)

# 2) Count of award rows by category since 2000 (quick modern view)
by_category_since2000 <- awards_clean %>%
  dplyr::filter(awardYear >= 2000) %>%
  dplyr::count(category_en, name = "n", sort = TRUE)

# 3) Top 10 birth countries of laureates (based on available data).
#    Rows with no recorded birth country are excluded first; otherwise the
#    NA group is ranked like a country and takes one of the ten slots.
top_countries <- awards_clean %>%
  dplyr::filter(!is.na(birth_country)) %>%
  dplyr::count(birth_country, name = "n", sort = TRUE) %>%
  dplyr::slice_head(n = 10)

# Show results (they'll print in the knitted HTML)
list(
  by_category = by_category,
  by_category_since2000 = by_category_since2000,
  top_countries = top_countries
)

## $by_category
## # A tibble: 6 × 2
##   category_en                n
##   <chr>                  <int>
## 1 Physiology or Medicine   232
## 2 Physics                  230
## 3 Chemistry                200
## 4 Peace                    143
## 5 Literature               122
## 6 Economic Sciences         99
## 
## $by_category_since2000
## # A tibble: 6 × 2
##   category_en                n
##   <chr>                  <int>
## 1 Physics                   71
## 2 Chemistry                 68
## 3 Physiology or Medicine    63
## 4 Economic Sciences         55
## 5 Peace                     37
## 6 Literature                26
## 
## $top_countries
## # A tibble: 10 × 2
##    birth_country       n
##    <chr>           <int>
##  1 USA               298
##  2 United Kingdom     95
##  3 Germany            80
##  4 France             60
##  5 Japan              30
##  6 Sweden             30
##  7 Canada             22
##  8 the Netherlands    20
##  9 Switzerland        19
## 10 Italy              18

library(ggplot2)

# Bar chart of award counts per category.
# Reuse the `by_category` summary computed above (columns: category_en, n)
# instead of recounting from `awards`.
plot_data <- by_category

# Each counted row is one laureate's share of a prize, so the labels say
# "laureate awards" rather than "prizes". reorder() sorts the bars by count,
# and coord_flip() makes the long category names readable.
ggplot(plot_data, aes(x = reorder(category_en, n), y = n)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(
    title = "Number of Nobel Laureate Awards by Category",
    x = "Category",
    y = "Laureate awards"
  )

Interpretation & Conclusion

After cleaning and flattening the Nobel Prize data, I looked at how awards are distributed across different categories, birth countries, and decades. Because the awards table has one row per laureate per prize, the category counts measure laureate awards rather than distinct prizes. They show that Physiology or Medicine, Physics, and Chemistry have the most awards, which makes sense because these fields have been part of the prize structure since the beginning and their prizes are often shared among several laureates (many rows have a portion of 1/3). When looking at birth countries, a small number of countries produce a large share of laureates, especially the United States and countries in Western Europe. This suggests that global scientific recognition is concentrated in places with strong research funding and educational systems.

I also reviewed Nobel Prizes over time by decade, which shows dips in the 1910s and 1940s, the decades of the World Wars, followed by a fairly steady rise from the 1950s (72 awards) to the 2000s (123 awards); the 2020s are still incomplete. Overall, this analysis helped me understand not only who wins Nobel Prizes, but also how award patterns reflect history, geography, and the development of different fields. It was interesting to see how a nested API dataset can be transformed into something clear and easy to explore.

Reflection

Working with the Nobel Prize API taught me something important about real-world data: even a simple project can become complicated when the data structure is nested, inconsistent, or incomplete. I realized that understanding how the data is organized is half the battle. A dataset might look straightforward at first, but once you start cleaning and flattening it, you quickly learn that every API or file format has its own personality.

This assignment showed me how valuable it is to check the structure early, take things one step at a time, and avoid narrowing tables before I’m ready. I also learned how nested lists, changing API versions, and missing files can combine to create problems that have nothing to do with the code and everything to do with how the data is delivered. These challenges felt frustrating at first, but I now see them as part of the data science learning curve. With more experience, I will learn to recognize these situations earlier and handle them with more confidence.

Assignment_10B-NobelPrize

Kevin Martin

2025-11-02