library(jsonlite)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(purrr)
##
## Attaching package: 'purrr'
## The following object is masked from 'package:jsonlite':
##
## flatten
library(readr)
# ---- paths & urls ----
# Local cache: both API payloads are stored as JSON files in one directory.
cache_dir <- "data"
dir.create(cache_dir, showWarnings = FALSE)
laureates_path <- file.path(cache_dir, "laureates.json")
prizes_path <- file.path(cache_dir, "nobelPrizes.json")

# Remote sources: v2.1 of the API names the prize endpoint 'nobelPrizes'
# (not 'prizes'); limit=1000 is passed as the page-size query parameter.
api_root <- "https://api.nobelprize.org/2.1"
laureates_url <- paste0(api_root, "/laureates?limit=1000")
prizes_url <- paste0(api_root, "/nobelPrizes?limit=1000")
#' Download a JSON payload to disk, retrying on failure.
#'
#' Each attempt is written to a temporary file and copied to `path` only
#' after it passes a quick sanity check, so a failed or bogus download
#' (for example an HTML error page) never ends up at `path`, where a later
#' `file.exists(path)` check would mistake it for a valid cache.
#'
#' @param url URL to fetch.
#' @param path Destination file; written only on success.
#' @param retries Maximum number of attempts.
#' @param sleep Seconds to wait between attempts.
#' @return `TRUE` if a payload that looks like JSON was saved to `path`,
#'   otherwise `FALSE`.
safe_download <- function(url, path, retries = 3, sleep = 1) {
  # Sanity check on the first 200 bytes: there must be a "{" and no HTML
  # doctype. Matching is byte-wise because the prefix may end in the middle
  # of a multibyte character.
  looks_like_json <- function(file) {
    head_txt <- readChar(file, nchars = 200, useBytes = TRUE)
    # An empty file yields character(0); treat that as "not JSON".
    length(head_txt) == 1L &&
      grepl("{", head_txt, fixed = TRUE, useBytes = TRUE) &&
      !grepl("<!DOCTYPE html>", head_txt, ignore.case = TRUE, useBytes = TRUE)
  }

  tmp <- tempfile(pattern = "download-", fileext = ".part")
  on.exit(unlink(tmp), add = TRUE)

  for (attempt in seq_len(retries)) {
    fetched <- tryCatch(
      {
        status <- download.file(url, tmp, mode = "wb", quiet = TRUE)
        isTRUE(status == 0) && looks_like_json(tmp)
      },
      error = function(e) FALSE
    )
    if (fetched && file.copy(tmp, path, overwrite = TRUE)) {
      return(TRUE)
    }
    # Only pause when another attempt will follow.
    if (attempt < retries) {
      Sys.sleep(sleep)
    }
  }
  FALSE
}
# Fetch each payload only when no cached copy is on disk; abort the script
# if a needed download does not succeed.
if (!file.exists(laureates_path) &&
    !safe_download(laureates_url, laureates_path)) {
  stop("Could not download laureates data. Check your connection and try again.")
}
if (!file.exists(prizes_path) &&
    !safe_download(prizes_url, prizes_path)) {
  stop("Could not download nobelPrizes data. Check your connection and try again.")
}

# Report which of the two cache files are present
setNames(
  file.exists(c(laureates_path, prizes_path)),
  c("laureates_exists", "nobelPrizes_exists")
)
## laureates_exists nobelPrizes_exists
## TRUE TRUE
library(jsonlite)

# Read one API payload as a flattened list, preferring the cached file.
# If the cached file cannot be parsed, warn (so a corrupt cache does not go
# unnoticed) and read the same payload straight from the API URL instead.
read_nobel_json <- function(path, url) {
  tryCatch(
    fromJSON(path, flatten = TRUE),
    error = function(e) {
      warning(
        "Could not parse cached file '", path, "' (", conditionMessage(e),
        "); reading from ", url, " instead.",
        call. = FALSE
      )
      fromJSON(url, flatten = TRUE)
    }
  )
}

laureates_raw <- read_nobel_json(laureates_path, laureates_url)
prizes_raw <- read_nobel_json(prizes_path, prizes_url)
# Confirm top-level keys so we know we loaded the right shapes
names(laureates_raw)
## [1] "laureates" "meta" "links"
names(prizes_raw)
## [1] "nobelPrizes" "meta" "links"
# The records themselves live under these keys:
# laureates_raw$laureates
# prizes_raw$nobelPrizes
# NOTE: laureates_raw$laureates is a data frame, and length() of a data frame
# is its number of columns. The 109 below is therefore the count of flattened
# fields, not of laureates (the table has 1,018 rows; see the glimpse() output
# further down). Use nrow() to count records.
length(laureates_raw$laureates)
## [1] 109
# Presumably also a data frame, so 14 would be a column count — TODO confirm.
length(prizes_raw$nobelPrizes)
## [1] 14
# Source table: one row per laureate
laurs <- laureates_raw$laureates

# Derive the two computed columns first, then keep and rename the seven
# core fields in their final order.
laureates_core <- laurs %>%
  dplyr::mutate(
    # first non-missing of the known, full, and organization names
    name = dplyr::coalesce(`knownName.en`, `fullName.en`, `orgName.en`),
    # NOTE(review): a record without a gender is labelled "organization";
    # a person whose gender is missing would be mislabelled — confirm.
    person_vs_org = dplyr::if_else(!is.na(gender), "person", "organization")
  ) %>%
  dplyr::select(
    laureate_id = id,
    name,
    person_vs_org,
    gender,
    birth_date = `birth.date`,
    birth_country = `birth.place.country.en`,
    death_date = `death.date`
  )

# Column-wise preview of the result
laureates_core %>% dplyr::glimpse()
## Rows: 1,018
## Columns: 7
## $ laureate_id <chr> "745", "102", "779", "259", "1004", "114", "982", "981",…
## $ name <chr> "A. Michael Spence", "Aage N. Bohr", "Aaron Ciechanover"…
## $ person_vs_org <chr> "person", "person", "person", "person", "person", "perso…
## $ gender <chr> "male", "male", "male", "male", "male", "male", "male", …
## $ birth_date <chr> "1943-00-00", "1922-06-19", "1947-10-01", "1926-08-11", …
## $ birth_country <chr> "USA", "Denmark", "British Protectorate of Palestine", "…
## $ death_date <chr> NA, "2009-09-08", NA, "2018-11-20", NA, "1996-11-21", NA…
# How many records fall under each label
laureates_core %>% dplyr::count(person_vs_org)
## person_vs_org n
## 1 organization 28
## 2 person 990
# Rebuild `laurs` (replacing the earlier value) as a narrow laureate table
# that still carries the nested nobelPrizes list-column, so the individual
# awards can be unnested later.
laurs <- laureates_raw$laureates %>%
  dplyr::transmute(
    id,
    # first non-missing of the known, full, and organization names
    display_name = dplyr::coalesce(`knownName.en`, `fullName.en`, `orgName.en`),
    gender,
    `birth.place.country.en`,
    `birth.date`,
    `death.date`,
    wikipedia.english,
    wikidata.id,
    nobelPrizes # list-column: one nested data frame of awards per laureate
  )

laurs %>% dplyr::glimpse()
## Rows: 1,018
## Columns: 9
## $ id <chr> "745", "102", "779", "259", "1004", "114", "982…
## $ display_name <chr> "A. Michael Spence", "Aage N. Bohr", "Aaron Cie…
## $ gender <chr> "male", "male", "male", "male", "male", "male",…
## $ birth.place.country.en <chr> "USA", "Denmark", "British Protectorate of Pale…
## $ birth.date <chr> "1943-00-00", "1922-06-19", "1947-10-01", "1926…
## $ death.date <chr> NA, "2009-09-08", NA, "2018-11-20", NA, "1996-1…
## $ wikipedia.english <chr> "https://en.wikipedia.org/wiki/Michael_Spence",…
## $ wikidata.id <chr> "Q157245", "Q103854", "Q233205", "Q190626", "Q3…
## $ nobelPrizes <list> [<data.frame[1 x 17]>], [<data.frame[1 x 17]>]…
# Flatten laureates into an "awards" table: one row per laureate-award.
# The JSON was read with flatten = TRUE, so nested fields such as category.en
# and motivation.en are already plain columns once nobelPrizes is unnested.
awards <- laureates_raw$laureates %>%
  # Reduce to the laureate-level fields we keep, plus the nested awards
  dplyr::transmute(
    id,
    display_name = dplyr::coalesce(`knownName.en`, `fullName.en`, `orgName.en`),
    gender,
    birth_country = `birth.place.country.en`,
    nobelPrizes
  ) %>%
  # keep_empty = TRUE retains laureates whose nested table has no rows
  tidyr::unnest(nobelPrizes, keep_empty = TRUE) %>%
  dplyr::mutate(awardYear = readr::parse_integer(awardYear)) %>%
  dplyr::select(
    id,
    display_name,
    gender,
    birth_country,
    awardYear,
    category_en = `category.en`,
    motivation_en = `motivation.en`,
    portion,
    sortOrder
  )

awards %>% dplyr::glimpse(width = 80)
## Rows: 1,026
## Columns: 9
## $ id <chr> "745", "102", "779", "259", "1004", "114", "982", "981",…
## $ display_name <chr> "A. Michael Spence", "Aage N. Bohr", "Aaron Ciechanover"…
## $ gender <chr> "male", "male", "male", "male", "male", "male", "male", …
## $ birth_country <chr> "USA", "Denmark", "British Protectorate of Palestine", "…
## $ awardYear <int> 2001, 1975, 2004, 1982, 2021, 1979, 2019, 2019, 2009, 20…
## $ category_en <chr> "Economic Sciences", "Physics", "Chemistry", "Chemistry"…
## $ motivation_en <chr> "for their analyses of markets with asymmetric informati…
## $ portion <chr> "1/3", "1/3", "1/3", "1", "1", "1/3", "1/3", "1", "1/3",…
## $ sortOrder <chr> "2", "1", "1", "1", "1", "2", "1", "1", "3", "3", "1", "…
library(dplyr)

# 1) Award rows per category, largest first
by_category <- count(awards, category_en, sort = TRUE)

# 2) Ten most common birth countries, ignoring missing or blank values.
# NOTE(review): `awards` has one row per laureate-award, so a laureate with
# two prizes is counted twice in the "laureates" column — confirm intent.
by_country <- awards %>%
  filter(!is.na(birth_country), birth_country != "") %>%
  count(birth_country, name = "laureates", sort = TRUE) %>%
  slice_head(n = 10)

# 3) Award rows per decade: the award year rounded down to a multiple of 10
by_decade <- count(awards, decade = 10 * (awardYear %/% 10))

# Print the three summary tables
by_category
## # A tibble: 6 × 2
## category_en n
## <chr> <int>
## 1 Physiology or Medicine 232
## 2 Physics 230
## 3 Chemistry 200
## 4 Peace 143
## 5 Literature 122
## 6 Economic Sciences 99
by_country
## # A tibble: 10 × 2
## birth_country laureates
## <chr> <int>
## 1 USA 298
## 2 United Kingdom 95
## 3 Germany 80
## 4 France 60
## 5 Japan 30
## 6 Sweden 30
## 7 Canada 22
## 8 the Netherlands 20
## 9 Switzerland 19
## 10 Italy 18
by_decade
## # A tibble: 13 × 2
## decade n
## <dbl> <int>
## 1 1900 57
## 2 1910 40
## 3 1920 54
## 4 1930 56
## 5 1940 43
## 6 1950 72
## 7 1960 79
## 8 1970 104
## 9 1980 97
## 10 1990 104
## 11 2000 123
## 12 2010 121
## 13 2020 76
# Normalise column types, turn blank birth countries into NA, and drop rows
# that lack a category or an award year.
awards_clean <- awards %>%
  dplyr::mutate(
    category_en = as.character(category_en),
    awardYear = as.integer(awardYear),
    sortOrder = readr::parse_integer(sortOrder),
    birth_country = dplyr::na_if(birth_country, "")
  ) %>%
  dplyr::filter(!is.na(category_en), !is.na(awardYear))
# 1) Award rows per category (all years); replaces the earlier by_category
by_category <- awards_clean %>%
  dplyr::count(category_en, name = "n", sort = TRUE)
# 2) Award rows per category since 2000 (quick modern view)
by_category_since2000 <- awards_clean %>%
  dplyr::filter(awardYear >= 2000) %>%
  dplyr::count(category_en, name = "n", sort = TRUE)
# 3) Top 10 birth countries, counted over award rows (a laureate with two
# prizes counts twice). Rows with an unknown birth country are dropped first,
# so that NA cannot take a place in the ranking.
top_countries <- awards_clean %>%
  dplyr::filter(!is.na(birth_country)) %>%
  dplyr::count(birth_country, name = "n", sort = TRUE) %>%
  dplyr::slice_head(n = 10)
# Show results (they'll print in the knitted HTML)
list(
  by_category = by_category,
  by_category_since2000 = by_category_since2000,
  top_countries = top_countries
)
## $by_category
## # A tibble: 6 × 2
## category_en n
## <chr> <int>
## 1 Physiology or Medicine 232
## 2 Physics 230
## 3 Chemistry 200
## 4 Peace 143
## 5 Literature 122
## 6 Economic Sciences 99
##
## $by_category_since2000
## # A tibble: 6 × 2
## category_en n
## <chr> <int>
## 1 Physics 71
## 2 Chemistry 68
## 3 Physiology or Medicine 63
## 4 Economic Sciences 55
## 5 Peace 37
## 6 Literature 26
##
## $top_countries
## # A tibble: 10 × 2
## birth_country n
## <chr> <int>
## 1 USA 298
## 2 United Kingdom 95
## 3 Germany 80
## 4 France 60
## 5 Japan 30
## 6 Sweden 30
## 7 Canada 22
## 8 the Netherlands 20
## 9 Switzerland 19
## 10 Italy 18
library(ggplot2)
# Reuse the per-category counts computed above (by_category, built from the
# cleaned awards table) instead of counting the raw `awards` table again.
plot_data <- by_category
# Horizontal bar chart; reorder() sorts the categories by their count
ggplot(plot_data, aes(x = reorder(category_en, n), y = n)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(
    title = "Number of Nobel Prizes by Category",
    x = "Category",
    y = "Count"
  )
After cleaning and flattening the Nobel Prize data, I looked at how awards are distributed across categories, birth countries, and decades. The category counts show that Physiology or Medicine, Physics, and Chemistry have the most awards, which makes sense because these fields have been part of the prize structure since the beginning. Note that each row in the awards table is one laureate's share of a prize, so categories whose prizes are often split among several laureates accumulate more rows. When looking at birth countries, a small number of countries account for a large share of laureates, especially the United States and countries in Western Europe. This suggests that global scientific recognition is concentrated in places with strong research funding and educational systems.
I also reviewed Nobel Prizes over time by decade. The counts dip in the 1910s and the 1940s, the decades of the two World Wars, and then rise fairly steadily from the 1950s through the 2000s (the 2020s figure covers only part of a decade). Overall, this analysis helped me understand not only who wins Nobel Prizes, but also how award patterns reflect history, geography, and the development of different fields. It was interesting to see how a nested API dataset can be transformed into something clear and easy to explore.
Working with the Nobel Prize API taught me something important about real-world data: even a simple project can become complicated when the data structure is nested, inconsistent, or incomplete. I realized that understanding how the data is organized is half the battle. A dataset might look straightforward at first, but once you start cleaning and flattening it, you quickly learn that every API or file format has its own personality.
This assignment showed me how valuable it is to check the structure early, take things one step at a time, and avoid dropping columns (such as the nested nobelPrizes list-column) before I have unnested what I need from them. I also learned how nested lists, changing API versions, and missing files can combine to create problems that have nothing to do with the code and everything to do with how the data is delivered. These challenges felt frustrating at first, but I now see them as part of the data science learning curve. With more experience, I will learn to recognize these situations earlier and handle them with more confidence.