EC 4 - JSON

Working with the two JSON files available through the API at nobelprize.org, ask and answer 4 interesting questions, e.g. “Which country “lost” the most nobel laureates (who were born there but received their Nobel prize as a citizen of a different country)?”

Requesting from the API

V1

# laureates api V1
url_v1 <- GET('http://api.nobelprize.org/v1/laureate.json?gender=All')

laureates_data_v1 <- fromJSON(rawToChar(url_v1$content))$laureates

I first used the GET() function from the httr library to get the information from the API urls. I then used rawToChar() to get the data into a readable format and fromJSON() to read the JSON data into R.

Helpful link: Dataquest R API Tutorial.

V2

# laureates api V2
url_v2 <- GET('https://api.nobelprize.org/2.1/laureates')

laureates_data_v2 <- fromJSON(rawToChar(url_v2$content))$laureates

Helpful link: Dataquest R API Tutorial.

Cleaning up the Data for Analysis

V1

# un-nest the prizes data frame
laureates_v1 <- laureates_data_v1 %>% 
  unnest_wider(prizes)

names(laureates_v1) <- to_snake_case(names(laureates_v1))

# un-nest the year, category, share, motivation, and affiliations columns
years_v1 <- laureates_v1 %>%
  transmute(id, year) %>%
  unnest_longer(year, values_to = "year")

categories_v1 <- laureates_v1 %>%
  transmute(id, category) %>%
  unnest_longer(category, values_to = "category")

shares_v1 <- laureates_v1 %>%
  transmute(id, share) %>%
  unnest_longer(share, values_to = "share")

motivations_v1 <- laureates_v1 %>%
  transmute(id, motivation) %>%
  unnest_longer(motivation, values_to = "motivation")

id_affiliations_v1 <- c()

for (i in 1:nrow(laureates_v1)) {
  temp_list <- c()
  
  for (j in 1:length(unlist(laureates_v1$affiliations[i]))) {
  temp <- unlist(laureates_v1$affiliations[j])
  
  temp_list <- append(temp_list, paste(temp[["name"]], temp[["city"]], temp[["country"]], sep = ", "))
  
  }
  
  id_affiliations_v1 <- append(id_affiliations_v1, paste(laureates_v1$id[i], list(temp_list), sep = ":"))
}

affiliations_v1 <- data.frame(id_affiliations_v1) %>%
  separate(col = id_affiliations_v1, into = c("id", "affiliations"), sep = ":")

# putting together year, categories, motivations
ycm_v1 <- years_v1 %>%
  cbind(categories_v1$category) %>%
  cbind(shares_v1$share) %>%
  cbind(motivations_v1$motivation)

names(ycm_v1) <- c("id", "year", "category", "share", "motivation")

# putting together the final data frame
#laureates <- 
laureates_v1 <- laureates_v1 %>%
  select(-year, -category, -share, -motivation, -affiliations) %>%
  right_join(affiliations_v1, by = "id") %>%
  right_join(ycm_v1, by = "id") %>%
  unique()

rmarkdown::paged_table(laureates_v1)

V2

laureates_v2 <- laureates_data_v2 %>%
  mutate(knownName = knownName[["en"]],
         givenName = givenName[["en"]],
         familyName = familyName[["en"]],
         fullName = fullName[["en"]],
         birth_date = birth[["date"]],
         birth_country = birth[["place"]][["country"]][["en"]],
         birth_continent =  birth[["place"]][["continent"]][["en"]],
         death_date = death[["date"]],
         death_country = death[["place"]][["country"]][["en"]],
         death_continent = death[["place"]][["continent"]][["en"]]) %>%
  unnest_wider(nobelPrizes, names_repair = "unique") %>%
  mutate(category = category[["en"]],
         motivation = motivation[["en"]]) %>%
  transmute(id, fullName, gender, birth_date, birth_country, birth_continent, death_date, death_country, death_continent, awardYear, category, motivation)

rmarkdown::paged_table(laureates_v2)

Questions and Analysis

The version 1 data set is much more robust so I chose to do the analysis on that data frame.

Question 1

Which country produced the most Nobel Prize laureates (based on born_country_code)? What are the top 10 countries with the most winners?

Most Nobel Prizes

laureates_v1 %>%
  count(born_country_code) %>%
  arrange(desc(n)) %>%
  head(1) %>%
  knitr::kable(col.names = c("Country Code", "Number of Laureates"))

Country Code	Number of Laureates
US	289

The country with the most Nobel Prize laureates is the United States, with 289 laureates.

Top 10

laureates_v1 %>%
  filter(!is.na(born_country_code)) %>%
  count(born_country_code) %>%
  mutate(country = countrycode(born_country_code, origin = "iso2c", destination = "country.name")) %>%
  arrange(desc(n)) %>%
  head(10) %>%
  ggplot(aes(x = country, y = n, fill = country)) +
    geom_bar(stat = "identity") +
    labs(title = "Top 10 Countries with the Most Nobel Prize Laureates", 
       x = "Country",
       y = "Number of Prizes") +
    theme(legend.position = "none") +
    geom_text(aes(label = n), vjust = -0.3) +
    theme(axis.text.x = element_text(angle = 15))

Question 2

What is the proportion of Nobel Prize laureates from each continent?

laureates_v1 %>%
  mutate(continent = countrycode(born_country_code, origin = "iso2c", destination = "continent")) %>%
  count(continent) %>%
  mutate(total = sum(n),
         prop = n / total) %>%
  ggplot(aes(x = continent, y = prop)) +
    geom_bar(stat = "identity", fill = "plum") +
    labs(title = "Prizes per Continent", 
       x = "Continent",
       y = "Percentage of Nobel Prizes") +
    geom_text(aes(label = paste0(round(prop*100, 0), '%'), vjust = -0.3)) +
    scale_y_continuous(labels = scales::percent)

Europe produced the highest proportion of Nobel Prize laureates (51%).

Question 3

How many of those winners were male? How many were female?

Table

male_female <- laureates_v1 %>%
  mutate(continent = countrycode(born_country_code, origin = "iso2c", destination = "continent")) %>%
  group_by(gender) %>%
  filter(gender %in% c("male", "female") & !is.na(continent)) %>%
  count(continent) %>%
  pivot_wider(names_from = gender, values_from = n)

male_female %>%
  knitr::kable(col.names = c("Continent", "Female", "Male"))

Continent	Female	Male
Africa	5	22
Americas	20	311
Asia	9	69
Europe	26	483
Oceania	1	12

Graph

laureates_v1 %>%
  mutate(continent = countrycode(born_country_code, origin = "iso2c", destination = "continent")) %>%
  filter(!is.na(continent)) %>%
  ggplot(aes(x = continent, fill = gender)) +
    geom_bar(position = "dodge") +
    labs(title = "Nobel Prize Laureates per Continent per Gender", 
       x = "Continent",
       y = "Number of Nobel Prize Laureates") +
    geom_text(aes(label = after_stat(count)), stat="count", position = position_dodge(.9), vjust = -0.5)

Total Male vs. Female

male_female %>%
  summarize(total_female = sum(female),
            total_male = sum(male)) %>%
  knitr::kable(col.names = c("Total Female Laureates", "Total Male Laureates"))

Total Female Laureates	Total Male Laureates
61	897

There are many more males than females who were awarded Nobel Prizes. There were 61 female and 897 male laureates in total.

Question 4

What years do the data span?

laureates_v1 %>%
  summarize(min_year = min(year), max_year = max(year))

## # A tibble: 1 × 2
##   min_year max_year
##   <chr>    <chr>   
## 1 1901     2022

The data includes Nobel Prize laureates from 1901 till 2022.

Question 5

Under which category were the most Nobel Prizes awarded?

laureates_v1 %>%
  count(category) %>%
  arrange(desc(n)) %>%
  knitr::kable(col.names = c("Category", "Number of Awards"))

Category	Number of Awards
medicine	225
physics	222
chemistry	191
peace	140
literature	119
economics	92

Medicine was the highest awarded Nobel Prize.

Question 6

How many people won more than one Nobel Prize? Who were they? Who won the most? Did any of them share Nobel Prizes or were they all solo?

Most Prizes

most_nobels <- laureates_v1 %>%
  filter(gender != "org") %>%
  mutate(full_name = paste(firstname, surname)) %>%
  count(full_name) %>%
  filter(n > 1)

knitr::kable(most_nobels, col.names = c("Name", "Number of Awards"))

Name	Number of Awards
Barry Sharpless	2
Frederick Sanger	2
John Bardeen	2
Linus Pauling	2
Marie Curie	2

Barry Sharpless, Frederick Sanger, John Bardeen, Linus Pauling, and Marie Curie were awarded the most Nobel Prizes, with 2 each.

most_nobels_info <- laureates_v1 %>%
  mutate(full_name = paste(firstname, surname)) %>%
  filter(full_name %in% most_nobels$full_name) %>%
   transmute(full_name, year, category, share)

knitr::kable(most_nobels_info, col.names = c("Name", "Year", "Category", "Share"))

Name	Year	Category	Share
Marie Curie	1903	physics	4
Marie Curie	1911	chemistry	1
John Bardeen	1956	physics	3
John Bardeen	1972	physics	3
Linus Pauling	1962	peace	1
Linus Pauling	1954	chemistry	1
Frederick Sanger	1958	chemistry	1
Frederick Sanger	1980	chemistry	4
Barry Sharpless	2001	chemistry	2
Barry Sharpless	2022	chemistry	3

Linus Pauling was awarded a Nobel Prize in Physics and a Nobel Peace Prize, each of which he won solo. Marie Curie shared a Nobel Prize in Physics and was awarded her own Nobel Prize in Chemistry. John Bardeen shared both of his Nobel Prizes in Physics. Frederick Sanger received his own Nobel Prize in Chemistry in 1958 and shared a Nobel Prize in Chemistry in 1980. Berry Sharpless shared both of his Nobel Prizes in Chemistry.