library(httr)
library(jsonlite)
library(dplyr)
library(uuid)
library(tidyr)
library(ggplot2)
<- function(suffix, type = "json", persist = FALSE) {
get_data <- "http://api.nobelprize.org/v1/"
base_url <- paste0(base_url, suffix)
full_url <- ifelse(type == "json", "application/json", "text/csv")
accept_header <- GET(full_url, accept(accept_header))
response if (status_code(response) == 200) {
<- content(response, "text", encoding = "UTF-8")
content_data if (persist) {
<- paste0("nogit_", UUIDgenerate(), "_", suffix)
file_name print(sprintf("FileName %s", file_name))
write(content_data, file_name)
}if (type == "json") {return(fromJSON(content_data))}
else if (type == "csv")
return(read.csv(text = content_data, stringsAsFactors = FALSE))}
{else {stop("Type must be either 'json' or 'csv'.")}
else {
} stop("Failed to retrieve data: ", status_code(response))
}
}
= FALSE
persist
<- get_data(suffix = "prize.json", type = "json", persist = persist) %>%
prizes flatten(. $prizes) } %>% unnest(laureates)
{
<- get_data(suffix = "laureate.json", type = "json", persist = persist) %>%
laureates flatten(. $laureates)} %>% unnest(prizes)
{
# Ended up not using these in final product, but it works.
# countries <- get_data(suffix = "country.csv", type = "csv", persist = persist)
Looking at Nobel Data
Assignment
Working with the two json files available through the API at nobelprize.org, ask and answer four interesting questions, like which country lost the most nobel laureates, like who were born in one country but received their prize in a different country.
Load and prepare data
Question 1 : How many?
How many different nobel prizes have been awarded, by type?
%>%
prizes group_by(category) %>%
summarise(count = n()) %>%
arrange(desc(count)) %>%
ggplot(aes(x = reorder(category, count), y = count)) +
geom_bar(stat = "identity") +
coord_flip() +
labs(x = "Count of people", y = "Category", title = "Count of people by Nobel category") +
theme_minimal()
Question 2 : Which countries?
Were do these Nobel prize winners come from?
%>%
laureates filter(!is.na(bornCountry)) %>%
group_by(bornCountry, category) %>%
summarize(count = n(), .groups = "drop") %>%
mutate(country_group = ifelse(count <=4, "FourOrLessAwards", as.character(bornCountry))) %>%
group_by(country_group, category) %>%
summarize(count = sum(count), .groups = "drop") %>%
ungroup() %>%
group_by(country_group) %>%
mutate(total_count = sum(count)) %>%
ungroup() %>%
arrange(desc(total_count)) %>%
mutate(country_group = forcats::fct_inorder(country_group)) %>%
ggplot(aes(x = country_group, y = count, fill = category)) +
geom_bar(stat = "identity", position="stack") +
coord_flip() +
labs(x = "Country", y = "Count of Nobel laureates", title = "Nobel laureates by country and category") +
theme_minimal()
Question 3: Which cities?
How about by city? Are there any major city groupings in this data? I assume the big cites are New York and London. After building this chart, I am surprized at how few come from the west coast of the US.
%>%
laureates filter(!is.na(bornCity) & !is.na(bornCountryCode)) %>%
group_by(bornCity, bornCountryCode, category) %>%
summarize(count = n(), .groups = "drop") %>%
mutate(city_country = paste(bornCity, bornCountryCode, sep = ", ")) %>%
group_by(city_country) %>%
mutate(total_count = sum(count)) %>%
ungroup() %>%
filter(total_count > 2) %>%
ggplot(aes(x = reorder(city_country, -total_count), y = count, fill = category)) +
geom_bar(stat = "identity") +
coord_flip() +
labs(x = "City", y = "Count of Nobel laureates", fill = "Category") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
plot.title = element_text(hjust = 0.5)) +
ggtitle("Nobel laureates by city and category")
Question 4 : Over time
Over time, has there been much of a shake up on who gets these awards? When did the US start winning all the prizes? Was it after WWII?
<- c("USA", "United Kingdom", "Germany", "France", "Sweden", "Japan")
track_countries
%>%
laureates filter(bornCountry %in% track_countries, !is.na(bornCountry), !is.na(year)) %>%
mutate(year = as.numeric(year)) %>%
group_by(bornCountry, year_bin = cut(year,
breaks=seq(min(year, na.rm = TRUE),
max(year, na.rm = TRUE), by=5),
include.lowest = TRUE,
right = FALSE)) %>%
summarize(count = n(), .groups = 'drop') %>%
ungroup() %>%
filter(!is.na(year_bin)) %>%
ggplot(aes(x = as.character(year_bin), y = count, color = bornCountry, group = bornCountry)) +
geom_line(linewidth=1.4) +
theme_minimal() +
labs(x = "Year (binned in 5-year intervals)",
y = "Number of Nobel prizes",
title = "Nobel prizes by country over time") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
scale_x_discrete(guide = guide_axis(n.dodge=2))