Using the API available at nobelprize.org to access two JSON files, come up with and provide answers to four intriguing questions. For example, one question could be: “Which nation has had the highest number of Nobel laureates who were born in their country but received the prize as citizens of another country?”
Loading necessary packages
library(httr)
library(jsonlite)
library(stringr)
library(tidyverse)
library(lubridate)
library(knitr)

# URL of the laureates JSON file
url <- "https://raw.githubusercontent.com/waheeb123/Assignment_7_607/main/laureates.json"
response <- GET(url)
# Fail early with a clear HTTP error instead of trying to parse an error page
stop_for_status(response)
# Decode the body explicitly as UTF-8 rather than relying on rawToChar(),
# which leaves the encoding of non-ASCII names unspecified
json_data <- content(response, as = "text", encoding = "UTF-8")
# The top-level "laureates" element is simplified by jsonlite into a data frame
laureates_df <- fromJSON(json_data)$laureates
This code sends an HTTP GET request to the URL, stops if the request failed, decodes the response body as UTF-8 text, and finally uses fromJSON() to parse the JSON data and convert it to a data frame. The resulting data frame is stored in the laureates_df object, which we can use for further analysis.
# List the top-level columns available in the parsed data
names(laureates_df)
## [1] "id" "knownName" "givenName" "familyName" "fullName"
## [6] "fileName" "gender" "birth" "wikipedia" "wikidata"
## [11] "sameAs" "links" "nobelPrizes" "death"
# Wrap the 'id' column in its own data frame
df1 <- as.data.frame(laureates_df$id)
# Label that single column 'Laureates.Id'
names(df1)[1] <- "Laureates.Id"
# Preview the first seven rows
head(df1, n = 7)
## Laureates.Id
## 1 745
## 2 102
## 3 779
## 4 259
## 5 1004
## 6 114
## 7 982
Extract the knownName column from the laureates_df data frame and rename the column to KnownName, then remove the se column:
# Pull the nested 'knownName' columns out of 'laureates_df'
df2 <- as.data.frame(laureates_df$knownName)
# Label the first column 'KnownName'
names(df2)[1] <- "KnownName"
# Discard the 'se' column
df2 <- subset(df2, select = -se)
# Preview the first seven rows
head(df2, n = 7)
## KnownName
## 1 A. Michael Spence
## 2 Aage N. Bohr
## 3 Aaron Ciechanover
## 4 Aaron Klug
## 5 Abdulrazak Gurnah
## 6 Abdus Salam
## 7 Abhijit Banerjee
Extract the givenName column from laureates_df and rename the column to GivenName, then remove the se column
# Pull the nested 'givenName' columns out of 'laureates_df'
df3 <- as.data.frame(laureates_df$givenName)
# Label the first column 'GivenName'
names(df3)[1] <- "GivenName"
# Discard the 'se' column
df3 <- subset(df3, select = -se)
# Preview the first seven rows
head(df3, n = 7)
## GivenName
## 1 A. Michael
## 2 Aage N.
## 3 Aaron
## 4 Aaron
## 5 Abdulrazak
## 6 Abdus
## 7 Abhijit
Extract the familyName column from laureates_df and rename the column to FamilyName, then remove the se column
# Pull the nested 'familyName' columns out of 'laureates_df'
df4 <- as.data.frame(laureates_df$familyName)
# Label the first column 'FamilyName'
names(df4)[1] <- "FamilyName"
# Discard the 'se' column
df4 <- subset(df4, select = -se)
# Preview the first seven rows
head(df4, n = 7)
## FamilyName
## 1 Spence
## 2 Bohr
## 3 Ciechanover
## 4 Klug
## 5 Gurnah
## 6 Salam
## 7 Banerjee
Extract the fullName column from laureates_df and rename the column to FullName, then remove the se column
# Pull the nested 'fullName' columns out of 'laureates_df'
df5 <- as.data.frame(laureates_df$fullName)
# Label the first column 'FullName'
names(df5)[1] <- "FullName"
# Discard the 'se' column
df5 <- subset(df5, select = -se)
# Preview the first seven rows
head(df5, n = 7)
## FullName
## 1 A. Michael Spence
## 2 Aage Niels Bohr
## 3 Aaron Ciechanover
## 4 Aaron Klug
## 5 Abdulrazak Gurnah
## 6 Abdus Salam
## 7 Abhijit Banerjee
Extract the fileName column from laureates_df and rename the column to FileName
# Wrap the 'fileName' column in its own data frame
df6 <- as.data.frame(laureates_df$fileName)
# Label that single column 'FileName'
names(df6)[1] <- "FileName"
# Preview the first seven rows
head(df6, n = 7)
## FileName
## 1 spence
## 2 bohr
## 3 ciechanover
## 4 klug
## 5 gurnah
## 6 salam
## 7 banerjee
Extract the gender column from laureates_df and rename the column to Gender
# Wrap the 'gender' column in its own data frame
df7 <- as.data.frame(laureates_df$gender)
# Label that single column 'Gender'
names(df7)[1] <- "Gender"
# Preview the first seven rows
head(df7, n = 7)
## Gender
## 1 male
## 2 male
## 3 male
## 4 male
## 5 male
## 6 male
## 7 male
Create a data frame df8 holding the birth date of each laureate in a column named “BirthDate”. The subset() function is then used to remove the nested “place” column.
# Start from the nested 'birth' data frame; its first column is the birth date
df8 <- as.data.frame(laureates_df$birth)
# Label the first column 'BirthDate'
names(df8)[1] <- "BirthDate"
# Remove the nested 'place' column from this data frame
df8 <- subset(df8, select = -place)
# Preview the first seven rows
head(df8, n = 7)
## BirthDate
## 1 1943-00-00
## 2 1922-06-19
## 3 1947-10-01
## 4 1926-08-11
## 5 1948-00-00
## 6 1926-01-29
## 7 1961-02-21
Extracting the “city” column from the “place” column nested inside the “birth” column of the laureates_df data frame
# Turn one nested field of the birth place into a tidy data frame.
#   place     : the nested birth 'place' data frame
#   field     : name of the sub-field to extract (e.g. "city")
#   new_name  : name given to the first column of the extracted field
#   drop_cols : names of the remaining sub-columns to discard
# Returns a data frame with the renamed first column and without `drop_cols`.
extract_place_field <- function(place, field, new_name, drop_cols) {
  out <- as.data.frame(place[[field]])
  colnames(out)[1] <- new_name
  # drop = FALSE keeps the result a data frame even when one column remains
  out[, setdiff(colnames(out), drop_cols), drop = FALSE]
}

birth_place <- laureates_df$birth$place

df8_1 <- extract_place_field(birth_place, "city", "Birth.PlaceCity", c("no", "se"))
head(df8_1, 4)
## Birth.PlaceCity
## 1 Montclair, NJ
## 2 Copenhagen
## 3 Haifa
## 4 Zelvas
df8_2 <- extract_place_field(birth_place, "country", "Birth.PlaceCountry", c("no", "se"))
head(df8_2, 4)
## Birth.PlaceCountry
## 1 USA
## 2 Denmark
## 3 British Protectorate of Palestine
## 4 Lithuania
df8_3 <- extract_place_field(
  birth_place, "cityNow", "Birth.PlaceCityNow", c("no", "se", "sameAs")
)
head(df8_3, 4)
## Birth.PlaceCityNow
## 1 Montclair, NJ
## 2 Copenhagen
## 3 Haifa
## 4 Zelvas
df8_4 <- extract_place_field(
  birth_place, "countryNow", "Birth.PlaceCountryNow", c("no", "se", "sameAs")
)
head(df8_4, 4)
## Birth.PlaceCountryNow
## 1 USA
## 2 Denmark
## 3 Israel
## 4 Lithuania
Extracting the “continent” column from the “place” column in the “birth” column in laureates_df
df8_5 <- extract_place_field(birth_place, "continent", "Birth.PlaceContinent", c("no", "se"))
head(df8_5, 4)
## Birth.PlaceContinent
## 1 North America
## 2 Europe
## 3 Asia
## 4 Europe
df8_6 <- extract_place_field(birth_place, "location", "Birth.PlaceLocation", c("no", "se"))
head(df8_6, 4)
## Birth.PlaceLocation
## 1 Montclair, NJ, USA
## 2 Copenhagen, Denmark
## 3 Haifa, British Protectorate of Palestine (now Israel)
## 4 Zelvas, Lithuania
# Start from the nested 'death' data frame
df9 <- data.frame(laureates_df$death)
# Label the first column 'Birth.Death'
# NOTE(review): despite the name, the previewed values look like death dates
names(df9)[1] <- "Birth.Death"
# Remove the nested 'place' column
df9 <- subset(df9, select = -place)
# Preview the first seven rows
head(df9, n = 7)
## Birth.Death
## 1 <NA>
## 2 2009-09-08
## 3 <NA>
## 4 2018-11-20
## 5 <NA>
## 6 1996-11-21
## 7 <NA>
Combine all of the extracted data frames column-wise into a single data frame using the cbind() function.
# Bind every extracted piece side by side; rows stay aligned because each piece
# was derived from laureates_df in the same row order
res_cbind <- cbind.data.frame(
  df1, df2, df3, df4, df5, df6, df7,
  df8, df8_1, df8_2, df8_3, df8_4, df8_5, df8_6,
  df9
)
# Render the first four rows as a table
knitr::kable(head(res_cbind, n = 4))
| Laureates.Id | KnownName | GivenName | FamilyName | FullName | FileName | Gender | BirthDate | Birth.PlaceCity | Birth.PlaceCountry | Birth.PlaceCityNow | Birth.PlaceCountryNow | Birth.PlaceContinent | Birth.PlaceLocation | Birth.Death |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 745 | A. Michael Spence | A. Michael | Spence | A. Michael Spence | spence | male | 1943-00-00 | Montclair, NJ | USA | Montclair, NJ | USA | North America | Montclair, NJ, USA | NA |
| 102 | Aage N. Bohr | Aage N. | Bohr | Aage Niels Bohr | bohr | male | 1922-06-19 | Copenhagen | Denmark | Copenhagen | Denmark | Europe | Copenhagen, Denmark | 2009-09-08 |
| 779 | Aaron Ciechanover | Aaron | Ciechanover | Aaron Ciechanover | ciechanover | male | 1947-10-01 | Haifa | British Protectorate of Palestine | Haifa | Israel | Asia | Haifa, British Protectorate of Palestine (now Israel) | NA |
| 259 | Aaron Klug | Aaron | Klug | Aaron Klug | klug | male | 1926-08-11 | Zelvas | Lithuania | Zelvas | Lithuania | Europe | Zelvas, Lithuania | 2018-11-20 |
Which continent were these laureates born on?
# Number of laureates per continent of birth (NA is kept as its own row)
continent <- count(res_cbind, Birth.PlaceContinent)
knitr::kable(continent)
| Birth.PlaceContinent | n |
|---|---|
| Africa | 3 |
| Asia | 6 |
| Europe | 9 |
| North America | 4 |
| Oceania | 1 |
| South America | 1 |
| NA | 1 |
# Horizontal bar chart: laureates per birth continent, filled by birth country
ggplot(res_cbind, aes(x = Birth.PlaceContinent)) +
  stat_count(aes(fill = Birth.PlaceCountry), width = 0.7) +
  labs(
    title = "Nobel Prizes by Continent and Birth Place",
    x = "Continent",
    y = "Count"
  ) +
  coord_flip()
How many Nobel laureates are listed in the dataset based on their gender, and what is the respective count for males and females?
# Count laureates by gender. The bare column name is used instead of
# res_cbind$Gender so the resulting column is called "Gender" rather than
# the literal expression "res_cbind$Gender".
country_gender <- res_cbind %>%
  count(Gender)
knitr::kable(country_gender)
| Gender | n |
|---|---|
| female | 1 |
| male | 24 |
# Bar chart: laureates per birth continent, filled by gender with fixed colours
ggplot(res_cbind, aes(x = Birth.PlaceContinent)) +
  stat_count(aes(fill = Gender), width = 0.5) +
  labs(
    title = "Nobel Prizes by Continent and Gender",
    x = "Continent",
    y = "Count"
  ) +
  scale_fill_manual(values = c("yellow", "lightblue"))
Which country “lost” the most Nobel laureates (who were born there but received their Nobel prize as a citizen of a different country)?
# Country of the first affiliation recorded across one laureate's prizes, or
# NA when no affiliation country is available.
# NOTE(review): assumes each element of `nobelPrizes` is a data frame whose
# `affiliations` list-column holds data frames with a nested `country$en`
# field — confirm against the parsed JSON before trusting the result.
affiliation_country <- function(prizes) {
  affiliations <- prizes[["affiliations"]]
  countries <- unlist(lapply(affiliations, function(aff) {
    # [[ ]] matches names exactly, so a missing `country` is never silently
    # replaced by `countryNow` through partial matching
    if (is.data.frame(aff)) aff[["country"]][["en"]] else NULL
  }))
  if (length(countries) == 0) NA_character_ else as.character(countries[[1]])
}

born_country <- res_cbind$Birth.PlaceCountry
# Compare against the country tied to the prize. The earlier version compared
# against Birth.PlaceContinent, which never equals a country name, so every
# laureate was counted as "lost".
prize_country <- vapply(laureates_df$nobelPrizes, affiliation_country, character(1))
# Keep only laureates with both countries known and different; explicit NA
# checks stop missing values from leaking into the subset
moved <- !is.na(born_country) & !is.na(prize_country) & born_country != prize_country
lost_laureates <- table(born_country[moved])
# Laureates without a recorded affiliation are excluded, and historical country
# names (e.g. "Prussia") are compared literally
head(sort(lost_laureates, decreasing = TRUE), 1)
## NOTE: the previously recorded output (USA, 4) came from the country-versus-
## continent comparison and must be regenerated by re-running this chunk.
In which years were the laureates in this dataset born?
# The year is the first four characters of each 'BirthDate' string
birth_year <- str_sub(res_cbind$BirthDate, start = 1, end = 4)
# Show the extracted years
print(birth_year)
## [1] "1943" "1922" "1947" "1926" "1948" "1926" "1961" "1976" "1939" "1969"
## [11] "1903" "1835" "1876" "1931" "1946" "1930" "1948" "1948" "1947" "1936"
## [21] "1914" "1927" "1852" "1913" "1898"
# Scatter plot of birth year against continent of birth.
# The year is derived from BirthDate inside aes() and converted to an integer:
# mapping the external character vector `birth_year` produced a categorical
# axis on which unequal gaps between years were drawn as equal steps.
ggplot(
  data = res_cbind,
  aes(x = Birth.PlaceContinent, y = as.integer(str_sub(BirthDate, 1, 4)))
) +
  geom_point(colour = "black", size = 4) +
  labs(
    x = "Continent",
    y = "Birth year",
    title = "Nobel Prizes by Birth Place Continent and Birth Year"
  )
Which country has produced the most Nobel laureates?
# Rank birth countries by number of laureates and keep the ten largest
res_cbind %>%
  group_by(Birth.PlaceCountry) %>%
  summarise(n = n()) %>%
  arrange(desc(n)) %>%
  slice_head(n = 10)
## # A tibble: 10 × 2
## Birth.PlaceCountry n
## <chr> <int>
## 1 USA 4
## 2 Germany 2
## 3 India 2
## 4 Japan 2
## 5 Prussia 2
## 6 Argentina 1
## 7 Belgium 1
## 8 British Mandate of Palestine 1
## 9 British Protectorate of Palestine 1
## 10 Denmark 1