Introduction

Using the API available at nobelprize.org to access two JSON files, come up with and provide answers to four intriguing questions. For example, one question could be: “Which nation has had the highest number of Nobel laureates who were born in their country but received the prize as citizens of another country?”

Loading necessary packages

library(httr)
library(jsonlite)
library(stringr)
library(tidyverse)
library(lubridate)
library(knitr)

Send an API request to the server.

# Location of the laureates JSON file (hosted on GitHub).
url <- "https://raw.githubusercontent.com/waheeb123/Assignment_7_607/main/laureates.json"
response <- GET(url)

# Fail fast with a readable HTTP error (404, 500, ...) instead of passing an
# error page on to the JSON parser.
stop_for_status(response)

# Decode the response body explicitly as UTF-8 text; rawToChar() would leave
# the encoding unmarked, which can garble non-ASCII laureate names.
json_data <- content(response, as = "text", encoding = "UTF-8")

# Parse the JSON and keep the top-level `laureates` element.
laureates_df <- fromJSON(json_data)$laureates

This code sends an HTTP GET request to the URL, retrieves the response content as raw bytes, converts the raw bytes to a character string, and finally uses fromJSON() to parse the JSON data and convert it to a data frame. The resulting data frame is stored in the laureates_df object, which we can use for further analysis.

 # list the column names of the parsed laureates data frame
 colnames(laureates_df)
##  [1] "id"          "knownName"   "givenName"   "familyName"  "fullName"   
##  [6] "fileName"    "gender"      "birth"       "wikipedia"   "wikidata"   
## [11] "sameAs"      "links"       "nobelPrizes" "death"

Interpret and produce data.

# build a one-column data frame from the 'id' field, naming the column
# 'Laureates.Id' at construction time
df1 <- data.frame(Laureates.Id = laureates_df$id)

# preview the first seven rows
head(df1, n = 7)
##   Laureates.Id
## 1          745
## 2          102
## 3          779
## 4          259
## 5         1004
## 6          114
## 7          982

Extract the knownName column from the laureates_df data frame and rename the column to KnownName, then remove the se column:

# take the 'knownName' field, name its first column 'KnownName'
# and discard the 'se' column, all in one pipeline
df2 <- laureates_df$knownName %>%
  as.data.frame() %>%
  rename(KnownName = 1) %>%
  select(-se)

# preview the first seven rows
head(df2, n = 7)
##           KnownName
## 1 A. Michael Spence
## 2      Aage N. Bohr
## 3 Aaron Ciechanover
## 4        Aaron Klug
## 5 Abdulrazak Gurnah
## 6       Abdus Salam
## 7  Abhijit Banerjee

Extract the givenName column from laureates_df and rename the column to GivenName, then remove the se column

# take the 'givenName' field, name its first column 'GivenName'
# and discard the 'se' column, all in one pipeline
df3 <- laureates_df$givenName %>%
  as.data.frame() %>%
  rename(GivenName = 1) %>%
  select(-se)

# preview the first seven rows
head(df3, n = 7)
##    GivenName
## 1 A. Michael
## 2    Aage N.
## 3      Aaron
## 4      Aaron
## 5 Abdulrazak
## 6      Abdus
## 7    Abhijit

Extract the familyName column from laureates_df and rename the column to FamilyName, then remove the se column

# take the 'familyName' field, name its first column 'FamilyName'
# and discard the 'se' column, all in one pipeline
df4 <- laureates_df$familyName %>%
  as.data.frame() %>%
  rename(FamilyName = 1) %>%
  select(-se)

# preview the first seven rows
head(df4, n = 7)
##    FamilyName
## 1      Spence
## 2        Bohr
## 3 Ciechanover
## 4        Klug
## 5      Gurnah
## 6       Salam
## 7    Banerjee

Extract the fullName column from laureates_df and rename the column to FullName, then remove the se column

# take the 'fullName' field, name its first column 'FullName'
# and discard the 'se' column, all in one pipeline
df5 <- laureates_df$fullName %>%
  as.data.frame() %>%
  rename(FullName = 1) %>%
  select(-se)

# preview the first seven rows
head(df5, n = 7)
##            FullName
## 1 A. Michael Spence
## 2   Aage Niels Bohr
## 3 Aaron Ciechanover
## 4        Aaron Klug
## 5 Abdulrazak Gurnah
## 6       Abdus Salam
## 7  Abhijit Banerjee

Extract the fileName column from laureates_df and rename the column to FileName

# build a one-column data frame from the 'fileName' field, naming the
# column 'FileName' at construction time
df6 <- data.frame(FileName = laureates_df$fileName)

# preview the first seven rows
head(df6, n = 7)
##      FileName
## 1      spence
## 2        bohr
## 3 ciechanover
## 4        klug
## 5      gurnah
## 6       salam
## 7    banerjee

Extract the gender column from laureates_df and rename the column to Gender

# build a one-column data frame from the 'gender' field, naming the
# column 'Gender' at construction time
df7 <- data.frame(Gender = laureates_df$gender)

# preview the first seven rows
head(df7, n = 7)
##   Gender
## 1   male
## 2   male
## 3   male
## 4   male
## 5   male
## 6   male
## 7   male

Create a data frame df8 holding the birth date of each laureate, with the column named “BirthDate”. The subset() function is used to remove the nested “place” column.

# take the 'birth' record, name its first column 'BirthDate'
# and drop the nested 'place' column
df8 <- laureates_df$birth %>%
  as.data.frame() %>%
  rename(BirthDate = 1) %>%
  select(-place)

# preview the first seven rows
head(df8, n = 7)
##    BirthDate
## 1 1943-00-00
## 2 1922-06-19
## 3 1947-10-01
## 4 1926-08-11
## 5 1948-00-00
## 6 1926-01-29
## 7 1961-02-21

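Extracting the “city”, “country”, “cityNow” and “countryNow” columns nested under the “place” column of the “birth” column in the laureates_df data frame. In each case the first column is renamed and the remaining “no”, “se” (and, where present, “sameAs”) columns are dropped.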

# birth city: name the first column 'Birth.PlaceCity', drop 'no' and 'se'
df8_1 <- laureates_df$birth$place$city %>%
  as.data.frame() %>%
  rename(Birth.PlaceCity = 1) %>%
  select(-no, -se)
head(df8_1, n = 4)
##   Birth.PlaceCity
## 1   Montclair, NJ
## 2      Copenhagen
## 3           Haifa
## 4          Zelvas
# birth country: name the first column 'Birth.PlaceCountry', drop 'no' and 'se'
df8_2 <- laureates_df$birth$place$country %>%
  as.data.frame() %>%
  rename(Birth.PlaceCountry = 1) %>%
  select(-no, -se)

head(df8_2, n = 4)
##                  Birth.PlaceCountry
## 1                               USA
## 2                           Denmark
## 3 British Protectorate of Palestine
## 4                         Lithuania
# birth cityNow: name the first column 'Birth.PlaceCityNow',
# drop 'no', 'se' and 'sameAs'
df8_3 <- laureates_df$birth$place$cityNow %>%
  as.data.frame() %>%
  rename(Birth.PlaceCityNow = 1) %>%
  select(-no, -se, -sameAs)

head(df8_3, n = 4)
##   Birth.PlaceCityNow
## 1      Montclair, NJ
## 2         Copenhagen
## 3              Haifa
## 4             Zelvas
# birth countryNow: name the first column 'Birth.PlaceCountryNow',
# drop 'no', 'se' and 'sameAs'
df8_4 <- laureates_df$birth$place$countryNow %>%
  as.data.frame() %>%
  rename(Birth.PlaceCountryNow = 1) %>%
  select(-no, -se, -sameAs)

head(df8_4, n = 4)
##   Birth.PlaceCountryNow
## 1                   USA
## 2               Denmark
## 3                Israel
## 4             Lithuania

Extracting the “continent” and “location” columns from the “place” column in the “birth” column in laureates_df, followed by the first column of the “death” column.

# birth continent: name the first column 'Birth.PlaceContinent',
# drop 'no' and 'se'
df8_5 <- laureates_df$birth$place$continent %>%
  as.data.frame() %>%
  rename(Birth.PlaceContinent = 1) %>%
  select(-no, -se)

head(df8_5, n = 4)
##   Birth.PlaceContinent
## 1        North America
## 2               Europe
## 3                 Asia
## 4               Europe
# birth location: name the first column 'Birth.PlaceLocation',
# drop 'no' and 'se'
df8_6 <- laureates_df$birth$place$location %>%
  as.data.frame() %>%
  rename(Birth.PlaceLocation = 1) %>%
  select(-no, -se)
head(df8_6, n = 4)
##                                     Birth.PlaceLocation
## 1                                    Montclair, NJ, USA
## 2                                   Copenhagen, Denmark
## 3 Haifa, British Protectorate of Palestine (now Israel)
## 4                                     Zelvas, Lithuania
# death record: name the first column 'Birth.Death' and drop the nested
# 'place' column (the column is taken from laureates_df$death, despite the
# 'Birth.' prefix in its name)
df9 <- data.frame(laureates_df$death) %>%
  rename(Birth.Death = 1) %>%
  select(-place)
head(df9, n = 7)
##   Birth.Death
## 1        <NA>
## 2  2009-09-08
## 3        <NA>
## 4  2018-11-20
## 5        <NA>
## 6  1996-11-21
## 7        <NA>

Combine the individual data frames column-wise into a single data frame using the cbind() function.

# collect the per-field data frames in column order, then bind them
# side by side into one wide data frame
laureate_parts <- list(
  df1, df2, df3, df4, df5, df6, df7, df8,
  df8_1, df8_2, df8_3, df8_4, df8_5, df8_6,
  df9
)
res_cbind <- do.call(cbind.data.frame, laureate_parts)

# render the first four rows as a table
knitr::kable(head(res_cbind, n = 4))
Laureates.Id KnownName GivenName FamilyName FullName FileName Gender BirthDate Birth.PlaceCity Birth.PlaceCountry Birth.PlaceCityNow Birth.PlaceCountryNow Birth.PlaceContinent Birth.PlaceLocation Birth.Death
745 A. Michael Spence A. Michael Spence A. Michael Spence spence male 1943-00-00 Montclair, NJ USA Montclair, NJ USA North America Montclair, NJ, USA NA
102 Aage N. Bohr Aage N. Bohr Aage Niels Bohr bohr male 1922-06-19 Copenhagen Denmark Copenhagen Denmark Europe Copenhagen, Denmark 2009-09-08
779 Aaron Ciechanover Aaron Ciechanover Aaron Ciechanover ciechanover male 1947-10-01 Haifa British Protectorate of Palestine Haifa Israel Asia Haifa, British Protectorate of Palestine (now Israel) NA
259 Aaron Klug Aaron Klug Aaron Klug klug male 1926-08-11 Zelvas Lithuania Zelvas Lithuania Europe Zelvas, Lithuania 2018-11-20

Answering the four questions

Question 1

Which continent were these laureates born on?

# number of laureates per birth continent
continent <- count(res_cbind, Birth.PlaceContinent)

# render the counts as a table
knitr::kable(continent)
Birth.PlaceContinent n
Africa 3
Asia 6
Europe 9
North America 4
Oceania 1
South America 1
NA 1
# bar chart of laureate counts per birth continent, with bars filled by
# birth country and the axes flipped so continents run down the side
continent_plot <- ggplot(res_cbind, aes(x = Birth.PlaceContinent))
continent_plot <- continent_plot +
  stat_count(aes(fill = Birth.PlaceCountry), width = 0.7) +
  labs(
    title = "Nobel Prizes by Continent and Birth Place",
    x = "Continent",
    y = "Count"
  ) +
  coord_flip()
continent_plot

Question 2

How many Nobel laureates are listed in the dataset based on their gender, and what is the respective count for males and females?

# Tally laureates per gender.
# The column is referenced by its bare name: count(res_cbind$Gender) would
# label the output column literally `res_cbind$Gender` instead of `Gender`.
country_gender <- res_cbind %>%
  count(Gender)

knitr::kable(country_gender)
Gender n
female 1
male 24
# bar chart of laureate counts per birth continent, with bars filled by
# gender using a fixed two-colour palette
gender_plot <- ggplot(res_cbind, aes(x = Birth.PlaceContinent))
gender_plot <- gender_plot +
  stat_count(aes(fill = Gender), width = 0.5) +
  labs(
    title = "Nobel Prizes by Continent and Gender",
    x = "Continent",
    y = "Count"
  ) +
  scale_fill_manual(values = c("yellow", "lightblue"))
gender_plot

Question 3

Which country “lost” the most Nobel laureates (who were born there but received their Nobel prize as a citizen of a different country)?

# Question 3: count laureates whose prize-time affiliation country differs
# from their birth country, then report the birth country that occurs most.
#
# The previous version compared Birth.PlaceCountry with Birth.PlaceContinent.
# A country name never equals a continent name, so every laureate was counted
# as "lost" and the answer was simply the most common birth country.
#
# NOTE(review): no citizenship field is visible in this data, so the
# affiliation country attached to each prize is used as a proxy — confirm.
# NOTE(review): assumes each element of laureates_df$nobelPrizes is a data
# frame whose `affiliations` list-column holds data frames with nested
# `countryNow$en` / `country$en` fields — verify with
# str(laureates_df$nobelPrizes[[1]]) before relying on the result.
# NOTE(review): the printed result shown beneath this chunk was produced by
# the previous comparison; re-knit the document to refresh it.

# Distinct affiliation countries recorded across one laureate's prizes.
# Returns character(0) when no affiliation information is found.
affiliation_countries <- function(prizes) {
  if (!is.list(prizes)) {
    return(character(0))
  }
  countries <- unlist(lapply(prizes$affiliations, function(aff) {
    if (!is.list(aff)) {
      return(NULL)
    }
    # Prefer the present-day country name so it is comparable with
    # Birth.PlaceCountryNow; fall back to the historical name.
    now <- aff$countryNow$en
    if (is.null(now)) aff$country$en else now
  }))
  unique(as.character(countries[!is.na(countries)]))
}

# Present-day birth country, so that e.g. "Prussia" vs "Germany" is not
# mistaken for a change of country.
born_country <- res_cbind$Birth.PlaceCountryNow

# One character vector of affiliation countries per laureate (row order
# matches res_cbind, which was built column-wise from laureates_df).
prize_country <- lapply(laureates_df$nobelPrizes, affiliation_countries)

# A laureate is "lost" when the birth country is known, at least one
# affiliation country is known, and none of them is the birth country.
is_lost <- vapply(
  seq_along(born_country),
  function(i) {
    !is.na(born_country[i]) &&
      length(prize_country[[i]]) > 0 &&
      !(born_country[i] %in% prize_country[[i]])
  },
  logical(1)
)

lost_laureates <- table(born_country[is_lost])

lost_laureates[order(lost_laureates, decreasing = TRUE)][1]
## USA 
##   4

Question 4

In which years were the laureates in the dataset born?

# keep the first four characters of each birth date, i.e. the year part
birth_year <- substr(res_cbind$BirthDate, start = 1, stop = 4)

# show the extracted years
print(birth_year)
##  [1] "1943" "1922" "1947" "1926" "1948" "1926" "1961" "1976" "1939" "1969"
## [11] "1903" "1835" "1876" "1931" "1946" "1930" "1948" "1948" "1947" "1936"
## [21] "1914" "1927" "1852" "1913" "1898"
# Scatter plot of birth year against birth continent.
# birth_year is a character vector, which ggplot would place on a discrete
# axis (one evenly spaced tick per distinct string). Converting to integer
# gives a continuous axis on which distances between years are meaningful.
ggplot(
  data = res_cbind,
  aes(x = Birth.PlaceContinent, y = as.integer(birth_year))
) +
  geom_point(colour = "black", size = 4) +
  labs(
    title = "Nobel Prizes by Birth Place Continent and Birth Year",
    x = "Continent",
    y = "Birth year"
  )

Question 5

Which country has produced the most Nobel laureates?

# laureates per birth country, largest counts first, top ten rows
res_cbind %>%
  group_by(Birth.PlaceCountry) %>%
  tally(sort = TRUE) %>%
  slice_head(n = 10)
## # A tibble: 10 × 2
##    Birth.PlaceCountry                    n
##    <chr>                             <int>
##  1 USA                                   4
##  2 Germany                               2
##  3 India                                 2
##  4 Japan                                 2
##  5 Prussia                               2
##  6 Argentina                             1
##  7 Belgium                               1
##  8 British Mandate of Palestine          1
##  9 British Protectorate of Palestine     1
## 10 Denmark                               1