Introduction

Using the API available at nobelprize.org to access two JSON files, come up with and provide answers to four intriguing questions. For example, one question could be: “Which nation has had the highest number of Nobel laureates who were born in their country but received the prize as citizens of another country?”

Loading necessary packages

library(httr)
library(jsonlite)
library(stringr)
library(tidyverse)
library(lubridate)
library(knitr)

Send an API request to the server.

# Download the laureates JSON file and parse its 'laureates' element into
# `laureates_df`.
url <- "https://raw.githubusercontent.com/waheeb123/Assignment_7_607/main/laureates.json"
response <- GET(url)

# Fail loudly on an HTTP error (404, 500, ...) instead of trying to parse an
# error page as JSON further down.
stop_for_status(response)

# Decode the body explicitly as UTF-8 so that non-ASCII characters in names
# and places survive regardless of the platform's native encoding.
json_data <- httr::content(response, as = "text", encoding = "UTF-8")
laureates_df <- fromJSON(json_data)$laureates

This code sends an HTTP GET request to the URL, retrieves the response content as raw bytes, converts the raw bytes to a character string, and finally uses fromJSON() to parse the JSON data and convert it to a data frame. The resulting data frame is stored in the laureates_df object, which we can use for further analysis.

 colnames(laureates_df)  # list the top-level columns of the parsed data frame

##  [1] "id"          "knownName"   "givenName"   "familyName"  "fullName"   
##  [6] "fileName"    "gender"      "birth"       "wikipedia"   "wikidata"   
## [11] "sameAs"      "links"       "nobelPrizes" "death"

Interpret and produce data.

# Wrap the 'id' values in a one-column data frame named 'Laureates.Id'.
df1 <- setNames(as.data.frame(laureates_df$id), "Laureates.Id")

# Preview the first seven rows.
head(df1, n = 7)

##   Laureates.Id
## 1          745
## 2          102
## 3          779
## 4          259
## 5         1004
## 6          114
## 7          982

Extract the knownName column from the laureates_df data frame and rename the column to KnownName, then remove the se column:

# Take the nested 'knownName' data frame, call its first column 'KnownName',
# and discard the 'se' column.
df2 <- laureates_df$knownName %>%
  as.data.frame() %>%
  rename(KnownName = 1) %>%
  select(-se)

# Preview the first seven rows.
head(df2, n = 7)

##           KnownName
## 1 A. Michael Spence
## 2      Aage N. Bohr
## 3 Aaron Ciechanover
## 4        Aaron Klug
## 5 Abdulrazak Gurnah
## 6       Abdus Salam
## 7  Abhijit Banerjee

Extract the givenName column from laureates_df and rename the column to GivenName, then remove the se column

# Take the nested 'givenName' data frame, call its first column 'GivenName',
# and discard the 'se' column.
df3 <- laureates_df$givenName %>%
  as.data.frame() %>%
  rename(GivenName = 1) %>%
  select(-se)

# Preview the first seven rows.
head(df3, n = 7)

##    GivenName
## 1 A. Michael
## 2    Aage N.
## 3      Aaron
## 4      Aaron
## 5 Abdulrazak
## 6      Abdus
## 7    Abhijit

Extract the familyName column from laureates_df and rename the column to FamilyName, then remove the se column

# Take the nested 'familyName' data frame, call its first column 'FamilyName',
# and discard the 'se' column.
df4 <- laureates_df$familyName %>%
  as.data.frame() %>%
  rename(FamilyName = 1) %>%
  select(-se)

# Preview the first seven rows.
head(df4, n = 7)

##    FamilyName
## 1      Spence
## 2        Bohr
## 3 Ciechanover
## 4        Klug
## 5      Gurnah
## 6       Salam
## 7    Banerjee

Extract the fullName column from laureates_df and rename the column to FullName, then remove the se column

# Take the nested 'fullName' data frame, call its first column 'FullName',
# and discard the 'se' column.
df5 <- laureates_df$fullName %>%
  as.data.frame() %>%
  rename(FullName = 1) %>%
  select(-se)

# Preview the first seven rows.
head(df5, n = 7)

##            FullName
## 1 A. Michael Spence
## 2   Aage Niels Bohr
## 3 Aaron Ciechanover
## 4        Aaron Klug
## 5 Abdulrazak Gurnah
## 6       Abdus Salam
## 7  Abhijit Banerjee

Extract the fileName column from laureates_df and rename the column to FileName

# Wrap the 'fileName' values in a one-column data frame named 'FileName'.
df6 <- setNames(as.data.frame(laureates_df$fileName), "FileName")

# Preview the first seven rows.
head(df6, n = 7)

##      FileName
## 1      spence
## 2        bohr
## 3 ciechanover
## 4        klug
## 5      gurnah
## 6       salam
## 7    banerjee

Extract the gender column from laureates_df and rename the column to Gender

# Wrap the 'gender' values in a one-column data frame named 'Gender'.
df7 <- setNames(as.data.frame(laureates_df$gender), "Gender")

# Preview the first seven rows.
head(df7, n = 7)

##   Gender
## 1   male
## 2   male
## 3   male
## 4   male
## 5   male
## 6   male
## 7   male

Create a data frame df8 holding the birth date of each laureate in a column named “BirthDate”. The subset() function is used to remove the “place” column.

# From the nested 'birth' data frame, rename the first column to 'BirthDate'
# and drop the nested 'place' column (its parts are extracted separately).
df8 <- laureates_df$birth %>%
  as.data.frame() %>%
  rename(BirthDate = 1) %>%
  select(-place)

# Preview the first seven rows.
head(df8, n = 7)

##    BirthDate
## 1 1943-00-00
## 2 1922-06-19
## 3 1947-10-01
## 4 1926-08-11
## 5 1948-00-00
## 6 1926-01-29
## 7 1961-02-21

Extracting the “city” column from the “birth” column in the laureates_df data frame

# Birth city: rename the first column of birth$place$city and drop the
# 'no' and 'se' columns.
df8_1 <- laureates_df$birth$place$city %>%
  as.data.frame() %>%
  rename(Birth.PlaceCity = 1) %>%
  select(-c(no, se))

head(df8_1, n = 4)

##   Birth.PlaceCity
## 1   Montclair, NJ
## 2      Copenhagen
## 3           Haifa
## 4          Zelvas

# Birth country: rename the first column of birth$place$country and drop the
# 'no' and 'se' columns.
df8_2 <- laureates_df$birth$place$country %>%
  as.data.frame() %>%
  rename(Birth.PlaceCountry = 1) %>%
  select(-c(no, se))

head(df8_2, n = 4)

##                  Birth.PlaceCountry
## 1                               USA
## 2                           Denmark
## 3 British Protectorate of Palestine
## 4                         Lithuania

# Present-day birth city: rename the first column of birth$place$cityNow and
# drop the 'no', 'se' and 'sameAs' columns.
df8_3 <- laureates_df$birth$place$cityNow %>%
  as.data.frame() %>%
  rename(Birth.PlaceCityNow = 1) %>%
  select(-c(no, se, sameAs))

head(df8_3, n = 4)

##   Birth.PlaceCityNow
## 1      Montclair, NJ
## 2         Copenhagen
## 3              Haifa
## 4             Zelvas

# Present-day birth country: rename the first column of
# birth$place$countryNow and drop the 'no', 'se' and 'sameAs' columns.
df8_4 <- laureates_df$birth$place$countryNow %>%
  as.data.frame() %>%
  rename(Birth.PlaceCountryNow = 1) %>%
  select(-c(no, se, sameAs))

head(df8_4, n = 4)

##   Birth.PlaceCountryNow
## 1                   USA
## 2               Denmark
## 3                Israel
## 4             Lithuania

Extracting the “continent” column from the “place” column in the “birth” column in laureates_df

# Birth continent: rename the first column of birth$place$continent and drop
# the 'no' and 'se' columns.
df8_5 <- laureates_df$birth$place$continent %>%
  as.data.frame() %>%
  rename(Birth.PlaceContinent = 1) %>%
  select(-c(no, se))

head(df8_5, n = 4)

##   Birth.PlaceContinent
## 1        North America
## 2               Europe
## 3                 Asia
## 4               Europe

# Birth location string: rename the first column of birth$place$location and
# drop the 'no' and 'se' columns.
df8_6 <- laureates_df$birth$place$location %>%
  as.data.frame() %>%
  rename(Birth.PlaceLocation = 1) %>%
  select(-c(no, se))

head(df8_6, n = 4)

##                                     Birth.PlaceLocation
## 1                                    Montclair, NJ, USA
## 2                                   Copenhagen, Denmark
## 3 Haifa, British Protectorate of Palestine (now Israel)
## 4                                     Zelvas, Lithuania

# Death information: rename the first column of the 'death' data frame and
# drop its nested 'place' column.
# NOTE(review): the column is called 'Birth.Death' but, judging by its source,
# it holds the date of death -- the name is kept because it is part of
# res_cbind.
df9 <- data.frame(laureates_df$death) %>%
  rename(Birth.Death = 1) %>%
  select(-place)

head(df9, n = 7)

##   Birth.Death
## 1        <NA>
## 2  2009-09-08
## 3        <NA>
## 4  2018-11-20
## 5        <NA>
## 6  1996-11-21
## 7        <NA>

Combine the individual data frames column-wise using the cbind() function.

# Bind all single-purpose data frames side by side into one wide table.
res_cbind <- cbind(
  df1, df2, df3, df4, df5, df6, df7,
  df8, df8_1, df8_2, df8_3, df8_4, df8_5, df8_6,
  df9
)

# Render the first four rows as a table.
knitr::kable(head(res_cbind, n = 4))

Laureates.Id	KnownName	GivenName	FamilyName	FullName	FileName	Gender	BirthDate	Birth.PlaceCity	Birth.PlaceCountry	Birth.PlaceCityNow	Birth.PlaceCountryNow	Birth.PlaceContinent	Birth.PlaceLocation	Birth.Death
745	A. Michael Spence	A. Michael	Spence	A. Michael Spence	spence	male	1943-00-00	Montclair, NJ	USA	Montclair, NJ	USA	North America	Montclair, NJ, USA	NA
102	Aage N. Bohr	Aage N.	Bohr	Aage Niels Bohr	bohr	male	1922-06-19	Copenhagen	Denmark	Copenhagen	Denmark	Europe	Copenhagen, Denmark	2009-09-08
779	Aaron Ciechanover	Aaron	Ciechanover	Aaron Ciechanover	ciechanover	male	1947-10-01	Haifa	British Protectorate of Palestine	Haifa	Israel	Asia	Haifa, British Protectorate of Palestine (now Israel)	NA
259	Aaron Klug	Aaron	Klug	Aaron Klug	klug	male	1926-08-11	Zelvas	Lithuania	Zelvas	Lithuania	Europe	Zelvas, Lithuania	2018-11-20

Answering the four questions

question 1

Which continent were these laureates born on?

# Number of laureates per continent of birth, rendered as a table.
continent <- count(res_cbind, Birth.PlaceContinent)
knitr::kable(continent)

Birth.PlaceContinent	n
Africa	3
Asia	6
Europe	9
North America	4
Oceania	1
South America	1
NA	1

# Horizontal stacked bars: laureates per birth continent, coloured by birth
# country.
ggplot(
  res_cbind,
  aes(x = Birth.PlaceContinent, fill = Birth.PlaceCountry)
) +
  stat_count(width = 0.7) +
  coord_flip() +
  labs(
    title = "Nobel Prizes by Continent and Birth Place",
    x = "Continent",
    y = "Count"
  )

question 2

How many of the Nobel laureates in the dataset are male, and how many are female?

# Count laureates per gender.
# Refer to the column by its bare name: using `res_cbind$Gender` inside a
# dplyr verb bypasses data masking and produces a result column that is
# literally named "res_cbind$Gender".
country_gender <- res_cbind %>%
  count(Gender)

# Render the counts as a table.
knitr::kable(country_gender)

res_cbind$Gender	n
female	1
male	24

# Stacked bars: laureates per birth continent, coloured by gender with a
# manual two-colour palette.
ggplot(res_cbind, aes(x = Birth.PlaceContinent, fill = Gender)) +
  stat_count(width = 0.5) +
  scale_fill_manual(values = c("yellow", "lightblue")) +
  labs(
    title = "Nobel Prizes by Continent and Gender",
    x = "Continent",
    y = "Count"
  )

Question 3

Which country “lost” the most Nobel laureates (who were born there but received their Nobel prize as a citizen of a different country)?

# Question 3: a laureate counts as "lost" by their birth country when none of
# the institutions they were affiliated with at the time of the award is
# located in that country. The data has no citizenship field, so affiliation
# country is used as the proxy.
#
# The previous version compared the birth country against the birth
# *continent*, which differ for every laureate, so it merely reported the most
# common birth country.

# Distinct affiliation countries across all prizes of one laureate.
# NOTE(review): assumes each element of `laureates_df$nobelPrizes` is a data
# frame with an optional `affiliations` list column whose entries carry a
# nested `country` data frame with an `en` column -- confirm against the JSON.
affiliation_countries <- function(prizes) {
  if (!is.data.frame(prizes) || is.null(prizes$affiliations)) {
    return(character(0))
  }
  per_prize <- lapply(prizes$affiliations, function(aff) {
    if (is.data.frame(aff) && is.data.frame(aff$country)) {
      aff$country$en
    } else {
      NULL
    }
  })
  countries <- unique(unlist(per_prize))
  countries[!is.na(countries)]
}

born_country <- res_cbind$Birth.PlaceCountry
prize_country <- lapply(laureates_df$nobelPrizes, affiliation_countries)

# res_cbind was built column-wise from laureates_df, so rows must line up.
stopifnot(length(prize_country) == length(born_country))

# TRUE when the birth country is not among the laureate's affiliation
# countries.
born_elsewhere <- vapply(
  seq_along(born_country),
  function(i) !(born_country[i] %in% prize_country[[i]]),
  logical(1)
)

# Only laureates with a known birth country and at least one known
# affiliation country can be classified.
is_lost <- !is.na(born_country) & lengths(prize_country) > 0 & born_elsewhere

lost_laureates <- table(born_country[is_lost])

# Birth country that lost the most laureates.
head(sort(lost_laureates, decreasing = TRUE), 1)

## USA 
##   4

Question 4

In which years were the laureates in the dataset born?

# Keep only the first four characters (the year) of every birth date.
birth_year <- res_cbind$BirthDate %>%
  str_sub(start = 1, end = 4)

# Print the extracted years.
birth_year

##  [1] "1943" "1922" "1947" "1926" "1948" "1926" "1961" "1976" "1939" "1969"
## [11] "1903" "1835" "1876" "1931" "1946" "1930" "1948" "1948" "1947" "1936"
## [21] "1914" "1927" "1852" "1913" "1898"

# Scatter plot of birth year against continent of birth.
# The year is derived from the BirthDate column inside aes() and converted to
# an integer: plotting the character vector produced a discrete y axis with
# one evenly spaced tick per distinct year, which hides the real gaps between
# years. Deriving it from the data frame also avoids depending on a separate
# global vector staying aligned with res_cbind's rows.
ggplot(
  data = res_cbind,
  aes(x = Birth.PlaceContinent, y = as.integer(str_sub(BirthDate, 1, 4)))
) +
  geom_point(colour = "black", size = 4) +
  labs(
    title = "Nobel Prizes by Birth Place Continent and Birth Year",
    x = "Continent of birth",
    y = "Birth year"
  )

Question 5

Which country has produced the most Nobel laureates?

# Ten most frequent birth countries, most common first.
res_cbind %>%
  group_by(Birth.PlaceCountry) %>%
  tally(name = "n") %>%
  arrange(desc(n)) %>%
  head(n = 10)

## # A tibble: 10 × 2
##    Birth.PlaceCountry                    n
##    <chr>                             <int>
##  1 USA                                   4
##  2 Germany                               2
##  3 India                                 2
##  4 Japan                                 2
##  5 Prussia                               2
##  6 Argentina                             1
##  7 Belgium                               1
##  8 British Mandate of Palestine          1
##  9 British Protectorate of Palestine     1
## 10 Denmark                               1

Extra Credit On JSON

waheeb Algabri