The New York Times web site provides a rich set of APIs, as described here: https://developer.nytimes.com/apis.

Task: Choose one of the New York Times APIs, construct an interface in R to read in the JSON data, and transform it into an R DataFrame.

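Step 1: Load libraries (httr2 for the API request; tibble, dplyr, tidyr, stringr and ggplot2 from the tidyverse, plus lubridate, for wrangling and plotting)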

Create a request to the top stories NY Times API

# Build the Top Stories (science section) request.
# The API key is read from the NYT_API_KEY environment variable (for example,
# set in ~/.Renviron) rather than hard-coded, so the credential is not
# committed with the source or echoed in rendered output.
nyt_api_key <- Sys.getenv("NYT_API_KEY")
if (!nzchar(nyt_api_key)) {
  stop("NYT_API_KEY environment variable is not set.", call. = FALSE)
}
req <- request("https://api.nytimes.com/svc/topstories/v2/science.json") %>%
  req_url_query(`api-key` = nyt_api_key)
req
## <httr2_request>
## GET
## https://api.nytimes.com/svc/topstories/v2/science.json?api-key=<NYT_API_KEY>
## Body: empty
# Preview only: the result is not assigned, so `req` itself is unchanged.
req %>% req_headers("Accept" = "application/json")
## <httr2_request>
## GET
## https://api.nytimes.com/svc/topstories/v2/science.json?api-key=<NYT_API_KEY>
## Headers:
## • Accept: 'application/json'
## Body: empty
# Preview only: attaching a JSON body switches the printed method to POST.
# The result is not assigned, so the request performed later is still the
# plain GET stored in `req`.
req %>% req_body_json(list(x = 1, y = 2))
## <httr2_request>
## POST
## https://api.nytimes.com/svc/topstories/v2/science.json?api-key=<NYT_API_KEY>
## Body: json encoded data

Create a response to read data into a data frame

# Send the request and keep the raw HTTP response.
resp <- req %>% req_perform()
# Show the content type reported by the server.
resp_content_type(resp)
## [1] "application/json"
# Replace the response object with its parsed JSON body (a nested list).
resp <- resp_body_json(resp)

Create a data frame

# Build one row per article from the parsed response.
# The articles are selected by name (`results`) instead of by position, so the
# code does not depend on the order of the top-level fields in the JSON.
# NOTE(review): assumes the sixth top-level field selected by the original
# `df[6,]` is `results` -- confirm against the API response.
df <- tibble(resp = resp[["results"]]) %>%
  # Spread each article's fields (section, title, abstract, ...) into columns.
  unnest_wider(resp)
head(df)

Select rows and columns we are interested in and create a new df top_stories

# Keep only the article fields used in the rest of the analysis.
top_stories <- select(
  df,
  section, title, abstract, url, byline, published_date
)
# Discard the first two rows by position.
top_stories <- top_stories[-(1:2), ]
glimpse(top_stories)
## Rows: 25
## Columns: 6
## $ section        <chr> "health", "health", "travel", "magazine", "health", "he…
## $ title          <chr> "CUNY Halts Investigation of Alzheimer’s Researcher", "…
## $ abstract       <chr> "Citing questions about the integrity of the process, t…
## $ url            <chr> "https://www.nytimes.com/2023/10/28/health/cassava-cuny…
## $ byline         <chr> "By Apoorva Mandavilli", "By Apoorva Mandavilli", "By S…
## $ published_date <chr> "2023-10-28T09:40:45-04:00", "2023-10-27T15:10:55-04:00…

Clean and tidy byline column

# Drop the leading word of the byline (e.g. "By") and trim surrounding
# white space, leaving only the author name(s).
top_stories$byline <- str_trim(str_extract(top_stories$byline, "\\s\\D*"))

# One row per author. Bylines may list authors as "A and B" or "A, B and C",
# so split on commas as well as on " and "; splitting on " and " alone would
# leave "A, B" as a single author.
top_stories <- top_stories %>%
  separate_longer_delim(
    cols = byline,
    delim = stringr::regex(",\\s+and\\s+|\\s+and\\s+|,\\s+")
  )

# Split each author into first name and last name. Extra words are merged
# into the last name; a one-word name fills the first name and leaves the
# last name NA instead of raising an error.
top_stories <- top_stories %>%
  separate_wider_delim(
    cols = byline,
    delim = " ",
    names = c("author_fname", "author_lname"),
    too_many = "merge",
    too_few = "align_start"
  )

Clean published date column

# Convert the ISO 8601 timestamp strings (e.g. "2023-10-28T09:40:45-04:00")
# to Date. as.Date() reads the leading "YYYY-MM-DD" part and ignores the
# trailing time and offset, so the result is already a Date and no second
# parsing step is needed.
top_stories$published_date <- as.Date(top_stories$published_date)

View final data frame

# Preview the first rows of the tidied data frame.
top_stories %>% head()

Data Visualization: The number of top-story articles from each section of the newspaper

# Bar chart of the number of articles per section.
# Authors were split into separate rows earlier, so de-duplicate on
# (section, title) to count each article once.
top_stories %>%
  select(section, title) %>%
  distinct() %>%
  ggplot(aes(x = section, fill = section)) +
  geom_bar() +
  # after_stat(count) replaces the `..count..` notation, which was deprecated
  # in ggplot2 3.4.0.
  geom_text(
    aes(label = after_stat(count)),
    stat = "count",
    vjust = 3,
    color = "white"
  ) +
  ggtitle("Top Stories By Section") +
  xlab("Section") +
  theme(
    legend.position = "none",
    axis.title.x = element_text(color = "black", size = 10),
    axis.text.x = element_text(size = 8),
    plot.title = element_text(color = "black", size = 16)
  )