Assignment

The New York Times web site provides a rich set of APIs, as described here: https://developer.nytimes.com/apis

You’ll need to start by signing up for an API key. Your task is to choose one of the New York Times APIs, construct an interface in R to read in the JSON data, and transform it into an R DataFrame.

Libraries

library(jsonlite)
library(tidyverse)
library(httr)

Connecting to the API

This analysis uses the Top Stories API. Of the many sections it offers, the Science section is selected. Documentation: https://developer.nytimes.com/docs/top-stories-product/1/overview

# API key
# Read the key from the environment instead of committing it to the file.
# Set it once with, e.g., a line `NYT_API_KEY=<your key>` in ~/.Renviron.
apikey <- Sys.getenv("NYT_API_KEY")
if (!nzchar(apikey)) {
  stop(
    "Set the NYT_API_KEY environment variable to your NYT API key.",
    call. = FALSE
  )
}

# URL of the Top Stories endpoint for the Science section
url <- "https://api.nytimes.com/svc/topstories/v2/science.json?api-key="

# GET the section feed and stop early on a non-success HTTP status, so a bad
# key or a rate limit is reported here rather than as a JSON parsing error.
resp <- GET(paste0(url, apikey))
stop_for_status(resp)
resp
## Response [https://api.nytimes.com/svc/topstories/v2/science.json?api-key=<REDACTED>]
##   Date: 2023-11-05 18:55
##   Status: 200
##   Content-Type: application/json
##   Size: 51.9 kB
# List the components of the response object returned by GET()
summary(resp)
##             Length Class       Mode       
## url             1  -none-      character  
## status_code     1  -none-      numeric    
## headers        22  insensitive list       
## all_headers     1  -none-      list       
## cookies         7  data.frame  list       
## content     51887  -none-      raw        
## date            1  POSIXct     numeric    
## times           6  -none-      numeric    
## request         7  request     list       
## handle          1  curl_handle externalptr
# Extract the response body as text. The encoding is passed explicitly so that
# httr does not have to guess it (and no longer prints its
# "No encoding supplied: defaulting to UTF-8." message).
text_content <- content(resp, as = "text", encoding = "UTF-8")

From JSON data to an R dataframe

# Parse the JSON text into an R object
data <- fromJSON(txt = text_content)
# Coerce the parsed object to a data frame
df <- as.data.frame(x = data)

# Compact overview of every column
df |> glimpse()
## Rows: 27
## Columns: 24
## $ status                      <chr> "OK", "OK", "OK", "OK", "OK", "OK", "OK", …
## $ copyright                   <chr> "Copyright (c) 2023 The New York Times Com…
## $ section                     <chr> "Science", "Science", "Science", "Science"…
## $ last_updated                <chr> "2023-11-03T07:19:33-04:00", "2023-11-03T0…
## $ num_results                 <int> 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27…
## $ results.section             <chr> "science", "admin", "science", "health", "…
## $ results.subsection          <chr> "", "", "", "", "space", "", "", "", "", "…
## $ results.title               <chr> "Sign Up for the Science Times Newsletter"…
## $ results.abstract            <chr> "Every week, we’ll bring you stories that …
## $ results.url                 <chr> "null", "", "https://www.nytimes.com/2023/…
## $ results.uri                 <chr> "nyt://embeddedinteractive/daba9d03-29b8-5…
## $ results.byline              <chr> "", "", "By Orlando Mayorquin", "By Clay R…
## $ results.item_type           <chr> "EmbeddedInteractive", "EmbeddedInteractiv…
## $ results.updated_date        <chr> "2018-04-07T13:23:25-04:00", "2015-07-20T1…
## $ results.created_date        <chr> "2016-02-05T18:18:53-05:00", "2015-04-16T1…
## $ results.published_date      <chr> "2016-02-05T18:18:53-05:00", "2015-04-16T1…
## $ results.material_type_facet <chr> "", "", "", "", "", "", "", "", "", "", ""…
## $ results.kicker              <chr> "", "", "", "", "", "", "", "", "", "", ""…
## $ results.des_facet           <list> <>, <>, <"Reptiles", "Research", "Fish an…
## $ results.org_facet           <list> <>, <>, <>, <"Florida International Unive…
## $ results.per_facet           <list> <>, <>, <>, "Pelham, William E (1948- )",…
## $ results.geo_facet           <list> <>, <>, <>, <>, <>, <>, <>, <>, "United A…
## $ results.multimedia          <list> [<data.frame[3 x 8]>], <NULL>, [<data.fra…
## $ results.short_url           <chr> "", "", "", "", "", "", "", "", "", "", ""…

Tidy the Data

# Keep the title, section and publication date, renaming them on the way
df2 <- df |>
  select(
    title = results.title,
    result_section = results.section,
    published_date = results.published_date
  ) |>
  # Drop entries without a title by value rather than by row position, so the
  # step keeps working when the feed returns its items in a different order.
  filter(!is.na(title), title != "")

Which result section is most common among the Science top stories?

Health-related articles are the most common (11 of the 26 remaining stories), followed by science (8) and climate (5).

# Number of articles in each result_section
df3 <- df2 |>
  group_by(result_section) |>
  summarise(n = n(), .groups = "drop")
df3
## # A tibble: 5 × 2
##   result_section     n
##   <chr>          <int>
## 1 climate            5
## 2 health            11
## 3 magazine           1
## 4 science            8
## 5 travel             1
# Bar chart of the section counts, ordered from most to least frequent
df3 |>
  ggplot(aes(x = reorder(result_section, -n), y = n, fill = result_section)) +
  geom_col() +
  labs(
    title = "Common article section under Science Category",
    x = "Result section",
    y = "Count"
  ) +
  theme_minimal()