Assignment – Web APIs

The New York Times web site provides a rich set of APIs, as described here: https://developer.nytimes.com/apis. You’ll need to start by signing up for an API key. Your task is to choose one of the New York Times APIs, construct an interface in R to read in the JSON data, and transform it into an R DataFrame.
For this assignment I will be using the Top Stories API and focusing on the World section.

Libraries

# install packages if you don't have them
#install.packages(c("tidyverse", "jsonlite", "httr"))


# load libraries
library("tidyverse")
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.4     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   2.0.1     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library("jsonlite")
## 
## Attaching package: 'jsonlite'
## The following object is masked from 'package:purrr':
## 
##     flatten
library("httr")
library("stringr")
library("ggplot2")
library("dplyr")

Connection to API

# API key: read from the NYT_API_KEY environment variable (for example set
# in ~/.Renviron) so the credential is not stored in the source file.
apikey <- Sys.getenv("NYT_API_KEY")
if (!nzchar(apikey)) {
  stop(
    "Set the NYT_API_KEY environment variable to your NYT API key.",
    call. = FALSE
  )
}

# Build the request URL. paste0() is used because paste() separates its
# arguments with a space, which would put a space in front of the key.
theURL <- paste0(
  "https://api.nytimes.com/svc/topstories/v2/world.json?api-key=",
  apikey
)

# Request the top stories for the World section and stop on an HTTP error,
# so a failed request is not parsed as if it were article data.
topstories <- GET(theURL)
stop_for_status(topstories)

# HTTP status code of the response
status_code(topstories)
## [1] 200
# Overview of the components of the response object
summary(topstories)
##             Length Class       Mode       
## url              1 -none-      character  
## status_code      1 -none-      numeric    
## headers         19 insensitive list       
## all_headers      1 -none-      list       
## cookies          7 data.frame  list       
## content     107041 -none-      raw        
## date             1 POSIXct     numeric    
## times            6 -none-      numeric    
## request          7 request     list       
## handle           1 curl_handle externalptr
# Extract the response body as a single JSON string. The encoding is given
# explicitly so httr does not have to fall back to a default (and print a
# "No encoding supplied" message).
top_stories <- content(topstories, as = "text", encoding = "UTF-8")

Data Frame Conversion

# Option 1: parse the JSON text and coerce the entire parsed result
# (response metadata plus the results table) into one data frame.
top_storiesdf <- as.data.frame(fromJSON(top_stories))

# List the resulting column names
names(top_storiesdf)
##  [1] "status"                      "copyright"                  
##  [3] "section"                     "last_updated"               
##  [5] "num_results"                 "results.section"            
##  [7] "results.subsection"          "results.title"              
##  [9] "results.abstract"            "results.url"                
## [11] "results.uri"                 "results.byline"             
## [13] "results.item_type"           "results.updated_date"       
## [15] "results.created_date"        "results.published_date"     
## [17] "results.material_type_facet" "results.kicker"             
## [19] "results.des_facet"           "results.org_facet"          
## [21] "results.per_facet"           "results.geo_facet"          
## [23] "results.multimedia"          "results.short_url"
# Option 2: parse with flatten = TRUE and keep only the `results` element,
# which holds one row per article.
top_storiesdf2 <- data.frame(
  fromJSON(top_stories, flatten = TRUE)[["results"]],
  stringsAsFactors = FALSE
)

# List the resulting column names
names(top_storiesdf2)
##  [1] "section"             "subsection"          "title"              
##  [4] "abstract"            "url"                 "uri"                
##  [7] "byline"              "item_type"           "updated_date"       
## [10] "created_date"        "published_date"      "material_type_facet"
## [13] "kicker"              "des_facet"           "org_facet"          
## [16] "per_facet"           "geo_facet"           "multimedia"         
## [19] "short_url"
# Rename columns by name rather than by position, so a change in the order
# or number of fields returned by the API cannot attach a label to the
# wrong column. Columns without an entry here keep their original name.
column_labels <- c(
  section = "Section",
  subsection = "Subsection",
  title = "Title",
  abstract = "Abstract",
  url = "URL",
  uri = "URI",
  byline = "Byline",
  item_type = "Item_Type",
  updated_date = "Updated_Date",
  created_date = "Created_Date",
  published_date = "Published_Date",
  material_type_facet = "Material_Type_Facet",
  kicker = "Kicker",
  des_facet = "Des_Facet",
  org_facet = "Org_Facet",
  per_facet = "Per_Facet",
  geo_facet = "Geo_Facet",
  multimedia = "Multimedia",
  short_url = "Short_Url"
)
current_names <- colnames(top_storiesdf2)
has_label <- current_names %in% names(column_labels)
colnames(top_storiesdf2)[has_label] <-
  unname(column_labels[current_names[has_label]])

# Drop the columns not needed for the analysis, again selected by name.
unused_columns <- c(
  "Material_Type_Facet", "Kicker", "Des_Facet", "Org_Facet",
  "Per_Facet", "Geo_Facet", "Multimedia"
)
top_storiesdf3 <- top_storiesdf2[
  , !(colnames(top_storiesdf2) %in% unused_columns),
  drop = FALSE
]

Data Analysis

# Number of articles per section, most frequent first
Section <- top_storiesdf3 %>%
  group_by(Section) %>%
  tally(sort = TRUE, name = "num")
head(Section)
## # A tibble: 6 × 2
##   Section      num
##   <chr>      <int>
## 1 world         32
## 2 briefing       1
## 3 climate        1
## 4 insider        1
## 5 style          1
## 6 technology     1
# Number of articles per subsection, most frequent first
Subsection <- top_storiesdf3 %>%
  group_by(Subsection) %>%
  tally(sort = TRUE, name = "num")
head(Subsection)
## # A tibble: 6 × 2
##   Subsection     num
##   <chr>        <int>
## 1 ""              13
## 2 "asia"           8
## 3 "europe"         6
## 4 "americas"       4
## 5 "africa"         2
## 6 "middleeast"     2

Plots

The bar plot below shows that the large majority of the articles (32) belong to the world section; each of the other sections, such as briefing and climate, contributes only a single article.

# Bar chart of the number of articles in each section
ggplot(data = top_storiesdf3, mapping = aes(x = Section)) +
  geom_bar()

In the next bar plot, the group with the highest count is an unlabeled subsection (articles whose subsection is an empty string), followed by Asia and Europe.

# Bar chart of the number of articles in each subsection
ggplot(data = top_storiesdf3, mapping = aes(x = Subsection)) +
  geom_bar()