Objective:

The New York Times web site provides a rich set of APIs, as described here: https://developer.nytimes.com/apis You’ll need to start by signing up for an API key. Your task is to choose one of the New York Times APIs, construct an interface in R to read in the JSON data, and transform it into an R DataFrame.

Solution:

The solution I chose to implement was to create a database that would hold and store articles from the NY Times RSS feed in the sections of business, economy, jobs and technology. Which will then later be used to do natural language processing on by extracting each articles data.

ui <- shinyUI(fluidPage(
  
  textInput(inputId = 'username', label = 'Database User Name', value = ""),
  textInput(inputId = 'pw', label = 'Datebase Password', value = ""),
  textInput(inputId = 'times_un', label = 'Ny Times User Name', value = ""),
  textInput(inputId = 'times_pw', label = 'Ny Times Password', value = ""),
  actionButton('save_inputs', 'Save inputs')
  
)) 

server <-  shinyServer(function(input, output,session) {
  
  
  observeEvent(input$save_inputs, {
    
    credentials <- list()
    
    credentials <<- list(user_name= input$username, pw = input$pw,api_key = input$apikey,secret_key =input$secret_key)
    
    stopApp(returnValue = invisible())
    
  }) 
  
})

shinyApp(ui = ui, server=server)
## 
## Listening on http://127.0.0.1:3919

plot of chunk unnamed-chunk-1

Establish database connection

Using RMySQL to establish a connection to the database.

library(RMySQL)
db_con <- dbConnect(MySQL(),user=credentials$user_name,password=credentials$pw)

Create Datebase and table

Query statemetns to create the NYTIMES database and the Articles_Meta table, which will hold the links, title, author, and the date the article was published.

create_database_statement<- 'CREATE DATABASE IF NOT EXISTS NYTIMES;'
create_database <- dbSendQuery(db_con, create_database_statement)
use_database <- dbSendQuery(db_con, 'USE NYTIMES;')

create_query <- 'Create Table IF NOT EXISTS Articles_Meta(
                    Links varchar(300) NOT NULL,
                    Title varchar(300) NOT NULL,
                    Description varchar(500),
                    Created_By varchar(100),
                    Published Date,
                    primary key (Links,Title)
                  );'

create_table <- dbSendQuery(db_con, create_query)

Using the NYTIMES API

Here I use the NYTIMES API to extract the article’s meta data. The data wasn’t nice and neat. There were a lot of empty nodes that had to be taken out and the publishing dates had to be cleaned and arranged so that MySQL would accept the dates as a date type. I used dbWriteTable to insert the data into the database.

library(httr)
library(jsonlite)
library(XML)
library(xml2)
library(tidyverse)
library(lubridate)
library(rvest)

sections <- c('business','economy','jobs','technology')
for(s in 1:length(sections)){
  
  link <- paste0('https://api.nytimes.com/services/xml/rss/nyt/',sections[s],'.xml',collapse = '' )
  doc <- GET(link)
  xmlfile<-xmlTreeParse(doc)
  xmltop = xmlRoot(xmlfile)
  
  for( i in 1:length(xmltop[["channel"]][["item"]])){
    
    art_link <- xmltop[["channel"]][[i]][["link"]][['text']]
    art_link <- paste0(xmltop[["channel"]][[i]][["link"]][["text"]],collapse=' ')
    art_link <- gsub('text NULL NULL NULL NULL','',art_link)
    art_link <- gsub('"','',art_link)
    art_link <- trimws(art_link,'l')
    
    art_title <- xmltop[["channel"]][[i]][["title"]][["text"]]
    art_title <- paste0(xmltop[["channel"]][[i]][["title"]][["text"]],collapse=' ')
    art_title <- gsub('text NULL NULL NULL NULL','',art_title)
    art_title <- gsub('"','',art_title)
    art_title <- trimws(art_title,'l')
    
    art_descr <- xmltop[["channel"]][[i]][["description"]][["text"]]
    art_descr<- paste0(xmltop[["channel"]][[i]][["description"]][["text"]],collapse=' ')
    art_descr <- gsub('text NULL NULL NULL NULL','',art_descr)
    art_descr <- gsub('"','',art_descr)
    art_descr <- trimws(art_descr,'l')
    
    art_creat <- xmltop[["channel"]][[i]][["creator"]][["text"]]
    art_creat <- paste0(xmltop[["channel"]][[i]][["creator"]][["text"]],collapse=' ')
    art_creat <- gsub('text NULL NULL NULL NULL','',art_creat)
    art_creat <- gsub('"','',art_creat)
    art_creat <- trimws(art_creat,'l')
    
    art_pub <- paste0(xmltop[["channel"]][[i]][["pubDate"]][["text"]],collapse=' ')
    art_pub <- gsub('text NULL NULL NULL NULL',' ',art_pub)
    art_pub <- str_replace(art_pub, '\\+0000', '')
    art_pub <- str_replace(art_pub, '\\s+', '')
    art_pub <- str_replace(art_pub, '[:upper:][:lower:][:lower:], ', '')
    time <- str_replace(art_pub,'\\d{2}:\\d{2}:\\d{2}','')
    art_pub <- trimws(art_pub,'r')
    art_pub <- gsub(' ','/',art_pub)
    art_day <- str_extract(art_pub,"(\\d{2})")
    art_month <- str_extract(art_pub,'([:upper:][:lower:][:lower:])')
    art_month <- as.character(match(art_month,month.abb))
    art_year <- str_extract(art_pub,'(\\d{4})')
    art_pub <- as.Date(paste0(art_year,'-',art_month,'-',art_day),format='%Y-%m-%d')
    
    if(art_descr != ""){

      df <- data.frame("Links"= paste0("'",art_link,"'",collapse=''),
                     'Title'= paste0("'",art_title,"'",collapse=''),
                     'Description'=paste0("'",art_descr,"'",collapse=''),
                     'Created_By'=paste0("'",art_creat,"'",collapse=''),
                     'Published'= art_pub)
      dbWriteTable(db_con, name='Articles_Meta', value=df , append=T, row.names=F, overwrite=F);

    }
  }
}
## No encoding supplied: defaulting to UTF-8.
## No encoding supplied: defaulting to UTF-8.
## No encoding supplied: defaulting to UTF-8.
## No encoding supplied: defaulting to UTF-8.

Close database connection

R is a memory hog, so to avoid issues with results sets and too many connections open, I close the connections to clear the result sets and free up connections.

lapply(dbListConnections( dbDriver( drv = "MySQL")), dbDisconnect)
## [[1]]
## [1] TRUE
## 
## [[2]]
## [1] TRUE

Reconnect to the database and get Articles_Meta table.

Reestablishing the connection I set the database that I want to query and send a query. The Articles_Meta holds 47 records with attributes of links, title, an article description, author and date published.

library(DT)
db_con <- dbConnect(MySQL(),user=credentials$user_name,password=credentials$pw)
use_database <- dbSendQuery(db_con, 'USE NYTIMES;')

get_am_table_query <- 'SELECT * FROM NYTIMES.Articles_Meta;'
get_am_table <- dbGetQuery(db_con, get_am_table_query)
datatable(get_am_table,extensions='Scroller',options=list(scrollX=T,scrollY=250,scroller=TRUE))

plot of chunk unnamed-chunk-6