JMSC 6116 Lecture 3: Learning R’s List, Loops, and User-defined Functions: Using NYT API

First of all, we install and load the three required libraries: RCurl, RJSONIO, and plotly.

# Install any missing package, then attach it. require() on its own only
# reports whether a package could be loaded, so a package that was installed a
# moment ago would otherwise stay unattached for the rest of the script.
for (pkg in c("RCurl", "RJSONIO", "plotly")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg, dependencies = TRUE)
  }
  library(pkg, character.only = TRUE)
}

Please register and obtain your own API key here http://developer.nytimes.com/ and put it into the following code.

#### NYT API key: replace the placeholder below with your own key.
#### The request URLs built later in this script append this value as `api-key`.
api <- "YOUR KEY HERE"

Let’s do a trial run to search “Hong Kong” in the New York Times 2020 archive. The class of the variable ‘testing’ is a list.

# Trial request: count the articles matching 'Hong Kong' published during 2020.
year <- 2020
search_q <- URLencode("'Hong Kong'")
# Assemble the request URL; the date window runs from 1 January to 31 December
# of `year`, and the key stored in `api` is appended last.
url <- paste0(
  "http://api.nytimes.com/svc/search/v2/articlesearch.json?q=", search_q,
  "&begin_date=", year, "0101&end_date=", year, "1231&api-key=", api
)
testing <- fromJSON(getURL(url))  ### Download the JSON reply and parse it into an R object
class(testing)

## [1] "list"

str(testing[["response"]][["meta"]])  # Compactly display the structure of the response metadata

##  Named num [1:3] 1832 0 75
##  - attr(*, "names")= chr [1:3] "hits" "offset" "time"

print(testing[["response"]][["meta"]]["hits"])  ### Display the number of hits for the queried year (2020)

## hits 
## 1832

Sys.sleep(time = 5)  ### Pause for 5 seconds before further requests to stay within the API call-rate limit

Next, define your variables: the search term search_q, the range of years year_range, and an empty data frame nyt_ch for storing the results.

# Search term (URL-encoded), the years to query, and an empty results table.
search_q <- URLencode("'China'")
year_range <- 1990:2020
# `year` is declared as integer to match the values the loop appends from
# year_range; declaring it as character would disagree with the filled table.
nyt_ch <- data.frame(year = integer(0), hits = numeric(0))

Getting each year’s article count via a for loop

# Query the Article Search API once for each year in year_range and append the
# yearly hit count for the current search term to nyt_ch.
for (year in year_range) {
  url <- paste0(
    "http://api.nytimes.com/svc/search/v2/articlesearch.json?q=", search_q,
    "&begin_date=", year, "0101&end_date=", year, "1231&api-key=", api
  )
  nyt_robj <- fromJSON(getURL(url))  ## Convert from JSON to R object
  ## [["hits"]] extracts the bare count; ["hits"] would keep the element name,
  ## which data.frame() then picks up as a row name.
  hits <- nyt_robj$response$meta[["hits"]]
  if (is.null(hits)) {
    ## Fail with a clear message when the reply carries no hit count, rather
    ## than failing later or ending up with an incomplete table.
    stop("No hit count in the API response for year ", year,
         "; check the API key and the request rate.", call. = FALSE)
  }
  new_data <- data.frame(year = year, hits = hits)
  nyt_ch <- rbind(nyt_ch, new_data)  ## Add a new row to nyt_ch
  print(paste(year, hits, sep = ":"))  ### Print the new row contents
  Sys.sleep(5)  ### Pause for 5 seconds before the next request
}

## [1] "1990:2188"
## [1] "1991:2080"
## [1] "1992:2426"
## [1] "1993:2593"
## [1] "1994:2837"
## [1] "1995:2938"
## [1] "1996:3190"
## [1] "1997:3327"
## [1] "1998:3475"
## [1] "1999:3716"
## [1] "2000:4000"
## [1] "2001:4759"
## [1] "2002:3804"
## [1] "2003:4642"
## [1] "2004:4487"
## [1] "2005:6987"
## [1] "2006:11630"
## [1] "2007:8990"
## [1] "2008:12193"
## [1] "2009:7997"
## [1] "2010:6112"
## [1] "2011:5604"
## [1] "2012:6351"
## [1] "2013:5482"
## [1] "2014:6282"
## [1] "2015:5776"
## [1] "2016:4702"
## [1] "2017:4767"
## [1] "2018:5854"
## [1] "2019:5684"
## [1] "2020:6493"

We then repeat the search for “Taiwan”. The data are stored in the variable nyt_tw.

# Repeat the yearly search for 'Taiwan', collecting the results in nyt_tw.
search_q <- URLencode("'Taiwan'")
# `year` is declared as integer to match the values appended from year_range.
nyt_tw <- data.frame(year = integer(0), hits = numeric(0))

for (year in year_range) {
  url <- paste0(
    "http://api.nytimes.com/svc/search/v2/articlesearch.json?q=", search_q,
    "&begin_date=", year, "0101&end_date=", year, "1231&api-key=", api
  )
  nyt_robj <- fromJSON(getURL(url))  ## Convert from JSON to R object
  ## [["hits"]] extracts the bare count; ["hits"] would keep the element name,
  ## which data.frame() then picks up as a row name.
  hits <- nyt_robj$response$meta[["hits"]]
  if (is.null(hits)) {
    ## Fail with a clear message when the reply carries no hit count, rather
    ## than failing later or ending up with an incomplete table.
    stop("No hit count in the API response for year ", year,
         "; check the API key and the request rate.", call. = FALSE)
  }
  new_data <- data.frame(year = year, hits = hits)
  nyt_tw <- rbind(nyt_tw, new_data)  ## Add a new row to nyt_tw
  print(paste(year, hits, sep = ":"))  ### Print the new row contents
  Sys.sleep(5)  ### Pause for 5 seconds before the next request
}

## [1] "1990:389"
## [1] "1991:382"
## [1] "1992:531"
## [1] "1993:534"
## [1] "1994:503"
## [1] "1995:649"
## [1] "1996:855"
## [1] "1997:640"
## [1] "1998:551"
## [1] "1999:845"
## [1] "2000:927"
## [1] "2001:1196"
## [1] "2002:623"
## [1] "2003:752"
## [1] "2004:624"
## [1] "2005:976"
## [1] "2006:1238"
## [1] "2007:821"
## [1] "2008:1345"
## [1] "2009:858"
## [1] "2010:508"
## [1] "2011:373"
## [1] "2012:461"
## [1] "2013:422"
## [1] "2014:428"
## [1] "2015:416"
## [1] "2016:434"
## [1] "2017:425"
## [1] "2018:422"
## [1] "2019:440"
## [1] "2020:699"

A function named SearchNYT is defined to search for a keyword in the NYT archive, so there is no need to duplicate the code again.

#' Count New York Times articles matching a query, year by year
#'
#' For every year in `yr`, sends one Article Search request covering
#' 1 January to 31 December of that year and records the number of hits
#' reported in the response metadata. The API key is read from the global
#' variable `api`. The function pauses 5 seconds after each request.
#'
#' @param search_sq Search query string; it is passed through URLencode()
#'   before being placed in the request URL.
#' @param yr Vector of years to query, one request per element.
#' @return A data frame with the columns `year` and `hits`, one row per
#'   element of `yr`; a zero-row data frame when `yr` is empty.
SearchNYT <- function(search_sq, yr) {
  sq <- URLencode(search_sq)
  # Numeric `year` column so the empty result has the same kind of column as
  # a filled one (the original declared it as character).
  nyt <- data.frame(year = numeric(0), hits = numeric(0))
  for (year in yr) {
    url <- paste0(
      "http://api.nytimes.com/svc/search/v2/articlesearch.json?q=", sq,
      "&begin_date=", year, "0101&end_date=", year, "1231&api-key=", api
    )
    nyt_robj <- fromJSON(getURL(url))  ## Convert from JSON to R object
    ## [["hits"]] extracts the bare count; ["hits"] would keep the element
    ## name, which data.frame() then picks up as a row name.
    hits <- nyt_robj$response$meta[["hits"]]
    if (is.null(hits)) {
      ## Fail with a clear message when the reply carries no hit count, rather
      ## than failing later or returning an incomplete table.
      stop("No hit count in the API response for year ", year,
           "; check the API key and the request rate.", call. = FALSE)
    }
    nyt <- rbind(nyt, data.frame(year = year, hits = hits))  ## Add a new row
    Sys.sleep(5)  ### Pause for 5 seconds before the next request
  }
  nyt
}

We search for “India,” “Germany,” “South Korea,” and “Hong Kong” by calling the defined function SearchNYT.

# Yearly hit counts for four more places, one SearchNYT() call per place.
nyt_in <- SearchNYT(search_sq = "'India'", yr = year_range)
nyt_ge <- SearchNYT(search_sq = "'Germany'", yr = year_range)
nyt_sk <- SearchNYT(search_sq = "'South Korea'", yr = year_range)
nyt_hk <- SearchNYT(search_sq = "'Hong Kong'", yr = year_range)

Lastly, all six time series are plotted together as a line chart.

# Line chart of yearly hit counts: China is the base trace and every further
# place is added as its own trace on the same axes.
p <- plot_ly(nyt_ch, x = ~year, y = ~hits, name = "China", type = "scatter", mode = "lines")
p <- add_trace(p, data = nyt_tw, y = ~hits, name = "Taiwan")
p <- add_trace(p, data = nyt_in, y = ~hits, name = "India")
p <- add_trace(p, data = nyt_ge, y = ~hits, name = "Germany")
p <- add_trace(p, data = nyt_sk, y = ~hits, name = "South Korea")
p <- add_trace(p, data = nyt_hk, y = ~hits, name = "Hong Kong")
# layout() returns a modified copy of the plot. Store it back into p so that
# any later use of p (such as uploading it) keeps the title and axis labels.
p <- layout(
  p,
  title = "Places mentioned in New York Times (1990 to 2020)",
  xaxis = list(title = "Year"),
  yaxis = list(title = "Number of hits")
)
p  # Display the finished chart

Finally, we upload the plot to the plotly server. You should first create an account here (https://plot.ly/) and obtain the username and the API key.

#### Publish the chart on the plotly server
# Both credentials are supplied through environment variables; replace the two
# placeholders with your own account name and API key before running.
Sys.setenv(
  "plotly_username" = "YOUR PLOTLY ACCOUNT NAME",
  "plotly_api_key" = "YOUR PLOTLY KEY HERE"
)
api_create(p, filename = "lecture3")

#
# NYT search for each place combined with the keyword "virus"
# (query form: '<place>' AND virus), one SearchNYT() call per place.
#
nyt_hk1 <- SearchNYT(search_sq = "'Hong Kong' AND virus", yr = year_range)
nyt_tw1 <- SearchNYT(search_sq = "'Taiwan' AND virus", yr = year_range)
nyt_in1 <- SearchNYT(search_sq = "'India' AND virus", yr = year_range)
nyt_ge1 <- SearchNYT(search_sq = "'Germany' AND virus", yr = year_range)
nyt_sk1 <- SearchNYT(search_sq = "'South Korea' AND virus", yr = year_range)

# Line chart of the "<place> AND virus" yearly hit counts. Every legend entry
# is just the place name: the chart title already states that all series are
# "AND virus" searches, so labelling only one trace that way would mislead.
p1 <- plot_ly(nyt_hk1, x = ~year, y = ~hits, name = "Hong Kong", type = "scatter", mode = "lines")
p1 <- add_trace(p1, data = nyt_tw1, y = ~hits, name = "Taiwan")
p1 <- add_trace(p1, data = nyt_in1, y = ~hits, name = "India")
p1 <- add_trace(p1, data = nyt_ge1, y = ~hits, name = "Germany")
p1 <- add_trace(p1, data = nyt_sk1, y = ~hits, name = "South Korea")
# layout() returns a modified copy of the plot. Store it back into p1 so the
# title and axis labels stay attached to the object.
p1 <- layout(
  p1,
  title = "Mentioning 'HK/TW/SK/IN/DE AND virus' in New York Times (1990 to 2020)",
  xaxis = list(title = "Year"),
  yaxis = list(title = "Number of hits")
)
p1  # Display the finished chart

JMSC 6116 Lecture 3: Learning R’s List, Loops, and User-defined Functions: Using NYT API

King-wa Fu

February 5, 2021