Search New York Times Articles Through the API

Our goal here is to access the archive of New York Times articles through its search API.

There are a few R packages for this in the R community. The first is called rtimes, and its main purpose is to “search and retrieve data from the New York Times congress API”. Another package for interacting with the NYT API is nytimes by Mike Kearney; with nytimes, you also get access to the Article Search API.

But we can actually interact with the NYT API directly through the jsonlite package. Most of the code here is adapted from: http://www.storybench.org/working-with-the-new-york-times-api-in-r/

First, we sign up for an API key (https://developer.nytimes.com/get-started) and save the key in the chunk below. We use this key to make requests to the NYT API, then save the search results into an R data frame, which we will analyze and visualize.

api = "paste_your_api_key_here"
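Alternatively, to keep the key out of the script, you can store it in an environment variable and read it with Sys.getenv (the variable name NYT_KEY here is arbitrary):

#optional: set NYT_KEY in your ~/.Renviron, then restart R
api = Sys.getenv("NYT_KEY")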

Define the function to search NYT

In this function, we first build the search URL from the search keyword and time period. We convert the search results into an R object using the fromJSON function from the jsonlite package. We use a for loop to collect more results, because each page contains only 10 results.

if (!require("jsonlite")) install.packages("jsonlite")
library(jsonlite)
#################################################################################
####            function - search news article with API                      ####
nytime = function (keyword,year) {
  searchQ = URLencode(keyword)
  url = paste('https://api.nytimes.com/svc/search/v2/articlesearch.json?q=',searchQ,
              '&begin_date=',year,'0101&end_date=',year,'1231&api-key=',api,sep="")
  #get the total number of search results
  initialsearch = fromJSON(url,flatten = T)
  maxPages = round((initialsearch$response$meta$hits / 10)-1)
  
  #cap the maximum number of pages at 10 for this demo
  maxPages = ifelse(maxPages >= 10, 10, maxPages)
  
  #create an empty data frame
  df = data.frame(id=as.numeric(),created_time=character(),snippet=character(),
                  headline=character())
  
  #save search results into data frame
  for(i in 0:maxPages){
    #get the search results of each page
    nytSearch = fromJSON(paste0(url, "&page=", i), flatten = T) 
    temp = data.frame(id=1:nrow(nytSearch$response$docs),
                      created_time = nytSearch$response$docs$pub_date,
                      snippet = nytSearch$response$docs$snippet,
                      headline = nytSearch$response$docs$headline.main)
    df=rbind(df,temp)
    Sys.sleep(5) #sleep for 5 seconds to stay within the API rate limit
  }
  return(df)
}
#################################################################################

Use the nytime function to search for NYT articles containing the keywords donald trump and xi jinping in 2018.

dt = nytime('donald trump',2018)
xi = nytime('xi jinping',2018)

write.csv(dt, "NYT news_donald trump.csv")
write.csv(xi, "NYT news_xi jinping.csv")

Those who do not have an NYT API key can download the data files directly (NYT news_donald trump.csv and NYT news_xi jinping.csv) and read them in:

dt = read.csv("NYT news_donald trump.csv",header=T,stringsAsFactors = F)
xi = read.csv("NYT news_xi jinping.csv",header=T,stringsAsFactors = F)
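Before cleaning, it is worth a quick sanity check of the imported data (base R only; the column names follow the data frames built above):

dim(dt)                 #number of articles and columns
head(dt$headline, 3)    #first few headlines
range(dt$created_time)  #publication date range covered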

Text Cleaning

Then we define an R function, preprocessing, to prepare the text for analysis. The preprocessing follows a standard text-cleaning pipeline: remove non-alphanumeric characters, remove punctuation, remove numbers, convert all characters to lower case, and remove stopwords. We also strip a recurring NYT briefing boilerplate phrase (“Here’s what you need to know to start your day”) that would otherwise dominate the word counts. The documents are converted to a corpus before cleaning.

###################################################################################
####               function - preprocessing of text                            ####
###################################################################################
#load library
if (!require("tm")) install.packages("tm")
library(tm)
#preprocessing function for text
preprocessing = function (doc){
  #remove the NYT briefing boilerplate before stripping punctuation,
  #so that the curly apostrophe in "Here’s" still matches
  doc = gsub("Here’s what you need to know to start your day"," ",doc,fixed=TRUE)
  #replace non-alphanumeric characters with spaces
  doc = gsub("[^[:alnum:]]"," ",doc)
  #create corpus
  corpus = Corpus(VectorSource(doc))
  #Removal of punctuation
  corpus = tm_map(corpus, removePunctuation)
  #Removal of numbers
  corpus = tm_map(corpus, removeNumbers)
  #Conversion to lower case
  corpus = tm_map(corpus, content_transformer(tolower)) 
  #Removal of standard English stopwords
  corpus = tm_map(corpus, removeWords, stopwords("english"))
  #return result
  return(corpus)
}
###################################################################################

Apply the preprocessing function to our text documents:

xi.clean = preprocessing(xi$snippet)
dt.clean = preprocessing(dt$snippet)
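To check that the cleaning worked, we can inspect a few documents of the cleaned corpus (inspect comes with the tm package):

inspect(xi.clean[1:2]) #show the first two cleaned snippets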

Now we visualize the two corpora with the wordcloud package.

if (!require("wordcloud")) install.packages("wordcloud")
library(wordcloud)

par(mfrow=c(1,2)) # 1x2 panel plot
par(mar=c(1, 3, 1, 3)) # Set the plot margin
par(bg="black") # set background color as black
par(col.main="white") # set title color as white
wordcloud(xi.clean, scale=c(4,.5),min.freq=3, max.words=Inf, random.order=F, 
          colors = brewer.pal(8, "Set3"))   
title("News report of Xi Jinping")
wordcloud(dt.clean, scale=c(4,.5),min.freq=3, max.words=Inf, random.order=F, 
          colors = brewer.pal(8, "Set3"))   
title("News report of Donald Trump")

dev.off()
## null device 
##           1

Next, we plot the frequencies of the top keywords with a barplot.

#Operations on Term-Document Matrices
xi.dtm = DocumentTermMatrix(xi.clean)
#we can find those terms that occur at least 3 times
findFreqTerms(xi.dtm, 3)
##  [1] "country"      "life"         "new"          "president"   
##  [5] "program"      "china"        "communist"    "party"       
##  [9] "economy"      "may"          "trade"        "war"         
## [13] "chinese"      "leaders"      "trump"        "day"         
## [17] "know"         "need"         "start"        "national"    
## [21] "said"         "state"        "week"         "can"         
## [25] "global"       "order"        "world"        "economic"    
## [29] "internet"     "leader"       "security"     "end"         
## [33] "high"         "limits"       "plan"         "presidential"
## [37] "term"         "power"        "political"    "values"      
## [41] "become"       "little"       "moves"        "government"  
## [45] "now"          "back"         "forum"        "growing"     
## [49] "states"       "tariffs"      "united"       "last"        
## [53] "first"        "meeting"      "risk"         "beijing"     
## [57] "even"         "washington"   "jinping"      "control"     
## [61] "years"        "yet"          "truce"        "america"     
## [65] "growth"       "officials"    "will"         "home"        
## [69] "left"         "human"        "rights"       "asia"        
## [73] "issue"        "meant"        "move"         "group"       
## [77] "made"         "two"          "american"     "meet"        
## [81] "guessing"     "say"          "sea"
#same for the dt document
dt.dtm = DocumentTermMatrix(dt.clean)
findFreqTerms(dt.dtm, 3)
##  [1] "president"     "two"           "years"         "get"          
##  [5] "trump"         "donald"        "american"      "new"          
##  [9] "know"          "york"          "can"           "readers"      
## [13] "global"        "father"        "today"         "year"         
## [17] "need"          "investigation" "old"           "white"        
## [21] "everybody"     "hands"         "mood"          "shake"        
## [25] "sometimes"     "office"        "republicans"   "son"          
## [29] "republican"    "speaks"        "day"           "start"        
## [33] "said"          "like"          "governor"
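Another operation on a term-document matrix is findAssocs from the tm package, which lists terms whose occurrence is correlated with a given term. The term "trade" and the 0.3 cutoff below are only illustrative:

#terms correlated with "trade" in the Xi corpus (correlation >= 0.3)
findAssocs(xi.dtm, "trade", 0.3)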

Plot frequency

#plot frequency
par(mfrow=c(1,2))
freq = sort(colSums(as.matrix(xi.dtm)), decreasing=TRUE) 
barplot(freq[1:9],col=rev(brewer.pal(9, "PuRd")), horiz=TRUE)
title("News about Xi")
#for dt data
freq = sort(colSums(as.matrix(dt.dtm)), decreasing=TRUE) 
barplot(freq[1:9], col=rev(brewer.pal(9, "GnBu")), horiz=TRUE)
title("News about Trump")

dev.off()
## null device 
##           1
#colors of the barplot are from the RColorBrewer package
RColorBrewer::display.brewer.all()

Advanced material

The NYT API limits access to its data, usually capping results at 100 pages per query. Here is the code to retrieve the maximum number of results from the NYT database.

nyt_search = function (keyword, year){
  searchQ = URLencode(keyword)
  url = paste('https://api.nytimes.com/svc/search/v2/articlesearch.json?q=',searchQ,
              '&begin_date=',year,'0101&end_date=',year,'1231&api-key=',api,sep="")
  ##convert json to R object
  initialQuery = fromJSON(url,flatten=TRUE)
  maxPages = round((initialQuery$response$meta$hits / 10)-1)
  maxPages = ifelse(maxPages >= 99, 99, maxPages)
  
  #download all the data and transform into R obj
  df = data.frame(created_time=character(),snippet=character(),headline=character())
  for(i in 0:maxPages){
    nytSearch <- fromJSON(paste0(url, "&page=", i))
    temp = data.frame(created_time = nytSearch$response$docs$pub_date,
                      snippet = nytSearch$response$docs$snippet,
                      headline = nytSearch$response$docs$headline$main,
                      stringsAsFactors = F)
    df = rbind(df,temp)
    Sys.sleep(5) #sleep for 5 seconds
  }
  return(df)
}
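As with nytime, the function takes a keyword and a year. A hypothetical call (not run here) would look like:

#up to 1,000 results (100 pages x 10 results) for one query
trade = nyt_search('trade war', 2018)
write.csv(trade, "NYT news_trade war.csv")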

References:

https://rpubs.com/hmgeiger/373949
http://www.storybench.org/working-with-the-new-york-times-api-in-r/