Our goal here is to access the archive of New York Times articles through its search API.
There are several R packages for this in the R community. The first is called rtimes, and its main function is to “search and retrieve data from the New York Times congress API”. Another package for interacting with the NYT API is nytimes by Mike Kearney. With nytimes, you also get access to the Article Search API.
But, actually, we can interact with the NYT API directly through the jsonlite package. Most of the code here is adapted from: http://www.storybench.org/working-with-the-new-york-times-api-in-r/
First, we sign up for an API key (https://developer.nytimes.com/get-started) and save the key as shown below. We can use this key to make requests to the NYT API, then save the search results into an R data frame, which we will analyze and visualize.
api = "paste_your_api_key_here"
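Before building the full search function, we can check that the key works with a single request. Below is a minimal sketch, not part of the original tutorial; the query string "test" and the object names are just placeholders.
#optional: send one request to confirm that the API key works
library(jsonlite)
test_url = paste0("http://api.nytimes.com/svc/search/v2/articlesearch.json?q=test&api-key=", api)
test_result = fromJSON(test_url, flatten = TRUE)
test_result$response$meta$hits #number of matching articles; an error here usually means an invalid key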
In this function, we first build the search URL from the search keyword and the time period. We convert the search results into an R object with the function fromJSON from the jsonlite package, and we use a for loop to collect more results, because each page contains only 10 results.
if (!require("jsonlite")) install.packages("jsonlite")
library(jsonlite)
#################################################################################
#### function - search news article with API ####
nytime = function (keyword, year) {
  searchQ = URLencode(keyword)
  url = paste('http://api.nytimes.com/svc/search/v2/articlesearch.json?q=', searchQ,
              '&begin_date=', year, '0101&end_date=', year, '1231&api-key=', api, sep = "")
  #get the total number of search results
  initialsearch = fromJSON(url, flatten = T)
  maxPages = round((initialsearch$response$meta$hits / 10) - 1)
  #try with the max page limit at 10
  maxPages = ifelse(maxPages >= 10, 10, maxPages)
  #create an empty data frame
  df = data.frame(id = numeric(), created_time = character(), snippet = character(),
                  headline = character())
  #save search results into the data frame
  for (i in 0:maxPages) {
    #get the search results of each page
    nytSearch = fromJSON(paste0(url, "&page=", i), flatten = T)
    temp = data.frame(id = 1:nrow(nytSearch$response$docs),
                      created_time = nytSearch$response$docs$pub_date,
                      snippet = nytSearch$response$docs$snippet,
                      headline = nytSearch$response$docs$headline.main)
    df = rbind(df, temp)
    Sys.sleep(5) #sleep for 5 seconds to stay within the API rate limit
  }
  return(df)
}
#################################################################################
We use the nytime function to search for NYT articles containing the keywords "donald trump" and "xi jinping" in 2018.
dt = nytime('donald trump',2018)
xi = nytime('xi jinping',2018)
write.csv(dt, "NYT news_donald trump.csv")
write.csv(xi, "NYT news_xi jinping.csv")
For those who don't have a NYT API key, the two data files (NYT news_donald trump.csv and NYT news_xi jinping.csv) can be downloaded directly and read in:
dt = read.csv("NYT news_donald trump.csv",header=T,stringsAsFactors = F)
xi = read.csv("NYT news_xi jinping.csv",header=T,stringsAsFactors = F)
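A quick look at what we have loaded, as a small check that is not in the original code; the column names come from the nytime function above.
#check the structure of the imported data frame
str(dt)
head(dt$headline)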
Then, we define an R function, called preprocessing, to prepare the text for analysis. The preprocessing follows a standard text-cleaning process: replace non-alphanumeric characters, remove punctuation and numbers, convert all characters to lower case, and remove stopwords. The documents are converted into a corpus before cleaning.
###################################################################################
#### function - preprocessing of text ####
###################################################################################
#load library
if (!require("tm")) install.packages("tm")
library(tm)
#preprocessing function for text
preprocessing = function (doc){
  #replace non-alphanumeric characters with spaces
  doc = gsub("[^[:alnum:]]", " ", doc)
  #create corpus
  corpus = Corpus(VectorSource(doc))
  #Removal of punctuation
  corpus = tm_map(corpus, removePunctuation)
  #Removal of numbers
  corpus = tm_map(corpus, removeNumbers)
  #Conversion to lower case
  corpus = tm_map(corpus, content_transformer(tolower))
  #customize my stopwords
  mystopword = "Here’s what you need to know to start your day"
  #Removal of stopwords
  corpus = tm_map(corpus, removeWords, c(stopwords("english"), mystopword))
  #return result
  return(corpus)
}
###################################################################################
Apply the preprocessing function to our text documents.
xi.clean = preprocessing(xi$snippet)
dt.clean = preprocessing(dt$snippet)
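To confirm the cleaning worked, we can peek at a few cleaned documents; this small check is not in the original code.
#inspect the first three cleaned snippets
inspect(xi.clean[1:3])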
Now, we visualize the two corpora with the wordcloud package.
if (!require("wordcloud")) install.packages("wordcloud")
library(wordcloud)
par(mfrow=c(1,2)) # 1x2 panel plot
par(mar=c(1, 3, 1, 3)) # Set the plot margin
par(bg="black") # set background color as black
par(col.main="white") # set title color as white
wordcloud(xi.clean, scale=c(4,.5),min.freq=3, max.words=Inf, random.order=F,
colors = brewer.pal(8, "Set3"))
title("News report of Xi Jinping")
wordcloud(dt.clean, scale=c(4,.5),min.freq=3, max.words=Inf, random.order=F,
colors = brewer.pal(8, "Set3"))
title("News report of Donald Trump")
dev.off()
## null device
## 1
Next, we plot the frequencies of the top keywords with the barplot function.
#Operations on Term-Document Matrices
xi.dtm = DocumentTermMatrix(xi.clean)
#we can find those terms that occur at least 3 times
findFreqTerms(xi.dtm, 3)
## [1] "country" "life" "new" "president"
## [5] "program" "china" "communist" "party"
## [9] "economy" "may" "trade" "war"
## [13] "chinese" "leaders" "trump" "day"
## [17] "know" "need" "start" "national"
## [21] "said" "state" "week" "can"
## [25] "global" "order" "world" "economic"
## [29] "internet" "leader" "security" "end"
## [33] "high" "limits" "plan" "presidential"
## [37] "term" "power" "political" "values"
## [41] "become" "little" "moves" "government"
## [45] "now" "back" "forum" "growing"
## [49] "states" "tariffs" "united" "last"
## [53] "first" "meeting" "risk" "beijing"
## [57] "even" "washington" "jinping" "control"
## [61] "years" "yet" "truce" "america"
## [65] "growth" "officials" "will" "home"
## [69] "left" "human" "rights" "asia"
## [73] "issue" "meant" "move" "group"
## [77] "made" "two" "american" "meet"
## [81] "guessing" "say" "sea"
#same for the dt document
dt.dtm = DocumentTermMatrix(dt.clean)
findFreqTerms(dt.dtm, 3)
## [1] "president" "two" "years" "get"
## [5] "trump" "donald" "american" "new"
## [9] "know" "york" "can" "readers"
## [13] "global" "father" "today" "year"
## [17] "need" "investigation" "old" "white"
## [21] "everybody" "hands" "mood" "shake"
## [25] "sometimes" "office" "republicans" "son"
## [29] "republican" "speaks" "day" "start"
## [33] "said" "like" "governor"
Plot the term frequencies.
#plot frequency
par(mfrow=c(1,2))
freq = sort(colSums(as.matrix(xi.dtm)), decreasing=TRUE)
barplot(freq[1:9],col=rev(brewer.pal(9, "PuRd")), horiz=TRUE)
title("News about Xi")
#for dt data
freq = sort(colSums(as.matrix(dt.dtm)), decreasing=TRUE)
barplot(freq[1:9], col=rev(brewer.pal(9, "GnBu")), horiz=TRUE)
title("News about Trump")
dev.off()
## null device
## 1
#colors of the barplot are from the RColorBrewer package
RColorBrewer::display.brewer.all()
The NYT API limits how much data can be retrieved per search, usually capped at 100 pages (1,000 results). Here is the code to access the maximum number of results from the NYT database.
nyt_search = function (keyword, year){
  searchQ = URLencode(keyword)
  url = paste('http://api.nytimes.com/svc/search/v2/articlesearch.json?q=', searchQ,
              '&begin_date=', year, '0101&end_date=', year, '1231&api-key=', api, sep = "")
  #convert json to R object
  initialQuery = fromJSON(url, flatten = TRUE)
  maxPages = round((initialQuery$response$meta$hits / 10) - 1)
  #the Article Search API allows at most 100 pages (0 to 99)
  maxPages = ifelse(maxPages >= 99, 99, maxPages)
  #download all the data and transform it into an R object
  df = data.frame(created_time = character(), snippet = character(), headline = character())
  for (i in 0:maxPages){
    nytSearch <- fromJSON(paste0(url, "&page=", i))
    temp = data.frame(created_time = nytSearch$response$docs$pub_date,
                      snippet = nytSearch$response$docs$snippet,
                      headline = nytSearch$response$docs$headline$main,
                      stringsAsFactors = F)
    df = rbind(df, temp)
    Sys.sleep(5) #sleep for 5 seconds
  }
  return(df)
}
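A usage sketch: the object name dt_full and the output file name are just examples. With the 100-page cap and the 5-second pause between requests, a single query can take more than eight minutes.
#example: retrieve up to 1,000 results for the keyword in 2018
dt_full = nyt_search('donald trump', 2018)
write.csv(dt_full, "NYT news_donald trump_full.csv", row.names = FALSE)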