In this assignment, I retrieve data through the Article Search API from The New York Times and transform the received JSON data into an R data frame.
Load libraries
library(jsonlite)
library(stringr)
Set up the base API URL and the API Key
# API credential for the NYT Article Search API.
# Prefer the NYT_API_KEY environment variable so the key does not have to be
# committed with the source; the literal below is only a fallback that keeps
# the document reproducible when the variable is unset or empty.
# NOTE(review): the fallback key is visible in this file - consider rotating it.
API_Key <- Sys.getenv("NYT_API_KEY", unset = "")
if (!nzchar(API_Key)) {
  API_Key <- "yNSSsAiYxKYx45RBVqbwSEs1UWfpTeWh"
}

# Endpoint of the Article Search API; query parameters are appended after "?".
base_url <- "https://api.nytimes.com/svc/search/v2/articlesearch.json?"
Set up a function to validate a date in the format “mm/dd/yyyy” and transform it into the format “yyyymmdd” that can be used for the API
# Validate a date given as month, day, year and convert it to "yyyymmdd".
#
# The three numbers are taken from the runs of digits found in `dateString`,
# so any separator works ("10/20/2019", "10-20-2019", ...).
#
# Arguments:
#   dateString  the date to check; expected order is month, day, year.
#               A two-digit year is expanded: values up to the current
#               two-digit year belong to the current century, larger values
#               to the previous one.
#
# Returns:
#   A single "yyyymmdd" string, or NA when the input does not contain exactly
#   three numbers, the year is before 1851 or after the current year, or the
#   month/day combination does not exist in the calendar.
checkDate <- function(dateString) {
  text <- as.character(dateString)
  text <- text[!is.na(text)]
  if (length(text) == 0)
    return(NA)

  parts <- as.numeric(unlist(regmatches(text, gregexpr("[0-9]+", text))))
  if (length(parts) != 3)
    return(NA)

  month <- parts[1]
  day <- parts[2]
  year <- parts[3]

  currentYear <- as.numeric(format(Sys.Date(), "%Y"))

  # Expand a two-digit year around the current year.
  if (year < 100) {
    centuryStart <- (currentYear %/% 100) * 100
    if (year <= currentYear %% 100) {
      year <- centuryStart + year
    } else {
      year <- centuryStart - 100 + year
    }
  }

  # 1851 is the lower bound the original check enforces; future years are
  # rejected as well.
  if (year < 1851 || year > currentYear)
    return(NA)

  if (month < 1 || month > 12)
    return(NA)

  # Full Gregorian rule: century years are leap years only if divisible by 400.
  isLeapYear <- (year %% 4 == 0 && year %% 100 != 0) || year %% 400 == 0
  daysInMonth <- c(31, if (isLeapYear) 29 else 28, 31, 30, 31, 30,
                   31, 31, 30, 31, 30, 31)
  if (day < 1 || day > daysInMonth[month])
    return(NA)

  sprintf("%d%02d%02d", year, month, day)
}
Define the valid inputs for some of the API arguments. These items will be used for input validation
# Whitelists used by getQueryURL() to validate its arguments.
# NA is part of every list so that an argument left unset passes validation.

# Accepted values for the "facet" switch.
facets <- c("true", "false", NA)

# Fields that may be requested through "facet_fields".
facetFields <- c(
  "day_of_week",
  "document_type",
  "ingredients",
  "news_desk",
  "pub_month",
  "pub_year",
  "section_name",
  "source",
  "subsection_name",
  "type_of_material",
  NA
)

# Accepted values for the "facet_filter" switch.
facetFilters <- c("true", "false", NA)

# Accepted values for the "sort" argument.
sortOptions <- c("newest", "oldest", "relevance", NA)
Set up a function that takes the arguments for the API, validates them, and returns a completed URL
# Build a complete Article Search request URL from validated arguments.
#
# Every optional argument may be NA, meaning "leave it out of the query".
# An optional argument that fails validation is reported with print() and
# dropped from the URL; the request is still built from the remaining ones.
#
# Arguments:
#   burl            base URL ending in "?", the query string is appended to it
#   apiKey          API key (required)
#   searchString    search term for the "q" parameter (required)
#   fromDate        begin date as month/day/year, checked with checkDate()
#   toDate          end date as month/day/year, checked with checkDate()
#   useFacet        TRUE/FALSE or "true"/"false" for the "facet" parameter
#   facetString     one of the values in `facetFields`
#   useFacetFilter  TRUE/FALSE or "true"/"false" for "facet_filter"
#   fieldList       value for "fl"; "" is treated as not supplied
#   filterQuery     value for "fq"; "" is treated as not supplied
#   pageNum         page number, must be within 0..100
#   sortOpt         one of the values in `sortOptions`
#
# Returns:
#   The URL-encoded request URL, or NA when the base URL, the API key or the
#   search string is missing.
getQueryURL <- function(burl, apiKey, searchString, fromDate, toDate,
                        useFacet, facetString, useFacetFilter, fieldList,
                        filterQuery, pageNum, sortOpt) {
  # Map a logical or textual switch to "true"/"false"; anything that is not
  # in `allowed` is reported and turned into NA so it never reaches the URL.
  normalizeFlag <- function(value, allowed, problem) {
    if (is.na(value)) return(NA)
    if (value == TRUE) return("true")
    if (value == FALSE) return("false")
    if (value %in% allowed) return(value)
    print(problem)
    NA
  }

  # Treat an empty string like a value that was not supplied.
  blankToNA <- function(value) {
    if (!is.na(value) && value == "") NA else value
  }

  # Append "&name=value" unless the value is NA.
  addParam <- function(url, name, value) {
    if (is.na(value)) url else paste0(url, "&", name, "=", value)
  }

  if (is.na(burl)) {
    print("missing base URL")
    return(NA)
  }
  if (is.na(apiKey)) {
    print("missing API Key")
    return(NA)
  }
  if (is.na(searchString)) {
    print("missing query item")
    return(NA)
  }

  if (!is.na(fromDate)) {
    fromDate <- checkDate(fromDate)
    if (is.na(fromDate))
      print("the begin date is invalid, it will be excluded from query")
  }
  if (!is.na(toDate)) {
    toDate <- checkDate(toDate)
    if (is.na(toDate))
      print("the end date is invalid, it will be excluded from query")
  }
  # "yyyymmdd" strings compare correctly once converted to numbers.
  if (!is.na(fromDate) && !is.na(toDate) &&
      as.numeric(fromDate) > as.numeric(toDate)) {
    fromDate <- NA
    toDate <- NA
    print("begin date is after end date, they will be excluded from query")
  }

  useFacet <- normalizeFlag(useFacet, facets,
                            "invalid input for using facet or not")
  if (!facetString %in% facetFields) {
    facetString <- NA
    print("invalid facet field, it will be excluded from query")
  }
  useFacetFilter <- normalizeFlag(useFacetFilter, facetFilters,
                                  "invalid input for using facet filter or not")

  fieldList <- blankToNA(fieldList)
  filterQuery <- blankToNA(filterQuery)

  if (!is.na(pageNum) && (pageNum < 0 || pageNum > 100)) {
    pageNum <- NA
    print("invalid page number, it must be from 0 to 100")
  }
  if (!sortOpt %in% sortOptions) {
    sortOpt <- NA
    print("invalid sorting option, it will be excluded from query")
  }

  resultURL <- paste0(burl, "q=", searchString)
  resultURL <- addParam(resultURL, "begin_date", fromDate)
  resultURL <- addParam(resultURL, "end_date", toDate)
  resultURL <- addParam(resultURL, "facet", useFacet)
  resultURL <- addParam(resultURL, "facet_fields", facetString)
  resultURL <- addParam(resultURL, "facet_filter", useFacetFilter)
  resultURL <- addParam(resultURL, "fl", fieldList)
  resultURL <- addParam(resultURL, "fq", filterQuery)
  resultURL <- addParam(resultURL, "page", pageNum)
  resultURL <- addParam(resultURL, "sort", sortOpt)

  # Encode characters such as spaces; reserved characters like "&" and "="
  # are left alone so the parameter structure survives.
  URLencode(paste0(resultURL, "&api-key=", apiKey))
}
Set up a function that takes a completed API URL. The JSON file that is received from the URL is then transformed into a data frame. This function returns the raw data frame and a simplified data frame as a list. If the query doesn’t find any item, it returns NA.
# Download the JSON behind a request URL and turn it into data frames.
#
# Arguments:
#   result_URL  a complete request URL, e.g. the value returned by
#               getQueryURL(); NA is accepted and reported.
#
# Returns:
#   An unnamed list of two elements: [[1]] the documents exactly as parsed by
#   jsonlite::fromJSON(), [[2]] a simplified data frame with the columns
#   "Head Line", "URL", "Source", "Author", "Publication Date" and
#   "Publication Time". NA is returned when no usable URL is supplied or the
#   response contains no documents.
getQueryData <- function(result_URL) {
  # getQueryURL() signals failure with NA; do not pass that on to fromJSON().
  if (length(result_URL) != 1 || is.na(result_URL)) {
    print("no valid query URL is supplied")
    return(NA)
  }

  result <- jsonlite::fromJSON(result_URL)
  result_Documents <- result$response$docs
  if (length(result_Documents) == 0) {
    print("no article is found")
    return(NA)
  }

  articleCount <- NROW(result_Documents)
  # A field missing from the response comes back as NULL; data.frame() would
  # silently drop it and the column renaming below would then fail.
  fieldOrNA <- function(values) {
    if (is.null(values)) rep(NA_character_, articleCount) else values
  }

  simplifiedTable <- data.frame(
    fieldOrNA(result_Documents$headline$main),
    fieldOrNA(result_Documents$web_url),
    fieldOrNA(result_Documents$source),
    fieldOrNA(result_Documents$byline$original),
    fieldOrNA(result_Documents$pub_date),
    stringsAsFactors = FALSE
  )
  colnames(simplifiedTable) <- c("Head Line", "URL", "Source", "Author",
                                 "Publication Date")

  # Split the timestamp into a time column and a date column. The time is
  # extracted first because the date extraction overwrites the source column.
  simplifiedTable$`Publication Time` <-
    str_extract(simplifiedTable$`Publication Date`, "\\d+:\\d+:\\d+")
  simplifiedTable$`Publication Date` <-
    str_extract(simplifiedTable$`Publication Date`, "\\d+-\\d+-\\d+")

  # Strip only a leading "By " so the same letters inside a name are kept.
  simplifiedTable$Author <- str_remove(simplifiedTable$Author, "^By ")

  list(result_Documents, simplifiedTable)
}
Let's try a simple query using the following arguments
# The only argument that carries a value in this example is the search term.
search_item <- "data science"

# All optional arguments stay unset (NA), so none of them is added to the URL.
start_date <- end_date <- NA
facet <- facet_field <- facet_filter <- NA
field_list <- filter_query <- NA
page_number <- sort_option <- NA
Produce the API URL
# Assemble the request URL from the example arguments; each value is matched
# to its parameter by name.
resURL <- getQueryURL(
  burl = base_url,
  apiKey = API_Key,
  searchString = search_item,
  fromDate = start_date,
  toDate = end_date,
  useFacet = facet,
  facetString = facet_field,
  useFacetFilter = facet_filter,
  fieldList = field_list,
  filterQuery = filter_query,
  pageNum = page_number,
  sortOpt = sort_option
)
# Show the resulting URL.
resURL
## [1] "https://api.nytimes.com/svc/search/v2/articlesearch.json?q=data%20science&api-key=yNSSsAiYxKYx45RBVqbwSEs1UWfpTeWh"
Get the result data frames and have a look at the simplified data frame
# Run the query; the result holds the raw documents first and the simplified
# data frame second.
queryResult <- getQueryData(result_URL = resURL)
# Show the simplified data frame (kept inside its one-element list).
queryResult[2]
## [[1]]
## Head Line
## 1 Ahead of 2020, Facebook Falls Short on Plan to Share Data on Disinformation
## 2 Many Genes Influence Same-Sex Sexuality, Not a Single ‘Gay Gene’
## 3 Hanging Out With Humans Makes This Bird Bad at Its Job
## 4 How to Stop the Abuse of Location Data
## 5 A Signal in Giant Earthquakes That Could Save Lives
## 6 Humans Dominated Earth Earlier Than Previously Thought
## 7 Novartis C.E.O. Defends Company’s Decision to Withhold False Data From the F.D.A.
## 8 This New Liquid Is Magnetic, and Mesmerizing
## 9 Birds Are Vanishing From North America
## 10 Nobel Prize in Physics Awarded for Studies of Earth’s Place in the Universe
## URL
## 1 https://www.nytimes.com/2019/09/29/technology/facebook-disinformation.html
## 2 https://www.nytimes.com/2019/08/29/science/gay-gene-sex.html
## 3 https://www.nytimes.com/2019/08/29/science/weka-birds-seeds-new-zealand.html
## 4 https://www.nytimes.com/2019/10/16/opinion/foursquare-privacy-internet.html
## 5 https://www.nytimes.com/2019/05/29/science/earthquakes-detection-richter-scale.html
## 6 https://www.nytimes.com/2019/08/29/science/archaeology-earth-anthropocene.html
## 7 https://www.nytimes.com/2019/08/07/health/novartis-fda-gene-therapy.html
## 8 https://www.nytimes.com/2019/07/18/science/liquid-magnet.html
## 9 https://www.nytimes.com/2019/09/19/science/bird-populations-america-canada.html
## 10 https://www.nytimes.com/2019/10/08/science/nobel-physics.html
## Source Author Publication Date
## 1 The New York Times Davey Alba 2019-09-29
## 2 The New York Times Pam Belluck 2019-08-29
## 3 The New York Times Cara Giaimo 2019-08-29
## 4 The New York Times Jeff Glueck 2019-10-16
## 5 The New York Times Robin George Andrews 2019-05-29
## 6 The New York Times James Gorman 2019-08-29
## 7 The New York Times Katie Thomas 2019-08-07
## 8 The New York Times Knvul Sheikh 2019-07-18
## 9 The New York Times Carl Zimmer 2019-09-19
## 10 The New York Times Kenneth Chang and Megan Specia 2019-10-08
## Publication Time
## 1 12:00:04
## 2 18:01:08
## 3 09:00:14
## 4 15:00:07
## 5 18:00:08
## 6 18:00:06
## 7 18:05:47
## 8 18:00:06
## 9 18:03:18
## 10 09:57:29
Write the simplified data frame into a csv file for later use
# Store the simplified data frame on disk, without a row-name column.
write.csv(
  x = queryResult[2],
  file = "simplifiedResult.csv",
  row.names = FALSE
)