Data_607_Week_9_Assignment

In this assignment, I retrieve data through the Article Search API from the New York Times and transform the received JSON data into an R data frame.

Load libraries

library(jsonlite)
library(stringr)

Set up the base API URL and the API Key

# NOTE(review): a live API key is committed in this document. Supply the key
# through the NYT_API_KEY environment variable (e.g. in ~/.Renviron) and rotate
# the literal below; it is kept only as a fallback so the document still runs
# unchanged when the variable is not set.
API_Key <- Sys.getenv("NYT_API_KEY")
if (!nzchar(API_Key)) {
  API_Key <- "yNSSsAiYxKYx45RBVqbwSEs1UWfpTeWh"
}

# Endpoint of the Article Search API; query parameters are appended after "?".
base_url <- "https://api.nytimes.com/svc/search/v2/articlesearch.json?"

Set up a function to validate a date in the format “mm/dd/yyyy” and transform it to the format “yyyymmdd” that can be used for the API

#' Validate a month/day/year date string and convert it to "yyyymmdd"
#'
#' Every run of digits in `dateString` is read as a number, so any non-digit
#' character works as a separator. Exactly three numbers are required, in the
#' order month, day, year.
#'
#' A year below 100 is treated as a two-digit year: it is placed in the current
#' century unless that would lie in the future, in which case the previous
#' century is used.
#'
#' @param dateString A date such as "10/26/2019" or "2/5/19".
#' @return A character string in "yyyymmdd" format, or `NA` when the input is
#'   missing, does not contain exactly three numbers, or is not a real calendar
#'   date between 1851 and the current year.
checkDate <- function(dateString) {
  # A missing input cannot be parsed.
  if (length(dateString) == 0 || anyNA(dateString)) {
    return(NA)
  }

  text <- as.character(dateString)
  parts <- as.numeric(unlist(regmatches(text, gregexpr("[0-9]+", text))))
  if (length(parts) != 3) {
    return(NA)
  }

  month <- parts[1]
  day <- parts[2]
  year <- parts[3]
  currentYear <- as.numeric(format(Sys.Date(), "%Y"))

  # Expand a two-digit year without ever producing a year in the future.
  if (year < 100) {
    year <- year + (currentYear %/% 100) * 100
    if (year > currentYear) {
      year <- year - 100
    }
  }

  # 1851 is the earliest year accepted; later years are capped at today.
  if (year < 1851 || year > currentYear) {
    return(NA)
  }
  if (month < 1 || month > 12) {
    return(NA)
  }

  # Full Gregorian rule: century years are leap years only if divisible by 400.
  isLeapYear <- (year %% 4 == 0 && year %% 100 != 0) || year %% 400 == 0
  daysInMonth <- c(31, if (isLeapYear) 29 else 28, 31, 30, 31, 30,
                   31, 31, 30, 31, 30, 31)
  if (day < 1 || day > daysInMonth[month]) {
    return(NA)
  }

  sprintf("%d%02d%02d", as.integer(year), as.integer(month), as.integer(day))
}

Define the valid inputs for some of the API arguments. These items will be used for input validation

# Accepted values for the validated API arguments. Each set also contains NA,
# which stands for "argument not supplied".

# Whether facet counts are requested.
facets <- c("true", "false", NA_character_)

# Fields that facet counts can be requested for.
facetFields <- c(
  "day_of_week",
  "document_type",
  "ingredients",
  "news_desk",
  "pub_month",
  "pub_year",
  "section_name",
  "source",
  "subsection_name",
  "type_of_material",
  NA_character_
)

# Whether the facet filter is requested.
facetFilters <- c("true", "false", NA_character_)

# Supported orderings of the result list.
sortOptions <- c("newest", "oldest", "relevance", NA_character_)

Set up a function that takes the arguments for the API, validates them, and returns a completed URL

#' Build a validated Article Search request URL
#'
#' Pass `NA` for any optional argument that should be left out. An optional
#' argument that fails validation is reported with `print()` and dropped from
#' the URL instead of aborting the call.
#'
#' Relies on `checkDate()` and on the vectors `facets`, `facetFields`,
#' `facetFilters` and `sortOptions` defined elsewhere in this file.
#'
#' @param burl Base URL ending in "?".
#' @param apiKey API key; required.
#' @param searchString Search term; required.
#' @param fromDate,toDate Date bounds in the format accepted by `checkDate()`.
#' @param useFacet,useFacetFilter `TRUE`/`FALSE` or "true"/"false".
#' @param facetString One of `facetFields`.
#' @param fieldList Value for the `fl` parameter; "" counts as not supplied.
#' @param filterQuery Value for the `fq` parameter; "" counts as not supplied.
#' @param pageNum Page number from 0 to 100.
#' @param sortOpt One of `sortOptions`.
#' @return The URL-encoded request URL, or `NA` when the base URL, the API key
#'   or the search term is missing.
getQueryURL <- function(burl, apiKey, searchString, fromDate, toDate,
                        useFacet, facetString, useFacetFilter, fieldList,
                        filterQuery, pageNum, sortOpt) {
  # Convert TRUE/FALSE (or their string forms) to the API's "true"/"false";
  # any other value outside `allowed` is reported and dropped.
  normalizeFlag <- function(flag, allowed, problem) {
    if (is.na(flag)) {
      return(NA)
    }
    if (flag == TRUE) {
      return("true")
    }
    if (flag == FALSE) {
      return("false")
    }
    if (flag %in% allowed) {
      return(flag)
    }
    print(problem)
    NA
  }

  if (is.na(burl)) {
    print("missing base URL")
    return(NA)
  }

  if (is.na(apiKey)) {
    print("missing API Key")
    return(NA)
  }

  if (is.na(searchString)) {
    print("missing query item")
    return(NA)
  }

  if (!is.na(fromDate)) {
    fromDate <- checkDate(fromDate)
    if (is.na(fromDate)) {
      print("the begin date is invalid, it will be excluded from query")
    }
  }

  if (!is.na(toDate)) {
    toDate <- checkDate(toDate)
    if (is.na(toDate)) {
      print("the end date is invalid, it will be excluded from query")
    }
  }

  # Both dates are "yyyymmdd" strings here, so numeric order is date order.
  if (!is.na(fromDate) && !is.na(toDate) &&
        as.numeric(fromDate) > as.numeric(toDate)) {
    fromDate <- NA
    toDate <- NA
    print("begin date is after end date, they will be excluded from query")
  }

  useFacet <- normalizeFlag(useFacet, facets,
                            "invalid input for using facet or not")

  if (!facetString %in% facetFields) {
    facetString <- NA
    print("invalid facet field, it will be excluded from query")
  }

  useFacetFilter <- normalizeFlag(useFacetFilter, facetFilters,
                                  "invalid input for using facet filter or not")

  # An empty string is treated the same as "not supplied".
  if (!is.na(fieldList) && fieldList == "") {
    fieldList <- NA
  }

  if (!is.na(filterQuery) && filterQuery == "") {
    filterQuery <- NA
  }

  if (!is.na(pageNum) && (pageNum < 0 || pageNum > 100)) {
    pageNum <- NA
    print("invalid page number, it must be from 0 to 100")
  }

  if (!sortOpt %in% sortOptions) {
    sortOpt <- NA
    print("invalid sorting option, it will be excluded from query")
  }

  # Optional parameters in the order they appear in the URL; NA entries are
  # the ones that were not supplied or were rejected above.
  optional <- c(
    begin_date = fromDate,
    end_date = toDate,
    facet = useFacet,
    facet_fields = facetString,
    facet_filter = useFacetFilter,
    fl = fieldList,
    fq = filterQuery,
    page = pageNum,
    sort = sortOpt
  )
  optional <- optional[!is.na(optional)]

  resultURL <- paste0(burl, "q=", searchString)
  if (length(optional) > 0) {
    resultURL <- paste0(
      resultURL,
      paste0("&", names(optional), "=", optional, collapse = "")
    )
  }

  utils::URLencode(paste0(resultURL, "&api-key=", apiKey))
}

Set up a function that takes a completed API URL. The JSON file that is received from the URL is then transformed into a data frame. This function returns the raw data frame and a simplified data frame as a list. If the query doesn’t find any item, it returns NA.

#' Fetch a query result and reshape it into data frames
#'
#' @param result_URL A completed request URL (anything `jsonlite::fromJSON()`
#'   can read).
#' @return `NA` when the response holds no documents; otherwise an unnamed
#'   list whose first element is the raw `response$docs` data and whose second
#'   element is a simplified data frame with the columns "Head Line", "URL",
#'   "Source", "Author", "Publication Date" and "Publication Time".
getQueryData <- function(result_URL) {
  result <- jsonlite::fromJSON(result_URL)
  result_Documents <- result$response$docs
  if (length(result_Documents) == 0) {
    print("no article is found")
    return(NA)
  }

  docCount <- NROW(result_Documents)

  # A field absent from every document comes back as NULL; substitute NAs so
  # the simplified table always has the same columns.
  orMissing <- function(column) {
    if (is.null(column)) rep(NA_character_, docCount) else column
  }

  # First match of `pattern` in each element, NA where there is none.
  extractFirst <- function(values, pattern) {
    values <- as.character(values)
    found <- rep(NA_character_, length(values))
    hits <- regexpr(pattern, values)
    matched <- !is.na(hits) & hits > 0
    found[matched] <- regmatches(values, hits)
    found
  }

  simplifiedTable <- data.frame(
    `Head Line` = orMissing(result_Documents$headline$main),
    URL = orMissing(result_Documents$web_url),
    Source = orMissing(result_Documents$source),
    Author = orMissing(result_Documents$byline$original),
    `Publication Date` = orMissing(result_Documents$pub_date),
    check.names = FALSE,
    stringsAsFactors = FALSE
  )

  # Split the timestamp into a time column and a date column. The time must be
  # taken first because the date column is overwritten in place.
  timestamp <- simplifiedTable$`Publication Date`
  simplifiedTable$`Publication Time` <- extractFirst(timestamp, "\\d+:\\d+:\\d+")
  simplifiedTable$`Publication Date` <- extractFirst(timestamp, "\\d+-\\d+-\\d+")

  # Drop only a leading "By "; the same text inside a byline is kept.
  simplifiedTable$Author <- sub("^By ", "", as.character(simplifiedTable$Author))

  list(result_Documents, simplifiedTable)
}

Let's try a simple query by using the following arguments

# Only the search term is supplied; NA marks every optional argument as unused.
search_item <- "data science"

# No date range.
start_date <- end_date <- NA

# No facet options.
facet <- facet_field <- facet_filter <- NA

# No field list or filter query.
field_list <- filter_query <- NA

# Default page and default ordering.
page_number <- sort_option <- NA

Produce the API URL

# Build the request URL from the arguments above and display it.
resURL <- getQueryURL(
  burl = base_url,
  apiKey = API_Key,
  searchString = search_item,
  fromDate = start_date,
  toDate = end_date,
  useFacet = facet,
  facetString = facet_field,
  useFacetFilter = facet_filter,
  fieldList = field_list,
  filterQuery = filter_query,
  pageNum = page_number,
  sortOpt = sort_option
)
resURL

## [1] "https://api.nytimes.com/svc/search/v2/articlesearch.json?q=data%20science&api-key=yNSSsAiYxKYx45RBVqbwSEs1UWfpTeWh"

Get the result data frames and have a look at the simplified data frame

# Fetch the result for the URL built above.
queryResult <- getQueryData(resURL)
# Single-bracket indexing keeps the list wrapper, so this prints a one-element
# list holding the second element of the result.
queryResult[2]

## [[1]]
##                                                                            Head Line
## 1        Ahead of 2020, Facebook Falls Short on Plan to Share Data on Disinformation
## 2                   Many Genes Influence Same-Sex Sexuality, Not a Single ‘Gay Gene’
## 3                             Hanging Out With Humans Makes This Bird Bad at Its Job
## 4                                             How to Stop the Abuse of Location Data
## 5                                A Signal in Giant Earthquakes That Could Save Lives
## 6                             Humans Dominated Earth Earlier Than Previously Thought
## 7  Novartis C.E.O. Defends Company’s Decision to Withhold False Data From the F.D.A.
## 8                                       This New Liquid Is Magnetic, and Mesmerizing
## 9                                             Birds Are Vanishing From North America
## 10       Nobel Prize in Physics Awarded for Studies of Earth’s Place in the Universe
##                                                                                    URL
## 1           https://www.nytimes.com/2019/09/29/technology/facebook-disinformation.html
## 2                         https://www.nytimes.com/2019/08/29/science/gay-gene-sex.html
## 3         https://www.nytimes.com/2019/08/29/science/weka-birds-seeds-new-zealand.html
## 4          https://www.nytimes.com/2019/10/16/opinion/foursquare-privacy-internet.html
## 5  https://www.nytimes.com/2019/05/29/science/earthquakes-detection-richter-scale.html
## 6       https://www.nytimes.com/2019/08/29/science/archaeology-earth-anthropocene.html
## 7             https://www.nytimes.com/2019/08/07/health/novartis-fda-gene-therapy.html
## 8                        https://www.nytimes.com/2019/07/18/science/liquid-magnet.html
## 9      https://www.nytimes.com/2019/09/19/science/bird-populations-america-canada.html
## 10                       https://www.nytimes.com/2019/10/08/science/nobel-physics.html
##                Source                         Author Publication Date
## 1  The New York Times                     Davey Alba       2019-09-29
## 2  The New York Times                    Pam Belluck       2019-08-29
## 3  The New York Times                    Cara Giaimo       2019-08-29
## 4  The New York Times                    Jeff Glueck       2019-10-16
## 5  The New York Times           Robin George Andrews       2019-05-29
## 6  The New York Times                   James Gorman       2019-08-29
## 7  The New York Times                   Katie Thomas       2019-08-07
## 8  The New York Times                   Knvul Sheikh       2019-07-18
## 9  The New York Times                    Carl Zimmer       2019-09-19
## 10 The New York Times Kenneth Chang and Megan Specia       2019-10-08
##    Publication Time
## 1          12:00:04
## 2          18:01:08
## 3          09:00:14
## 4          15:00:07
## 5          18:00:08
## 6          18:00:06
## 7          18:05:47
## 8          18:00:06
## 9          18:03:18
## 10         09:57:29

Write the simplified data frame into a csv file for later use

# Save the second element of the result to "simplifiedResult.csv" in the
# working directory, without a row-name column.
write.csv(queryResult[2], file = "simplifiedResult.csv",row.names=FALSE)

Data_607_Week_9_Assignment

Euclid Zhang

10/26/2019