GitHub Repository

library(tidyverse)
library(nytimes)
library(httr)
library(jsonlite)

“Manual” API Call using the httr and jsonlite packages.

# Retrieve API key from environment
api_key <- Sys.getenv("NYTIMES_API_KEY")
# Sys.getenv() returns "" for an unset variable; fail early with a clear
# message instead of sending a request without a key
if (!nzchar(api_key)) {
  stop("Environment variable NYTIMES_API_KEY is not set.", call. = FALSE)
}

# Save the Top Sports Stories URL with the API Key included
url <- paste0("https://api.nytimes.com/svc/topstories/v2/sports.json?api-key=", 
              api_key)

# Fetch data
response <- GET(url)
# Check response type
http_type(response)
# Stop with the HTTP status message if the request failed, rather than
# trying to parse an error payload as if it were results
stop_for_status(response)
# Read the body as text (explicit encoding avoids httr guessing it) and
# parse the JSON into R objects
data <- fromJSON(content(response, as = "text", encoding = "UTF-8"))

# Create sample dataframe of results
articles <- data$results %>%
  select(title, abstract, url, published_date)

NYTimes Package

Another way to retrieve New York Times data is through the nytimes R Package. The code below uses the function ny_archive() to retrieve articles published in July 2024.

# Download the July 2024 article list with ny_archive()
archive <- ny_archive(2024, 7)

# Uncomment to inspect the structure of the first five articles
# str(archive[1:5])
# Show how the keywords of the first article are structured
archive[[1]]$keywords |> str()

# Base R check: does "ESPN" appear among the first article's keywords?
grepl("ESPN", archive[[1]]$keywords) |> any()

# The same check with stringr, after flattening the keywords to a vector
archive[[1]]$keywords |>
  unlist() |>
  str_detect("ESPN") |>
  any()

# Report whether any keyword of a single article mentions "ESPN".
#
# Arguments:
#   article: one element of the archive list; its `keywords` element (if
#            present) is flattened to a character vector before matching.
# Returns:
#   TRUE if any keyword contains "ESPN" (case-insensitive), otherwise FALSE.
#   Always a single TRUE/FALSE, never NA.
contains_espn_test <- function(article) {
  # `[[` matches the name exactly; `$` would partial-match on lists
  keywords <- unlist(article[["keywords"]])
  # grepl() yields FALSE for NA and logical(0) for NULL input, so missing
  # keywords and NA entries both fall through to FALSE without a special case
  any(grepl("ESPN", keywords, ignore.case = TRUE))
}
# Try the helper on the first article of the archive
archive[[1]] |> contains_espn_test()
# Flag where "ESPN" is mentioned in one article of an archive list.
#
# Arguments:
#   article_index: position of the article within `articles_list`.
#   articles_list: list of articles; each article is read for its
#                  `news_desk`, `headline$main`, `keywords` and `snippet`.
# Returns:
#   A one-row data.frame with columns article_index, news_desk,
#   keyword_espn, headline_espn, snippet_espn and any_espn (TRUE when any
#   of the three fields mentions "espn", case-insensitive).
contains_espn <- function(article_index, articles_list) {
  # the single article being inspected
  article <- articles_list[[article_index]]

  # data.frame() errors on a zero-length column, so an article without a
  # news desk is recorded as NA instead of aborting the whole month
  news_desk <- article$news_desk
  if (length(news_desk) == 0) {
    news_desk <- NA_character_
  }

  # Case-insensitive search for "espn" in a field. unlist() flattens nested
  # keyword lists into a plain vector so grepl() matches the actual strings;
  # a missing (NULL) field yields FALSE.
  mentions_espn <- function(field) {
    any(grepl("espn", unlist(field), ignore.case = TRUE))
  }

  keyword_espn <- mentions_espn(article[["keywords"]])
  headline_espn <- mentions_espn(article$headline$main)
  snippet_espn <- mentions_espn(article$snippet)

  data.frame(
    article_index = article_index,
    news_desk = news_desk,
    keyword_espn = keyword_espn,
    headline_espn = headline_espn,
    snippet_espn = snippet_espn,
    any_espn = keyword_espn || headline_espn || snippet_espn
  )
}
# End of the window: 3 months and 29 days before today.
# %m-% rolls back to the last valid day of the month, so subtracting months
# never produces NA (e.g. May 31 minus 3 months).
end_date <- today() %m-% months(3) - days(29)
# Start of the window: 1 year, 3 months and 29 days before today
start_date <- today() %m-% years(1) %m-% months(3) - days(29)
# One date per calendar month in the window. Stepping from the first of the
# month avoids seq() overflowing short months (Jan 30 + 1 month lands in
# March), which would skip February and request March twice. Only the year
# and month of each element are used when querying the archive.
# NOTE(review): `range` masks base::range() as a variable name; kept because
# later code refers to it.
range <- seq(floor_date(start_date, "month"),
             floor_date(end_date, "month"),
             by = "1 month")

# Summarise ESPN mentions among the articles ny_archive() returns for one month.
#
# Arguments:
#   year_month: a date inside the month of interest; its year and month are
#               extracted with year() and month().
# Returns:
#   A one-row data frame with columns n (number of articles), espn (number
#   of articles where ESPN appears in the headline, snippet or keywords),
#   year and month.
summarize_month <- function(year_month) {
  # year and month to request from the archive
  target_year <- year(year_month)
  target_month <- month(year_month)
  # all articles returned for that month
  monthly_articles <- ny_archive(target_year, target_month)

  seq_along(monthly_articles) |>
    # one row of ESPN flags per article
    map_df(\(idx) contains_espn(idx, monthly_articles)) |>
    # collapse to article totals
    summarise(n = n(), espn = sum(any_espn)) |>
    # record which month the totals belong to
    mutate(year = target_year, month = target_month)
}

# Build the monthly summary table by running summarize_month() on every
# date in `range` and row-binding the results
summary_2024_espn <- map_df(
  range,
  \(month_start) {
    monthly_counts <- summarize_month(month_start)
    # pause between requests so the API is not overwhelmed
    Sys.sleep(10)
    monthly_counts
  }
)
# Add a date column (1st of each month) to use as the x axis
plot_espn_data <- summary_2024_espn |> 
  mutate(date = make_date(year, month, 1))

# Build the title from the months actually plotted; the date range is
# computed relative to today, so a hard-coded period would go stale
plot_title <- paste0(
  "NYTimes Articles Mentioning ESPN (",
  format(min(plot_espn_data$date), "%b %Y"),
  " - ",
  format(max(plot_espn_data$date), "%b %Y"),
  ")"
)

# Create the plot: monthly count of ESPN-mentioning articles over time
ggplot(plot_espn_data, aes(x = date, y = espn)) +
  geom_line(color = "#E41A1C", linewidth = 1) +
  geom_point(color = "#E41A1C", size = 3) +
  # one labelled tick per month
  scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") +
  # integer breaks from 0 up to one above the largest monthly count
  scale_y_continuous(breaks = seq(0, max(plot_espn_data$espn) + 1)) +
  labs(title = plot_title,
       subtitle = "Count of articles containing 'ESPN' by month",
       x = "Month",
       y = "Number of Articles",
       caption = "Source: NYTimes Archive") +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    # slant the month labels so they do not overlap
    axis.text.x = element_text(angle = 45, hjust = 1),
    panel.grid.major = element_line(color = "grey90"),
    panel.grid.minor = element_blank()
  )