library(tidyverse)
library(nytimes)
library(httr)
library(jsonlite)
# The request below uses the httr and jsonlite packages.
# Retrieve API key from environment
api_key <- Sys.getenv("NYTIMES_API_KEY")
# Sys.getenv() returns "" for an unset variable; stop before sending a
# request that cannot be authenticated
if (!nzchar(api_key)) {
  stop("Environment variable NYTIMES_API_KEY is not set", call. = FALSE)
}
# Save the Top Sports Stories URL with the API Key included
url <- paste0(
  "https://api.nytimes.com/svc/topstories/v2/sports.json?api-key=",
  api_key
)
# Fetch data
response <- GET(url)
# Check response type
http_type(response)
# Abort on an HTTP error status instead of parsing an error payload as results
stop_for_status(response)
# Convert the JSON body to R objects; the encoding is given explicitly so
# content() does not have to guess it
data <- fromJSON(content(response, as = "text", encoding = "UTF-8"))
# Create sample dataframe of results
articles <- data$results %>%
  select(title, abstract, url, published_date)
# Another way to retrieve New York Times data is through the
# nytimes R package. The code below uses the function
# ny_archive() to retrieve articles published in July 2024.
# Pull the complete archive for July 2024
archive <- ny_archive(2024, 7)
# Uncomment to inspect the structure of the first five elements
# str(archive[1:5])
# Show how the keywords of the first article are laid out
archive[[1]]$keywords |> str()
# Base R check: does "ESPN" appear among the first article's keywords?
grepl("ESPN", archive[[1]]$keywords) |> any()
# Same check with stringr, applied to the flattened keywords
archive[[1]]$keywords |>
  unlist() |>
  str_detect("ESPN") |>
  any()
#' Check whether an article lists ESPN among its keywords
#'
#' @param article A single article (list); its `keywords` element, if any,
#'   is flattened before matching.
#' @return `TRUE` if any keyword field contains "ESPN" (case-insensitive),
#'   otherwise `FALSE`, including when the article has no keywords.
contains_espn_test <- function(article) {
  keywords <- unlist(article$keywords, use.names = FALSE)
  # No keywords at all: nothing to match
  if (is.null(keywords)) {
    return(FALSE)
  }
  # grepl() reports FALSE for NA entries, so the result is always TRUE/FALSE
  # (any(str_detect(...)) yields NA when an unmatched field is NA); this also
  # mirrors the case-insensitive grepl() matching used by contains_espn()
  any(grepl("espn", keywords, ignore.case = TRUE))
}
# Try the helper on the first archived article
archive[[1]] |> contains_espn_test()
#' Flag whether one archive article mentions ESPN
#'
#' Looks for "espn" (case-insensitive) in the keywords, the main headline and
#' the snippet of the article at position `article_index`.
#'
#' @param article_index Position of the article within `articles_list`.
#' @param articles_list List of articles; each article is a list whose
#'   `news_desk`, `headline$main`, `keywords` and `snippet` elements are read.
#' @return A data frame with columns `article_index`, `news_desk`,
#'   `keyword_espn`, `headline_espn`, `snippet_espn` and `any_espn`.
contains_espn <- function(article_index, articles_list) {
  article <- articles_list[[article_index]]

  # A missing news desk has length zero, which does not fit a one-row data
  # frame, so record it as NA
  news_desk <- article$news_desk
  if (length(news_desk) == 0L) {
    news_desk <- NA_character_
  }

  # TRUE when any piece of `field` contains "espn"; nested lists are flattened
  # so every value is matched as text, and absent fields give FALSE
  mentions_espn <- function(field) {
    any(grepl("espn", unlist(field, use.names = FALSE), ignore.case = TRUE))
  }

  keyword_espn <- mentions_espn(article[["keywords"]])
  headline_espn <- mentions_espn(article$headline$main)
  snippet_espn <- mentions_espn(article$snippet)

  data.frame(
    article_index = article_index,
    news_desk = news_desk,
    keyword_espn = keyword_espn,
    headline_espn = headline_espn,
    snippet_espn = snippet_espn,
    # TRUE when ESPN shows up in at least one of the three places
    any_espn = any(keyword_espn, headline_espn, snippet_espn)
  )
}
# Period arithmetic with plain `-` returns NA when the shifted date does not
# exist (e.g. 31 May minus 3 months), so the year and month steps use
# lubridate's rollback operator `%m-%`
# calculate start date as 1 year, 3 months, and 29 days before today
start_date <- today() %m-% years(1) %m-% months(3) - days(29)
# calculate end date as 3 months and 29 days before today
end_date <- today() %m-% months(3) - days(29)
# create a vector of dates one month apart; anchoring on the first of the
# month keeps seq() from spilling a late-month day into the following month
# (31 Jan plus one month would otherwise land in March and skip February)
range <- seq(
  floor_date(start_date, "month"),
  floor_date(end_date, "month"),
  by = "1 month"
)
#' Count one month's archive articles and how many of them mention ESPN
#'
#' @param year_month A date falling in the month to summarise.
#' @return A one-row data frame with `n` (articles in the archive response),
#'   `espn` (articles flagged by `contains_espn()`), `year` and `month`.
summarize_month <- function(year_month) {
  yr <- year(year_month)
  mo <- month(year_month)
  # Download that month's archive
  monthly_archive <- ny_archive(yr, mo)
  # One row of ESPN flags per article in the response
  flags <- map_df(
    seq_along(monthly_archive),
    function(i) contains_espn(i, monthly_archive)
  )
  # Total articles, articles with ESPN in keyword/headline/snippet, plus the
  # year and month the counts belong to
  flags |>
    summarise(n = n(), espn = sum(any_espn)) |>
    mutate(year = yr, month = mo)
}
# Build the monthly summary for every date in the range
summary_2024_espn <- map_df(range, \(month_start) {
  month_result <- summarize_month(month_start)
  # Pause between requests so the API is not overwhelmed
  Sys.sleep(10)
  month_result
})
# Created date column
plot_espn_data <- summary_2024_espn |>
  mutate(date = make_date(year, month, 1)) # Using 1st of each month for plotting
# Label the covered period from the data itself: the months come from a window
# computed relative to today(), so a fixed "Dec 2023 - Dec 2024" goes stale
plot_title <- paste0(
  "NYTimes Articles Mentioning ESPN (",
  format(min(plot_espn_data$date), "%b %Y"),
  " - ",
  format(max(plot_espn_data$date), "%b %Y"),
  ")"
)
# Create the plot
ggplot(plot_espn_data, aes(x = date, y = espn)) +
  geom_line(color = "#E41A1C", linewidth = 1) + # ESPN brand red color
  geom_point(color = "#E41A1C", size = 3) +
  scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") +
  # One break per whole article count, up to one above the monthly maximum
  scale_y_continuous(breaks = seq(0, max(plot_espn_data$espn) + 1)) +
  labs(
    title = plot_title,
    subtitle = "Count of articles containing 'ESPN' by month",
    x = "Month",
    y = "Number of Articles",
    caption = "Source: NYTimes Archive"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    axis.text.x = element_text(angle = 45, hjust = 1),
    panel.grid.major = element_line(color = "grey90"),
    panel.grid.minor = element_blank()
  )