This assignment uses the New York Times Article Search API to examine how the volume and framing of soccer coverage in the New York Times have evolved since the modern Major League Soccer (MLS) expansion era began in 2005. The API provides rich article-level metadata such as headline, publication date, section name, news desk, word count, and multimedia flags going back to 1851, making it well suited for detecting decade-long shifts in editorial attention and story framing by querying the keyword “soccer”.
Below is a structured workflow approach to accomplish our goal and answer our question:
Use a securely stored environment variable to access the NYT Article Search API and prevent API key exposure.
Filter and collect the most recent articles containing the keyword “soccer” at request time to ensure a contemporary dataset. The result will contain several fields as nested lists, such as “headline”, “keywords”, “byline”, “multimedia”, etc.
Then I will parse each paginated JSON response into a data frame, extracting only the sub-fields I need and collapsing arrays to scalar values.
Finally, clean the data to calculate which newspaper sections are producing the most content on “SOCCER”.
library(httr2) # Modern HTTP client
## Warning: package 'httr2' was built under R version 4.5.2
library(jsonlite) # JSON parsing
library(tidyverse) # Data wrangling + ggplot2
## Warning: package 'tidyverse' was built under R version 4.5.2
## Warning: package 'tibble' was built under R version 4.5.2
## Warning: package 'tidyr' was built under R version 4.5.2
## Warning: package 'readr' was built under R version 4.5.2
## Warning: package 'purrr' was built under R version 4.5.2
## Warning: package 'dplyr' was built under R version 4.5.2
## Warning: package 'stringr' was built under R version 4.5.2
## Warning: package 'forcats' was built under R version 4.5.2
## Warning: package 'lubridate' was built under R version 4.5.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.0 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ purrr::flatten() masks jsonlite::flatten()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate) # Date handling
library(glue) # String interpolation
library(scales) # Plot axis formatting
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
In this section, I created an account on the New York Times developer portal and enabled the “Article Search API” and “Most Popular API” to generate an API key, which I stored securely in R through my console to prevent it from being exposed.
# Read the API key from the environment so it never appears in the script.
NYT_API_KEY <- Sys.getenv("NYT_API_KEY")

# Sys.getenv() yields "" for an unset variable; abort early in that case.
if (!nzchar(NYT_API_KEY)) {
  stop(
    "NYT API key not found.\n",
    " Option A (recommended): add NYT_API_KEY=your_key to ~/.Renviron, then restart R.\n",
    " Option B: assign directly: NYT_API_KEY <- 'your_key'"
  )
}

# Endpoint of the NYT Article Search API (v2).
BASE_URL <- "https://api.nytimes.com/svc/search/v2/articlesearch.json"
The NYT API returns 10 results per page and is rate limited: the NYT developer FAQ lists 5 requests per minute and 500 requests per day, and recommends sleeping 12 seconds between calls. The script iterates through pages, pausing between requests to remain polite, and checks for HTTP 429 rate-limit responses.
#' Fetch one page of NYT Article Search results
#'
#' Sends a single GET request to `BASE_URL`, restricted to articles published
#' between January 1 and December 31 of `year` and sorted oldest first.
#' Responses with status 429 (rate limited) or 503 are retried after a pause;
#' any remaining failure is reported as a warning instead of an error.
#'
#' @param query Search string
#' @param year 4-digit year (integer)
#' @param page 0-based page index
#' @param api_key NYT API key string
#' @return Raw parsed list from the JSON response, or NULL (with a warning)
#'   when the request fails or the final status is not 200
fetch_nyt_page <- function(query, year, page = 0, api_key) {
  begin_date <- glue("{year}0101")
  end_date <- glue("{year}1231")

  req <- request(BASE_URL) |>
    req_url_query(
      q = query,
      begin_date = begin_date,
      end_date = end_date,
      page = page,
      sort = "oldest",
      `api-key` = api_key
    ) |>
    # Wait and try again on rate-limit / temporary-unavailable responses
    # instead of giving up on the first 429. The waits (15 s, then 30 s) are
    # long enough for a per-minute quota to recover.
    req_retry(
      max_tries = 3,
      is_transient = \(r) resp_status(r) %in% c(429, 503),
      backoff = \(attempt) 15 * attempt
    ) |>
    req_error(is_error = \(r) FALSE) # handle HTTP errors manually

  # Transport failures (DNS, timeout, ...) raise an R error even with
  # req_error() disabled; convert them to the documented NULL return.
  resp <- tryCatch(
    req_perform(req),
    error = \(e) {
      warning(glue(
        "Request failed for year={year} page={page}: {conditionMessage(e)}"
      ))
      NULL
    }
  )
  if (is.null(resp)) {
    return(NULL)
  }

  if (resp_status(resp) != 200) {
    warning(glue("HTTP {resp_status(resp)} for year={year} page={page}"))
    return(NULL)
  }

  resp |> resp_body_json(simplifyVector = FALSE)
}
#' Collect articles across multiple pages for a single year
#'
#' Requests pages 0, 1, ... (at most `max_pages` of them) through
#' `fetch_nyt_page()` and concatenates their `docs`. Paging stops early when a
#' request fails, when a page comes back empty, or when the number of
#' collected documents reaches the hit count reported by the API.
#'
#' @param query Search string
#' @param year 4-digit year
#' @param max_pages Max pages to retrieve per year (10 articles each)
#' @param api_key NYT API key
#' @param pause Seconds to wait after each successful page. The default of 12
#'   follows the NYT developer FAQ guidance for the 5-requests-per-minute
#'   limit.
#' @return List of article metadata lists (empty list if nothing was fetched)
fetch_year <- function(query, year, max_pages = 5, api_key, pause = 12) {
  all_docs <- list()

  # seq_len() yields an empty sequence for max_pages = 0, whereas
  # seq(0, max_pages - 1) would count down to -1 and request invalid pages.
  for (pg in seq_len(max_pages) - 1) {
    message(glue(" Fetching year={year} page={pg} ..."))
    result <- fetch_nyt_page(query, year, pg, api_key)
    if (is.null(result)) {
      break
    }

    docs <- result$response$docs
    if (length(docs) == 0) {
      break
    }
    all_docs <- c(all_docs, docs)

    # A missing hit count means "unknown", not "zero": keep paging until
    # max_pages or an empty page rather than stopping after the first page.
    hits <- result$response$meta$hits %||% Inf

    # Stop early if we've retrieved all available articles
    if (length(all_docs) >= hits) {
      break
    }

    Sys.sleep(pause) # stay within the per-minute rate limit
  }

  all_docs
}
# ── Query parameters ──────────────────────────────────────────────────────────
# Search terms sent to the API.
SEARCH_QUERY <- "soccer MLS"

# First and last calendar year to query; the last one is the current year.
START_YEAR <- 2005
END_YEAR <- as.POSIXlt(Sys.Date())$year + 1900L

# Up to 50 articles per year; increase for deeper pull
MAX_PAGES <- 5

message(glue("\nFetching NYT soccer coverage: {START_YEAR}–{END_YEAR}\n"))
## Fetching NYT soccer coverage: 2005–2026
# Download the documents for every year and store them keyed by year.
raw_articles <- list()
for (yr in START_YEAR:END_YEAR) {
  yr_docs <- fetch_year(SEARCH_QUERY, yr, MAX_PAGES, NYT_API_KEY)
  raw_articles[[as.character(yr)]] <- yr_docs
  message(glue(" → {length(yr_docs)} articles retrieved for {yr}"))
  # fetch_year() returns without pausing when a request fails or a year is
  # exhausted, so this pause has to respect the rate limit on its own. The
  # NYT developer FAQ recommends 12 seconds between calls (5 requests/minute).
  Sys.sleep(12)
}

# A year with zero documents is more likely a failed request than a year
# without coverage; report it so the gap is not mistaken for real data.
empty_years <- names(raw_articles)[lengths(raw_articles) == 0]
if (length(empty_years) > 0) {
  warning(
    "No articles retrieved for: ", paste(empty_years, collapse = ", "),
    ". Check the warnings above for failed requests before analysing.",
    call. = FALSE
  )
}
## Fetching year=2005 page=0 ...
## Fetching year=2005 page=1 ...
## Fetching year=2005 page=2 ...
## Fetching year=2005 page=3 ...
## Fetching year=2005 page=4 ...
## → 50 articles retrieved for 2005
## Fetching year=2006 page=0 ...
## Fetching year=2006 page=1 ...
## Fetching year=2006 page=2 ...
## Fetching year=2006 page=3 ...
## Warning in fetch_nyt_page(query, year, pg, api_key): HTTP 429 for year=2006
## page=3
## → 30 articles retrieved for 2006
## Fetching year=2007 page=0 ...
## Warning in fetch_nyt_page(query, year, pg, api_key): HTTP 429 for year=2007
## page=0
## → 0 articles retrieved for 2007
## Fetching year=2008 page=0 ...
## Warning in fetch_nyt_page(query, year, pg, api_key): HTTP 429 for year=2008
## page=0
## → 0 articles retrieved for 2008
## Fetching year=2009 page=0 ...
## Warning in fetch_nyt_page(query, year, pg, api_key): HTTP 429 for year=2009
## page=0
## → 0 articles retrieved for 2009
## Fetching year=2010 page=0 ...
## Warning in fetch_nyt_page(query, year, pg, api_key): HTTP 429 for year=2010
## page=0
## → 0 articles retrieved for 2010
## Fetching year=2011 page=0 ...
## Warning in fetch_nyt_page(query, year, pg, api_key): HTTP 429 for year=2011
## page=0
## → 0 articles retrieved for 2011
## Fetching year=2012 page=0 ...
## Warning in fetch_nyt_page(query, year, pg, api_key): HTTP 429 for year=2012
## page=0
## → 0 articles retrieved for 2012
## Fetching year=2013 page=0 ...
## Warning in fetch_nyt_page(query, year, pg, api_key): HTTP 429 for year=2013
## page=0
## → 0 articles retrieved for 2013
## Fetching year=2014 page=0 ...
## Warning in fetch_nyt_page(query, year, pg, api_key): HTTP 429 for year=2014
## page=0
## → 0 articles retrieved for 2014
## Fetching year=2015 page=0 ...
## Warning in fetch_nyt_page(query, year, pg, api_key): HTTP 429 for year=2015
## page=0
## → 0 articles retrieved for 2015
## Fetching year=2016 page=0 ...
## Fetching year=2016 page=1 ...
## Fetching year=2016 page=2 ...
## Fetching year=2016 page=3 ...
## Fetching year=2016 page=4 ...
## → 50 articles retrieved for 2016
## Fetching year=2017 page=0 ...
## Warning in fetch_nyt_page(query, year, pg, api_key): HTTP 429 for year=2017
## page=0
## → 0 articles retrieved for 2017
## Fetching year=2018 page=0 ...
## Warning in fetch_nyt_page(query, year, pg, api_key): HTTP 429 for year=2018
## page=0
## → 0 articles retrieved for 2018
## Fetching year=2019 page=0 ...
## Warning in fetch_nyt_page(query, year, pg, api_key): HTTP 429 for year=2019
## page=0
## → 0 articles retrieved for 2019
## Fetching year=2020 page=0 ...
## Warning in fetch_nyt_page(query, year, pg, api_key): HTTP 429 for year=2020
## page=0
## → 0 articles retrieved for 2020
## Fetching year=2021 page=0 ...
## Warning in fetch_nyt_page(query, year, pg, api_key): HTTP 429 for year=2021
## page=0
## → 0 articles retrieved for 2021
## Fetching year=2022 page=0 ...
## Warning in fetch_nyt_page(query, year, pg, api_key): HTTP 429 for year=2022
## page=0
## → 0 articles retrieved for 2022
## Fetching year=2023 page=0 ...
## Warning in fetch_nyt_page(query, year, pg, api_key): HTTP 429 for year=2023
## page=0
## → 0 articles retrieved for 2023
## Fetching year=2024 page=0 ...
## Warning in fetch_nyt_page(query, year, pg, api_key): HTTP 429 for year=2024
## page=0
## → 0 articles retrieved for 2024
## Fetching year=2025 page=0 ...
## Warning in fetch_nyt_page(query, year, pg, api_key): HTTP 429 for year=2025
## page=0
## → 0 articles retrieved for 2025
## Fetching year=2026 page=0 ...
## Warning in fetch_nyt_page(query, year, pg, api_key): HTTP 429 for year=2026
## page=0
## → 0 articles retrieved for 2026
Since each article doc is a nested list, we will extract the fields we need.
# Look up a nested element by path. Yields NA_character_ when the element is
# absent or NULL, and also when the lookup itself raises an error.
safe_pluck <- function(x, ...) {
  tryCatch(
    purrr::pluck(x, ..., .default = NA_character_),
    error = function(e) NA_character_
  )
}
# Convert one article document (nested list from the API response) into a
# one-row tibble holding only the fields used in the analysis. Fields that
# are absent from the document become NA.
parse_article <- function(doc) {
  # Combine all keywords into a semicolon-separated string. vapply() pins the
  # result to one string per keyword, unlike sapply(), whose return type
  # depends on the input.
  kws <- doc[["keywords"]]
  keywords <- if (length(kws) > 0) {
    kw_values <- vapply(
      kws,
      \(k) as.character(k$value %||% ""),
      character(1)
    )
    paste(kw_values, collapse = "; ")
  } else {
    NA_character_
  }

  tibble(
    article_id = safe_pluck(doc, "_id"),
    pub_date = safe_pluck(doc, "pub_date"),
    headline = safe_pluck(doc, "headline", "main"),
    abstract = safe_pluck(doc, "abstract"),
    lead_paragraph = safe_pluck(doc, "lead_paragraph"),
    section = safe_pluck(doc, "section_name"),
    desk = safe_pluck(doc, "news_desk"),
    doc_type = safe_pluck(doc, "document_type"),
    # safe_pluck() already substitutes NA for a missing value, so no further
    # NULL fallback is needed before the integer conversion.
    word_count = as.integer(safe_pluck(doc, "word_count")),
    web_url = safe_pluck(doc, "web_url"),
    keywords = keywords
  )
}
# Step 1: Flatten year buckets into a single list of docs
all_docs <- purrr::list_flatten(raw_articles)

# With no documents the pipeline below would fail on a missing `pub_date`
# column; fail here with a message that names the real cause.
if (length(all_docs) == 0) {
  stop(
    "No articles were retrieved; cannot build the article table.",
    call. = FALSE
  )
}

articles_df <- all_docs |>
  purrr::map(parse_article) |>
  purrr::list_rbind() |>
  # Step 2: Transform raw date strings into structured time components.
  mutate(
    pub_date = ymd_hms(pub_date, quiet = TRUE),
    year = year(pub_date),
    month = month(pub_date),
    quarter = quarter(pub_date),
    year_month = floor_date(pub_date, "month")
  ) |>
  # Step 3: Framing. Each flag marks whether the combined headline, abstract
  # and lead paragraph mention one of the frame's terms.
  mutate(
    # Missing fields become "" first; paste() would otherwise insert the
    # literal text "NA" into the string that the patterns are matched against.
    text_combined = str_to_lower(paste(
      coalesce(headline, ""),
      coalesce(abstract, ""),
      coalesce(lead_paragraph, ""),
      sep = " "
    )),
    frame_growth = str_detect(
      text_combined,
      "expansion|grow|record|attendance|stadium|franchise|invest"
    ),
    frame_mls = str_detect(text_combined, "\\bmls\\b|major league soccer"),
    frame_usmnt = str_detect(
      text_combined,
      "u\\.s\\. national|usmnt|uswnt|world cup|pulisic|reyna"
    ),
    frame_global = str_detect(
      text_combined,
      "premier league|champions league|barcelona|real madrid|messi|ronaldo"
    ),
    frame_transfer = str_detect(
      text_combined,
      "transfer|signing|contract|deal|salary|designated player"
    ),
    frame_youth = str_detect(
      text_combined,
      "youth|academy|college|high school|young"
    ),
    frame_culture = str_detect(
      text_combined,
      "culture|fan|supporter|diverse|immigrant|latino|community"
    ),
    # Step 4: Conditional labeling (first match wins; "Other" if none)
    framing = case_when(
      frame_mls ~ "MLS / League",
      frame_usmnt ~ "US National Team",
      frame_global ~ "Global / Club",
      frame_transfer ~ "Transfers & Business",
      frame_growth ~ "Growth & Infrastructure",
      frame_youth ~ "Youth & Development",
      frame_culture ~ "Culture & Fandom",
      TRUE ~ "Other / General"
    )
  ) |>
  # Step 5: Deduplicate by article id, drop rows whose date failed to parse
  # or falls before the study window, and order chronologically.
  distinct(article_id, .keep_all = TRUE) |>
  filter(!is.na(pub_date), year >= START_YEAR) |>
  arrange(pub_date)

message(glue("\nFinal dataset: {nrow(articles_df)} unique articles ({START_YEAR}–{END_YEAR})"))
## Final dataset: 130 unique articles (2005–2026)
# Preview the first ten rows of the cleaned article table.
articles_df |> head(n = 10)
## # A tibble: 10 × 24
## article_id pub_date headline abstract lead_paragraph section desk
## <chr> <dttm> <chr> <chr> <chr> <chr> <chr>
## 1 nyt://art… 2005-01-04 05:00:00 MetroSt… MetroSt… <NA> Sports Spor…
## 2 nyt://art… 2005-01-11 05:00:00 Union S… US men'… <NA> Sports Spor…
## 3 nyt://art… 2005-01-18 05:00:00 Convey … CONVEY … <NA> Sports Spor…
## 4 nyt://art… 2005-01-25 05:00:00 Labor F… US Socc… <NA> Sports Spor…
## 5 nyt://art… 2005-01-31 05:00:00 SPORTS … MetroSt… <NA> Sports Spor…
## 6 nyt://art… 2005-02-01 05:00:00 Coach L… Porto g… <NA> Sports Spor…
## 7 nyt://art… 2005-02-08 05:00:00 Donovan… US men'… <NA> Sports Spor…
## 8 nyt://art… 2005-02-10 05:00:00 Johnson… US men'… <NA> Sports Spor…
## 9 nyt://art… 2005-02-22 05:00:00 Djorkae… MetroSt… <NA> Sports Spor…
## 10 nyt://art… 2005-03-01 05:00:00 An Anxi… Colorad… <NA> Sports Spor…
## # ℹ 17 more variables: doc_type <chr>, word_count <int>, web_url <chr>,
## # keywords <chr>, year <dbl>, month <dbl>, quarter <int>, year_month <dttm>,
## # text_combined <chr>, frame_growth <lgl>, frame_mls <lgl>,
## # frame_usmnt <lgl>, frame_global <lgl>, frame_transfer <lgl>,
## # frame_youth <lgl>, frame_culture <lgl>, framing <chr>
Here, we will analyze and visualize the NYT soccer coverage volume since the start of the MLS expansion era.
# Number of collected articles per publication year, in ascending year order.
# NOTE(review): each year is fetched with at most MAX_PAGES pages of 10
# results, so these counts are capped and describe the sample size rather
# than the total NYT coverage volume; confirm before reading them as a trend.
annual_volume <- articles_df |>
  summarise(n_articles = n(), .by = year) |>
  arrange(year)

annual_volume
## # A tibble: 3 × 2
## year n_articles
## <dbl> <int>
## 1 2005 50
## 2 2006 30
## 3 2016 50
# Bar chart of the yearly article counts.
p1 <- ggplot(annual_volume, aes(x = year, y = n_articles)) +
  geom_col(fill = "#003087", alpha = 0.85)

# A LOESS curve is a local quadratic fit; with only a few yearly points it
# cannot be estimated and produces a stream of warnings and no curve. Add the
# trend line only when there are enough years to support it.
if (nrow(annual_volume) >= 8) {
  p1 <- p1 +
    geom_smooth(
      method = "loess",
      formula = y ~ x, # stated explicitly to avoid the default-formula message
      se = TRUE,
      colour = "#E31837",
      linewidth = 1
    )
}

p1 <- p1 +
  scale_x_continuous(breaks = seq(START_YEAR, END_YEAR, 2)) +
  scale_y_continuous(labels = comma) +
  labs(
    title = "NYT Soccer Coverage Volume: MLS Expansion Era",
    subtitle = glue("Articles matching '{SEARCH_QUERY}', {START_YEAR}–{END_YEAR}"),
    x = NULL, y = "Number of Articles",
    caption = "Source: NYT Article Search API"
  ) +
  theme_minimal(base_size = 13) +
  theme(plot.title = element_text(face = "bold"))

print(p1)
## `geom_smooth()` using formula = 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : span too small. fewer data values than degrees of freedom.
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at 2004.9
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 1.055
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : There are other near singularities as well. 101.1
## Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x
## else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : span too small. fewer
## data values than degrees of freedom.
## Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x
## else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : pseudoinverse used at
## 2004.9
## Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x
## else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : neighborhood radius
## 1.055
## Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x
## else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : reciprocal condition
## number 0
## Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x
## else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : There are other near
## singularities as well. 101.1
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
# Share of each frame among a year's articles.
framing_year <- articles_df |>
  count(year, framing) |>
  # A frame that never occurs in some year gets an explicit zero row, so the
  # stacked area chart shows a 0% share there instead of bridging the gap.
  complete(year, framing, fill = list(n = 0L)) |>
  # `.by` groups for this step only, so the result is not left grouped.
  mutate(pct = n / sum(n), .by = year)

framing_year
## # A tibble: 21 × 4
## # Groups: year [3]
## year framing n pct
## <dbl> <chr> <int> <dbl>
## 1 2005 Culture & Fandom 1 0.02
## 2 2005 Global / Club 3 0.06
## 3 2005 Growth & Infrastructure 4 0.08
## 4 2005 MLS / League 12 0.24
## 5 2005 Other / General 19 0.38
## 6 2005 Transfers & Business 2 0.04
## 7 2005 US National Team 9 0.18
## 8 2006 Culture & Fandom 1 0.0333
## 9 2006 Global / Club 1 0.0333
## 10 2006 Growth & Infrastructure 2 0.0667
## # ℹ 11 more rows
# Stacked area chart: each frame's share of a year's articles over time.
p2 <- ggplot(framing_year, aes(year, pct, fill = framing)) +
  geom_area(colour = "white", linewidth = 0.3, alpha = 0.85) +
  scale_fill_brewer(palette = "Set2") +
  scale_y_continuous(labels = label_percent(accuracy = 1)) +
  scale_x_continuous(breaks = seq(START_YEAR, END_YEAR, by = 2)) +
  labs(
    title = "How NYT Soccer Coverage is Framed Over Time",
    subtitle = "Share of articles by dominant frame, per year",
    caption = "Source: NYT Article Search API",
    fill = "Frame",
    x = NULL,
    y = "Share of Articles"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    legend.position = "right",
    plot.title = element_text(face = "bold")
  )

print(p2)