For this project, I focused on importing data from the “Most Popular” API provided by the New York Times. The data set used in this assignment covers the most popular articles over the past 30 days, based on the number of times they were emailed, shared, or viewed.
In addition to a number of pre-processing steps, I analyzed the data to answer the questions addressed below.
#rm(list=ls())
knitr::opts_chunk$set(echo = TRUE)
library(httr)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 1.0.1
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.3.0 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(jsonlite)
##
## Attaching package: 'jsonlite'
##
## The following object is masked from 'package:purrr':
##
## flatten
library(kableExtra)
##
## Attaching package: 'kableExtra'
##
## The following object is masked from 'package:dplyr':
##
## group_rows
library(ggwordcloud)
library(dotenv)
# Load the NYT API key from a local .env file
load_dot_env('../creds.env')

# Base URL and endpoint paths for the Most Popular API (last 30 days)
url_base <- 'https://api.nytimes.com/svc/mostpopular/v2'
num_days <- 30
viewed_favs <- paste0('/viewed/', num_days, '.json')
emailed_favs <- paste0('/emailed/', num_days, '.json')
shared_favs <- paste0('/shared/', num_days, '.json')

# Most viewed articles
api_call <- paste0(url_base, viewed_favs, '?api-key=', Sys.getenv("NYT_API_KEY"))
res <- GET(api_call)
data <- fromJSON(rawToChar(res$content))
viewed_results <- data$results

# Most emailed articles
api_call <- paste0(url_base, emailed_favs, '?api-key=', Sys.getenv("NYT_API_KEY"))
res <- GET(api_call)
data <- fromJSON(rawToChar(res$content))
emailed_results <- data$results

# Most shared articles
api_call <- paste0(url_base, shared_favs, '?api-key=', Sys.getenv("NYT_API_KEY"))
res <- GET(api_call)
data <- fromJSON(rawToChar(res$content))
shared_results <- data$results
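The three requests above differ only in the endpoint, so the fetch logic could optionally be wrapped in a small helper. This is a minimal sketch of that refactor, reusing the url_base and API key handling from the code above (the helper name is my own):

# Optional refactor sketch: one helper that fetches any of the three "most popular" endpoints
get_most_popular <- function(fav_type, num_days = 30) {
  api_call <- paste0(url_base, '/', fav_type, '/', num_days, '.json',
                     '?api-key=', Sys.getenv("NYT_API_KEY"))
  res <- GET(api_call)
  fromJSON(rawToChar(res$content))$results
}

# Equivalent to the three blocks above
viewed_results  <- get_most_popular('viewed')
emailed_results <- get_most_popular('emailed')
shared_results  <- get_most_popular('shared')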
Create a single dataframe that combines the results from the 3 sub-APIs, and add a new column “fav_category” that identifies which of the APIs each record was collected from.
# Keep a subset of columns (selected by position) and tag each row with the API it came from
viewed_df <- viewed_results %>%
  select(c(5,6,8,9,11,13,15)) %>%
  mutate(fav_category = "Viewed")

emailed_df <- emailed_results %>%
  select(c(5,6,8,9,11,13,15)) %>%
  mutate(fav_category = "Emailed")

shared_df <- shared_results %>%
  select(c(5,6,8,9,11,13,15)) %>%
  mutate(fav_category = "Shared")

# Stack the three result sets into a single data frame
common_df <- rbind(viewed_df, shared_df, emailed_df)
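Selecting columns by position is fragile if the API ever changes its field order. Assuming the positions above correspond to the source, published_date, section, subsection, adx_keywords, byline, and title fields of the Most Popular response (worth verifying with names(viewed_results)), a name-based version of the same step would look like this sketch:

# Sketch: select the same fields by name rather than position (field names assumed)
viewed_df <- viewed_results %>%
  select(source, published_date, section, subsection, adx_keywords, byline, title) %>%
  mutate(fav_category = "Viewed")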
## Create data frame of writers with bylines
bylines <- common_df$byline
writers_df <- data.frame()
for (i in seq_along(bylines)) {
  # Drop the leading "By " and split multi-author bylines on commas and " and "
  # (splitting on " and " with spaces avoids breaking names that contain "and")
  clean_bylines <- str_replace(bylines[i], "^By ", "")
  writers <- str_split(clean_bylines, ",| and ")
  for (j in seq_along(writers[[1]])) {
    writer <- str_squish(writers[[1]][j])
    # Keep non-empty names; duplicates are removed with distinct() below
    if (writer != "") {
      writers_df <- rbind(writers_df, writer)
    }
  }
}
colnames(writers_df) <- 'writer'
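The same parsing can be expressed more compactly with tidyr. This sketch assumes the common_df built above and produces one row per distinct writer:

# Alternative sketch: split bylines into one row per writer with separate_rows()
writers_df <- common_df %>%
  transmute(writer = str_replace(byline, "^By ", "")) %>%
  separate_rows(writer, sep = ",| and ") %>%
  mutate(writer = str_squish(writer)) %>%
  filter(writer != "") %>%
  distinct(writer)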
## Determine the number of times each writer appears in a byline
writers_df <- writers_df %>%
  distinct(writer)

writers_df <- writers_df %>%
  mutate(num_bylines = NA)

for (i in 1:nrow(writers_df)) {
  selected_writer <- writers_df[i, 'writer']
  # Count the bylines that contain this writer's name (literal match, not regex)
  num_bylines_select <- common_df %>%
    filter(str_detect(byline, fixed(selected_writer))) %>%
    select(byline) %>%
    nrow()
  writers_df <- writers_df %>%
    mutate(num_bylines = ifelse(writer == selected_writer, num_bylines_select, num_bylines))
}
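For reference, the per-writer loop above can be collapsed into a single vectorized step. A sketch assuming the same writers_df and common_df:

# Alternative sketch: for each writer, count the bylines containing that name literally
writers_df <- writers_df %>%
  mutate(num_bylines = map_int(writer, ~ sum(str_detect(common_df$byline, fixed(.x)))))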
top10_writers <- writers_df %>%
  arrange(desc(num_bylines)) %>%
  filter(row_number() <= 10)

top10_writers %>%
  kable(
    row.names = T,
    col.names = c("Writer", "Count"),
    caption = "Top 10 Writers Based on Number of Times They Appear in Bylines"
  ) %>%
  kable_material(c("striped"))
|    | Writer                   | Count |
|----|--------------------------|-------|
| 1  | Michael Levenson         | 4     |
| 2  | Nicholas Bogel-Burroughs | 3     |
| 3  | Connie Chang             | 3     |
| 4  | Noam Chomsky             | 3     |
| 5  | Ian Roberts              | 3     |
| 6  | Jeffrey Watumull         | 3     |
| 7  | Eduardo Medina           | 3     |
| 8  | Maggie Haberman          | 2     |
| 9  | Jonah E. Bromwich        | 2     |
| 10 | Ben Protess              | 2     |
ggplot(top10_writers, aes(x=reorder(writer, num_bylines), y=num_bylines, fill=writer)) +
  geom_bar(stat='identity') +
  labs(
    y = "Num Bylines",
    x = "",
    title = "Top 10 Writers Based on Number of Contributions to Popular Articles"
  ) +
  coord_flip()
Answer: Michael Levenson was the most popular writer, with 4 articles appearing in the favorites list.
common_df %>%
  count(section) %>%
  arrange(desc(n)) %>%
  kable(
    col.names = c("Section", "Num Favorites"),
    row.names = T,
    caption = "Ranking of Sections Based on Number of Favorites"
  ) %>%
  kable_material(c("striped"))
|    | Section     | Num Favorites |
|----|-------------|---------------|
| 1  | U.S.        | 16            |
| 2  | Opinion     | 8             |
| 3  | Well        | 8             |
| 4  | Arts        | 4             |
| 5  | Business    | 4             |
| 6  | World       | 4             |
| 7  | Magazine    | 3             |
| 8  | New York    | 3             |
| 9  | Real Estate | 3             |
| 10 | Sports      | 3             |
| 11 | Movies      | 2             |
| 12 | Science     | 1             |
| 13 | Style       | 1             |
common_df %>%
  group_by(section, fav_category) %>%
  summarize(num_favorites = n()) %>%
  ungroup() %>%
  pivot_wider(names_from = fav_category, values_from = num_favorites) %>%
  mutate_at(c(2,3,4), ~replace_na(., 0)) %>%
  mutate(total_favs = Shared + Viewed + Emailed) %>%
  arrange(desc(total_favs)) %>%
  kable(
    row.names = T,
    col.names = c("Section", "Shared", "Viewed", "Emailed", "Total"),
    caption = "Number of Favorite Appearances per Favorite Category",
    align = c("l", rep("c", 4))
  ) %>%
  kable_material(c("striped"))
## `summarise()` has grouped output by 'section'. You can override using the
## `.groups` argument.
|    | Section     | Shared | Viewed | Emailed | Total |
|----|-------------|--------|--------|---------|-------|
| 1  | U.S.        | 7      | 7      | 2       | 16    |
| 2  | Opinion     | 1      | 1      | 6       | 8     |
| 3  | Well        | 3      | 1      | 4       | 8     |
| 4  | Arts        | 3      | 1      | 0       | 4     |
| 5  | Business    | 1      | 2      | 1       | 4     |
| 6  | World       | 2      | 2      | 0       | 4     |
| 7  | Magazine    | 0      | 0      | 3       | 3     |
| 8  | New York    | 1      | 2      | 0       | 3     |
| 9  | Real Estate | 1      | 0      | 2       | 3     |
| 10 | Sports      | 1      | 1      | 1       | 3     |
| 11 | Movies      | 0      | 2      | 0       | 2     |
| 12 | Science     | 0      | 1      | 0       | 1     |
| 13 | Style       | 0      | 0      | 1       | 1     |
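The `summarise()` message shown above is informational only. If preferred, the grouping can be dropped inside summarize() itself, which also suppresses the message; a minimal variant of the same aggregation step:

# Variant: drop the grouping in summarize() instead of calling ungroup() afterwards
common_df %>%
  group_by(section, fav_category) %>%
  summarize(num_favorites = n(), .groups = "drop")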
common_df_summary <- common_df %>%
  group_by(section, fav_category) %>%
  summarize(num_favorites = n()) %>%
  mutate(total_favs = sum(num_favorites))
## `summarise()` has grouped output by 'section'. You can override using the
## `.groups` argument.
ggplot(common_df_summary, aes(x=reorder(section, -total_favs), y=num_favorites, fill=fav_category)) +
  geom_bar(stat='identity') +
  labs(
    x = "",
    y = "Total Favorites",
    title = "Number of Favorites by Article Section and Favorite Type",
    fill = 'Favorite Category'
  ) +
  theme(
    axis.text.x = element_text(angle = 90)
  )
Answer: The U.S. section has by far the most contributions to popular articles, with 16 total appearances across the three favorite categories.
## Create data frame of keywords, one row per keyword per article
adx_keywords <- common_df$adx_keywords
keywords_df <- data.frame()
for (i in seq_along(adx_keywords)) {
  # Each article carries a single semicolon-delimited keyword string
  keywords <- strsplit(adx_keywords[[i]], ";")
  for (j in seq_along(keywords[[1]])) {
    keyword <- keywords[[1]][j]
    keywords_df <- rbind(keywords_df, keyword)
  }
}
colnames(keywords_df) <- 'keyword'
keywords_df <- as_tibble(keywords_df)
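As with the bylines, the keyword splitting can also be done with separate_rows(); a compact sketch assuming the same common_df:

# Alternative sketch: one row per keyword per article
keywords_df <- common_df %>%
  select(adx_keywords) %>%
  separate_rows(adx_keywords, sep = ";") %>%
  rename(keyword = adx_keywords)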
top15_keywords <- keywords_df %>%
  group_by(keyword) %>%
  summarize(n = n()) %>%
  arrange(desc(n)) %>%
  filter(row_number() <= 15)

top15_keywords %>%
  kable(
    row.names = T,
    col.names = c("Keyword", "Num Articles"),
    caption = "Top 15 Topics Based on Keyword Frequency"
  ) %>%
  kable_material(c("striped"))
|    | Keyword                                  | Num Articles |
|----|------------------------------------------|--------------|
| 1  | internal-sub-only                        | 19           |
| 2  | United States Politics and Government    | 8            |
| 3  | Content Type: Service                    | 7            |
| 4  | Movies                                   | 7            |
| 5  | Murders, Attempted Murders and Homicides | 6            |
| 6  | Content Type: Personal Profile           | 5            |
| 7  | Artificial Intelligence                  | 4            |
| 8  | ChatGPT                                  | 4            |
| 9  | Research                                 | 4            |
| 10 | Suits and Litigation (Civil)             | 4            |
| 11 | Trump, Donald J                          | 4            |
| 12 | Academy Awards (Oscars)                  | 3            |
| 13 | Actors and Actresses                     | 3            |
| 14 | Age, Chronological                       | 3            |
| 15 | Banking and Financial Institutions       | 3            |