For this project, I focused on importing data from the “Most Popular” API provided by the New York Times. The data set used in this assignment covers the most popular articles over the past 30 days, based on the number of times they were emailed, shared, or viewed.
In addition to a number of pre-processing steps, I analyzed the data to answer the questions addressed below.
#rm(list=ls())
knitr::opts_chunk$set(echo = TRUE)
library(httr)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 1.0.1
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.3.0 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(jsonlite)
##
## Attaching package: 'jsonlite'
##
## The following object is masked from 'package:purrr':
##
## flatten
library(kableExtra)
##
## Attaching package: 'kableExtra'
##
## The following object is masked from 'package:dplyr':
##
## group_rows
library(ggwordcloud)
library(dotenv)
# Load the NYT API key from a local .env file
load_dot_env('../creds.env')

# Base URL and endpoint paths for the Most Popular API (last 30 days)
url_base <- 'https://api.nytimes.com/svc/mostpopular/v2'
num_days <- 30
viewed_favs <- paste0('/viewed/', num_days, '.json')
emailed_favs <- paste0('/emailed/', num_days, '.json')
shared_favs <- paste0('/shared/', num_days, '.json')

# Most viewed articles
api_call <- paste0(url_base, viewed_favs, '?api-key=', Sys.getenv("NYT_API_KEY"))
res <- GET(api_call)
data <- fromJSON(rawToChar(res$content))
viewed_results <- data$results

# Most emailed articles
api_call <- paste0(url_base, emailed_favs, '?api-key=', Sys.getenv("NYT_API_KEY"))
res <- GET(api_call)
data <- fromJSON(rawToChar(res$content))
emailed_results <- data$results

# Most shared articles
api_call <- paste0(url_base, shared_favs, '?api-key=', Sys.getenv("NYT_API_KEY"))
res <- GET(api_call)
data <- fromJSON(rawToChar(res$content))
shared_results <- data$results
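The three requests above differ only in the endpoint, so the fetch logic could optionally be wrapped in a small helper. This is a minimal sketch of that refactor, reusing the url_base and API key handling from the code above (the helper name is my own):

# Optional refactor sketch: one helper that fetches any of the three "most popular" endpoints
get_most_popular <- function(fav_type, num_days = 30) {
  api_call <- paste0(url_base, '/', fav_type, '/', num_days, '.json',
                     '?api-key=', Sys.getenv("NYT_API_KEY"))
  res <- GET(api_call)
  fromJSON(rawToChar(res$content))$results
}

# Equivalent to the three blocks above
viewed_results  <- get_most_popular('viewed')
emailed_results <- get_most_popular('emailed')
shared_results  <- get_most_popular('shared')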
Create a single dataframe that combines the results from the 3 sub-APIs, and add a new column “fav_category” that identifies which of the APIs each record was collected from.
# Keep a subset of columns (selected by position) and tag each row with the API it came from
viewed_df <- viewed_results %>%
  select(c(5,6,8,9,11,13,15)) %>%
  mutate(fav_category = "Viewed")

emailed_df <- emailed_results %>%
  select(c(5,6,8,9,11,13,15)) %>%
  mutate(fav_category = "Emailed")

shared_df <- shared_results %>%
  select(c(5,6,8,9,11,13,15)) %>%
  mutate(fav_category = "Shared")

# Stack the three result sets into a single data frame
common_df <- rbind(viewed_df, shared_df, emailed_df)
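Selecting columns by position is fragile if the API ever changes its field order. Assuming the positions above correspond to the source, published_date, section, subsection, adx_keywords, byline, and title fields of the Most Popular response (worth verifying with names(viewed_results)), a name-based version of the same step would look like this sketch:

# Sketch: select the same fields by name rather than position (field names assumed)
viewed_df <- viewed_results %>%
  select(source, published_date, section, subsection, adx_keywords, byline, title) %>%
  mutate(fav_category = "Viewed")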
## Create data frame of writers with bylines
bylines <- common_df$byline
writers_df <- data.frame()
for (i in seq_along(bylines)) {
  # Drop the leading "By " and split multi-author bylines on commas and " and "
  # (splitting on " and " with spaces avoids breaking names that contain "and")
  clean_bylines <- str_replace(bylines[i], "^By ", "")
  writers <- str_split(clean_bylines, ",| and ")
  for (j in seq_along(writers[[1]])) {
    writer <- str_squish(writers[[1]][j])
    # Keep non-empty names; duplicates are removed with distinct() below
    if (writer != "") {
      writers_df <- rbind(writers_df, writer)
    }
  }
}
colnames(writers_df) <- 'writer'
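The same parsing can be expressed more compactly with tidyr. This sketch assumes the common_df built above and produces one row per distinct writer:

# Alternative sketch: split bylines into one row per writer with separate_rows()
writers_df <- common_df %>%
  transmute(writer = str_replace(byline, "^By ", "")) %>%
  separate_rows(writer, sep = ",| and ") %>%
  mutate(writer = str_squish(writer)) %>%
  filter(writer != "") %>%
  distinct(writer)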
## Determine the number of times each writer appears in a byline
writers_df <- writers_df %>%
  distinct(writer)

writers_df <- writers_df %>%
  mutate(num_bylines = NA)

for (i in 1:nrow(writers_df)) {
  selected_writer <- writers_df[i, 'writer']
  # Count the bylines that contain this writer's name (literal match, not regex)
  num_bylines_select <- common_df %>%
    filter(str_detect(byline, fixed(selected_writer))) %>%
    select(byline) %>%
    nrow()
  writers_df <- writers_df %>%
    mutate(num_bylines = ifelse(writer == selected_writer, num_bylines_select, num_bylines))
}
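For reference, the per-writer loop above can be collapsed into a single vectorized step. A sketch assuming the same writers_df and common_df:

# Alternative sketch: for each writer, count the bylines containing that name literally
writers_df <- writers_df %>%
  mutate(num_bylines = map_int(writer, ~ sum(str_detect(common_df$byline, fixed(.x)))))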
top10_writers <- writers_df %>%
  arrange(desc(num_bylines)) %>%
  filter(row_number() <= 10)

top10_writers %>%
  kable(
    row.names = T,
    col.names = c("Writer", "Count"),
    caption = "Top 10 Writers Based on Number of Times They Appear in Bylines"
  ) %>%
  kable_material(c("striped"))
|    | Writer                   | Count |
|----|--------------------------|-------|
| 1  | Michael Levenson         | 4     |
| 2  | Nicholas Bogel-Burroughs | 3     |
| 3  | Connie Chang             | 3     |
| 4  | Noam Chomsky             | 3     |
| 5  | Ian Roberts              | 3     |
| 6  | Jeffrey Watumull         | 3     |
| 7  | Eduardo Medina           | 3     |
| 8  | Maggie Haberman          | 2     |
| 9  | Jonah E. Bromwich        | 2     |
| 10 | Ben Protess              | 2     |
ggplot(top10_writers, aes(x=reorder(writer, num_bylines), y=num_bylines, fill=writer)) +
  geom_bar(stat='identity') +
  labs(
    y = "Num Bylines",
    x = "",
    title = "Top 10 Writers Based on Number of Contributions to Popular Articles"
  ) +
  coord_flip()
Answer: Michael Levenson was the most popular writer, with 4 articles appearing in the favorites list.
common_df %>%
  count(section) %>%
  arrange(desc(n)) %>%
  kable(
    col.names = c("Section", "Num Favorites"),
    row.names = T,
    caption = "Ranking of Sections Based on Number of Favorites"
  ) %>%
  kable_material(c("striped"))
|    | Section     | Num Favorites |
|----|-------------|---------------|
| 1  | U.S.        | 16            |
| 2  | Opinion     | 8             |
| 3  | Well        | 8             |
| 4  | Arts        | 4             |
| 5  | Business    | 4             |
| 6  | World       | 4             |
| 7  | Magazine    | 3             |
| 8  | New York    | 3             |
| 9  | Real Estate | 3             |
| 10 | Sports      | 3             |
| 11 | Movies      | 2             |
| 12 | Science     | 1             |
| 13 | Style       | 1             |
common_df %>%
  group_by(section, fav_category) %>%
  summarize(num_favorites = n()) %>%
  ungroup() %>%
  pivot_wider(names_from = fav_category, values_from = num_favorites) %>%
  mutate_at(c(2,3,4), ~replace_na(., 0)) %>%
  mutate(total_favs = Shared + Viewed + Emailed) %>%
  arrange(desc(total_favs)) %>%
  kable(
    row.names = T,
    col.names = c("Section", "Shared", "Viewed", "Emailed", "Total"),
    caption = "Number of Favorite Appearances per Favorite Category",
    align = c("l", rep("c", 4))
  ) %>%
  kable_material(c("striped"))
## `summarise()` has grouped output by 'section'. You can override using the
## `.groups` argument.
|    | Section     | Shared | Viewed | Emailed | Total |
|----|-------------|--------|--------|---------|-------|
| 1  | U.S.        | 7      | 7      | 2       | 16    |
| 2  | Opinion     | 1      | 1      | 6       | 8     |
| 3  | Well        | 3      | 1      | 4       | 8     |
| 4  | Arts        | 3      | 1      | 0       | 4     |
| 5  | Business    | 1      | 2      | 1       | 4     |
| 6  | World       | 2      | 2      | 0       | 4     |
| 7  | Magazine    | 0      | 0      | 3       | 3     |
| 8  | New York    | 1      | 2      | 0       | 3     |
| 9  | Real Estate | 1      | 0      | 2       | 3     |
| 10 | Sports      | 1      | 1      | 1       | 3     |
| 11 | Movies      | 0      | 2      | 0       | 2     |
| 12 | Science     | 0      | 1      | 0       | 1     |
| 13 | Style       | 0      | 0      | 1       | 1     |
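The `summarise()` message shown above is informational only. If preferred, the grouping can be dropped inside summarize() itself, which also suppresses the message; a minimal variant of the same aggregation step:

# Variant: drop the grouping in summarize() instead of calling ungroup() afterwards
common_df %>%
  group_by(section, fav_category) %>%
  summarize(num_favorites = n(), .groups = "drop")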
common_df_summary <- common_df %>%
  group_by(section, fav_category) %>%
  summarize(num_favorites = n()) %>%
  mutate(total_favs = sum(num_favorites))
## `summarise()` has grouped output by 'section'. You can override using the
## `.groups` argument.
ggplot(common_df_summary, aes(x=reorder(section, -total_favs), y=num_favorites, fill=fav_category)) +
  geom_bar(stat='identity') +
  labs(
    x = "",
    y = "Total Favorites",
    title = "Number of Favorites by Article Section and Favorite Type",
    fill = 'Favorite Category'
  ) +
  theme(
    axis.text.x = element_text(angle = 90)
  )
Answer: The U.S. section has by far the most contributions to popular articles, with 16 total appearances across the three favorite categories.
## Create data frame of keywords, one row per keyword per article
adx_keywords <- common_df$adx_keywords
keywords_df <- data.frame()
for (i in seq_along(adx_keywords)) {
  # Each article carries a single semicolon-delimited keyword string
  keywords <- strsplit(adx_keywords[[i]], ";")
  for (j in seq_along(keywords[[1]])) {
    keyword <- keywords[[1]][j]
    keywords_df <- rbind(keywords_df, keyword)
  }
}
colnames(keywords_df) <- 'keyword'
keywords_df <- as_tibble(keywords_df)
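As with the bylines, the keyword splitting can also be done with separate_rows(); a compact sketch assuming the same common_df:

# Alternative sketch: one row per keyword per article
keywords_df <- common_df %>%
  select(adx_keywords) %>%
  separate_rows(adx_keywords, sep = ";") %>%
  rename(keyword = adx_keywords)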
top15_keywords <- keywords_df %>%
  group_by(keyword) %>%
  summarize(n = n()) %>%
  arrange(desc(n)) %>%
  filter(row_number() <= 15)

top15_keywords %>%
  kable(
    row.names = T,
    col.names = c("Keyword", "Num Articles"),
    caption = "Top 15 Topics Based on Keyword Frequency"
  ) %>%
  kable_material(c("striped"))
|    | Keyword                                  | Num Articles |
|----|------------------------------------------|--------------|
| 1  | internal-sub-only                        | 19           |
| 2  | United States Politics and Government    | 8            |
| 3  | Content Type: Service                    | 7            |
| 4  | Movies                                   | 7            |
| 5  | Murders, Attempted Murders and Homicides | 6            |
| 6  | Content Type: Personal Profile           | 5            |
| 7  | Artificial Intelligence                  | 4            |
| 8  | ChatGPT                                  | 4            |
| 9  | Research                                 | 4            |
| 10 | Suits and Litigation (Civil)             | 4            |
| 11 | Trump, Donald J                          | 4            |
| 12 | Academy Awards (Oscars)                  | 3            |
| 13 | Actors and Actresses                     | 3            |
| 14 | Age, Chronological                       | 3            |
| 15 | Banking and Financial Institutions       | 3            |