Research Question

For this assignment, we'll use the NYTimes Most Popular API to pull the most-emailed and the most-shared-on-Facebook articles over 7- and 30-day windows. We will then compare the two channels to answer our primary question: how do the articles readers email compare with the ones they engage with on Facebook?

Preparing Data

JSON Data Import

Here we'll import the data into a list:

# Assumes api_key was defined earlier with a valid NYT API key.
vec.days = c("7", "30")

lst.data = list()
for (i in vec.days) {
  # Most shared on Facebook over the last i days
  json = paste0("https://api.nytimes.com/svc/mostpopular/v2/shared/", i, "/facebook.json?api-key=", api_key)
  lst.data[[paste0("facebook.", i)]] = jsonlite::fromJSON(json)
  # Most emailed over the last i days
  json = paste0("https://api.nytimes.com/svc/mostpopular/v2/emailed/", i, ".json?api-key=", api_key)
  lst.data[[paste0("emailed.", i)]] = jsonlite::fromJSON(json)
}
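The Most Popular endpoints are rate-limited, so it helps to cache the parsed responses locally while iterating on the analysis. A minimal sketch, assuming the fetch loop above has run at least once (the cache file name is our own choice, not part of the assignment):

# Reuse a cached pull if one exists; otherwise persist the fresh one.
cache_file = "nyt_mostpopular_cache.rds"  # hypothetical file name
if (file.exists(cache_file)) {
  lst.data = readRDS(cache_file)
} else {
  saveRDS(lst.data, cache_file)
}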

R DataFrame

Only the results element of each JSON response is of interest here. We'll use another loop to convert each one to a data.table:

# as.data.table() needs data.table; the %>% pipe comes from magrittr/dplyr.
lst.data.tables = list()
for (i in seq_along(lst.data)) {
  lst.data.tables[[names(lst.data)[i]]] = lst.data[[i]]$results %>% as.data.table()
}
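As a quick sanity check, each of the four pulls should have parsed into a non-empty table (the Most Popular API typically returns around 20 articles per call, though that count is the API's choice, not ours):

# Rows/columns per endpoint-interval combination.
sapply(lst.data.tables, dim)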

And we can preview them here:

lst.data.tables$facebook.7[, 1:6] %>% head(2) %>% kable() %>%
  kableExtra::kable_styling(bootstrap_options = "striped", full_width = F, position = "left")

| uri | url | id | asset_id | source | published_date |
|---|---|---|---|---|---|
| nyt://article/6db68e52-ca3f-51be-83d6-672aa1b985aa | https://www.nytimes.com/2022/03/27/business/media/chris-wallace-cnn-fox-news.html | 1e+14 | 1e+14 | New York Times | 2022-03-27 |
| nyt://article/b3771ccd-aa28-56df-af1f-ffe2bdd16cdf | https://www.nytimes.com/2022/03/30/well/live/ba2-omicron-covid.html | 1e+14 | 1e+14 | New York Times | 2022-03-30 |
lst.data.tables$facebook.30[, 1:6] %>% head(2) %>% kable() %>%
  kableExtra::kable_styling(bootstrap_options = "striped", full_width = F, position = "left")

| uri | url | id | asset_id | source | published_date |
|---|---|---|---|---|---|
| nyt://article/be93acde-db33-560e-8704-0b319d98986a | https://www.nytimes.com/2022/03/23/nyregion/trump-investigation-felony-resignation-pomerantz.html | 1e+14 | 1e+14 | New York Times | 2022-03-23 |
| nyt://article/04ff0b39-a00d-5d18-9bdc-831f1d5a056d | https://www.nytimes.com/2022/03/11/us/west-point-cadets-fentanyl-overdose.html | 1e+14 | 1e+14 | New York Times | 2022-03-11 |
lst.data.tables$emailed.7[, 1:6] %>% head(2) %>% kable() %>%
  kableExtra::kable_styling(bootstrap_options = "striped", full_width = F, position = "left")

| uri | url | id | asset_id | source | published_date |
|---|---|---|---|---|---|
| nyt://article/4a9e2250-78c0-50f0-9735-19c0de1c3862 | https://www.nytimes.com/2022/03/29/opinion/ukraine-war-putin.html | 1e+14 | 1e+14 | New York Times | 2022-03-29 |
| nyt://article/593b8fa1-a0ae-5707-9eb5-a56de5a77fc0 | https://www.nytimes.com/2022/03/29/opinion/how-to-defeat-putin-and-save-the-planet.html | 1e+14 | 1e+14 | New York Times | 2022-03-29 |
lst.data.tables$emailed.30[, 1:6] %>% head(2) %>% kable() %>%
  kableExtra::kable_styling(bootstrap_options = "striped", full_width = F, position = "left")

| uri | url | id | asset_id | source | published_date |
|---|---|---|---|---|---|
| nyt://article/d8a34056-bf95-572f-a46f-f106600a198f | https://www.nytimes.com/2022/02/17/opinion/liberalism-democracy-russia-ukraine.html | 1e+14 | 1e+14 | New York Times | 2022-02-17 |
| nyt://article/38478cfb-a092-5902-8925-31cd44d8c7f4 | https://www.nytimes.com/2022/03/11/dining/induction-cooking.html | 1e+14 | 1e+14 | New York Times | 2022-03-11 |
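The id and asset_id columns render as 1e+14 above because jsonlite parses these large identifiers as doubles and R's default printing switches to scientific notation. If the full identifiers matter, a minimal fix is to format them as character before display (preview is our own throwaway name):

# Print the identifier columns in full; format() only changes the displayed text.
preview = copy(lst.data.tables$facebook.7)
preview[, `:=`(id = format(id, scientific = FALSE, trim = TRUE),
               asset_id = format(asset_id, scientific = FALSE, trim = TRUE))]
preview[, 1:6] %>% head(2) %>% kable()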

Data Analysis: Word Clouds

We can start with some basic word clouds. The four blocks below repeat the same tokenize/count/plot steps on different tables; a helper that factors out the repetition is sketched after the fourth cloud.

Shared on Facebook (7 Days)

# unnest_tokens() and stop_words come from tidytext; wordcloud() from the
# wordcloud package; brewer.pal() from RColorBrewer.
data(stop_words)

fb.words = lst.data.tables$facebook.7 %>% unnest_tokens(word, title) %>% select(word)
dt.fb.7 = fb.words[, .N, by = "word"][order(N, decreasing = TRUE)]
dt.fb.7 = dt.fb.7[!(word %in% stop_words$word), ]
wordcloud(words = dt.fb.7$word, freq = dt.fb.7$N, min.freq = 1, max.words = 200,
          random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))

Shared on Facebook (30 Days)

data(stop_words)

fb.words = lst.data.tables$facebook.30 %>% unnest_tokens(word, title) %>% select(word)
dt.fb.30 = fb.words[, .N, by = "word"][order(N, decreasing = TRUE)]
dt.fb.30 = dt.fb.30[!(word %in% stop_words$word), ]
wordcloud(words = dt.fb.30$word, freq = dt.fb.30$N, min.freq = 1, max.words = 200,
          random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))

Emailed (7 Days)

data(stop_words)

em.words = lst.data.tables$emailed.7 %>% unnest_tokens(word, title) %>% select(word)
dt.em.7 = em.words[, .N, by = "word"][order(N, decreasing = TRUE)]
dt.em.7 = dt.em.7[!(word %in% stop_words$word), ]
wordcloud(words = dt.em.7$word, freq = dt.em.7$N, min.freq = 1, max.words = 200,
          random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))

Emailed (30 Days)

data(stop_words)

em.words = lst.data.tables$emailed.30 %>% unnest_tokens(word, title) %>% select(word)
dt.em.30 = em.words[, .N, by = "word"][order(N, decreasing = TRUE)]
dt.em.30 = dt.em.30[!(word %in% stop_words$word), ]
wordcloud(words = dt.em.30$word, freq = dt.em.30$N, min.freq = 1, max.words = 200,
          random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))
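Since those four blocks differ only in the input table, the steps can be factored into a helper. A sketch, assuming the same packages as above (plot_wordcloud is our own name, not part of the assignment):

# Tokenize a table's titles, drop stop words, count, and draw a word cloud.
plot_wordcloud = function(dt) {
  words = dt %>% unnest_tokens(word, title) %>% select(word) %>% as.data.table()
  counts = words[, .N, by = "word"][order(N, decreasing = TRUE)]
  counts = counts[!(word %in% stop_words$word), ]
  wordcloud(words = counts$word, freq = counts$N, min.freq = 1, max.words = 200,
            random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))
}

# The same four clouds become one call each:
plot_wordcloud(lst.data.tables$facebook.7)
plot_wordcloud(lst.data.tables$facebook.30)
plot_wordcloud(lst.data.tables$emailed.7)
plot_wordcloud(lst.data.tables$emailed.30)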

Data Analysis: Alluvial Plots

Next, we'll directly compare article sections between the two mediums (Facebook/email) within each time period.

Facebook vs Email

# Outer-join the per-section counts so sections missing from one feed are
# kept as NA (zero-filled below) rather than dropped.
dt.30days = merge(lst.data.tables$emailed.30[, .N, by = .(section)],
                  lst.data.tables$facebook.30[, .N, by = .(section)],
                  by = "section", all = TRUE)

dt.7days = merge(lst.data.tables$emailed.7[, .N, by = .(section)],
                 lst.data.tables$facebook.7[, .N, by = .(section)],
                 by = "section", all = TRUE)

dt.alluvial = merge(dt.30days, dt.7days, by = "section", all = TRUE)

setnames(dt.alluvial, c("N.x.x", "N.y.x", "N.x.y", "N.y.y"),
         c("Email_30", "FB_30", "Email_7", "FB_7"))
mlt.alluvial = melt(dt.alluvial, id.vars = "section")
mlt.alluvial[is.na(value), value := 0]

ggplot(data = mlt.alluvial[grepl("30", variable)],
       aes(x = variable, stratum = section, alluvium = section,
           y = value, label = section)) +
  geom_alluvium(aes(fill = section)) +
  geom_stratum() + geom_text(stat = "stratum") +
  labs(x = "Medium/Days", y = "Frequency", title = "Sections (30 Days)") +
  scale_fill_manual(values = brewer.pal(8, "Dark2")) +
  theme_minimal()

ggplot(data = mlt.alluvial[grepl("7", variable)],
       aes(x = variable, stratum = section, alluvium = section,
           y = value, label = section)) +
  geom_alluvium(aes(fill = section)) +
  geom_stratum() + geom_text(stat = "stratum") +
  labs(x = "Medium/Days", y = "Frequency", title = "Sections (7 Days)") +
  scale_fill_manual(values = brewer.pal(8, "Dark2")) +
  theme_minimal()
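To read exact frequencies off the alluvia, the melted table can be cast back to one row per section; this only re-tabulates mlt.alluvial, with no new data:

# Section counts by medium/interval, largest 30-day sections first.
dcast(mlt.alluvial, section ~ variable, value.var = "value")[
  order(-(Email_30 + FB_30))] %>% kable()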

Conclusions

  1. On Facebook, the Academy Awards fiasco got more traction (last 7 days).
  2. Both COVID-19 and Russia drew engagement via email and Facebook alike.
  3. Putin headlines got more traction among emailed articles.
  4. Fewer opinion articles were shared on Facebook, but it carried more world-news headlines.