Introduction

The goal here is to query as much of the contentapi as possible to get the text of articles, and then count how often each character is used.

The first step is to get all of the article URLs from the past two years. This document is published on the internet, so the SQL authentication info is redacted.

library(RPostgreSQL)

# Open a PostgreSQL connection. All connection settings are redacted
# placeholders and must be replaced with real values before running.
drv <- dbDriver("PostgreSQL")
con <- dbConnect(drv, 
                 dbname = "redacted",
                 host = "redacted",
                 port = 'redacted',
                 user = "redacted",
                 password = "redacted")

# Fetch the URL and publication date of every article published after
# 2015-10-01.
articles <- dbGetQuery(
  con,
  "select url, pub_date from slateplus.articles_article where pub_date > '2015-10-01';"
)

# The query result is now held in memory and nothing below uses the
# connection, so release it instead of leaving it open for the whole session.
dbDisconnect(con)

head(articles)

Next we loop through each article and download a JSON file. Then we extract the text from the JSON file within another loop. The last step creates a dataframe with all of the article text and publication dates.

library(RCurl)
library(rjson)
library(dplyr)

# Collapse the text of one parsed article body into a single string.
#
# body:      list of body elements as parsed from the article JSON; each
#            element is expected to be a list that may carry a `text` field.
# min_chars: a text fragment is kept only if it has more than this many
#            characters.
#
# Returns one string with the kept fragments joined by single spaces ("" when
# nothing qualifies). Elements that are skipped are dropped entirely rather
# than leaving NA gaps, which paste() would otherwise turn into the literal
# word "NA" inside the article text.
extract_body_text <- function(body, min_chars = 5L) {
  fragments <- lapply(body, function(element) {
    if (is.list(element)) element[["text"]] else NULL
  })
  keep <- vapply(
    fragments,
    function(fragment) {
      is.character(fragment) && length(fragment) == 1L &&
        !is.na(fragment) && nchar(fragment) > min_chars
    },
    logical(1)
  )
  paste(unlist(fragments[keep]), collapse = " ")
}

# Preallocate the results; seq_len() also makes the loop a no-op when the
# query returned no articles.
n_articles <- length(articles$url)
article_text <- character(n_articles)
article_jsons <- vector("list", n_articles)

for (u in seq_len(n_articles)) {
  # NOTE(review): the article URL is appended to the query string unencoded,
  # exactly as before -- confirm the API does not need it escaped.
  raw_json <- getURL(
    url = paste0("https://my.slate.com/api/articles/url?url=", articles$url[u])
  )
  # Keep the raw response so the text can be re-extracted without downloading
  # everything again.
  article_jsons[[u]] <- raw_json
  art <- fromJSON(raw_json)
  article_text[u] <- extract_body_text(art$body)
}

# One row per article: publication date plus the collapsed article text.
articles_text <- data.frame(
  date = articles$pub_date,
  text = article_text,
  stringsAsFactors = FALSE
)

Split out each word onto its own line of a dataframe (that’s what unnest_tokens() does!). This is where the computationally expensive stuff starts!

library(tidytext)

# Make sure the text column is character, then tokenize it so that every word
# of every article ends up on its own row in a `word` column.
article_words <- unnest_tokens(
  mutate(articles_text, text = as.character(text)),
  output = word,
  input = text
)

Repeat this process but split out characters within words. I did it this way so we could identify weird characters and map them to the actual words they’re in. This last step is the one that did not work.

# Count how often each character appears on each publication date.
#
# Identical (date, word) rows are collapsed first, so every distinct word is
# split into characters only once and its characters are weighted by the
# number of times the word occurred. The resulting counts are the same as
# splitting every single word occurrence, but far fewer rows pass through the
# expensive character tokenizer.
# NOTE(review): the 'characters' tokenizer appears to lowercase its input and
# drop non-alphanumeric characters by default -- confirm that matches the goal
# of spotting unusual characters.
chars <- article_words %>%
  group_by(date, word) %>%
  summarise(word_n = n()) %>%
  ungroup() %>%
  unnest_tokens(char, word, token = "characters", drop = FALSE) %>%
  # filter(!char %in% c(letters, as.character(seq(0,9))))%>%
  group_by(char, date) %>%
  summarise(n = sum(word_n)) %>%
  # summarise() already yields one row per (char, date), so no distinct() is
  # needed; drop the leftover grouping so later verbs see a plain table.
  ungroup()
LS0tCnRpdGxlOiAiQ2hhcmFjdGVyIENvdW50IEV4cGxvcmF0aW9uIgphdXRob3I6ICdKb3NoIFlhem1hbicKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQojIEludHJvZHVjdGlvbgpUaGUgZ29hbCBoZXJlIGlzIHRvIHF1ZXJ5IGFzIG11Y2ggb2YgdGhlIGNvbnRlbnRhcGkgYXMgcG9zc2libGUgdG8gZ2V0IHRoZSB0ZXh0IG9mIGFydGljbGVzLCB0aGVuIGdldCBhIGNvdW50IG9mIHRoZSBudW1iZXIgb2YgZWFjaCB0eXBlIG9mIGNoYXJhY3RlciB1c2VkLgoKRmlyc3Qgc3RlcCBoZXJlIGlzIHRvIGdldCBhbGwgb2YgdGhlIGFydGljbGUgVVJMcyBmcm9tIHRoZSBwYXN0IHR3byB5ZWFycy4gVGhpcyBkb2N1bWVudCBpcyBvbiB0aGUgaW50ZXJuZXQgc28gdGhlIFNRTCBhdXRoZW50aWNhdGlvbiBpbmZvIGlzIHJlZGFjdGVkLgpgYGAKbGlicmFyeShSUG9zdGdyZVNRTCkKCmRydiA8LSBkYkRyaXZlcigiUG9zdGdyZVNRTCIpCmNvbiA8LSBkYkNvbm5lY3QoZHJ2LCAKICAgICAgICAgICAgICAgICBkYm5hbWUgPSAicmVkYWN0ZWQiLAogICAgICAgICAgICAgICAgIGhvc3QgPSAicmVkYWN0ZWQiLAogICAgICAgICAgICAgICAgIHBvcnQgPSAncmVkYWN0ZWQnLAogICAgICAgICAgICAgICAgIHVzZXIgPSAicmVkYWN0ZWQiLAogICAgICAgICAgICAgICAgIHBhc3N3b3JkID0gInJlZGFjdGVkIikKCmFydGljbGVzIDwtIGRiR2V0UXVlcnkoY29uLCAnc2VsZWN0IHVybCwgcHViX2RhdGUgZnJvbSBzbGF0ZXBsdXMuYXJ0aWNsZXNfYXJ0aWNsZSB3aGVyZSBwdWJfZGF0ZSA+IFwnMjAxNS0xMC0wMVwnOycpCmhlYWQoYXJ0aWNsZXMpCmBgYAoKTmV4dCB3ZSBsb29wIHRocm91Z2ggZWFjaCBhcnRpY2xlIGFuZCBkb3dubG9hZCBhIEpTT04gZmlsZS4gVGhlbiB3ZSBleHRyYWN0IHRoZSB0ZXh0IGZyb20gdGhlIEpTT04gZmlsZSB3aXRoaW4gYW5vdGhlciBsb29wLiBUaGUgbGFzdCBzdGVwIGNyZWF0ZXMgYSBkYXRhZnJhbWUgd2l0aCBhbGwgb2YgdGhlIGFydGljbGUgdGV4dCBhbmQgcHVibGljYXRpb24gZGF0ZXMuIAoKYGBge3IsIGVjaG8gPSBUfQpsaWJyYXJ5KFJDdXJsKQpsaWJyYXJ5KHJqc29uKQpsaWJyYXJ5KGRwbHlyKQphcnRpY2xlX3RleHQgPC0gYygpCmFydGljbGVfanNvbnMgPC0gbGlzdCgpCmZvcih1IGluIHNlcSgxLGxlbmd0aChhcnRpY2xlcyR1cmwpKSl7CiAgYXJ0IDwtIGdldFVSTCh1cmwgPSBwYXN0ZTAoJ2h0dHBzOi8vbXkuc2xhdGUuY29tL2FwaS9hcnRpY2xlcy91cmw/dXJsPScsIGFydGljbGVzJHVybFt1XSkpIAogIGFydGljbGVfanNvbnNbW3VdXSA8LSBhcnQKICBhcnQgPC0gYXJ0JT4lZnJvbUpTT04oKQogIAogIHRleHRfbGlzdCA8LSBjKCkKICBmb3IodCBpbiBzZXEoMTpsZW5ndGgoYXJ0JGJvZHkpKSl7CiAgICBpZmVsc2UobmNoYXIoYXJ0JGJvZHlbW3RdXSR0ZXh0KSA+IDUsIAogICAgICAgICAgIHRleHRfbGlzdFt0XSA8LSBhcnQkYm9keVtbdF1dJHRleHQsCiAgICAgICAgICAgbmV4dCkKICB9CiAgYXJ0aWNs
ZV90ZXh0W3VdIDwtIHBhc3RlKHRleHRfbGlzdCwgY29sbGFwc2UgPSAnICcpCn0KCmFydGljbGVzX3RleHQgPC0gZGF0YS5mcmFtZSgKICBkYXRlID0gYXJ0aWNsZXMkcHViX2RhdGUsCiAgdGV4dCA9IGFydGljbGVfdGV4dAopCmBgYAoKU3BsaXQgb3V0IGVhY2ggd29yZCBvbnRvIGl0cyBvd24gbGluZSBvZiBhIGRhdGFmcmFtZSAodGhhdCdzIHdoYXQgYHVubmVzdF90b2tlbnMoKWAgZG9lcyEpLiBUaGlzIGlzIHdoZXJlIHRoZSBjb21wdXRhdGlvbmFsbHkgZXhwZW5zaXZlIHN0dWZmIHN0YXJ0cyEKYGBge3IsIGVjaG8gPSBUfQpsaWJyYXJ5KHRpZHl0ZXh0KQphcnRpY2xlX3dvcmRzIDwtIGFydGljbGVzX3RleHQgJT4lCiAgbXV0YXRlKHRleHQgPSBhcy5jaGFyYWN0ZXIodGV4dCkpJT4lCiAgdW5uZXN0X3Rva2Vucyh3b3JkLCB0ZXh0KQpgYGAKClJlcGVhdCB0aGlzIHByb2Nlc3MgYnV0IHNwbGl0IG91dCBjaGFyYWN0ZXJzIHdpdGhpbiB3b3Jkcy4gSSBkaWQgaXQgdGhpcyB3YXkgc28gd2UgY291bGQgaWRlbnRpZnkgd2VpcmQgY2hhcmFjdGVycyBhbmQgbWFwIHRoZW0gdG8gdGhlIGFjdHVhbCB3b3JkcyB0aGV5J3JlIGluLiBUaGlzIGlzIHRoZSBsYXN0IHN0ZXAgd2hpY2ggZGlkIG5vdCB3b3JrLgpgYGB7ciwgZWNobyA9IFR9CmNoYXJzIDwtIGFydGljbGVfd29yZHMlPiUKICBtdXRhdGUod29yZCA9IGFzLmNoYXJhY3Rlcih3b3JkKSklPiUKICB1bm5lc3RfdG9rZW5zKGNoYXIsIHdvcmQsIHRva2VuID0gJ2NoYXJhY3RlcnMnLCBkcm9wID0gRkFMU0UpJT4lCiAgIyBmaWx0ZXIoIWNoYXIgJWluJSBjKGxldHRlcnMsIGFzLmNoYXJhY3RlcihzZXEoMCw5KSkpKSU+JQogIGdyb3VwX2J5KGNoYXIsIGRhdGUpJT4lCiAgc3VtbWFyaXNlKG4gPSBuKCkpJT4lCiAgZGlzdGluY3QoKQpgYGA=