require(quanteda)
The corpus includes news summaries from the NYT since 1951. The news summaries were retrieved via the API with the following search query:
(body:"diplomacy" OR body:"military") AND (body:"threat*" OR body:"danger*" OR body:"fear*" OR body:"risk*")
The database searches the whole news articles, but we only have around 40 words for each. The length seems very short, but a lot of research has been done with similar data.
# Load the serialized corpus object from the local RDS file.
corp <- readRDS('Data/data_corpus_nytimes_summary.RDS')
# Report the number of documents in the corpus (recorded output shown below).
ndoc(corp)
## [1] 173825
# Plot the number of documents per value of the 'year' document variable.
plot(table(docvars(corp, 'year')))
# Read the lookup dictionary from its YAML file, then remove the US entry
# nested under AMERICA > NORTH so that it is not matched in later lookups.
dict <- dictionary(file = 'english.yml')
dict$AMERICA$NORTH$US <- NULL

# Tokenize every document in the corpus.
toks <- tokens(corp)
# Replace tokens with the top-level (level 1) keys of the dictionary.
toks_dict <- tokens_lookup(toks, dict, level = 1)

# Build the document-feature matrix and then aggregate it by the 'year'
# document variable. Grouping is done with dfm_group() because dfm() itself no
# longer groups documents in current quanteda releases, and the previous
# `group =` spelling only worked through partial matching of `groups`.
mt_dict <- dfm_group(dfm(toks_dict), groups = docvars(corp, 'year'))

# Draw one line per dictionary key across the grouped documents. The same
# colour vector is given to matplot() and legend(): matplot() otherwise cycles
# through colours 1:6, which would disagree with the legend for more than six
# series.
matplot(mt_dict, type = 'l', lty = 1, col = seq_len(ncol(mt_dict)), xaxt = 'n')
# Label the x axis with the group names (one tick per row of the matrix).
axis(1, seq_len(nrow(mt_dict)), docnames(mt_dict))
grid()
legend('topleft', legend = featnames(mt_dict), lty = 1, col = seq_len(ncol(mt_dict)))
# Restrict the dictionary to its ASIA branch and replace tokens with keys
# formed from dictionary levels 1 and 2.
toks_dict2 <- tokens_lookup(toks, dict['ASIA'], level = 1:2)

# Build the document-feature matrix and then aggregate it by the 'year'
# document variable. Grouping is done with dfm_group() because dfm() itself no
# longer groups documents in current quanteda releases, and the previous
# `group =` spelling only worked through partial matching of `groups`.
mt_dict2 <- dfm_group(dfm(toks_dict2), groups = docvars(corp, 'year'))

# Draw one line per dictionary key across the grouped documents. The same
# colour vector is given to matplot() and legend(): matplot() otherwise cycles
# through colours 1:6, which would disagree with the legend for more than six
# series.
matplot(mt_dict2, type = 'l', lty = 1, col = seq_len(ncol(mt_dict2)), xaxt = 'n')
# Label the x axis with the group names (one tick per row of the matrix).
axis(1, seq_len(nrow(mt_dict2)), docnames(mt_dict2))
grid()
legend('topleft', legend = featnames(mt_dict2), lty = 1, col = seq_len(ncol(mt_dict2)))