NYT historical trend with Newsmap

# newsmap can be installed from GitHub if it is not available locally:
# devtools::install_github("koheiw/newsmap")
#
# Packages are attached with library() rather than require(): library()
# raises an error when a package is missing, whereas require() only returns
# FALSE and lets the script fail later at the first unresolved function call.
library(quanteda)
library(newsmap)
library(stringi)

Data

The corpus includes news summaries of the NYT since 1951. The news summaries were retrieved via the API with the following search query:

(body:"diplomacy" OR body:"military") AND (body:"threat*" OR body:"danger*" OR body:"fear*" OR body:"risk*")

The database searches the full text of the news articles, but we only have around 40 words for each. This length seems very short, but a lot of research has been done with similar data.

Number of hits

# Read the serialized corpus object from disk
corpus_path <- "Data/data_corpus_nytimes_summary.RDS"
corp <- readRDS(file = corpus_path)

# Print the number of documents (search hits) held in the corpus
ndoc(corp)

## [1] 173825

# Hits per year: tabulate the 'year' document variable of the corpus and
# plot the resulting counts (one value per distinct year)
plot(table(docvars(corp, 'year')))

Main focus of news

# Seed dictionary of geographical keywords, read from a YAML file.
# Its first three levels are used below (level 3 is the country).
dict <- dictionary(file = "english.yml")

# Custom stopwords: month names, weekday names and news agency names
month <- c("January", "February", "March", "April", "May", "June",
           "July", "August", "September", "October", "November", "December")
day <- c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday")
agency <- c("AP", "AFP", "Reuters")

# Tokenize the corpus once (it was previously tokenized twice, with the first
# result overwritten unused), then drop English stopwords and the custom
# stopwords by exact ("fixed") matching
toks <- tokens(corp)
toks <- tokens_remove(toks, stopwords("english"), valuetype = "fixed", padding = TRUE)
toks <- tokens_remove(toks, c(month, day, agency), valuetype = "fixed", padding = TRUE)

# Train model using seed dictionary
# Labels: dictionary keys matched in each document, using dictionary levels 1-3
label_toks <- tokens_lookup(toks, dict, levels = 1:3) # level 3 is country
label_dfm <- dfm(label_toks)

# Features: case-preserved tokens that start with an upper-case letter,
# i.e. include only proper nouns in the model
# NOTE(review): the character class `1-2` admits only the digits 1 and 2;
# confirm whether `0-9` was intended
feat_dfm <- dfm(toks, tolower = FALSE)
feat_dfm <- dfm_select(feat_dfm, "^[A-Z][A-Za-z1-2]+", valuetype = "regex", case_insensitive = FALSE)
# NOTE(review): `min_count` is kept as in the original script; newer quanteda
# releases may expect a different argument name - confirm against the
# installed version
feat_dfm <- dfm_trim(feat_dfm, min_count = 10)

# Fit the model and predict one class per document together with a
# confidence score
model <- textmodel_newsmap(feat_dfm, label_dfm)
loc <- predict(model, confidence.fit = TRUE)

# Split the predicted class label on "." into its three dictionary levels
# (one row per document, one column per level)
mat_loc <- t(stri_list2matrix(stri_split_fixed(loc$class, ".")))
data <- cbind(docvars(corp),
              region1 = mat_loc[, 1],
              region2 = paste0(mat_loc[, 1], ".", mat_loc[, 2]),
              country = mat_loc[, 3],
              confidence = loc$confidence.fit, stringsAsFactors = FALSE)

# Keep only predictions with confidence above 1; which() drops documents whose
# confidence is NA instead of turning them into all-NA rows
data <- data[which(data$confidence > 1), ]

# Fix the set of years so that the tables built from `data` have one row for
# every year from 1851 to 2017, including years without any document
data$year <- factor(data$year, levels = 1851:2017)

World regions

# Cross-tabulate documents by year (rows) and top-level region (columns)
tb <- as.matrix(table(data$year, data$region1))

# Draw one solid line per region; the default x axis is suppressed and
# replaced by one labelled with the years taken from the table's row names
matplot(tb, type = "l", lty = 1, xaxt = "n")
axis(side = 1, at = seq_len(nrow(tb)), labels = rownames(tb))
grid()
legend(
  "topleft",
  legend = colnames(tb),
  lty = 1,
  col = seq_len(ncol(tb))
)

Asia

# Restrict to documents whose top-level region is Asia
data_asia <- data[data$region1 == "asia", ]

# Cross-tabulate those documents by year (rows) and sub-region (columns)
tb_asia <- as.matrix(table(data_asia$year, data_asia$region2))

# One solid line per sub-region, with a hand-drawn x axis labelled by year
matplot(tb_asia, type = "l", lty = 1, xaxt = "n")
axis(side = 1, at = seq_len(nrow(tb_asia)), labels = rownames(tb_asia))
grid()
legend(
  "topleft",
  legend = colnames(tb_asia),
  lty = 1,
  col = seq_len(ncol(tb_asia))
)

China

# China: yearly number of documents classified as "cn" by the model, compared
# with the yearly frequency of the seed-dictionary key "asia.east.cn"

# Documents classified as China; which() skips rows whose country is NA
data_cn <- data[which(data$country == "cn"), ]
data_cn <- as.matrix(table(data_cn$year, data_cn$region2))
matplot(data_cn, type = "l", lty = 1, xaxt = "n", ylim = c(0, 200))
axis(1, seq_len(nrow(data_cn)), rownames(data_cn))

# Dictionary matches for China, summed within each year
data_cn2 <- as.matrix(dfm_group(label_dfm[, "asia.east.cn"], docvars(label_dfm, "year")))

# Align the dictionary counts with the x positions of data_cn by year instead
# of by row index: data_cn has one row per year of the fixed year range,
# while data_cn2 only has rows for the years present in label_dfm.
# Years without any document are drawn as 0; years outside the axis are dropped.
# assumes rownames(data_cn2) are the year values used for grouping - TODO confirm
cn2_by_year <- numeric(nrow(data_cn))
cn2_pos <- match(rownames(data_cn2), rownames(data_cn))
cn2_on_axis <- !is.na(cn2_pos)
cn2_by_year[cn2_pos[cn2_on_axis]] <- data_cn2[cn2_on_axis, 1]

# Added to the existing plot, so no axis limits are passed here: the plot
# region was already set up by the matplot() call above
lines(seq_len(nrow(data_cn)), cn2_by_year, col = "red")
grid()

# Legend colours mirror the lines actually drawn: the first palette colour
# (matplot's default for the first series) and the literal "red"
legend("topleft", legend = c("newsmap", "count"), lty = 1, col = c(palette()[1], "red"))