R script for building Structured Topic Model using data from YLE and HS web news archives. Datasets are not publicly available. HTML version of this document including all graphics is available on RPubs. Source code is also available.

Energy subcorpus

energy_processed <- textProcessor(
  energy_full$content, 
  metadata = energy_full,
  stem=FALSE,
  customstopwords = stopwords$V1)

energy_out <- prepDocuments(
  energy_processed$documents, 
  energy_processed$vocab,
  energy_processed$meta,
  lower.thresh = 15)

# Let's use automatic selection of topics using parameter K=0
energy_model_fit <- stm(
  energy_out$documents, 
  energy_out$vocab, 
  K = 0,
  prevalence =~ media + s(year), 
  max.em.its = 100,
  data = energy_out$meta, 
  init.type = "Spectral")
energy_model_fit <- readRDS('energy_model_fit.RDs')
energy_out <- readRDS('energy_out.RDs')
labelTopics(energy_model_fit)
energy_prep <- estimateEffect(
  1:72 ~ media + s(year),
  energy_model_fit,
  meta = energy_out$meta,
  uncertainty = "Global")
energy_model_fit <- readRDS('energy_model_fit.RDs')
plot(
  energy_model_fit,
  type = "summary",
  xlim = c(0, .6),
  n = 10,
  labeltype = "frex",
  topics = c(67,25,61,36,30),
  text.cex = 0.6,
  main = "Selected energy-related topics")

Ukrainian problems -topic (topic no 25)

energy_prep <- readRDS('energy_prep.Rds')
plot(
  energy_prep, "year", 
  method = "continuous", 
  topics = 25, 
  printlegend = FALSE,
  xaxt = "n",
  xlab = "Years",
  main = "Ukrainian problems")
axis(1,at=seq(from=2000, to=2016,by=2))

Russia-Ukraine relations (topic no 61)

plot(
  energy_prep, "year", 
  method = "continuous", 
  topics = 61, 
  printlegend = FALSE,
  xaxt = "n",
  xlab = "Years",
  main = "Russia - Ukraine -relations")
axis(1,at=seq(from=2000, to=2016,by=2))

plot(energy_prep, covariate = "media", topics = c(67,25,61,36,30),
  model = energy_model_fit, method = "difference",
  cov.value1 = "HS", cov.value2 = "YLE",
  main = "HS vs YLE",
  xlim = c(-.1, .1),
  labeltype = "custom",
  custom.labels = c(
    "LNG and natural gas",
    "Ukrainan gas pipeline",
    "Russia-Ukraine -relations",
    "Oil price",
    "Price of gasoline"))

Migration subcorpus

migration_processed <- textProcessor(
  migration_full$content,
  metadata = migration_full,
  stem=FALSE,
  customstopwords = stopwords$V1)

migration_out <- prepDocuments(
  migration_processed$documents,
  migration_processed$vocab,
  migration_processed$meta,
  lower.thresh = 15)

# Let's use automatic selection of topics using parameter K=0
migration_model_fit <- stm(
  migration_out$documents,
  migration_out$vocab,
  K = 0,
  prevalence =~ media + s(year),
  max.em.its = 100,
  data = migration_out$meta,
  init.type = "Spectral")
migration_model_fit <- readRDS('migration_model_fit.RDs')
migration_out <- readRDS('migration_out.RDs')
labelTopics(migration_model_fit)
migration_prep <- estimateEffect(
  1:50 ~ media + s(year),
  migration_model_fit,
  meta = migration_out$meta,
  uncertainty = "Global")
migration_model_fit <- readRDS('migration_model_fit.RDs')
plot(
  migration_model_fit,
  type = "summary",
  xlim = c(0, .6),
  n = 10,
  labeltype = "frex",
  topics = c(3,5,13,28,80,1,8),
  text.cex = 0.6,
  main = "Selected immigration-related topics")