Topic Modeling

These are Ben Schmidt's R diagnostics for long-term historical topic modeling. They work off a MALLET state file.
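
If you want to check the state file's layout before committing to the read.table call below, peeking at the first few lines is enough. This is just a sketch: it assumes the file has already been gunzipped to ../topic-state (MALLET writes it gzipped, typically as topic-state.gz), with the usual six columns (doc, source, pos, typeindex, type, topic) after a few # header lines.

# Peek at the header lines and the first couple of token rows.
readLines("../topic-state", n = 5)
# If you kept MALLET's gzipped output instead, readLines can open it through a connection:
# readLines(gzfile("../topic-state.gz"), n = 5)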

library(knitr)
opts_chunk$set(warning = FALSE, error = FALSE, message = FALSE, results = "hide", 
    cache = TRUE)

# Read the MALLET state file: keep only the doc id, the word, and the topic
# assignment; the source, position, and type-index columns are dropped.
fileLocation = "../topic-state"
topics = read.table(file = fileLocation, colClasses = c("integer", "NULL", "NULL", 
    "NULL", "character", "integer"), comment.char = "#", col.names = c("doc", 
    "NA", "NA2", "NA3", "word", "topic"))

This is code I borrowed from Andrew Goldstone to match the JSTOR metadata against the topics. If you're not using JSTOR, you would just need to:
1) create a 'year' column in the topics frame;
2) (optionally) create a files frame with other metadata, indexed so that files$numid maps to topics$doc. (A minimal sketch of both steps is just below.)
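
Here's what that could look like for a generic, non-JSTOR corpus. Everything in it is hypothetical: myMetadata.csv and its filename and date columns are stand-ins, and it assumes the rows are listed in the order the documents were fed to MALLET.

# Hypothetical metadata file: one row per document, in MALLET import order.
files = read.csv("myMetadata.csv", stringsAsFactors = FALSE)
# MALLET numbers documents from zero in import order.
files$numid = 0:(nrow(files) - 1)
# Derive a 'year' column, here from a hypothetical 'date' field like "1887-03-01".
files$year = as.numeric(substr(files$date, 1, 4))
# Attach a year to every token in the state file.
topics$year = files$year[match(topics$doc, files$numid)]
topics = topics[!is.na(topics$year), ]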

# The order in which the files were read in: MALLET numbers documents from zero.
ids = read.table("../readOrder.txt", col.names = "filename")
ids$numid = 0:(nrow(ids) - 1)
# Recover the JSTOR id from the filename by stripping the 'wordcounts_' prefix
# and the extension, and turning underscores back into slashes.
ids$id = gsub("_", "/", gsub("\\.[[:alpha:]]*$", "", gsub("^.*wordcounts_", 
    "", ids$filename)))

# Goldstone's script.
source("metadata.R")

files = read.citations("../citations_all.csv")
files$year = as.numeric(substr(files$pubdate, 1, 4))
files$numid = ids$numid[match(files$id, ids$id)]
# match is faster than merge, but will only work once.
topics$year = files$year[match(topics$doc, files$numid)]
topics = topics[!is.na(topics$year), ]  #eliminates duplicates, among other things.

And finally, topics conventionally have names, not numbers. Here we derive names from the top seven words in each topic.

require(plyr)
topicNames = dlply(topics, .(topic), function(locframe) {
    paste(names(sort(-table(locframe$word)))[1:7], collapse = " ")
}, .progress = "text")
if (sum(duplicated(topicNames)) > 0) {
    warning("Two topics have the same top 7 words. Yikes! Time to rethink the labeling scheme, just leaving it as integers for now.")
} else {
    topics$topic = factor(topics$topic, levels = as.numeric(names(topicNames)), 
        labels = unlist(topicNames))
}
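
A quick sanity check on the result, assuming no duplicate labels turned up so the relabeling actually ran:

# How many topics are there, and what do the derived labels look like?
length(levels(topics$topic))
head(levels(topics$topic), 3)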

OK, now to look for weirdness.


topicBeforeAndAfterSplit = function(thisTopic = topics[topics$topic == sample(levels(topics$topic), 1), ]) {
  #allowing multiple bin sizes, just in case.
  bins = 2
  #Split the topic's tokens into 'bins' equal-sized year groups at the quantiles.
  breaks = quantile(thisTopic$year, na.rm = T, probs = seq(0, 1, length.out = bins + 1))
  thisTopic$era = cut(thisTopic$year, breaks = breaks, include.lowest = T,
    labels = paste("From", breaks[-(bins + 1)], "to", breaks[-1]))
  tabbed = table(thisTopic$word,thisTopic$era)
  tabbed = tabbed[order(-rowSums(tabbed)),]
  class(tabbed)='matrix'

  #For log-likelihood purposes, consider any non-appearances to be half-appearances.
  tabbed[tabbed==0] = .5

  #Dunning's G-squared log-likelihood ratio for a contingency table k;
  #llr.H is minus the total count times the Shannon entropy of the cell proportions.
  llr = function(k) { 2 * (llr.H(k) - llr.H(rowSums(k)) - llr.H(colSums(k)) ) }
  llr.H = function(k) { total = sum(k) ; sum( k * log(k / total)) }

  #Append a signed Dunning score for each word: positive if the word is more
  #common in the first period, negative if more common in the second.
  tabbed = cbind(tabbed,round(apply(tabbed, 1, function(row) {
    k = rbind(row, colSums(tabbed))
    2 * (llr.H(k) - llr.H(rowSums(k)) - llr.H(colSums(k))) * (as.numeric(row[1]>row[2])*2-1)
  }),3))
  #A grabbag of outputs to look at
  output = list(
    #all words with their frequencies in each of the year bins
    allWords = tabbed,
    #the name of the topic
    topic = as.character(thisTopic$topic[1]),
    #the number of words in the topic
    size = nrow(thisTopic),
    #the mean Dunning-log-likelihood across all words in the set: higher
    #means more unlikely
    unlikelihood = mean(abs(tabbed[,3])),
    #A per-token normalization of the Dunning score.
    unlikelihoodPerToken = mean(abs(tabbed[,3]))/nrow(thisTopic),
    #correlation of word counts from group 1 to group 2, log distribution
    overallCorrelation = cor(log(tabbed[,1]),log(tabbed[,2])),
    #the 100 words with the worst Dunning score in either direction
    worstWords =tabbed[order(-abs(tabbed[,3])),][1:100,],
    #the 100 commonest words
    commonestWords = tabbed[1:100,],
    #the 100 commonest words in time group A and B
    topA = rownames(tabbed)[order(-tabbed[,1])][1:100],
    topB = rownames(tabbed)[order(-tabbed[,2])][1:100],
    #the 100 commonest words overall (names only)
    topOverall = rownames(tabbed)[order(-rowSums(tabbed[,1:2]))][1:100])
  #The intergroup correlation for the most common words, log distribution
  output$topCor = cor(log(output$commonestWords[1:100,1]),log(output$commonestWords[1:100,2]))
  #The intergroup correlation for the very most common words.
  output$tenCor = cor(log(output$commonestWords[1:10,1]),log(output$commonestWords[1:10,2]))
  return(output)
}
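
To make the Dunning score concrete, here's a toy check of the same llr/llr.H formulas on a single invented 2x2 table: a word appearing 40 times in one 1,000-token bin and 10 times in the other. The counts are made up purely for illustration, and this uses the standard word-versus-everything-else table; the function above compares each word's row against the full column totals instead, but the formula is the same.

llr.H = function(k) { total = sum(k); sum(k * log(k / total)) }
llr = function(k) { 2 * (llr.H(k) - llr.H(rowSums(k)) - llr.H(colSums(k))) }

# Invented counts: the word vs. everything else, in two 1000-token bins.
k = rbind(word = c(40, 10), rest = c(960, 990))
llr(k)  # about 19.9: a split this lopsided is very unlikely under a shared rate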

Now plow through all the topics and run that function to extract various data about them. The exact metrics I'm using are documented in the comments of the previous function.

# idata.frame (from plyr) wraps the big frame as an immutable reference, so
# dlply doesn't have to copy it for every topic.
topics = idata.frame(topics)
diagnostics = dlply(topics, .(topic), topicBeforeAndAfterSplit, .progress = "text")

unimeasures = ldply(diagnostics, function(list) {
    data.frame(topCor = list$topCor, tenCor = list$tenCor, unlikelihood = list$unlikelihood, 
        perToken = list$unlikelihoodPerToken)
})
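
A quick way to rank the suspects is just to sort that frame, for example:

# Topics whose ten commonest words correlate worst across the two halves.
head(unimeasures[order(unimeasures$tenCor), ], 10)
# A size-adjusted alternative: the per-token Dunning score.
head(unimeasures[order(-unimeasures$perToken), ], 10)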

Plot some of the worst ones. This may not carry over to other models, so the whole block can be skipped.

require(ggplot2)

require(reshape2)
# Lowest ten-word correlation across the two halves = most changed vocabulary.
badTopics = unimeasures$topic[order(unimeasures$tenCor)][1:100]
# Dropping topics whose label starts with a word of fewer than four letters
# clears out the foreign-language topics.
badTopics = badTopics[grep("^\\w\\w\\w\\w", badTopics)]
badTopics = badTopics[1:10]
plottable = ldply(badTopics, function(topic) {
    ranks = apply(-diagnostics[[topic]]$allWords[, 1:2], 2, rank)
    plottable = melt(ranks[1:25, ])
    # plottable = melt(diagnostics[[topic]]$commonestWords[1:20,1:2])
    names(plottable) = c("word", "Period", "count")
    plottable$topic = topic
    plottable
})
# From stackOverflow
reverselog_trans <- function(base = 10) {
    require(scales)
    trans <- function(x) -log(x, base)
    inv <- function(x) base^(-x)
    trans_new(paste0("reverselog-", format(base)), trans, inv, log_breaks(base = base), 
        domain = c(1e-100, Inf))
}

ggplot(plottable, aes(x = Period, y = count, label = paste0(count, ". ", word))) + 
    geom_text(size = 3.5) + geom_path(aes(group = word), alpha = 0.25) + facet_wrap(~topic, 
    scales = "free", nrow = 2) + labs(title = "Splitting in half by year shows two very different vocabularies\n(showing the top 20 words overall in the 10 worst English language topics)") + 
    scale_y_continuous("Rank of word among corpuses top 100 words", trans = "reverselog")

ggplot(plottable, aes(x = Period, y = count, label = paste0(count, ". ", word))) + 
    geom_text(size = 3.5) + geom_path(aes(group = word), alpha = 0.25) + facet_wrap(~topic, 
    scales = "free", nrow = 2) + labs(title = "A random set of 10 topics is better, but still shows some change") + 
    scale_y_continuous("Rank of word among corpuses top 100 words", trans = "reverselog")

ggplot(plottable, aes(x = Period, y = count, label = word)) + geom_text(size = 3.5) + 
    geom_path(aes(group = word), alpha = 0.25) + facet_wrap(~topic, scales = "free", 
    ncol = 2)


Here's a function to plot a word's usage across its top five topics.

wordDist = function(word) {
    # This requires 'topics' to exist as a data frame; 'word' can be a vector
    # of any length.
    wordframe = as.data.frame(topics[topics$word %in% word, ])
    wordframe = wordframe[wordframe$topic %in% names(head(sort(-table(wordframe$topic)), 
        5)), ]
    wordframe$topic = factor(wordframe$topic)
    tabbed = table(round(as.numeric(as.character(wordframe$year)), -1), wordframe$topic)
    yearvalues = melt(tabbed)
    names(yearvalues) = c("year", "topic", "count")
    yearvalues$year = as.numeric(as.character(yearvalues$year))
    require(ggplot2)
    ggplot(yearvalues) + geom_line(aes(x = year, y = count, color = topic)) + 
        labs(title = paste0("Usage of ", paste(word, collapse = "/"), " across different topics")) + 
        theme(legend.position = "bottom")
}

For example:

wordDist(c("represent", "represents", "represented", "representing"))
