Objective

The purpose of this document is to determine the optimal number of topics for modeling the poetry archive.
R packages such as ldatuning and topicmodels are used, leveraging the metrics defined below. This approach is empirical, rather than relying on intuition to decide the number of topics.

Metrics Used for Comparison

Arun2010: the measure is computed as the symmetric KL-divergence between salient distributions derived from the two factor matrices of the LDA decomposition (topic-word and document-topic); the divergence is observed to be higher for non-optimal numbers of topics (minimize).

CaoJuan2009: a density-based method for adaptively selecting the best LDA model, where density is measured by the average similarity between topics (minimize).

Griffiths2004: to evaluate the consequences of changing the number of topics T, the Gibbs sampling algorithm is used to obtain samples from the posterior distribution over topic assignments z at several choices of T; the estimated log-likelihood P(w | T) is then compared across values of T (maximize).

Perplexity: the most common way to evaluate a probabilistic model is to measure the log-likelihood of a held-out test set. Perplexity measures how well a probability model predicts a sample; lower values indicate a better fit to the held-out data (minimize). The formulas below make these measures concrete.
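For reference, the symmetric KL-divergence underlying Arun2010 and the perplexity measure can be written out. These are the standard definitions (perplexity as computed by topicmodels::perplexity), not anything specific to this archive:

$$D_{\mathrm{sym}}(P, Q) = \sum_i p_i \log\frac{p_i}{q_i} + \sum_i q_i \log\frac{q_i}{p_i}$$

$$\mathrm{Perplexity}(\mathbf{w}) = \exp\!\left( -\frac{\log p(\mathbf{w})}{\sum_{d=1}^{D} N_d} \right), \qquad N_d = \text{number of tokens in document } d$$

Since perplexity is the exponentiated negative log-likelihood per token, lower values on held-out documents indicate better generalization.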

library(tm); library(topicmodels); library(ldatuning)    # corpus prep, LDA, tuning metrics
library(ggplot2); library(foreach); library(doParallel)  # plots, parallel cross-validation

## 80/20 train/test split of the documents
set.seed(12345)
sampling <- sample(seq_len(nrow(data)), size = floor(nrow(data) * 0.8), replace = FALSE)
train_data <- data[sampling, ]

test_data <- data[-sampling, ]
## Create the document-term matrix for the training data
doc.vec_train <- VectorSource(train_data)
doc.corpus_train <- Corpus(doc.vec_train)
doc.corpus_train <- tm_map(doc.corpus_train, content_transformer(tolower))
doc.corpus_train <- tm_map(doc.corpus_train, removePunctuation)
doc.corpus_train <- tm_map(doc.corpus_train, removeNumbers)
doc.corpus_train <- tm_map(doc.corpus_train, removeWords, stopwords("english"))
doc.corpus_train <- tm_map(doc.corpus_train, stripWhitespace)

TDM_train <- TermDocumentMatrix(doc.corpus_train)
DTM_train <- DocumentTermMatrix(doc.corpus_train)

## Create the document-term matrix for the test data
doc.vec_test <- VectorSource(test_data)
doc.corpus_test  <- Corpus(doc.vec_test)
doc.corpus_test  <- tm_map(doc.corpus_test, content_transformer(tolower))
doc.corpus_test  <- tm_map(doc.corpus_test, removePunctuation)
doc.corpus_test  <- tm_map(doc.corpus_test, removeNumbers)
doc.corpus_test  <- tm_map(doc.corpus_test, removeWords, stopwords("english"))
doc.corpus_test  <- tm_map(doc.corpus_test, stripWhitespace)

TDM_test <- TermDocumentMatrix(doc.corpus_test)
DTM_test <- DocumentTermMatrix(doc.corpus_test)
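
Before tuning, it is worth a quick sanity check that the matrices look sensible. A minimal sketch using standard tm accessors; nothing here is specific to the poetry archive:

## Sanity checks: dimensions are documents x terms
dim(DTM_train)
dim(DTM_test)
## Terms appearing at least 5 times in the training set
findFreqTerms(DTM_train, lowfreq = 5)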

## Plot the metrics to choose the number of topics
system.time({
  tunes <- FindTopicsNumber(
    dtm = DTM_train,
    topics = c(2:15),
    metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010"),
    method = "Gibbs",
    control = list(seed = 12345),
    mc.cores = 4L,
    verbose = TRUE
  )
})
## fit models... done.
## calculate metrics:
##   Griffiths2004... done.
##   CaoJuan2009... done.
##   Arun2010... done.
##    user  system elapsed 
##    0.41    0.05    3.25
FindTopicsNumber_plot(tunes)
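
FindTopicsNumber returns an ordinary data frame (a topics column plus one column per metric), so the plot can be complemented with a rough numeric pick. A minimal sketch; rescale01 is a hypothetical helper, and the equal weighting of the three metrics is an arbitrary assumption:

## Hypothetical helper: rescale a metric to [0, 1] so directions are comparable
rescale01 <- function(x) (x - min(x)) / (max(x) - min(x))

score <- rescale01(tunes$Griffiths2004) +   # to be maximized
  (1 - rescale01(tunes$CaoJuan2009)) +      # to be minimized
  (1 - rescale01(tunes$Arun2010))           # to be minimized
tunes$topics[which.max(score)]  # crude equal-weight consensus choice of k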

## Use perplexity on the hold-out set
topics <- 2:15
burnin <- 100
iter <- 1000
keep <- 50
## one row per candidate k; train/test columns filled in below
perplexity_df <- data.frame(k = topics, train = NA_real_, test = NA_real_)

set.seed(12345)
for (j in seq_along(topics)) {
  fitted <- LDA(DTM_train, k = topics[j], method = "Gibbs",
                control = list(burnin = burnin, iter = iter, keep = keep))
  perplexity_df$train[j] <- perplexity(fitted, newdata = DTM_train)
  perplexity_df$test[j]  <- perplexity(fitted, newdata = DTM_test)
}


## Plot the perplexity of the training and hold-out sets

g <- ggplot(perplexity_df, aes(x = k)) +
  labs(x = "Number of topics", y = "Perplexity") +
  ggtitle("Perplexity of hold-out and training data") +
  geom_line(aes(y = test), colour = "red") +
  geom_line(aes(y = train), colour = "green")
g
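
The same data frame gives a direct numeric answer; this simply reads off the k with the lowest hold-out perplexity from the perplexity_df built above:

## k with the lowest perplexity on the held-out documents
perplexity_df$k[which.min(perplexity_df$test)]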

#----------------5-fold cross-validation, different numbers of topics----------------

doc.vec <- VectorSource(unlist(data))
doc.corpus  <- Corpus(doc.vec)
doc.corpus  <- tm_map(doc.corpus, content_transformer(tolower))
doc.corpus  <- tm_map(doc.corpus, removePunctuation)
doc.corpus  <- tm_map(doc.corpus, removeNumbers)
doc.corpus  <- tm_map(doc.corpus, removeWords, stopwords("english"))
doc.corpus  <- tm_map(doc.corpus, stripWhitespace)

TDM <- TermDocumentMatrix(doc.corpus)
DTM <- DocumentTermMatrix(doc.corpus)
cluster <- makeCluster(detectCores(logical = TRUE) - 1) # leave one CPU spare
registerDoParallel(cluster)

clusterEvalQ(cluster, {
  library(topicmodels)
})

folds <- 5
splitfolds <- sample(1:folds, nrow(DTM), replace = TRUE) # fold assignment for each document
candidate_k <- 2:15 # candidates for how many topics
clusterExport(cluster, c("DTM", "burnin", "iter", "keep", "splitfolds", "folds", "candidate_k"))

# We parallelize over the different numbers of topics: each worker is allocated a
# value of k and runs the cross-validation for that k serially. This is because
# there are assumed to be more candidate values of k than cross-validation folds,
# so parallelizing over k is more efficient.
system.time({
  results <- foreach(j = 1:length(candidate_k), .combine = rbind) %dopar%{
    k <- candidate_k[j]
    results_1k <- matrix(0, nrow = folds, ncol = 2)
    colnames(results_1k) <- c("k", "perplexity")
    for(i in 1:folds){
      train_set <- DTM[splitfolds != i , ]
      valid_set <- DTM[splitfolds == i, ]
      
      fitted <- LDA(train_set, k = k, method = "Gibbs",
                    control = list(burnin = burnin, iter = iter, keep = keep) )
      results_1k[i,] <- c(k, perplexity(fitted, newdata = valid_set))
    }
    return(results_1k)
  }
})
##    user  system elapsed 
##   15.55    0.03   15.60
stopCluster(cluster)

results_df <- as.data.frame(results)

ggplot(results_df, aes(x = k, y = perplexity)) +
  geom_point() +
  geom_smooth(se = FALSE) +
  ggtitle("5-fold cross-validation of topic modeling with the Elliston dataset",
          "(ie five different models fit for each candidate number of topics)") +
  labs(x = "Candidate number of topics", y = "Perplexity when fitting the trained model to the hold-out set")
## `geom_smooth()` using method = 'loess'
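
To summarize the five folds into one number per candidate k, the mean perplexity can be computed with base R. A minimal sketch over the results_df built above:

## Mean cross-validated perplexity per candidate k
mean_perp <- aggregate(perplexity ~ k, data = results_df, FUN = mean)
mean_perp$k[which.min(mean_perp$perplexity)]  # k with the lowest average perplexity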

References

1. Rajkumar Arun, V. Suresh, C. E. Veni Madhavan, and M. N. Narasimha Murthy. 2010. On Finding the Natural Number of Topics with Latent Dirichlet Allocation: Some Observations. In Advances in Knowledge Discovery and Data Mining, Mohammed J. Zaki, Jeffrey Xu Yu, Balaraman Ravindran and Vikram Pudi (eds.). Springer Berlin Heidelberg, 391–402. http://doi.org/10.1007/978-3-642-13657-3_43

2. Cao Juan, Xia Tian, Li Jintao, Zhang Yongdong, and Tang Sheng. 2009. A density-based method for adaptive LDA model selection. Neurocomputing 72, 7–9 (16th European Symposium on Artificial Neural Networks 2008): 1775–1781. http://doi.org/10.1016/j.neucom.2008.06.011

3. Romain Deveaud, Éric SanJuan, and Patrice Bellot. 2014. Accurate and effective latent concept modeling for ad hoc information retrieval. Document numérique 17, 1: 61–84. http://doi.org/10.3166/dn.17.1.61-84

4. Thomas L. Griffiths and Mark Steyvers. 2004. Finding scientific topics. Proceedings of the National Academy of Sciences 101, suppl 1: 5228–5235. http://www.pnas.org/content/101/suppl_1/5228.full

5. Martin Ponweiser. 2012. Latent Dirichlet Allocation in R. http://epub.wu.ac.at/id/eprint/3558

6. ldatuning package example: http://rpubs.com/nikita-moor/107657

7. Cross-validation for perplexity: http://stackoverflow.com/questions/21355156/topic-models-cross-validation-with-loglikelihood-or-perplexity and http://ellisp.github.io/blog/2017/01/05/topic-model-cv

8. David Blei's lecture on topic models: http://videolectures.net/mlss09uk_blei_tm/