The purpose of this document is to determine the optimal number of topics to be used to model the poetry archive.
R packages such as ldatuning and topicmodels are used; they implement the metrics defined below. This method is empirical, in contrast to the intuitive approach of choosing the number of topics by judgment.
Arun2010: The measure is computed as the symmetric KL-divergence of salient distributions derived from the matrix factors of the LDA model. The divergence is observed to be higher for non-optimal numbers of topics, so the preferred number of topics is the one with the lowest value (minimize)
CaoJuan2009: A method for adaptively selecting the best LDA model based on density. (minimize)
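Griffiths2004: To evaluate the consequences of changing the number of topics T, the Gibbs sampling algorithm is used to obtain samples from the posterior distribution over z at several choices of T; these samples give an estimate of the log-likelihood of the data for each T, and the T with the highest value is preferred (maximize)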
Perplexity: The most common way to evaluate a probabilistic model is to measure the log-likelihood of a held-out test set. Perplexity is a measurement of how well a probability distribution or probability model predicts a sample; a lower perplexity on the held-out set indicates a model that generalizes better (minimize)
# Reproducible 80/20 split of the rows of `data` into a training set and a
# hold-out (test) set.
set.seed(12345)
# Draw the row indices from the actual number of rows instead of a hard-coded
# 1:23, so the index pool and the sample size always agree; floor() makes the
# sample size an explicit whole number.
n_rows <- nrow(data)
sampling <- sample(seq_len(n_rows), size = floor(n_rows * 0.8), replace = FALSE)
train_data <- data[sampling, ]
# Negative indexing keeps every row that was not drawn for training.
test_data <- data[-sampling, ]
## Creating the document-term matrix for train data
# Wrap the training texts in a tm corpus, normalise the text step by step, and
# build both term-document and document-term matrices from the cleaned corpus.
doc.vec_train <- VectorSource(train_data)
doc.corpus_train <- Corpus(doc.vec_train)
# tolower() is a plain base function, not a tm transformation; wrapping it in
# content_transformer() keeps the corpus elements valid text documents.
doc.corpus_train <- tm_map(doc.corpus_train, content_transformer(tolower))
doc.corpus_train <- tm_map(doc.corpus_train, removePunctuation)
doc.corpus_train <- tm_map(doc.corpus_train, removeNumbers)
doc.corpus_train <- tm_map(doc.corpus_train, removeWords, stopwords("english"))
# Collapse the whitespace runs left behind by the removals above.
doc.corpus_train <- tm_map(doc.corpus_train, stripWhitespace)
TDM_train <- TermDocumentMatrix(doc.corpus_train)
DTM_train <- DocumentTermMatrix(doc.corpus_train)
## Creating the document term matrix for test data
# Apply the same cleaning steps as for the training data to the hold-out
# texts, then build the term-document and document-term matrices.
doc.vec_test <- VectorSource(test_data)
doc.corpus_test <- Corpus(doc.vec_test)
# tolower() is a plain base function, not a tm transformation; wrapping it in
# content_transformer() keeps the corpus elements valid text documents.
doc.corpus_test <- tm_map(doc.corpus_test, content_transformer(tolower))
doc.corpus_test <- tm_map(doc.corpus_test, removePunctuation)
doc.corpus_test <- tm_map(doc.corpus_test, removeNumbers)
doc.corpus_test <- tm_map(doc.corpus_test, removeWords, stopwords("english"))
# Collapse the whitespace runs left behind by the removals above.
doc.corpus_test <- tm_map(doc.corpus_test, stripWhitespace)
TDM_test <- TermDocumentMatrix(doc.corpus_test)
DTM_test <- DocumentTermMatrix(doc.corpus_test)
## plot the metrics to get number of topics
# Score every candidate topic count from 2 to 15 on the training
# document-term matrix with the three requested metrics; system.time()
# reports how long the whole search takes.
system.time({
  tunes <- FindTopicsNumber(
    dtm = DTM_train,
    topics = 2:15,
    method = "Gibbs",
    metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010"),
    control = list(seed = 12345),
    verbose = TRUE,
    mc.cores = 4L
  )
})
## fit models... done.
## calculate metrics:
## Griffiths2004... done.
## CaoJuan2009... done.
## Arun2010... done.
## user system elapsed
## 0.41 0.05 3.25
# Draw the metric values against the candidate topic counts.
FindTopicsNumber_plot(tunes)
## Using perplexity for hold out set
# Candidate topic counts and Gibbs sampler settings.
topics <- 2:15
burnin <- 100
iter <- 1000
keep <- 50
# Preallocate one row per candidate topic count. The topic count is stored as
# the row name, so row names map directly to the x axis and no empty filler
# row is created for the unused index 1.
perplexity_df <- data.frame(
  train = rep(NA_real_, length(topics)),
  test = rep(NA_real_, length(topics)),
  row.names = as.character(topics)
)
set.seed(12345)
for (i in topics) {
  # Fit an LDA model with i topics on the training documents.
  fitted <- LDA(DTM_train, k = i, method = "Gibbs",
                control = list(burnin = burnin, iter = iter, keep = keep))
  # Record perplexity on the data the model was fitted to and on the
  # hold-out documents; rows are addressed by name, i.e. by topic count.
  perplexity_df[as.character(i), "train"] <- perplexity(fitted, newdata = DTM_train)
  perplexity_df[as.character(i), "test"] <- perplexity(fitted, newdata = DTM_test)
}
## plotting the perplexity of both train and test
# The number of topics is read from the row names of perplexity_df.
# Red line: hold-out (test) perplexity; green line: training perplexity.
g <- ggplot(
  data = perplexity_df,
  aes(x = as.numeric(row.names(perplexity_df)))
) +
  labs(y = "Perplexity", x = "Number of topics") +
  ggtitle("Perplexity of hold out and training data") +
  geom_line(aes(y = test), colour = "red") +
  geom_line(aes(y = train), colour = "green")
g
#----------------5-fold cross-validation, different numbers of topics----------------
# Build a cleaned corpus and its term-document / document-term matrices from
# the complete data set (no train/test split) for use in cross-validation.
doc.vec <- VectorSource(unlist(data))
doc.corpus <- Corpus(doc.vec)
# tolower() is a plain base function, not a tm transformation; wrapping it in
# content_transformer() keeps the corpus elements valid text documents.
doc.corpus <- tm_map(doc.corpus, content_transformer(tolower))
doc.corpus <- tm_map(doc.corpus, removePunctuation)
doc.corpus <- tm_map(doc.corpus, removeNumbers)
doc.corpus <- tm_map(doc.corpus, removeWords, stopwords("english"))
# Collapse the whitespace runs left behind by the removals above.
doc.corpus <- tm_map(doc.corpus, stripWhitespace)
TDM <- TermDocumentMatrix(doc.corpus)
DTM <- DocumentTermMatrix(doc.corpus)
#cluster <- makeCluster(detectCores(logical = TRUE) - 1) # leave one CPU spare...
#registerDoParallel(cluster)
#clusterEvalQ(cluster, {
# library(topicmodels)
#})
folds <- 5
# Give every document in DTM a fold label. The labels 1..folds are repeated
# to the number of documents and then shuffled, so the assignment always
# matches the size of DTM (instead of a hard-coded 23) and no fold can end up
# empty as long as there are at least `folds` documents.
splitfolds <- sample(rep(seq_len(folds), length.out = nrow(DTM)))
candidate_k <- 2:15 # candidates for how many topics
#clusterExport(cluster, c("train_set", "burnin", "iter", "keep", "splitfolds", "folds", "candidate_k"))
# We parallelise over the candidate numbers of topics: each worker is given
# one value of k and runs the cross-validation folds for it serially. This
# assumes there are more candidate values of k than cross-validation folds,
# which makes splitting the work by k the more efficient choice.
# The cluster set-up is commented out, so no parallel backend may be
# registered. Register foreach's sequential backend in that case so that
# %dopar% runs without the "no parallel backend registered" warning; a
# backend registered earlier is left untouched.
if (!getDoParRegistered()) {
  registerDoSEQ()
}
# For every candidate k, fit one model per fold on the remaining folds and
# score it on the held-out fold. The per-k matrices (columns: k, perplexity)
# are stacked row-wise into `results`.
system.time({
  results <- foreach(j = seq_along(candidate_k), .combine = rbind) %dopar% {
    k <- candidate_k[j]
    results_1k <- matrix(0, nrow = folds, ncol = 2)
    colnames(results_1k) <- c("k", "perplexity")
    for (i in seq_len(folds)) {
      # Documents labelled i form the validation set; all others train.
      train_set <- DTM[splitfolds != i, ]
      valid_set <- DTM[splitfolds == i, ]
      fitted <- LDA(train_set, k = k, method = "Gibbs",
                    control = list(burnin = burnin, iter = iter, keep = keep))
      results_1k[i, ] <- c(k, perplexity(fitted, newdata = valid_set))
    }
    # Value of the foreach body: one row per fold for this k.
    results_1k
  }
})
## user system elapsed
## 15.55 0.03 15.60
#stopCluster(cluster)
# Plot the cross-validated perplexity of every fold against the candidate
# number of topics, with a smoothed trend line (no confidence band).
results_df <- as.data.frame(results)
ggplot(results_df, aes(x = k, y = perplexity)) +
  geom_point() +
  geom_smooth(se = FALSE) +
  # The title names the data set this document analyses (the poetry archive)
  # and takes the fold count from `folds` instead of hard-coding it.
  ggtitle(
    paste0(folds, "-fold cross-validation of topic modeling with the poetry archive"),
    paste0("(i.e. ", folds, " different models fit for each candidate number of topics)")
  ) +
  labs(x = "Candidate number of topics",
       y = "Perplexity when fitting the trained model to the hold-out set")
## `geom_smooth()` using method = 'loess'
1.Rajkumar Arun, V. Suresh, C. E. Veni Madhavan, and M. N. Narasimha Murthy. 2010. On Finding the Natural Number of Topics with Latent Dirichlet Allocation: Some Observations. In Advances in Knowledge Discovery and Data Mining, Mohammed J. Zaki, Jeffrey Xu Yu, Balaraman Ravindran and Vikram Pudi (eds.). Springer Berlin Heidelberg, 391–402. http://doi.org/10.1007/978-3-642-13657-3_43
2.Cao Juan, Xia Tian, Li Jintao, Zhang Yongdong, and Tang Sheng. 2009. A density-based method for adaptive LDA model selection. Neurocomputing — 16th European Symposium on Artificial Neural Networks 2008 72, 7–9: 1775–1781. http://doi.org/10.1016/j.neucom.2008.06.011
3.Romain Deveaud, Éric SanJuan, and Patrice Bellot. 2014. Accurate and effective latent concept modeling for ad hoc information retrieval. Document numérique 17, 1: 61–84. http://doi.org/10.3166/dn.17.1.61-84
4.Thomas L. Griffiths and Mark Steyvers. 2004. Finding scientific topics. Proceedings of the National Academy of Sciences 101, suppl 1: 5228–5235. http://www.pnas.org/content/101/suppl_1/5228.full
5.Martin Ponweiser. 2012. Latent Dirichlet Allocation in R. Retrieved from http://epub.wu.ac.at/id/eprint/3558
6.http://rpubs.com/nikita-moor/107657 : LDA Tuning Package
7.Cross validation for perplexity http://stackoverflow.com/questions/21355156/topic-models-cross-validation-with-loglikelihood-or-perplexity and http://ellisp.github.io/blog/2017/01/05/topic-model-cv
8.The lecture from David Blei himself on LDA; http://videolectures.net/mlss09uk_blei_tm/