Topic Modeling the Tracts for the Times

For this week, I used the Tracts for the Times text files and generated topic models.

Sys.setenv(NOAWT=TRUE) #Not sure what this means
require(mallet)

## Loading required package: mallet
## Loading required package: rJava

documents <- mallet.read.dir("~/Desktop/Clio-3/tracts-for-the-times") #Read this directory and its files into mallet format and then put in "documents"
mallet.instances <- mallet.import(documents$id, documents$text, "~/Desktop/Clio-3/stoplist.txt", token.regexp = "\\p{L}[\\p{L}\\p{P}]+\\p{L}") #Format text files into a 
#form mallet can read and add a stoplist text file to the designated dir.

n.topics <- 20 #specify number of topics
topic.model <- MalletLDA(n.topics)

topic.model$loadDocuments(mallet.instances) #apply topic models to documents

vocabulary <- topic.model$getVocabulary() #lists all the unique words
vocabulary[1:50]

##  [1] "thoughts"        "ministerial"     "commission"     
##  [4] "respectfully"    "addressed"       "clergy"         
##  [7] "number"          "newman"          "one"            
## [10] "yourselves,---a" "presbyter"       "conceal"        
## [13] "name"            "lest"            "take"           
## [16] "too"             "myself"          "speaking"       
## [19] "own"             "person"          "speak"          
## [22] "times"           "evil"            "speaks"         
## [25] "against"         "look"            "perform"        
## [28] "confess"         "peril"           "church"         
## [31] "come"            "sit"             "his"            
## [34] "retirement"      "mountains"       "seas"           
## [37] "cut"             "off"             "brother"        
## [40] "suffer"          "while"           "try"            
## [43] "draw"            "forth"           "pleasant"       
## [46] "retreats"        "our"             "blessedness"    
## [49] "hitherto"        "enjoy"

word.freqs <- mallet.word.freqs(topic.model) #lists all the unique words with document and corpus frequency
head(word.freqs)

##          words term.freq doc.freq
## 1     thoughts         8        4
## 2  ministerial        31       18
## 3   commission       130       31
## 4 respectfully         5        4
## 5    addressed        32       17
## 6       clergy       125       38

topic.model$setAlphaOptimization(20, 50) #Optimize the intervals

topic.model$train(400) #Specify number of iterations- not sure what this means

topic.words <- mallet.topic.words(topic.model, smoothed=T, normalized=T)
topics.labels <- rep("", n.topics)
for (topic in 1:n.topics) topics.labels[topic] <- paste(mallet.top.words(topic.model, topic.words[topic,], num.top.words=5)$words, collapse=" ")
#generates labels for the topics by top words in each topic

print(topics.labels)

##  [1] "doctrine articles article church say"    
##  [2] "said sir will think say"                 
##  [3] "antichrist come time will rome"          
##  [4] "fathers sense mystical meaning his"      
##  [5] "baptism spirit water baptized sins"      
##  [6] "day prayer daily service prayers"        
##  [7] "will god sin thy thou"                   
##  [8] "unto thy thee prayer prayers"            
##  [9] "body christ bread blood sacrament"       
## [10] "his our god him church"                  
## [11] "purgatory fire dead judgment doctrine"   
## [12] "one first his seem new"                  
## [13] "our scripture will one word"             
## [14] "knowledge great divine principle system" 
## [15] "church day state her present"            
## [16] "church apostles bishops authority bishop"
## [17] "sacrifice god offered christ eucharist"  
## [18] "doctrine his system man truth"           
## [19] "one great will say well"                 
## [20] "fasting our fast days fasts"

topic.words.m <- mallet.topic.words(topic.model, smoothed=TRUE,
                                    normalized=TRUE)
dim(topic.words.m) #not sure what this does

## [1]    20 21444

vocabulary <- topic.model$getVocabulary() 
colnames(topic.words.m) <- vocabulary #assigns topic names to columns
topic.words.m[1:3, 1:3]

##       thoughts ministerial commission
## [1,] 2.864e-06   2.864e-06  2.864e-06
## [2,] 3.948e-06   3.948e-06  3.948e-06
## [3,] 3.925e-06   3.925e-06  3.925e-06

keywords <- c("body", "blood") #chooses two topic model words and finds their frequency in each topic. Not sure how this works
topic.words.m[, keywords]

##            body     blood
##  [1,] 1.080e-04 2.864e-06
##  [2,] 3.948e-06 3.948e-06
##  [3,] 3.925e-06 3.925e-06
##  [4,] 2.697e-06 2.697e-06
##  [5,] 3.782e-06 3.782e-06
##  [6,] 3.518e-06 3.518e-06
##  [7,] 1.714e-06 1.714e-06
##  [8,] 7.841e-03 1.873e-06
##  [9,] 5.349e-02 2.454e-02
## [10,] 4.134e-03 1.345e-03
## [11,] 3.766e-06 3.766e-06
## [12,] 1.086e-06 1.086e-06
## [13,] 9.319e-07 9.319e-07
## [14,] 1.296e-06 1.296e-06
## [15,] 2.045e-06 2.045e-06
## [16,] 1.894e-03 2.342e-06
## [17,] 8.575e-07 1.074e-02
## [18,] 3.540e-06 3.540e-06
## [19,] 6.563e-07 6.563e-07
## [20,] 1.554e-03 9.733e-04

imp.row <- which(rowSums(topic.words.m[, keywords]) == max(rowSums(topic.words.m[, keywords])))
mallet.top.words(topic.model, topic.words.m[imp.row,], 10)

##               words  weights
## body           body 0.053493
## christ       christ 0.043084
## bread         bread 0.030623
## blood         blood 0.024544
## sacrament sacrament 0.024164
## wine           wine 0.013907
## substance substance 0.009804
## fathers     fathers 0.007600
## manner       manner 0.007600
## said           said 0.007372

library(wordcloud)

## Loading required package: RColorBrewer

topic.top.words <- mallet.top.words(topic.model, topic.words.m[imp.row,], 100)
wordcloud(topic.top.words$words, topic.top.words$weights,
          c(4,.8), rot.per=0, random.order=F) #makes a word cloud of the body and blood topic

plot of chunk unnamed-chunk-13

doc.topics.m <- mallet.doc.topics(topic.model, smoothed=T,
                                  normalized=T) 
#normalizes the doc.topics.m?

write.csv(doc.topics.m, "~/Desktop/doc-topics.csv")
#generates a csv file in the designated dir

head(doc.topics.m)

##           [,1]      [,2]      [,3]      [,4]      [,5]      [,6]    [,7]
## [1,] 0.0002964 0.0121504 0.0003420 1.289e-04 0.0003057 2.438e-04 0.06678
## [2,] 0.0541281 0.0141841 0.0155392 1.358e-04 0.0003222 2.570e-04 0.08279
## [3,] 0.0230966 0.1545656 0.0032561 7.521e-05 0.0001784 2.459e-02 0.10774
## [4,] 0.0171878 0.0444773 0.0008593 8.924e-03 0.0001591 8.082e-04 0.02180
## [5,] 0.0008444 0.0001059 0.0004783 3.733e-05 0.0080531 7.063e-05 0.01289
## [6,] 0.0003762 0.0021259 0.0004341 1.636e-04 0.0003880 3.095e-04 0.09140
##           [,8]      [,9]  [,10]     [,11]    [,12]   [,13]     [,14]
## [1,] 0.0199809 0.0028917 0.2412 9.018e-05 0.006498 0.17954 0.0138125
## [2,] 0.0003578 0.0002875 0.1700 9.504e-05 0.041345 0.11882 0.0007569
## [3,] 0.0001981 0.0001592 0.1018 4.637e-03 0.061865 0.16513 0.0195221
## [4,] 0.0001767 0.0001420 0.1862 7.283e-04 0.065383 0.14929 0.0691891
## [5,] 0.0175444 0.0152495 0.2159 2.612e-05 0.076597 0.04897 0.0210675
## [6,] 0.0020929 0.0003463 0.1632 1.145e-04 0.194388 0.16971 0.0540947
##        [,15]   [,16]     [,17]    [,18]   [,19]     [,20]
## [1,] 0.05899 0.20246 1.563e-04 0.023955 0.15282 0.0173434
## [2,] 0.14220 0.12919 1.545e-03 0.016965 0.20796 0.0030980
## [3,] 0.09785 0.02569 9.119e-05 0.033846 0.17552 0.0001873
## [4,] 0.04977 0.11625 1.643e-02 0.016553 0.21987 0.0158378
## [5,] 0.08839 0.38557 4.526e-05 0.004663 0.09470 0.0088160
## [6,] 0.15631 0.05421 1.983e-04 0.022095 0.08594 0.0020693

library(reshape)
library(ggplot2)
library(dplyr)

## 
## Attaching package: 'dplyr'
## 
## The following object is masked from 'package:reshape':
## 
##     rename
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)
library(lubridate)

## 
## Attaching package: 'lubridate'
## 
## The following object is masked from 'package:reshape':
## 
##     stamp

doc_topics_tidy <- melt(doc.topics.m,
                        id=c("V1:20"),
                        variable_name = "Frequency")
#Convert wide data frame with each topic as a column name to long format

head(doc_topics_tidy)

##   X1 X2     value
## 1  1  1 0.0002964
## 2  2  1 0.0541281
## 3  3  1 0.0230966
## 4  4  1 0.0171878
## 5  5  1 0.0008444
## 6  6  1 0.0003762

ggplot(data = doc_topics_tidy, aes(x = X1, y = value, stat = "identity")) + 
geom_density(stat = "identity", position = "identity") + facet_wrap(~X2) + ylab("Frequency") + xlab("Tract") + ggtitle("Frequency of Topic Models in Tracts for the Times")

$plot of chunk unnamed-chunk-19$

This graph shows the frequency of each topic in each document. The topics that involve mostly “church”, “god”, etc. remain constant, but the topic models that involve “blood”, “baptism”, “purgatory”, etc have spikes in certain tracts. This distant text analysis shows that though every tract discusses religious matters, different tracts emphasize certain sacraments, beliefs, and religious practices.

print(topics.labels)

##  [1] "doctrine articles article church say"    
##  [2] "said sir will think say"                 
##  [3] "antichrist come time will rome"          
##  [4] "fathers sense mystical meaning his"      
##  [5] "baptism spirit water baptized sins"      
##  [6] "day prayer daily service prayers"        
##  [7] "will god sin thy thou"                   
##  [8] "unto thy thee prayer prayers"            
##  [9] "body christ bread blood sacrament"       
## [10] "his our god him church"                  
## [11] "purgatory fire dead judgment doctrine"   
## [12] "one first his seem new"                  
## [13] "our scripture will one word"             
## [14] "knowledge great divine principle system" 
## [15] "church day state her present"            
## [16] "church apostles bishops authority bishop"
## [17] "sacrifice god offered christ eucharist"  
## [18] "doctrine his system man truth"           
## [19] "one great will say well"                 
## [20] "fasting our fast days fasts"

doc1 <- doc_topics_tidy %>%
  filter(X1 == "1")

ggplot(data = doc1, aes(x = X2, y = value, stat = "identity")) + geom_line() + ylab("Frequency") + xlab("Topics") + ggtitle("Frequency of Topic Models in Tract01")

plot of chunk unnamed-chunk-21

ggplot(data = doc_topics_tidy, aes(x = X2, y = value, stat = "identity")) + 
geom_density(stat = "identity", position = "identity") + facet_wrap(~X1) + ylab("Frequency") + xlab("Topic") + ggtitle("Frequency of Each Topic Model in each Tract")

plot of chunk unnamed-chunk-22

Topic Modeling the Tracts for the Times

Clio3

October 22, 2014