The three topics selected for the following analysis are Politics, Laptop and Food. The text for each topic was retrieved from the top 50 websites returned by a Google search.
The text content for each topic was stored in a separate document and then combined into a single document. Let us start the analysis with the word cloud and the co-occurrence graph (COG).
library(text2vec)
library(data.table)
library(stringr)
library(tm)
library(RWeka)
library(tokenizers)
library(slam)
library(wordcloud)
library(igraph)
library(maptpx)
text.clean = function(x) {                    # text data
  require("tm")
  x = gsub("<.*?>", " ", x)                   # regex for removing HTML tags
  x = iconv(x, "latin1", "ASCII", sub = "")   # keep only ASCII characters
  x = gsub("[^[:alnum:]]", " ", x)            # keep only alphanumeric characters
  x = tolower(x)                              # convert to lower case
  x = removeNumbers(x)                        # remove numbers
  x = stripWhitespace(x)                      # collapse repeated white space
  x = gsub("^\\s+|\\s+$", "", x)              # remove leading and trailing white space
  return(x)
}
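As a quick illustration of what text.clean() does, here is a made-up input string (the expected output is an assumption based on the cleaning steps above):
# illustrative check of text.clean() on a toy string
text.clean("<p>Intel Core i7 laptops -- 2023 MODELS!</p>")
## should return roughly: "intel core i laptops models"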
distill.cog = function(mat1,   # input TCM / adjacency matrix
                       title,  # title for the graph
                       s,      # no. of central nodes
                       k1) {   # max no. of connections per central node
  a = colSums(mat1)            # collect colSums into a vector a
  b = order(-a)                # order the vector in decreasing order
  mat2 = mat1[b, b]
  diag(mat2) = 0
  ## go row by row and retain the top k1 adjacencies ##
  wc = NULL
  for (i1 in 1:s) {
    thresh1 = mat2[i1, ][order(-mat2[i1, ])[k1]]
    mat2[i1, mat2[i1, ] < thresh1] = 0    # drop the weaker connections
    mat2[i1, mat2[i1, ] > 0] = 1
    word = names(mat2[i1, mat2[i1, ] > 0])
    mat2[(i1 + 1):nrow(mat2), match(word, colnames(mat2))] = 0
    wc = c(wc, word)
  } # i1 loop ends
  mat3 = mat2[match(wc, colnames(mat2)), match(wc, colnames(mat2))]
  ord = colnames(mat2)[which(!is.na(match(colnames(mat2), colnames(mat3))))]  # drop any NAs from the list
  mat4 = mat3[match(ord, colnames(mat3)), match(ord, colnames(mat3))]
  graph <- graph.adjacency(mat4, mode = "undirected", weighted = T)  # create network object
  graph = simplify(graph)
  V(graph)$color[1:s] = "green"
  V(graph)$color[(s + 1):length(V(graph))] = "pink"
  graph = delete.vertices(graph, V(graph)[degree(graph) == 0])
  plot(graph,
       layout = layout.kamada.kawai,
       main = title)
} # func ends
The above function forms the COG and plots it as a graph. It is called independently for each of the three topics, so each topic gets its own graph.
Now we load the CSV files and clean the data.
file.cr = read.csv("/Users/bhargav/searchfoodText.csv")
file.mi = read.csv("/Users/bhargav/searchlaptopText.csv")
file.lin = read.csv("/Users/bhargav/searchpoliticsText.csv")
file.cr = file.cr[!is.na(file.cr$text) & file.cr$text != '',]    # keep rows with non-missing, non-empty text
file.mi = file.mi[!is.na(file.mi$text) & file.mi$text != '',]
file.lin = file.lin[!is.na(file.lin$text) & file.lin$text != '',]
n = min(nrow(file.cr),nrow(file.mi),nrow(file.lin))
data = data.frame(id = 1:n, text1 = file.cr$text[1:n],
text2 = file.mi$text[1:n],
text3 = file.lin$text[1:n],
stringsAsFactors = F)
data$text = paste(data$text1,data$text2,data$text3)
dim(data)
## [1] 56 5
Once the text is cleaned, the next step is to block words that are out of focus: they are removed using the stopword lists read in below.
# Read Stopwords list
stpw1 = readLines("../../stopwords.txt")# stopwords list from git
stpw2 = tm::stopwords('english') # tm package stop word list; tokenizer package has the same name function
comn = unique(c(stpw1, stpw2)) # union of the two lists
stopwords = unique(gsub("'"," ",comn)) # final stopword list after removing apostrophes
x = text.clean(data$text) # pre-process text corpus
x = removeWords(x,stopwords) # removing stopwords created above
x = stripWhitespace(x) # removing white space
# x = stemDocument(x)
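As a quick check, the stopword removal can be tried on a made-up sentence (the exact result depends on the stopword lists loaded above):
# toy illustration of the stopword removal step; stripWhitespace() cleans up the gaps left behind
stripWhitespace(removeWords("this is the best laptop in the market", stopwords))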
With the stopwords removed, we now create a document-term matrix (DTM) to find the token frequencies.
t1 = Sys.time()
tok_fun = word_tokenizer
it_0 = itoken( x,
#preprocessor = text.clean,
tokenizer = tok_fun,
ids = data$id,
progressbar = F)
vocab = create_vocabulary(it_0,
ngram = c(2L, 2L)
#stopwords = stopwords
)
pruned_vocab = prune_vocabulary(vocab,
term_count_min = 10)
# doc_proportion_max = 0.5,
# doc_proportion_min = 0.001)
vectorizer = vocab_vectorizer(pruned_vocab)
dtm_0 = create_dtm(it_0, vectorizer)
# Sort bi-gram with decreasing order of freq
tsum = as.matrix(t(rollup(dtm_0, 1, na.rm=TRUE, FUN = sum))) # find sum of freq for each term
tsum = tsum[order(tsum, decreasing = T),] #terms in decreasing order of freq
if (length(tsum) > 1000) {n = 1000} else {n = length(tsum)}
tsum = tsum[1:n]
text2 = x
text2 = paste("",text2,"")
pb <- txtProgressBar(min = 1, max = (length(tsum)), style = 3) ; i = 0
for (term in names(tsum)){
i = i + 1
focal.term = gsub("_", " ",term) # in case dot was word-separator
replacement.term = term
text2 = gsub(paste("",focal.term,""),paste("",replacement.term,""), text2)
#setTxtProgressBar(pb, i)
}
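To see what the replacement loop does, here is the same substitution applied to a toy string (illustrative only; it assumes intel_core survived pruning as a frequent bi-gram):
# toy illustration of the bi-gram re-tokenisation above
gsub(" intel core ", " intel_core ", " the new intel core processor ")
## " the new intel_core processor "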
it_m = itoken(text2,
# preprocessor = text.clean,
tokenizer = tok_fun,
ids = data$id,
progressbar = F)
vocab = create_vocabulary(it_m
# ngram = c(2L, 2L),
#stopwords = stopwords
)
pruned_vocab = prune_vocabulary(vocab,
term_count_min = 1)
# doc_proportion_max = 0.5,
# doc_proportion_min = 0.001)
vectorizer = vocab_vectorizer(pruned_vocab)
dtm_m = create_dtm(it_m, vectorizer)
dim(dtm_m)
## [1] 56 24455
dtm = as.DocumentTermMatrix(dtm_m, weighting = weightTf)
print(difftime(Sys.time(), t1, units = 'sec'))
## Time difference of 41.62294 secs
# some basic clean-up ops
dim(dtm)
## [1] 56 24455
a0 = apply(dtm, 1, sum) # apply sum operation to dtm's rows. i.e. get rowSum
dtm = dtm[(a0 > 5),] # retain only those rows with token rowSum >5, i.e. delete empty rows
dim(dtm); rm(a0) # delete a0 object
## [1] 56 24455
a0 = apply(dtm, 2, sum) # use apply() to find colSUms this time
dtm = dtm[, (a0 > 4)] # retain only those terms that occurred > 4 times in the corpus
dim(dtm); rm(a0)
## [1] 56 6663
# view summary wordcloud
a0 = apply(dtm, 2, sum) # colSum vector of dtm
a0[1:5] # view what a0 obj is like
## snowman sliders freeshipping inches_inches adblock
## 5 26 10 8 7
a1 = order(as.vector(a0), decreasing = TRUE) # vector of token locations
a0 = a0[a1] # a0 ordered asper token locations
a0[1:5] # view a0 now
## food laptop laptops politics intel_core
## 1110 843 590 530 464
The word cloud of the combined corpus clearly shows the highest-frequency terms, and the three topics stand out clearly from the rest of the document content.
wordcloud(names(a0), a0,            # invoke wordcloud(); use ?wordcloud for more info
          scale = c(4,1),
          min.freq = 3,
          max.words = 100,
          colors = brewer.pal(8, "Dark2"))
title(sub = "Quick Summary Wordcloud")
Let us have a look at the distilled COG now:
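The call below needs a term-term adjacency matrix adj.mat, which is not constructed in the code shown above. A minimal sketch of one way to build it from the DTM (the variable name adj.mat and the choice of the top 100 terms are assumptions):
# build a term co-occurrence (adjacency) matrix from the DTM for the 100 most frequent terms
a0 = order(apply(dtm, 2, sum), decreasing = TRUE)[1:100]
dtm1 = as.matrix(dtm[, a0])
adj.mat = t(dtm1) %*% dtm1   # term x term co-occurrence counts
diag(adj.mat) = 0            # drop self co-occurrence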
#quartz()
distill.cog(adj.mat, 'Distilled COG for full corpus', 10, 10)
The distilled COG links the most central terms to their strongest co-occurring terms and gives a compact summary of how the topics interconnect in the corpus.
Now let us look at the lift and eta-proportion values, which tell us how strongly each token and each document is associated with the topics.
K = 3 # overriding model fit criterion
# -- run topic model for selected K -- #
summary( simfit <- topics(dtm, K=K, verb=2), nwrd = 12 )
##
## Estimating on a 56 document collection.
## Fitting the 3 topic model.
## log posterior increase: 2308.2, 638.2, 882.9, 547.5, done. (L = -1181739.6)
##
## Top 12 phrases by topic-over-null term lift (and usage %):
##
## [1] 'rousseff', 'resealable', 'lockers', 'derwent', 'flipcharts', 'glues', 'compasses', 'protractors', 'scrapbooks', 'gel', 'multiroom', 'endive' (61.3)
## [2] 'polka', 'burnham', 'divorce', 'migrants', 'migration', 'andromeda', 'cdma', 'compareraja', 'stores_check', 'nom', 'knocks', 'sturgeon' (28.5)
## [3] 'godfrey', 'snowman', 'rasamalaysia', 'souffle', 'princess', 'reinvented', 'expressive', 'troubleshooting', 'housewife', 'berman', 'fpo', 'buckeye' (10.2)
##
## Dispersion = 3.18
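The comment on K above notes that it overrides a model-fit criterion. As a side note, maptpx can itself compare several values of K via approximate Bayes factors; a sketch, not run here:
# optional: let maptpx compare a range of K values and keep the best one
# fit.many = topics(dtm, K = 2:6, verb = 1)
# fit.many$K    # the K retained (highest Bayes factor against the null single-topic model)
# fit.many$BF   # log Bayes factors for each candidate K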
rownames1 = gsub(" ", ".", rownames(simfit$theta)); rownames(simfit$theta) = rownames1;
## what are the factor components of the factorized DTM?
dim(dtm) # size of the orig input matrix
## [1] 56 6663
str(simfit) # structure of the output obj
## List of 6
## $ K : int 3
## $ theta: num [1:6663, 1:3] 5.08e-10 5.08e-10 2.13e-09 1.78e-09 1.60e-09 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ phrase: chr [1:6663] "snowman" "sliders" "freeshipping" "inches_inches" ...
## .. ..$ topic : chr [1:3] "1" "2" "3"
## $ omega: num [1:56, 1:3] 0.000559 0.000292 0.00016 0.512922 0.808919 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ document: chr [1:56] "1" "2" "3" "4" ...
## .. ..$ topic : chr [1:3] "1" "2" "3"
## $ BF : NULL
## $ D :List of 3
## ..$ dispersion: num 3.18
## ..$ pvalue : num 0
## ..$ df : num 322131
## $ X :List of 6
## ..$ i : int [1:55288] 56 56 54 54 54 53 51 51 51 51 ...
## ..$ j : int [1:55288] 1 2 3 4 5 6 7 8 9 10 ...
## ..$ v : num [1:55288] 5 26 10 8 7 36 14 14 24 24 ...
## ..$ nrow : int 56
## ..$ ncol : int 6663
## ..$ dimnames:List of 2
## .. ..$ Docs : chr [1:56] "1" "2" "3" "4" ...
## .. ..$ Terms: chr [1:6663] "snowman" "sliders" "freeshipping" "inches_inches" ...
## ..- attr(*, "class")= chr [1:2] "DocumentTermMatrix" "simple_triplet_matrix"
## ..- attr(*, "weighting")= chr [1:2] "term frequency" "tf"
## - attr(*, "class")= chr "topics"
dim(simfit$theta) # analogous to factor loadings
## [1] 6663 3
dim(simfit$omega) # analogous to factor scores
## [1] 56 3
Below are the theta (term-topic probability) values for the first few tokens:
simfit$theta[1:5,]
## topic
## phrase 1 2 3
## snowman 5.075749e-10 1.845461e-09 2.920818e-04
## sliders 5.075751e-10 1.845461e-09 1.518814e-03
## freeshipping 2.126554e-09 3.513155e-04 2.933013e-09
## inches_inches 1.776377e-09 2.810529e-04 2.933013e-09
## adblock 1.603402e-09 2.459215e-04 2.933013e-09
Below are the omega (document-topic proportion) values for the first few documents:
simfit$omega[1:5,]
## topic
## document 1 2 3
## 1 0.0005590992 0.9989189550 0.0005219458
## 2 0.0002920402 0.9991193911 0.0005885687
## 3 0.0001596683 0.0003159075 0.9995244242
## 4 0.5129219500 0.4853479812 0.0017300689
## 5 0.8089190860 0.1887835890 0.0022973250
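A quick sanity check: the omega values are document-topic proportions, so each row should sum to (approximately) one.
# each row of omega is a set of topic proportions and should sum to ~1
round(rowSums(simfit$omega[1:5, ]), 4)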
# compute the column sums of the DTM in chunks of roughly 1% of the columns each
tst = round(ncol(dtm)/100)   # chunk width
a = rep(tst,99)
b = cumsum(a);rm(a)
b = c(0,b,ncol(dtm))         # chunk boundaries
ss.col = c(NULL)
for (i in 1:(length(b)-1)) {
  tempdtm = dtm[,(b[i]+1):(b[i+1])]
  s = colSums(as.matrix(tempdtm))
  ss.col = c(ss.col,s)
  #print(i)
}
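For a corpus of this size the chunked loop above is equivalent to a single colSums() call; the chunking only matters for very wide DTMs. A minimal check:
# one-line equivalent of the chunked column sums above
ss.col2 = colSums(as.matrix(dtm))
# max(abs(ss.col - ss.col2))   # should be 0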
theta = simfit$theta
lift = theta*0; # lift will have same dimn as the theta matrix
sum1 = sum(dtm)
pterms = ss.col/sum1 # each column's marginal occurrence probability
for (i in 1:nrow(theta)){
  for (j in 1:ncol(theta)){
    ptermtopic = theta[i, j]          # P(term | topic)
    pterm = pterms[i]                 # marginal P(term) in the corpus
    lift[i, j] = ptermtopic/pterm     # lift = P(term | topic) / P(term)
  }
}
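The double loop above can also be written as a single vectorised division, since R recycles a vector down the columns of a matrix. A sketch, assuming pterms is in the same order as the rows of theta (which holds here because both follow the DTM's column order):
# vectorised equivalent of the lift loop above
lift.vec = theta / pterms
# max(abs(lift.vec - lift))   # should be ~0 if the orderings agree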
dim(lift); head(lift, 15)
## [1] 6663 3
## topic
## phrase 1 2 3
## snowman 1.497742e-05 5.445549e-05 8.618693e+00
## sliders 2.880274e-06 1.047221e-05 8.618629e+00
## freeshipping 3.137497e-05 5.183274e+00 4.327337e-05
## inches_inches 3.276061e-05 5.183282e+00 5.409172e-05
## adblock 3.379490e-05 5.183288e+00 6.181910e-05
## emi_rs 2.629482e-04 5.182407e+00 1.199793e-05
## mumbai_rajkot 1.446314e+00 3.081649e-05 4.525178e-03
## gurgaon_delhi 1.446314e+00 3.081649e-05 4.525178e-03
## pincode_locations 1.446627e+00 1.800933e-05 2.647700e-03
## product_delivered 1.446627e+00 1.800933e-05 2.647700e-03
## nasik 1.446015e+00 4.311127e-05 6.326799e-03
## stoves 1.444967e+00 8.614228e-05 1.262663e-02
## photoframes 1.444967e+00 8.614228e-05 1.262663e-02
## conservation 1.444967e+00 8.614228e-05 1.262663e-02
## merson 1.445316e+00 7.179873e-05 1.052770e-02
Further down the matrix, the lift values for tokens 25 to 35 are shown below:
lift[25:35,]
## topic
## phrase 1 2 3
## adageunicorns 1.750409e-05 5.183362e+00 1.087037e-04
## entrepreneur 2.786827e-03 5.173378e+00 1.731776e-04
## charles_pierce 8.064532e-06 7.466233e-06 8.618585e+00
## staggers 8.105816e-04 2.850908e-04 8.613571e+00
## smartcooky 7.791184e-06 1.800563e-05 8.618614e+00
## statesman 1.578546e-01 1.249039e-02 7.657820e+00
## foodnavigator 1.441350e+00 2.057864e-02 9.240647e-05
## confectionery 1.445557e+00 5.484517e-03 9.209458e-05
## exhaustive 1.442577e+00 1.615626e-02 6.600552e-05
## ratings_based 4.160594e-06 5.183346e+00 2.159149e-05
## prices_ratings 4.160594e-06 5.183346e+00 2.159149e-05
# Generate A censored Lift matrix
censored.lift = lift
for (i in 1:nrow(lift)){
censored.lift[i,][censored.lift[i,] < max(censored.lift[i,])] = 0 # hard assigning tokens to topics
}
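The same censoring can be done without an explicit loop; a compact equivalent that keeps every entry equal to its row maximum and zeroes the rest:
# loop-free equivalent of the censoring step above
censored.lift2 = lift * (lift == apply(lift, 1, max))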
The censored lift values below hard-assign each token to the topic in which its lift is highest; all other entries are set to zero.
head(censored.lift, 10)
## topic
## phrase 1 2 3
## snowman 0.000000 0.000000 8.618693
## sliders 0.000000 0.000000 8.618629
## freeshipping 0.000000 5.183274 0.000000
## inches_inches 0.000000 5.183282 0.000000
## adblock 0.000000 5.183288 0.000000
## emi_rs 0.000000 5.182407 0.000000
## mumbai_rajkot 1.446314 0.000000 0.000000
## gurgaon_delhi 1.446314 0.000000 0.000000
## pincode_locations 1.446627 0.000000 0.000000
## product_delivered 1.446627 0.000000 0.000000
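A quick count of how many tokens end up hard-assigned to each topic:
# number of tokens hard-assigned to each topic by the censored lift
colSums(censored.lift > 0)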
t = Sys.time()
if(nrow(dtm) < 100) {k1 = 10} else {k1 = 100}  # fewer chunks for very small datasets, to avoid choking the machine
tst = ceiling(nrow(dtm)/k1)   # chunk height: roughly 1/k1 of the rows at a time
a = rep(tst, (k1 - 1))
b = cumsum(a);rm(a)           # cumsum() gives the chunk boundaries
b = c(0, b, nrow(dtm))        # the DTM is processed in row chunks
a0 = which(b > nrow(dtm));    # rounding errors can push a boundary out of bounds
if (length(a0) > 0) {b = b[-a0]}
eta.new = NULL
for (i1 in 1:K){
  a2 = c(NULL)
  for (i in 1:(length(b)-3)) {   # note: (length(b)-3) skips the last two row chunks, so only the first 48 of the 56 documents get eta scores; (length(b)-1) would cover them all
    tempdtm = dtm[(b[i]+1):(b[i+1]),]
    a = matrix(rep(lift[, i1], nrow(tempdtm)), nrow(tempdtm), ncol(tempdtm), byrow = TRUE)
    a1 = rowSums(as.matrix(tempdtm * a))   # eta[doc, topic] = sum over terms of term frequency * lift
    a2 = c(a2, a1); rm(a, a1, tempdtm)
  } # i ends
  eta.new = cbind(eta.new, a2); rm(a2)
} # i1 ends
Sys.time() - t   # takes longer than building lift because the number of columns is now very large
## Time difference of 0.1570508 secs
#rownames(eta.new) = rownames(simfit$omega)
colnames(eta.new) = colnames(simfit$theta)
# so what does eta.new look like? what does it mean?
dim(eta.new)
## [1] 48 3
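Note that eta.new is effectively the matrix product of the DTM with the lift matrix, computed in row chunks to save memory. For a corpus this small the direct product is simpler and covers every document, not just the first 48:
# direct equivalent of the chunked eta computation (fine when the DTM fits in memory as a dense matrix)
eta.direct = as.matrix(dtm) %*% lift
dim(eta.direct)   # one row per document, one column per topic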
The eta values are normalised into topic proportions for each document, summarised below. Each row is a document and each column a topic; the three proportions in a row sum to one and show how strongly that document loads on each topic.
#head(eta.new)
# eta.new = simfit$omega # fallback if the chunked computation above fails, since omega already holds document-topic weights
eta.propn = eta.new / rowSums(eta.new) # calc topic proportions for each document
eta.propn [1:5,]
## 1 2 3
## 1 0.1780223 0.6251038 0.1968739
## 2 0.1251950 0.7039112 0.1708938
## 3 0.1032872 0.1628087 0.7339040
## 4 0.2713160 0.5080330 0.2206510
## 5 0.4042983 0.3690693 0.2266324
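Using these proportions, each document can be hard-assigned to its dominant topic; a quick tally:
# count how many documents load most heavily on each topic
table(apply(eta.propn, 1, which.max))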
df.top.terms = data.frame(NULL)           # can't fit ALL terms in the plot, so choose the top ones by loading
for (i in 1:K){                           # for each topic
  a0 = which(censored.lift[,i] > 1)       # terms with lift greater than 1 for topic i
  freq = theta[a0, i]                     # theta for those terms
  freq = sort(freq, decreasing = T)       # terms with higher probabilities for topic i first
  # auto-correction: sometimes fewer than 100 terms have lift above 1 for a topic
  n = ifelse(length(freq) >= 100, 100, length(freq))
  top_word = as.matrix(freq[1:n])
  top.terms = row.names(top_word)
  df.top.terms.t = data.frame(topic = i, top.terms = top.terms, stringsAsFactors = F)
  df.top.terms = rbind(df.top.terms, df.top.terms.t)
} # i loop ends
# pdf(file = paste0(K,' Topic Model results.pdf'))  # use pdf() to save the plots directly as PDFs in getwd()
for (i in 1:K){                           # for each topic
  a0 = which(censored.lift[,i] > 1)       # terms with lift greater than 1 for topic i
  freq = theta[a0,i]                      # theta for those terms
  freq = sort(freq, decreasing = T)       # terms with higher probabilities for topic i first
  # auto-correction: sometimes fewer than 100 terms have lift above 1 for a topic
  n = ifelse(length(freq) >= 100, 100, length(freq))
  top_word = as.matrix(freq[1:n])
  # sub-TCM restricted to this topic's terms
  sub.tcm = adj.mat[colnames(adj.mat) %in% names(a0), colnames(adj.mat) %in% names(a0)]
  # plot wordcloud
  #windows()
  wordcloud(rownames(top_word), top_word, scale = c(4,.2), min.freq = 1,
            random.order = FALSE, random.color = FALSE,
            colors = brewer.pal(8, "Dark2"))
  mtext(paste("Latent Topic", i, "- Word Cloud"), side = 2, line = 0, cex = 2)
  # plot TCM-based COG
  #windows()
  distill.cog(sub.tcm, '', 5, 5)
  mtext(paste("Latent Topic", i, "- COG"), side = 2, line = 0, cex = 2)
  #mtext(paste("Term co-occurrence - Topic", i), side = 3, line = 2, cex = 2)
} # i loop ends
show.top.loading.rows = function(eta.obj, number.of.units){
  K = ncol(eta.obj)        # no. of topic factors
  n = number.of.units
  top.loaders = NULL
  for (i in 1:K){
    a0 = order(eta.obj[,i], decreasing = TRUE)
    a1 = rownames(eta.obj[a0[1:n],])
    top.loaders = cbind(a1, top.loaders)   # note: columns come out in reverse topic order (topic K first)
  } # i loop ends
  return(top.loaders)
} # func ends
The latent-topic word clouds and COGs above show that the model classifies the content of the documents into the three topics correctly.
show.top.loading.rows(eta.propn, 20)
## a1 a1 a1
## [1,] "3" "2" "24"
## [2,] "20" "29" "27"
## [3,] "13" "1" "31"
## [4,] "43" "38" "15"
## [5,] "6" "41" "10"
## [6,] "48" "47" "32"
## [7,] "18" "25" "34"
## [8,] "36" "40" "26"
## [9,] "39" "22" "16"
## [10,] "26" "8" "28"
## [11,] "28" "4" "23"
## [12,] "21" "45" "17"
## [13,] "14" "12" "30"
## [14,] "45" "33" "9"
## [15,] "19" "35" "39"
## [16,] "22" "7" "11"
## [17,] "11" "37" "44"
## [18,] "30" "21" "46"
## [19,] "32" "46" "5"
## [20,] "25" "42" "36"
show.top.loading.rows(eta.new, 10)
## a1 a1 a1
## [1,] "3" "2" "24"
## [2,] "13" "41" "26"
## [3,] "20" "13" "28"
## [4,] "26" "28" "31"
## [5,] "28" "26" "15"
## [6,] "48" "12" "30"
## [7,] "15" "1" "13"
## [8,] "24" "22" "34"
## [9,] "31" "14" "9"
## [10,] "43" "31" "32"
QNo.1. Is the topic model able to separate each subject from the other subjects? To what extent is it able to do so?
The topic model is able to separate the subjects from one another. However, the Food and Politics pages share some content, so a few words appear in both of those latent topics.
QNo.2. Are there mixed tokens (with high lift in more than one topic)? Are the highest-lift tokens and the document topic proportions (eta scores) clear enough to identify each topic?
There are very few mixed tokens in the topic model. The eta proportion values are clear and identify each topic individually.
QNo.3. What are your learnings from this exercise?
Below are the learnings from this exercise:
1. Even though the three topics were combined into a single corpus, empirical topic modelling successfully identified each topic.
2. The model classifies the tokens, and the theta and omega values clearly indicate which topic a token belongs to, which suggests the model is effective.
3. The latent topics produced by the model are as expected; tuning the value of K could bring the token classification even closer to a clean separation.