Install the required packages
require(text2vec)|| install.packages("text2vec")
## Loading required package: text2vec
## [1] TRUE
require(data.table)|| install.packages("data.table")
## Loading required package: data.table
## [1] TRUE
require(stringr)|| install.packages("stringr")
## Loading required package: stringr
## [1] TRUE
require(tm)|| install.packages("tm")
## Loading required package: tm
## Loading required package: NLP
## [1] TRUE
require(RWeka)|| install.packages("RWeka")
## Loading required package: RWeka
## [1] TRUE
require(tokenizers)|| install.packages("tokenizers")
## Loading required package: tokenizers
##
## Attaching package: 'tokenizers'
## The following object is masked from 'package:tm':
##
## stopwords
## [1] TRUE
require(slam)|| install.packages("slam")
## Loading required package: slam
## [1] TRUE
require(wordcloud)|| install.packages("wordcloud")
## Loading required package: wordcloud
## Loading required package: RColorBrewer
## [1] TRUE
require(igraph)|| install.packages("igraph")
## Loading required package: igraph
##
## Attaching package: 'igraph'
## The following object is masked from 'package:stringr':
##
## %>%
## The following objects are masked from 'package:text2vec':
##
## %>%, normalize
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
## [1] TRUE
require(maptpx)|| install.packages("maptpx")
## Loading required package: maptpx
##
## Attaching package: 'maptpx'
## The following object is masked from 'package:igraph':
##
## normalize
## The following object is masked from 'package:text2vec':
##
## normalize
## [1] TRUE
library(text2vec)
library(data.table)
library(stringr)
library(tm)
library(RWeka)
library(tokenizers)
library(slam)
library(wordcloud)
library(igraph)
library(maptpx)
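Note that the require(pkg) || install.packages("pkg") idiom above installs a missing package but does not load it in the same run. A more robust install-and-load sketch (the package vector simply mirrors the list above) is:
pkgs = c("text2vec", "data.table", "stringr", "tm", "RWeka", "tokenizers",
         "slam", "wordcloud", "igraph", "maptpx")
for (pkg in pkgs) {
  if (!requireNamespace(pkg, quietly = TRUE)) install.packages(pkg) # install only if absent
  library(pkg, character.only = TRUE) # then attach it
}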
Clean the text data
text.clean = function(x) # text data
{ require("tm")
x = gsub("<.*?>", " ", x) # regex for removing HTML tags
x = iconv(x, "latin1", "ASCII", sub="") # Keep only ASCII characters
x = gsub("[^[:alnum:]]", " ", x) # keep only alpha numeric
x = tolower(x) # convert to lower case characters
x = removeNumbers(x) # removing numbers
x = stripWhitespace(x) # removing white space
x = gsub("^\\s+|\\s+$", "", x) # remove leading and trailing white space
return(x)
}
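A quick sanity check of text.clean() on a made-up snippet (not from the corpus) shows what gets stripped out:
text.clean("<b>Yoga</b> & Leadership: 3 tips!") # should give "yoga leadership tips": tags, punctuation, numbers and case are removed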
COG (co-occurrence graph) function
distill.cog = function(mat1, # input TCM ADJ MAT
title, # title for the graph
s, # no. of central nodes
k1){ # max no. of connections
a = colSums(mat1) # collect colsums into a vector obj a
b = order(-a) # nice syntax for ordering vector in decr order
mat2 = mat1[b,b] # reorder rows and columns by decreasing column sum
diag(mat2) = 0
## +++ go row by row and find top k adjacencies +++ ##
wc = NULL
for (i1 in 1:s){
thresh1 = mat2[i1,][order(-mat2[i1, ])[k1]]
mat2[i1, mat2[i1,] < thresh1] = 0 # zero out this row's entries below its k1-th largest value
mat2[i1, mat2[i1,] > 0 ] = 1
word = names(mat2[i1, mat2[i1,] > 0])
mat2[(i1+1):nrow(mat2), match(word,colnames(mat2))] = 0
wc = c(wc,word)
} # i1 loop ends
mat3 = mat2[match(wc, colnames(mat2)), match(wc, colnames(mat2))]
ord = colnames(mat2)[which(!is.na(match(colnames(mat2), colnames(mat3))))] # removed any NAs from the list
mat4 = mat3[match(ord, colnames(mat3)), match(ord, colnames(mat3))]
graph <- graph.adjacency(mat4, mode = "undirected", weighted=T) # Create Network object
graph = simplify(graph)
V(graph)$color[1:s] = "green"
V(graph)$color[(s+1):length(V(graph))] = "pink"
graph = delete.vertices(graph, V(graph)[ degree(graph) == 0 ])
plot(graph,
layout = layout.kamada.kawai,
main = title)
} # func ends
Step 1: The three topics selected for topic analysis are Literature, Yoga, and Leadership. The scraping code is in the attached Python file.
Step 2: 50 articles for each of these topics were scraped using Python code. The final documents are attached.
Step 3: The three text files were concatenated into a single text data frame used for the analysis.
Step 4: A topic model with these 3 topics (K = 3) was fitted.
Step 5: Analyse the topic model results: wordclouds, COGs, and topic proportions in documents. The classwork code for lift and ETA is used directly. Please refer to the attached Word document.
Step 6: Comment on (i) whether the topic model is able to separate each subject from the other subjects, and to what extent. Answer: yes, the topic model separates each subject with roughly 80% accuracy.
(ii) Are there mixed tokens (with high lift in more than one topic)? Are the highest-lift tokens and the document topic proportions (ETA scores) clear and able to identify each topic? Answer: yes, they identify each topic distinctly.
(iii) What are your learnings from this exercise? Answer: this exercise helped us learn the TABA workflow end to end and relate it to practical, real-life topic modelling for text analysis.
The reference code is below.
#--------------------------------------------------------#
# Step 1 - Reading text data #
#--------------------------------------------------------#
search_terms = c('Literature','yoga','Leadership')
file.cr = read.csv(file.choose(), stringsAsFactors = F)
file.mi = read.csv(file.choose(), stringsAsFactors = F)
file.lin = read.csv(file.choose(), stringsAsFactors = F)
file.cr = file.cr[!is.na(file.cr$text) & file.cr$text != '',] # drop rows with missing or empty text
file.mi = file.mi[!is.na(file.mi$text) & file.mi$text != '',]
file.lin = file.lin[!is.na(file.lin$text) & file.lin$text != '',]
n = min(nrow(file.cr),nrow(file.mi),nrow(file.lin))
data = data.frame(id = 1:n, text1 = file.cr$text[1:n],
text2 = file.mi$text[1:n],
text3 = file.lin$text[1:n],
stringsAsFactors = F)
data$text = paste(data$text1,data$text2,data$text3)
dim(data)
## [1] 51 5
Read Stopwords
# Read Stopwords list
stpw1 = readLines('https://raw.githubusercontent.com/sudhir-voleti/basic-text-analysis-shinyapp/master/data/stopwords.txt')# stopwords list from git
stpw2 = tm::stopwords('english') # tm package stopword list; the tokenizers package has a function of the same name
comn = unique(c(stpw1, stpw2)) # union of the two lists
stopwords = unique(gsub("'"," ",comn)) # final stopword list after removing apostrophes
x = text.clean(data$text) # pre-process text corpus
x = removeWords(x,stopwords) # removing stopwords created above
x = stripWhitespace(x) # removing white space
# x = stemDocument(x)
#--------------------------------------------------------#
####### Create DTM using text2vec package #
#--------------------------------------------------------#
t1 = Sys.time()
tok_fun = word_tokenizer
it_0 = itoken( x,
#preprocessor = text.clean,
tokenizer = tok_fun,
ids = data$id,
progressbar = F)
vocab = create_vocabulary(it_0,
ngram = c(2L, 2L)
#stopwords = stopwords
)
pruned_vocab = prune_vocabulary(vocab,
term_count_min = 10)
vectorizer = vocab_vectorizer(pruned_vocab)
dtm_0 = create_dtm(it_0, vectorizer)
# Sort bigrams in decreasing order of frequency
tsum = as.matrix(t(rollup(dtm_0, 1, na.rm=TRUE, FUN = sum))) # find sum of freq for each term
tsum = tsum[order(tsum, decreasing = T),] #terms in decreasing order of freq
head(tsum)
## yoga_adriene play_play privacy_policy duration_minutes
## 118 112 108 107
## ago_cc yoga_classes
## 93 92
tail(tsum)
## fiji_mcalpine tags_books case_studies
## 10 10 10
## kriya_yoga dan_mccarthy strategic_planning
## 10 10 10
text2 = x
text2 = paste("",text2,"")
pb <- txtProgressBar(min = 1, max = (length(tsum)), style = 3) ; i = 0
for (term in names(tsum)){
i = i + 1
focal.term = gsub("_", " ",term) # replace the underscore with a space to recover the original bigram
replacement.term = term
text2 = gsub(paste("",focal.term,""),paste("",replacement.term,""), text2)
# setTxtProgressBar(pb, i)
}
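The loop above re-joins each detected bigram in the running text so that it survives unigram tokenization. For a single term the substitution works like this (a toy sentence, not from the corpus):
gsub(" yoga classes ", " yoga_classes ", " many studios offer yoga classes online ") # gives " many studios offer yoga_classes online "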
it_m = itoken(text2,
# preprocessor = text.clean,
tokenizer = tok_fun,
ids = data$id,
progressbar = F)
vocab = create_vocabulary(it_m
# ngram = c(2L, 2L),
#stopwords = stopwords
)
pruned_vocab = prune_vocabulary(vocab,
term_count_min = 1)
vectorizer = vocab_vectorizer(pruned_vocab)
dtm_m = create_dtm(it_m, vectorizer)
dim(dtm_m)
## [1] 51 24556
dtm = as.DocumentTermMatrix(dtm_m, weighting = weightTf)
#print(difftime(Sys.time(), t1, units = 'sec'))
# some basic clean-up ops
dim(dtm)
## [1] 51 24556
a0 = apply(dtm, 1, sum) # apply sum operation to dtm's rows. i.e. get rowSum
dtm = dtm[(a0 > 5),] # retain only rows with token rowSum > 5, i.e. drop (near-)empty documents
dim(dtm); rm(a0) # delete a0 object
## [1] 51 24556
a0 = apply(dtm, 2, sum) # use apply() to find colSUms this time
dtm = dtm[, (a0 > 4)] # retain only those terms that occurred > 4 times in the corpus
dim(dtm); rm(a0)
## [1] 51 6083
# view summary wordcloud
a0 = apply(dtm, 2, sum) # colSum vector of dtm
a0[1:5] # view what a0 obj is like
## gove michael_gove sahaja view_articles applejack
## 17 16 5 10 6
a1 = order(as.vector(a0), decreasing = TRUE) # vector of token locations
a0 = a0[a1] # a0 ordered as per token locations
a0[1:5] # view a0 now
## yoga leadership literature people leaders
## 1339 1219 859 633 521
windows() # opens new image window
wordcloud(names(a0), a0, # invoke wordcloud() func. Use ?wordcloud for more info
scale=c(4,1),
3, # min.freq
max.words = 100,
colors = brewer.pal(8, "Dark2"))
## Warning in wordcloud(names(a0), a0, scale = c(4, 1), 3, max.words = 100, :
## leadership could not be fit on page. It will not be plotted.
title(sub = "Quick Summary Wordcloud")
#------------------------------------------------------#
# Step 1a - Term Co-occurance Matrix #
#------------------------------------------------------#
pruned_vocab = prune_vocabulary(vocab,
term_count_min = 5)
vectorizer = vocab_vectorizer(pruned_vocab, grow_dtm = FALSE, skip_grams_window = 3L)
tcm = create_tcm(it_m, vectorizer)
tcm.mat = as.matrix(tcm)
adj.mat = tcm.mat + t(tcm.mat)
# how about a quick view of the distilled COG as well, now that we're here?
diag(adj.mat) = 0 # set diagonals of the adj matrix to zero --> a node isn't its own neighbor
a0 = order(apply(adj.mat, 2, sum), decreasing = T)
adj.mat = as.matrix(adj.mat[a0[1:50], a0[1:50]])
windows()
distill.cog(adj.mat, 'Distilled COG for full corpus', 10, 10)
#################################################
## --- Step 2: model based text analytics ------ ###
#################################################
# -- select optimal num of topics
K = 3
## Bayes Factor model selection (should choose K or nearby)
#summary(simselect <- topics(dtm, K=K+c(-4:4)), nwrd=0)
#K = simselect$K; K # change simselect$K to whatever number of topics you want to fit in the model
#K = 3 # overriding model fit criterion
# -- run topic model for selected K -- #
summary( simfit <- topics(dtm, K=K, verb=2), nwrd = 12 )
##
## Estimating on a 51 document collection.
## Fitting the 3 topic model.
## log posterior increase: 1131, 1535.6, 1846.2, 754.7, done. (L = -1043600)
##
## Top 12 phrases by topic-over-null term lift (and usage %):
##
## [1] 'expose', 'exkalibur', 'carnival', 'poll', 'rut', 'antenna', 'darknet', 'ackoff', 'johnmaxwellteam', 'proportional', 'maxwells', 'mckeown' (43.2)
## [2] 'jalan', 'sahaja', 'pfy', 'livingston', 'morristown', 'chatham', 'clifton', 'bloomfield', 'hoboken', 'brussels', 'bangkok', 'eupl' (36.9)
## [3] 'mohanty', 'lakshminath', 'bhojpuri', 'subcontinent', 'colonial', 'dailies', 'pini', 'aryan', 'pli', 'bhajan', 'heinrich', 'feuerstein' (19.9)
##
## Dispersion = 2.48
rownames1 = gsub(" ", ".", rownames(simfit$theta)); rownames(simfit$theta) = rownames1;
## what are the factor components of the factorized DTM?
dim(dtm) # size of the orig input matrix
## [1] 51 6083
str(simfit) # structure of the output obj
## List of 6
## $ K : int 3
## $ theta: num [1:6083, 1:3] 7.65e-10 7.65e-10 7.65e-10 1.56e-07 9.72e-08 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ phrase: chr [1:6083] "gove" "michael_gove" "sahaja" "view_articles" ...
## .. ..$ topic : chr [1:3] "1" "2" "3"
## $ omega: num [1:51, 1:3] 0.0594 0.9999 0.8605 0.7724 0.3794 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ document: chr [1:51] "1" "2" "3" "4" ...
## .. ..$ topic : chr [1:3] "1" "2" "3"
## $ BF : NULL
## $ D :List of 3
## ..$ dispersion: num 2.48
## ..$ pvalue : num 0
## ..$ df : num 262914
## $ X :List of 6
## ..$ i : int [1:46393] 51 51 51 50 50 50 49 49 49 49 ...
## ..$ j : int [1:46393] 1 2 3 4 5 6 7 8 9 10 ...
## ..$ v : num [1:46393] 17 16 5 10 6 7 6 6 9 12 ...
## ..$ nrow : int 51
## ..$ ncol : int 6083
## ..$ dimnames:List of 2
## .. ..$ Docs : chr [1:51] "1" "2" "3" "4" ...
## .. ..$ Terms: chr [1:6083] "gove" "michael_gove" "sahaja" "view_articles" ...
## ..- attr(*, "class")= chr [1:2] "DocumentTermMatrix" "simple_triplet_matrix"
## ..- attr(*, "weighting")= chr [1:2] "term frequency" "tf"
## - attr(*, "class")= chr "topics"
dim(simfit$theta) # analogous to factor loadings
## [1] 6083 3
dim(simfit$omega) # analogous to factor scores
## [1] 51 3
simfit$theta[1:5,]
## topic
## phrase 1 2 3
## gove 7.651282e-10 7.119543e-04 1.763899e-09
## michael_gove 7.651282e-10 6.700748e-04 1.763899e-09
## sahaja 7.651276e-10 2.093998e-04 1.763899e-09
## view_articles 1.564708e-07 1.019020e-08 3.209864e-04
## applejack 9.715426e-08 9.526465e-09 1.925831e-04
simfit$omega[1:5,]
## topic
## document 1 2 3
## 1 0.05936005 9.400969e-01 5.430588e-04
## 2 0.99993631 3.476486e-05 2.892947e-05
## 3 0.86045874 5.094990e-04 1.390318e-01
## 4 0.77240090 2.255541e-01 2.044998e-03
## 5 0.37940335 5.593262e-01 6.127042e-02
# ----------------------------------------------------------#
### Step 2a - compute LIFT for all terms across all topics ###
# ----------------------------------------------------------#
tst = round(ncol(dtm)/100)
a = rep(tst,99)
b = cumsum(a);rm(a)
b = c(0,b,ncol(dtm))
ss.col = c(NULL)
for (i in 1:(length(b)-1)) {
tempdtm = dtm[,(b[i]+1):(b[i+1])]
s = colSums(as.matrix(tempdtm))
ss.col = c(ss.col,s)
#print(i)
}
theta = simfit$theta
lift = theta*0; # lift will have the same dimensions as the theta matrix
sum1 = sum(dtm)
pterms = ss.col/sum1 # each term's marginal occurrence probability in the corpus
for (i in 1:nrow(theta)){
for (j in 1:ncol(theta)){
ptermtopic = 0; pterm = 0;
ptermtopic = theta[i, j]
pterm = pterms[i]
lift[i, j] = ptermtopic/pterm # divide each theta cell by the term's marginal occurrence probability
}
}
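The nested loops above divide each topic-specific term probability (theta) by that term's marginal probability in the corpus. Assuming pterms is aligned with the rows of theta, as the loop does, an equivalent vectorized sketch is:
lift.check = sweep(theta, 1, pterms, "/") # divide row i of theta by pterms[i]
all.equal(as.numeric(lift), as.numeric(lift.check)) # should be TRUE if the orderings match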
dim(lift); head(lift, 15)
## [1] 6083 3
## topic
## phrase 1 2 3
## gove 6.009542e-06 5.591899e+00 1.385418e-05
## michael_gove 6.385138e-06 5.591900e+00 1.472007e-05
## sahaja 2.043243e-05 5.591938e+00 4.710421e-05
## view_articles 2.089246e-03 1.360626e-04 4.285907e+00
## applejack 2.162055e-03 2.120004e-04 4.285712e+00
## deviantart 2.136052e-03 1.848800e-04 4.285782e+00
## lumpur 1.739732e-05 5.591927e+00 3.922908e-05
## kuala 1.739732e-05 5.591927e+00 3.922908e-05
## bangkok_thailand 1.159821e-05 5.591912e+00 2.615272e-05
## business_education 8.698662e-06 5.591905e+00 1.961454e-05
## indira_gandhi 8.921211e-06 2.797127e-05 4.291234e+00
## patrika 1.635555e-05 5.128067e-05 4.291251e+00
## mohanty 1.962666e-05 6.153680e-05 4.291258e+00
## deka 1.226666e-05 3.846050e-05 4.291242e+00
## lakshminath 1.962666e-05 6.153680e-05 4.291258e+00
lift[25:35,]
## topic
## phrase 1 2 3
## dan_mccarthy 1.700275e+00 3.825750e-05 2.353312e-05
## displayname 1.700278e+00 5.465352e-05 3.361875e-05
## signinemailaddress 1.700280e+00 6.376241e-05 3.922187e-05
## pacheco 1.700282e+00 7.651486e-05 4.706625e-05
## yogasanas 2.269782e-05 8.426997e-05 4.291210e+00
## gurudev 1.991295e-04 3.527740e-03 4.288063e+00
## namaskar 3.930679e-05 1.877053e-04 4.291112e+00
## sri_sri 9.014242e-05 1.727416e-03 4.289698e+00
## governmental 1.593030e-05 5.305862e-05 4.291234e+00
## art_living 1.890382e-04 3.451274e-03 4.288133e+00
## heinrich 2.023761e-05 6.130130e-05 4.291257e+00
# Generate a censored lift matrix (each token hard-assigned to its highest-lift topic)
censored.lift = lift
for (i in 1:nrow(lift)){
censored.lift[i,][censored.lift[i,] < max(censored.lift[i,])] = 0 # hard assigning tokens to topics
}
head(censored.lift, 10);
## topic
## phrase 1 2 3
## gove 0 5.591899 0.000000
## michael_gove 0 5.591900 0.000000
## sahaja 0 5.591938 0.000000
## view_articles 0 0.000000 4.285907
## applejack 0 0.000000 4.285712
## deviantart 0 0.000000 4.285782
## lumpur 0 5.591927 0.000000
## kuala 0 5.591927 0.000000
## bangkok_thailand 0 5.591912 0.000000
## business_education 0 5.591905 0.000000
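The same hard assignment can be written in one vectorized line (a sketch; ties, if any, are retained in every tying topic, exactly as in the loop above):
censored.lift.check = lift * (lift == apply(lift, 1, max)) # keep only each row's maximum lift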
# get rid of all NAs in the lift matrices
for (i1 in 1:K){
if (sum(is.na(lift[, i1])) > 0) { lift[which(is.na(lift[,i1])), i1] = min(lift[-which(is.na(lift[,i1])),])}
} # i1 ends
## == How does LIFT differ from THETA scores? Let's see == ##
theta = simfit$theta
theta[order(theta[,1], decreasing = TRUE)[1:10],] # arrange topic 1 probys in decr order
## topic
## phrase 1 2 3
## leadership 0.012722369 9.209523e-03 1.135348e-08
## people 0.007679724 1.252465e-03 2.670154e-08
## leaders 0.006634356 1.129760e-08 7.243243e-09
## work 0.006041968 1.878593e-04 1.136938e-06
## leader 0.005818210 3.389151e-04 8.819028e-09
## time 0.005069357 8.681006e-04 5.536136e-06
## life 0.004856699 1.117863e-06 8.272986e-05
## business 0.004396212 2.460971e-03 3.715214e-09
## team 0.004257418 2.371468e-04 5.420051e-09
## learn 0.003673611 2.725468e-04 4.931412e-08
lift[order(lift[,1], decreasing = TRUE)[1:10],] # arrange topic 1 lifts in decr order
## topic
## phrase 1 2 3
## expose 1.700287 6.129159e-05 4.703314e-05
## exkalibur 1.700287 6.129159e-05 4.703314e-05
## carnival 1.700287 6.129159e-05 4.703314e-05
## poll 1.700287 6.129159e-05 4.703314e-05
## rut 1.700287 6.129159e-05 4.703314e-05
## antenna 1.700287 6.129159e-05 4.703314e-05
## darknet 1.700287 6.129159e-05 4.703314e-05
## ackoff 1.700287 6.129159e-05 4.703314e-05
## johnmaxwellteam 1.700287 6.129159e-05 4.703314e-05
## proportional 1.700287 6.129159e-05 4.703314e-05
theta[order(theta[,2], decreasing = TRUE)[1:10],] # arrange topic 2 probys in decr order
## topic
## phrase 1 2 3
## yoga 2.497190e-03 0.019510042 2.175877e-02
## literature 2.794546e-04 0.014917876 1.545359e-02
## leadership 1.272237e-02 0.009209523 1.135348e-08
## read 2.134334e-03 0.006379658 1.808206e-06
## contact 3.349407e-04 0.005701600 6.284418e-04
## home 1.295838e-03 0.004602207 2.039438e-04
## classes 3.420503e-04 0.004558053 4.182936e-05
## undergraduate 2.554637e-05 0.004397097 3.043447e-09
## search 3.670670e-04 0.004222756 1.007320e-03
## free 1.587455e-03 0.004127577 1.214232e-04
lift[order(lift[,2], decreasing = TRUE)[1:10],] # arrange topic 2 lifts in decr order
## topic
## phrase 1 2 3
## jalan 1.898051e-05 5.591943 4.705026e-05
## sahaja 2.043243e-05 5.591938 4.710421e-05
## pfy 1.977378e-05 5.591937 4.917960e-05
## livingston 1.977378e-05 5.591937 4.917960e-05
## morristown 1.977378e-05 5.591937 4.917960e-05
## chatham 1.977378e-05 5.591937 4.917960e-05
## clifton 1.977378e-05 5.591937 4.917960e-05
## bloomfield 1.977378e-05 5.591937 4.917960e-05
## hoboken 1.977378e-05 5.591937 4.917960e-05
## brussels 1.981640e-05 5.591937 4.943561e-05
#----------------------------------------------------------------#
# Step 2b - Calculate ETA - each document's score on each topic #
#----------------------------------------------------------------#
t = Sys.time()
if(nrow(dtm) < 100) {k1 = 10} else {k1 = 100} # use fewer chunks for very small datasets
tst = ceiling(nrow(dtm)/k1) # chunk size: roughly 1/k1 of the rows at a time
a = rep(tst, (k1 - 1))
b = cumsum(a);rm(a) # cumulative sums give the chunk boundaries
b = c(0, b, nrow(dtm)) # break the dtm into k1 row chunks
a0 = which(b > nrow(dtm)); # sometimes, rounding errors cause out of bound errors
if (length(a0) > 0) {b = b[-a0]}
n1 = length(b)
if((b[n1] - b[n1-1])==1) {b[n1-1] = b[n1-1]-1} else { b[n1-1] = b[n1-1]}
eta.new = NULL
for (i1 in 1:K){
a2 = c(NULL)
for (i in 1:(length(b)-1)) {
tempdtm = as.matrix(dtm[(b[i]+1):(b[i+1]),])
a = matrix(rep(lift[, i1], nrow(tempdtm)), nrow(tempdtm), ncol(tempdtm), byrow = TRUE)
a1 = rowSums(tempdtm * a)
a2 = c(a2, a1); rm(a, a1, tempdtm)
} # i ends
eta.new = cbind(eta.new, a2); rm(a2)
} # i1 ends
Sys.time() - t # will take longer than lift building coz ncol is v v high now
## Time difference of 0.07423306 secs
rownames(eta.new) = rownames(simfit$omega)
colnames(eta.new) = colnames(simfit$theta)
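Each ETA entry is just a lift-weighted sum of a document's token counts, so the chunked loop is equivalent to a single matrix product (a sketch, assuming the dtm columns and the lift rows are in the same term order):
eta.check = as.matrix(dtm) %*% lift # documents x topics matrix of lift-weighted token counts
all.equal(as.numeric(eta.new), as.numeric(eta.check)) # should be TRUE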
# so what does eta.new look like? what does it mean?
dim(eta.new)
## [1] 51 3
round(head(eta.new),2)
## 1 2 3
## 1 1023.12 2915.38 681.92
## 2 41985.69 14137.81 10274.75
## 3 7494.32 2960.91 4625.14
## 4 2106.68 1903.15 775.16
## 5 1118.48 2417.46 1042.20
## 6 500.28 1383.57 473.52
# eta.new = simfit$omega # fallback if the chunked loop fails: use the model's own document-topic weights
eta.propn = eta.new / rowSums(eta.new) # calc topic proportions for each document
eta.propn [1:5,]
## 1 2 3
## 1 0.2214340 0.6309780 0.1475880
## 2 0.6323312 0.2129245 0.1547443
## 3 0.4969586 0.1963422 0.3066993
## 4 0.4402687 0.3977324 0.1619989
## 5 0.2443086 0.5280442 0.2276471
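As a quick check of how cleanly the three subjects separate (Step 6 above), one can hard-assign each document to its dominant topic and tabulate:
dominant.topic = apply(eta.propn, 1, which.max) # topic with the largest proportion in each document
table(dominant.topic) # number of documents falling under each topic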
# ----------------------------------------#
# Step 3 : Plot Wordcloud for each topic #
# ----------------------------------------#
df.top.terms = data.frame(NULL) # can't fit ALL terms in plot, so choose top ones with max loading
for (i in 1:K){ # For each topic
a0 = which(censored.lift[,i] > 1) # terms with lift greater than 1 for topic i
freq = theta[a0, i] # theta for terms with lift > 1
freq = sort(freq, decreasing = T) # terms with higher probabilities for topic i first
# auto-correction: sometimes fewer than 100 terms in a topic have lift above 1
n = ifelse(length(freq) >= 100, 100, length(freq))
top_word = as.matrix(freq[1:n])
top.terms = row.names(top_word)
df.top.terms.t = data.frame(topic = i, top.terms =top.terms, stringsAsFactors = F )
df.top.terms = rbind(df.top.terms, df.top.terms.t )
} # i loop ends
# pdf(file = paste0(K,' Topic Model results.pdf')) # use pdf() func to save plot directly as PDFs in your getwd()
for (i in 1:K){ # For each topic
a0 = which(censored.lift[,i] > 1) # terms with lift greater than 1 for topic i
freq = theta[a0,i] # theta for terms with lift greater than 1
freq = sort(freq, decreasing = T) # terms with higher probabilities for topic i first
# auto-correction: sometimes fewer than 100 terms in a topic have lift above 1
n = ifelse(length(freq) >= 100, 100, length(freq))
top_word = as.matrix(freq[1:n])
# SUB TCM
sub.tcm = adj.mat[colnames(adj.mat) %in% names(a0),colnames(adj.mat) %in% names(a0)]
# Plot wordcloud
windows()
wordcloud(rownames(top_word), top_word, scale=c(4,.2), 1,
random.order=FALSE, random.color=FALSE,
colors=brewer.pal(8, "Dark2"))
mtext(paste("Latent Topic",i), side = 3, line = 2, cex=2)
# PLot TCM
windows()
distill.cog(sub.tcm, '', 5, 5)
mtext(paste("Term co-occurrence - Topic",i), side = 3, line = 2, cex=2)
} # i loop ends
## Warning in vattrs[[name]][index] <- value: number of items to replace is
## not a multiple of replacement length
## Warning in vattrs[[name]][index] <- value: number of items to replace is
## not a multiple of replacement length
## Warning in vattrs[[name]][index] <- value: number of items to replace is
## not a multiple of replacement length
# dev.off() # closes the graphical devices
### which rows load most on which topics? ###
show.top.loading.rows = function(eta.obj, number.of.units){
K = ncol(eta.obj) # no. of topic factors
n = number.of.units
top.loaders = NULL
for (i in 1:K){
a0 = order(eta.obj[,i], decreasing = TRUE)
a1 = rownames(eta.obj[a0[1:n],])
top.loaders = cbind(a1, top.loaders)
} # i loop ends
a2 = matrix()
return(top.loaders)
} # func ends
show.top.loading.rows(eta.propn, 10)
## a1 a1 a1
## [1,] "48" "8" "2"
## [2,] "19" "49" "7"
## [3,] "42" "24" "44"
## [4,] "41" "26" "37"
## [5,] "45" "51" "3"
## [6,] "39" "35" "4"
## [7,] "25" "1" "33"
## [8,] "31" "34" "28"
## [9,] "46" "9" "50"
## [10,] "11" "14" "13"
show.top.loading.rows(eta.new, 10)
## a1 a1 a1
## [1,] "42" "2" "2"
## [2,] "48" "42" "7"
## [3,] "2" "7" "3"
## [4,] "19" "49" "42"
## [5,] "31" "26" "44"
## [6,] "25" "24" "31"
## [7,] "41" "8" "30"
## [8,] "3" "30" "33"
## [9,] "7" "25" "41"
## [10,] "39" "35" "19"