Install the required packages
require(text2vec)|| install.packages("text2vec")
## Loading required package: text2vec
## [1] TRUE
require(data.table)|| install.packages("data.table")
## Loading required package: data.table
## [1] TRUE
require(stringr)|| install.packages("stringr")
## Loading required package: stringr
## [1] TRUE
require(tm)|| install.packages("tm")
## Loading required package: tm
## Loading required package: NLP
## [1] TRUE
require(RWeka)|| install.packages("RWeka")
## Loading required package: RWeka
## [1] TRUE
require(tokenizers)|| install.packages("tokenizers")
## Loading required package: tokenizers
##
## Attaching package: 'tokenizers'
## The following object is masked from 'package:tm':
##
## stopwords
## [1] TRUE
require(slam)|| install.packages("slam")
## Loading required package: slam
## [1] TRUE
require(wordcloud)|| install.packages("wordcloud")
## Loading required package: wordcloud
## Loading required package: RColorBrewer
## [1] TRUE
require(igraph)|| install.packages("igraph")
## Loading required package: igraph
##
## Attaching package: 'igraph'
## The following object is masked from 'package:stringr':
##
## %>%
## The following objects are masked from 'package:text2vec':
##
## %>%, normalize
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
## [1] TRUE
require(maptpx)|| install.packages("maptpx")
## Loading required package: maptpx
##
## Attaching package: 'maptpx'
## The following object is masked from 'package:igraph':
##
## normalize
## The following object is masked from 'package:text2vec':
##
## normalize
## [1] TRUE
library(text2vec)
library(data.table)
library(stringr)
library(tm)
library(RWeka)
library(tokenizers)
library(slam)
library(wordcloud)
library(igraph)
library(maptpx)
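Note that the require(pkg) || install.packages("pkg") idiom above installs a missing package but does not load it in the same run. A more robust install-and-load sketch (the package vector simply mirrors the list above) is:
pkgs = c("text2vec", "data.table", "stringr", "tm", "RWeka", "tokenizers",
         "slam", "wordcloud", "igraph", "maptpx")
for (pkg in pkgs) {
  if (!requireNamespace(pkg, quietly = TRUE)) install.packages(pkg) # install only if absent
  library(pkg, character.only = TRUE) # then attach it
}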
Clean the text data
text.clean = function(x) # text data
{ require("tm")
x = gsub("<.*?>", " ", x) # regex for removing HTML tags
x = iconv(x, "latin1", "ASCII", sub="") # Keep only ASCII characters
x = gsub("[^[:alnum:]]", " ", x) # keep only alpha numeric
x = tolower(x) # convert to lower case characters
x = removeNumbers(x) # removing numbers
x = stripWhitespace(x) # removing white space
x = gsub("^\\s+|\\s+$", "", x) # remove leading and trailing white space
return(x)
}
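A quick sanity check of text.clean() on a made-up snippet (not from the corpus) shows what gets stripped out:
text.clean("<b>Yoga</b> & Leadership: 3 tips!") # should give "yoga leadership tips": tags, punctuation, numbers and case are removed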
COG (co-occurrence graph) function
distill.cog = function(mat1, # input TCM ADJ MAT
title, # title for the graph
s, # no. of central nodes
k1){ # max no. of connections
a = colSums(mat1) # collect colsums into a vector obj a
b = order(-a) # nice syntax for ordering vector in decr order
mat2 = mat1[b,b] # reorder rows and columns by decreasing column sum
diag(mat2) = 0
## +++ go row by row and find top k adjacencies +++ ##
wc = NULL
for (i1 in 1:s){
thresh1 = mat2[i1,][order(-mat2[i1, ])[k1]]
mat2[i1, mat2[i1,] < thresh1] = 0 # zero out this row's entries below its k1-th largest value
mat2[i1, mat2[i1,] > 0 ] = 1
word = names(mat2[i1, mat2[i1,] > 0])
mat2[(i1+1):nrow(mat2), match(word,colnames(mat2))] = 0
wc = c(wc,word)
} # i1 loop ends
mat3 = mat2[match(wc, colnames(mat2)), match(wc, colnames(mat2))]
ord = colnames(mat2)[which(!is.na(match(colnames(mat2), colnames(mat3))))] # removed any NAs from the list
mat4 = mat3[match(ord, colnames(mat3)), match(ord, colnames(mat3))]
graph <- graph.adjacency(mat4, mode = "undirected", weighted=T) # Create Network object
graph = simplify(graph)
V(graph)$color[1:s] = "green"
V(graph)$color[(s+1):length(V(graph))] = "pink"
graph = delete.vertices(graph, V(graph)[ degree(graph) == 0 ])
plot(graph,
layout = layout.kamada.kawai,
main = title)
} # func ends
Step 1: The three topics selected for topic analysis are Literature, Yoga, and Leadership. The scraping code is in the attached Python file.
Step 2: 50 articles for each of these topics were scraped using Python code. The final documents are attached.
Step 3: The three text files were concatenated into a single text data frame used for the analysis.
Step 4: A topic model with these 3 topics (K = 3) was fitted.
Step 5: Analyse the topic model results: wordclouds, COGs, and topic proportions in documents. The classwork code for lift and ETA is used directly. Please refer to the attached Word document.
Step 6: Comment on (i) whether the topic model is able to separate each subject from the other subjects, and to what extent. Answer: yes, the topic model separates each subject with roughly 80% accuracy.
(ii) Are there mixed tokens (with high lift in more than one topic)? Are the highest-lift tokens and the document topic proportions (ETA scores) clear and able to identify each topic? Answer: yes, they identify each topic distinctly.
(iii) What are your learnings from this exercise? Answer: this exercise helped us learn the TABA workflow end to end and relate it to practical, real-life topic modelling for text analysis.
The reference code is below.
#--------------------------------------------------------#
# Step 1 - Reading text data #
#--------------------------------------------------------#
search_terms = c('Literature','yoga','Leadership')
file.cr = read.csv(file.choose(), stringsAsFactors = F)
file.mi = read.csv(file.choose(), stringsAsFactors = F)
file.lin = read.csv(file.choose(), stringsAsFactors = F)
file.cr = file.cr[!is.na(file.cr$text) & file.cr$text != '',] # drop rows with missing or empty text
file.mi = file.mi[!is.na(file.mi$text) & file.mi$text != '',]
file.lin = file.lin[!is.na(file.lin$text) & file.lin$text != '',]
n = min(nrow(file.cr),nrow(file.mi),nrow(file.lin))
data = data.frame(id = 1:n, text1 = file.cr$text[1:n],
text2 = file.mi$text[1:n],
text3 = file.lin$text[1:n],
stringsAsFactors = F)
data$text = paste(data$text1,data$text2,data$text3)
dim(data)
## [1] 51 5
Read Stopwords
# Read Stopwords list
stpw1 = readLines('https://raw.githubusercontent.com/sudhir-voleti/basic-text-analysis-shinyapp/master/data/stopwords.txt')# stopwords list from git
stpw2 = tm::stopwords('english') # tm package stopword list; the tokenizers package has a function of the same name
comn = unique(c(stpw1, stpw2)) # union of the two lists
stopwords = unique(gsub("'"," ",comn)) # final stopword list after removing apostrophes
x = text.clean(data$text) # pre-process text corpus
x = removeWords(x,stopwords) # removing stopwords created above
x = stripWhitespace(x) # removing white space
# x = stemDocument(x)
#--------------------------------------------------------#
####### Create DTM using text2vec package #
#--------------------------------------------------------#
t1 = Sys.time()
tok_fun = word_tokenizer
it_0 = itoken( x,
#preprocessor = text.clean,
tokenizer = tok_fun,
ids = data$id,
progressbar = F)
vocab = create_vocabulary(it_0,
ngram = c(2L, 2L)
#stopwords = stopwords
)
pruned_vocab = prune_vocabulary(vocab,
term_count_min = 10)
vectorizer = vocab_vectorizer(pruned_vocab)
dtm_0 = create_dtm(it_0, vectorizer)
# Sort bigrams in decreasing order of frequency
tsum = as.matrix(t(rollup(dtm_0, 1, na.rm=TRUE, FUN = sum))) # find sum of freq for each term
tsum = tsum[order(tsum, decreasing = T),] #terms in decreasing order of freq
head(tsum)
## yoga_adriene play_play privacy_policy duration_minutes
## 118 112 108 107
## ago_cc yoga_classes
## 93 92
tail(tsum)
## fiji_mcalpine tags_books case_studies
## 10 10 10
## kriya_yoga dan_mccarthy strategic_planning
## 10 10 10
text2 = x
text2 = paste("",text2,"")
pb <- txtProgressBar(min = 1, max = (length(tsum)), style = 3) ; i = 0
for (term in names(tsum)){
i = i + 1
focal.term = gsub("_", " ",term) # replace the underscore with a space to recover the original bigram
replacement.term = term
text2 = gsub(paste("",focal.term,""),paste("",replacement.term,""), text2)
# setTxtProgressBar(pb, i)
}
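The loop above re-joins each detected bigram in the running text so that it survives unigram tokenization. For a single term the substitution works like this (a toy sentence, not from the corpus):
gsub(" yoga classes ", " yoga_classes ", " many studios offer yoga classes online ") # gives " many studios offer yoga_classes online "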
it_m = itoken(text2,
# preprocessor = text.clean,
tokenizer = tok_fun,
ids = data$id,
progressbar = F)
vocab = create_vocabulary(it_m
# ngram = c(2L, 2L),
#stopwords = stopwords
)
pruned_vocab = prune_vocabulary(vocab,
term_count_min = 1)
vectorizer = vocab_vectorizer(pruned_vocab)
dtm_m = create_dtm(it_m, vectorizer)
dim(dtm_m)
## [1] 51 24556
dtm = as.DocumentTermMatrix(dtm_m, weighting = weightTf)
#print(difftime(Sys.time(), t1, units = 'sec'))
# some basic clean-up ops
dim(dtm)
## [1] 51 24556
a0 = apply(dtm, 1, sum) # apply sum operation to dtm's rows. i.e. get rowSum
dtm = dtm[(a0 > 5),] # retain only rows with token rowSum > 5, i.e. drop (near-)empty documents
dim(dtm); rm(a0) # delete a0 object
## [1] 51 24556
a0 = apply(dtm, 2, sum) # use apply() to find colSUms this time
dtm = dtm[, (a0 > 4)] # retain only those terms that occurred > 4 times in the corpus
dim(dtm); rm(a0)
## [1] 51 6083
# view summary wordcloud
a0 = apply(dtm, 2, sum) # colSum vector of dtm
a0[1:5] # view what a0 obj is like
## gove michael_gove sahaja view_articles applejack
## 17 16 5 10 6
a1 = order(as.vector(a0), decreasing = TRUE) # vector of token locations
a0 = a0[a1] # a0 ordered as per token locations
a0[1:5] # view a0 now
## yoga leadership literature people leaders
## 1339 1219 859 633 521
windows() # opens new image window
wordcloud(names(a0), a0, # invoke wordcloud() func. Use ?wordcloud for more info
scale=c(4,1),
3, # min.freq
max.words = 100,
colors = brewer.pal(8, "Dark2"))
## Warning in wordcloud(names(a0), a0, scale = c(4, 1), 3, max.words = 100, :
## leadership could not be fit on page. It will not be plotted.
title(sub = "Quick Summary Wordcloud")
#------------------------------------------------------#
# Step 1a - Term Co-occurance Matrix #
#------------------------------------------------------#
pruned_vocab = prune_vocabulary(vocab,
term_count_min = 5)
vectorizer = vocab_vectorizer(pruned_vocab, grow_dtm = FALSE, skip_grams_window = 3L)
tcm = create_tcm(it_m, vectorizer)
tcm.mat = as.matrix(tcm)
adj.mat = tcm.mat + t(tcm.mat)
# how about a quick view of the distilled COG as well, now that we're here?
diag(adj.mat) = 0 # set diagonals of the adj matrix to zero --> a node isn't its own neighbor
a0 = order(apply(adj.mat, 2, sum), decreasing = T)
adj.mat = as.matrix(adj.mat[a0[1:50], a0[1:50]])
windows()
distill.cog(adj.mat, 'Distilled COG for full corpus', 10, 10)
#################################################
## --- Step 2: model based text analytics ------ ###
#################################################
# -- select optimal num of topics
K = 3
## Bayes Factor model selection (should choose K or nearby)
#summary(simselect <- topics(dtm, K=K+c(-4:4)), nwrd=0)
#K = simselect$K; K # change simselect$K to whatever number of topics you want to fit in the model
#K = 3 # overriding model fit criterion
# -- run topic model for selected K -- #
summary( simfit <- topics(dtm, K=K, verb=2), nwrd = 12 )
##
## Estimating on a 51 document collection.
## Fitting the 3 topic model.
## log posterior increase: 1131, 1535.6, 1846.2, 754.7, done. (L = -1043600)
##
## Top 12 phrases by topic-over-null term lift (and usage %):
##
## [1] 'expose', 'exkalibur', 'carnival', 'poll', 'rut', 'antenna', 'darknet', 'ackoff', 'johnmaxwellteam', 'proportional', 'maxwells', 'mckeown' (43.2)
## [2] 'jalan', 'sahaja', 'pfy', 'livingston', 'morristown', 'chatham', 'clifton', 'bloomfield', 'hoboken', 'brussels', 'bangkok', 'eupl' (36.9)
## [3] 'mohanty', 'lakshminath', 'bhojpuri', 'subcontinent', 'colonial', 'dailies', 'pini', 'aryan', 'pli', 'bhajan', 'heinrich', 'feuerstein' (19.9)
##
## Dispersion = 2.48
rownames1 = gsub(" ", ".", rownames(simfit$theta)); rownames(simfit$theta) = rownames1;
## what are the factor components of the factorized DTM?
dim(dtm) # size of the orig input matrix
## [1] 51 6083
str(simfit) # structure of the output obj
## List of 6
## $ K : int 3
## $ theta: num [1:6083, 1:3] 7.65e-10 7.65e-10 7.65e-10 1.56e-07 9.72e-08 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ phrase: chr [1:6083] "gove" "michael_gove" "sahaja" "view_articles" ...
## .. ..$ topic : chr [1:3] "1" "2" "3"
## $ omega: num [1:51, 1:3] 0.0594 0.9999 0.8605 0.7724 0.3794 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ document: chr [1:51] "1" "2" "3" "4" ...
## .. ..$ topic : chr [1:3] "1" "2" "3"
## $ BF : NULL
## $ D :List of 3
## ..$ dispersion: num 2.48
## ..$ pvalue : num 0
## ..$ df : num 262914
## $ X :List of 6
## ..$ i : int [1:46393] 51 51 51 50 50 50 49 49 49 49 ...
## ..$ j : int [1:46393] 1 2 3 4 5 6 7 8 9 10 ...
## ..$ v : num [1:46393] 17 16 5 10 6 7 6 6 9 12 ...
## ..$ nrow : int 51
## ..$ ncol : int 6083
## ..$ dimnames:List of 2
## .. ..$ Docs : chr [1:51] "1" "2" "3" "4" ...
## .. ..$ Terms: chr [1:6083] "gove" "michael_gove" "sahaja" "view_articles" ...
## ..- attr(*, "class")= chr [1:2] "DocumentTermMatrix" "simple_triplet_matrix"
## ..- attr(*, "weighting")= chr [1:2] "term frequency" "tf"
## - attr(*, "class")= chr "topics"
dim(simfit$theta) # analogous to factor loadings
## [1] 6083 3
dim(simfit$omega) # analogous to factor scores
## [1] 51 3
simfit$theta[1:5,]
## topic
## phrase 1 2 3
## gove 7.651282e-10 7.119543e-04 1.763899e-09
## michael_gove 7.651282e-10 6.700748e-04 1.763899e-09
## sahaja 7.651276e-10 2.093998e-04 1.763899e-09
## view_articles 1.564708e-07 1.019020e-08 3.209864e-04
## applejack 9.715426e-08 9.526465e-09 1.925831e-04
simfit$omega[1:5,]
## topic
## document 1 2 3
## 1 0.05936005 9.400969e-01 5.430588e-04
## 2 0.99993631 3.476486e-05 2.892947e-05
## 3 0.86045874 5.094990e-04 1.390318e-01
## 4 0.77240090 2.255541e-01 2.044998e-03
## 5 0.37940335 5.593262e-01 6.127042e-02
# ----------------------------------------------------------#
### Step 2a - compute LIFT for all terms across all topics ###
# ----------------------------------------------------------#
tst = round(ncol(dtm)/100)
a = rep(tst,99)
b = cumsum(a);rm(a)
b = c(0,b,ncol(dtm))
ss.col = c(NULL)
for (i in 1:(length(b)-1)) {
tempdtm = dtm[,(b[i]+1):(b[i+1])]
s = colSums(as.matrix(tempdtm))
ss.col = c(ss.col,s)
#print(i)
}
theta = simfit$theta
lift = theta*0; # lift will have the same dimensions as the theta matrix
sum1 = sum(dtm)
pterms = ss.col/sum1 # each term's marginal occurrence probability in the corpus
for (i in 1:nrow(theta)){
for (j in 1:ncol(theta)){
ptermtopic = 0; pterm = 0;
ptermtopic = theta[i, j]
pterm = pterms[i]
lift[i, j] = ptermtopic/pterm # divide each theta cell by the term's marginal occurrence probability
}
}
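The nested loops above divide each topic-specific term probability (theta) by that term's marginal probability in the corpus. Assuming pterms is aligned with the rows of theta, as the loop does, an equivalent vectorized sketch is:
lift.check = sweep(theta, 1, pterms, "/") # divide row i of theta by pterms[i]
all.equal(as.numeric(lift), as.numeric(lift.check)) # should be TRUE if the orderings match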
dim(lift); head(lift, 15)
## [1] 6083 3
## topic
## phrase 1 2 3
## gove 6.009542e-06 5.591899e+00 1.385418e-05
## michael_gove 6.385138e-06 5.591900e+00 1.472007e-05
## sahaja 2.043243e-05 5.591938e+00 4.710421e-05
## view_articles 2.089246e-03 1.360626e-04 4.285907e+00
## applejack 2.162055e-03 2.120004e-04 4.285712e+00
## deviantart 2.136052e-03 1.848800e-04 4.285782e+00
## lumpur 1.739732e-05 5.591927e+00 3.922908e-05
## kuala 1.739732e-05 5.591927e+00 3.922908e-05
## bangkok_thailand 1.159821e-05 5.591912e+00 2.615272e-05
## business_education 8.698662e-06 5.591905e+00 1.961454e-05
## indira_gandhi 8.921211e-06 2.797127e-05 4.291234e+00
## patrika 1.635555e-05 5.128067e-05 4.291251e+00
## mohanty 1.962666e-05 6.153680e-05 4.291258e+00
## deka 1.226666e-05 3.846050e-05 4.291242e+00
## lakshminath 1.962666e-05 6.153680e-05 4.291258e+00
lift[25:35,]
## topic
## phrase 1 2 3
## dan_mccarthy 1.700275e+00 3.825750e-05 2.353312e-05
## displayname 1.700278e+00 5.465352e-05 3.361875e-05
## signinemailaddress 1.700280e+00 6.376241e-05 3.922187e-05
## pacheco 1.700282e+00 7.651486e-05 4.706625e-05
## yogasanas 2.269782e-05 8.426997e-05 4.291210e+00
## gurudev 1.991295e-04 3.527740e-03 4.288063e+00
## namaskar 3.930679e-05 1.877053e-04 4.291112e+00
## sri_sri 9.014242e-05 1.727416e-03 4.289698e+00
## governmental 1.593030e-05 5.305862e-05 4.291234e+00
## art_living 1.890382e-04 3.451274e-03 4.288133e+00
## heinrich 2.023761e-05 6.130130e-05 4.291257e+00
# Generate a censored lift matrix (each token hard-assigned to its highest-lift topic)
censored.lift = lift
for (i in 1:nrow(lift)){
censored.lift[i,][censored.lift[i,] < max(censored.lift[i,])] = 0 # hard assigning tokens to topics
}
head(censored.lift, 10);
## topic
## phrase 1 2 3
## gove 0 5.591899 0.000000
## michael_gove 0 5.591900 0.000000
## sahaja 0 5.591938 0.000000
## view_articles 0 0.000000 4.285907
## applejack 0 0.000000 4.285712
## deviantart 0 0.000000 4.285782
## lumpur 0 5.591927 0.000000
## kuala 0 5.591927 0.000000
## bangkok_thailand 0 5.591912 0.000000
## business_education 0 5.591905 0.000000
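The same hard assignment can be written in one vectorized line (a sketch; ties, if any, are retained in every tying topic, exactly as in the loop above):
censored.lift.check = lift * (lift == apply(lift, 1, max)) # keep only each row's maximum lift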
# get rid of all NAs in the lift matrices
for (i1 in 1:K){
if (sum(is.na(lift[, i1])) > 0) { lift[which(is.na(lift[,i1])), i1] = min(lift[-which(is.na(lift[,i1])),])}
} # i1 ends
## == How does LIFT differ from THETA scores? Let's see == ##
theta = simfit$theta
theta[order(theta[,1], decreasing = TRUE)[1:10],] # arrange topic 1 probys in decr order
## topic
## phrase 1 2 3
## leadership 0.012722369 9.209523e-03 1.135348e-08
## people 0.007679724 1.252465e-03 2.670154e-08
## leaders 0.006634356 1.129760e-08 7.243243e-09
## work 0.006041968 1.878593e-04 1.136938e-06
## leader 0.005818210 3.389151e-04 8.819028e-09
## time 0.005069357 8.681006e-04 5.536136e-06
## life 0.004856699 1.117863e-06 8.272986e-05
## business 0.004396212 2.460971e-03 3.715214e-09
## team 0.004257418 2.371468e-04 5.420051e-09
## learn 0.003673611 2.725468e-04 4.931412e-08
lift[order(lift[,1], decreasing = TRUE)[1:10],] # arrange topic 1 lifts in decr order
## topic
## phrase 1 2 3
## expose 1.700287 6.129159e-05 4.703314e-05
## exkalibur 1.700287 6.129159e-05 4.703314e-05
## carnival 1.700287 6.129159e-05 4.703314e-05
## poll 1.700287 6.129159e-05 4.703314e-05
## rut 1.700287 6.129159e-05 4.703314e-05
## antenna 1.700287 6.129159e-05 4.703314e-05
## darknet 1.700287 6.129159e-05 4.703314e-05
## ackoff 1.700287 6.129159e-05 4.703314e-05
## johnmaxwellteam 1.700287 6.129159e-05 4.703314e-05
## proportional 1.700287 6.129159e-05 4.703314e-05
theta[order(theta[,2], decreasing = TRUE)[1:10],] # arrange topic 2 probys in decr order
## topic
## phrase 1 2 3
## yoga 2.497190e-03 0.019510042 2.175877e-02
## literature 2.794546e-04 0.014917876 1.545359e-02
## leadership 1.272237e-02 0.009209523 1.135348e-08
## read 2.134334e-03 0.006379658 1.808206e-06
## contact 3.349407e-04 0.005701600 6.284418e-04
## home 1.295838e-03 0.004602207 2.039438e-04
## classes 3.420503e-04 0.004558053 4.182936e-05
## undergraduate 2.554637e-05 0.004397097 3.043447e-09
## search 3.670670e-04 0.004222756 1.007320e-03
## free 1.587455e-03 0.004127577 1.214232e-04
lift[order(lift[,2], decreasing = TRUE)[1:10],] # arrange topic 2 lifts in decr order
## topic
## phrase 1 2 3
## jalan 1.898051e-05 5.591943 4.705026e-05
## sahaja 2.043243e-05 5.591938 4.710421e-05
## pfy 1.977378e-05 5.591937 4.917960e-05
## livingston 1.977378e-05 5.591937 4.917960e-05
## morristown 1.977378e-05 5.591937 4.917960e-05
## chatham 1.977378e-05 5.591937 4.917960e-05
## clifton 1.977378e-05 5.591937 4.917960e-05
## bloomfield 1.977378e-05 5.591937 4.917960e-05
## hoboken 1.977378e-05 5.591937 4.917960e-05
## brussels 1.981640e-05 5.591937 4.943561e-05
#----------------------------------------------------------------#
# Step 2b - Calculate ETA - each document's score on each topic #
#----------------------------------------------------------------#
t = Sys.time()
if(nrow(dtm) < 100) {k1 = 10} else {k1 = 100} # use fewer chunks for very small datasets
tst = ceiling(nrow(dtm)/k1) # chunk size: roughly 1/k1 of the rows at a time
a = rep(tst, (k1 - 1))
b = cumsum(a);rm(a) # cumulative sums give the chunk boundaries
b = c(0, b, nrow(dtm)) # break the dtm into k1 row chunks
a0 = which(b > nrow(dtm)); # sometimes, rounding errors cause out of bound errors
if (length(a0) > 0) {b = b[-a0]}
n1 = length(b)
if((b[n1] - b[n1-1])==1) {b[n1-1] = b[n1-1]-1} else { b[n1-1] = b[n1-1]}
eta.new = NULL
for (i1 in 1:K){
a2 = c(NULL)
for (i in 1:(length(b)-1)) {
tempdtm = as.matrix(dtm[(b[i]+1):(b[i+1]),])
a = matrix(rep(lift[, i1], nrow(tempdtm)), nrow(tempdtm), ncol(tempdtm), byrow = TRUE)
a1 = rowSums(tempdtm * a)
a2 = c(a2, a1); rm(a, a1, tempdtm)
} # i ends
eta.new = cbind(eta.new, a2); rm(a2)
} # i1 ends
Sys.time() - t # will take longer than lift building coz ncol is v v high now
## Time difference of 0.07423306 secs
rownames(eta.new) = rownames(simfit$omega)
colnames(eta.new) = colnames(simfit$theta)
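Each ETA entry is just a lift-weighted sum of a document's token counts, so the chunked loop is equivalent to a single matrix product (a sketch, assuming the dtm columns and the lift rows are in the same term order):
eta.check = as.matrix(dtm) %*% lift # documents x topics matrix of lift-weighted token counts
all.equal(as.numeric(eta.new), as.numeric(eta.check)) # should be TRUE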
# so what does eta.new look like? what does it mean?
dim(eta.new)
## [1] 51 3
round(head(eta.new),2)
## 1 2 3
## 1 1023.12 2915.38 681.92
## 2 41985.69 14137.81 10274.75
## 3 7494.32 2960.91 4625.14
## 4 2106.68 1903.15 775.16
## 5 1118.48 2417.46 1042.20
## 6 500.28 1383.57 473.52
# eta.new = simfit$omega # fallback if the chunked loop fails: use the model's own document-topic weights
eta.propn = eta.new / rowSums(eta.new) # calc topic proportions for each document
eta.propn [1:5,]
## 1 2 3
## 1 0.2214340 0.6309780 0.1475880
## 2 0.6323312 0.2129245 0.1547443
## 3 0.4969586 0.1963422 0.3066993
## 4 0.4402687 0.3977324 0.1619989
## 5 0.2443086 0.5280442 0.2276471
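As a quick check of how cleanly the three subjects separate (Step 6 above), one can hard-assign each document to its dominant topic and tabulate:
dominant.topic = apply(eta.propn, 1, which.max) # topic with the largest proportion in each document
table(dominant.topic) # number of documents falling under each topic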
# ----------------------------------------#
# Step 3 : Plot Wordcloud for each topic #
# ----------------------------------------#
df.top.terms = data.frame(NULL) # can't fit ALL terms in plot, so choose top ones with max loading
for (i in 1:K){ # For each topic
a0 = which(censored.lift[,i] > 1) # terms with lift greater than 1 for topic i
freq = theta[a0, i] # theta for terms with lift > 1
freq = sort(freq, decreasing = T) # terms with higher probabilities for topic i first
# auto-correction: sometimes fewer than 100 terms in a topic have lift above 1
n = ifelse(length(freq) >= 100, 100, length(freq))
top_word = as.matrix(freq[1:n])
top.terms = row.names(top_word)
df.top.terms.t = data.frame(topic = i, top.terms =top.terms, stringsAsFactors = F )
df.top.terms = rbind(df.top.terms, df.top.terms.t )
} # i loop ends
# pdf(file = paste0(K,' Topic Model results.pdf')) # use pdf() func to save plot directly as PDFs in your getwd()
for (i in 1:K){ # For each topic
a0 = which(censored.lift[,i] > 1) # terms with lift greater than 1 for topic i
freq = theta[a0,i] # theta for terms with lift greater than 1
freq = sort(freq, decreasing = T) # terms with higher probabilities for topic i first
# auto-correction: sometimes fewer than 100 terms in a topic have lift above 1
n = ifelse(length(freq) >= 100, 100, length(freq))
top_word = as.matrix(freq[1:n])
# SUB TCM
sub.tcm = adj.mat[colnames(adj.mat) %in% names(a0),colnames(adj.mat) %in% names(a0)]
# Plot wordcloud
windows()
wordcloud(rownames(top_word), top_word, scale=c(4,.2), 1,
random.order=FALSE, random.color=FALSE,
colors=brewer.pal(8, "Dark2"))
mtext(paste("Latent Topic",i), side = 3, line = 2, cex=2)
# PLot TCM
windows()
distill.cog(sub.tcm, '', 5, 5)
mtext(paste("Term co-occurrence - Topic",i), side = 3, line = 2, cex=2)
} # i loop ends
## Warning in vattrs[[name]][index] <- value: number of items to replace is
## not a multiple of replacement length
## Warning in vattrs[[name]][index] <- value: number of items to replace is
## not a multiple of replacement length
## Warning in vattrs[[name]][index] <- value: number of items to replace is
## not a multiple of replacement length
# dev.off() # closes the graphical devices
### which rows load most on which topics? ###
show.top.loading.rows = function(eta.obj, number.of.units){
K = ncol(eta.obj) # no. of topic factors
n = number.of.units
top.loaders = NULL
for (i in 1:K){
a0 = order(eta.obj[,i], decreasing = TRUE)
a1 = rownames(eta.obj[a0[1:n],])
top.loaders = cbind(a1, top.loaders)
} # i loop ends
a2 = matrix()
return(top.loaders)
} # func ends
show.top.loading.rows(eta.propn, 10)
## a1 a1 a1
## [1,] "48" "8" "2"
## [2,] "19" "49" "7"
## [3,] "42" "24" "44"
## [4,] "41" "26" "37"
## [5,] "45" "51" "3"
## [6,] "39" "35" "4"
## [7,] "25" "1" "33"
## [8,] "31" "34" "28"
## [9,] "46" "9" "50"
## [10,] "11" "14" "13"
show.top.loading.rows(eta.new, 10)
## a1 a1 a1
## [1,] "42" "2" "2"
## [2,] "48" "42" "7"
## [3,] "2" "7" "3"
## [4,] "19" "49" "42"
## [5,] "31" "26" "44"
## [6,] "25" "24" "31"
## [7,] "41" "8" "30"
## [8,] "3" "30" "33"
## [9,] "7" "25" "41"
## [10,] "39" "35" "19"