The three topics selected for the following analysis are Politics, Laptop and Food. The text for each topic was retrieved from the top 50 websites returned by a Google search.
The text content for each topic was stored in a separate document and then combined into a single document. Let us start the analysis with the word cloud and the co-occurrence graph (COG).
library(text2vec)
library(data.table)
library(stringr)
library(tm)
library(RWeka)
library(tokenizers)
library(slam)
library(wordcloud)
library(igraph)
library(maptpx)
text.clean = function(x) {                    # text data
  require("tm")
  x = gsub("<.*?>", " ", x)                   # regex for removing HTML tags
  x = iconv(x, "latin1", "ASCII", sub = "")   # keep only ASCII characters
  x = gsub("[^[:alnum:]]", " ", x)            # keep only alphanumeric characters
  x = tolower(x)                              # convert to lower case
  x = removeNumbers(x)                        # remove numbers
  x = stripWhitespace(x)                      # collapse repeated white space
  x = gsub("^\\s+|\\s+$", "", x)              # remove leading and trailing white space
  return(x)
}
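As a quick illustration of what text.clean() does, here is a made-up input string (the expected output is an assumption based on the cleaning steps above):
# illustrative check of text.clean() on a toy string
text.clean("<p>Intel Core i7 laptops -- 2023 MODELS!</p>")
## should return roughly: "intel core i laptops models"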
distill.cog = function(mat1,   # input TCM / adjacency matrix
                       title,  # title for the graph
                       s,      # no. of central nodes
                       k1) {   # max no. of connections per central node
  a = colSums(mat1)            # collect colSums into a vector a
  b = order(-a)                # order the vector in decreasing order
  mat2 = mat1[b, b]
  diag(mat2) = 0
  ## go row by row and retain the top k1 adjacencies ##
  wc = NULL
  for (i1 in 1:s) {
    thresh1 = mat2[i1, ][order(-mat2[i1, ])[k1]]
    mat2[i1, mat2[i1, ] < thresh1] = 0    # drop the weaker connections
    mat2[i1, mat2[i1, ] > 0] = 1
    word = names(mat2[i1, mat2[i1, ] > 0])
    mat2[(i1 + 1):nrow(mat2), match(word, colnames(mat2))] = 0
    wc = c(wc, word)
  } # i1 loop ends
  mat3 = mat2[match(wc, colnames(mat2)), match(wc, colnames(mat2))]
  ord = colnames(mat2)[which(!is.na(match(colnames(mat2), colnames(mat3))))]  # drop any NAs from the list
  mat4 = mat3[match(ord, colnames(mat3)), match(ord, colnames(mat3))]
  graph <- graph.adjacency(mat4, mode = "undirected", weighted = T)  # create network object
  graph = simplify(graph)
  V(graph)$color[1:s] = "green"
  V(graph)$color[(s + 1):length(V(graph))] = "pink"
  graph = delete.vertices(graph, V(graph)[degree(graph) == 0])
  plot(graph,
       layout = layout.kamada.kawai,
       main = title)
} # func ends
The above function forms the COG and plots it as a graph. It is called independently for each of the three topics, so each topic gets its own graph.
Now we load the CSV files and clean the data.
file.cr = read.csv("/Users/bhargav/searchfoodText.csv")
file.mi = read.csv("/Users/bhargav/searchlaptopText.csv")
file.lin = read.csv("/Users/bhargav/searchpoliticsText.csv")
file.cr = file.cr[!is.na(file.cr$text) & file.cr$text != '',]    # keep rows with non-missing, non-empty text
file.mi = file.mi[!is.na(file.mi$text) & file.mi$text != '',]
file.lin = file.lin[!is.na(file.lin$text) & file.lin$text != '',]
n = min(nrow(file.cr),nrow(file.mi),nrow(file.lin))
data = data.frame(id = 1:n, text1 = file.cr$text[1:n],
text2 = file.mi$text[1:n],
text3 = file.lin$text[1:n],
stringsAsFactors = F)
data$text = paste(data$text1,data$text2,data$text3)
dim(data)
## [1] 56 5
Once the text is cleaned, the next step is to block words that are out of focus: they are removed using the stopword lists read in below.
# Read Stopwords list
stpw1 = readLines("../../stopwords.txt")# stopwords list from git
stpw2 = tm::stopwords('english') # tm package stop word list; tokenizer package has the same name function
comn = unique(c(stpw1, stpw2)) # union of the two lists
stopwords = unique(gsub("'"," ",comn)) # final stopword list after removing apostrophes
x = text.clean(data$text) # pre-process text corpus
x = removeWords(x,stopwords) # removing stopwords created above
x = stripWhitespace(x) # removing white space
# x = stemDocument(x)
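As a quick check, the stopword removal can be tried on a made-up sentence (the exact result depends on the stopword lists loaded above):
# toy illustration of the stopword removal step; stripWhitespace() cleans up the gaps left behind
stripWhitespace(removeWords("this is the best laptop in the market", stopwords))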
With the stopwords removed, we now create a document-term matrix (DTM) to find the token frequencies.
t1 = Sys.time()
tok_fun = word_tokenizer
it_0 = itoken( x,
#preprocessor = text.clean,
tokenizer = tok_fun,
ids = data$id,
progressbar = F)
vocab = create_vocabulary(it_0,
ngram = c(2L, 2L)
#stopwords = stopwords
)
pruned_vocab = prune_vocabulary(vocab,
term_count_min = 10)
# doc_proportion_max = 0.5,
# doc_proportion_min = 0.001)
vectorizer = vocab_vectorizer(pruned_vocab)
dtm_0 = create_dtm(it_0, vectorizer)
# Sort bi-gram with decreasing order of freq
tsum = as.matrix(t(rollup(dtm_0, 1, na.rm=TRUE, FUN = sum))) # find sum of freq for each term
tsum = tsum[order(tsum, decreasing = T),] #terms in decreasing order of freq
if (length(tsum) > 1000) {n = 1000} else {n = length(tsum)}
tsum = tsum[1:n]
text2 = x
text2 = paste("",text2,"")
pb <- txtProgressBar(min = 1, max = (length(tsum)), style = 3) ; i = 0
for (term in names(tsum)){
i = i + 1
focal.term = gsub("_", " ",term) # in case dot was word-separator
replacement.term = term
text2 = gsub(paste("",focal.term,""),paste("",replacement.term,""), text2)
#setTxtProgressBar(pb, i)
}
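To see what the replacement loop does, here is the same substitution applied to a toy string (illustrative only; it assumes intel_core survived pruning as a frequent bi-gram):
# toy illustration of the bi-gram re-tokenisation above
gsub(" intel core ", " intel_core ", " the new intel core processor ")
## " the new intel_core processor "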
it_m = itoken(text2,
# preprocessor = text.clean,
tokenizer = tok_fun,
ids = data$id,
progressbar = F)
vocab = create_vocabulary(it_m
# ngram = c(2L, 2L),
#stopwords = stopwords
)
pruned_vocab = prune_vocabulary(vocab,
term_count_min = 1)
# doc_proportion_max = 0.5,
# doc_proportion_min = 0.001)
vectorizer = vocab_vectorizer(pruned_vocab)
dtm_m = create_dtm(it_m, vectorizer)
dim(dtm_m)
## [1] 56 24455
dtm = as.DocumentTermMatrix(dtm_m, weighting = weightTf)
print(difftime(Sys.time(), t1, units = 'sec'))
## Time difference of 41.62294 secs
# some basic clean-up ops
dim(dtm)
## [1] 56 24455
a0 = apply(dtm, 1, sum) # apply sum operation to dtm's rows. i.e. get rowSum
dtm = dtm[(a0 > 5),] # retain only those rows with token rowSum >5, i.e. delete empty rows
dim(dtm); rm(a0) # delete a0 object
## [1] 56 24455
a0 = apply(dtm, 2, sum) # use apply() to find colSUms this time
dtm = dtm[, (a0 > 4)] # retain only those terms that occurred > 4 times in the corpus
dim(dtm); rm(a0)
## [1] 56 6663
# view summary wordcloud
a0 = apply(dtm, 2, sum) # colSum vector of dtm
a0[1:5] # view what a0 obj is like
## snowman sliders freeshipping inches_inches adblock
## 5 26 10 8 7
a1 = order(as.vector(a0), decreasing = TRUE) # vector of token locations
a0 = a0[a1] # a0 ordered asper token locations
a0[1:5] # view a0 now
## food laptop laptops politics intel_core
## 1110 843 590 530 464
The word cloud of the combined corpus clearly shows the highest-frequency terms, and the three topics stand out clearly from the rest of the document content.
wordcloud(names(a0), a0,            # invoke wordcloud(); use ?wordcloud for more info
          scale = c(4,1),
          min.freq = 3,
          max.words = 100,
          colors = brewer.pal(8, "Dark2"))
title(sub = "Quick Summary Wordcloud")
Let us have a look at the distilled COG now:
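The call below needs a term-term adjacency matrix adj.mat, which is not constructed in the code shown above. A minimal sketch of one way to build it from the DTM (the variable name adj.mat and the choice of the top 100 terms are assumptions):
# build a term co-occurrence (adjacency) matrix from the DTM for the 100 most frequent terms
a0 = order(apply(dtm, 2, sum), decreasing = TRUE)[1:100]
dtm1 = as.matrix(dtm[, a0])
adj.mat = t(dtm1) %*% dtm1   # term x term co-occurrence counts
diag(adj.mat) = 0            # drop self co-occurrence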
#quartz()
distill.cog(adj.mat, 'Distilled COG for full corpus', 10, 10)
The distilled COG links the most central terms to their strongest co-occurring terms and gives a compact summary of how the topics interconnect in the corpus.
Now let us look at the lift and eta-proportion values, which tell us how strongly each token and each document is associated with the topics.
K = 3 # overriding model fit criterion
# -- run topic model for selected K -- #
summary( simfit <- topics(dtm, K=K, verb=2), nwrd = 12 )
##
## Estimating on a 56 document collection.
## Fitting the 3 topic model.
## log posterior increase: 2308.2, 638.2, 882.9, 547.5, done. (L = -1181739.6)
##
## Top 12 phrases by topic-over-null term lift (and usage %):
##
## [1] 'rousseff', 'resealable', 'lockers', 'derwent', 'flipcharts', 'glues', 'compasses', 'protractors', 'scrapbooks', 'gel', 'multiroom', 'endive' (61.3)
## [2] 'polka', 'burnham', 'divorce', 'migrants', 'migration', 'andromeda', 'cdma', 'compareraja', 'stores_check', 'nom', 'knocks', 'sturgeon' (28.5)
## [3] 'godfrey', 'snowman', 'rasamalaysia', 'souffle', 'princess', 'reinvented', 'expressive', 'troubleshooting', 'housewife', 'berman', 'fpo', 'buckeye' (10.2)
##
## Dispersion = 3.18
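The comment on K above notes that it overrides a model-fit criterion. As a side note, maptpx can itself compare several values of K via approximate Bayes factors; a sketch, not run here:
# optional: let maptpx compare a range of K values and keep the best one
# fit.many = topics(dtm, K = 2:6, verb = 1)
# fit.many$K    # the K retained (highest Bayes factor against the null single-topic model)
# fit.many$BF   # log Bayes factors for each candidate K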
rownames1 = gsub(" ", ".", rownames(simfit$theta)); rownames(simfit$theta) = rownames1;
## what are the factor components of the factorized DTM?
dim(dtm) # size of the orig input matrix
## [1] 56 6663
str(simfit) # structure of the output obj
## List of 6
## $ K : int 3
## $ theta: num [1:6663, 1:3] 5.08e-10 5.08e-10 2.13e-09 1.78e-09 1.60e-09 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ phrase: chr [1:6663] "snowman" "sliders" "freeshipping" "inches_inches" ...
## .. ..$ topic : chr [1:3] "1" "2" "3"
## $ omega: num [1:56, 1:3] 0.000559 0.000292 0.00016 0.512922 0.808919 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ document: chr [1:56] "1" "2" "3" "4" ...
## .. ..$ topic : chr [1:3] "1" "2" "3"
## $ BF : NULL
## $ D :List of 3
## ..$ dispersion: num 3.18
## ..$ pvalue : num 0
## ..$ df : num 322131
## $ X :List of 6
## ..$ i : int [1:55288] 56 56 54 54 54 53 51 51 51 51 ...
## ..$ j : int [1:55288] 1 2 3 4 5 6 7 8 9 10 ...
## ..$ v : num [1:55288] 5 26 10 8 7 36 14 14 24 24 ...
## ..$ nrow : int 56
## ..$ ncol : int 6663
## ..$ dimnames:List of 2
## .. ..$ Docs : chr [1:56] "1" "2" "3" "4" ...
## .. ..$ Terms: chr [1:6663] "snowman" "sliders" "freeshipping" "inches_inches" ...
## ..- attr(*, "class")= chr [1:2] "DocumentTermMatrix" "simple_triplet_matrix"
## ..- attr(*, "weighting")= chr [1:2] "term frequency" "tf"
## - attr(*, "class")= chr "topics"
dim(simfit$theta) # analogous to factor loadings
## [1] 6663 3
dim(simfit$omega) # analogous to factor scores
## [1] 56 3
Below are the theta (term-topic probability) values for the first few tokens:
simfit$theta[1:5,]
## topic
## phrase 1 2 3
## snowman 5.075749e-10 1.845461e-09 2.920818e-04
## sliders 5.075751e-10 1.845461e-09 1.518814e-03
## freeshipping 2.126554e-09 3.513155e-04 2.933013e-09
## inches_inches 1.776377e-09 2.810529e-04 2.933013e-09
## adblock 1.603402e-09 2.459215e-04 2.933013e-09
Below are the omega (document-topic proportion) values for the first few documents:
simfit$omega[1:5,]
## topic
## document 1 2 3
## 1 0.0005590992 0.9989189550 0.0005219458
## 2 0.0002920402 0.9991193911 0.0005885687
## 3 0.0001596683 0.0003159075 0.9995244242
## 4 0.5129219500 0.4853479812 0.0017300689
## 5 0.8089190860 0.1887835890 0.0022973250
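A quick sanity check: the omega values are document-topic proportions, so each row should sum to (approximately) one.
# each row of omega is a set of topic proportions and should sum to ~1
round(rowSums(simfit$omega[1:5, ]), 4)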
# compute the column sums of the DTM in chunks of roughly 1% of the columns each
tst = round(ncol(dtm)/100)   # chunk width
a = rep(tst,99)
b = cumsum(a);rm(a)
b = c(0,b,ncol(dtm))         # chunk boundaries
ss.col = c(NULL)
for (i in 1:(length(b)-1)) {
  tempdtm = dtm[,(b[i]+1):(b[i+1])]
  s = colSums(as.matrix(tempdtm))
  ss.col = c(ss.col,s)
  #print(i)
}
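For a corpus of this size the chunked loop above is equivalent to a single colSums() call; the chunking only matters for very wide DTMs. A minimal check:
# one-line equivalent of the chunked column sums above
ss.col2 = colSums(as.matrix(dtm))
# max(abs(ss.col - ss.col2))   # should be 0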
theta = simfit$theta
lift = theta*0; # lift will have same dimn as the theta matrix
sum1 = sum(dtm)
pterms = ss.col/sum1 # each column's marginal occurrence probability
for (i in 1:nrow(theta)){
  for (j in 1:ncol(theta)){
    ptermtopic = theta[i, j]          # P(term | topic)
    pterm = pterms[i]                 # marginal P(term) in the corpus
    lift[i, j] = ptermtopic/pterm     # lift = P(term | topic) / P(term)
  }
}
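The double loop above can also be written as a single vectorised division, since R recycles a vector down the columns of a matrix. A sketch, assuming pterms is in the same order as the rows of theta (which holds here because both follow the DTM's column order):
# vectorised equivalent of the lift loop above
lift.vec = theta / pterms
# max(abs(lift.vec - lift))   # should be ~0 if the orderings agree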
dim(lift); head(lift, 15)
## [1] 6663 3
## topic
## phrase 1 2 3
## snowman 1.497742e-05 5.445549e-05 8.618693e+00
## sliders 2.880274e-06 1.047221e-05 8.618629e+00
## freeshipping 3.137497e-05 5.183274e+00 4.327337e-05
## inches_inches 3.276061e-05 5.183282e+00 5.409172e-05
## adblock 3.379490e-05 5.183288e+00 6.181910e-05
## emi_rs 2.629482e-04 5.182407e+00 1.199793e-05
## mumbai_rajkot 1.446314e+00 3.081649e-05 4.525178e-03
## gurgaon_delhi 1.446314e+00 3.081649e-05 4.525178e-03
## pincode_locations 1.446627e+00 1.800933e-05 2.647700e-03
## product_delivered 1.446627e+00 1.800933e-05 2.647700e-03
## nasik 1.446015e+00 4.311127e-05 6.326799e-03
## stoves 1.444967e+00 8.614228e-05 1.262663e-02
## photoframes 1.444967e+00 8.614228e-05 1.262663e-02
## conservation 1.444967e+00 8.614228e-05 1.262663e-02
## merson 1.445316e+00 7.179873e-05 1.052770e-02
Further down the matrix, the lift values for tokens 25 to 35 are shown below:
lift[25:35,]
## topic
## phrase 1 2 3
## adageunicorns 1.750409e-05 5.183362e+00 1.087037e-04
## entrepreneur 2.786827e-03 5.173378e+00 1.731776e-04
## charles_pierce 8.064532e-06 7.466233e-06 8.618585e+00
## staggers 8.105816e-04 2.850908e-04 8.613571e+00
## smartcooky 7.791184e-06 1.800563e-05 8.618614e+00
## statesman 1.578546e-01 1.249039e-02 7.657820e+00
## foodnavigator 1.441350e+00 2.057864e-02 9.240647e-05
## confectionery 1.445557e+00 5.484517e-03 9.209458e-05
## exhaustive 1.442577e+00 1.615626e-02 6.600552e-05
## ratings_based 4.160594e-06 5.183346e+00 2.159149e-05
## prices_ratings 4.160594e-06 5.183346e+00 2.159149e-05
# Generate A censored Lift matrix
censored.lift = lift
for (i in 1:nrow(lift)){
censored.lift[i,][censored.lift[i,] < max(censored.lift[i,])] = 0 # hard assigning tokens to topics
}
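The same censoring can be done without an explicit loop; a compact equivalent that keeps every entry equal to its row maximum and zeroes the rest:
# loop-free equivalent of the censoring step above
censored.lift2 = lift * (lift == apply(lift, 1, max))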
The censored lift values below hard-assign each token to the topic in which its lift is highest; all other entries are set to zero.
head(censored.lift, 10)
## topic
## phrase 1 2 3
## snowman 0.000000 0.000000 8.618693
## sliders 0.000000 0.000000 8.618629
## freeshipping 0.000000 5.183274 0.000000
## inches_inches 0.000000 5.183282 0.000000
## adblock 0.000000 5.183288 0.000000
## emi_rs 0.000000 5.182407 0.000000
## mumbai_rajkot 1.446314 0.000000 0.000000
## gurgaon_delhi 1.446314 0.000000 0.000000
## pincode_locations 1.446627 0.000000 0.000000
## product_delivered 1.446627 0.000000 0.000000
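A quick count of how many tokens end up hard-assigned to each topic:
# number of tokens hard-assigned to each topic by the censored lift
colSums(censored.lift > 0)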
t = Sys.time()
if(nrow(dtm) < 100) {k1 = 10} else {k1 = 100}  # fewer chunks for very small datasets, to avoid choking the machine
tst = ceiling(nrow(dtm)/k1)   # chunk height: roughly 1/k1 of the rows at a time
a = rep(tst, (k1 - 1))
b = cumsum(a);rm(a)           # cumsum() gives the chunk boundaries
b = c(0, b, nrow(dtm))        # the DTM is processed in row chunks
a0 = which(b > nrow(dtm));    # rounding errors can push a boundary out of bounds
if (length(a0) > 0) {b = b[-a0]}
eta.new = NULL
for (i1 in 1:K){
  a2 = c(NULL)
  for (i in 1:(length(b)-3)) {   # note: (length(b)-3) skips the last two row chunks, so only the first 48 of the 56 documents get eta scores; (length(b)-1) would cover them all
    tempdtm = dtm[(b[i]+1):(b[i+1]),]
    a = matrix(rep(lift[, i1], nrow(tempdtm)), nrow(tempdtm), ncol(tempdtm), byrow = TRUE)
    a1 = rowSums(as.matrix(tempdtm * a))   # eta[doc, topic] = sum over terms of term frequency * lift
    a2 = c(a2, a1); rm(a, a1, tempdtm)
  } # i ends
  eta.new = cbind(eta.new, a2); rm(a2)
} # i1 ends
Sys.time() - t   # takes longer than building lift because the number of columns is now very large
## Time difference of 0.1570508 secs
#rownames(eta.new) = rownames(simfit$omega)
colnames(eta.new) = colnames(simfit$theta)
# so what does eta.new look like? what does it mean?
dim(eta.new)
## [1] 48 3
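Note that eta.new is effectively the matrix product of the DTM with the lift matrix, computed in row chunks to save memory. For a corpus this small the direct product is simpler and covers every document, not just the first 48:
# direct equivalent of the chunked eta computation (fine when the DTM fits in memory as a dense matrix)
eta.direct = as.matrix(dtm) %*% lift
dim(eta.direct)   # one row per document, one column per topic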
The eta values are normalised into topic proportions for each document, summarised below. Each row is a document and each column a topic; the three proportions in a row sum to one and show how strongly that document loads on each topic.
#head(eta.new)
# eta.new = simfit$omega # fallback if the chunked computation above fails, since omega already holds document-topic weights
eta.propn = eta.new / rowSums(eta.new) # calc topic proportions for each document
eta.propn [1:5,]
## 1 2 3
## 1 0.1780223 0.6251038 0.1968739
## 2 0.1251950 0.7039112 0.1708938
## 3 0.1032872 0.1628087 0.7339040
## 4 0.2713160 0.5080330 0.2206510
## 5 0.4042983 0.3690693 0.2266324
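Using these proportions, each document can be hard-assigned to its dominant topic; a quick tally:
# count how many documents load most heavily on each topic
table(apply(eta.propn, 1, which.max))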
df.top.terms = data.frame(NULL)           # can't fit ALL terms in the plot, so choose the top ones by loading
for (i in 1:K){                           # for each topic
  a0 = which(censored.lift[,i] > 1)       # terms with lift greater than 1 for topic i
  freq = theta[a0, i]                     # theta for those terms
  freq = sort(freq, decreasing = T)       # terms with higher probabilities for topic i first
  # auto-correction: sometimes fewer than 100 terms have lift above 1 for a topic
  n = ifelse(length(freq) >= 100, 100, length(freq))
  top_word = as.matrix(freq[1:n])
  top.terms = row.names(top_word)
  df.top.terms.t = data.frame(topic = i, top.terms = top.terms, stringsAsFactors = F)
  df.top.terms = rbind(df.top.terms, df.top.terms.t)
} # i loop ends
# pdf(file = paste0(K,' Topic Model results.pdf'))  # use pdf() to save the plots directly as PDFs in getwd()
for (i in 1:K){                           # for each topic
  a0 = which(censored.lift[,i] > 1)       # terms with lift greater than 1 for topic i
  freq = theta[a0,i]                      # theta for those terms
  freq = sort(freq, decreasing = T)       # terms with higher probabilities for topic i first
  # auto-correction: sometimes fewer than 100 terms have lift above 1 for a topic
  n = ifelse(length(freq) >= 100, 100, length(freq))
  top_word = as.matrix(freq[1:n])
  # sub-TCM restricted to this topic's terms
  sub.tcm = adj.mat[colnames(adj.mat) %in% names(a0), colnames(adj.mat) %in% names(a0)]
  # plot wordcloud
  #windows()
  wordcloud(rownames(top_word), top_word, scale = c(4,.2), min.freq = 1,
            random.order = FALSE, random.color = FALSE,
            colors = brewer.pal(8, "Dark2"))
  mtext(paste("Latent Topic", i, "- Word Cloud"), side = 2, line = 0, cex = 2)
  # plot TCM-based COG
  #windows()
  distill.cog(sub.tcm, '', 5, 5)
  mtext(paste("Latent Topic", i, "- COG"), side = 2, line = 0, cex = 2)
  #mtext(paste("Term co-occurrence - Topic", i), side = 3, line = 2, cex = 2)
} # i loop ends
show.top.loading.rows = function(eta.obj, number.of.units){
  K = ncol(eta.obj)        # no. of topic factors
  n = number.of.units
  top.loaders = NULL
  for (i in 1:K){
    a0 = order(eta.obj[,i], decreasing = TRUE)
    a1 = rownames(eta.obj[a0[1:n],])
    top.loaders = cbind(a1, top.loaders)   # note: columns come out in reverse topic order (topic K first)
  } # i loop ends
  return(top.loaders)
} # func ends
The latent-topic word clouds and COGs above show that the model classifies the content of the documents into the three topics correctly.
show.top.loading.rows(eta.propn, 20)
## a1 a1 a1
## [1,] "3" "2" "24"
## [2,] "20" "29" "27"
## [3,] "13" "1" "31"
## [4,] "43" "38" "15"
## [5,] "6" "41" "10"
## [6,] "48" "47" "32"
## [7,] "18" "25" "34"
## [8,] "36" "40" "26"
## [9,] "39" "22" "16"
## [10,] "26" "8" "28"
## [11,] "28" "4" "23"
## [12,] "21" "45" "17"
## [13,] "14" "12" "30"
## [14,] "45" "33" "9"
## [15,] "19" "35" "39"
## [16,] "22" "7" "11"
## [17,] "11" "37" "44"
## [18,] "30" "21" "46"
## [19,] "32" "46" "5"
## [20,] "25" "42" "36"
show.top.loading.rows(eta.new, 10)
## a1 a1 a1
## [1,] "3" "2" "24"
## [2,] "13" "41" "26"
## [3,] "20" "13" "28"
## [4,] "26" "28" "31"
## [5,] "28" "26" "15"
## [6,] "48" "12" "30"
## [7,] "15" "1" "13"
## [8,] "24" "22" "34"
## [9,] "31" "14" "9"
## [10,] "43" "31" "32"
QNo.1. Is the topic model able to separate each subject from the other subjects? To what extent is it able to do so?
The topic model is able to separate the subjects from one another. However, the Food and Politics pages share some content, so a few words appear in both of those latent topics.
QNo.2. Are there mixed tokens (with high lift in more than one topic)? Are the highest-lift tokens and the document topic proportions (eta scores) clear enough to identify each topic?
There are very few mixed tokens in the topic model. The eta proportion values are clear and identify each topic individually.
QNo.3. What are your learnings from this exercise?
Below are the learnings from this exercise:
1. Even though the three topics were combined into a single corpus, empirical topic modelling successfully identified each topic.
2. The model classifies the tokens, and the theta and omega values clearly indicate which topic a token belongs to, which suggests the model is effective.
3. The latent topics produced by the model are as expected; tuning the value of K could bring the token classification even closer to a clean separation.