This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
rm(list=ls()) # Clear the workspace
library("tm")
## Loading required package: NLP
library("wordcloud")
## Loading required package: RColorBrewer
library("maptpx")
## Loading required package: slam
library("igraph")
##
## Attaching package: 'igraph'
## The following object is masked from 'package:maptpx':
##
## normalize
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
library("ggplot2")
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library("reshape")
## Warning: package 'reshape' was built under R version 3.2.4
dtm1 = readRDS(file.choose()) # Select dtm1.RF.Rds
K = 2 # Choose number of topics in the model
simfit = topics(dtm1, K = K, verb = 2) # Fit the K topic model
##
## Estimating on a 85 document collection.
## Fitting the 2 topic model.
## log posterior increase: 111.9, done. (L = -2347579.3)
summary(simfit, nwrd = 12) # Summary of simfit model
##
## Top 12 phrases by topic-over-null term lift (and usage %):
##
## [1] 'abandon', 'atic', 'dominance', 'fastest', 'instruction', 'pack', 'sunnyvale', 'amk', 'applied-products', 'linewidths', 'varian', 'dense' (80.2)
## [2] 'peo', 'adp', 'motivated', 'employer', 'multitude', 'transmitter', 'statutes', 'withheld', 'worsening', 'highest', 'security-systems', 'borrowers' (19.8)
##
## Dispersion = 2.12
simfit$theta[1:10,]
## topic
## phrase 1 2
## abandon 8.417557e-06 8.040463e-10
## abandoned 3.785483e-05 9.538075e-08
## abandonment 4.835164e-07 3.196205e-05
## abatement 1.297934e-06 4.563540e-05
## abide 4.718840e-07 3.200891e-05
## abilities 2.886747e-06 1.579139e-04
## ability 7.447915e-03 5.827765e-06
## abroad 1.851574e-04 1.003033e-07
## abruptly 9.114345e-07 3.023824e-05
## absence 2.351180e-06 1.431172e-04
theta <- simfit$theta[1:10,]
a0 = apply(simfit$theta, 1, sum);
a01 = order(a0, decreasing = TRUE)
simfit$theta[a01[1:10],]
## topic
## phrase 1 2
## products 0.0141411246 1.440973e-07
## business 0.0128248911 1.273608e-05
## operations 0.0101151951 9.944519e-06
## customers 0.0092253760 1.925722e-07
## result 0.0090019883 1.513073e-06
## services 0.0074996323 3.484800e-05
## ability 0.0074479148 5.827765e-06
## change 0.0004822414 5.907110e-03
## addition 0.0063839862 2.334829e-06
## results 0.0060873876 1.035554e-05
simfit$omega[1:10,]
## topic
## document 1 2
## 1 0.9998552 0.0001448060
## 2 0.9995329 0.0004670551
## 3 0.9998276 0.0001724272
## 4 0.5186917 0.4813083067
## 5 0.9997058 0.0002941557
## 6 0.8072293 0.1927706932
## 7 0.7768235 0.2231765152
## 8 0.7928529 0.2071471166
## 9 0.8002837 0.1997162711
## 10 0.7840141 0.2159859383
t = Sys.time()
theta = simfit$theta
lift = theta*0; sum1 = sum(dtm1)
for (i in 1:nrow(theta)){
for (j in 1:ncol(theta)){
ptermtopic = 0; pterm = 0;
ptermtopic = theta[i, j] # term i's probability of topic j membership
pterm = sum(dtm1[,i])/sum1 # marginal probability of term i's occurrence in corpus
lift[i, j] = ptermtopic/pterm # so, lift is topic membership probability normalized by occurrence probability
}
}
Sys.time() - t # Total time for calculating lift
## Time difference of 1.247762 mins
lift[1:10,]
## topic
## phrase 1 2
## abandon 1.24826471 0.0001192344
## abandoned 1.24746803 0.0031431774
## abandonment 0.07170210 4.7397489451
## abatement 0.12831632 4.5116064902
## abide 0.06997709 4.7466979203
## abilities 0.08561688 4.6835044100
## ability 1.24799280 0.0009765162
## abroad 1.24807000 0.0006761038
## abruptly 0.13515935 4.4841196489
## absence 0.07748079 4.7162848842
for (i in 1:K){ # For each topic
a0 = which(lift[,i] > 1) # terms with lift greator than 1 for topic i
freq = theta[a0,i] # Theta for terms greator than 1
freq = sort(freq,decreasing = T) # Terms with higher probilities for topic i
# Auto Correction - Sometime terms in topic with lift above 1 are less than 100. So auto correction
n = ifelse(length(freq) >= 100, 100, length(freq))
top_word = as.matrix(freq[1:n])
# Plot wordcloud
wordcloud(rownames(top_word), top_word, scale=c(4,0.5), 1,
random.order=FALSE, random.color=FALSE,
colors=brewer.pal(8, "Dark2"))
mtext(paste("Latent Topic",i), side = 3, line = 2, cex=2)
}
for (i in 1:K){ # For each topic
a0 = which(lift[,i] > 1) # terms with lift greator than 1 for topic i
freq = theta[a0,i] # Theta for terms greator than 1
freq = sort(freq,decreasing = T) # Terms with higher probilities for topic i
# Auto Correction - Sometime terms in topic with lift above 1 are less than 30. So auto correction
n = ifelse(length(freq) >= 20, 20, length(freq))
top_word = as.matrix(freq[1:n])
# now for top 30 words let's find Document Term Matrix
mat = dtm1[,match(row.names(top_word),colnames(dtm1))]
mat = as.matrix(mat)
cmat = t(mat) %*% (mat)
diag(cmat) = 0
# Let's limit number of connections to 2
for (p in 1:nrow(cmat)){
vec = cmat[p,]
cutoff = sort(vec, decreasing = T)[2]
cmat[p,][cmat[p,] < cutoff] = 0
}
#cmat[cmat < quantile(cmat,.80)] = 0
graph <- graph.adjacency(cmat, mode = "undirected",weighted=T)
plot(graph, #the graph to be plotted
layout=layout.fruchterman.reingold, # the layout method.
vertex.frame.color='blue', #the color of the border of the dots
vertex.label.color='black', #the color of the name labels
vertex.label.font=1, #the font of the name labels
vertex.size = .00001, # Dots size
vertex.label.cex=1.3)
mtext(paste("Topic",i), side = 3, line = 2, cex=2)
}
eta = function(mat, dtm) {
mat1 = mat/mean(mat); terms1 = rownames(mat1);
eta.mat = matrix(0, 1, ncol(mat1))
for (i in 1:nrow(dtm)){
a11 = as.data.frame(matrix(dtm[i,]));
rownames(a11) = colnames(dtm)
a12 = as.matrix(a11[(a11>0),]);
rownames(a12) = rownames(a11)[(a11>0)];
rownames(a12)[1:4]
a13 = intersect(terms1, rownames(a12));
a13[1:15]; length(a13)
a14a = match(a13, terms1); # positions of matching terms in mat1 matrix
a14b = match(a13, rownames(a12))
a15 = mat1[a14a,]*matrix(rep(a12[a14b,],
ncol(mat1)),
ncol = ncol(mat1))
eta.mat = rbind(eta.mat, apply(a15, 2, mean))
rm(a11, a12, a13, a14a, a14b, a15)
}
eta.mat = eta.mat[2:nrow(eta.mat), ] # remove top zeroes row
row.names(eta.mat)=row.names(dtm)
return(eta.mat)
}
twc = eta(lift, dtm1)
head(twc)
## 1 2
## 1 1.8058129 0.07338217
## 2 1.3752730 0.12560819
## 3 1.7365399 0.05202135
## 4 0.6121672 1.86188229
## 5 1.6841064 0.13873854
## 6 1.2361259 1.18955013
eta.file.name = function(mat,calib,n) {
s = list() # Blank List
for (i in 1: ncol(mat)) # For each topic
{
read_doc = mat[order(mat[,i], decreasing= T),] # Sort document prop matrix (twc)
read_names = row.names(read_doc[1:n,]) # docuemnt index for first n document
s[[i]] = calib[as.numeric(read_names),1] # Store first n companies name in list
}
return(s)
}
temp1 = eta.file.name(twc,textdata,5)
for (i in 1:length(temp1)){
print(paste('Companies loading heavily on topic',i,'are'))
print(temp1[[i]])
print('--------------------------')
}
## [1] "Companies loading heavily on topic 1 are"
## [1] "SUNPOWER CORP" "ADVANCED MICRO DEVICES"
## [3] "SANDISK CORP" "APPLIED MATERIALS INC"
## [5] "LEAP WIRELESS INTL INC"
## [1] "--------------------------"
## [1] "Companies loading heavily on topic 2 are"
## [1] "SPRINT CORP" "TELEPHONE & DATA SYSTEMS INC"
## [3] "SUNPOWER CORP" "FACEBOOK INC"
## [5] "BROCADE COMMUNICATIONS SYS"
## [1] "--------------------------"
eta.file = function(mat,calib,n) {
s = list() # Blank List
for (i in 1: ncol(mat)) # For each topic
{
read_doc = mat[order(mat[,i], decreasing= T),] # Sort document prop matrix (twc)
read_names = row.names(read_doc[1:n,]) # docuemnt index for first n document
s[[i]] = calib[as.numeric(read_names),2] # Store first n documents in list
}
return(s)
}
temp2 = eta.file(twc,textdata,5)
# for (i in 1:length(temp2)){
# print(paste('Documents loading heavily on topic',i,'are'))
# print(temp2[[i]])
# print('--------------------------')
# }
studyLDAModel = function(K) {
simfit = topics(dtm1, K = K, verb = 2) # Fit the K topic model
summary(simfit, nwrd = 12) # Summary of simfit model
simfit$theta[1:10,]
theta <- simfit$theta[1:10,]
a0 = apply(simfit$theta, 1, sum);
a01 = order(a0, decreasing = TRUE)
simfit$theta[a01[1:10],]
simfit$omega[1:10,]
t = Sys.time()
theta = simfit$theta
lift = theta*0; sum1 = sum(dtm1)
for (i in 1:nrow(theta)){
for (j in 1:ncol(theta)){
ptermtopic = 0; pterm = 0;
ptermtopic = theta[i, j] # term i's probability of topic j membership
pterm = sum(dtm1[,i])/sum1 # marginal probability of term i's occurrence in corpus
lift[i, j] = ptermtopic/pterm # so, lift is topic membership probability normalized by occurrence probability
}
}
Sys.time() - t # Total time for calculating lift
lift[1:10,]
for (i in 1:K){ # For each topic
a0 = which(lift[,i] > 1) # terms with lift greator than 1 for topic i
freq = theta[a0,i] # Theta for terms greator than 1
freq = sort(freq,decreasing = T) # Terms with higher probilities for topic i
# Auto Correction - Sometime terms in topic with lift above 1 are less than 100. So auto correction
n = ifelse(length(freq) >= 100, 100, length(freq))
top_word = as.matrix(freq[1:n])
# Plot wordcloud
wordcloud(rownames(top_word), top_word, scale=c(4,0.5), 1,
random.order=FALSE, random.color=FALSE,
colors=brewer.pal(8, "Dark2"))
mtext(paste("Latent Topic",i), side = 3, line = 2, cex=2)
}
for (i in 1:K){ # For each topic
a0 = which(lift[,i] > 1) # terms with lift greator than 1 for topic i
freq = theta[a0,i] # Theta for terms greator than 1
freq = sort(freq,decreasing = T) # Terms with higher probilities for topic i
# Auto Correction - Sometime terms in topic with lift above 1 are less than 30. So auto correction
n = ifelse(length(freq) >= 20, 20, length(freq))
top_word = as.matrix(freq[1:n])
# now for top 30 words let's find Document Term Matrix
mat = dtm1[,match(row.names(top_word),colnames(dtm1))]
mat = as.matrix(mat)
cmat = t(mat) %*% (mat)
diag(cmat) = 0
# Let's limit number of connections to 2
for (p in 1:nrow(cmat)){
vec = cmat[p,]
cutoff = sort(vec, decreasing = T)[2]
cmat[p,][cmat[p,] < cutoff] = 0
}
#cmat[cmat < quantile(cmat,.80)] = 0
graph <- graph.adjacency(cmat, mode = "undirected",weighted=T)
plot(graph, #the graph to be plotted
layout=layout.fruchterman.reingold, # the layout method.
vertex.frame.color='blue', #the color of the border of the dots
vertex.label.color='black', #the color of the name labels
vertex.label.font=1, #the font of the name labels
vertex.size = .00001, # Dots size
vertex.label.cex=1.3)
mtext(paste("Topic",i), side = 3, line = 2, cex=2)
}
twc = eta(lift, dtm1)
head(twc)
temp1 = eta.file.name(twc,textdata,5)
for (i in 1:length(temp1)){
print(paste('Companies loading heavily on topic',i,'are'))
print(temp1[[i]])
print('--------------------------')
}
}
K=3
studyLDAModel(K)
##
## Estimating on a 85 document collection.
## Fitting the 3 topic model.
## log posterior increase: 890.2, 1133.3, done. (L = -2335640.4)
##
## Top 12 phrases by topic-over-null term lift (and usage %):
##
## [1] 'abandon', 'atic', 'dominance', 'fastest', 'instruction', 'pack', 'sunnyvale', 'amk', 'applied-products', 'linewidths', 'varian', 'manufacture-product' (75.3)
## [2] 'access-facilities', 'blurred', 'control-management', 'dated', 'deploys', 'mvno', 'revenues-future', 'subsidized', 'waiting', 'dividing', 'holdco', 'sprint-clearwire' (13.1)
## [3] 'multitude', 'withheld', 'broadridge', 'censures', 'solutions-business', 'intentionally', 'clients-customers', 'clearpath', 'adequate-security', 'acquirors', 'admissions', 'cerner' (11.6)
##
## Dispersion = 2
## [1] "Companies loading heavily on topic 1 are"
## [1] "SUNPOWER CORP" "ADVANCED MICRO DEVICES"
## [3] "APPLIED MATERIALS INC" "SANDISK CORP"
## [5] "LEAP WIRELESS INTL INC"
## [1] "--------------------------"
## [1] "Companies loading heavily on topic 2 are"
## [1] "SPRINT CORP" "TELEPHONE & DATA SYSTEMS INC"
## [3] "SUNPOWER CORP" "BROCADE COMMUNICATIONS SYS"
## [5] "ECHOSTAR CORP"
## [1] "--------------------------"
## [1] "Companies loading heavily on topic 3 are"
## [1] "IAC/INTERACTIVECORP" "GROUPON INC"
## [3] "CERNER CORP" "AUTOMATIC DATA PROCESSING"
## [5] "BROADRIDGE FINANCIAL SOLUTNS"
## [1] "--------------------------"
K=4
studyLDAModel(K)
##
## Estimating on a 85 document collection.
## Fitting the 4 topic model.
## log posterior increase: 3545.4, 1671.9, 1348.9, 693.1, 463.3, done. (L = -2314418.5)
##
## Top 12 phrases by topic-over-null term lift (and usage %):
##
## [1] 'abandon', 'atic', 'dominance', 'fastest', 'instruction', 'pack', 'sunnyvale', 'amk', 'applied-products', 'linewidths', 'varian', 'dense' (59.6)
## [2] 'consequent', 'mdash', 'repealed', 'affect-reported', 'company-customers', 'facilities-equipment', 'hai', 'hon', 'kill', 'lived', 'pricings', 'quick' (16.4)
## [3] 'access-facilities', 'blurred', 'control-management', 'dated', 'deploys', 'mvno', 'revenues-future', 'subsidized', 'waiting', 'rollout', 'household', 'sign' (14)
## [4] 'withheld', 'broadridge', 'censures', 'solutions-business', 'clearpath', 'intentionally', 'adequate-security', 'clients-customers', 'admissions', 'cerner', 'coding', 'diagnosis' (10)
##
## Dispersion = 1.88
## [1] "Companies loading heavily on topic 1 are"
## [1] "SUNPOWER CORP" "ADVANCED MICRO DEVICES"
## [3] "APPLIED MATERIALS INC" "SANDISK CORP"
## [5] "CENTURYLINK INC"
## [1] "--------------------------"
## [1] "Companies loading heavily on topic 2 are"
## [1] "CISCO SYSTEMS INC" "JABIL CIRCUIT INC"
## [3] "SANDISK CORP" "AMKOR TECHNOLOGY INC"
## [5] "TELEDYNE TECHNOLOGIES INC"
## [1] "--------------------------"
## [1] "Companies loading heavily on topic 3 are"
## [1] "SPRINT CORP" "TELEPHONE & DATA SYSTEMS INC"
## [3] "LEAP WIRELESS INTL INC" "SUNPOWER CORP"
## [5] "FACEBOOK INC"
## [1] "--------------------------"
## [1] "Companies loading heavily on topic 4 are"
## [1] "MANTECH INTL CORP" "CERNER CORP"
## [3] "BROADRIDGE FINANCIAL SOLUTNS" "UNISYS CORP"
## [5] "COMPUTER SCIENCES CORP"
## [1] "--------------------------"
K=5
studyLDAModel(K)
##
## Estimating on a 85 document collection.
## Fitting the 5 topic model.
## log posterior increase: 1271.3, 2374.8, 1480.8, 855.8, 702.2, 514.6, 470.4, done. (L = -2305161.6)
##
## Top 12 phrases by topic-over-null term lift (and usage %):
##
## [1] 'amk', 'applied-products', 'linewidths', 'varian', 'affect-applied', 'general-industry', 'inflections', 'tvs', 'abandon', 'atic', 'dominance', 'fastest' (46)
## [2] 'access-facilities', 'blurred', 'control-management', 'dated', 'deploys', 'mvno', 'revenues-future', 'subsidized', 'waiting', 'rollout', 'acs', 'baseline' (21.8)
## [3] 'manufacture-product', 'axis', 'guarantor', 'rejection', 'removable', 'spills', 'wdc', 'yokkaichi', 'unsold', 'expenses-including', 'controllers', 'lsi' (14.3)
## [4] 'broadridge', 'censures', 'solutions-business', 'intentionally', 'clearpath', 'adequate-security', 'clients-customers', 'admissions', 'cerner', 'coding', 'diagnosis', 'displaying' (12.3)
## [5] 'reputation-harm', 'depreciating', 'disadvantageous', 'equity-debt', 'federal-tax', 'foot', 'payouts', 'stockholders-including', 'tanks', 'trs', 'cabinet', 'prevailing-market' (5.6)
##
## Dispersion = 1.82
## [1] "Companies loading heavily on topic 1 are"
## [1] "APPLIED MATERIALS INC" "ADVANCED MICRO DEVICES"
## [3] "SUNPOWER CORP" "CISCO SYSTEMS INC"
## [5] "CENTURYLINK INC"
## [1] "--------------------------"
## [1] "Companies loading heavily on topic 2 are"
## [1] "SPRINT CORP" "TELEPHONE & DATA SYSTEMS INC"
## [3] "LEAP WIRELESS INTL INC" "FACEBOOK INC"
## [5] "SUNPOWER CORP"
## [1] "--------------------------"
## [1] "Companies loading heavily on topic 3 are"
## [1] "SANDISK CORP" "AMKOR TECHNOLOGY INC" "WESTERN DIGITAL CORP"
## [4] "JABIL CIRCUIT INC" "SANMINA CORP"
## [1] "--------------------------"
## [1] "Companies loading heavily on topic 4 are"
## [1] "MANTECH INTL CORP" "CERNER CORP"
## [3] "UNISYS CORP" "BROADRIDGE FINANCIAL SOLUTNS"
## [5] "COMPUTER SCIENCES CORP"
## [1] "--------------------------"
## [1] "Companies loading heavily on topic 5 are"
## [1] "ECHOSTAR CORP" "EQUINIX INC"
## [3] "IRON MOUNTAIN INC" "TRIMBLE NAVIGATION LTD"
## [5] "ITRON INC"
## [1] "--------------------------"
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.