Group Members
71610033 - Kunal Dharmadhikari
71610051 - Praveen K Orvakanti
71610086 - Sudarshan Singh
71610105 - Vikram Menon Malik
71610113 - Vipul Manocha
Companies like Advanced Micro Devices, Analog Devices, Applied Materials Inc., Automatic Data Processing and CenturyLink Inc. offer platforms for storing digital content online, providing cloud-storage solutions to their customers. Customer information is stored securely, with robust internet-security measures in place. These companies operate in telecommunications and in the manufacture of semiconductor devices that perform digital processing of data.
This type of analysis can help in understanding the underlying business of companies. For competitors or new entrants, this information has the potential to reveal a company's network of business applications, its associated vendors, its customers, and the locations where those customers are based.
Using plain term frequency (TF) gave us generic keywords such as Company, Product, Technology, Software and Services; the underlying theme was difficult to infer because these words repeat in every company's business description.
When we used TF-IDF instead, it down-weighted such common words with an inverse-document-frequency factor and surfaced terms from the corpus that gave a better understanding of the business themes in the 10-K forms.
The IDF formula underlying the TF-IDF weighting:
IDF(t) = log_e(Total number of documents / Number of documents with term t in it).
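To make the formula concrete, here is a minimal sketch on a hypothetical toy count matrix (rows = documents, columns = terms). tm's weightTfIdf(), used in the script below, implements the same idea, though it uses log base 2 and can normalize TF by document length:
m = matrix(c(2, 0, 1,
             0, 3, 1,
             1, 0, 0), nrow = 3, byrow = TRUE,
           dimnames = list(paste0("doc", 1:3), c("cloud", "chip", "data")))
idf = log(nrow(m) / colSums(m > 0)) # IDF(t) = log_e(N / n_t)
tfidf = m * matrix(idf, nrow(m), ncol(m), byrow = TRUE) # TF x IDF per cell
round(tfidf, 3) # "chip" (rare) scores high; terms in most documents score low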
When we first plotted our word cloud, the DTM had 13,826 terms and 90% sparsity, meaning most terms appear in only a few documents. We therefore pruned the sparsest terms to focus on the key words in the DTM:
dtm1 <- removeSparseTerms(dtmx, 0.4)
This reduced the DTM to 331 terms, each with at most 40% sparsity.
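As a quick check, sparsity can be computed directly from the DTM's triplet representation (a hypothetical helper, assuming dtmx is the DocumentTermMatrix loaded in the script below; slam stores the non-zero cells in dtmx$v):
sparsity = function(dtm) 1 - length(dtm$v) / (nrow(dtm) * ncol(dtm))
sparsity(dtmx) # ~0.90 before pruning (13,826 terms)
sparsity(removeSparseTerms(dtmx, 0.4)) # at most 0.40 after pruning (331 terms)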
We plotted word clouds for several values of K, but the themes were difficult to identify. At K = 4, however, the co-occurrence graphs and word clouds gave clear insight into the digital online-storage business of these companies, so we took K = 4 as optimal.
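Our choice of K = 4 was visual. As a complementary check, maptpx::topics() accepts a vector of K values and returns the Bayes factor optimal model among them. A minimal sketch, assuming dtm1 has been built as in the script below:
fits = topics(dtm1, K = 2:6, verb = 1) # Fit candidate models, select by Bayes factor
fits$K # The K selected by Bayes factor
fits$BF # Log Bayes factors for the candidate models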
rm(list=ls()) # Clear the workspace
#install.packages("tm")
#install.packages("wordcloud")
#install.packages("maptpx")
#install.packages("igraph")
#install.packages("NLP")
#install.packages("RColorBrewer")
#install.packages("slam")
#install.packages("textir")
library("NLP")
library("RColorBrewer")
library("slam")
library("tm")
library("wordcloud")
library("maptpx")
library("igraph")
library("textir")
textdata = readRDS(file.choose()) # Select BD.Technology.Rds OR RF.Technology.Rds data set
dtmx = readRDS(file.choose()) # Select dtm1.BD.Rds or dtm1.RF.Rds
dtm1 <- removeSparseTerms(dtmx, 0.4) # Remove terms with more than 40% sparsity, i.e. keep terms present in at least 60% of documents
dtm1 = weightTfIdf(dtm1, normalize = TRUE) #Normalise the DTM with TFIDF
K = 4 # Number of latent topics
simfit = topics(dtm1, K = K, verb = 2)
#dtmtfidf = tfidf(dtm1,normalize = TRUE)
## Example console output from topics() (note: this trace is from a 2-topic fit):
## Estimating on a 20 document collection.
## Fitting the 2 topic model.
## log posterior increase: 448.9, 142.5, done. (L = -496223.7)
summary(simfit, nwrd = 12) # Summary of simfit model
simfit$theta[1:10,] # Term-topic probability matrix: first 10 terms
a0 = apply(simfit$theta, 1, sum)
a01 = order(a0, decreasing = TRUE)
simfit$theta[a01[1:10],] # Terms with the highest total probability across topics
simfit$omega[1:10,] # Document-topic weights: first 10 documents
t = Sys.time()
theta = simfit$theta
lift = theta*0; sum1 = sum(dtm1)
sum1
for (i in 1:nrow(theta)){
for (j in 1:ncol(theta)){
ptermtopic = theta[i, j] # P(term i | topic j): term i's probability under topic j
pterm = sum(dtm1[,i])/sum1 # Marginal probability of term i's occurrence in the corpus
lift[i, j] = ptermtopic/pterm # Lift: topic-conditional probability normalized by marginal occurrence
}
}
Sys.time() - t # Total time for calculating lift
lift[1:15,]
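# A lift above 1 means the term is over-represented in that topic relative
# to its corpus-wide frequency; these terms drive the word clouds and
# co-occurrence graphs built below.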
#Construct Word Cloud
for (i in 1:K){ # For each topic
a0 = which(lift[,i] > 1) # Terms with lift greater than 1 for topic i
freq = theta[a0,i] # Theta values for those terms
freq = sort(freq, decreasing = T) # Terms with higher probabilities first
# Cap at 100 words: sometimes fewer than 100 terms have lift above 1
n = ifelse(length(freq) >= 100, 100, length(freq))
top_word = as.matrix(freq[1:n])
# Plot wordcloud and save it to file - should be saved in your working directory
filename = paste0("Latent Topic ", i, ".png")
png(filename = filename, width=1280,height=800)
wordcloud(rownames(top_word), top_word, scale=c(4,0.5), 1,
random.order=FALSE, random.color=FALSE,
colors=brewer.pal(8, "Dark2"))
mtext(paste("Latent Topic",i), side = 3, line = 2, cex=2)
dev.off()
}
#Construct co-occurrence graph for each topic and save it to file
for (i in 1:K){ # For each topic
a0 = which(lift[,i] > 1) # Terms with lift greater than 1 for topic i
freq = theta[a0,i] # Theta values for those terms
freq = sort(freq, decreasing = T) # Terms with higher probabilities first
# Cap at 20 words: sometimes fewer than 20 terms have lift above 1
n = ifelse(length(freq) >= 20, 20, length(freq))
top_word = as.matrix(freq[1:n])
# Subset the document-term matrix to the top 20 words
mat = dtm1[,match(row.names(top_word),colnames(dtm1))]
mat = as.matrix(mat)
cmat = t(mat) %*% (mat)
diag(cmat) = 0
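# cmat[p,q] sums weight(p)*weight(q) over documents, i.e. how strongly terms
# p and q co-occur across the corpus; the diagonal (a term with itself) is
# zeroed so self-co-occurrence does not dominate the graph.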
# Keep only each term's two strongest connections
for (p in 1:nrow(cmat)){
vec = cmat[p,]
cutoff = sort(vec, decreasing = T)[2]
cmat[p,][cmat[p,] < cutoff] = 0
}
#cmat[cmat < quantile(cmat,.80)] = 0
graph <- graph.adjacency(cmat, mode = "undirected",weighted=T)
cograph = paste0("Topic ", i, ".png")
png(filename = cograph, width = 1280, height = 800)
plot(graph, #the graph to be plotted
layout=layout.fruchterman.reingold, # the layout method.
vertex.frame.color='blue', #the color of the border of the dots
vertex.label.color='black', #the color of the name labels
vertex.label.font=1, #the font of the name labels
vertex.size = .00001, # Dots size
vertex.label.cex=1.3)
mtext(paste("Topic",i), side = 3, line = 2, cex=2)
dev.off()
}
#Find the documents that load most heavily on each topic, so we can study them
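# eta() scores each document on each topic: for every document it takes the
# terms present in that document, weights each term's (mean-normalized) lift
# by the term's weight in the document, and averages per topic. A higher eta
# means the document loads more heavily on that topic.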
eta = function(mat, dtm) {
mat1 = mat/mean(mat); terms1 = rownames(mat1);
eta.mat = matrix(0, 1, ncol(mat1))
for (i in 1:nrow(dtm)){
a11 = as.data.frame(matrix(dtm[i,]))
rownames(a11) = colnames(dtm)
a12 = as.matrix(a11[(a11 > 0),]) # Weights of terms present in document i
rownames(a12) = rownames(a11)[(a11 > 0)]
a13 = intersect(terms1, rownames(a12)) # Terms shared with the lift matrix
a14a = match(a13, terms1); # positions of matching terms in mat1 matrix
a14b = match(a13, rownames(a12))
a15 = mat1[a14a,]*matrix(rep(a12[a14b,],
ncol(mat1)),
ncol = ncol(mat1))
eta.mat = rbind(eta.mat, apply(a15, 2, mean))
rm(a11, a12, a13, a14a, a14b, a15)
}
eta.mat = eta.mat[2:nrow(eta.mat), ] # remove top zeroes row
row.names(eta.mat)=row.names(dtm)
return(eta.mat)
}
twc = eta(lift, dtm1) # Document-topic loading matrix
head(twc)
#Print the company names from those documents
eta.file.name = function(mat,calib,n) {
s = list() # Blank List
for (i in 1: ncol(mat)) # For each topic
{
read_doc = mat[order(mat[,i], decreasing = T),] # Sort documents by loading on topic i
read_names = row.names(read_doc[1:n,]) # Document indices of the top n documents
s[[i]] = calib[as.numeric(read_names),1] # Store the top n company names in the list
}
return(s)
}
temp1 = eta.file.name(twc,textdata,5)
for (i in 1:length(temp1)){
#print(paste('Companies loading heavily on topic',i,'are'))
#print(temp1[[i]])
#print('--------------------------')
compName = paste0("Companies loading heavily on topic ", i, ".txt")
write(temp1[[i]], file = compName)
}
#Extract the document text (second column) for the top documents on each topic
eta.file = function(mat,calib,n) {
s = list() # Blank List
for (i in 1: ncol(mat)) # For each topic
{
read_doc = mat[order(mat[,i], decreasing = T),] # Sort documents by loading on topic i
read_names = row.names(read_doc[1:n,]) # Document indices of the top n documents
s[[i]] = calib[as.numeric(read_names),2] # Store the top n document texts in the list
}
return(s)
}
temp2 = eta.file(twc,textdata,5)
for (i in 1:length(temp2)){
docName = paste0("Documents loading heavily on topic ", i, ".txt")
#pdf(file = docName, height=11, width=8.5)
write(temp2[[i]], file = docName)
#temp2[[i]]
#dev.off()
#print(paste('Documents loading heavily on topic',i,'are'))
#print(temp2[[i]])
#print('--------------------------')
}
Copyright (c) 2016 Forbesganj Team. All rights reserved.