In this article, we analyse the “Risk Factors” section of 10-K forms to identify the relevant topics being discussed and to determine the optimal number of topics using text-mining techniques.
########### LOADING Package ##############
# NOTE(review): rm(list=ls()) removes every object from the global environment
# before the script runs; confirm this is intended when the file is sourced
# from an interactive session.
rm(list=ls()) # Clear the workspace
library("tm") # TermDocumentMatrix(), called by custom.dtm() below
library("wordcloud") # wordcloud(), used for the per-topic word clouds
library("maptpx") # topics(), used to fit the topic model
library("igraph") # graph.adjacency() and plotting of the co-occurrence graphs
library("textir") # presumably the source of tfidf() used in custom.dtm() -- TODO confirm
library("RWeka") # not referenced in the visible code -- TODO confirm it is still needed
library("qdap") # not referenced in the visible code -- TODO confirm it is still needed
########### LOADING DATA ##############
# Both files are resolved relative to the current working directory.
# textData is indexed further down as calib[, 1] (company names) and
# calib[, 2] (document text); dtm1 is the matrix passed to topics() and used
# in the lift calculation.
textData <- readRDS('RF.technology.Rds')
dtm1 <- readRDS('dtm1.RF.Rds')
########### LOADING Custom Function ##############
# Build a document-term matrix from a text corpus.
#
# Arguments:
#   x1     - text corpus accepted by TermDocumentMatrix().
#   scheme - "tfidf" applies tfidf() weighting; any other value keeps the raw
#            term-frequency counts.
#
# Returns the transposed (documents x terms) matrix after dropping terms whose
# corpus-wide total is below 2 and documents left without any remaining term.
custom.dtm <- function(x1, # Text Corpus
                       scheme) # tf or tfidf
{
  tdm <- TermDocumentMatrix(x1)

  # Keep only terms whose total count over all documents is at least 2.
  term.totals <- apply(tdm, 1, sum)
  tdm.new <- tdm[term.totals >= 2, ]

  # Remove blank documents (i.e. columns with zero sums). Computing all column
  # totals at once avoids growing an index vector inside a 1:ncol() loop, which
  # also misbehaved when the matrix had no columns at all.
  doc.totals <- apply(tdm.new, 2, sum)
  is.blank <- doc.totals == 0
  if (any(is.blank)) {
    tdm.new1 <- tdm.new[, !is.blank]
  } else {
    tdm.new1 <- tdm.new
  }

  # Transpose so that rows are documents and columns are terms.
  if (scheme == "tfidf") {
    x2mat <- t(tfidf(tdm.new1))
  } else {
    x2mat <- t(tdm.new1)
  }
  return(x2mat)
}
# ########### Converting Data.Frame (text data) into Corpus
# corpuslocal <- Corpus(DataframeSource(textData))
#
# ########### Uncomment the line below to build the DocumentTermMatrix by TF
# dtm1 <- custom.dtm(corpuslocal, "tf")
########### Uncomment the line below to build the DocumentTermMatrix by TF-IDF
# dtm2 <- custom.dtm(corpuslocal, "tfidf")
# Number of topics to estimate.
K <- 3

# Fit the K-topic model on the document-term matrix; verb = 2 is forwarded to
# topics() unchanged.
simfit <- topics(dtm1, K = K, verb = 2)
summary(simfit, nwrd = 12)

# Rank the rows of theta by their total across topics and display the ten
# highest-ranked rows, followed by the first ten rows of omega.
# simfit$theta[1:10,]
a0 <- rowSums(simfit$theta)
a01 <- order(a0, decreasing = TRUE)
first.ten <- seq_len(10)
simfit$theta[a01[first.ten], ]
simfit$omega[first.ten, ]
###### DocumentTermMatrix lift calculation ##############
# lift[i, j] is term i's probability of topic j membership (theta[i, j])
# normalized by the marginal probability of term i occurring in the corpus.
# The timer variable is no longer called `t`, so it does not mask base::t().
lift.start <- Sys.time()
theta <- simfit$theta
sum1 <- sum(dtm1)

# Marginal probability of each term's occurrence in the corpus. This depends
# only on the term, so it is computed once per term instead of once per
# (term, topic) pair. Row i of theta is paired with column i of dtm1, exactly
# as in the original element-wise loop.
pterm <- vapply(
  seq_len(nrow(theta)),
  function(i) sum(dtm1[, i]) / sum1,
  numeric(1)
)

# pterm has one entry per row of theta, so the division recycles it down each
# topic column: every theta[i, j] is divided by pterm[i]. Dimnames of theta
# carry over to lift.
lift <- theta / pterm

Sys.time() - lift.start # Total time for calculating lift
###### plot top 100 terms in each topic ##############
for (i in seq_len(K)) { # For each topic
  a0 <- which(lift[, i] > 1) # terms with lift greater than 1 for topic i

  # Theta for those terms. Names are re-attached explicitly because
  # theta[a0, i] returns an unnamed value when a0 holds a single term.
  freq <- setNames(theta[a0, i], rownames(theta)[a0])
  freq <- sort(freq, decreasing = TRUE) # highest topic probability first

  # Nothing to draw when no term clears the lift threshold; indexing
  # freq[1:0] would otherwise hand wordcloud() an NA entry.
  if (length(freq) == 0) {
    message("No terms with lift above 1 for topic ", i, "; skipping word cloud")
    next
  }

  # Sometimes fewer than 100 terms have lift above 1; head() caps at what exists.
  top_word <- as.matrix(head(freq, 100))

  # Plot wordcloud. The unnamed `1` is the third positional argument once the
  # named ones are matched.
  # NOTE(review): per the wordcloud() signature this should bind to min.freq;
  # theta values are probabilities, so confirm the threshold behaves as intended.
  wordcloud(rownames(top_word), top_word, scale = c(4, 0.5), 1,
            random.order = FALSE, random.color = FALSE,
            colors = brewer.pal(8, "Dark2"))
  mtext(paste("Latent Topic", i), side = 3, line = 2, cex = 2)
}
###### plot top 20 terms co-occurrence graph ##############
for (i in seq_len(K)) { # For each topic
  a0 <- which(lift[, i] > 1) # terms with lift greater than 1 for topic i

  # Theta for those terms. Names are re-attached explicitly because
  # theta[a0, i] returns an unnamed value when a0 holds a single term.
  freq <- setNames(theta[a0, i], rownames(theta)[a0])
  freq <- sort(freq, decreasing = TRUE) # highest topic probability first

  # A co-occurrence graph needs at least two terms; with fewer, the row-wise
  # pruning below would index past the end of the matrix.
  if (length(freq) < 2) {
    message("Fewer than two terms with lift above 1 for topic ", i,
            "; skipping co-occurrence graph")
    next
  }

  # Sometimes fewer than 20 terms have lift above 1; head() caps at what exists.
  top_word <- as.matrix(head(freq, 20))

  # Restrict the document-term matrix to the selected terms and build the
  # term-by-term co-occurrence matrix (t(mat) %*% mat).
  mat <- as.matrix(dtm1[, match(row.names(top_word), colnames(dtm1))])
  cmat <- crossprod(mat)
  diag(cmat) <- 0 # ignore a term's co-occurrence with itself

  # Limit connections: in each row keep only entries at least as large as the
  # second-largest value and zero out the rest.
  for (p in seq_len(nrow(cmat))) {
    cutoff <- sort(cmat[p, ], decreasing = TRUE)[2]
    cmat[p, cmat[p, ] < cutoff] <- 0
  }
  #cmat[cmat < quantile(cmat,.80)] = 0

  # Row-wise pruning can leave cmat asymmetric; graph.adjacency() with
  # mode = "undirected" decides how the two directions are combined.
  graph <- graph.adjacency(cmat, mode = "undirected", weighted = TRUE)
  plot(graph, # the graph to be plotted
       layout = layout.fruchterman.reingold, # the layout method
       vertex.frame.color = 'blue', # the color of the border of the dots
       vertex.label.color = 'black', # the color of the name labels
       vertex.label.font = 1, # the font of the name labels
       vertex.size = .00001, # dots size
       vertex.label.cex = 1.3)
  mtext(paste("Topic", i), side = 3, line = 2, cex = 2)
}
###### weighing scheme for each document and each topic ##############
# Score each document against each topic.
#
# Arguments:
#   mat - terms x topics matrix (the script passes the lift matrix); row names
#         are the terms.
#   dtm - documents x terms matrix; column names are the terms.
#
# Returns a documents x topics matrix. Entry [d, k] is the mean, over the
# terms that occur in document d and are present in `mat`, of
# (mat[term, k] / mean(mat)) * count of the term in document d.
# A document sharing no term with `mat` gets NaN in every column.
# Row names are taken from `dtm`.
eta <- function(mat, dtm) {
  mat1 <- mat / mean(mat)
  terms1 <- rownames(mat1)
  dtm.terms <- colnames(dtm)
  n.docs <- nrow(dtm)

  # Preallocate the result instead of rbind-ing one row per document onto a
  # dummy zero row. This keeps the matrix shape even for a single document or
  # a single topic, where the old "drop the first row" step collapsed the
  # result to a plain vector.
  eta.mat <- matrix(0, nrow = n.docs, ncol = ncol(mat1),
                    dimnames = list(NULL, colnames(mat1)))

  for (i in seq_len(n.docs)) {
    # Term counts of document i as a plain vector.
    counts <- as.vector(matrix(dtm[i, ]))
    present <- which(counts > 0)
    doc.terms <- dtm.terms[present]

    # Terms of this document that are also rows of mat1.
    shared <- intersect(terms1, doc.terms)
    in.mat <- match(shared, terms1) # positions of matching terms in mat1
    in.doc <- present[match(shared, doc.terms)] # positions of the same terms in counts

    # The count vector is recycled down each topic column, so every row of
    # mat1 is multiplied by that term's count in the document.
    weighted <- mat1[in.mat, , drop = FALSE] * counts[in.doc]
    eta.mat[i, ] <- colMeans(weighted)
  }

  row.names(eta.mat) <- row.names(dtm)
  return(eta.mat)
}
# Document-by-topic scores from the lift matrix and the document-term matrix,
# followed by a preview of the first rows.
twc <- eta(mat = lift, dtm = dtm1)
head(twc)
###### top 5 company names for each topic ##############
# For each topic, return the first-column entries of `calib` (company names in
# this script) for the n documents with the highest score.
#
# Arguments:
#   mat   - documents x topics score matrix (twc); its row names must be the
#           numeric row positions of the documents in `calib`.
#   calib - table whose first column holds the value to return per document.
#   n     - number of top documents per topic; capped at nrow(mat).
#
# Returns a list with one element per column of `mat`.
eta.file.name <- function(mat, calib, n) {
  doc.ids <- rownames(mat)
  n.top <- min(n, nrow(mat)) # never ask for more documents than exist
  s <- vector("list", ncol(mat))
  for (i in seq_len(ncol(mat))) { # For each topic
    # Rank documents by their score on topic i, highest first. Working on the
    # row names directly avoids subsetting the sorted matrix, which dropped to
    # a plain vector (and lost its row names) when n was 1.
    ranked <- order(mat[, i], decreasing = TRUE)
    top.ids <- doc.ids[ranked[seq_len(n.top)]]
    s[[i]] <- calib[as.numeric(top.ids), 1] # first n companies for this topic
  }
  return(s)
}
# Print the five companies with the highest score on each topic.
temp1 <- eta.file.name(twc, textData, 5)
# seq_along() yields no iterations for an empty list, whereas 1:length()
# would iterate over c(1, 0) and fail on temp1[[1]].
for (i in seq_along(temp1)) {
  print(paste('Companies loading heavily on topic', i, 'are'))
  print(temp1[[i]])
  print('--------------------------')
}
###### top 5 Text Documents for each topic ##############
# For each topic, return the second-column entries of `calib` (document text
# in this script) for the n documents with the highest score.
#
# Arguments:
#   mat   - documents x topics score matrix (twc); its row names must be the
#           numeric row positions of the documents in `calib`.
#   calib - table whose second column holds the value to return per document.
#   n     - number of top documents per topic; capped at nrow(mat).
#
# Returns a list with one element per column of `mat`.
eta.file <- function(mat, calib, n) {
  doc.ids <- rownames(mat)
  n.top <- min(n, nrow(mat)) # never ask for more documents than exist
  s <- vector("list", ncol(mat))
  for (i in seq_len(ncol(mat))) { # For each topic
    # Rank documents by their score on topic i, highest first. Working on the
    # row names directly avoids subsetting the sorted matrix, which dropped to
    # a plain vector (and lost its row names) when n was 1.
    ranked <- order(mat[, i], decreasing = TRUE)
    top.ids <- doc.ids[ranked[seq_len(n.top)]]
    s[[i]] <- calib[as.numeric(top.ids), 2] # first n documents for this topic
  }
  return(s)
}
# Print the text of the five documents with the highest score on each topic.
temp2 <- eta.file(twc, textData, 5)
# seq_along() yields no iterations for an empty list, whereas 1:length()
# would iterate over c(1, 0) and fail on temp2[[1]].
for (i in seq_along(temp2)) {
  # formattedFilename <- paste0("DocumentId_",i, "_", i)
  # print(formattedFilename)
  # write(temp2[[i]], formattedFilename)
  print(paste('Documents loading heavily on topic', i, 'are'))
  print(temp2[[i]])
  print('--------------------------')
}
###### END OF THE FILE ##############
After examining the word clouds and co-occurrence graphs for k = 2, 3, 4 and 5 under both TF and TF-IDF weighting, we concluded that k = 3 is the optimal number of topics. With k = 2 the topics are not clearly differentiated, and with k > 3 the topics start to repeat, indicating that k = 3 is optimal. Also, TF yielded more informative results than TF-IDF.
Looking at the word clouds and co-occurrence graphs for the 3 topics, the following appear to be the three topics:
The above conclusion was reached by interpreting the word clouds and co-occurrence graphs. Below are the top 5 companies for each topic.
Topic 1
Sunpower: risks related to our sales channels the increase in the global supply of solar cells and panels, and increasing competition, may cause substantial downward pressure on the prices of such products and cause us to lose sales or market share, resulting in lower revenues, earnings, and cash flow. global solar cell and panel production capacity has been materially increasing since 2009, and solar cell and solar panel manufacturers continue to have significant excess capacity, particularly in china. excess capacity and industry competition have resulted, and will continue to result, in substantial downward pressure on the price of solar cells and panels, including sunpower products. increasing competition could also result in us losing sales or market share. such price reductions or loss of sales or market share could continue to have a negative impact on our revenue and earnings, and could materially adversely affect our business and financial condition and cash flows. in addition, our internal forecasts of pricing may not be accurate in the current market environment, which could cause our financial results to be different than forecasted. see also “if we fail to successfully execute our cost reduction roadmap, and develop and introduce new and enhanced products and services, we may not be able to compete effectively,
AMD: environmental laws are complex, change frequently and have tended to become more stringent over time. for example, the european union (eu) and china are two among a growing number of jurisdictions that have enacted restrictions on the use of lead and other materials in electronic products. other countries have also implemented similar restrictions. these regulations affect semiconductor devices and packaging. as regulations restricting materials in electronic products continue to increase around the world, there is a risk that the cost, quality and manufacturing yields of products that are subject to these restrictions, may be less favorable compared to products that are not subject to such restrictions, or that the transition to compliant products may produce sudden changes in demand, which may result in excess inventory.
Topic 2
SPRINT CORP: And there is no assurance that the softbank merger or the clearwire acquisition and the respective related transactions will occur on the terms and timeline currently contemplated or at all, or that the conditions to the softbank merger or the clearwire acquisition will be satisfied or waived in a timely manner or at all. any delay in completing the clearwire acquisition could cause sprint not to realize, or delay the realization of, some or all of the benefits that sprint expects to achieve from the softbank merger and clearwire acquisition”
“sales of our common stock in the public market or sales of any of our other securities could dilute ownership and earnings per share, and even the perception that such sales could occur could cause the market prices of our common stock to decline. in addition, the existence of our outstanding debentures may encourage short selling of our common stock by market participants who expect that the conversion of the debentures could depress the prices of our common stock. we issued warrants to affiliates of the underwriters of our 4.50% and 4.75% debentures, which are exercisable for a total of approximately 11.1 million shares and 8.7 million shares of our common stock, respectively. the warrants, together with certain convertible hedge transactions, are meant to reduce our exposure upon potential conversion of our 4.50% and 4.75% debentures. if the market price of our common stock exceeds the respective exercise prices of the warrants, such warrants will have a dilutive effect on our earnings per share, and could dilute the ownership interests for existing stockholders if exercised.
Topic mining on risk factors could be extremely useful for entities and individuals such as risk managers, investment bankers, shareholders and mutual fund companies, helping them make better decisions on whether or not to invest in a particular company or industry (in this case, technology).
It also gives a holistic view of the common risks that a segment of companies faces as a whole.
* Siva Gangadhar G [71610080] * Shree Sudhha [71610079] * Sravya Chunduri [71610110] * Srikanth Vidapanakal [71610084]