EY & UTS

This is an R Markdown document of the text mining performed on a corpus of Ernst & Young articles.

Made up of 4 articles from the Australian Financial Review, 19 articles referencing Ernst and Young and 5 articles relevant to Australian Universities and Ernst and Young

Approach

Utilising text mining techniques to create a mathematical representation of the corpus text to observe any valuable information that may be hidden or locked away. The form of the data is unstructured and can be said to be a representation of “a bag of words”. The goal is to discover and extract key values and sentiments associated with Ernst and Young and its partnering organisations from this corpus of articles.

The techniques utilised will be a stepped approach starting with simple word frequencies and associations and culminating in theme or topic based classification and semantics.

Install packages for below libraries if not already loaded

library(NLP)
library(SnowballC)
library(tm)
library(ggplot2)
library(RColorBrewer)
library(tidytext)
library(dplyr)
library(wordcloud2)
library(topicmodels)

Load corpus of EY .txt files, note these have been saved on my working directory in a folder called “textmining1”

docs <- Corpus(DirSource("./textmining1"))
#inspect a particular document
writeLines(as.character(docs[[6]]))

Clean and pre-process and visually check Progress as needed

# Remove punctuation
docs <- tm_map(docs, removePunctuation)
# Transform to lower case
docs <- tm_map(docs,content_transformer(tolower))
# Remove numbers
docs <- tm_map(docs, removeNumbers)
# Remove stopwords from standard stopword list 
docs <- tm_map(docs, removeWords, stopwords("english"))
# Remove whitespace 
docs <- tm_map(docs, stripWhitespace)
#Stem document
docs <- tm_map(docs,stemDocument)
#inspect a particular document
writeLines(as.character(docs[[6]]))

Some spelling clean up

docs <- tm_map(docs, content_transformer(gsub),
               pattern = "peopl", replacement = "people")
docs <- tm_map(docs, content_transformer(gsub),
               pattern = "compani", replacement = "company")
docs <- tm_map(docs, content_transformer(gsub),
               pattern = "innov", replacement = "innovate")
docs <- tm_map(docs, content_transformer(gsub),
               pattern = "opportun", replacement = "opportunity")
docs <- tm_map(docs, content_transformer(gsub),
               pattern = "recognis", replacement = "recognise")
docs <- tm_map(docs, content_transformer(gsub),
               pattern = "communiti", replacement = "community")
docs <- tm_map(docs, content_transformer(gsub),
               pattern = "encourag", replacement = "encourage")
docs <- tm_map(docs, content_transformer(gsub),
               pattern = "knowledg", replacement = "knowledge")
docs <- tm_map(docs, content_transformer(gsub),
               pattern = "entrepreneuri", replacement = "entrepreneur")
docs <- tm_map(docs, content_transformer(gsub),
               pattern = "servic", replacement = "service")
docs <- tm_map(docs, content_transformer(gsub),
               pattern = "busi", replacement = "business")
docs <- tm_map(docs, content_transformer(gsub),
               pattern = "valu", replacement = "value")
docs <- tm_map(docs, content_transformer(gsub),
               pattern = "provid", replacement = "provide")
docs <- tm_map(docs, content_transformer(gsub),
               pattern = "includ", replacement = "include")
docs <- tm_map(docs, content_transformer(gsub),
               pattern = "engag", replacement = "engage")
docs <- tm_map(docs, content_transformer(gsub),
               pattern = "manag", replacement = "manage")
docs <- tm_map(docs, content_transformer(gsub),
               pattern = "educ", replacement = "educate")
docs <- tm_map(docs, content_transformer(gsub),
               pattern = "organ", replacement = "organisation")
docs <- tm_map(docs, content_transformer(gsub),
               pattern = "univers", replacement = "university")
docs <- tm_map(docs, content_transformer(gsub),
               pattern = "polici", replacement = "policy")
docs <- tm_map(docs, content_transformer(gsub),
               pattern = "qualiti", replacement = "quality")
docs <- tm_map(docs, content_transformer(gsub),
               pattern = "independ", replacement = "independent")
docs <- tm_map(docs, content_transformer(gsub),
               pattern = "creat", replacement = "create")
docs <- tm_map(docs, content_transformer(gsub),
               pattern = "industri", replacement = "industry")
docs <- tm_map(docs, content_transformer(gsub),
               pattern = "advisori", replacement = "advisory")
docs <- tm_map(docs, content_transformer(gsub),
               pattern = "believ", replacement = "believe")
docs <- tm_map(docs, content_transformer(gsub),
               pattern = "donat", replacement = "donate")
docs <- tm_map(docs, content_transformer(gsub),
               pattern = "potenti", replacement = "potential")
docs <- tm_map(docs, content_transformer(gsub),
               pattern = "challeng", replacement = "challenge")
docs <- tm_map(docs, content_transformer(gsub),
               pattern = "communic", replacement = "communicate")
docs <- tm_map(docs, content_transformer(gsub),
               pattern = "prioriti", replacement = "priority")
docs <- tm_map(docs, content_transformer(gsub),
               pattern = "capit", replacement = "capital")
docs <- tm_map(docs, content_transformer(gsub),
               pattern = "advanc", replacement = "advance")
docs <- tm_map(docs, content_transformer(gsub),
               pattern = "repres", replacement = "represent")
docs <- tm_map(docs, content_transformer(gsub),
               pattern = "activ", replacement = "active")
docs <- tm_map(docs, content_transformer(gsub),
               pattern = "issu", replacement = "issue")
docs <- tm_map(docs, content_transformer(gsub),
               pattern = "relat", replacement = "relate")
docs <- tm_map(docs, content_transformer(gsub),
               pattern = "integr", replacement = "integrity")
docs <- tm_map(docs, content_transformer(gsub),
               pattern = "financ", replacement = "finance")
docs <- tm_map(docs, content_transformer(gsub),
               pattern = "purpo", replacement = "purpose")
docs <- tm_map(docs, content_transformer(gsub),
               pattern = "profess", replacement = "professional")
docs <- tm_map(docs, content_transformer(gsub),
               pattern = "requir", replacement = "require")
docs <- tm_map(docs, content_transformer(gsub),
               pattern = "organisationis", replacement = "organisation")

More Stop words to remove

myStopwords <- c("can", "say","one","way","use",
                 "also","howev","tell","will",
                 "much","need","take","tend","even",
                 "like","particular","rather","said",
                 "get","well","make","ask","come","end",
                 "first","two","help","often","may",
                 "might","see","someth","thing","point",
                 "post","look","right","now","think","'ve ",
                 "'re ","anoth","put","set","new","good",
                 "want","sure","kind","larg","yes,","day","etc",
                 "quit","sinc","attempt","lack","seen","awar",
                 "littl","ever","moreov","though","found","abl",
                 "enough","far","earli","away","achiev","draw",
                 "last","never","brief","bit","entir","brief",
                 "great","lot", "mani","just","find","found",
                 "give","big","oper","becom","made","exampl",
                 "sinc","avail","â???o")

# Remove my list of unwanted words
docs <- tm_map(docs, removeWords, myStopwords)

# Inspect
writeLines(as.character(docs[[6]]))

Create document-term matrix and set frequency and sort order

dtm <- DocumentTermMatrix(docs)
#inspect segment of document term matrix
inspect(dtm[4:6,1000:1005])
# Sum over columns to get total counts (over all docs) for each term
freq <- colSums(as.matrix(dtm))
# Check number of terms
length(freq)
# Create sort order highest frequency ascending
ord <- order(freq,decreasing=TRUE)
# Inspect most frequently occurring terms
freq[head(ord)]
freq[tail(ord)]

list most frequent terms and review correlations of some key terms

findFreqTerms(dtm,lowfreq=80)
#correlations
findAssocs(dtm,"entrepreneur",0.89)
findAssocs(dtm,"value",0.92)
findAssocs(dtm,"partner",0.94)
findAssocs(dtm,"university", 0.99)

Remove very frequent and very rare words to observe important key words. Based on common words are not specific enough and uncommon are to specific. source:DAM slides utscic.edu.au

dtmr <-DocumentTermMatrix(docs, control=list(wordLengths=c(4, 20),
                                             bounds = list(global = c(3,27))))

Convert to data.frame and plot as a histogram

#histogram
wf=data.frame(term=names(freq),occurrences=freq)
p <- ggplot(subset(wf, occurrences>100), aes(term, occurrences))
p <- p + geom_bar(stat="identity")
p <- p + theme(axis.text.x=element_text(angle=45, hjust=1))
p

Create a word cloud of the Corpus with size of word indicative of frequency

#setting the same seed each time ensures consistent look across clouds
set.seed(42)
# turn into data.frame
d<-data.frame(word=names(freq),freq = freq)
wordcloud2(d, shape = "circle", color = "random-dark", backgroundColor = "white")

Idnetify the Sentiment of the corpus based on positive and negative word sentiment using bing algorithm

#convert to data frame
ap_td <- tidy(dtm)
#identify sentiment Positive or negative words
ap_sentiments <- ap_td %>%
  inner_join(get_sentiments("bing"), by = c(term = "word"))
#view sentiments
ap_sentiments


    colnames(ap_td) <- c("SOTU 2010","SOTU 2011")
    comparison.cloud(term.matrix,max.words=40,random.order=FALSE)
    commonality.cloud(term.matrix,max.words=40,random.order=FALSE)

Plot sentiment filtering for word frequency greater than 10

ap_sentiments %>%
  count(sentiment, term, wt = count) %>%
  ungroup() %>%
  filter(n >= 10) %>%
  mutate(n = ifelse(sentiment == "negative", -n, n)) %>%
  mutate(term = reorder(term, n)) %>%
  ggplot(aes(term, n, fill = sentiment)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  ylab("Contribution to sentiment") + coord_flip()

Visualisation

qplot(sentiment, data=ap_sentiments, weight=count, geom="bar",fill=sentiment)

Clustering

m<-as.matrix(dtm)
#shorten rownames for display purposes
rownames(m) <- paste(substring(rownames(m),1,3),rep("..",nrow(m)),
                     substring(rownames(m),
                               nchar(rownames(m))-12,nchar(rownames(m))-4))
#compute distance between document vectors
d <- dist(m)
#run hierarchical clustering using Ward's method
groups <- hclust(d,method="ward.D")
#plot, use hang to ensure that labels fall below tree
plot(groups, hang=-1)
#cut into 3 subtrees based on plot above
rect.hclust(groups,3)

Topic Modeling

Run LDA using Gibbs Sampling, Gibbs Sampling is a “Markov Chain Monte Carlo” algorithm that is often used to approximate a probability distribution.

filenames <- list.files(getwd(),pattern="*.txt")
files <- lapply(filenames,readLines)
docs <- Corpus(VectorSource(files))

# The burn-in period is used to ensure that we start from a representative point.
burnin <- 1000
# and perform 2000 iterations (after burn-in)...
iter <- 2000
#..taking every 500th one for further use. This "thinning" is done to ensure that
# samples are not correlated.
thin <- 500
#We'll use 5 different, randomly chosen starting points
nstart <- 5
#using random integers as seed. 
seed <- list(2003,5,63,100001,765)
#best run (the one with the highest probability) as the result
best <- TRUE

#Number of topics chosen is 4 from previous analysis
k <- 4

ldaOut <- LDA(dtm,k, method="Gibbs", control=
              list(nstart=nstart, seed = seed, best=best, burnin = burnin, iter = iter, thin=thin))
topics(ldaOut)
ldaOut.topics <-as.matrix(topics(ldaOut))
write.csv(ldaOut.topics,file=paste("LDAGibbs",k,"DocsToTopics.csv"))
terms(ldaOut,6)
ldaOut.terms <- as.matrix(terms(ldaOut,6))
write.csv(ldaOut.terms,file=paste("LDAGibbs",k,"TopicsToTerms.csv"))
#Find probabilities associated with each topic assignment
topicProbabilities <- as.data.frame(ldaOut@gamma) 
write.csv(topicProbabilities,file=paste("LDAGibbs",k,"EYTopicProbabilities.csv"))

How distinct are the topic assignments?

#Find relative importance of top 2 topic assignments
topic1ToTopic2 <- lapply(1:nrow(dtm),function(x)
  sort(topicProbabilities[x,])[k]/sort(topicProbabilities[x,])[k-1])
#Find relative importance of second and third ranked topic assignments
topic2ToTopic3 <- lapply(1:nrow(dtm),function(x)
  sort(topicProbabilities[x,])[k-1]/sort(topicProbabilities[x,])[k-2])
#write to file
write.csv(topic1ToTopic2,file=paste("LDAGibbs",k,"EYTopic1ToTopic2.csv"))
write.csv(topic2ToTopic3,file=paste("LDAGibbs",k,"EYTopic2ToTopic3.csv"))

Top Terms by Topic

ap_topics <- tidy(ldaOut.terms, matrix = "beta")
#Show topics and Words associated with them 
ap_topics

END