library(tm)
## Loading required package: NLP
library(SnowballC)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(wordcloud)
## Loading required package: RColorBrewer
## Setting working directory in R
# Print the current working directory, then switch to the project folder.
# NOTE(review): the path is absolute and machine-specific; setwd() will stop
# with an error on any machine where this folder does not exist.
getwd()
## [1] "C:/Users/vinod/Documents/Vinod_Docs/RFiles/Text_mining"
setwd("C:/Users/vinod/Documents/Vinod_Docs/RFiles/Text_mining")
Creating a collection of documents in the R environment
# Build the corpus from the files in the Text_Datasets directory.
# NOTE(review): the directory is given as an absolute path, so this call does
# not depend on the working directory set earlier.
mydocs <- Corpus(DirSource("C:/Users/vinod/Documents/Vinod_Docs/RFiles/Text_mining/Text_Datasets"))
Metadata of the documents and inspecting a particular document
# Auto-print the corpus: shows its class, metadata counts and document count.
mydocs
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 30
#writeLines(as.character(mydocs[30]))
Creating content transformer called toSpace
# Content transformer that replaces every match of `pattern` in the document
# text with a single space, so the words on either side of the removed
# characters stay separate instead of being fused into one token.
#   x       - character content handed over by tm_map()
#   pattern - regular expression passed to gsub()
# Returns the modified text.
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
Converting all the text to lowercase
# Lower-case the text of every document; content_transformer() wraps the base
# function so tm_map() applies it to the document content.
mydocs <- tm_map(mydocs, content_transformer(tolower))
#writeLines(as.character(mydocs[30]))
Removing anything other than alphabetic characters or whitespace
# Strip every character that is neither alphabetic nor whitespace.
#   x - character vector
# Returns a character vector of the same length with digits, punctuation and
# other symbols deleted; letters and whitespace are left untouched.
# The `+` quantifier makes gsub() match only real runs of unwanted characters
# rather than a zero-length match at every position.
removeNumPunct <- function(x) gsub("[^[:alpha:][:space:]]+", "", x)
# Apply removeNumPunct to the content of every document in the corpus.
mydocs <- tm_map(mydocs, content_transformer(removeNumPunct))
#writeLines(as.character(mydocs[30]))
Removing punctuation and other characters from the texts
# Run the toSpace transformer over a list of leftover characters.
# NOTE(review): by this point the text has already been lower-cased and
# stripped of everything except letters and whitespace, so ":", ";", "<", ">"
# and upper-case "T" should no longer occur — these calls look like no-ops;
# confirm before relying on them.
# NOTE(review): "???" and the "â???" variants look like mis-encoded
# punctuation. gsub() treats the pattern as a regular expression, in which
# "?" is a quantifier — confirm they match what was intended.
# Once "â" has been removed, the two longer patterns that start with "â"
# cannot match anything.
mydocs <- tm_map(mydocs, toSpace, ":")
mydocs <- tm_map(mydocs, toSpace, ";")
mydocs <- tm_map(mydocs, toSpace, "<")
mydocs <- tm_map(mydocs, toSpace, ">")
mydocs <- tm_map(mydocs, toSpace, "???")
mydocs <- tm_map(mydocs, toSpace, "T")
mydocs <- tm_map(mydocs, toSpace, "â")
mydocs <- tm_map(mydocs, toSpace, "â???")
mydocs <- tm_map(mydocs, toSpace, "â???T")
#writeLines(as.character(mydocs[30]))
Removing extra whitespace
# Apply tm's stripWhitespace transformation to every document.
mydocs <- tm_map(mydocs, stripWhitespace)
#writeLines(as.character(mydocs[30]))
Removing stopwords from the document
# Drop English stop words, then any remaining punctuation and digits.
# NOTE(review): punctuation and digits were already stripped by the
# removeNumPunct step earlier in the pipeline, so removePunctuation and
# removeNumbers are presumably no-ops here, and "the" is presumably already
# covered by stopwords("english") — confirm before deleting any of these calls.
mydocs <- tm_map(mydocs, removeWords, stopwords("english"))
mydocs <- tm_map(mydocs, removePunctuation)
mydocs <- tm_map(mydocs, removeWords, c("the"))
#writeLines(as.character(mydocs[30]))
# Removing numbers from text
mydocs <- tm_map(mydocs, removeNumbers)
Stemming the documents
# Reduce words to their stems (stemDocument relies on the SnowballC package
# loaded at the top of the file).
mydocs <- tm_map(mydocs, stemDocument)
#writeLines(as.character(mydocs[30]))
Rectifying misspelt and truncated (stemmed) words in the documents
# Post-stemming clean-up: expand truncated stems back into readable words,
# split fused tokens and delete leftover code/markup fragments.
#
# Every row of `stem_fixes` is c(pattern, replacement). Rows are applied from
# top to bottom with gsub(), which means:
#   * patterns are regular expressions matched anywhere in the text, not whole
#     words (for example "ive" is also removed from inside longer words);
#   * order matters, because later rows repair artefacts created by earlier
#     ones: the second "howev" row turns "however" into "howeverer", which the
#     "howeverer" rows undo, and the repeated "titl" row produces the
#     "entitlee" that a later row corrects.
# The sequence is therefore kept exactly as it was originally written,
# including repeated rows.
stem_fixes <- matrix(
  c(
    "organis", "organ",
    "strateg", "strategic",
    "ani", "any",
    "advantag", "vantage",
    "strategicvantage", "strategic vantage",
    "internallygen", "internally generated",
    "minfreq", "minimum frequency",
    "randomord", "random order",
    "wordcloudmydoc", "word cloud my doc",
    "howev", "however",
    "plottedcodepr", "plotted codepr",
    "enterpris", "enterprise",
    "issu", "issue",
    "peopl", "people",
    "articl", "article",
    "practic", "practice",
    "becom", "become",
    "decis", "decision",
    "valu", "value",
    "figur", "figure",
    "exampl", "example",
    "sens", "sense",
    "situa", "situation",
    "typic", "typical",
    "futur", "future",
    "collabor", "collaboration",
    "busi", "busy",
    "manag", "manage",
    "leastadvic", "least advice",
    "defin", "define",
    "chang", "change",
    "moreov", "more over",
    "engag", "engage",
    "consid", "consider",
    "howev", "however",
    "codepr", " ",
    "tmmapmydoc", "",
    "hese", "these",
    "involv", "involve",
    "tmmapmydoc", "",
    "TM", "",
    "precod", "",
    "rotper", "",
    "mydoc", "",
    "achiev", "achieve",
    "caus", "cause",
    "remov", "remove",
    "includ", "include",
    "situationt", "situation",
    "littl", "little",
    "abl", "able",
    "failur", "failure",
    "titl", "title",
    "larg", "large",
    "notat", "not at",
    "activ", "active",
    "entir", "entire",
    "flexibl", "flexible",
    "titl", "title",
    "uncertainti", "uncertainty",
    "possibl", "possible",
    "howeverer", "however",
    "inde", "",
    "unsaf", "unsafe",
    "forti", "forty",
    "reli", "",
    "anoth", "another",
    "theori", "theory",
    "noth", "",
    "irrit", "",
    "autonomi", "autonomy",
    "minut", "minute",
    "realiti", "reality",
    "studi", "study",
    "quot", "quote",
    "initi", "",
    "ive", "",
    "agre", "agree",
    "howeverer", "however",
    "wherea", "whereas",
    "improvis", "improvise",
    "desir", "desire",
    "influenc", "influence",
    "forc", "force",
    "realli", "really",
    "structur", "structure",
    "natur", "nature",
    "illustr", "",
    "univers", "universe",
    "paus", "pause",
    "industri", "industry",
    "sean", "seen",
    "communiti", "community",
    "classif", "classify",
    "creat", "create",
    "contenti", "content",
    "newli", "newly",
    "histori", "history",
    "secur", "secure",
    "tri", "try",
    "unrs", "",
    "innov", "",
    "bodi", "body",
    "subtl", "subtle",
    "cours", "course",
    "basi", "basic",
    "aris", "arise",
    "drs", "",
    "anxieti", "anxiety",
    "easili", "easily",
    "consequ", "consequence",
    "earli", "early",
    "goe", "goes",
    "considerer", "consider",
    "asid", "aside",
    "aer", "air",
    "truli", "truly",
    "alon", "alone",
    "techiqu", "technique",
    "heret", "here",
    "solut", "solute",
    "entitlee", "entitle",
    "guid", "guide",
    "entiti", "entity",
    "easi", "easy",
    "awar", "aware",
    "changelike", "changelike", # pattern equals replacement: a no-op row
    "howeverpractice", "however practice",
    "els", "else",
    "cultur", "culture",
    "assum", "assume",
    "crisi", "crisis",
    "ensur", "ensure",
    "rittel", "",
    "invit", "invite",
    "clariti", "clarity",
    "judg", "judge",
    "slov", "solve",
    "realiz", "realize",
    "alloc", "allocate",
    "territori", "territory",
    "constitut", "constitute",
    "alreadi", "already",
    "observ", "observe",
    "rais", "raise",
    "kunzhalf", "half",
    "continu", "continue",
    "facilit", "facility",
    "stori", "story",
    "agil", "agile",
    "choic", "choice",
    "togaf", ""
  ),
  ncol = 2, byrow = TRUE,
  dimnames = list(NULL, c("pattern", "replacement"))
)

# One tm_map() pass per row, in table order, exactly as the unrolled code did.
for (i in seq_len(nrow(stem_fixes))) {
  mydocs <- tm_map(
    mydocs, content_transformer(gsub),
    pattern = stem_fixes[[i, "pattern"]],
    replacement = stem_fixes[[i, "replacement"]]
  )
}
#writeLines(as.character(mydocs[30]))
Creating the document-term matrix
# Build the document-term matrix: one row per document, one column per term.
dtm <- DocumentTermMatrix(mydocs)
# Peek at the first ten documents and six term columns picked by position.
# NOTE(review): the column indices 1000:1005 are hard-coded, so which terms
# are shown depends on the current vocabulary.
inspect(dtm[1:10,1000:1005])
## <<DocumentTermMatrix (documents: 10, terms: 6)>>
## Non-/sparse entries: 15/45
## Sparsity : 75%
## Maximal term length: 13
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs current cynefin david decid decision defineeeffici
## 1 0 0 0 0 0 0
## 10 1 0 0 0 0 0
## 2 0 0 0 0 0 0
## 3 0 0 0 0 0 0
## 4 1 1 3 1 6 1
## 5 1 0 0 1 3 0
## 6 0 0 0 1 1 0
## 7 0 0 0 0 0 0
## 8 0 0 0 0 0 0
## 9 1 0 0 1 4 0
Basic Statistics for the text document
## Frequency of occurrence of each term (column sums of the document-term matrix)
freq <- colSums(as.matrix(dtm))
## Length should be the total number of terms
length(freq)
## [1] 3947
## Column indices of the terms, sorted by frequency in descending order
ord <- order(freq, decreasing = TRUE)
head(ord)
## [1] 387 91 344 393 910 551
## Inspect the most frequently occurring words
freq[head(ord)]
## one can manage organ work system
## 316 244 229 218 209 193
## Inspect the least frequently occurring words
freq[tail(ord)]
## therebi timeorgan uncommit unionist willing workday
## 1 1 1 1 1 1
Creating a document term matrix with word length from 4 to 20 [12]
# NOTE(review): the control list is commented out, so this call is identical
# to the one that built `dtm`; no word-length or document-frequency bounds are
# applied, despite what the heading above says.
# If the control is re-enabled, check the option name: tm documents the key as
# `wordLengths` (capital L) — confirm the lower-case spelling is honoured.
dtmr <- DocumentTermMatrix(mydocs) #, control = list(wordlengths= c(4, 20), #bounds=list(global=c(3,27)) ))
# Total count of each term across all documents.
freqr <- colSums(as.matrix(dtmr))
head(freqr)
## abstract accept access accord account accur
## 9 16 12 16 9 8
#head(dtmr)
Exploring terms occurring at least 100 times in the corpus
# List the terms whose total frequency in the matrix is at least 100.
findFreqTerms(dtmr, lowfreq = 100)
## [1] "approach" "can" "data" "design" "differ" "example"
## [7] "get" "import" "make" "manage" "map" "model"
## [13] "one" "organ" "people" "point" "problem" "process"
## [19] "project" "question" "see" "system" "thing" "time"
## [25] "use" "way" "well" "will" "work" "change"
## [31] "like"
Correlation between some of the terms that occur at least 100 times in the corpus. For each chosen word, the terms whose correlation with it is at least 0.6 are listed.
# Terms associated with "project", using 0.6 as the lower correlation limit.
findAssocs(dtmr, "project", 0.6)
## $project
## inher manage occurr handl bok onthespot
## 0.80 0.68 0.67 0.67 0.61 0.61
# Terms associated with "system", using 0.6 as the lower correlation limit.
findAssocs(dtmr, "system", 0.6)
## $system
## lock incent design subset
## 0.82 0.79 0.78 0.78
## rollout user function involve
## 0.75 0.73 0.72 0.71
## specifi two adopt intend
## 0.71 0.68 0.68 0.67
## step specif intent softwar
## 0.67 0.66 0.66 0.66
## groupwar infer invest compos
## 0.66 0.66 0.66 0.66
## checklist depart phone actor
## 0.66 0.65 0.63 0.63
## frequent artifact actant actionw
## 0.62 0.62 0.62 0.62
## actuallyinflu administ ampli artifactartifact
## 0.62 0.62 0.62 0.62
## automata brian complementari dead
## 0.62 0.62 0.62 0.62
## decompos designerengin disconnect drag
## 0.62 0.62 0.62 0.62
## encod endus enthusiasm faith
## 0.62 0.62 0.62 0.62
## feet feldman firsthand flowchart
## 0.62 0.62 0.62 0.62
## folli grew grid humanartifact
## 0.62 0.62 0.62 0.62
## humanhuman idealis idiosyncrat invok
## 0.62 0.62 0.62 0.62
## keepan licens mainfram martha
## 0.62 0.62 0.62 0.62
## materialview migrat mindless misunderstand
## 0.62 0.62 0.62 0.62
## myriad nearer newth nonadopt
## 0.62 0.62 0.62 0.62
## nonhuman obstacl orchestra ostens
## 0.62 0.62 0.62 0.62
## outreach outreachrel packag pentland
## 0.62 0.62 0.62 0.62
## practis progress redesign resist
## 0.62 0.62 0.62 0.62
## restat rulefollow sequenc setup
## 0.62 0.62 0.62 0.62
## somewher spreadsheet stall standalone
## 0.62 0.62 0.62 0.62
## strongest subscrib tip tweak
## 0.62 0.62 0.62 0.62
## unheed unravel vain weakest
## 0.62 0.62 0.62 0.62
## worthwhil pattern whereas cognit
## 0.62 0.61 0.60 0.60
# Terms associated with "problem", using 0.6 as the lower correlation limit.
findAssocs(dtmr, "problem", 0.6)
## $problem
## wick real much good value immedi second know note
## 0.71 0.67 0.66 0.64 0.64 0.64 0.63 0.61 0.61
## one present forward
## 0.61 0.61 0.61
Plotting frequency histogram
# Two-column data frame holding each term and its total count in the corpus.
# The column name `occurances` (sic) is kept because the plotting code below
# refers to it.
wf <- data.frame(terms = names(freqr), occurances = freqr)
Plotting the terms occurring most frequently
# Horizontal bar chart of the terms that occur more than 100 times.
# Rows are filtered on the data frame's own `occurances` column rather than on
# the global `freqr` vector, so the selection cannot drift out of step with
# `wf` if either object is later reordered or subset.
p4 <- ggplot(subset(wf, occurances > 100), aes(terms, occurances))
p4 <- p4 + geom_bar(stat = "identity")
p4 <- p4 + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + coord_flip()
p4
Word cloud for all the words occurring in the text corpus
#wordcloud(mydocs, min.freq = 20,random.order = FALSE, rot.per=0.35,
# colors=brewer.pal(8, "Dark2"))
# Dense copy of the document-term matrix (documents in rows, terms in columns).
m1 <- as.matrix(dtm)
# calculate the frequency of words and sort it by frequency
# (TRUE/FALSE are spelled out: T and F are ordinary variables that can be
# reassigned, whereas TRUE and FALSE are reserved words)
word.freq <- sort(colSums(m1), decreasing = TRUE)
# colors
# NOTE(review): `pal` is built here, but the active wordcloud() call below
# passes its own Dark2 palette; only the commented-out call uses `pal`.
pal <- brewer.pal(9, "BuGn")
pal <- pal[-(1:4)]
#wordcloud(word.freq, min.freq = 3,
#random.order = F, colors = pal)
# Draw every term that occurs at least 3 times, most frequent words first.
wordcloud(words = names(word.freq), freq = word.freq, min.freq = 3,
          random.order = FALSE, colors = brewer.pal(8, "Dark2"))
# Print the summary of the full document-term matrix.
dtm
## <<DocumentTermMatrix (documents: 30, terms: 3947)>>
## Non-/sparse entries: 13772/104638
## Sparsity : 88%
## Maximal term length: 56
## Weighting : term frequency (tf)
# First six entries of the frequency vector sorted in descending order.
head(word.freq)
## one can manage organ work system
## 316 244 229 218 209 193
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.