---
title: "Untitled"
author: "Word Cloud"
date: "May 18, 2016"
output: html_document
---

WORD CLOUD

Install

# One-time setup: install the packages this analysis depends on.
install.packages("tm")            # for text mining
install.packages("SnowballC")     # for text stemming
install.packages("wordcloud")     # word-cloud generator
install.packages("RColorBrewer")  # color palettes

Load

# Attach the packages used by the rest of the script.
library("tm")            # text mining
library("SnowballC")     # text stemming
library("wordcloud")     # word-cloud generator
library("RColorBrewer")  # color palettes

Text mining

Read the text file

# Location of the speech text that is read below.
filePath <- "http://www.sthda.com/sthda/RDoc/example-files/martin-luther-king-i-have-a-dream-speech.txt"
# Read the file into a character vector, one element per line.
text <- readLines(filePath)
# text <- getwd()
## Number of documents
# length(dir(text))
## list file names
# dir(text)

Load the data as a corpus

# Build a corpus from the character vector read from the file.
docs <- Corpus(VectorSource(text))

# Display the corpus contents.
inspect(docs)

# Transformation that replaces every match of `pattern` with a single space.
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))

# Replace "/", "@" and "|" with spaces.
docs <- tm_map(docs, toSpace, "/")
docs <- tm_map(docs, toSpace, "@")
# "|" is the regex alternation operator, so it must be escaped; the
# backslash itself has to be doubled inside an R string literal.
docs <- tm_map(docs, toSpace, "\\|")

Cleaning the text

Convert the text to lower case

# Fold the text to lower case. content_transformer() wraps base tolower()
# so that it can be passed to tm_map().
docs <- tm_map(x = docs, FUN = content_transformer(tolower))

Remove numbers

# Strip digits from the corpus.
docs <- tm_map(x = docs, FUN = removeNumbers)

Remove common English stopwords

# Drop the words returned by stopwords("english") from the corpus.
docs <- tm_map(docs, removeWords, stopwords("english"))

Remove your own stop words

specify your stopwords as a character vector

# Drop user-supplied stopwords; "blabla1" and "blabla2" are placeholders
# to be replaced with the words you actually want removed.
docs <- tm_map(docs, removeWords, c("blabla1", "blabla2"))

Remove punctuation

# Strip punctuation characters from the corpus.
docs <- tm_map(x = docs, FUN = removePunctuation)

Eliminate extra white spaces

# Collapse the extra whitespace left behind by the earlier removals.
docs <- tm_map(x = docs, FUN = stripWhitespace)

Text stemming

# Reduce each word to its stem.
docs <- tm_map(x = docs, FUN = stemDocument)

####### # Build a term-document matrix #######

# Build the term-document matrix and convert it to a plain matrix.
dtm <- TermDocumentMatrix(docs)
m <- as.matrix(dtm)
# Row sums give the total count per term; sort so the most frequent come first.
v <- sort(rowSums(m), decreasing = TRUE)
# Two-column frequency table: the term and its total count.
d <- data.frame(word = names(v), freq = v)
# Show the ten most frequent terms.
head(d, 10)

####### # Generate the Word cloud #######

# Fix the random seed so the cloud layout is reproducible between runs.
set.seed(1234)
# Draw the word cloud from the frequency table `d`.
wordcloud(
  words = d$word,
  freq = d$freq,
  min.freq = 1,
  max.words = 200,
  random.order = FALSE,
  rot.per = 0.35,
  colors = brewer.pal(8, "Dark2")
)

## Explore frequent terms and their associations
## Words which occur at least four times
findFreqTerms(dtm, lowfreq = 4)

## Terms associated with "freedom" (correlation limit 0.3)
findAssocs(dtm, terms = "freedom", corlimit = 0.3)

###### #bar plot # ######

# Bar chart of the ten most frequent words. head() is used instead of
# d[1:10, ] so that a table with fewer than ten terms does not yield NA bars.
barplot(
  head(d$freq, 10),
  las = 2,  # draw axis labels perpendicular to the axis
  names.arg = head(d$word, 10),
  col = "lightblue",
  main = "Most frequent words",
  ylab = "Word frequencies"
)