# title: "Untitled"
# author: "Word Cloud"
# date: "May 18, 2016"
# output: html_document
#
# WORD CLOUD
# Install the packages this script depends on. Packages that are already
# available are skipped, so re-running the script does not reinstall them.
required_pkgs <- c(
  "tm",           # for text mining
  "SnowballC",    # for text stemming
  "wordcloud",    # word-cloud generator
  "RColorBrewer"  # color palettes
)
is_installed <- vapply(required_pkgs, requireNamespace, logical(1),
                       quietly = TRUE)
missing_pkgs <- required_pkgs[!is_installed]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}
# Attach the packages used below (one call per line so the file parses).
library("tm")
library("SnowballC")
library("wordcloud")
library("RColorBrewer")
# Read the source text from the remote file; readLines() returns one
# character element per line of the file.
filePath <- "http://www.sthda.com/sthda/RDoc/example-files/martin-luther-king-i-have-a-dream-speech.txt"
text <- readLines(filePath)
# text <- getwd()
## Number of documents
# length(dir(text))
## list file names
# dir(text)

# Build a corpus in which every element of `text` is one document, then
# print it for a quick visual check.
docs <- Corpus(VectorSource(text))
inspect(docs)
# Text cleaning. Each step replaces `docs` with a transformed corpus, so
# the order of the steps matters.

# Transformer that replaces every match of the regex `pattern` with a space.
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
docs <- tm_map(docs, toSpace, "/")
docs <- tm_map(docs, toSpace, "@")
# "|" is the regex alternation operator, so it is escaped to match a
# literal pipe character.
docs <- tm_map(docs, toSpace, "\\|")

# Convert the text to lower case
docs <- tm_map(docs, content_transformer(tolower))
# Remove numbers
docs <- tm_map(docs, removeNumbers)
# Remove common English stop words
docs <- tm_map(docs, removeWords, stopwords("english"))
# Remove custom stop words (placeholders; replace with your own)
docs <- tm_map(docs, removeWords, c("blabla1", "blabla2"))
# Remove punctuation
docs <- tm_map(docs, removePunctuation)
# Collapse extra white space
docs <- tm_map(docs, stripWhitespace)
# Reduce words to their stems
docs <- tm_map(docs, stemDocument)
#######
# Build a term-document matrix
#######
dtm <- TermDocumentMatrix(docs)
m <- as.matrix(dtm)
# Total count per row across all documents, most frequent first; the row
# names become the `word` column below.
v <- sort(rowSums(m), decreasing = TRUE)
d <- data.frame(word = names(v), freq = v)
# Show the ten most frequent entries
head(d, 10)
#######
# Generate the Word cloud
#######
# Fix the RNG seed so repeated runs use the same random sequence.
set.seed(1234)
wordcloud(words = d$word, freq = d$freq, min.freq = 1,
          max.words = 200, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))
## Explore frequent terms and their associations
## Words which occur at least four times
findFreqTerms(dtm, lowfreq = 4)

## Terms associated with "freedom" (correlation limit 0.3)
findAssocs(dtm, terms = "freedom", corlimit = 0.3)
######
# Bar plot
######
# head() returns at most ten rows, so a short table does not introduce
# NA rows the way d[1:10, ] would.
top_terms <- head(d, 10)
barplot(top_terms$freq, las = 2, names.arg = top_terms$word,
        col = "lightblue", main = "Most frequent words",
        ylab = "Word frequencies")