Word Cloud

Dahee Kim

2020-02-04

Pre-processing

# Clean the raw text step by step and build the word-frequency table `d`
# that the plots and tables further down read from.
# NOTE(review): `text` is not defined in this section -- presumably a
# character vector of free-text answers loaded earlier; confirm upstream.
# Lines starting with `##` are console output recorded at render time.
# NOTE(review): every tm_map() call below recorded the warning
# "transformation drops documents". Verify that no documents were actually
# lost (e.g. compare length(docs) before and after each step).

# Create a corpus from `text`
docs <- Corpus(VectorSource(text))
# Convert the text to lower case. The custom stop words near the end of the
# cleaning steps are written in lower case, so they presumably rely on this.
docs <- tm_map(docs, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(docs, content_transformer(tolower)):
## transformation drops documents
# Remove numbers
docs <- tm_map(docs, removeNumbers)
## Warning in tm_map.SimpleCorpus(docs, removeNumbers): transformation drops
## documents
# Remove the standard English stop words
docs <- tm_map(docs, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(docs, removeWords, stopwords("english")):
## transformation drops documents
# Remove punctuation
docs <- tm_map(docs, removePunctuation)
## Warning in tm_map.SimpleCorpus(docs, removePunctuation): transformation
## drops documents
# Eliminate extra white space
docs <- tm_map(docs, stripWhitespace)
## Warning in tm_map.SimpleCorpus(docs, stripWhitespace): transformation drops
## documents
# Remove the analysis-specific stop words listed here
docs <- tm_map(docs, removeWords, c("shrimp","nothing","none","nope","really","love")) 
## Warning in tm_map.SimpleCorpus(docs, removeWords, c("shrimp", "nothing", :
## transformation drops documents
## Create term-document matrix
tdm <- TermDocumentMatrix(docs)
# Convert to an ordinary matrix so rowSums() can total each term's count
m <- as.matrix(tdm)
# Named vector of per-term totals, most frequent term first
word <- sort(rowSums(m),decreasing=TRUE)
# Frequency table: `word` holds the term, `freq` its total count. The terms
# also remain as row names because `freq` is built from the named vector.
d <- data.frame(word = names(word),freq=word)
Palettes

Plot I

# Draw the static word cloud from the frequency table `d`.
# The RNG seed is fixed first, presumably so the layout is reproducible.
set.seed(1234)
wordcloud(
  words = d[["word"]],
  freq = d[["freq"]],
  min.freq = 3,
  max.words = 200,
  random.order = FALSE,
  rot.per = 0.35,
  use.r.layout = FALSE,
  colors = brewer.pal(8, "RdBu")
)

Plot II

# Second word cloud of the same frequency table, drawn with a pentagon shape
wordcloud2(data = d, size = 0.7, shape = "pentagon")

Word frequency table

# Show the top 10 words and their frequency
head(d, 10)
##          word freq
## like     like   53
## fresh   fresh   25
## caught caught   17
## good     good   15
## prefer prefer   14
## time     time   13
## eat       eat   13
## just     just   13
## wild     wild   12
## raised raised   10
# Bar chart of the same top-10 entries. head() is used instead of d[1:10, ]
# because indexing past the last row yields all-NA rows (NA bars and labels)
# when `d` holds fewer than 10 words; head() simply returns what exists.
barplot(head(d$freq, 10),
        las = 2,
        names.arg = head(d$word, 10),
        col = "lightblue",
        main = "Most frequent words",
        ylab = "Word frequencies")

Associated words

# Association between frequent terms
# List the terms in `tdm` that meet the frequency floor of 50
findFreqTerms(tdm, lowfreq = 50)
## [1] "like"
# Terms associated with "like", using 0.2 as the correlation limit
asso <- findAssocs(tdm, terms = "like", corlimit = 0.2)
head(asso)
## $like
##        agent    cardboard contaminated          etc         even 
##         0.26         0.26         0.26         0.26         0.26 
##          fed         fond          gmo       highly       orange 
##         0.26         0.26         0.26         0.26         0.26 
##     possibly    something      vietnam          wet        worse 
##         0.26         0.26         0.26         0.26         0.26 
##         farm 
##         0.23
# Bar chart of the association scores returned for "like"
barplot(
  asso[["like"]],
  main = "like Distribution",
  horiz = FALSE,
  las = 2
)