# Pre-processing ----
# Build a cleaned tm corpus from `text` and derive a word-frequency table.
# Assumes `text` (defined outside this section) is a character vector with
# one document per element -- TODO confirm against where it is loaded.
# `d` and `tdm` created here are reused by the later sections.
# Lines starting with `##` are console output recorded from an earlier run.

# Create a corpus
docs <- Corpus(VectorSource(text))
# Convert the text to lower case
docs <- tm_map(docs, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(docs, content_transformer(tolower)):
## transformation drops documents
# Remove numbers
docs <- tm_map(docs, removeNumbers)
## Warning in tm_map.SimpleCorpus(docs, removeNumbers): transformation drops
## documents
# Remove stopwords for the language
# NOTE(review): presumably kept before punctuation removal so that stopwords
# containing apostrophes still match -- confirm before reordering.
docs <- tm_map(docs, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(docs, removeWords, stopwords("english")):
## transformation drops documents
# Remove punctuations
docs <- tm_map(docs, removePunctuation)
## Warning in tm_map.SimpleCorpus(docs, removePunctuation): transformation
## drops documents
# Remove your own stopwords
docs <- tm_map(
  docs, removeWords,
  c("shrimp", "nothing", "none", "nope", "really", "love")
)
## Warning in tm_map.SimpleCorpus(docs, removeWords, c("shrimp", "nothing", :
## transformation drops documents
# Eliminate extra white spaces
# Done last: every removal step above leaves gaps where text used to be, so
# stripping whitespace earlier would leave the final removal's gaps behind.
docs <- tm_map(docs, stripWhitespace)
## Warning in tm_map.SimpleCorpus(docs, stripWhitespace): transformation drops
## documents
## Create term-document matrix
tdm <- TermDocumentMatrix(docs)
m <- as.matrix(tdm)
# Total count of each term across all documents, most frequent first
word <- sort(rowSums(m), decreasing = TRUE)
d <- data.frame(word = names(word), freq = word)
# Plot I ----
# Draw the word cloud from the frequency table `d`.
cloud_colors <- brewer.pal(8, "RdBu")
# Fix the RNG seed right before plotting so repeated runs start from the
# same random state.
set.seed(1234)
wordcloud(
  words = d$word,
  freq = d$freq,
  min.freq = 3,
  max.words = 200,
  random.order = FALSE,
  rot.per = 0.35,
  use.r.layout = FALSE,
  colors = cloud_colors
)

# Words frequency table ----
# Show the top10 words and their frequency
# head() feeds both the table and the plot, so a vocabulary with fewer than
# ten terms does not yield NA rows (which d[1:10, ] would produce).
top_words <- head(d, 10)
top_words
## word freq
## like like 53
## fresh fresh 25
## caught caught 17
## good good 15
## prefer prefer 14
## time time 13
## eat eat 13
## just just 13
## wild wild 12
## raised raised 10
# Bar chart of the same rows; las = 2 turns the axis labels perpendicular
# to the axis so the words stay readable.
barplot(
  top_words$freq,
  las = 2,
  names.arg = top_words$word,
  col = "lightblue",
  main = "Most frequent words",
  ylab = "Word frequencies"
)

# Associated words ----
# Association between frequent terms
# List the terms that occur at least 50 times in the term-document matrix.
findFreqTerms(tdm, lowfreq = 50)
## [1] "like"
# Terms whose correlation with "like" reaches the 0.2 limit.
asso <- findAssocs(tdm, terms = "like", corlimit = 0.2)
head(asso)
## $like
## agent cardboard contaminated etc even
## 0.26 0.26 0.26 0.26 0.26
## fed fond gmo highly orange
## 0.26 0.26 0.26 0.26 0.26
## possibly something vietnam wet worse
## 0.26 0.26 0.26 0.26 0.26
## farm
## 0.23
# Plot the correlations for "like" as vertical bars.
like_assoc <- asso[["like"]]
barplot(
  like_assoc,
  las = 2,
  horiz = FALSE,
  main = "like Distribution"
)
