# Attach the packages this script relies on; without these calls the
# functions used below are not found in a fresh R session.
library(tm)            # Corpus, VectorSource, tm_map, TermDocumentMatrix
library(SnowballC)     # stemming backend used by tm's stemDocument
library(wordcloud)     # wordcloud()
library(RColorBrewer)  # brewer.pal()
library(wordcloud2)    # wordcloud2()
#1. Read the text file from your computer
# readLines() returns a character vector with one element per line of the file.
txtdata <- readLines("~/Documents/academics/squ/Research/Collaboration/Maryam/Westext.txt")
#2. Load the data as a corpus
# VectorSource() treats every element (here: every line) as its own document.
doc <- Corpus(VectorSource(txtdata))
#3. Text transformation
# toSpc wraps gsub() as a tm content transformer: every match of `pattern`
# in the document text is replaced by a single space.
toSpc <- content_transformer(
  function(x, pattern) {
    gsub(pattern, " ", x)
  }
)
# Apply it to "/", "@" and "|" in that order. The pipe is escaped because
# the pattern is interpreted as a regular expression.
doc <- Reduce(
  function(corpus, pattern) tm_map(corpus, toSpc, pattern),
  c("/", "@", "\\|"),
  doc
)
#4. Some Cleaning
# The order of the steps matters:
#   * stop words are removed while apostrophes are still present, so that
#     contracted entries of the stop-word list can still match;
#   * punctuation is removed BEFORE stemming, otherwise a word followed by a
#     comma or period (e.g. "factors,") keeps its suffix and is later counted
#     separately from its stemmed form.
# Convert the text to lower case
doc <- tm_map(doc, content_transformer(tolower))
# Remove numbers
doc <- tm_map(doc, removeNumbers)
# Remove common English stop words (articles, prepositions, conjunctions, ...)
doc <- tm_map(doc, removeWords, stopwords("english"))
# Remove punctuation
doc <- tm_map(doc, removePunctuation)
# Reduce each word to its common stem (e.g. "studies" -> "studi")
doc <- tm_map(doc, stemDocument)
#5. Term document matrix
# Count how often each term occurs in each document, then convert the
# result to an ordinary matrix so that rowSums() can total every term.
dtm <- TermDocumentMatrix(doc)
m <- as.matrix(dtm)
# Named vector of total counts per term, most frequent first
v <- sort(rowSums(m), decreasing = TRUE)
# Frequency table with one row per term
d <- data.frame(word = names(v), freq = v)
# Show the ten most frequent terms
head(d, n = 10)
# Sample output from an earlier run (the first column is the row name):
## word freq
## autism autism 5
## factor factor 5
## children children 4
## oman oman 4
## child child 4
## studi studi 4
## statist statist 3
## number number 3
## status status 3
## among among 2
#6. Word Cloud
# Fix the random seed before drawing the first cloud.
set.seed(1234)
# Static cloud: at most 30 terms that occur at least twice, plotted in
# decreasing frequency (random.order = FALSE), a quarter of them rotated
# by 90 degrees, coloured with the 8-colour "Dark2" palette.
wordcloud(
  words = d$word,
  freq = d$freq,
  min.freq = 2,
  max.words = 30,
  random.order = FALSE,
  rot.per = 0.25,
  colors = brewer.pal(8, "Dark2")
)

# Second cloud drawn with wordcloud2 from the full frequency table.
wordcloud2(data = d, size = 1.4, color = "random-dark")