# Part 2 is just clustering
# DATA PROCESSING
# Normalise the corpus text: strip punctuation, digits, English stop words and
# surplus whitespace, then stem the remaining words.
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, removeWords, stopwords("english"))
docs <- tm_map(docs, stripWhitespace)
docs <- tm_map(docs, stemDocument)

# Build both orientations of the term matrix from the cleaned corpus.
# NOTE(review): `dtm` is not used by the code visible here.
dtm <- DocumentTermMatrix(docs)
tdm <- TermDocumentMatrix(docs)

# Cut the less frequent terms: findFreqTerms() is called with the frequency
# bounds 1 and 20, and every term it returns is dropped from `tdm`.
mystopwords <- findFreqTerms(tdm, 1, 20)

# Retained only so that any later code reading `mystpwrds` keeps working; it is
# no longer used for the filtering below.
mystpwrds <- paste(mystopwords, collapse = "|")

# Remove the rare terms by exact name. The previous regex/grepl approach
# matched substrings (a rare term "art" also removed "start"), interpreted
# regex metacharacters inside terms, and removed every term when the rare-term
# list was empty (an empty pattern matches everything).
all_terms <- Terms(tdm)
tdm <- tdm[all_terms[!(all_terms %in% mystopwords)], ]

# Variant of `tdm` with sparse terms removed (sparsity threshold 0.06).
# NOTE(review): `tdmss` is not referenced elsewhere in the visible script.
tdmss <- removeSparseTerms(tdm, 0.06)
# CLUSTERING
# Creating a hierarchical cluster dendrogram
# Pairwise distances between the rows of the term-document matrix, using the
# default (Euclidean) metric spelled out explicitly.
d <- dist(tdm, method = "euclidean")
# Disabled experiment kept for reference:
# rownames(d) <- tdm

# Agglomerative clustering with the default complete-linkage method.
hc <- hclust(d, method = "complete")

# Draw the dendrogram.
plot(hc)
# Alternative plotting via the rafalib package (disabled):
# library("rafalib")
# myplclust(hc, labels = hc$labels)

# Outline the clusters obtained by cutting the tree into k groups; k can be
# any value that suits the analysis.
rect.hclust(hc, k = 5)
