Text Mining Part 2 (Hierarchical Clustering)

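Part 2 covers hierarchical clustering only; the data-processing steps below prepare the term-document matrix that is clustered.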

DATA PROCESSING

# Clean the corpus. Each tm_map() call applies one transformation to every
# document in `docs` and the result replaces `docs`.
# NOTE(review): stopwords("english") is lower-case, so this presumably relies
# on `docs` having been lower-cased before this point -- confirm.
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, removeWords, stopwords("english"))
docs <- tm_map(docs, stripWhitespace)
docs <- tm_map(docs, stemDocument)

# Build both orientations of the count matrix from the cleaned corpus.
dtm <- DocumentTermMatrix(docs)
tdm <- TermDocumentMatrix(docs)

# Terms whose frequency lies between 1 and 20 are treated as too infrequent
# and are cut from the term-document matrix.
mystopwords <- findFreqTerms(tdm, lowfreq = 1, highfreq = 20)

# Regex alternation of the infrequent terms. It is no longer used for the
# filtering below; it is still defined in case later code references it.
mystpwrds <- paste(mystopwords, collapse = "|")

# Drop the infrequent terms by exact match. Matching with
# grepl(mystpwrds, ...) removed every term that merely *contains* an
# infrequent term as a substring, interpreted regex metacharacters inside
# terms, and removed all terms when `mystopwords` was empty (the pattern ""
# matches everything).
all_terms <- tdm$dimnames$Terms
kept_terms <- all_terms[!(all_terms %in% mystopwords)]
tdm <- tdm[kept_terms, ]

# Variant of the filtered matrix with sparse terms removed (sparsity
# threshold 0.06). The clustering code visible further down works on `tdm`,
# not on `tdmss`.
tdmss <- removeSparseTerms(tdm, 0.06)

CLUSTERING

Creating a hierarchical cluster dendrogram

# Pairwise distances between the rows of `tdm`; "euclidean" is dist()'s
# default method, spelled out here for clarity.
d <- dist(tdm, method = "euclidean")

# Agglomerative hierarchical clustering; "complete" linkage is hclust()'s
# default method.
hc <- hclust(d, method = "complete")

# Draw the dendrogram.
plot(hc)

# Alternative plot via the rafalib package (left disabled):
# library("rafalib")
# myplclust(hc, labels = hc$labels)

# Outline k clusters on the dendrogram; 5 is an arbitrary choice and any
# other number of clusters can be used instead.
rect.hclust(hc, k = 5)