# Set the NOAWT environment variable for this R session before any packages
# are attached.
# NOTE(review): presumably this keeps Java-backed dependencies from starting
# AWT (GUI) support -- confirm it is still needed.
Sys.setenv(NOAWT = TRUE)
library(tm)
## Loading required package: NLP
library(magrittr)
library(SnowballC)
library(ggplot2)
##
## Attaching package: 'ggplot2'
##
## The following object is masked from 'package:NLP':
##
## annotate
library(ggdendro)
# Build the corpus from every file in the cleaned tracts directory and
# normalise the text for term counting.
#
# The order of the transformations matters:
#   1. Lower-case first, so capitalised stopwords ("The", "And") match the
#      all-lower-case list returned by stopwords("english").
#   2. Remove stopwords before stripping punctuation, so contractions such
#      as "don't" still match their entries in the stopword list.
#   3. Strip punctuation and digits.
#   4. Stem last, so stopword matching sees whole words rather than stems.
my.corpus <- Corpus(DirSource("~/dev/tracts-for-the-times/clean/")) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english")) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers) %>%
  tm_map(stemDocument)
# Document-term matrix of the pre-processed corpus: one row per document,
# one column per term. Cells are weighted by TF-IDF, and tm's default
# stopword removal is requested again at tokenisation time.
my.dtm <- my.corpus %>%
  DocumentTermMatrix(
    control = list(
      weighting = weightTfIdf,
      stopwords = TRUE
    )
  )
# Hierarchical clustering of the documents.
#
# The sparse document-term matrix is densified, each term column is centred
# and scaled, pairwise distances between documents are computed with dist()'s
# defaults, and the result is clustered with hclust().
#
# scale() works on (and returns) a matrix, so the dense matrix is passed to it
# directly; no intermediate data frame is needed.
#
# NOTE(review): ?hclust states that "ward.D" on unsquared distances does not
# implement Ward's criterion, whereas "ward.D2" does. The method is left
# unchanged here so the existing tree is reproduced -- confirm which is
# intended.
fit <- my.dtm %>%
  as.matrix() %>%
  scale() %>%
  dist() %>%
  hclust(method = "ward.D")
# The plot:
# Draw the cluster dendrogram with base graphics, then outline in red the
# groups obtained by cutting the tree into eight clusters.
plot(x = fit)
rect.hclust(tree = fit, k = 8, border = "red")
# In ggplot2:
# The same tree rendered through ggdendro/ggplot2, rotated so the leaves run
# along the vertical axis, with a plot title added.
ggdendrogram(data = fit, rotate = TRUE) +
  ggtitle(label = "Tracts for the Times")