# Session setup and corpus loading ----

# NOTE(review): NOAWT is presumably set so that Java-backed dependencies do
# not start AWT; confirm it is still needed on the target machine.
Sys.setenv(NOAWT = TRUE)

# library() stops with an error when tm is not installed; require() would
# only return FALSE and let the script fail later with a less useful message.
library(tm)
## Loading required package: NLP

# Build the corpus from the files found in the source directory.
my.corpus <- Corpus(DirSource("~/CLIO-3/tracts-for-the-times"))

# List the transformations tm provides for use with tm_map(). The function
# has to be called: naming it without parentheses prints its source instead.
getTransformations()
## [1] "removeNumbers"     "removePunctuation" "removeWords"
## [4] "stemDocument"      "stripWhitespace"
# Corpus preprocessing ----
#
# Order matters here:
#   1. Lower-case first, so the lower-case stopword lists also match
#      capitalised tokens ("The", "History", "Clio").
#   2. Remove stopwords before stripping punctuation, because
#      stopwords("english") contains apostrophe forms ("don't", "isn't")
#      that would no longer match once the apostrophes are gone.
#   3. Strip punctuation, stem, and drop digits last.

# stemDocument() relies on SnowballC; fail early if it is not installed.
library(SnowballC)

my.corpus <- tm_map(my.corpus, content_transformer(tolower))
my.corpus <- tm_map(my.corpus, removeWords, stopwords("english"))

# Project-specific stopwords (kept lower-case to match the lower-cased text).
my.stops <- c("history", "clio", "programming")
my.corpus <- tm_map(my.corpus, removeWords, my.stops)

# Optional: load additional stopwords from a file instead.
# my.list <- unlist(
#   read.table("PATH TO STOPWORD FILE", stringsAsFactors = FALSE)
# )
# my.stops <- c(my.list)
# my.corpus <- tm_map(my.corpus, removeWords, my.stops)

my.corpus <- tm_map(my.corpus, removePunctuation)
my.corpus <- tm_map(my.corpus, stemDocument)
my.corpus <- tm_map(my.corpus, removeNumbers)
#library(magrittr)
#my.tdm <- TermDocumentMatrix(my.corpus)
#inspect(my.tdm)
#my.dtm <- DocumentTermMatrix(my.corpus, control = list(weighting = weightTfIdf, stopwords = TRUE))
#inspect(my.dtm)
#my.tdm %>%
# inspect() %>%
#as.data.frame() %>%
# View()
#my.dtm %>%
# inspect() %>%
# as.data.frame() %>%
# View()
#my.tdm <- removeSparseTerms(my.tdm, 0.2)
#findFreqTerms(my.tdm, 2)
#findAssocs(my.tdm, 'mine', 0.20)
#my.df <- as.data.frame(inspect(my.tdm))
#my.df.scale <- scale(my.df)
#d <- dist(my.df.scale,method="euclidean")
#fit <- hclust(d, method="ward.D")
#plot(fit)
#my.df <- as.data.frame(inspect(my.dtm))
#my.df.scale <- scale(my.df)
#d <- dist(my.df.scale,method="euclidean")
#fit <- hclust(d, method="ward.D")
#plot(fit)