This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Per-column summary statistics for the built-in `cars` data set
# (columns `speed` and `dist`); the echoed output follows.
summary(datasets::cars)
##      speed           dist
##  Min.   : 4.0   Min.   :  2.00
##  1st Qu.:12.0   1st Qu.: 26.00
##  Median :15.0   Median : 36.00
##  Mean   :15.4   Mean   : 42.98
##  3rd Qu.:19.0   3rd Qu.: 56.00
##  Max.   :25.0   Max.   :120.00
You can also embed plots, for example:
Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.
# Packages this analysis depends on. "SnowballC" (previously misspelled as
# "SnowballCC") is the stemming package loaded during preprocessing.
Needed <- c(
  "tm", "SnowballC", "RColorBrewer", "ggplot2", "wordcloud",
  "biclust", "cluster", "igraph", "fpc"
)

# Install only what is missing, so re-running the document does not reinstall
# every package each time.
missing_pkgs <- setdiff(Needed, rownames(installed.packages()))
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs, dependencies = TRUE)
}

# Rcampdf is installed from source from the repository given below rather
# than from the default repositories.
if (!requireNamespace("Rcampdf", quietly = TRUE)) {
  install.packages(
    "Rcampdf",
    repos = "http://datacube.wu.ac.at/",
    type = "source"
  )
}
# Folder that holds the text files to analyse.
# The path was previously set to the home-directory Desktop folder and then
# unconditionally overwritten with the Windows one; choose the location that
# matches the current platform instead.
cname <- if (.Platform$OS.type == "windows") {
  file.path("C:", "texts")
} else {
  file.path("~", "Desktop", "texts")
}
cname
dir(cname) # Use this to check that your texts are where `cname` points.
########################################################################################## ##########################################################################################
library(tm)
library(SnowballC) # Stemming package; this call was previously lost in a comment.

# Read every file in the `cname` folder into a corpus.
docs <- Corpus(DirSource(cname))

## Preprocessing
docs <- tm_map(docs, removePunctuation) # Remove punctuation.
docs <- tm_map(docs, removeNumbers) # Remove numbers.
# tolower() is a plain character function rather than a tm transformation;
# content_transformer() wraps it so the corpus elements remain documents.
docs <- tm_map(docs, content_transformer(tolower)) # Convert to lowercase.
docs <- tm_map(docs, removeWords, stopwords("english")) # Remove stopwords.
docs <- tm_map(docs, stemDocument) # Remove common word endings ("ing", "es").
docs <- tm_map(docs, stripWhitespace) # Collapse repeated whitespace.
# NOTE(review): the former `tm_map(docs, PlainTextDocument)` step was dropped.
# With tolower wrapped above the documents should not need re-casting, and
# that step discards document metadata in current tm releases -- confirm
# against the tm version in use.
## This is the end of the preprocessing stage.
# Build the term matrix in both orientations from the preprocessed corpus.
dtm <- DocumentTermMatrix(docs)
tdm <- TermDocumentMatrix(docs)

# Convert to a dense matrix once and reuse it; `dtm` was previously
# converted twice (once for `freq`, once for `m`).
m <- as.matrix(dtm)

freq <- colSums(m) # Column totals of the document-term matrix.
length(freq)
ord <- order(freq) # Ordering permutation of `freq` (ascending).
dim(m)

# Export the dense matrix for inspection outside R.
write.csv(m, file = "DocumentTermMatrix.csv")
### FOCUS - on just the interesting stuff...
# Start by removing sparse terms; `sparse = 0.1` allows at most 10% sparsity.
dtms <- removeSparseTerms(dtm, sparse = 0.1)

### Word Frequency
# The output below is two rows of numbers: the top row is a frequency with
# which words appear, the bottom row is how many words appear that often.
head(table(freq), n = 20)
# tail(table(freq), 20)

# Terms whose frequency is at least `lowfreq`; change 400 to whatever is
# most appropriate for your data.
findFreqTerms(dtm, lowfreq = 400)
### Plot Word Frequencies
# Bar chart of the words that appear more than 50 times.
library(ggplot2)

# One row per term: the term itself and its frequency from `freq`.
wf <- data.frame(word = names(freq), freq = freq)

p <- ggplot(subset(wf, freq > 50), aes(word, freq))
p <- p + geom_bar(stat = "identity") # Bar height is the value in `freq`.
# Tilt the x-axis labels by 45 degrees.
p <- p + theme(axis.text.x = element_text(angle = 45, hjust = 1))
p
#
## Relationships Between Terms
### Term Correlations
# If words always appear together, then correlation = 1.0.
# Change "dan" and "klik" to terms that actually appear in your texts, and
# adjust `corlimit` to any value you feel is necessary.
findAssocs(dtm, c("dan", "klik"), corlimit = 0.98) # Correlation limit of 0.98.

### Word Clouds!
# First load the package that makes word clouds in R.
library(wordcloud)

dtms <- removeSparseTerms(dtm, 0.15) # Prepare the data (max 15% sparsity).
# NOTE(review): the frequencies below are taken from the full `dtm`, not from
# the reduced `dtms` computed above -- confirm which one is intended.
freq <- colSums(as.matrix(dtm)) # Find word frequencies.
dark2 <- brewer.pal(6, "Dark2") # Six colours from the "Dark2" palette.
wordcloud(
  names(freq), freq,
  max.words = 100, rot.per = 0.2, colors = dark2
)
# Hierarchical clustering of terms.
# `library(cluster)` and the `fit <- hclust(...)` assignment were previously
# swallowed by trailing comments, so `fit` was never defined; the plotting
# statements also shared a single line, which does not parse.
library(cluster)

dtms <- removeSparseTerms(dtm, 0.15) # Keep terms with at most 15% sparsity.

# First calculate the distance between words (t() puts terms in rows).
d <- dist(t(dtms), method = "euclidean")
fit <- hclust(d = d, method = "ward.D")

plot.new()
plot(fit, hang = -1)
groups <- cutree(fit, k = 5) # "k" defines the number of clusters you are using.
# Draw the dendrogram with red borders around the 5 clusters.
rect.hclust(fit, k = 5, border = "red")
# K-means clustering of terms, drawn as a cluster plot.
library(fpc)
library(cluster)

dtms <- removeSparseTerms(dtm, 0.15) # Prepare the data (max 15% sparsity).
d <- dist(t(dtms), method = "euclidean") # Distances between terms.
kfit <- kmeans(d, 2) # Partition into 2 clusters.

# TRUE/FALSE are spelled out because T and F are ordinary, reassignable
# variables.
clusplot(
  as.matrix(d), kfit$cluster,
  color = TRUE, shade = TRUE, labels = 2, lines = 0
)