#Create a folder to store all the text files. Here, the folder is C:/texts.
Texts <- file.path("C:", "texts")
dir(Texts)
## [1] "Trump Black History Month Speech.txt"
## [2] "Trump CIA Speech.txt"
## [3] "Trump Congressional Address.txt"
## [4] "Trump CPAC Speech.txt"
## [5] "Trump Florida Rally 2-18-17.txt"
## [6] "Trump Immigration Speech 8-31-16.txt"
## [7] "Trump Inauguration Speech.txt"
## [8] "Trump National Prayer Breakfast.txt"
## [9] "Trump Nomination Speech.txt"
## [10] "Trump Police Chiefs Speech.txt"
## [11] "Trump Response to Healthcare Bill Failure.txt"
#Load package "tm" and all the text files in C:/Texts
library(tm)
## Loading required package: NLP
#Create the corpus from a directory source pointing at the folder of text files
docs <- VCorpus(DirSource(Texts))
summary(docs)
## Length Class
## Trump Black History Month Speech.txt 2 PlainTextDocument
## Trump CIA Speech.txt 2 PlainTextDocument
## Trump Congressional Address.txt 2 PlainTextDocument
## Trump CPAC Speech.txt 2 PlainTextDocument
## Trump Florida Rally 2-18-17.txt 2 PlainTextDocument
## Trump Immigration Speech 8-31-16.txt 2 PlainTextDocument
## Trump Inauguration Speech.txt 2 PlainTextDocument
## Trump National Prayer Breakfast.txt 2 PlainTextDocument
## Trump Nomination Speech.txt 2 PlainTextDocument
## Trump Police Chiefs Speech.txt 2 PlainTextDocument
## Trump Response to Healthcare Bill Failure.txt 2 PlainTextDocument
## Mode
## Trump Black History Month Speech.txt list
## Trump CIA Speech.txt list
## Trump Congressional Address.txt list
## Trump CPAC Speech.txt list
## Trump Florida Rally 2-18-17.txt list
## Trump Immigration Speech 8-31-16.txt list
## Trump Inauguration Speech.txt list
## Trump National Prayer Breakfast.txt list
## Trump Nomination Speech.txt list
## Trump Police Chiefs Speech.txt list
## Trump Response to Healthcare Bill Failure.txt list
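#Optional sanity check (a suggested extra step): pull one document out of the
#corpus by its file name and print its first few lines. Any file name returned
#by dir(Texts) should work here.
writeLines(head(as.character(docs[["Trump Inauguration Speech.txt"]]), 3))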
#Clean the text: remove punctuation, special characters, numbers, capitalization,
#and common English stop words.
docs <- tm_map(docs, removePunctuation)
#Replace slashes, @ signs, pipes, and Unicode line separators with spaces so that
#the words around them are not run together.
for (j in seq(docs)) {
  docs[[j]] <- gsub("/", " ", docs[[j]])
  docs[[j]] <- gsub("@", " ", docs[[j]])
  docs[[j]] <- gsub("\\|", " ", docs[[j]])
  docs[[j]] <- gsub("\u2028", " ", docs[[j]])
}
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, tolower)
#gsub() and tolower() return plain character vectors, so re-wrap the documents as
#PlainTextDocuments before building the matrices.
docs <- tm_map(docs, PlainTextDocument)
#Keep a copy of the corpus before the stop words are removed.
DocsCopy <- docs
docs <- tm_map(docs, removeWords, stopwords("english"))
docs <- tm_map(docs, PlainTextDocument)
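#Optional extra cleaning step: the substitutions above replace characters with
#spaces, which can leave runs of extra whitespace. stripWhitespace (a standard
#tm transformation) collapses them and should not affect the term counts below.
docs <- tm_map(docs, stripWhitespace)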
####Preparing the data####
#Create a document-term matrix, which records how often each term appears in each document.
dtm <- DocumentTermMatrix(docs)
dtm
## <<DocumentTermMatrix (documents: 11, terms: 3695)>>
## Non-/sparse entries: 8447/32198
## Sparsity : 79%
## Maximal term length: 18
## Weighting : term frequency (tf)
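#To peek at a corner of the matrix, inspect() can display a small slice of
#documents and terms (the indices here are arbitrary).
inspect(dtm[1:3, 1:6])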
####Transpose the matrix####
#The term-document matrix is simply the transpose of the document-term matrix.
tdm <- TermDocumentMatrix(docs)
tdm
## <<TermDocumentMatrix (terms: 3695, documents: 11)>>
## Non-/sparse entries: 8447/32198
## Sparsity : 79%
## Maximal term length: 18
## Weighting : term frequency (tf)
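#A quick optional check: tdm should hold the same counts as dtm with the rows
#and columns swapped.
all.equal(as.matrix(tdm), t(as.matrix(dtm)))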
####Organize words by Frequency####
freq <- colSums(as.matrix(dtm))
length(freq)
## [1] 3695
ord <- order(freq)
m <- as.matrix(dtm)
dim(m)
## [1] 11 3695
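#ord holds the column indices sorted by frequency, so it can be used to list the
#rarest and the most common terms directly.
freq[head(ord, 10)]  # 10 least frequent terms
freq[tail(ord, 10)]  # 10 most frequent terms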
#Export the full document-term matrix to a CSV file
write.csv(m, file="DocumentTermMatrix.csv")
#Remove sparse terms: drop any term that is missing from at least 20% of the documents
dtms <- removeSparseTerms(dtm, 0.2)
dtms
## <<DocumentTermMatrix (documents: 11, terms: 87)>>
## Non-/sparse entries: 848/109
## Sparsity : 11%
## Maximal term length: 11
## Weighting : term frequency (tf)
####Most and least frequently occurring words####
freq <- colSums(as.matrix(dtm))
tail(table(freq), 20) # table(freq) counts how many terms occur at each frequency; ", 20" keeps only the 20 highest frequencies
## freq
## 79 83 88 89 98 100 101 102 105 107 111 122 127 139 140 163 174 265
## 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 278 428
## 1 1
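#findFreqTerms() (also from tm) offers another view of the common vocabulary: it
#lists every term that occurs at least a given number of times.
findFreqTerms(dtm, lowfreq = 100)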
#Here is the frequency table after sparse terms have been removed
freq <- colSums(as.matrix(dtms))
freq
## also always america american another back
## 54 24 122 107 22 75
## bad believe big came can care
## 35 60 45 20 100 37
## come country day different done enforcement
## 55 174 36 16 24 43
## even ever every get getting give
## 55 42 49 79 23 25
## going good great group happen job
## 265 58 163 20 36 38
## just know last law let life
## 88 127 44 59 40 27
## like little long look lot love
## 79 24 36 52 44 45
## made many much must nation need
## 32 101 68 53 48 32
## never new now office one people
## 83 69 111 24 139 278
## president put really remember right safe
## 44 35 57 27 102 35
## said say see seen something special
## 83 66 48 34 25 26
## states take tell thank things think
## 65 64 50 105 40 77
## time today together totally truly understand
## 76 33 34 18 16 29
## united want way well will work
## 64 140 71 51 428 64
## world year years
## 56 47 54
####Sort most frequent words in descending order####
freq <- sort(colSums(as.matrix(dtm)), decreasing=TRUE)
head(freq, 20)#top 20 words
## will people going country great want one know
## 428 278 265 174 163 140 139 127
## america now american thank right many can â\200”
## 122 111 107 105 102 101 100 98
## theyre just never said
## 89 88 83 83
#Create a data frame of words and frequencies for plotting
df <- data.frame(word=names(freq), freq=freq)
head(df)
## word freq
## will will 428
## people people 278
## going going 265
## country country 174
## great great 163
## want want 140
#####Plot Word Frequency#########
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
#Plot a bar chart of the words that appear at least 50 times
p <- ggplot(subset(df, freq > 50), aes(x = reorder(word, -freq), y = freq)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
p
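#To save the chart to disk, ggsave() from ggplot2 writes a plot object to a
#file; the file name and dimensions here are just examples.
ggsave("word_frequency.png", plot = p, width = 8, height = 5)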
#####Calculate term correlations#####
#For a given term, identify the words that correlate with it most highly. If two words always appear together, their correlation is 1.0.
findAssocs(dtm, c("america" , "race"), corlimit=0.90)
## $america
## foreign millions
## 0.93 0.90
##
## $race
## terrorist difference elite independent lies
## 0.99 0.98 0.98 0.98 0.98
## manipulation melania projects seeing signing
## 0.98 0.98 0.98 0.98 0.98
## sisters understands workforce control presidency
## 0.98 0.98 0.98 0.96 0.96
## greatest next legacy
## 0.92 0.91 0.90
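#Lowering corlimit casts a wider net; this optional example (the term and the
#cutoff are arbitrary choices) looks for words associated with "country".
findAssocs(dtm, "country", corlimit = 0.85)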
#####Create word cloud######
library(wordcloud)
## Loading required package: RColorBrewer
freq <- colSums(as.matrix(dtm)) # Word frequencies from the full document-term matrix; max.words below caps the cloud at the 100 most frequent words
dark2 <- brewer.pal(6, "Dark2")
wordcloud(names(freq), freq, max.words=100, rot.per=0.2, colors=dark2)
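#Word clouds place words at random, so the layout changes between runs; fixing
#the random seed first (optional) makes the cloud reproducible, and min.freq
#drops the rarest words before plotting.
set.seed(142)
wordcloud(names(freq), freq, min.freq = 25, max.words = 100, rot.per = 0.2, colors = dark2)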
#####Hierarchical Clustering#####
#Cluster the most frequent terms (those kept by removeSparseTerms) by how
#similarly they are distributed across the eleven documents.
dtms <- removeSparseTerms(dtm, 0.15)
library(cluster)
d <- dist(t(dtms), method="euclidean") # distances between terms
fit <- hclust(d=d, method="complete")
plot(fit, hang=-1)
#Cut the dendrogram into 6 clusters and outline them on the plot
groups <- cutree(fit, k=6)
rect.hclust(fit, k=6, border="purple")
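#To see which words ended up in each cluster, the groups vector returned by
#cutree() can be split by cluster number.
split(names(groups), groups)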