#Create a folder to store all the text files. Here, the folder is C:/texts.
Texts <- file.path("C:", "texts")
dir(Texts)
## [1] "Trump Black History Month Speech.txt"
## [2] "Trump CIA Speech.txt"
## [3] "Trump Congressional Address.txt"
## [4] "Trump CPAC Speech.txt"
## [5] "Trump Florida Rally 2-18-17.txt"
## [6] "Trump Immigration Speech 8-31-16.txt"
## [7] "Trump Inauguration Speech.txt"
## [8] "Trump National Prayer Breakfast.txt"
## [9] "Trump Nomination Speech.txt"
## [10] "Trump Police Chiefs Speech.txt"
## [11] "Trump Response to Healthcare Bill Failure.txt"
#Load package "tm" and all the text files in C:/Texts
library(tm)
## Loading required package: NLP
#Create the corpus from a directory source pointing at the folder of text files
docs <- VCorpus(DirSource(Texts))
summary(docs)
## Length Class
## Trump Black History Month Speech.txt 2 PlainTextDocument
## Trump CIA Speech.txt 2 PlainTextDocument
## Trump Congressional Address.txt 2 PlainTextDocument
## Trump CPAC Speech.txt 2 PlainTextDocument
## Trump Florida Rally 2-18-17.txt 2 PlainTextDocument
## Trump Immigration Speech 8-31-16.txt 2 PlainTextDocument
## Trump Inauguration Speech.txt 2 PlainTextDocument
## Trump National Prayer Breakfast.txt 2 PlainTextDocument
## Trump Nomination Speech.txt 2 PlainTextDocument
## Trump Police Chiefs Speech.txt 2 PlainTextDocument
## Trump Response to Healthcare Bill Failure.txt 2 PlainTextDocument
## Mode
## Trump Black History Month Speech.txt list
## Trump CIA Speech.txt list
## Trump Congressional Address.txt list
## Trump CPAC Speech.txt list
## Trump Florida Rally 2-18-17.txt list
## Trump Immigration Speech 8-31-16.txt list
## Trump Inauguration Speech.txt list
## Trump National Prayer Breakfast.txt list
## Trump Nomination Speech.txt list
## Trump Police Chiefs Speech.txt list
## Trump Response to Healthcare Bill Failure.txt list
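#Optional sanity check (a suggested extra step): pull one document out of the
#corpus by its file name and print its first few lines. Any file name returned
#by dir(Texts) should work here.
writeLines(head(as.character(docs[["Trump Inauguration Speech.txt"]]), 3))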
#Clean the text: remove punctuation, special characters, numbers, capitalization,
#and common English stop words.
docs <- tm_map(docs, removePunctuation)
#Replace slashes, @ signs, pipes, and Unicode line separators with spaces so that
#the words around them are not run together.
for (j in seq(docs)) {
  docs[[j]] <- gsub("/", " ", docs[[j]])
  docs[[j]] <- gsub("@", " ", docs[[j]])
  docs[[j]] <- gsub("\\|", " ", docs[[j]])
  docs[[j]] <- gsub("\u2028", " ", docs[[j]])
}
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, tolower)
#gsub() and tolower() return plain character vectors, so re-wrap the documents as
#PlainTextDocuments before building the matrices.
docs <- tm_map(docs, PlainTextDocument)
#Keep a copy of the corpus before the stop words are removed.
DocsCopy <- docs
docs <- tm_map(docs, removeWords, stopwords("english"))
docs <- tm_map(docs, PlainTextDocument)
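#Optional extra cleaning step: the substitutions above replace characters with
#spaces, which can leave runs of extra whitespace. stripWhitespace (a standard
#tm transformation) collapses them and should not affect the term counts below.
docs <- tm_map(docs, stripWhitespace)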
####Preparing the data####
#Create a document-term matrix, which records how often each term appears in each document.
dtm <- DocumentTermMatrix(docs)
dtm
## <<DocumentTermMatrix (documents: 11, terms: 3695)>>
## Non-/sparse entries: 8447/32198
## Sparsity : 79%
## Maximal term length: 18
## Weighting : term frequency (tf)
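#To peek at a corner of the matrix, inspect() can display a small slice of
#documents and terms (the indices here are arbitrary).
inspect(dtm[1:3, 1:6])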
####Transpose the matrix####
#The term-document matrix is simply the transpose of the document-term matrix.
tdm <- TermDocumentMatrix(docs)
tdm
## <<TermDocumentMatrix (terms: 3695, documents: 11)>>
## Non-/sparse entries: 8447/32198
## Sparsity : 79%
## Maximal term length: 18
## Weighting : term frequency (tf)
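#A quick optional check: tdm should hold the same counts as dtm with the rows
#and columns swapped.
all.equal(as.matrix(tdm), t(as.matrix(dtm)))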
####Organize words by Frequency####
freq <- colSums(as.matrix(dtm))
length(freq)
## [1] 3695
ord <- order(freq)
m <- as.matrix(dtm)
dim(m)
## [1] 11 3695
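#ord holds the column indices sorted by frequency, so it can be used to list the
#rarest and the most common terms directly.
freq[head(ord, 10)]  # 10 least frequent terms
freq[tail(ord, 10)]  # 10 most frequent terms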
#Export the full document-term matrix to a CSV file
write.csv(m, file="DocumentTermMatrix.csv")
#Remove sparse terms: drop any term that is missing from at least 20% of the documents
dtms <- removeSparseTerms(dtm, 0.2)
dtms
## <<DocumentTermMatrix (documents: 11, terms: 87)>>
## Non-/sparse entries: 848/109
## Sparsity : 11%
## Maximal term length: 11
## Weighting : term frequency (tf)
####Most and least frequently occurring words####
freq <- colSums(as.matrix(dtm))
tail(table(freq), 20) # table(freq) counts how many terms occur at each frequency; ", 20" keeps only the 20 highest frequencies
## freq
## 79 83 88 89 98 100 101 102 105 107 111 122 127 139 140 163 174 265
## 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 278 428
## 1 1
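#findFreqTerms() (also from tm) offers another view of the common vocabulary: it
#lists every term that occurs at least a given number of times.
findFreqTerms(dtm, lowfreq = 100)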
#Here is the frequency table after sparse terms have been removed
freq <- colSums(as.matrix(dtms))
freq
## also always america american another back
## 54 24 122 107 22 75
## bad believe big came can care
## 35 60 45 20 100 37
## come country day different done enforcement
## 55 174 36 16 24 43
## even ever every get getting give
## 55 42 49 79 23 25
## going good great group happen job
## 265 58 163 20 36 38
## just know last law let life
## 88 127 44 59 40 27
## like little long look lot love
## 79 24 36 52 44 45
## made many much must nation need
## 32 101 68 53 48 32
## never new now office one people
## 83 69 111 24 139 278
## president put really remember right safe
## 44 35 57 27 102 35
## said say see seen something special
## 83 66 48 34 25 26
## states take tell thank things think
## 65 64 50 105 40 77
## time today together totally truly understand
## 76 33 34 18 16 29
## united want way well will work
## 64 140 71 51 428 64
## world year years
## 56 47 54
####Sort most frequent words in descending order####
freq <- sort(colSums(as.matrix(dtm)), decreasing=TRUE)
head(freq, 20)#top 20 words
## will people going country great want one know
## 428 278 265 174 163 140 139 127
## america now american thank right many can â\200”
## 122 111 107 105 102 101 100 98
## theyre just never said
## 89 88 83 83
#Create a data frame of words and frequencies for plotting
df <- data.frame(word=names(freq), freq=freq)
head(df)
## word freq
## will will 428
## people people 278
## going going 265
## country country 174
## great great 163
## want want 140
#####Plot Word Frequency#########
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
#Plot a bar chart of the words that appear at least 50 times
p <- ggplot(subset(df, freq > 50), aes(x = reorder(word, -freq), y = freq)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
p
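#To save the chart to disk, ggsave() from ggplot2 writes a plot object to a
#file; the file name and dimensions here are just examples.
ggsave("word_frequency.png", plot = p, width = 8, height = 5)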
#####Calculate term correlations#####
#For a given term, identify the words that correlate with it most highly. If two words always appear together, their correlation is 1.0.
findAssocs(dtm, c("america" , "race"), corlimit=0.90)
## $america
## foreign millions
## 0.93 0.90
##
## $race
## terrorist difference elite independent lies
## 0.99 0.98 0.98 0.98 0.98
## manipulation melania projects seeing signing
## 0.98 0.98 0.98 0.98 0.98
## sisters understands workforce control presidency
## 0.98 0.98 0.98 0.96 0.96
## greatest next legacy
## 0.92 0.91 0.90
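#Lowering corlimit casts a wider net; this optional example (the term and the
#cutoff are arbitrary choices) looks for words associated with "country".
findAssocs(dtm, "country", corlimit = 0.85)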
#####Create word cloud######
library(wordcloud)
## Loading required package: RColorBrewer
freq <- colSums(as.matrix(dtm)) # Word frequencies from the full document-term matrix; max.words below caps the cloud at the 100 most frequent words
dark2 <- brewer.pal(6, "Dark2")
wordcloud(names(freq), freq, max.words=100, rot.per=0.2, colors=dark2)
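#Word clouds place words at random, so the layout changes between runs; fixing
#the random seed first (optional) makes the cloud reproducible, and min.freq
#drops the rarest words before plotting.
set.seed(142)
wordcloud(names(freq), freq, min.freq = 25, max.words = 100, rot.per = 0.2, colors = dark2)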
#####Hierarchical Clustering#####
#Cluster the most frequent terms (those kept by removeSparseTerms) by how
#similarly they are distributed across the eleven documents.
dtms <- removeSparseTerms(dtm, 0.15)
library(cluster)
d <- dist(t(dtms), method="euclidean") # distances between terms
fit <- hclust(d=d, method="complete")
plot(fit, hang=-1)
#Cut the dendrogram into 6 clusters and outline them on the plot
groups <- cutree(fit, k=6)
rect.hclust(fit, k=6, border="purple")
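#To see which words ended up in each cluster, the groups vector returned by
#cutree() can be split by cluster number.
split(names(groups), groups)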