Summary

Use text-mining tools in R to analyze the transcript of the first presidential debate between Donald Trump and Hillary Clinton.

Load Packages

library(dplyr)
library(tm)
library(SnowballC)
library(ggplot2)
library(wordcloud)
library(RColorBrewer)
library(coreNLP)
library(syuzhet)
library(RWeka)

Load data

debate <- read.csv('debate.csv', stringsAsFactors = FALSE)

Take a look at the dataset

str(debate)
## 'data.frame':    351 obs. of  4 variables:
##  $ Line   : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Speaker: chr  "Holt" "Audience" "Clinton" "Audience" ...
##  $ Text   : chr  "Good evening from Hofstra University in Hempstead, New York. I'm Lester Holt, anchor of \"NBC Nightly News.\" I want to welcome"| __truncated__ "(APPLAUSE)" "How are you, Donald?" "(APPLAUSE)" ...
##  $ Date   : chr  "9/26/2016" "9/26/2016" "9/26/2016" "9/26/2016" ...

Create a new column, Category, assigning each section of the debate to a topic.

debate$Category <- "Opening / Closing Remarks"
debate$Category[8:161] <- "Jobs"
debate$Category[162:222] <- "Race"
debate$Category[223:310] <- "National Security"
debate$Category[311:343] <- "Gender"
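
As a quick sanity check on the line ranges above, tabulate how many transcript lines fall under each topic:

table(debate$Category)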

Keep only the rows spoken by Trump or Clinton in the Speaker column, and drop the Date column.

debate <- debate %>% filter(Speaker %in% c("Clinton", "Trump"))
debate$Date <- NULL

Split the dataset into Trump and Clinton subsets.

debate.trump <- debate %>% filter(Speaker == "Trump")
debate.clinton <- debate %>% filter(Speaker == "Clinton")
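
A quick check that the filter worked and how many speaking turns each candidate has:

table(debate$Speaker)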

Processing Text

Define a helper that processes the text with the tm package and returns a trimmed document-term matrix.

tmfunction <- function(x){
  # Build a corpus from the Text column and normalize it
  corpus <- Corpus(VectorSource(x$Text))
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, removeWords, stopwords("english"))
  corpus <- tm_map(corpus, stemDocument)
  corpus <- tm_map(corpus, PlainTextDocument)
  # Build the document-term matrix and drop the rarest terms
  dtm <- DocumentTermMatrix(corpus)
  removeSparseTerms(dtm, 0.99)
}

Convert the document-term matrices into data frames

word.trump <- as.data.frame(as.matrix(tmfunction(debate.trump)))
word.clinton <- as.data.frame(as.matrix(tmfunction(debate.clinton)))

Sum each term's counts across all documents to get overall word frequencies

freq.trump <- colSums(word.trump)
wf.trump <- data.frame(word = names(freq.trump), freq = freq.trump)
freq.clinton <- colSums(word.clinton)
wf.clinton <- data.frame(word = names(freq.clinton), freq = freq.clinton)

Trump's 20 most frequently used words

head(wf.trump[order(-wf.trump$freq),], n = 20)
##                word freq
## going         going   47
## country     country   46
## look           look   44
## theyre       theyre   41
## just           just   39
## think         think   38
## said           said   37
## people       people   35
## dont           dont   34
## say             say   32
## will           will   32
## know           know   30
## thats         thats   29
## many           many   28
## get             get   27
## secretary secretary   27
## well           well   26
## one             one   25
## clinton     clinton   23
## now             now   23

Clinton's 20 most frequently used words

head(wf.clinton[order(-wf.clinton$freq),], n = 20)
##          word freq
## well     well   42
## think   think   39
## people people   33
## can       can   30
## know     know   28
## donald donald   26
## going   going   26
## need     need   23
## one       one   22
## thats   thats   22
## will     will   21
## really really   20
## want     want   20
## lot       lot   18
## said     said   18
## good     good   17
## weve     weve   17
## get       get   16
## jobs     jobs   16
## just     just   16
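
The same frequency tables can also be plotted. A minimal sketch for Trump's top 20 words (top20.trump is a hypothetical name; Clinton's chart is analogous):

# Plot the 20 most frequent words as a horizontal bar chart
top20.trump <- head(wf.trump[order(-wf.trump$freq), ], n = 20)
ggplot(top20.trump, aes(x = reorder(word, freq), y = freq)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(x = NULL, y = "Frequency", title = "Trump - 20 Most Used Words")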

Word Cloud

Create word clouds to visualize each candidate's most frequent words

Trump

wordcloud(words = wf.trump$word, freq = wf.trump$freq, min.freq = 10,
          max.words=200, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Dark2"))

Clinton

wordcloud(words = wf.clinton$word, freq = wf.clinton$freq, min.freq = 8,
          max.words=200, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Dark2"))
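
The two clouds can also be combined into a single comparison cloud (also from the wordcloud package), which highlights the words each speaker favors relative to the other. A sketch, assuming the wf.trump and wf.clinton frames built above:

# Align the two frequency vectors into one term matrix, one column per speaker
all.words <- union(as.character(wf.trump$word), as.character(wf.clinton$word))
m <- matrix(0, nrow = length(all.words), ncol = 2,
            dimnames = list(all.words, c("Trump", "Clinton")))
m[as.character(wf.trump$word), "Trump"] <- wf.trump$freq
m[as.character(wf.clinton$word), "Clinton"] <- wf.clinton$freq
comparison.cloud(m, max.words = 100, colors = c("#e86666", "#6697e8"))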

Word Association

Word association for "jobs", "police", and "isis" - findAssocs() lists the terms whose per-document frequencies correlate with the query term at or above corlimit.

findAssocs(tmfunction(debate.trump), terms = "jobs", corlimit = 0.6)
## $jobs
##  leaving michigan   theyre   mexico      air    allow  numbers     ohio 
##     0.77     0.75     0.71     0.70     0.62     0.60     0.60     0.60
findAssocs(tmfunction(debate.clinton), terms = "jobs", corlimit = 0.6)
## $jobs
##  manufacturing        economy        incomes          clean         energy 
##           0.76           0.69           0.69           0.68           0.68 
##            new       advanced       birthday          build        central 
##           0.64           0.61           0.61           0.61           0.61 
##         fairer granddaughters      guarantee        hofstra        hosting 
##           0.61           0.61           0.61           0.61           0.61 
## infrastructure     innovation      renewable         starts     technology 
##           0.61           0.61           0.61           0.61           0.61 
##          thank         thanks           wage         womens 
##           0.61           0.61           0.61           0.61
findAssocs(tmfunction(debate.trump), terms = "police", corlimit = 0.6)
## $police
##      killed       cases   shootings     chicago     january       order 
##        0.97        0.87        0.83        0.75        0.75        0.75 
##   thousands         law      street communities       whats 
##        0.75        0.74        0.71        0.66        0.61
findAssocs(tmfunction(debate.clinton), terms = "police", corlimit = 0.6)
## $police
##        chiefs      officers         admit          bias         comes 
##          0.78          0.78          0.74          0.74          0.74 
##      concerns   conclusions distinguished   experienced         fatal 
##          0.74          0.74          0.74          0.74          0.74 
##         group        handle      implicit          jump     literally 
##          0.74          0.74          0.74          0.74          0.74 
##        mental      policing    retraining     therefore        budget 
##          0.74          0.74          0.74          0.74          0.66 
##        health  consequences       problem         since unfortunately 
##          0.66          0.65          0.63          0.63          0.63
findAssocs(tmfunction(debate.trump), terms = "isis", corlimit = 0.6)
## $isis
##  troops  vacuum  formed created 
##    0.75    0.75    0.70    0.60
findAssocs(tmfunction(debate.clinton), terms = "isis", corlimit = 0.6)
## $isis
## addressing        air       arab  assisting   baghdadi        bin 
##       0.99       0.99       0.99       0.99       0.99       0.99 
##  caliphate  cognizant    disrupt eventually   fighters     hoping 
##       0.99       0.99       0.99       0.99       0.99       0.99 
##  intensify   internet    involve   involved    kurdish      laden 
##       0.99       0.99       0.99       0.99       0.99       0.99 
##     online operatives organizing   partners   possible principles 
##       0.99       0.99       0.99       0.99       0.99       0.99 
##   priority propaganda       push      qaida radicalize      raqqa 
##       0.99       0.99       0.99       0.99       0.99       0.99 
##    squeeze    strikes      syria     taking       tech  volunteer 
##       0.99       0.99       0.99       0.99       0.99       0.99 
##     within     defeat    foreign    efforts leadership     number 
##       0.99       0.96       0.93       0.88       0.88       0.87 
##       take  elsewhere       able      claim     coming  companies 
##       0.85       0.81       0.79       0.69       0.69       0.69 
##     direct        end     europe everything      forth  including 
##       0.69       0.69       0.69       0.69       0.69       0.69 
##    prevent   progress       iraq 
##       0.69       0.69       0.65
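
Each findAssocs() call above rebuilds its corpus from scratch. Computing the two document-term matrices once avoids the repeated work, and findAssocs() accepts a vector of terms, so the three queries can be issued together:

# Cache the document-term matrices and query all three terms at once
dtm.trump <- tmfunction(debate.trump)
dtm.clinton <- tmfunction(debate.clinton)
findAssocs(dtm.trump, terms = c("jobs", "police", "isis"), corlimit = 0.6)
findAssocs(dtm.clinton, terms = c("jobs", "police", "isis"), corlimit = 0.6)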

N-Gram Analysis

N-Gram Analysis - N = 2

Trump

# Rebuild the corpus without stemming so the bigrams stay readable
corpus <- Corpus(VectorSource(debate.trump$Text))
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, PlainTextDocument)
# Tokenize into two-word phrases with RWeka
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
dtm.bigram <- DocumentTermMatrix(corpus, control = list(tokenize = BigramTokenizer))
dtm.bigram <- removeSparseTerms(dtm.bigram, 0.995)
dtm.bigram <- as.data.frame(as.matrix(dtm.bigram))
freq.bigram <- colSums(dtm.bigram)
wf.bigram <- data.frame(word=names(freq.bigram), freq=freq.bigram)
wf.bigram <- wf.bigram[order(-wf.bigram$freq),]
wf.bigram[1:10,]
##                                word freq
## secretary clinton secretary clinton   22
## theyre going           theyre going   10
## law order                 law order    7
## right now                 right now    7
## sean hannity           sean hannity    7
## will tell                 will tell    7
## dont think               dont think    6
## inner cities           inner cities    6
## long time                 long time    6
## middle east             middle east    6

N-Gram Analysis - N = 2

Clinton

# Same pipeline for Clinton, again without stemming
corpus <- Corpus(VectorSource(debate.clinton$Text))
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, PlainTextDocument)
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
dtm.bigram <- DocumentTermMatrix(corpus, control = list(tokenize = BigramTokenizer))
dtm.bigram <- removeSparseTerms(dtm.bigram, 0.995)
dtm.bigram <- as.data.frame(as.matrix(dtm.bigram))
freq.bigram <- colSums(dtm.bigram)
wf.bigram <- data.frame(word=names(freq.bigram), freq=freq.bigram)
wf.bigram <- wf.bigram[order(-wf.bigram$freq),]
wf.bigram[1:10,]
##                              word freq
## weve got                 weve got    9
## new jobs                 new jobs    7
## criminal justice criminal justice    5
## middle class         middle class    5
## tax returns           tax returns    5
## united states       united states    5
## well think             well think    5
## im going                 im going    4
## justice system     justice system    4
## nuclear weapons   nuclear weapons    4
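
The Trump and Clinton bigram pipelines above differ only in their input, so the duplication can be folded into a small helper (bigram.freq is a hypothetical name; a sketch of the same steps):

# Return a bigram frequency table, sorted descending, for one speaker's subset
bigram.freq <- function(x, sparse = 0.995) {
  corpus <- Corpus(VectorSource(x$Text))
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, removeWords, stopwords("english"))
  corpus <- tm_map(corpus, PlainTextDocument)
  tok <- function(y) NGramTokenizer(y, Weka_control(min = 2, max = 2))
  dtm <- DocumentTermMatrix(corpus, control = list(tokenize = tok))
  freq <- colSums(as.matrix(removeSparseTerms(dtm, sparse)))
  wf <- data.frame(word = names(freq), freq = freq)
  wf[order(-wf$freq), ]
}
head(bigram.freq(debate.trump), 10)
head(bigram.freq(debate.clinton), 10)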

Sentiment Analysis

Score each line of the debate by averaging the syuzhet sentiment of its individual words, then compare the two candidates over the course of the debate.

# Average the word-level syuzhet sentiment for each of Trump's lines
debate.trump.text <- debate.trump$Text
sentiment.score.trump <- vector()
for (i in seq_along(debate.trump.text)) {
  words <- get_tokens(debate.trump.text[i], pattern = "\\W")
  sentiment.score.trump[i] <- mean(get_sentiment(words, method = "syuzhet"))
}
# Same scoring for Clinton's lines
debate.clinton.text <- debate.clinton$Text
sentiment.score.clinton <- vector()
for (i in seq_along(debate.clinton.text)) {
  words <- get_tokens(debate.clinton.text[i], pattern = "\\W")
  sentiment.score.clinton[i] <- mean(get_sentiment(words, method = "syuzhet"))
}
# Attach the scores and stack the two subsets for plotting
debate.trump$Score <- sentiment.score.trump
debate.clinton$Score <- sentiment.score.clinton
debate.combine <- rbind(debate.trump, debate.clinton)
ggplot(debate.combine, aes(x = Line, y = Score, colour = Speaker)) +
  geom_line(size = 1) +
  scale_color_manual(values = c("#6697e8", "#e86666")) +
  theme(legend.position = "bottom") +
  ggtitle("Sentiment Analysis - Score Along Time of the Debate")

ggplot(debate.combine, aes(x = Score, fill = Speaker)) +
  geom_histogram(binwidth = .05, position = "dodge") +
  scale_fill_manual(values = c("#6697e8", "#e86666")) +
  theme(legend.position = "bottom") +
  ggtitle("Sentiment Analysis Score Distribution")