Use text mining tools in R to analyze the transcript of the first 2016 presidential debate between Donald Trump and Hillary Clinton.
Load Packages
library(dplyr)
library(tm)
library(SnowballC)
library(ggplot2)
library(wordcloud)
library(RColorBrewer)
library(coreNLP)
library(syuzhet)
library(RWeka)
Load data
debate <- read.csv('debate.csv', stringsAsFactors = FALSE)
Take a look at the dataset
str(debate)
## 'data.frame': 351 obs. of 4 variables:
## $ Line : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Speaker: chr "Holt" "Audience" "Clinton" "Audience" ...
## $ Text : chr "Good evening from Hofstra University in Hempstead, New York. I'm Lester Holt, anchor of \"NBC Nightly News.\" I want to welcome"| __truncated__ "(APPLAUSE)" "How are you, Donald?" "(APPLAUSE)" ...
## $ Date : chr "9/26/2016" "9/26/2016" "9/26/2016" "9/26/2016" ...
Create a new column, Category, assigning each section of the debate to one of the following topics:
Opening / Closing Remarks
Jobs
Race
National Security
Gender
debate$Category <- "Opening / Closing Remarks"
debate$Category[8:161] <- "Jobs"
debate$Category[162:222] <- "Race"
debate$Category[223:310] <- "National Security"
debate$Category[311:343] <- "Gender"
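As a quick sanity check (an addition here, not part of the original script), tabulating the new column shows how many transcript lines fall into each topic:
table(debate$Category)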
Keep only the rows where the Speaker is Trump or Clinton, and drop the Date column.
debate <- debate %>% filter(Speaker %in% c("Clinton", "Trump"))
debate$Date <- NULL
Split the dataset into Trump and Clinton subsets.
debate.trump <- debate %>% filter(Speaker == "Trump")
debate.clinton <- debate %>% filter(Speaker == "Clinton")
Process the text using the tm package: build a corpus from the Text column, normalize it, and return a document-term matrix with very sparse terms removed.
tmfunction <- function(x){
  # Build a corpus from the speaker's lines
  corpus <- Corpus(VectorSource(x$Text))
  # Lowercase, then strip punctuation, numbers, and English stop words
  corpus <- tm_map(corpus, tolower)
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, removeWords, stopwords("english"))
  # Stem, then wrap back into PlainTextDocuments so DocumentTermMatrix accepts the corpus
  corpus <- tm_map(corpus, stemDocument)
  corpus <- tm_map(corpus, PlainTextDocument)
  # Drop terms that are absent from more than 99% of the documents
  dtm <- DocumentTermMatrix(corpus)
  dtmss <- removeSparseTerms(dtm, 0.99)
  dtmss
}
Convert the document-term matrices into data frames.
word.trump <- as.data.frame(as.matrix(tmfunction(debate.trump)))
word.clinton <- as.data.frame(as.matrix(tmfunction(debate.clinton)))
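As a sanity check (added here; the original did not print this), the dimensions show how many speaking turns and how many terms survive the 0.99 sparsity cutoff:
dim(word.trump)
dim(word.clinton)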
Sum the counts across documents to get the total frequency of each term.
freq.trump <- colSums(word.trump)
wf.trump <- data.frame(word = names(freq.trump), freq = freq.trump)
freq.clinton <- colSums(word.clinton)
wf.clinton <- data.frame(word = names(freq.clinton), freq = freq.clinton)
20 most used words from Trump
head(wf.trump[order(-wf.trump$freq),], n = 20)
## word freq
## going going 47
## country country 46
## look look 44
## theyre theyre 41
## just just 39
## think think 38
## said said 37
## people people 35
## dont dont 34
## say say 32
## will will 32
## know know 30
## thats thats 29
## many many 28
## get get 27
## secretary secretary 27
## well well 26
## one one 25
## clinton clinton 23
## now now 23
20 most used words from Clinton
head(wf.clinton[order(-wf.clinton$freq),], n = 20)
## word freq
## well well 42
## think think 39
## people people 33
## can can 30
## know know 28
## donald donald 26
## going going 26
## need need 23
## one one 22
## thats thats 22
## will will 21
## really really 20
## want want 20
## lot lot 18
## said said 18
## good good 17
## weve weve 17
## get get 16
## jobs jobs 16
## just just 16
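Since ggplot2 is already loaded, the same frequency tables can also be drawn as bar charts. A minimal sketch for Trump's top 20 terms (top.trump is a name introduced here; the Clinton version is analogous):
top.trump <- head(wf.trump[order(-wf.trump$freq),], n = 20)
ggplot(top.trump, aes(x = reorder(word, freq), y = freq)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(x = NULL, y = "Frequency") +
  ggtitle("20 Most Used Words - Trump")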
Create word clouds to visualize the most frequent words. Note that the word cloud layout is randomized, so the exact arrangement may differ between runs.
Trump
wordcloud(words = wf.trump$word, freq = wf.trump$freq, min.freq = 10,
          max.words = 200, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))
Clinton
wordcloud(words = wf.clinton$word, freq = wf.clinton$freq, min.freq = 8,
          max.words = 200, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))
Word Association - use findAssocs to find the terms most correlated with "jobs", "police", and "isis" for each speaker.
findAssocs(tmfunction(debate.trump), terms = "jobs", corlimit = 0.6)
## $jobs
## leaving michigan theyre mexico air allow numbers ohio
## 0.77 0.75 0.71 0.70 0.62 0.60 0.60 0.60
findAssocs(tmfunction(debate.clinton), terms = "jobs", corlimit = 0.6)
## $jobs
## manufacturing economy incomes clean energy
## 0.76 0.69 0.69 0.68 0.68
## new advanced birthday build central
## 0.64 0.61 0.61 0.61 0.61
## fairer granddaughters guarantee hofstra hosting
## 0.61 0.61 0.61 0.61 0.61
## infrastructure innovation renewable starts technology
## 0.61 0.61 0.61 0.61 0.61
## thank thanks wage womens
## 0.61 0.61 0.61 0.61
findAssocs(tmfunction(debate.trump), terms = "police", corlimit = 0.6)
## $police
## killed cases shootings chicago january order
## 0.97 0.87 0.83 0.75 0.75 0.75
## thousands law street communities whats
## 0.75 0.74 0.71 0.66 0.61
findAssocs(tmfunction(debate.clinton), terms = "police", corlimit = 0.6)
## $police
## chiefs officers admit bias comes
## 0.78 0.78 0.74 0.74 0.74
## concerns conclusions distinguished experienced fatal
## 0.74 0.74 0.74 0.74 0.74
## group handle implicit jump literally
## 0.74 0.74 0.74 0.74 0.74
## mental policing retraining therefore budget
## 0.74 0.74 0.74 0.74 0.66
## health consequences problem since unfortunately
## 0.66 0.65 0.63 0.63 0.63
findAssocs(tmfunction(debate.trump), terms = "isis", corlimit = 0.6)
## $isis
## troops vacuum formed created
## 0.75 0.75 0.70 0.60
findAssocs(tmfunction(debate.clinton), terms = "isis", corlimit = 0.6)
## $isis
## addressing air arab assisting baghdadi bin
## 0.99 0.99 0.99 0.99 0.99 0.99
## caliphate cognizant disrupt eventually fighters hoping
## 0.99 0.99 0.99 0.99 0.99 0.99
## intensify internet involve involved kurdish laden
## 0.99 0.99 0.99 0.99 0.99 0.99
## online operatives organizing partners possible principles
## 0.99 0.99 0.99 0.99 0.99 0.99
## priority propaganda push qaida radicalize raqqa
## 0.99 0.99 0.99 0.99 0.99 0.99
## squeeze strikes syria taking tech volunteer
## 0.99 0.99 0.99 0.99 0.99 0.99
## within defeat foreign efforts leadership number
## 0.99 0.96 0.93 0.88 0.88 0.87
## take elsewhere able claim coming companies
## 0.85 0.81 0.79 0.69 0.69 0.69
## direct end europe everything forth including
## 0.69 0.69 0.69 0.69 0.69 0.69
## prevent progress iraq
## 0.69 0.69 0.65
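findAssocs returns a named list of correlation vectors, so the associations can also be flattened into a data frame for plotting or side-by-side comparison. A sketch (the object names here are illustrative):
assoc.isis <- findAssocs(tmfunction(debate.clinton), terms = "isis", corlimit = 0.6)
assoc.isis.df <- data.frame(term = names(assoc.isis$isis),
                            correlation = unname(assoc.isis$isis))
head(assoc.isis.df)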
N-Gram Analysis - N = 2 (Bigrams)
Trump
corpus <- Corpus(VectorSource(debate.trump$Text))
corpus <- tm_map(corpus, tolower)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, PlainTextDocument)
# RWeka tokenizer that splits the text into two-word phrases (bigrams)
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
dtm.bigram <- DocumentTermMatrix(corpus, control = list(tokenize = BigramTokenizer))
dtm.bigram <- removeSparseTerms(dtm.bigram, 0.995)
dtm.bigram <- as.data.frame(as.matrix(dtm.bigram))
freq.bigram <- colSums(dtm.bigram)
wf.bigram <- data.frame(word=names(freq.bigram), freq=freq.bigram)
wf.bigram <- wf.bigram[order(-wf.bigram$freq),]
wf.bigram[1:10,]
## word freq
## secretary clinton secretary clinton 22
## theyre going theyre going 10
## law order law order 7
## right now right now 7
## sean hannity sean hannity 7
## will tell will tell 7
## dont think dont think 6
## inner cities inner cities 6
## long time long time 6
## middle east middle east 6
N-Gram Analysis - N = 2 (Bigrams)
Clinton
corpus <- Corpus(VectorSource(debate.clinton$Text))
corpus <- tm_map(corpus, tolower)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, PlainTextDocument)
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
dtm.bigram <- DocumentTermMatrix(corpus, control = list(tokenize = BigramTokenizer))
dtm.bigram <- removeSparseTerms(dtm.bigram, 0.995)
dtm.bigram <- as.data.frame(as.matrix(dtm.bigram))
freq.bigram <- colSums(dtm.bigram)
wf.bigram <- data.frame(word=names(freq.bigram), freq=freq.bigram)
wf.bigram <- wf.bigram[order(-wf.bigram$freq),]
wf.bigram[1:10,]
## word freq
## weve got weve got 9
## new jobs new jobs 7
## criminal justice criminal justice 5
## middle class middle class 5
## tax returns tax returns 5
## united states united states 5
## well think well think 5
## im going im going 4
## justice system justice system 4
## nuclear weapons nuclear weapons 4
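The bigram pipeline above is written out twice, once per speaker. It could be wrapped in a helper analogous to tmfunction; a sketch that mirrors the same steps (bigramfunction is a name introduced here, not part of the original script):
bigramfunction <- function(x){
  corpus <- Corpus(VectorSource(x$Text))
  corpus <- tm_map(corpus, tolower)
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, removeWords, stopwords("english"))
  corpus <- tm_map(corpus, PlainTextDocument)
  # Two-word (bigram) tokenizer from RWeka
  BigramTokenizer <- function(text) NGramTokenizer(text, Weka_control(min = 2, max = 2))
  dtm <- DocumentTermMatrix(corpus, control = list(tokenize = BigramTokenizer))
  dtm <- removeSparseTerms(dtm, 0.995)
  freq <- colSums(as.matrix(dtm))
  wf <- data.frame(word = names(freq), freq = freq)
  wf[order(-wf$freq),]
}
wf.bigram.trump <- bigramfunction(debate.trump)
wf.bigram.clinton <- bigramfunction(debate.clinton)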
Sentiment Analysis
Score each line of the debate with the syuzhet package: tokenize the line into words, score each word, and take the mean as the line's sentiment score.
Trump
debate.trump.text <- debate.trump$Text
sentiment.score.trump <- vector()
for (i in seq_along(debate.trump.text)){
  # Mean word-level syuzhet score for line i
  word.tokens <- get_tokens(debate.trump.text[i], pattern = "\\W")
  sentiment.score.trump[i] <- mean(get_sentiment(word.tokens, method = "syuzhet"))
}
Clinton
debate.clinton.text <- debate.clinton$Text
sentiment.score.clinton <- vector()
for (i in seq_along(debate.clinton.text)){
  word.tokens <- get_tokens(debate.clinton.text[i], pattern = "\\W")
  sentiment.score.clinton[i] <- mean(get_sentiment(word.tokens, method = "syuzhet"))
}
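As an aside, the same per-line scores can be produced without an explicit loop; a sketch using sapply (score.lines is a helper name introduced here):
score.lines <- function(texts){
  sapply(texts, function(txt){
    word.tokens <- get_tokens(txt, pattern = "\\W")
    mean(get_sentiment(word.tokens, method = "syuzhet"))
  }, USE.NAMES = FALSE)
}
# Equivalent to the two loops above
sentiment.score.trump <- score.lines(debate.trump$Text)
sentiment.score.clinton <- score.lines(debate.clinton$Text)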
Attach the scores to each subset and combine them for plotting.
debate.trump$Score <- sentiment.score.trump
debate.clinton$Score <- sentiment.score.clinton
debate.combine <- rbind(debate.trump, debate.clinton)
Plot the sentiment score over the course of the debate.
ggplot(debate.combine, aes(x = Line, y = Score, colour = Speaker)) +
  geom_line(size = 1) +
  scale_color_manual(values = c("#6697e8", "#e86666")) +
  theme(legend.position = "bottom") +
  ggtitle("Sentiment Analysis - Score Along Time of the Debate")
Plot the distribution of the sentiment scores for each speaker.
ggplot(debate.combine, aes(x = Score, fill = Speaker)) +
  geom_histogram(binwidth = .05, position = "dodge") +
  scale_fill_manual(values = c("#6697e8", "#e86666")) +
  theme(legend.position = "bottom") +
  ggtitle("Sentiment Analysis Score Distribution")