There’s no denying it’s a mess
## Loading required package: xml2
# Download the Fortune page that carries the debate transcript.
transcript_url <- "http://fortune.com/2016/09/26/presidential-debate-transcript/"
scrp <- read_html(transcript_url)
# Select the node matched by the "blockquote" selector.
scrp2 <- html_node(scrp, "blockquote")
# Discard the HTML markup and keep only the text of that node.
debate <- html_text(scrp2)
# Prefix every speaker label (TRUMP:, CLINTON:, HOLT:) with the marker "~~@"
# so the transcript can later be split into one element per speaking turn.
prep <- gsub("(TRUMP|CLINTON|HOLT):", "~~@\\1:", debate)
# Remove the literal "(APPLAUSE)" annotation. fixed = TRUE is required: read
# as a regular expression the parentheses form a group, so only the word
# would be removed and an empty "()" would be left in the text.
prep <- gsub("(APPLAUSE)", "", prep, fixed = TRUE)
# Replace line breaks with a space instead of deleting them, so the last
# word before a break is not fused with the first word after it.
prep <- gsub("[\r\n]+", " ", prep)
# Split on the marker; each resulting element starts with a speaker label.
clean <- unlist(strsplit(prep, "~~@", fixed = TRUE))
# Break speaker turns into lower-case words.
#
# Every character that is neither alphanumeric nor a space is removed, the
# remaining text is split on single spaces, and empty tokens (produced by
# consecutive spaces) are dropped.
#
# turns: character vector of speaking turns.
# Returns a character vector of lower-case words.
split_words <- function(turns) {
  stripped <- gsub("[^[:alnum:] ]", "", turns)
  words <- tolower(unlist(strsplit(stripped, " ", fixed = TRUE)))
  # nzchar() tests for emptiness directly. Comparing the tokens with 0
  # coerces 0 to the string "0", which also discards the token "0" itself.
  words[nzchar(words)]
}

# Turns spoken by Trump, with the speaker label removed, and their words.
trump <- clean[grep("TRUMP:", clean, fixed = TRUE)]
trump <- gsub("TRUMP:", "", trump, fixed = TRUE)
trump_an <- split_words(trump)
# Turns spoken by Clinton, with the speaker label removed, and their words.
clinton <- clean[grep("CLINTON:", clean, fixed = TRUE)]
clinton <- gsub("CLINTON:", "", clinton, fixed = TRUE)
clinton_an <- split_words(clinton)
# Count the distinct words (ignoring case) found in a vector of turns.
#
# turns: character vector of speaking turns.
# Returns a single integer.
# NOTE(review): simplify = TRUE yields a padded character matrix; if the
# padding is the empty string it is counted as one extra distinct word.
# Confirm against the stringi documentation before relying on the exact value.
count_distinct_words <- function(turns) {
  # TRUE is spelled out: T is an ordinary variable that can be reassigned.
  word_matrix <- stri_extract_all_words(turns, simplify = TRUE)
  # as.character() flattens the matrix so unique() works on single words.
  length(unique(tolower(as.character(word_matrix))))
}

# Total number of words spoken by each candidate: the "Words" entry of
# stri_stats_latex(), selected by name rather than by position.
stri_stats_latex(trump)["Words"]
## Words
## 8956
stri_stats_latex(clinton)["Words"]
## Words
## 6542
# Number of distinct words used by each candidate.
count_distinct_words(trump)
## [1] 1371
count_distinct_words(clinton)
## [1] 1422
# Distribution of word length (characters per word) for each candidate.
trump_word_len <- nchar(trump_an)
summary(trump_word_len)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 3.000 4.000 4.218 5.000 21.000
clinton_word_len <- nchar(clinton_an)
summary(clinton_word_len)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 3.000 4.000 4.356 6.000 17.000
# Frequency tables: the 40 most frequent words of each candidate, most
# frequent first. decreasing = TRUE is spelled out because T is an ordinary
# variable that can be reassigned, whereas TRUE is a reserved constant.
head(sort(table(trump_an), decreasing = TRUE), n = 40)
## trump_an
## the and to i you a of that have
## 290 267 258 229 203 172 171 162 147
## it we in is its they very was our
## 123 120 107 81 77 75 71 69 61
## but because are be do not with were this
## 60 58 57 53 53 53 53 49 48
## going at country on look all for what theyre
## 46 45 45 45 44 43 42 42 41
## just when been me
## 39 39 38 38
head(sort(table(clinton_an), decreasing = TRUE), n = 40)
## clinton_an
## the to and that i of we a in have
## 250 240 186 145 136 134 126 122 103 84
## you it be is he for do our not well
## 75 65 59 59 56 46 45 43 41 41
## would what are think with so was about this at
## 41 40 39 39 38 37 37 36 36 34
## on has people they can were but know as donald
## 34 32 32 32 30 30 28 28 27 26
# Histograms of word length (characters per word), one per candidate.
hist(
  nchar(trump_an),
  main = "Trump Word Character Length",
  xlab = "Number of Characters in Word",
  col = "red"
)
hist(
  nchar(clinton_an),
  main = "Clinton Word Character Length",
  xlab = "Number of Characters in Word",
  col = "blue"
)
# Look up how often a single word was used by each candidate.
term <- "me"
# Build each frequency table once, then keep the entry named `term`.
clinton_freq <- table(clinton_an)
clinton_freq[names(clinton_freq) == term]
## me
## 7
trump_freq <- table(trump_an)
trump_freq[names(trump_freq) == term]
## me
## 38
And that’s that