This code is sloppy; there's no denying it's a mess.

Load the packages: rvest for the scraping, stringi for the word stats.

library(rvest)
## Loading required package: xml2
library(stringi)

Scrape the debate transcript data from the web.

# scrape debate transcript from fortune
scrp <- read_html("http://fortune.com/2016/09/26/presidential-debate-transcript/")
scrp2 <- html_node(scrp, "blockquote")
# dump all the html junk and pull just the text
debate <- html_text(scrp2)
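
A quick sanity check before going further, since the scrape depends on Fortune keeping the transcript in that blockquote (a minimal sketch; the exact count will change if the page does):

# confirm we pulled a wall of text, not an empty node
nchar(debate)
substr(debate, 1, 200)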

Clean and prep it.

# tag each speaker's name so the text can be split on it
prep <- gsub("TRUMP:", "~~@TRUMP:", debate)
prep <- gsub("CLINTON:", "~~@CLINTON:", prep)
prep <- gsub("HOLT:", "~~@HOLT:", prep)
# remove applause markers and newlines; fixed = TRUE stops gsub from
# reading the parentheses as a regex group and leaving a stray "()" behind
prep <- gsub("(APPLAUSE)", "", prep, fixed = TRUE)
prep <- gsub("[\r\n]", "", prep)

# split the text by the tag
clean <- unlist(strsplit(prep, "~~@"))

# subset words spoken by Trump
trump <- clean[grep("TRUMP:", clean)]
trump <- gsub("TRUMP:", "", trump)
# strip punctuation, lowercase, and split into individual words
trump_an <- tolower(unlist(strsplit(gsub("[^[:alnum:] ]", "", trump), " ")))
# drop the empty strings left over from the split
trump_an <- trump_an[trump_an != ""]

# same steps for Clinton
clinton <- clean[grep("CLINTON:", clean)]
clinton <- gsub("CLINTON:", "", clinton)
clinton_an <- tolower(unlist(strsplit(gsub("[^[:alnum:] ]", "", clinton), " ")))
clinton_an <- clinton_an[clinton_an != ""]
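
The Trump and Clinton blocks above are identical except for the name, so the same steps could be wrapped in a small helper (a sketch; speaker_words is my name for it, not part of the original code):

# pull one speaker's turns out of the split text and tokenize them
speaker_words <- function(clean, tag) {
  spoken <- gsub(tag, "", clean[grep(tag, clean)])
  words <- tolower(unlist(strsplit(gsub("[^[:alnum:] ]", "", spoken), " ")))
  words[words != ""]
}
# equivalent to the explicit versions above:
# trump_an <- speaker_words(clean, "TRUMP:")
# clinton_an <- speaker_words(clean, "CLINTON:")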

How many total words were spoken by each candidate?

stri_stats_latex(trump)[4]
## Words 
##  8956
stri_stats_latex(clinton)[4]
## Words 
##  6542
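
stri_stats_latex counts words by LaTeX rules, so its totals won't exactly match the hand-rolled tokens above; comparing the two is a cheap consistency check (using the objects already defined):

# token counts from our own split, for comparison
length(trump_an)
length(clinton_an)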

How many unique words?

length(unique(as.character(tolower(stri_extract_all_words(trump, simplify = T)))))
## [1] 1371
length(unique(as.character(tolower(stri_extract_all_words(clinton, simplify = T)))))
## [1] 1422
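
Raw unique counts favor whoever talks more, so a type-token ratio (unique words divided by total words) is a fairer comparison; a quick sketch on the tokens built earlier:

# lexical diversity: share of tokens that are unique
length(unique(trump_an)) / length(trump_an)
length(unique(clinton_an)) / length(clinton_an)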

Summary stats on word length (in characters), plus frequency tables of each candidate's top terms.

summary(nchar(trump_an))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   3.000   4.000   4.218   5.000  21.000
summary(nchar(clinton_an))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   3.000   4.000   4.356   6.000  17.000
# frequency tables
head(sort(table(trump_an), decreasing = T), 40)
## trump_an
##     the     and      to       i     you       a      of    that    have 
##     290     267     258     229     203     172     171     162     147 
##      it      we      in      is     its    they    very     was     our 
##     123     120     107      81      77      75      71      69      61 
##     but because     are      be      do     not    with    were    this 
##      60      58      57      53      53      53      53      49      48 
##   going      at country      on    look     all     for    what  theyre 
##      46      45      45      45      44      43      42      42      41 
##    just    when    been      me 
##      39      39      38      38
head(sort(table(clinton_an), decreasing = T), 40)
## clinton_an
##    the     to    and   that      i     of     we      a     in   have 
##    250    240    186    145    136    134    126    122    103     84 
##    you     it     be     is     he    for     do    our    not   well 
##     75     65     59     59     56     46     45     43     41     41 
##  would   what    are  think   with     so    was  about   this     at 
##     41     40     39     39     38     37     37     36     36     34 
##     on    has people   they    can   were    but   know     as donald 
##     34     32     32     32     30     30     28     28     27     26
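
Both tables are dominated by function words, so filtering out a stopword list before sorting gives a more interesting view (a sketch; this short hand-picked vector is mine, and a real pass would use a fuller list):

# a deliberately short stopword list, just for illustration
stops <- c("the", "and", "to", "i", "you", "a", "of", "that", "have",
           "it", "we", "in", "is", "was", "our", "but", "are", "be",
           "do", "not", "with", "for", "at", "on", "he", "so", "as")
head(sort(table(trump_an[!trump_an %in% stops]), decreasing = T), 20)
head(sort(table(clinton_an[!clinton_an %in% stops]), decreasing = T), 20)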

Histograms!

hist(nchar(trump_an), main = "Trump Word Character Length", col = 'red', xlab = "Number of Characters in Word")

hist(nchar(clinton_an), main = "Clinton Word Character Length", col = 'blue', xlab = "Number of Characters in Word")
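
To eyeball the two distributions against each other, the same histograms can share one device and one x-axis (a sketch using base graphics):

# plot both distributions side by side for easier comparison
par(mfrow = c(1, 2))
hist(nchar(trump_an), main = "Trump", col = 'red', xlab = "Number of Characters in Word", xlim = c(0, 22))
hist(nchar(clinton_an), main = "Clinton", col = 'blue', xlab = "Number of Characters in Word", xlim = c(0, 22))
par(mfrow = c(1, 1))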

# how often did each candidate use a particular word?
term <- "me"
table(clinton_an)[names(table(clinton_an)) == term]
## me 
##  7
table(trump_an)[names(table(trump_an)) == term]
## me 
## 38
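
Looking terms up one at a time gets old, so the lookup could be wrapped in a tiny function (a sketch; compare_term is my name, not part of the original):

# count a term's uses by each candidate in one call
compare_term <- function(term) {
  c(trump = sum(trump_an == term), clinton = sum(clinton_an == term))
}
# e.g. compare_term("me") or compare_term("country")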

And that's that.