This code is sloppy; there's no denying it's a mess.

Load the packages: rvest for the scraping, stringi for the word stats.

library(rvest)
## Loading required package: xml2
library(stringi)

Scrape the debate transcript data from the web.

# scrape debate transcript from fortune
scrp <- read_html("http://fortune.com/2016/09/26/presidential-debate-transcript/")
scrp2 <- html_node(scrp, "blockquote")
# dump all the html junk and pull just the text
debate <- html_text(scrp2)
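
A quick sanity check before going further, since the scrape depends on Fortune keeping the transcript in that blockquote (a minimal sketch; the exact count will change if the page does):

# confirm we pulled a wall of text, not an empty node
nchar(debate)
substr(debate, 1, 200)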

Clean and prep it.

# tag each speaker's name so the text can be split on it
prep <- gsub("TRUMP:", "~~@TRUMP:", debate)
prep <- gsub("CLINTON:", "~~@CLINTON:", prep)
prep <- gsub("HOLT:", "~~@HOLT:", prep)
# remove applause markers and newlines; fixed = TRUE stops gsub from
# reading the parentheses as a regex group and leaving a stray "()" behind
prep <- gsub("(APPLAUSE)", "", prep, fixed = TRUE)
prep <- gsub("[\r\n]", "", prep)

# split the text by the tag
clean <- unlist(strsplit(prep, "~~@"))

# subset words spoken by Trump
trump <- clean[grep("TRUMP:", clean)]
trump <- gsub("TRUMP:", "", trump)
# strip punctuation, lowercase, and split into individual words
trump_an <- tolower(unlist(strsplit(gsub("[^[:alnum:] ]", "", trump), " ")))
# drop the empty strings left over from the split
trump_an <- trump_an[trump_an != ""]

# same steps for Clinton
clinton <- clean[grep("CLINTON:", clean)]
clinton <- gsub("CLINTON:", "", clinton)
clinton_an <- tolower(unlist(strsplit(gsub("[^[:alnum:] ]", "", clinton), " ")))
clinton_an <- clinton_an[clinton_an != ""]
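
The Trump and Clinton blocks above are identical except for the name, so the same steps could be wrapped in a small helper (a sketch; speaker_words is my name for it, not part of the original code):

# pull one speaker's turns out of the split text and tokenize them
speaker_words <- function(clean, tag) {
  spoken <- gsub(tag, "", clean[grep(tag, clean)])
  words <- tolower(unlist(strsplit(gsub("[^[:alnum:] ]", "", spoken), " ")))
  words[words != ""]
}
# equivalent to the explicit versions above:
# trump_an <- speaker_words(clean, "TRUMP:")
# clinton_an <- speaker_words(clean, "CLINTON:")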

How many total words were spoken by each candidate?

stri_stats_latex(trump)[4]
## Words 
##  8956
stri_stats_latex(clinton)[4]
## Words 
##  6542
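
stri_stats_latex counts words by LaTeX rules, so its totals won't exactly match the hand-rolled tokens above; comparing the two is a cheap consistency check (using the objects already defined):

# token counts from our own split, for comparison
length(trump_an)
length(clinton_an)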

How many unique words?

length(unique(as.character(tolower(stri_extract_all_words(trump, simplify = T)))))
## [1] 1371
length(unique(as.character(tolower(stri_extract_all_words(clinton, simplify = T)))))
## [1] 1422
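
Raw unique counts favor whoever talks more, so a type-token ratio (unique words divided by total words) is a fairer comparison; a quick sketch on the tokens built earlier:

# lexical diversity: share of tokens that are unique
length(unique(trump_an)) / length(trump_an)
length(unique(clinton_an)) / length(clinton_an)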

Summary stats on word length (in characters), plus frequency tables of each candidate's top terms.

summary(nchar(trump_an))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   3.000   4.000   4.218   5.000  21.000
summary(nchar(clinton_an))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   3.000   4.000   4.356   6.000  17.000
# frequency tables
head(sort(table(trump_an), decreasing = T), 40)
## trump_an
##     the     and      to       i     you       a      of    that    have 
##     290     267     258     229     203     172     171     162     147 
##      it      we      in      is     its    they    very     was     our 
##     123     120     107      81      77      75      71      69      61 
##     but because     are      be      do     not    with    were    this 
##      60      58      57      53      53      53      53      49      48 
##   going      at country      on    look     all     for    what  theyre 
##      46      45      45      45      44      43      42      42      41 
##    just    when    been      me 
##      39      39      38      38
head(sort(table(clinton_an), decreasing = T), 40)
## clinton_an
##    the     to    and   that      i     of     we      a     in   have 
##    250    240    186    145    136    134    126    122    103     84 
##    you     it     be     is     he    for     do    our    not   well 
##     75     65     59     59     56     46     45     43     41     41 
##  would   what    are  think   with     so    was  about   this     at 
##     41     40     39     39     38     37     37     36     36     34 
##     on    has people   they    can   were    but   know     as donald 
##     34     32     32     32     30     30     28     28     27     26
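
Both tables are dominated by function words, so filtering out a stopword list before sorting gives a more interesting view (a sketch; this short hand-picked vector is mine, and a real pass would use a fuller list):

# a deliberately short stopword list, just for illustration
stops <- c("the", "and", "to", "i", "you", "a", "of", "that", "have",
           "it", "we", "in", "is", "was", "our", "but", "are", "be",
           "do", "not", "with", "for", "at", "on", "he", "so", "as")
head(sort(table(trump_an[!trump_an %in% stops]), decreasing = T), 20)
head(sort(table(clinton_an[!clinton_an %in% stops]), decreasing = T), 20)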

Histograms!

hist(nchar(trump_an), main = "Trump Word Character Length", col = 'red', xlab = "Number of Characters in Word")

hist(nchar(clinton_an), main = "Clinton Word Character Length", col = 'blue', xlab = "Number of Characters in Word")
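
To eyeball the two distributions against each other, the same histograms can share one device and one x-axis (a sketch using base graphics):

# plot both distributions side by side for easier comparison
par(mfrow = c(1, 2))
hist(nchar(trump_an), main = "Trump", col = 'red', xlab = "Number of Characters in Word", xlim = c(0, 22))
hist(nchar(clinton_an), main = "Clinton", col = 'blue', xlab = "Number of Characters in Word", xlim = c(0, 22))
par(mfrow = c(1, 1))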

# how often did each candidate use a particular word?
term <- "me"
table(clinton_an)[names(table(clinton_an)) == term]
## me 
##  7
table(trump_an)[names(table(trump_an)) == term]
## me 
## 38
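
Looking terms up one at a time gets old, so the lookup could be wrapped in a tiny function (a sketch; compare_term is my name, not part of the original):

# count a term's uses by each candidate in one call
compare_term <- function(term) {
  c(trump = sum(trump_an == term), clinton = sum(clinton_an == term))
}
# e.g. compare_term("me") or compare_term("country")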

And that's that.