1. Load packages and import data

library(dplyr);library(ggplot2);require(plotrix)
library(tm);library(qdap);require(SnowballC);library(tau);require(wordcloud)


# Read the debate transcript (one row per utterance) straight from GitHub.
# The workspace is deliberately NOT wiped with rm(list = ls()): that call
# destroys unrelated objects in the user's session without resetting loaded
# packages or options. Run the script in a fresh R session instead.
file <- "https://raw.githubusercontent.com/tkvit/Data_No_Project/master/US_debate_presidential_2016.csv"
# Spell out `header` (no partial argument matching) and use TRUE/FALSE, which,
# unlike T/F, cannot be reassigned.
debate <- read.csv(file, header = TRUE, stringsAsFactors = FALSE)

# Extract each speaker's utterances and re-encode them to UTF-8
# (sub = "" drops any bytes that cannot be converted)
clinton <- debate %>%
  filter(Speaker == "Clinton") %>%
  pull(Text) %>%
  iconv(to = "utf-8", sub = "")
trump <- debate %>%
  filter(Speaker == "Trump") %>%
  pull(Text) %>%
  iconv(to = "utf-8", sub = "")

# Wrap each text vector as a tm source (one document per utterance)
clinton_source <- VectorSource(clinton)
trump_source <- VectorSource(trump)

# Build one corpus per speaker from those sources
clinton_corpus <- Corpus(clinton_source)
trump_corpus <- Corpus(trump_source)

The text needs some preliminary cleaning before it can be mined.

# Clean a tm corpus before n-gram counting.
#
# Args:
#   corpus: a tm corpus (e.g. built with Corpus(VectorSource(...))).
#   extra_stopwords: additional lower-case terms to drop on top of the
#     "en" and "SMART" stop-word lists.
# Returns:
#   The corpus with punctuation, numbers, upper case and stop words removed
#   and whitespace collapsed.
clean_corpus <- function(corpus,
                         extra_stopwords = c("thats", "people", "theyre",
                                             "think", "well", "lot")) {
  stop_terms <- c(stopwords("en"), stopwords("SMART"))
  # Punctuation is stripped from the text before stop words are removed, so
  # contractions such as "don't" reach removeWords as "dont" and would never
  # match the apostrophised entries. Add the punctuation-free spellings too.
  # Trade-off: a few real words that collide with them (e.g. "ill", "wed")
  # are dropped as well; they are indistinguishable from the contractions
  # once punctuation is gone.
  stop_terms <- unique(c(stop_terms, removePunctuation(stop_terms),
                         extra_stopwords))

  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removeWords, stop_terms)
  # Collapse whitespace last: the removals above leave runs of blanks behind
  tm_map(corpus, stripWhitespace)
}
clinton_corpus <- clean_corpus(clinton_corpus)
trump_corpus <- clean_corpus(trump_corpus)

2. Create bigrams

# Count word n-grams (bigrams by default) with tau::textcnt, asking for the
# counts in decreasing order
tokenize_ngrams <- function(x, n = 2) {
  textcnt(x, method = "string", n = n, decreasing = TRUE)
}

# Build an n-gram frequency table for a (cleaned) corpus.
#
# Args:
#   corpus: a tm corpus; the "content" element of every document is used.
#   n: number of words per n-gram (default 2, i.e. bigrams).
# Returns:
#   A data frame with columns `word` (the n-gram) and `freq` (its count),
#   in the order returned by tokenize_ngrams().
bigram_vit <- function(corpus, n = 2) {
  # lapply() always returns a list, unlike sapply(), whose result type
  # depends on the input
  doc_text <- unlist(lapply(corpus, "[", "content"), use.names = FALSE)
  sample_df <- data.frame(text = doc_text, stringsAsFactors = FALSE)
  # Drop the "textcnt" class so the counts behave as a plain named vector
  counts <- unclass(tokenize_ngrams(sample_df, n = n))
  data.frame(word = names(counts), freq = counts)
}

# Bigram frequency table for Clinton
bigram_clin <- bigram_vit(clinton_corpus)

# Bigram frequency table for Trump (`corpus` is left pointing at Trump's
# corpus, as before)
corpus <- trump_corpus
bigram_trum <- bigram_vit(corpus)

Now it’s ready to plot

3. Plot

Barplot

# Horizontal bar chart of term frequencies.
#   bigram: data frame with columns `word` and `freq`
#   tit:    plot title
# Returns the ggplot object.
plot_barplot <- function(bigram, tit) {
  # Make the labels a factor whose levels follow the row order
  bigram$word <- factor(bigram$word, levels = bigram$word)
  bar_chart <- ggplot(
    bigram,
    aes(x = reorder(word, freq), y = freq, fill = freq)
  ) +
    geom_bar(stat = "identity") +
    coord_flip() +
    ggtitle(tit) +
    theme(axis.title.y = element_blank())
  bar_chart
}

# Top 10 bigrams per speaker (the tables come back sorted by decreasing
# count). head() is used instead of [1:10, ] so that a table with fewer
# than 10 rows is not padded with NA rows.
plot_barplot(head(bigram_clin, 10), "Clinton")

plot_barplot(head(bigram_trum, 10), "Trump")

Word cloud

WordCloud for Clinton

# Use Clinton's bigram counts. `word_freqs` was never assigned before this
# point, so the call below used to fail with "object 'word_freqs' not found".
word_freqs <- bigram_clin
wordcloud(word_freqs$word, word_freqs$freq,
          max.words = 100, colors = brewer.pal(5, "Set1"))

WordCloud for Trump

# Point the word-cloud input at Trump's bigram counts
word_freqs <- bigram_trum

WordCloud for Trump

# Word cloud of at most 100 terms from `word_freqs`, coloured with the
# 5-colour "Set1" palette
wordcloud(
  words = word_freqs$word,
  freq = word_freqs$freq,
  max.words = 100,
  colors = brewer.pal(5, "Set1")
)

Pyramid Plot

First, we need a table of the bigrams that both speakers used.

# Give each count column its speaker's name so the two tables can be merged
# on the shared `word` column
names(bigram_clin)[2] <- "Clinton"
names(bigram_trum)[2] <- "Trump"

# Full outer join: keep bigrams used by either speaker, with 0 where a
# speaker never used the bigram
all_m <- merge(bigram_clin, bigram_trum, all = TRUE)
all_m[is.na(all_m)] <- 0

# Move the bigram text into the row names and keep only the two count columns
rownames(all_m) <- all_m$word
all_m <- all_m[, c(2, 3)]

# Keep only the bigrams that both speakers used at least once
common_words <- subset(all_m, all_m[, 1] > 0 & all_m[, 2] > 0)

# Rank the shared bigrams by how differently the two speakers used them:
# absolute gap between the two counts, largest gap first
difference <- abs(common_words[, 1] - common_words[, 2])
common_words <- cbind(common_words, difference)
common_words <- common_words[order(common_words[, 3], decreasing = TRUE), ]

# Take up to 25 rows. head() is used instead of [1:25, ] so that fewer than
# 25 shared bigrams does not produce NA rows.
top_shared <- head(common_words, 25)
top25_df <- data.frame(x = top_shared[, 1],
                       y = top_shared[, 2],
                       labels = rownames(top_shared))

# Back-to-back bar chart: Clinton's counts on the left, Trump's on the right,
# with the shared bigrams as centre labels.
# NOTE(review): the axis labels are hard-coded to 0:18, which presumably
# assumes no count exceeds 18. Confirm against the data.
pyramid.plot(
  lx = top25_df$x,
  rx = top25_df$y,
  labels = top25_df$labels,
  main = "Words in Common",
  gap = 8,
  laxlab = 0:18,
  raxlab = 0:18,
  unit = NULL,
  top.labels = c("Clinton", "Words", "Trump")
)

Credit: Many thanks to The Washington Post and the Kaggle team for the data.

Clinton and Trump during the 2 presidential debates