library(dplyr);library(ggplot2);require(plotrix)
library(tm);library(qdap);require(SnowballC);library(tau);require(wordcloud)
# NOTE(review): rm(list = ls()) wipes the caller's whole workspace. It is kept
# so the script behaves as before, but consider dropping it and running the
# script in a fresh R session instead.
rm(list = ls())

# Debate transcript; the code below reads its Speaker and Text columns.
file <- "https://raw.githubusercontent.com/tkvit/Data_No_Project/master/US_debate_presidential_2016.csv"
debate <- read.csv(file, header = TRUE, stringsAsFactors = FALSE)

# Extract each speaker's text; iconv(sub = "") drops any bytes that cannot be
# converted to UTF-8.
clinton <- debate %>%
  filter(Speaker == "Clinton") %>%
  .$Text %>%
  iconv(to = "utf-8", sub = "")
trump <- debate %>%
  filter(Speaker == "Trump") %>%
  .$Text %>%
  iconv(to = "utf-8", sub = "")

# Create one corpus per speaker, reusing the sources instead of rebuilding them
clinton_source <- VectorSource(clinton)
trump_source <- VectorSource(trump)
clinton_corpus <- Corpus(clinton_source)
trump_corpus <- Corpus(trump_source)
# We need to do some preliminary cleaning before mining the text.
# Cleaning the text with the tm package
# Normalise a tm corpus before n-gram counting.
#
# Args:
#   corpus: a tm corpus, e.g. built with Corpus(VectorSource(...)).
#
# Returns:
#   The corpus with punctuation and numbers removed, text lower-cased, stop
#   words dropped and runs of whitespace collapsed.
clean_corpus <- function(corpus) {
  # Punctuation is stripped before the stop words are applied, which is why
  # the extra words are listed without apostrophes ("thats", "theyre").
  drop_words <- c(
    stopwords("en"), stopwords("SMART"),
    "thats", "people", "theyre", "think",
    "well", "lot"
  )

  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removeWords, drop_words)

  # Collapse whitespace last: removing numbers and words leaves gaps behind,
  # so stripping earlier would not clean them up.
  tm_map(corpus, stripWhitespace)
}
# Run the shared cleaning steps over each speaker's corpus
clinton_corpus <- clinton_corpus %>% clean_corpus()
trump_corpus <- trump_corpus %>% clean_corpus()
# Make a bigram frequency table
# Thin wrapper around textcnt() (tau package) with method = "string".
#
# Args:
#   x: the text to count, passed straight through to textcnt().
#   n: the n-gram length handed to textcnt(); defaults to 2.
#
# Returns:
#   The textcnt() result, requested in decreasing order of count.
tokenize_ngrams <- function(x, n = 2) {
  textcnt(x, n = n, method = "string", decreasing = TRUE)
}
# Build a bigram frequency table from a tm corpus.
#
# Args:
#   corpus: a tm corpus (typically already passed through clean_corpus()).
#
# Returns:
#   A data frame with one row per bigram: `word` (the bigram, as character)
#   and `freq` (its count), in the order returned by tokenize_ngrams().
bigram_vit <- function(corpus) {
  # Pull each document's text through as.character() instead of indexing the
  # document's internal "content" element, so this does not depend on how the
  # corpus class stores its documents or on sapply() simplification.
  doc_text <- as.character(unlist(lapply(corpus, as.character), use.names = FALSE))
  sample_df <- data.frame(text = doc_text, stringsAsFactors = FALSE)

  # Drop the textcnt class to get a plain named vector of counts
  counts <- unclass(tokenize_ngrams(sample_df, n = 2))

  # The bigrams are the names of the count vector. stringsAsFactors is pinned
  # so `word` is character regardless of the R version's default.
  data.frame(
    word = as.character(names(counts)),
    freq = counts,
    stringsAsFactors = FALSE
  )
}
# Bigram frequency table for Clinton
corpus <- clinton_corpus
bigram_clin <- corpus %>% bigram_vit()

# Bigram frequency table for Trump
corpus <- trump_corpus
bigram_trum <- corpus %>% bigram_vit()
# Now the data is ready to plot
# Draw a horizontal bar chart of term frequencies.
#
# Args:
#   bigram: data frame with a `word` column (labels) and a `freq` column
#           (bar lengths).
#   tit:    plot title.
#
# Returns:
#   The ggplot object; bars are ordered by `freq` and filled by `freq`.
plot_barplot <- function(bigram, tit) {
  # Fix the factor levels to the incoming row order before plotting
  bigram$word <- factor(bigram$word, levels = bigram$word)

  chart <- ggplot(bigram, aes(x = reorder(word, freq), y = freq, fill = freq)) +
    geom_bar(stat = "identity") +
    coord_flip() +
    ggtitle(tit) +
    theme(axis.title.y = element_blank())
  chart
}
# First 10 bigrams per speaker.
# head() is used instead of [1:10, ] so a table with fewer than 10 rows is
# plotted as-is rather than padded with NA rows.
plot_barplot(head(bigram_clin, 10), "Clinton")
plot_barplot(head(bigram_trum, 10), "Trump")
# WordCloud for Clinton
# word_freqs must point at Clinton's table before plotting; it was previously
# used here without ever having been assigned.
word_freqs <- bigram_clin
wordcloud(word_freqs$word, word_freqs$freq,
          max.words = 100, colors = brewer.pal(5, "Set1"))
# WordCloud for Trump
# Switch word_freqs to Trump's bigram table
word_freqs <- bigram_trum
# Draw the cloud from Trump's bigrams (at most 100 terms)
wordcloud(word_freqs$word, word_freqs$freq,
          max.words = 100, colors = brewer.pal(5, "Set1"))
# First, we need a matrix of the words both speakers have in common
# Combine both speakers' bigram tables.
# The count columns are renamed so the merged table has one column per
# speaker; "word" is then the only shared column and is the join key.
colnames(bigram_clin)[2] <- "Clinton"
colnames(bigram_trum)[2] <- "Trump"
all_m <- merge(bigram_clin, bigram_trum, by = "word", all = TRUE)

# A bigram used by only one speaker has no count for the other: treat it as 0
all_m[is.na(all_m)] <- 0
rownames(all_m) <- all_m$word
all_m <- all_m[, c("Clinton", "Trump")]

# Subset shared terms: bigrams used at least once by both speakers
common_words <- subset(all_m, all_m$Clinton > 0 & all_m$Trump > 0)

# Rank the shared bigrams by the absolute difference between the two
# speakers' counts, largest difference first
difference <- abs(common_words$Clinton - common_words$Trump)
common_words <- cbind(common_words, difference)
common_words <- common_words[order(common_words$difference,
                                   decreasing = TRUE), ]

# Keep at most 25 rows; head() avoids the NA rows that [1:25, ] would add
# when fewer than 25 bigrams are shared.
top25_df <- head(common_words, 25)
top25_df <- data.frame(x = top25_df$Clinton,
                       y = top25_df$Trump,
                       labels = rownames(top25_df),
                       stringsAsFactors = FALSE)

# NOTE(review): the axis labels are hard-coded to 0:18; confirm the counts in
# top25_df stay within that range for this data.
pyramid.plot(top25_df$x, top25_df$y,
             labels = top25_df$labels,
             main = "Words in Common",
             gap = 8, laxlab = 0:18,
             raxlab = 0:18, unit = NULL,
             top.labels = c("Clinton",
                            "Words",
                            "Trump"))
# Credit: Many thanks to The Washington Post and the Kaggle team for the data.