This project classifies tweets by Hillary Clinton and Donald Trump published between April 17, 2016 and September 28, 2016. The tweets came from the candidates' original accounts, so none of them are retweets. Let's try to distinguish Clinton's tweets from Trump's using several classification algorithms.
library(tidyverse)
library(wrapr)
library(readit)
library(tidytext)
library(tm)
library(e1071)
library(wordcloud)
# load data
# the tweets were collected manually from various sources, e.g. Kaggle and GitHub
corpus <- readit("../data/tweets.txt")
## File guessed to be tab-delimited ("../data/tweets.txt")
## Parsed with column specification:
## cols(
## handle = col_character(),
## text = col_character()
## )
dim(corpus)
## [1] 4332 2
# check counts
table(corpus$handle)
##
## HillaryClinton realDonaldTrump
## 2629 1703
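The split is imbalanced: always guessing "HillaryClinton" would already be right about 61% of the time, so that is the baseline any classifier below has to beat. A quick check:
# majority-class baseline accuracy (~0.61)
max(table(corpus$handle)) / nrow(corpus)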
### cleanup steps ###
# first put the corpus in tm format
corpusClean <- Corpus(VectorSource(corpus$text))
# clean up
corpusClean <- tm_map(corpusClean, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpusClean, content_transformer(tolower)):
## transformation drops documents
corpusClean <- tm_map(corpusClean, removeWords, stopwords())
## Warning in tm_map.SimpleCorpus(corpusClean, removeWords, stopwords()):
## transformation drops documents
corpusClean <- tm_map(corpusClean, stripWhitespace)
## Warning in tm_map.SimpleCorpus(corpusClean, stripWhitespace):
## transformation drops documents
corpusClean <- tm_map(corpusClean, removePunctuation)
## Warning in tm_map.SimpleCorpus(corpusClean, removePunctuation):
## transformation drops documents
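Before building features, it is worth spot-checking that the transformations did what we expect, e.g. by comparing one raw tweet with its cleaned counterpart:
# spot-check: first tweet before vs. after cleaning
corpus$text[1]
as.character(corpusClean[[1]])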
# convert it into a dtm (row per document, column per word)
dtm <- DocumentTermMatrix(corpusClean)
inspect(dtm)
## <<DocumentTermMatrix (documents: 4332, terms: 10193)>>
## Non-/sparse entries: 43875/44112201
## Sparsity : 100%
## Maximal term length: 32
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs —hillary america donald great hillary make president thank trump
## 1282 1 0 1 0 0 0 0 0 1
## 1287 0 0 0 0 0 0 0 0 0
## 2644 0 0 0 0 0 0 0 0 0
## 2774 0 0 1 0 0 0 0 0 1
## 2878 0 0 0 0 1 0 0 0 0
## 2881 0 0 0 0 0 0 0 0 1
## 3143 0 0 0 1 0 0 0 0 0
## 41 0 0 0 0 0 0 0 0 0
## 593 0 1 0 1 0 0 0 0 0
## 855 0 1 2 1 0 2 0 0 1
## Terms
## Docs will
## 1282 0
## 1287 0
## 2644 0
## 2774 1
## 2878 0
## 2881 0
## 3143 0
## 41 0
## 593 0
## 855 0
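As the inspect() output shows, the matrix is almost entirely zeros, which is expected for 140-character documents spread over a 10,193-term vocabulary. The frequency filter below trims this; tm's removeSparseTerms() is an alternative worth knowing, sketched here but not used in this pipeline:
# alternative: drop terms absent from more than 99% of tweets
dtmSlim <- removeSparseTerms(dtm, sparse = 0.99)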
# set seed and draw a random index for the train/test split
set.seed(1234)
index <- sample(nrow(corpus), .7 * nrow(corpus)) # 70/30 train/test split
# split original corpus into train and test sets, each set contains the "handle" (dependent variable)
train_step_1 <- corpus[index, ]
test_step_1 <- corpus[-index, ]
# frequency filter: keep only words that appear at least f times in the whole corpus
f <- 5
features <- findFreqTerms(dtm, f)
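Before committing to f = 5, check how much the filter shrinks the vocabulary:
# how many of the 10,193 terms survive the frequency filter?
length(features)
head(features)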
# for each split of the clean corpus:
# 1. turn it into a dtm
# 2. restrict it to the shared dictionary of features - a crucial step; without it,
#    the train and test matrices would have different columns and prediction would fail
# 3. binarize counts into 1/0 (presence/absence), which suits classifiers such as Naive Bayes
# 4. turn it into a data.frame
system.time(
  train_step_2 <- corpusClean[index] %>%
    DocumentTermMatrix(control = list(dictionary = features)) %>% # dictionary restricts columns to the shared features
    apply(MARGIN = 2, function(x) ifelse(x > 0, 1, 0)) %>%
    as.data.frame
)
## user system elapsed
## 4.02 0.06 4.11
system.time(
  test_step_2 <- corpusClean[-index] %>%
    DocumentTermMatrix(control = list(dictionary = features)) %>%
    apply(MARGIN = 2, function(x) ifelse(x > 0, 1, 0)) %>%
    as.data.frame
)
## user system elapsed
## 1.11 0.03 1.17
# final step: reattach the handle (step 1) to the binarized features (step 2)
train <- cbind(handle = factor(train_step_1$handle), train_step_2) %>% as.data.frame
test <- cbind(handle = factor(test_step_1$handle), test_step_2) %>% as.data.frame
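The dictionary guarantees both frames share one feature space, but a defensive check is cheap (not part of the original pipeline):
# train and test must have identical columns for predict() to work
stopifnot(identical(names(train), names(test)))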
system.time( fit_svm <- e1071::svm(handle~., train) )
## user system elapsed
## 65.35 0.35 69.25
# summary(fit_svm)
# fit a prediction
system.time( fit_svm_pred <- predict(fit_svm, na.omit(test)) )
## user system elapsed
## 8.93 0.02 9.06
# classification outcome
ftable(fit_svm_pred, test$handle,
dnn = c("Predicted", "Actual")) -> table_svm
table_svm
## Actual HillaryClinton realDonaldTrump
## Predicted
## HillaryClinton 756 156
## realDonaldTrump 11 377
table_svm %>% prop.table(., margin = 2)*100 -> accuracy_svm
round(accuracy_svm, 1)
## Actual HillaryClinton realDonaldTrump
## Predicted
## HillaryClinton 98.6 29.3
## realDonaldTrump 1.4 70.7
# read off the confusion matrix, treating Clinton as the positive class
tp_svm <- 756
fp_svm <- 156
tn_svm <- 377
fn_svm <- 11
sensitivity_svm <- tp_svm / (tp_svm + fn_svm) # equivalent to recall
specificity_svm <- tn_svm / (tn_svm + fp_svm)
precision_svm <- tp_svm / (tp_svm + fp_svm)
total_accuracy_svm <- (tp_svm + tn_svm) / sum(tp_svm, fp_svm, tn_svm, fn_svm)
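Precision and recall can be folded into a single F1 score for easier comparison; the same formula applies to the other models below:
# F1: harmonic mean of precision and recall
f1_svm <- 2 * precision_svm * sensitivity_svm / (precision_svm + sensitivity_svm)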
system.time( fit_nb <- e1071::naiveBayes(handle ~., train) )
## user system elapsed
## 0.70 0.02 0.72
# summary(fit_nb)
# fit a prediction
system.time( fit_nb_pred <- predict(fit_nb, na.omit(test)) )
## user system elapsed
## 32.22 0.03 36.66
# classification outcome
ftable(fit_nb_pred, test$handle,
dnn = c("Predicted", "Actual")) -> table_nb
table_nb
## Actual HillaryClinton realDonaldTrump
## Predicted
## HillaryClinton 195 49
## realDonaldTrump 572 484
table_nb %>% prop.table(., margin = 2)*100 -> accuracy_nb
round(accuracy_nb, 1)
## Actual HillaryClinton realDonaldTrump
## Predicted
## HillaryClinton 25.4 9.2
## realDonaldTrump 74.6 90.8
tp_nb <- 195
fp_nb <- 49
tn_nb <- 484
fn_nb <- 572
sensitivity_nb <- tp_nb / (tp_nb + fn_nb) # equivalent to recall
specificity_nb <- tn_nb / (tn_nb + fp_nb)
precision_nb <- tp_nb / (tp_nb + fp_nb)
total_accuracy_nb <- (tp_nb + tn_nb) / sum(tp_nb, fp_nb, tn_nb, fn_nb)
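A likely culprit for NB's weak showing: e1071::naiveBayes treats numeric predictors as Gaussian, so the 0/1 indicators above are modeled as normal distributions. A common tweak is to recode them as two-level factors so the classifier uses categorical likelihoods with Laplace smoothing; a sketch, not run here:
# recode indicators as factors so naiveBayes treats them as categorical;
# laplace = 1 adds one pseudo-count per term
# (the test set needs the same recoding before predict())
train_fct <- train
train_fct[, -1] <- lapply(train_fct[, -1], factor, levels = c(0, 1))
fit_nb_fct <- e1071::naiveBayes(handle ~ ., train_fct, laplace = 1)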
system.time( fit_lr <- glm(handle ~., train, family = "binomial") )
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## user system elapsed
## 161.48 0.83 182.58
# summary(fit_lr)
# fit a prediction
system.time( fit_lr_pred <- predict(fit_lr, newdata = test[, -1], type = "response") )
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## user system elapsed
## 0.14 0.02 0.18
# classification outcome (note: rows are the actual handle and columns the
# thresholded prediction - the transpose of the SVM/NB tables above)
ftable(test$handle, fit_lr_pred > 0.5) -> table_lr
table_lr
## FALSE TRUE
##
## HillaryClinton 592 175
## realDonaldTrump 151 382
table_lr %>% prop.table(., margin = 1)*100 -> accuracy_lr
round(accuracy_lr, 1)
## FALSE TRUE
##
## HillaryClinton 77.2 22.8
## realDonaldTrump 28.3 71.7
# note: glm codes the first factor level (HillaryClinton) as 0, so the TRUE
# column is "predicted realDonaldTrump"; keeping Clinton as the positive class:
tp_lr <- 592
fp_lr <- 151
tn_lr <- 382
fn_lr <- 175
sensitivity_lr <- tp_lr / (tp_lr + fn_lr) # equivalent to recall
specificity_lr <- tn_lr / (tn_lr + fp_lr)
precision_lr <- tp_lr / (tp_lr + fp_lr)
total_accuracy_lr <- (tp_lr + tn_lr) / sum(tp_lr, fp_lr, tn_lr, fn_lr)
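The rank-deficiency warning is expected: the model estimates one coefficient per dictionary term, more than the roughly 3,000 training tweets can pin down. A penalized fit is the usual remedy; a sketch with glmnet (assuming the package is installed; not run here):
# lasso-penalized logistic regression; cv.glmnet picks lambda by cross-validation
library(glmnet)
x_train <- as.matrix(train[, -1])
x_test <- as.matrix(test[, -1])
fit_lasso <- cv.glmnet(x_train, train$handle, family = "binomial")
fit_lasso_pred <- predict(fit_lasso, newx = x_test, type = "class")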
Overall, the Support Vector Machine (SVM) outperformed the others: it was extremely sensitive in identifying Clinton's tweets and had the highest total accuracy. Interestingly, Naive Bayes (NB) was the best at identifying Trump's tweets (highest specificity), despite its poor overall accuracy.
data.frame(classification = c("Support Vector Machine", "Naive Bayes", "Logistic Regression"),
sensitivity = round(c(sensitivity_svm, sensitivity_nb, sensitivity_lr), 3),
specificity = round(c(specificity_svm, specificity_nb, specificity_lr), 3),
precision = round(c(precision_svm, precision_nb, precision_lr), 3),
total_accuracy = round(c(total_accuracy_svm, total_accuracy_nb, total_accuracy_lr), 3),
stringsAsFactors = F) %>%
tidyr::gather(., "measure", "outcome", -classification) %>%
tidyr::spread(classification, outcome)
## measure Logistic Regression Naive Bayes Support Vector Machine
## 1 precision 0.797 0.799 0.829
## 2 sensitivity 0.772 0.254 0.986
## 3 specificity 0.717 0.908 0.707
## 4 total_accuracy 0.749 0.522 0.872
# token pattern: split on anything that is not a letter, digit, #, @, or an
# in-word apostrophe, so hashtags and @mentions survive tokenization
reg <- "([^A-Za-z\\d#@']|'(?![A-Za-z\\d#@]))"
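To check what the pattern keeps, here is a toy run on a made-up tweet (hypothetical text, not from the data):
# expected tokens: "great", "crowd", "in", "ohio", "#maga", "@example"
tibble(text = "Great crowd in Ohio! #MAGA @example") %>%
  tidytext::unnest_tokens(word, text, token = "regex", pattern = reg)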
clinton <- corpus %>%
  dplyr::filter(handle == "HillaryClinton") %>%
  dplyr::select(text) %>%
  # strip links and escaped ampersands before tokenizing
  dplyr::mutate(text = str_replace_all(text, "https://t.co/[A-Za-z\\d]+|&amp;", "")) %>%
  tidytext::unnest_tokens(word, text, token = "regex", pattern = reg) %>%
  filter(!word %in% c(stop_words$word,
                      "trump", "hillary", "donald", "clinton", "trump's", "hillary's"),
         str_detect(word, "[a-z]")) %.>% # wrapr dot pipe, so plyr::count can reference the column
  plyr::count(.$word) %>%
  arrange(desc(freq)) %>%
  top_n(200) %>%
  rename(word = x)
## Selecting by freq
# clinton
set.seed(20161108)
wordcloud(words = clinton$word, freq = clinton$freq,
          colors = brewer.pal(8, "RdBu"), scale = c(4, .1), rot.per = .2)
trump <- corpus %>%
  dplyr::filter(handle == "realDonaldTrump") %>%
  dplyr::select(text) %>%
  # strip links and escaped ampersands before tokenizing
  dplyr::mutate(text = str_replace_all(text, "https://t.co/[A-Za-z\\d]+|&amp;", "")) %>%
  tidytext::unnest_tokens(word, text, token = "regex", pattern = reg) %>%
  filter(!word %in% c(stop_words$word,
                      "trump", "hillary", "donald", "clinton", "trump's", "hillary's"),
         str_detect(word, "[a-z]")) %.>% # wrapr dot pipe, so plyr::count can reference the column
  plyr::count(.$word) %>%
  arrange(desc(freq)) %>%
  top_n(200) %>%
  rename(word = x)
## Selecting by freq
# trump
set.seed(20161108)
wordcloud(words = trump$word, freq = trump$freq,
          colors = brewer.pal(8, "RdBu"), scale = c(4, .1), rot.per = .2)