Text Classification

This project classifies tweets from Hillary Clinton and Donald Trump published between April 17, 2016 and September 28, 2016. The tweets came from their original accounts, so none are retweets. Let's separate Clinton's tweets from Trump's using different classification algorithms.

library(tidyverse)
library(wrapr)
library(readit)
library(tidytext)
library(tm)
library(e1071)
library(wordcloud)
# load data
# the tweets were manually collected from various sources, e.g. Kaggle, GitHub
corpus <- readit("../data/tweets.txt") 
## File guessed to be tab-delimited ("../data/tweets.txt")
## Parsed with column specification:
## cols(
##   handle = col_character(),
##   text = col_character()
## )
# alternatively, recode the handle as 0/1:
# corpus %>% mutate(handle = dplyr::case_when(handle == 'HillaryClinton' ~ 0, TRUE ~ 1))

dim(corpus)
## [1] 4332    2
# check counts
table(corpus$handle)
## 
##  HillaryClinton realDonaldTrump 
##            2629            1703
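The classes are a bit imbalanced: Clinton's account contributes about 61% of the tweets. That majority-class share is a useful accuracy floor for the models below, since a classifier that always predicts HillaryClinton would already score roughly 60.7%. A quick check from the counts above:

max(table(corpus$handle)) / nrow(corpus)  # majority-class baseline, ~0.607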
### cleanup steps ###

# first put the corpus in tm format
corpusClean <- Corpus(VectorSource(corpus$text))

# clean up
corpusClean <- tm_map(corpusClean, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpusClean, content_transformer(tolower)):
## transformation drops documents
corpusClean <- tm_map(corpusClean, removeWords, stopwords())
## Warning in tm_map.SimpleCorpus(corpusClean, removeWords, stopwords()):
## transformation drops documents
corpusClean <- tm_map(corpusClean, stripWhitespace)
## Warning in tm_map.SimpleCorpus(corpusClean, stripWhitespace):
## transformation drops documents
corpusClean <- tm_map(corpusClean, removePunctuation)
## Warning in tm_map.SimpleCorpus(corpusClean, removePunctuation):
## transformation drops documents
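# note: the "transformation drops documents" warnings are a known quirk of
# tm_map() on a SimpleCorpus -- no documents are actually dropped, so they
# can be ignored here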
# convert it into a dtm (row per document, column per word)
dtm <- DocumentTermMatrix(corpusClean)
inspect(dtm)
## <<DocumentTermMatrix (documents: 4332, terms: 10193)>>
## Non-/sparse entries: 43875/44112201
## Sparsity           : 100%
## Maximal term length: 32
## Weighting          : term frequency (tf)
## Sample             :
##       Terms
## Docs   —hillary america donald great hillary make president thank trump
##   1282        1       0      1     0       0    0         0     0     1
##   1287        0       0      0     0       0    0         0     0     0
##   2644        0       0      0     0       0    0         0     0     0
##   2774        0       0      1     0       0    0         0     0     1
##   2878        0       0      0     0       1    0         0     0     0
##   2881        0       0      0     0       0    0         0     0     1
##   3143        0       0      0     1       0    0         0     0     0
##   41          0       0      0     0       0    0         0     0     0
##   593         0       1      0     1       0    0         0     0     0
##   855         0       1      2     1       0    2         0     0     1
##       Terms
## Docs   will
##   1282    0
##   1287    0
##   2644    0
##   2774    1
##   2878    0
##   2881    0
##   3143    0
##   41      0
##   593     0
##   855     0
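The inspected sample confirms how sparse the matrix is: over 10,000 terms, nearly all zero for any given tweet. The frequency filter below keeps only terms appearing at least f times in the whole corpus; a similar effect can be had with tm's removeSparseTerms(), shown here as an optional sketch (the 0.999 threshold is illustrative):

# optional: keep only terms present in more than 0.1% of documents
dtm_small <- removeSparseTerms(dtm, 0.999)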
# set index
set.seed(1234)
index <- sample(seq_len(nrow(corpus)), floor(0.7 * nrow(corpus)))  # split it by 70% vs 30%

# split original corpus into train and test sets, each set contains the "handle" (dependent variable)
train_step_1 <- corpus[index, ]
test_step_1 <- corpus[-index, ]

# set frequency filter, i.e. only include words that appear f or more times in the whole corpus
f <- 5
features <- findFreqTerms(dtm, f)
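It is worth checking how many terms survive the filter, since that number is the width of the model matrices built next:

length(features)  # number of terms kept as model features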

# split clean corpus
# turn it into dtm
# set features filter - an important step: without it, sparse terms that
# appear only in the training set would be missing from the test set and
# prediction would fail
# convert value into 1 or 0 for classification, e.g. Naive Bayes
# turn it into a data.frame

system.time(
train_step_2 <- corpusClean[index] %>% 
        # the dictionary filter guarantees train and test get identical columns;
        # note a bare `global = c(2, Inf)` control option is silently ignored by
        # tm (it only takes effect as `bounds = list(global = ...)`), so it is
        # dropped here
        DocumentTermMatrix(., list(dictionary = features)) %>%
        # binarize counts into presence/absence features for classification
        apply(MARGIN = 2, function(x) ifelse(x > 0, 1, 0)) %>%
        as.data.frame
)
##    user  system elapsed 
##    4.02    0.06    4.11
system.time(
test_step_2 <- corpusClean[-index] %>% 
        DocumentTermMatrix(., list(dictionary = features)) %>%
        apply(MARGIN = 2, function(x) ifelse(x > 0, 1, 0)) %>%
        as.data.frame
)
##    user  system elapsed 
##    1.11    0.03    1.17
# final step, put step 1 and 2 together
train <- cbind(handle = factor(train_step_1$handle), train_step_2) %>% as.data.frame
test <- cbind(handle = factor(test_step_1$handle), test_step_2) %>% as.data.frame
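Before fitting anything, a quick sanity check that the two sets line up, i.e. that the shared dictionary produced identical feature columns:

dim(train); dim(test)
identical(names(train), names(test))  # should be TRUE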

Classification Models

Support Vector Machine

system.time( fit_svm <- e1071::svm(handle~., train) )
##    user  system elapsed 
##   65.35    0.35   69.25
# summary(fit_svm)

# fit a prediction
system.time( fit_svm_pred <- predict(fit_svm, na.omit(test)) )
##    user  system elapsed 
##    8.93    0.02    9.06
# classification outcome
ftable(fit_svm_pred, test$handle, 
       dnn = c("Predicted", "Actual")) -> table_svm
table_svm
##                 Actual HillaryClinton realDonaldTrump
## Predicted                                            
## HillaryClinton                    756             156
## realDonaldTrump                    11             377
table_svm %>% prop.table(., margin = 2)*100 -> accuracy_svm
round(accuracy_svm, 1)
##                 Actual HillaryClinton realDonaldTrump
## Predicted                                            
## HillaryClinton                   98.6            29.3
## realDonaldTrump                   1.4            70.7
# treating HillaryClinton as the positive class
tp_svm <- 756  # predicted Clinton, actually Clinton
fp_svm <- 156  # predicted Clinton, actually Trump
tn_svm <- 377  # predicted Trump, actually Trump
fn_svm <- 11   # predicted Trump, actually Clinton

sensitivity_svm <- tp_svm / (tp_svm + fn_svm)  # equivalent to recall
specificity_svm <- tn_svm / (tn_svm + fp_svm)
precision_svm <- tp_svm / (tp_svm + fp_svm) 
total_accuracy_svm <- (tp_svm + tn_svm) / sum(tp_svm, fp_svm, tn_svm, fn_svm)

Naive Bayes

system.time( fit_nb <- e1071::naiveBayes(handle ~., train) )
##    user  system elapsed 
##    0.70    0.02    0.72
# summary(fit_nb)

# fit a prediction
system.time( fit_nb_pred <- predict(fit_nb, na.omit(test)) )
##    user  system elapsed 
##   32.22    0.03   36.66
# classification outcome
ftable(fit_nb_pred, test$handle, 
       dnn = c("Predicted", "Actual")) -> table_nb
table_nb
##                 Actual HillaryClinton realDonaldTrump
## Predicted                                            
## HillaryClinton                    195              49
## realDonaldTrump                   572             484
table_nb %>% prop.table(., margin = 2)*100 -> accuracy_nb
round(accuracy_nb, 1)
##                 Actual HillaryClinton realDonaldTrump
## Predicted                                            
## HillaryClinton                   25.4             9.2
## realDonaldTrump                  74.6            90.8
# again treating HillaryClinton as the positive class
tp_nb <- 195
fp_nb <- 49
tn_nb <- 484
fn_nb <- 572

sensitivity_nb <- tp_nb / (tp_nb + fn_nb)  # equivalent to recall
specificity_nb <- tn_nb / (tn_nb + fp_nb)
precision_nb <- tp_nb / (tp_nb + fp_nb) 
total_accuracy_nb <- (tp_nb + tn_nb) / sum(tp_nb, fp_nb, tn_nb, fn_nb)
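Naive Bayes looks surprisingly weak here: it calls most tweets Trump's. One likely reason (an interpretation, not verified on this data) is that e1071::naiveBayes treats numeric predictors as Gaussian, so the 0/1 columns are modeled with normal densities rather than as Bernoulli events. Converting the indicators to factors makes naiveBayes estimate categorical conditional probabilities instead, which usually suits binary text features far better; a sketch:

# recode 0/1 indicators as factors so naiveBayes uses categorical tables
# instead of Gaussian densities; laplace = 1 smooths away zero counts
train_f <- train
train_f[, -1] <- lapply(train_f[, -1], factor, levels = c(0, 1))
test_f <- test
test_f[, -1] <- lapply(test_f[, -1], factor, levels = c(0, 1))
fit_nb_f <- e1071::naiveBayes(handle ~ ., train_f, laplace = 1)
fit_nb_f_pred <- predict(fit_nb_f, test_f)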

Logistic Regression

system.time( fit_lr <- glm(handle ~., train, family = "binomial") )
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##    user  system elapsed 
##  161.48    0.83  182.58
# summary(fit_lr)

# fit a prediction
system.time( fit_lr_pred <- predict(fit_lr, newdata = test[, -1], type = "response") )
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
##    user  system elapsed 
##    0.14    0.02    0.18
# classification outcome
ftable(test$handle, fit_lr_pred > 0.5) -> table_lr
table_lr
##                  FALSE TRUE
##                            
## HillaryClinton     592  175
## realDonaldTrump    151  382
table_lr %>% prop.table(., margin = 1)*100 -> accuracy_lr
round(accuracy_lr, 1)
##                  FALSE TRUE
##                            
## HillaryClinton    77.2 22.8
## realDonaldTrump   28.3 71.7
# glm() treats the second factor level (realDonaldTrump) as the "success", so
# TRUE above means "predicted Trump"; keeping HillaryClinton as the positive
# class, consistent with the other models:
tp_lr <- 592  # predicted Clinton (FALSE), actually Clinton
fp_lr <- 151  # predicted Clinton (FALSE), actually Trump
tn_lr <- 382  # predicted Trump (TRUE), actually Trump
fn_lr <- 175  # predicted Trump (TRUE), actually Clinton

sensitivity_lr <- tp_lr / (tp_lr + fn_lr)  # equivalent to recall
specificity_lr <- tn_lr / (tn_lr + fp_lr)
precision_lr <- tp_lr / (tp_lr + fp_lr) 
total_accuracy_lr <- (tp_lr + tn_lr) / sum(tp_lr, fp_lr, tn_lr, fn_lr)
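The warnings from glm are telling: with thousands of binary features, the design matrix is rank-deficient and some terms separate the two classes perfectly, so the unpenalized fit is unstable. A penalized logistic regression is the usual remedy; a sketch using glmnet (an addition on my part, not part of the original pipeline):

# cross-validated, penalized logistic regression; copes with rank
# deficiency and perfect separation
x_train <- as.matrix(train[, -1])
x_test  <- as.matrix(test[, -1])
fit_glmnet <- glmnet::cv.glmnet(x_train, train$handle, family = "binomial")
pred_glmnet <- predict(fit_glmnet, newx = x_test, s = "lambda.min", type = "class")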

Summary Table

Overall, the Support Vector Machine (SVM) outperformed the others, with the highest total accuracy and near-perfect sensitivity in identifying Clinton's tweets. Interestingly, Naive Bayes (NB) was the best at identifying Trump's tweets (highest specificity), although its overall accuracy was poor; logistic regression fell in between.

data.frame(classification = c("Support Vector Machine", "Naive Bayes", "Logistic Regression"),
           sensitivity = round(c(sensitivity_svm, sensitivity_nb, sensitivity_lr), 3),
           specificity = round(c(specificity_svm, specificity_nb, specificity_lr), 3),
           precision = round(c(precision_svm, precision_nb, precision_lr), 3),
           total_accuracy = round(c(total_accuracy_svm, total_accuracy_nb, total_accuracy_lr), 3),
           stringsAsFactors = FALSE) %>%
        tidyr::gather(., "measure", "outcome", -classification) %>%
        tidyr::spread(classification, outcome)
##          measure Logistic Regression Naive Bayes Support Vector Machine
## 1      precision               0.797       0.799                  0.829
## 2    sensitivity               0.772       0.254                  0.986
## 3    specificity               0.717       0.908                  0.707
## 4 total_accuracy               0.749       0.522                  0.872
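Since precision and sensitivity trade off differently across the three models, an F1 score (their harmonic mean) is a convenient single summary to compute from the values above:

f1 <- function(precision, recall) 2 * precision * recall / (precision + recall)
f1(precision_svm, sensitivity_svm)
f1(precision_nb, sensitivity_nb)
f1(precision_lr, sensitivity_lr)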

Word Cloud

Clinton

reg <- "([^A-Za-z\\d#@']|'(?![A-Za-z\\d#@]))"

clinton <- corpus %>%
        dplyr::filter(handle == "HillaryClinton") %>%
        dplyr::select(text) %>%
        dplyr::mutate(text = str_replace_all(text, "https://t.co/[A-Za-z\\d]+|&amp;", "")) %>%
        tidytext::unnest_tokens(word, text, token = "regex", pattern = reg) %>%
        filter(!word %in% c(stop_words$word, 
                            c("trump", "hillary", "donald", "clinton", "trump's", "hillary's")),
               str_detect(word, "[a-z]")) %.>%  # wrapr dot-pipe: `.` is the piped value
        plyr::count(.$word) %>%
        arrange(desc(freq)) %>%
        top_n(200) %>%
        rename("word" = x) 
## Selecting by freq
# clinton

set.seed(20161108)
wordcloud(words = clinton$word, freq = clinton$freq, 
          colors = brewer.pal(8, "RdBu"), scale = c(4, .1), rot.per = .2)

Trump

trump <- corpus %>%
        dplyr::filter(handle == "realDonaldTrump") %>%
        dplyr::select(text) %>%
        dplyr::mutate(text = str_replace_all(text, "https://t.co/[A-Za-z\\d]+|&amp;", "")) %>%
        tidytext::unnest_tokens(word, text, token = "regex", pattern = reg) %>%
        filter(!word %in% c(stop_words$word, 
                            c("trump", "hillary", "donald", "clinton", "trump's", "hillary's")),
               str_detect(word, "[a-z]")) %.>%
        plyr::count(.$word) %>%
        arrange(desc(freq)) %>%
        top_n(200) %>%
        rename("word" = x) 
## Selecting by freq
# trump

set.seed(20161108)
wordcloud(words = trump$word, freq = trump$freq, 
          colors = brewer.pal(8, "RdBu"), scale = c(4, .1), rot.per = .2)
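The Clinton and Trump pipelines are identical apart from the handle, so the shared steps could be factored into a small helper; a sketch (top_words is a hypothetical name, not in the original code):

top_words <- function(tweets, who, n = 200) {
        tweets %>%
                dplyr::filter(handle == who) %>%
                dplyr::select(text) %>%
                dplyr::mutate(text = str_replace_all(text, "https://t.co/[A-Za-z\\d]+|&amp;", "")) %>%
                tidytext::unnest_tokens(word, text, token = "regex", pattern = reg) %>%
                filter(!word %in% c(stop_words$word,
                                    "trump", "hillary", "donald", "clinton", "trump's", "hillary's"),
                       str_detect(word, "[a-z]")) %.>%
                plyr::count(.$word) %>%
                arrange(desc(freq)) %>%
                top_n(n) %>%
                rename("word" = x)
}
# e.g. clinton <- top_words(corpus, "HillaryClinton")
#      trump   <- top_words(corpus, "realDonaldTrump")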