Introduction

As of November 10, 2020, Biden has clearly won the election, with an electoral vote count of 290 to 214. The goal of this analysis is to conduct sentiment analysis on recent Twitter data from the United States of America (USA). The data were extracted on the same day this document was published.

Libraries Needed

library(rtweet)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(stringr)
library(tidytext)
library(tidyr)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(scales)

setwd("C:/Users/EUVIE/Desktop/njd files detail desktop/text mining with R book/rtwitter_2020_us_election")

source("2020_us_election.R") # This contains my api key and tokens
## <Token>
## <oauth_endpoint>
##  request:   https://api.twitter.com/oauth/request_token
##  authorize: https://api.twitter.com/oauth/authenticate
##  access:    https://api.twitter.com/oauth/access_token
## <oauth_app> Nilrey Twitter Test App
##   key:    xMM6HZyWPa3HhAX40iJblL9Rl
##   secret: <hidden>
## <credentials> oauth_token, oauth_token_secret
## ---

Extracting Tweets

We will use the rtweet library to extract Twitter data. Since the Twitter search API is limited to 18,000 tweets per 15 minutes, we will extract the 9,000 most recent tweets each for #trump and #biden. We will restrict the results to tweets from the USA written in English.

rt_trump <- search_tweets(
  "#trump", n = 9000, include_rts = FALSE, retryonratelimit = TRUE,
  lang = "en", geocode = lookup_coords("usa")
)
## Warning: Rate limit exceeded - 88
## Warning: Rate limit exceeded
rt_biden <- search_tweets(
  "#biden", n = 9000, include_rts = FALSE, retryonratelimit = TRUE,
  lang = "en", geocode = lookup_coords("usa")
)
## retry on rate limit...
## waiting about 9 minutes...
## Warning: Rate limit exceeded - 88

## Warning: Rate limit exceeded
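
Hitting the rate limit produces the warnings above; with retryonratelimit = TRUE, rtweet simply waits and resumes. The remaining quota for the search endpoint can also be inspected before querying. A minimal sketch using rtweet's rate_limit() (the exact columns returned can differ slightly between rtweet versions):

# check how many search requests remain in the current 15-minute window
search_limit <- rate_limit(query = "search/tweets")
search_limit[, c("query", "limit", "remaining", "reset")]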

Data Pre-processing

We will remove digits and punctuation from the tweet text.

# first let's merge the two dataframes

rt_trump <- mutate(rt_trump, president = "Trump")
rt_biden <- mutate(rt_biden, president = "Biden")

rt_all <- bind_rows(rt_trump, rt_biden)

# create a function to return a clean character vector

clean_text_col <- function(df, col_name){
        text_col <-  df[[col_name]]
        text_col <- str_replace_all(text_col, "[:punct:]", "")
        text_col <- str_replace_all(text_col, "[:digit:]", "")
        
        df[col_name] <- text_col
        
        return(df)
}

rt_all <- clean_text_col(rt_all, "text")
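
# as a quick sanity check, the cleaning can be tried on a made-up example
# (the sample text below is purely illustrative, not part of the extracted data)
demo_df <- data.frame(text = "Votes counted: 1,234 in Georgia!!!", stringsAsFactors = FALSE)
clean_text_col(demo_df, "text")$text
# punctuation and digits are stripped; any leftover extra spaces are harmless
# because unnest_tokens() splits on whitespace later on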

# retain some columns for analysis
col_retain <- c("user_id", "status_id", "created_at", "screen_name", "text", "source", "is_retweet", "retweet_count", "lang", "president")

rt_all <- select(rt_all, all_of(col_retain))
dim(rt_all)
## [1] 33591    10
table(rt_all$president)
## 
## Biden Trump 
## 16813 16778

Check for Duplicate Tweets

Let’s check whether there are any duplicate status IDs, then remove the duplicates.

rt_all %>% count(status_id) %>% 
  mutate(n_group = case_when(n>1~"2 or more",
                             TRUE~"1 only")) %>% 
  count(n_group)
## # A tibble: 2 x 2
##   n_group       n
##   <chr>     <int>
## 1 1 only    29165
## 2 2 or more  2213
rt_all <- distinct(rt_all)
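
Note that distinct() only removes rows that are identical across every retained column, so two copies of the same tweet that differ in, say, retweet_count both survive. When the goal is strictly one row per tweet, deduplicating on the ID alone may be preferable; a sketch (the object name below is my own, and the rest of the analysis keeps using rt_all):

rt_all_one_per_id <- distinct(rt_all, status_id, .keep_all = TRUE)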

Tokenize the Extracted Tweet Data

rt_all_tokenize <- unnest_tokens(rt_all, word, text)
knitr::kable(head(rt_all_tokenize))
| user_id | status_id | created_at | screen_name | source | is_retweet | retweet_count | lang | president | word |
|---|---|---|---|---|---|---|---|---|---|
| 2190845029 | 1326434390885404672 | 2020-11-11 07:59:37 | usacustomers | WordPress.com | FALSE | 0 | en | Trump | texas |
| 2190845029 | 1326434390885404672 | 2020-11-11 07:59:37 | usacustomers | WordPress.com | FALSE | 0 | en | Trump | teens |
| 2190845029 | 1326434390885404672 | 2020-11-11 07:59:37 | usacustomers | WordPress.com | FALSE | 0 | en | Trump | throw |
| 2190845029 | 1326434390885404672 | 2020-11-11 07:59:37 | usacustomers | WordPress.com | FALSE | 0 | en | Trump | eggs |
| 2190845029 | 1326434390885404672 | 2020-11-11 07:59:37 | usacustomers | WordPress.com | FALSE | 0 | en | Trump | at |
| 2190845029 | 1326434390885404672 | 2020-11-11 07:59:37 | usacustomers | WordPress.com | FALSE | 0 | en | Trump | protrump |
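
Common stop words (such as "at" in the sample above) could optionally be removed at this point with tidytext's built-in stop_words data. This step is not part of the original pipeline, and it is not strictly required for the sentiment join below because stop words carry no sentiment, but a sketch would look like this:

data(stop_words)
rt_all_no_stop <- anti_join(rt_all_tokenize, stop_words, by = "word")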

Downloading Sentiment Lexicon

Now we will extract the positive and negative words from the Bing et al. lexicon using the tidytext package.

bing_lexicon <- get_sentiments("bing")
head(bing_lexicon)
## # A tibble: 6 x 2
##   word       sentiment
##   <chr>      <chr>    
## 1 2-faces    negative 
## 2 abnormal   negative 
## 3 abolish    negative 
## 4 abominable negative 
## 5 abominably negative 
## 6 abominate  negative
bing_lexicon %>% 
  group_by(sentiment) %>% 
  summarise(n=n()) %>% 
  ggplot(aes(x=sentiment, y=n, fill=sentiment))+
    geom_bar(stat = "identity")+
    geom_text(size = 3, position = position_stack(vjust = 0.5), aes(label=n))
## `summarise()` ungrouping output (override with `.groups` argument)

knitr::kable(head(bing_lexicon))
| word | sentiment |
|---|---|
| 2-faces | negative |
| abnormal | negative |
| abolish | negative |
| abominable | negative |
| abominably | negative |
| abominate | negative |

Match the Sentiment Lexicon to Each Word in Each Tweet

rt_all_sentiment <- rt_all_tokenize %>% 
        inner_join(bing_lexicon)
## Joining, by = "word"
knitr::kable(head(rt_all_sentiment))
| user_id | status_id | created_at | screen_name | source | is_retweet | retweet_count | lang | president | word | sentiment |
|---|---|---|---|---|---|---|---|---|---|---|
| 2190845029 | 1325585394000388097 | 2020-11-08 23:46:00 | usacustomers | WordPress.com | FALSE | 0 | en | Trump | trump | positive |
| 2190845029 | 1325585394000388097 | 2020-11-08 23:46:00 | usacustomers | WordPress.com | FALSE | 0 | en | Trump | rape | negative |
| 2190845029 | 1325585394000388097 | 2020-11-08 23:46:00 | usacustomers | WordPress.com | FALSE | 0 | en | Trump | defamation | negative |
| 2190845029 | 1326433916991967233 | 2020-11-11 07:57:44 | usacustomers | WordPress.com | FALSE | 0 | en | Trump | kill | negative |
| 2190845029 | 1326433916991967233 | 2020-11-11 07:57:44 | usacustomers | WordPress.com | FALSE | 0 | en | Trump | trump | positive |
| 2190845029 | 1325604943210958848 | 2020-11-09 01:03:41 | usacustomers | WordPress.com | FALSE | 0 | en | Trump | safe | positive |
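
Note in the sample above that the word "trump" itself is matched as a positive term by the Bing lexicon, which inflates the positive counts whenever a candidate's name appears in the tweet text. If desired, the candidates' names could be dropped before joining; a sketch (the filtered object is illustrative only and is not used in the analysis below):

candidate_names <- c("trump", "biden")

rt_all_sentiment_no_names <- rt_all_tokenize %>% 
  filter(!word %in% candidate_names) %>% 
  inner_join(bing_lexicon, by = "word")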

Sentiment Analyses

Most Used Positive and Negative Words per Presidential Candidate

president_sentiment <- data.frame(table(rt_all_sentiment$president, rt_all_sentiment$sentiment))
colnames(president_sentiment) <- c("candidate","sentiment","Freq")
president_sentiment %>% 
  group_by(candidate) %>% 
  mutate(total_words = sum(Freq), Percentage = Freq/sum(Freq)) %>% 
  ggplot(aes(x=candidate, y = Freq, fill = sentiment )) +
  geom_bar(position="stack", stat="identity") +
  geom_text(aes(label = paste(Freq,",",percent(Percentage))), position = position_stack(vjust = .5))+
  labs(title = "Count of Positive and Negative Words Associated to #trump and #biden")

bing_word_count <- count(rt_all_sentiment,
                         word,
                         president,
                         sentiment,
                         sort = TRUE) %>% 
  ungroup()

bing_word_count %>%
  filter(president=="Trump") %>% 
 group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment for #trump",
       y = NULL)
## Selecting by n

bing_word_count %>%
  filter(president=="Biden") %>% 
 group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment for #Biden",
       y = NULL)
## Selecting by n

Now we will count the number of positive and negative words per tweet and compute the sentiment score as the difference between the two counts (positive − negative). For example, a tweet containing three positive words and one negative word gets a score of +2.

rt_all_sentiment_score <- count(rt_all_sentiment, 
                          user_id,status_id, 
                          created_at, 
                          screen_name,
                          source, 
                          is_retweet, 
                          retweet_count, 
                          lang, 
                          president,
                          sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0 )

# compute sentiment_score
rt_all_sentiment_score <- mutate(rt_all_sentiment_score,
                           sentiment_score = positive - negative) %>% #adding back text column from rt_all df
                           left_join(select(rt_all, status_id, text), by = "status_id")

# find the mean score
mu <- rt_all_sentiment_score %>% 
  group_by(president) %>% 
  summarise(mean_score = mean(sentiment_score))
## `summarise()` ungrouping output (override with `.groups` argument)
ggplot(rt_all_sentiment_score, aes(x=sentiment_score, color=president, fill=president)) +
  geom_histogram(aes(y=..density..), position="identity", alpha=0.3)+
  geom_density(alpha=0.3)+
  geom_vline(data=mu, aes(xintercept=mean_score, color=president),
             linetype="dashed")+
  #geom_text(data = mu, aes(label=mean_score))+
  scale_color_manual(values=c("#72A3F3", "#ff8800"))+
  scale_fill_manual(values=c("#72A3F3", "#ff8800"))+
  labs(title="Weight histogram plot",x="Weight(kg)", y = "Density")+
  theme_classic()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

print(mu)
## # A tibble: 2 x 2
##   president mean_score
##   <chr>          <dbl>
## 1 Biden          0.717
## 2 Trump          0.646

Based on the average sentiment score (0.717 for #biden versus 0.646 for #trump), Biden also wins in the Twitter world.
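
The two averages are fairly close, so it is worth checking whether the gap is more than noise. A minimal sketch using a two-sample Welch t-test on the per-tweet scores (this test was not part of the original analysis):

t.test(sentiment_score ~ president, data = rt_all_sentiment_score)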

Limitations of the Lexicon-based Method of Sentiment Analysis

While the lexicon-based method is easy to implement (for example, no model training is needed), it suffers from several limitations, including:

  1. Choice of sentiment lexicon. The text being analyzed may not follow the same conventions as the lexicon; for example, books written 100 years ago read very differently from social media posts.
  2. Semantic meaning, e.g. negation of words. Since most lexicons are unigram-based, like the one used here, the meaning of the whole sentence or phrase is not taken into account. For example, "You are not beautiful, neither do you smell good" contains positive words, but because of the negation "not", the overall meaning of the sentence is negative. A common workaround is to look at bigrams, as in the sketch below.
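
A minimal sketch of the bigram idea, using the packages already loaded above: tokenize the tweets into pairs of consecutive words and flag sentiment words that are preceded by a negation term. The object names and the list of negation words are my own choices for illustration.

negation_words <- c("not", "no", "never", "neither", "without")

negated_words <- rt_all %>% 
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>% 
  separate(bigram, c("word1", "word2"), sep = " ") %>% 
  filter(word1 %in% negation_words) %>% 
  inner_join(bing_lexicon, by = c("word2" = "word")) %>% 
  count(word1, word2, sentiment, sort = TRUE)

# a positive word preceded by "not" most likely carries a negative meaning,
# so its contribution to the sentiment score could be flipped instead of
# being counted as positive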