As of November 10, 2020, Biden has clearly won the election, with an electoral vote count of 290 to 214. The goal of this analysis is to conduct sentiment analysis on recent Twitter data from the United States of America (USA). The data were extracted on the same day this document was published.
library(rtweet)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(stringr)
library(tidytext)
library(tidyr)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(scales)
setwd("C:/Users/EUVIE/Desktop/njd files detail desktop/text mining with R book/rtwitter_2020_us_election")
source("2020_us_election.R") # This contains my api key and tokens
## <Token>
## <oauth_endpoint>
## request: https://api.twitter.com/oauth/request_token
## authorize: https://api.twitter.com/oauth/authenticate
## access: https://api.twitter.com/oauth/access_token
## <oauth_app> Nilrey Twitter Test App
## key: <hidden>
## secret: <hidden>
## <credentials> oauth_token, oauth_token_secret
## ---
We will use the rtweet library to extract Twitter data. Since the Twitter API allows at most 18,000 tweets per request per 15-minute window, we will extract the 9,000 most recent tweets for each of #trump and #biden. We will restrict the search to tweets from the USA written in English.
rt_trump <- search_tweets(
  "#trump", n = 9000, include_rts = FALSE, retryonratelimit = TRUE,
  lang = "en", geocode = lookup_coords("usa")
)
## Warning: Rate limit exceeded - 88
## Warning: Rate limit exceeded
rt_biden <- search_tweets(
  "#biden", n = 9000, include_rts = FALSE, retryonratelimit = TRUE,
  lang = "en", geocode = lookup_coords("usa")
)
## retry on rate limit...
## waiting about 9 minutes...
## Warning: Rate limit exceeded - 88
## Warning: Rate limit exceeded
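The rate-limit warnings above are expected when two large searches run back to back; with retryonratelimit = TRUE, rtweet simply waits out the window and resumes. As an optional aside (not part of the original run), rtweet can report how much of the current 15-minute window remains before launching another search:
# optional: check remaining calls on the standard search endpoint
rate_limit(query = "search/tweets")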
Next, we will merge the two data frames and remove digits and punctuation from the tweet text.
# first let's merge the two dataframes
rt_trump <- mutate(rt_trump, president = "Trump")
rt_biden <- mutate(rt_biden, president = "Biden")
rt_all <- bind_rows(rt_trump, rt_biden)
# create a function to return a clean character vector
clean_text_col <- function(df, col_name){
  text_col <- df[[col_name]]
  text_col <- str_replace_all(text_col, "[:punct:]", "")
  text_col <- str_replace_all(text_col, "[:digit:]", "")
  df[col_name] <- text_col
  return(df)
}
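A quick sanity check on a toy data frame (the example strings are my own): note that stripping punctuation also removes the "#" from hashtags, so "#trump" and "#biden" survive as the plain words trump and biden.
toy <- data.frame(text = c("Count the votes! #Election2020", "#Biden vs. #Trump..."),
                  stringsAsFactors = FALSE)
clean_text_col(toy, "text")$text
## [1] "Count the votes Election" "Biden vs Trump"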
rt_all <- clean_text_col(rt_all, "text")
# retain some columns for analysis
col_retain <- c("user_id", "status_id", "created_at", "screen_name", "text", "source", "is_retweet", "retweet_count", "lang", "president")
rt_all <- select(rt_all, all_of(col_retain))
dim(rt_all)
## [1] 33591 10
table(rt_all$president)
##
## Biden Trump
## 16813 16778
Let’s check whether any status IDs are duplicated, then remove the duplicate rows.
rt_all %>% count(status_id) %>%
  mutate(n_group = case_when(n > 1 ~ "2 or more",
                             TRUE ~ "1 only")) %>%
  count(n_group)
## # A tibble: 2 x 2
## n_group n
## <chr> <int>
## 1 1 only 29165
## 2 2 or more 2213
rt_all <- distinct(rt_all)
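A side note (my own, not part of the original analysis): distinct() with no column arguments only drops rows that are identical across every retained column, so two rows sharing a status_id but differing elsewhere would both survive. If one row per tweet is the goal, a stricter variant is to deduplicate on the ID alone:
# hypothetical stricter variant, not run here: keep one row per status_id
# rt_all <- distinct(rt_all, status_id, .keep_all = TRUE)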
rt_all_tokenize <- unnest_tokens(rt_all, word, text)
knitr::kable(head(rt_all_tokenize))
| user_id | status_id | created_at | screen_name | source | is_retweet | retweet_count | lang | president | word |
|---|---|---|---|---|---|---|---|---|---|
| 2190845029 | 1326434390885404672 | 2020-11-11 07:59:37 | usacustomers | WordPress.com | FALSE | 0 | en | Trump | texas |
| 2190845029 | 1326434390885404672 | 2020-11-11 07:59:37 | usacustomers | WordPress.com | FALSE | 0 | en | Trump | teens |
| 2190845029 | 1326434390885404672 | 2020-11-11 07:59:37 | usacustomers | WordPress.com | FALSE | 0 | en | Trump | throw |
| 2190845029 | 1326434390885404672 | 2020-11-11 07:59:37 | usacustomers | WordPress.com | FALSE | 0 | en | Trump | eggs |
| 2190845029 | 1326434390885404672 | 2020-11-11 07:59:37 | usacustomers | WordPress.com | FALSE | 0 | en | Trump | at |
| 2190845029 | 1326434390885404672 | 2020-11-11 07:59:37 | usacustomers | WordPress.com | FALSE | 0 | en | Trump | protrump |
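Note that unnest_tokens() lowercases the text and splits it into one row per word, so function words such as "at" remain as tokens. They will not match anything in the Bing join below, since that lexicon contains only sentiment-bearing words, but for word-frequency views one could drop them with tidytext's stop_words table. A minimal sketch, not applied in this analysis:
# optional, not run here: remove common English stop words
# rt_all_tokenize <- anti_join(rt_all_tokenize, stop_words, by = "word")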
Now we will extract the positive and negative words from the Bing et al. lexicon using the tidytext package.
bing_lexicon <- get_sentiments("bing")
head(bing_lexicon)
## # A tibble: 6 x 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
bing_lexicon %>%
  group_by(sentiment) %>%
  summarise(n = n()) %>%
  ggplot(aes(x = sentiment, y = n, fill = sentiment)) +
  geom_bar(stat = "identity") +
  geom_text(size = 3, position = position_stack(vjust = 0.5), aes(label = n))
## `summarise()` ungrouping output (override with `.groups` argument)
rt_all_sentiment <- rt_all_tokenize %>%
inner_join(bing_lexicon)
## Joining, by = "word"
knitr::kable(head(rt_all_sentiment))
| user_id | status_id | created_at | screen_name | source | is_retweet | retweet_count | lang | president | word | sentiment |
|---|---|---|---|---|---|---|---|---|---|---|
| 2190845029 | 1325585394000388097 | 2020-11-08 23:46:00 | usacustomers | WordPress.com | FALSE | 0 | en | Trump | trump | positive |
| 2190845029 | 1325585394000388097 | 2020-11-08 23:46:00 | usacustomers | WordPress.com | FALSE | 0 | en | Trump | rape | negative |
| 2190845029 | 1325585394000388097 | 2020-11-08 23:46:00 | usacustomers | WordPress.com | FALSE | 0 | en | Trump | defamation | negative |
| 2190845029 | 1326433916991967233 | 2020-11-11 07:57:44 | usacustomers | WordPress.com | FALSE | 0 | en | Trump | kill | negative |
| 2190845029 | 1326433916991967233 | 2020-11-11 07:57:44 | usacustomers | WordPress.com | FALSE | 0 | en | Trump | trump | positive |
| 2190845029 | 1325604943210958848 | 2020-11-09 01:03:41 | usacustomers | WordPress.com | FALSE | 0 | en | Trump | safe | positive |
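Note in the table above that the word "trump" itself is matched as positive; it is listed in the Bing lexicon (presumably as the verb "to trump"), so it inflates the positive counts for both hashtags. A possible mitigation, my own suggestion rather than part of the original analysis, is to drop the candidate names before the join:
# hypothetical variant, not run here: candidate names carry no sentiment
# rt_all_sentiment <- rt_all_tokenize %>%
#   filter(!word %in% c("trump", "biden")) %>%
#   inner_join(bing_lexicon, by = "word")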
president_sentiment <- data.frame(table(rt_all_sentiment$president, rt_all_sentiment$sentiment))
colnames(president_sentiment) <- c("candidate","sentiment","Freq")
president_sentiment %>%
  group_by(candidate) %>%
  mutate(total_words = sum(Freq), Percentage = Freq / sum(Freq)) %>%
  ggplot(aes(x = candidate, y = Freq, fill = sentiment)) +
  geom_bar(position = "stack", stat = "identity") +
  geom_text(aes(label = paste(Freq, ",", percent(Percentage))), position = position_stack(vjust = .5)) +
  labs(title = "Count of Positive and Negative Words Associated with #trump and #biden")
bing_word_count <- count(rt_all_sentiment,
                         word,
                         president,
                         sentiment,
                         sort = TRUE) %>%
  ungroup()
bing_word_count %>%
  filter(president == "Trump") %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment for #trump",
       y = NULL)
## Selecting by n
bing_word_count %>%
  filter(president == "Biden") %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment for #biden",
       y = NULL)
## Selecting by n
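One optional refinement, an assumption on my part rather than a fix to the original: reorder(word, n) orders the bars across both facets jointly, so within a single facet the ordering can look scrambled. tidytext provides reorder_within() and scale_y_reordered() to order bars within each facet instead, e.g. for the #trump plot:
bing_word_count %>%
  filter(president == "Trump") %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(word = reorder_within(word, n, sentiment)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  scale_y_reordered() +
  labs(x = "Contribution to sentiment for #trump", y = NULL)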
Now we will count the number of positive and negative words per tweet and compute a sentiment score as the difference between the two counts (positive - negative); for example, a tweet with three positive words and one negative word scores +2.
rt_all_sentiment_score <- count(rt_all_sentiment,
                                user_id,
                                status_id,
                                created_at,
                                screen_name,
                                source,
                                is_retweet,
                                retweet_count,
                                lang,
                                president,
                                sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0)
# compute the sentiment score, then add the text column back from the rt_all df
rt_all_sentiment_score <- mutate(rt_all_sentiment_score,
                                 sentiment_score = positive - negative) %>%
  left_join(select(rt_all, status_id, text), by = "status_id")
# find the mean score
mu <- rt_all_sentiment_score %>%
  group_by(president) %>%
  summarise(mean_score = mean(sentiment_score))
## `summarise()` ungrouping output (override with `.groups` argument)
ggplot(rt_all_sentiment_score, aes(x = sentiment_score, color = president, fill = president)) +
  geom_histogram(aes(y = ..density..), position = "identity", alpha = 0.3) +
  geom_density(alpha = 0.3) +
  geom_vline(data = mu, aes(xintercept = mean_score, color = president),
             linetype = "dashed") +
  #geom_text(data = mu, aes(label = mean_score)) +
  scale_color_manual(values = c("#72A3F3", "#ff8800")) +
  scale_fill_manual(values = c("#72A3F3", "#ff8800")) +
  labs(title = "Distribution of Per-Tweet Sentiment Scores", x = "Sentiment score", y = "Density") +
  theme_classic()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
print(mu)
## # A tibble: 2 x 2
## president mean_score
## <chr> <dbl>
## 1 Biden 0.717
## 2 Trump 0.646
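The two averages are close (0.717 vs 0.646). As an optional follow-up, my addition rather than part of the original analysis, one could check whether the gap exceeds sampling noise with a two-sample test on the per-tweet scores:
# hypothetical follow-up: compare the two score distributions
t.test(sentiment_score ~ president, data = rt_all_sentiment_score)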
Based on the average sentiment score, Biden also wins in the Twitter world, although the margin is small (0.717 vs 0.646).
While the lexicon-based method is easy to implement (for example, it requires no model training), it suffers from several limitations.
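One classic failure mode is worth showing before listing the rest: matching is done on single words with no context, so a phrase like "not good" registers as positive. A minimal sketch using the bing_lexicon loaded above:
tibble(text = "this is not good") %>%
  unnest_tokens(word, text) %>%
  inner_join(bing_lexicon, by = "word")
## # A tibble: 1 x 2
##   word  sentiment
##   <chr> <chr>
## 1 good  positive
Among the other limitations: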