Objective:
1. Get election-related tweets originating from within 100 miles of Round Rock, TX
2. Tidy up the data and do sentiment analysis using the Bing lexicon
library(twitteR)
library(stringr)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(tidytext)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:lubridate':
##
## intersect, setdiff, union
## The following objects are masked from 'package:twitteR':
##
## id, location
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
credentials = read.csv("twitter_credentials.txt" , header = TRUE, sep = ',' , stringsAsFactors = FALSE, strip.white = TRUE)
setup_twitter_oauth(credentials$api_key, credentials$api_secret, credentials$access_token, credentials$access_secret)
## [1] "Using direct authentication"
# Get election-related tweets starting 1-Jan-2020 originating from within 100 miles of Round Rock, TX
tweets = searchTwitter('election', since = '2020-01-01', geocode = '30.554139,-97.6212159,100mi', n=25000)
## [1] "Rate limited .... blocking for a minute and retrying up to 119 times ..."
## [1] "Rate limited .... blocking for a minute and retrying up to 118 times ..."
## [1] "Rate limited .... blocking for a minute and retrying up to 117 times ..."
## [1] "Rate limited .... blocking for a minute and retrying up to 116 times ..."
## [1] "Rate limited .... blocking for a minute and retrying up to 115 times ..."
## [1] "Rate limited .... blocking for a minute and retrying up to 114 times ..."
## [1] "Rate limited .... blocking for a minute and retrying up to 113 times ..."
## [1] "Rate limited .... blocking for a minute and retrying up to 112 times ..."
## [1] "Rate limited .... blocking for a minute and retrying up to 111 times ..."
## [1] "Rate limited .... blocking for a minute and retrying up to 110 times ..."
## [1] "Rate limited .... blocking for a minute and retrying up to 109 times ..."
## Warning in doRppAPICall("search/tweets", n, params = params,
## retryOnRateLimit = retryOnRateLimit, : 25000 tweets were requested but the
## API can only return 13587
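The retry messages come from searchTwitter()'s retryOnRateLimit argument (the countdown from 119 suggests the default of 120 was in effect). The same query with the retry budget spelled out would look like this (not re-run here):

# Equivalent call with the retry budget made explicit
tweets = searchTwitter('election', since = '2020-01-01',
                       geocode = '30.554139,-97.6212159,100mi',
                       n = 25000, retryOnRateLimit = 120)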
# Pull the fields of interest out of each status object into pre-allocated vectors
text = as.character(rep(NA, length(tweets)))
screenname = as.character(rep(NA, length(tweets)))
created = as.POSIXct(rep(NA, length(tweets)))
id = as.character(rep(NA, length(tweets)))
for(i in 1:length(tweets)){
  text[i] = tweets[[i]]$text
  screenname[i] = tweets[[i]]$screenName
  created[i] = tweets[[i]]$created
  id[i] = tweets[[i]]$id
}
x = data.frame(id = id, screenname = screenname, created = created, text = text, stringsAsFactors = FALSE)
x$text = str_replace_all(x$text,'\n',' ')
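As an aside, twitteR also provides twListToDF(), which flattens a list of status objects into a data frame in one call; a sketch of the equivalent construction (the resulting column is named screenName, so it is renamed to match x above):

# Alternative construction via twListToDF(); equivalent to the loop above
x_alt = twListToDF(tweets) %>%
  select(id, screenname = screenName, created, text) %>%
  mutate(text = str_replace_all(text, '\n', ' '))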
bing = get_sentiments('bing')
x %>%
  unnest_tokens(token = "words", word, text) %>%
  select(id, word) %>%
  filter(!word %in% stop_words$word, !word == 'rt') %>%
  inner_join(bing, by = c("word" = "word")) %>%
  count(word, sentiment, sort = TRUE)
## # A tibble: 1,129 x 3
## word sentiment n
## <chr> <chr> <int>
## 1 trump positive 2231
## 2 win positive 987
## 3 won positive 614
## 4 loses negative 321
## 5 anger negative 297
## 6 corrupt negative 267
## 7 corruption negative 260
## 8 lead positive 246
## 9 trust positive 183
## 10 incredible positive 176
## # … with 1,119 more rows
'trump' is listed as a positive word in the Bing lexicon. Assuming 'trump' refers to President Trump in these tweets, I ignore the sentiment contribution of this particular word.
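The lexicon entry can be checked directly before excluding the word:

# Confirm how the Bing lexicon scores the token "trump"
bing %>% filter(word == 'trump')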
x %>%
  unnest_tokens(token = "words", word, text) %>%
  select(id, word) %>%
  filter(!word %in% stop_words$word, !word %in% c('rt','trump')) %>%
  inner_join(bing, by = c("word" = "word")) %>%
  count(word, sentiment, sort = TRUE) %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(n = ifelse(sentiment == 'positive', n, -n)) %>%
  arrange(desc(n)) %>%
  ggplot(aes(x = factor(word, levels = rev(unique(word))), y = n, fill = sentiment)) +
  geom_bar(stat = 'identity') +
  labs(y = "Contribution to Sentiment", x = "") + coord_flip()
## Selecting by n
Win/won are the most frequent positive words in these tweets. That is expected, as supporters of both parties are expressing their desire to win the election. On the negative side, 'loses' and 'anger' dominate, and 'corrupt'/'corruption' are nearly on par with 'lead', the next most frequent positive word after win/won. It is interesting to see how human perspectives can sit on opposite ends of the spectrum for a single event/outcome. It seems people in my city are divided on this issue just like the rest of the country.
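As a rough check of how evenly split the sentiment-bearing words are, the overall positive/negative share can be computed on the same tokenized data (a quick sketch, not part of the plot above):

# Overall share of positive vs negative word matches, excluding 'rt' and 'trump'
x %>%
  unnest_tokens(token = "words", word, text) %>%
  filter(!word %in% stop_words$word, !word %in% c('rt','trump')) %>%
  inner_join(bing, by = "word") %>%
  count(sentiment) %>%
  mutate(share = n / sum(n))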
x %>%
  unnest_tokens(token = "words", word, text) %>%
  filter(!word %in% stop_words$word, !word %in% c('rt','trump')) %>%
  select(id, created, word) %>%
  inner_join(bing, by = c("word" = "word")) %>%
  select(id, created, sentiment) %>%
  count(id, created, sentiment) %>%
  mutate(n = ifelse(sentiment == 'negative', -n, n)) %>%
  group_by(Date = as.Date(created), sentiment) %>%  # created is already POSIXct, so as.Date() is sufficient
  summarize(total = sum(n)) %>%
  ggplot(aes(x = Date, y = total, fill = sentiment)) + geom_bar(stat = 'identity')
Overall sentiment has been negative for at least the past week. February 2nd was positive, but negativity took over again starting February 3rd.
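The same daily pattern can be checked numerically with a net score (positive minus negative word count per day); a sketch assuming tidyr (>= 1.1 for the scalar values_fill) is available, with daily_net as a name introduced here:

# Net daily sentiment: positive minus negative word count per day
daily_net = x %>%
  unnest_tokens(token = "words", word, text) %>%
  filter(!word %in% stop_words$word, !word %in% c('rt','trump')) %>%
  inner_join(bing, by = "word") %>%
  count(Date = as.Date(created), sentiment) %>%
  tidyr::pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(net = positive - negative)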
x %>%
  unnest_tokens(token = "words", word, text) %>%
  filter(!word %in% stop_words$word, !word %in% c('rt','trump')) %>%
  select(id, screenname, created, word) %>%
  inner_join(bing, by = c("word" = "word")) %>%
  count(screenname, sentiment, sort = TRUE) %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ggplot(aes(x = factor(screenname), y = n, fill = sentiment)) + geom_bar(stat = 'identity') +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(x = '', y = '')
## Selecting by n
Here we see that some Twitter users have expressed both positive and negative sentiments, while others have expressed positive or negative sentiments exclusively.
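The mixed versus exclusive split can be quantified directly (a sketch; user_mix is a name introduced here):

# One distinct sentiment per user = exclusive, two = mixed
user_mix = x %>%
  unnest_tokens(token = "words", word, text) %>%
  filter(!word %in% stop_words$word, !word %in% c('rt','trump')) %>%
  inner_join(bing, by = "word") %>%
  distinct(screenname, sentiment) %>%
  count(screenname) %>%
  mutate(type = ifelse(n == 2, "mixed", "exclusive"))
table(user_mix$type)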