Objective:
1. Get election-related tweets originating from within 100 miles of Round Rock, TX
2. Tidy up the data and do sentiment analysis using the Bing lexicon
library(twitteR)
library(stringr)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(tidytext)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:lubridate':
##
## intersect, setdiff, union
## The following objects are masked from 'package:twitteR':
##
## id, location
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
credentials = read.csv("twitter_credentials.txt" , header = TRUE, sep = ',' , stringsAsFactors = FALSE, strip.white = TRUE)
setup_twitter_oauth(credentials$api_key, credentials$api_secret, credentials$access_token, credentials$access_secret)
## [1] "Using direct authentication"
# Get election-related tweets starting 1-Jan-2020 originating from within 100 miles of Round Rock, TX
tweets = searchTwitter('election', since = '2020-01-01', geocode = '30.554139,-97.6212159,100mi', n=25000)
## [1] "Rate limited .... blocking for a minute and retrying up to 119 times ..."
## [1] "Rate limited .... blocking for a minute and retrying up to 118 times ..."
## [1] "Rate limited .... blocking for a minute and retrying up to 117 times ..."
## [1] "Rate limited .... blocking for a minute and retrying up to 116 times ..."
## [1] "Rate limited .... blocking for a minute and retrying up to 115 times ..."
## [1] "Rate limited .... blocking for a minute and retrying up to 114 times ..."
## [1] "Rate limited .... blocking for a minute and retrying up to 113 times ..."
## [1] "Rate limited .... blocking for a minute and retrying up to 112 times ..."
## [1] "Rate limited .... blocking for a minute and retrying up to 111 times ..."
## [1] "Rate limited .... blocking for a minute and retrying up to 110 times ..."
## [1] "Rate limited .... blocking for a minute and retrying up to 109 times ..."
## Warning in doRppAPICall("search/tweets", n, params = params,
## retryOnRateLimit = retryOnRateLimit, : 25000 tweets were requested but the
## API can only return 13587
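The retry messages come from searchTwitter()'s retryOnRateLimit argument (the countdown from 119 suggests the default of 120 was in effect). The same query with the retry budget spelled out would look like this (not re-run here):

# Equivalent call with the retry budget made explicit
tweets = searchTwitter('election', since = '2020-01-01',
                       geocode = '30.554139,-97.6212159,100mi',
                       n = 25000, retryOnRateLimit = 120)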
# Pull the fields of interest out of each status object into pre-allocated vectors
text = as.character(rep(NA, length(tweets)))
screenname = as.character(rep(NA, length(tweets)))
created = as.POSIXct(rep(NA, length(tweets)))
id = as.character(rep(NA, length(tweets)))
for(i in 1:length(tweets)){
  text[i] = tweets[[i]]$text
  screenname[i] = tweets[[i]]$screenName
  created[i] = tweets[[i]]$created
  id[i] = tweets[[i]]$id
}
x = data.frame(id = id, screenname = screenname, created = created, text = text, stringsAsFactors = FALSE)
x$text = str_replace_all(x$text,'\n',' ')
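As an aside, twitteR also provides twListToDF(), which flattens a list of status objects into a data frame in one call; a sketch of the equivalent construction (the resulting column is named screenName, so it is renamed to match x above):

# Alternative construction via twListToDF(); equivalent to the loop above
x_alt = twListToDF(tweets) %>%
  select(id, screenname = screenName, created, text) %>%
  mutate(text = str_replace_all(text, '\n', ' '))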
bing = get_sentiments('bing')
x %>%
  unnest_tokens(token = "words", word, text) %>%
  select(id, word) %>%
  filter(!word %in% stop_words$word, !word == 'rt') %>%
  inner_join(bing, by = c("word" = "word")) %>%
  count(word, sentiment, sort = TRUE)
## # A tibble: 1,129 x 3
## word sentiment n
## <chr> <chr> <int>
## 1 trump positive 2231
## 2 win positive 987
## 3 won positive 614
## 4 loses negative 321
## 5 anger negative 297
## 6 corrupt negative 267
## 7 corruption negative 260
## 8 lead positive 246
## 9 trust positive 183
## 10 incredible positive 176
## # … with 1,119 more rows
'trump' is listed as a positive word in the Bing lexicon. Assuming 'trump' refers to President Trump in these tweets, I ignore the sentiment contribution of this particular word.
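The lexicon entry can be checked directly before excluding the word:

# Confirm how the Bing lexicon scores the token "trump"
bing %>% filter(word == 'trump')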
x %>%
  unnest_tokens(token = "words", word, text) %>%
  select(id, word) %>%
  filter(!word %in% stop_words$word, !word %in% c('rt','trump')) %>%
  inner_join(bing, by = c("word" = "word")) %>%
  count(word, sentiment, sort = TRUE) %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(n = ifelse(sentiment == 'positive', n, -n)) %>%
  arrange(desc(n)) %>%
  ggplot(aes(x = factor(word, levels = rev(unique(word))), y = n, fill = sentiment)) +
  geom_bar(stat = 'identity') +
  labs(y = "Contribution to Sentiment", x = "") + coord_flip()
## Selecting by n
Win/won are the most frequent positive words in these tweets. That is expected, as supporters of both parties are expressing their desire to win the election. On the negative side, 'loses' and 'anger' dominate, and 'corrupt'/'corruption' are nearly on par with 'lead', the next most frequent positive word after win/won. It is interesting to see how human perspectives can sit on opposite ends of the spectrum for a single event/outcome. It seems people in my city are divided on this issue just like the rest of the country.
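As a rough check of how evenly split the sentiment-bearing words are, the overall positive/negative share can be computed on the same tokenized data (a quick sketch, not part of the plot above):

# Overall share of positive vs negative word matches, excluding 'rt' and 'trump'
x %>%
  unnest_tokens(token = "words", word, text) %>%
  filter(!word %in% stop_words$word, !word %in% c('rt','trump')) %>%
  inner_join(bing, by = "word") %>%
  count(sentiment) %>%
  mutate(share = n / sum(n))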
x %>%
  unnest_tokens(token = "words", word, text) %>%
  filter(!word %in% stop_words$word, !word %in% c('rt','trump')) %>%
  select(id, created, word) %>%
  inner_join(bing, by = c("word" = "word")) %>%
  select(id, created, sentiment) %>%
  count(id, created, sentiment) %>%
  mutate(n = ifelse(sentiment == 'negative', -n, n)) %>%
  group_by(Date = as.Date(created), sentiment) %>%  # created is already POSIXct, so as.Date() is sufficient
  summarize(total = sum(n)) %>%
  ggplot(aes(x = Date, y = total, fill = sentiment)) + geom_bar(stat = 'identity')
Overall sentiment has been negative for at least the past week. February 2nd was positive, but negativity took over again starting February 3rd.
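The same daily pattern can be checked numerically with a net score (positive minus negative word count per day); a sketch assuming tidyr (>= 1.1 for the scalar values_fill) is available, with daily_net as a name introduced here:

# Net daily sentiment: positive minus negative word count per day
daily_net = x %>%
  unnest_tokens(token = "words", word, text) %>%
  filter(!word %in% stop_words$word, !word %in% c('rt','trump')) %>%
  inner_join(bing, by = "word") %>%
  count(Date = as.Date(created), sentiment) %>%
  tidyr::pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(net = positive - negative)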
x %>%
  unnest_tokens(token = "words", word, text) %>%
  filter(!word %in% stop_words$word, !word %in% c('rt','trump')) %>%
  select(id, screenname, created, word) %>%
  inner_join(bing, by = c("word" = "word")) %>%
  count(screenname, sentiment, sort = TRUE) %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ggplot(aes(x = factor(screenname), y = n, fill = sentiment)) + geom_bar(stat = 'identity') +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(x = '', y = '')
## Selecting by n
Here we see that some Twitter users have expressed both positive and negative sentiments, while others have expressed positive or negative sentiments exclusively.
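The mixed versus exclusive split can be quantified directly (a sketch; user_mix is a name introduced here):

# One distinct sentiment per user = exclusive, two = mixed
user_mix = x %>%
  unnest_tokens(token = "words", word, text) %>%
  filter(!word %in% stop_words$word, !word %in% c('rt','trump')) %>%
  inner_join(bing, by = "word") %>%
  distinct(screenname, sentiment) %>%
  count(screenname) %>%
  mutate(type = ifelse(n == 2, "mixed", "exclusive"))
table(user_mix$type)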