1 Brian Singh – Sentiment Analysis Using Twitter


1.0.1 Load Libraries

library(RCurl)
library(tidytext)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(janeaustenr)
library(stringr)
library(textdata)
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:RCurl':
## 
##     complete
library(ggplot2)
library(rjson)
library(jsonlite)
## 
## Attaching package: 'jsonlite'
## The following objects are masked from 'package:rjson':
## 
##     fromJSON, toJSON
library(httr)
library(XML)
library(rvest)
library(wordcloud)
## Loading required package: RColorBrewer
library(wordcloud2)
library(RColorBrewer)
library(rtweet)
## 
## Attaching package: 'rtweet'
## The following object is masked from 'package:jsonlite':
## 
##     flatten
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:httr':
## 
##     content
## The following object is masked from 'package:ggplot2':
## 
##     annotate

1.0.2 Get code from Chapter 2 working

Source: (https://www.tidytextmining.com/sentiment.html).

# get_sentiments("afinn")
# get_sentiments("nrc")
# get_sentiments("bing")
# Build a one-word-per-row table of the Jane Austen novels.
# Within each book, every line gets a running line number and a chapter
# counter that increments whenever a line looks like a chapter heading
# ("Chapter" followed by a digit or a roman-numeral letter, case-insensitive).
tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    linenumber = row_number(),
    chapter = text %>%
      str_detect(regex("^chapter [\\divxlc]", ignore_case = TRUE)) %>%
      cumsum()
  ) %>%
  ungroup() %>%
  unnest_tokens(output = word, input = text)

# Keep only the NRC lexicon entries tagged with the "joy" sentiment.
nrc_joy <- filter(get_sentiments("nrc"), sentiment == "joy")

# Joy words in "Emma", most frequent first (result is printed).
count(
  inner_join(filter(tidy_books, book == "Emma"), nrc_joy),
  word,
  sort = TRUE
)
## Joining, by = "word"
library(tidyr)

# Net Bing sentiment (positive minus negative word count) for every
# 80-line section of each novel.
jane_austen_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  mutate(index = linenumber %/% 80) %>%
  count(book, index, sentiment) %>%
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = 0
  ) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
# One panel per novel: net sentiment of each 80-line section along the plot.
jane_austen_sentiment %>%
  ggplot(mapping = aes(x = index, y = sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(book), ncol = 2, scales = "free_x")

The code chunks above are taken from https://www.tidytextmining.com/sentiment.html

# twitter_token <- create_token(app = "Data607",
#   consumer_key = "",
#   consumer_secret = "",
#   set_renv = TRUE)

1.0.3 Token for Twitter access

# Read the Twitter API bearer token from the BEARER_TOKEN environment variable
# (set it in ~/.Renviron or in the shell before running this script).
# The token must never be hard-coded here: a credential committed to source
# has to be treated as leaked and revoked/regenerated.
# Note: Sys.setenv() returns a logical success flag, not the stored value, so
# the token has to be fetched with Sys.getenv() for the header to be valid.
bearer_token <- Sys.getenv("BEARER_TOKEN")
if (!nzchar(bearer_token)) {
  warning(
    "BEARER_TOKEN is not set; the Authorization header will carry no token.",
    call. = FALSE
  )
}

# Named character vector holding the HTTP Authorization header.
headers <- c(`Authorization` = sprintf("Bearer %s", bearer_token))

1.0.4 Pull in the last 5,000 tweets mentioning the 49ers.

# Fetch up to 5,000 recent English-language tweets matching "49ers",
# excluding retweets.
tweets <- search_tweets(
  q = "49ers",
  n = 5000,
  include_rts = FALSE,
  lang = "en"
)

1.0.5 Clean tweets

# Normalise the tweet text before tokenising.
# Strip URLs (http or https).
tweets$text <- gsub("https?://\\S+", "", tweets$text)
# Strip @mentions.
tweets$text <- gsub("@\\S*", "", tweets$text)
# Remove only the HTML-escaped ampersand. Matching the bare substring "amp"
# would also mangle ordinary words such as "champion", "camp" or "example".
tweets$text <- gsub("&amp;", " ", tweets$text, fixed = TRUE)
# Replace line breaks with a space so that the last word of one line and the
# first word of the next are not glued together into a single token.
tweets$text <- gsub("[\r\n]+", " ", tweets$text)
# Drop remaining punctuation (this also removes the "#" of hashtags).
tweets$text <- gsub("[[:punct:]]", "", tweets$text)

1.0.6 Remove stop words

# Split the cleaned tweet text into one lower-cased word per row.
tweets_words <- unnest_tokens(select(tweets, text), output = word, input = text)

# Optional: extend the stop-word list with domain-specific terms.
# custom_stop <- bind_rows(
#   tibble(word = c("49ers", "deebo", "kevin", "jimmy", "samuel"),
#          lexicon = "custom"),
#   stop_words
# )

# Discard every word that appears in tidytext's stop-word list.
tweets_words <- anti_join(tweets_words, stop_words)
## Joining, by = "word"

1.0.7 Identify the most frequent words in the tweets (graph top 20).

# Horizontal bar chart of the 20 most frequent words (ties at the cut-off are
# kept). slice_max() replaces the superseded top_n(); naming the ordering
# column explicitly also avoids the "Selecting by n" guess.
tweets_words %>%
  count(word, sort = TRUE) %>%
  slice_max(n, n = 20) %>%
  # Order the factor by count so the bars are sorted in the plot.
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  # A separate xlab(NULL) would be overridden by the x label set here,
  # so the axis titles are defined in one place only.
  labs(y = "Count",
       x = "Unique words",
       title = "Most frequent words found in the tweets of San Francisco 49ers",
       subtitle = "Stop words removed from the list")

1.0.8 Identify the most-used 49ers hashtags in the last 5,000 tweets

# Word cloud of the hashtags used in the collected tweets.
# NOTE(review): assumes `tweets$hashtags` holds one character vector of
# hashtags per tweet (NA when a tweet has none) -- confirm against the
# installed rtweet version.
# The hashtags are flattened and counted directly instead of deparsing the
# column to text: passing raw strings to wordcloud() sends them through tm's
# prose pipeline, which strips punctuation such as "_", removes English stop
# words and emits "transformation drops documents" warnings.
# `tweets` itself is left unmodified.
hashtag_values <- unlist(tweets$hashtags, use.names = FALSE)
hashtag_values <- tolower(hashtag_values[!is.na(hashtag_values)])
hashtag_freq <- table(hashtag_values)

set.seed(1234)  # reproducible word placement
if (length(hashtag_freq) > 0) {
  wordcloud(
    words = names(hashtag_freq),
    freq = as.integer(hashtag_freq),
    min.freq = 5,
    scale = c(5.0, 1.5),
    random.order = FALSE,
    rot.per = 0.35,
    colors = brewer.pal(8, "Dark2")
  )
} else {
  message("No hashtags found in the collected tweets; word cloud skipped.")
}

1.0.9 Sentiment analysis using nrc (joy words only)

# NRC lexicon entries tagged with the "joy" sentiment.
nrc_joy <- filter(get_sentiments("nrc"), sentiment == "joy")

# Joy words that occur in the tweets, most frequent first (result is printed).
count(inner_join(tweets_words, nrc_joy), word, sort = TRUE)
## Joining, by = "word"

1.0.10 Sentiment analysis using bing

# Overall Bing sentiment of the tweets: total positive and negative word
# counts plus their difference (positive minus negative).
fortyniners_sentiment_total <-
  inner_join(tweets_words, get_sentiments("bing")) %>%
  count(sentiment) %>%
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = 0
  ) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
fortyniners_sentiment_total
# Bing sentiment per word: how often each word was counted as positive or
# negative, and the difference between the two.
fortyniners_sentiment_byword <-
  inner_join(tweets_words, get_sentiments("bing")) %>%
  count(word, sentiment) %>%
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = 0
  ) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
# Show the first 20 rows.
head(fortyniners_sentiment_byword, n = 20)
# ggplot(fortyniners_sentiment_byword, aes(word, sentiment)) +
#   geom_col(show.legend = FALSE)

1.0.11 Sentiment analysis using the syuzhet package (approach found online)

library(syuzhet)
## 
## Attaching package: 'syuzhet'
## The following object is masked from 'package:rtweet':
## 
##     get_tokens
# Score the tweets with the NRC emotion lexicon from syuzhet.
# Only the text column is used, one element per tweet. Passing the whole
# `tweets` data frame to iconv() coerces it with as.character(), which yields
# one deparsed string per *column*, so every metadata column would be scored
# and the data frame itself would be overwritten.

# Convert the tweet text to ASCII to tackle strange characters
tweet_text <- iconv(tweets$text, from = "UTF-8", to = "ASCII", sub = "")

# Remove retweet markers, in case needed
tweet_text <- gsub("(RT|via)((?:\\b\\w*@\\w+)+)", "", tweet_text)

# Remove mentions, in case needed
tweet_text <- gsub("@\\w+", "", tweet_text)

# One row per tweet, one column per NRC emotion / polarity.
ew_sentiment <- get_nrc_sentiment(tweet_text)

# Total score per sentiment as a two-column data frame (sentiment, Score).
sentimentscores <- data.frame(
  sentiment = colnames(ew_sentiment),
  Score = colSums(ew_sentiment),
  row.names = NULL
)

# Bar chart of the total score per sentiment, highest score first.
ggplot(sentimentscores, aes(x = reorder(sentiment, -Score), y = Score)) +
  geom_col(aes(fill = sentiment)) +
  # Apply the complete theme first: theme_minimal() resets every theme
  # element, so adding it after theme(legend.position = "none") would bring
  # the legend back.
  theme_minimal() +
  theme(legend.position = "none") +
  xlab("Sentiments") +
  ylab("Scores") +
  ggtitle("Total sentiment based on scores")

1.0.12 Conclusion

Overall, it appears the 49ers are trending slightly negatively. This is consistent with the latest news that Deebo Samuel, the 49ers' Pro Bowl wide receiver, has unfollowed the team and removed all team-related posts from his social media following a breakdown in contract extension negotiations. Of course, the analysis has limitations: it does not pick up phrases such as “not bad”, which should not rate negatively; instead it separates the phrase into “not” and “bad” and scores them as two negative words. Additional steps would include refining the sentiment analysis to pick up such cases.