projecteight

### Run a sentiment analysis of recent tweets that mention @BarackObama, the most-followed account on Twitter

library(tidyverse)
── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
✔ ggplot2 3.3.6     ✔ purrr   0.3.4
✔ tibble  3.1.7     ✔ dplyr   1.0.9
✔ tidyr   1.2.0     ✔ stringr 1.4.0
✔ readr   2.1.2     ✔ forcats 0.5.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
library(janitor)

Attaching package: 'janitor'

The following objects are masked from 'package:stats':

    chisq.test, fisher.test
library(tidytext)
library(wordcloud2)
library(rtweet)

Attaching package: 'rtweet'

The following object is masked from 'package:purrr':

    flatten
### sentiment analysis 

# search for recent tweets that mention the most-followed Twitter account, @BarackObama
# Activate the saved rtweet credentials stored under the name "create_token".
auth_as(auth = "create_token")
Reading auth from '/Users/haydenbharper/Library/Preferences/org.R-project.R/R/
rtweet/create_token.rds'
# Query the search endpoint for up to 500 tweets matching "@BarackObama",
# leaving retweets out. This is a search query, not the account's timeline.
tweets <- search_tweets(
  q = "@BarackObama",
  n = 500,
  include_rts = FALSE
)

# Tokenize the tweet text: keep only the `text` column and emit one row per
# word in a new `word` column.
d <- tweets |>
  select(text) |>
  unnest_tokens(word, text)

## Step 3: Word Counts ----------------

# Tally how often each token occurs and print the most frequent ones first.
d |>
  count(word) |>
  arrange(desc(n))
# A tibble: 2,205 × 2
   word            n
   <chr>       <int>
 1 barackobama   514
 2 joebiden      339
 3 the           193
 4 you           148
 5 https         121
 6 t.co          121
 7 to            112
 8 in             93
 9 a              92
10 of             85
# … with 2,195 more rows
# ℹ Use `print(n = ...)` to see more rows
# Preview the default English stop-word list (snowball lexicon).
get_stopwords(language = "en", source = "snowball")
# A tibble: 175 × 2
   word      lexicon 
   <chr>     <chr>   
 1 i         snowball
 2 me        snowball
 3 my        snowball
 4 myself    snowball
 5 we        snowball
 6 our       snowball
 7 ours      snowball
 8 ourselves snowball
 9 you       snowball
10 your      snowball
# … with 165 more rows
# ℹ Use `print(n = ...)` to see more rows
# Extra tokens to drop on top of the generic stop-word list: account handles,
# URL fragments, names/titles and stray numbers that dominate the results but
# carry no sentiment of their own. Entries must be lower case because
# unnest_tokens() lower-cases every token by default (an upper-case "I" could
# never match anything).
custom_stop_words <- c(
  "president", "1", "2", "1.00", "4.3", "24", "44", "5",
  "usa", "barackobama", "joebiden", "11", "22",
  "warnock", "https", "t.co", "potus",
  "speakerpelosi", "michelleobama",
  "whitehouse", "flotus", "administration",
  "reverendwarnock", "vp", "obama",
  "trump", "to", "you", "i", "rt"
)

# Replace the token table with a per-word frequency table (columns `word`, `n`):
#   1. drop generic English stop words (explicit join key, so no
#      "Joining, by = ..." message and no accidental multi-column join),
#   2. count occurrences of each remaining word,
#   3. drop the custom tokens listed above,
#   4. sort from most to least frequent.
d <- d |>
  anti_join(get_stopwords(), by = "word") |>
  count(word) |>
  filter(!word %in% custom_stop_words) |>
  arrange(desc(n))
## Step 4: Sentiment Analysis ------------------------

# Load the Bing lexicon, which labels words as "positive" or "negative".
sentiments <- get_sentiments(lexicon = "bing")

# Attach a sentiment label to every counted word; words that are not in the
# lexicon get NA in `sentiment`.
d2 <- left_join(d, sentiments, by = "word")

# Keep only the words that actually received a sentiment label.
d3 <- filter(d2, !is.na(sentiment))

# Keep one row per labelled word and add table-wide totals, repeated on every
# row: how many rows are negative, how many are positive, and their
# difference. Note that `pct` is positive minus negative (a net count), not a
# percentage, despite its name.
d4 <- mutate(
  select(d3, word, sentiment),
  count_negative = sum(sentiment == "negative"),
  count_positive = sum(sentiment == "positive"),
  pct = count_positive - count_negative
)

### visualize the sentiment we found 

# Bar chart of how many labelled words fall into each sentiment class.
#
# The data are first collapsed to one row per sentiment. geom_col() stacks one
# bar segment per input row, so plotting the row-level table directly (with the
# repeated `count_negative` column as the height) inflated both panels and
# never showed the positive count at all. The y axis is left visible so the
# counts can actually be read off the chart.
d4 |>
  count(sentiment, name = "n_words") |>
  ggplot(mapping = aes(x = sentiment, y = n_words)) +
  geom_col() +
  labs(
    title = "Tweets matching \"@BarackObama\"",
    x = "Word Sentiment",
    y = "Number of distinct words"
  )