I used a lot of libraries for this. I spent a lot of time playing with different things, and by the end I couldn’t remember which ones I actually needed, so I kept them all.

knitr::opts_chunk$set(echo = TRUE)
# load twitter library - the rtweet library is recommended now over twitteR
library(rtweet)
# plotting and pipes - tidyverse!
library(ggplot2)
library(dplyr)
# text mining library
library(tidytext)
library(twitteR)
library(ROAuth)
library(tm)
library(wordcloud)
library(kableExtra)
library(tidyverse)
library(igraph)
library(stringr)
library(knitr)
library(ggpubr)

I submitted my app's token and key data to Twitter to get access to the data.

# Twitter API credentials are read from environment variables (for example
# set in ~/.Renviron) so that no secret is stored in this document.
# NOTE(review): the literal keys and tokens that used to be embedded here
# were published with the report; revoke and regenerate them in the Twitter
# developer portal.

# whatever name you assigned to your created app
appname <- "brett_vis_data"

# Return the value of one required environment variable, stopping with a
# clear message when it is missing or empty.
read_credential <- function(name) {
  value <- Sys.getenv(name, unset = "")
  if (!nzchar(value)) {
    stop("Environment variable ", name, " is not set", call. = FALSE)
  }
  value
}

## api key
key <- read_credential("TWITTER_CONSUMER_KEY")

## api secret
secret <- read_credential("TWITTER_CONSUMER_SECRET")

## user access token and secret
access_token <- read_credential("TWITTER_ACCESS_TOKEN")
access_secret <- read_credential("TWITTER_ACCESS_SECRET")

# create token named "twitter_token" (rtweet)
twitter_token <- create_token(
  app = appname,
  consumer_key = key,
  consumer_secret = secret,
  access_token = access_token,
  access_secret = access_secret)

# authenticate the twitteR package with the same credentials
setup_twitter_oauth(key, secret, access_token, access_secret)
## [1] "Using direct authentication"

As the impeachment report dropped today, I wanted to see how many people were tweeting about it.

# Pull up to 500 recent tweets that carry the #impeachment hashtag.
impeachment_tweets_raw <- search_tweets("#impeachment", n = 500)

# Keep only the author handle and the tweet text, then store the first
# 10 rows of that trimmed table.
impeachment_tweets <- select(impeachment_tweets_raw, screen_name, text)
impeachment_head <- head(impeachment_tweets, 10)

Find 10000 tweets with #impeachment but ignore retweets

# Search once for up to 10,000 #impeachment tweets, excluding retweets.
# The result is reused under both names below so the same large request is
# not sent to the API twice.
impeachment_retweets_raw <- search_tweets("#impeachment", n = 10000,
                             include_rts = FALSE)

# view top 10 rows of data
impeachment_retweets <- impeachment_retweets_raw %>%
  select(screen_name, text)
head(impeachment_retweets, n = 10)

# `rt` is the name the later text-mining steps use for these same tweets.
rt <- impeachment_retweets_raw

# Author metadata for the tweets in `rt`. `users` is the table the location
# summaries further down read from.
# NOTE(review): `users` was not defined anywhere in this script; deriving it
# from `rt` is an assumption - confirm it matches the intended user sample.
users <- users_data(rt)
users %>% dplyr::glimpse(78)
## Observations: 6,490
## Variables: 20
## $ user_id                <chr> "62374483", "62374483", "62374483", "6237448…
## $ screen_name            <chr> "Horn_Sannity", "Horn_Sannity", "Horn_Sannit…
## $ name                   <chr> "Horn Sannity", "Horn Sannity", "Horn Sannit…
## $ location               <chr> "God's Green Earth", "God's Green Earth", "G…
## $ description            <chr> "Keeping an eye on things. Fighting for Sane…
## $ url                    <chr> NA, NA, NA, NA, NA, "https://t.co/o67L88tsic…
## $ protected              <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FA…
## $ followers_count        <int> 3235, 3235, 3235, 3235, 662, 8911, 8911, 891…
## $ friends_count          <int> 4163, 4163, 4163, 4163, 2016, 3390, 3390, 33…
## $ listed_count           <int> 82, 82, 82, 82, 0, 326, 326, 326, 326, 326, …
## $ statuses_count         <int> 67107, 67107, 67107, 67107, 2797, 168597, 16…
## $ favourites_count       <int> 32287, 32287, 32287, 32287, 5745, 802, 802, …
## $ account_created_at     <dttm> 2009-08-02 23:12:16, 2009-08-02 23:12:16, 2…
## $ verified               <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FA…
## $ profile_url            <chr> NA, NA, NA, NA, NA, "https://t.co/o67L88tsic…
## $ profile_expanded_url   <chr> NA, NA, NA, NA, NA, "https://www.Conservativ…
## $ account_lang           <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ profile_banner_url     <chr> "https://pbs.twimg.com/profile_banners/62374…
## $ profile_background_url <chr> "http://abs.twimg.com/images/themes/theme1/b…
## $ profile_image_url      <chr> "http://pbs.twimg.com/profile_images/3507966…

How many locations are represented?

# Number of distinct values in the `location` column of `users`.
dplyr::n_distinct(users$location)
## [1] 361

Twitter users by unique location. I thought there would have been more locations with 10k tweets, but I assume it may be a settings issue.

# Horizontal bar chart of the five most frequent values of `location`.
users %>%
  # drop missing or empty locations before counting so they cannot take one
  # of the five slots
  filter(!is.na(location), location != "") %>%
  count(location, sort = TRUE) %>%
  # rank on the count column explicitly rather than letting top_n() pick
  # the last column
  top_n(5, n) %>%
  # order the bars by count instead of alphabetically
  mutate(location = reorder(location, n)) %>%
  ggplot(aes(x = location, y = n)) +
  # a fixed fill; mapping the constant "n" inside aes() only produced a
  # one-entry legend
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(x = "Location",
       y = "Count",
       title = "Twitter users - unique locations ")

Word cloud from words associated with #impeachment

# Fetch up to 1000 English-language #impeachment tweets (twitteR) and reduce
# them to plain lower-case text for the word cloud.
tweets_cloud <- searchTwitter("#impeachment", n = 1000, lang = "en")
# vapply() always yields a character vector, even when nothing is returned
tweets.text <- vapply(tweets_cloud, function(x) x$getText(), character(1))
# Convert to lower case first so the patterns below handle a single case
tweets.text <- tolower(tweets.text)
# Remove links while the URL is still intact (before punctuation is stripped)
tweets.text <- gsub("http\\S+", "", tweets.text)
# Remove @UserName
tweets.text <- gsub("@\\w+", "", tweets.text)
# Remove the retweet marker "rt" only as a whole word, so words that merely
# contain "rt" (e.g. "report") are left alone
tweets.text <- gsub("\\brt\\b", "", tweets.text)
# Remove punctuation
tweets.text <- gsub("[[:punct:]]", "", tweets.text)
# Collapse runs of whitespace to one space so neighbouring words stay apart
tweets.text <- gsub("[[:space:]]+", " ", tweets.text)
# Remove blank spaces at the beginning and the end
tweets.text <- trimws(tweets.text)

Generate wordcloud from associated words from #impeachment

library(tm)

# Draw the word cloud from the cleaned tweet text, coloured with the
# 8-colour "Dark2" brewer palette.
wordcloud(
  tweets.text,
  scale = c(7, 0.5),
  min.freq = 2,
  max.words = 150,
  random.order = FALSE,
  random.color = TRUE,
  colors = brewer.pal(8, "Dark2")
)

# Build a one-token-per-row table from the tweets that are not retweets,
# keeping the tweet id next to each word.
tidy_impeachment_tweets <- rt %>%
  filter(!is_retweet) %>%
  select(status_id, text) %>%
  unnest_tokens(output = word, input = text)

# Inspect the tokenised table.
view(tidy_impeachment_tweets)

# Print the `stop_words` table.
stop_words

# Extra Twitter/HTML noise tokens, labelled with their own lexicon name so
# they can be told apart from the rows already in `stop_words`.
my_stop_words <- tibble(
  word = c("https", "t.co", "rt", "amp", "rstats", "gt"),
  lexicon = "twitter"
)

# Stack the two tables into a single stop-word list.
all_stop_words <- bind_rows(stop_words, my_stop_words)

# Inspect the combined list.
view(all_stop_words)

# Remove numbers: drop tokens made up only of digits, optionally with "."
# or "," separators. A pattern test avoids the "NAs introduced by coercion"
# warning that as.numeric() raises on ordinary words, and does not treat
# tokens such as "inf" as numbers.
no_numbers <- tidy_impeachment_tweets %>%
  filter(!str_detect(word, "^[0-9]+([.,][0-9]+)*$"))

# Remove every word that appears in the combined stop-word list.
no_stop_words <- no_numbers %>%
  anti_join(all_stop_words, by = "word")

# get the NRC sentiment lexicon in a tidy format
nrc <- get_sentiments("nrc")

view(nrc)

# Keep only the words that have an entry in the NRC lexicon.
nrc_words <- no_stop_words %>%
  inner_join(nrc, by = "word")

view(nrc_words)

# Number of matched rows per sentiment, most frequent first.
pie_words <- nrc_words %>%
  count(sentiment, sort = TRUE)

Sentiment analysis of the emotions expressed in the tweets. Much like our current culture, the positive and negative categories made up a slightly larger share of the split. Go partisan!

# Pie chart of the per-sentiment counts in `pie_words`, one slice per
# sentiment, with white slice borders.
ggpubr::ggpie(
  data = pie_words,
  x = "n",
  label = "sentiment",
  fill = "sentiment",
  color = "white",
  palette = "Spectral"
)

# Count tweets per value of `source` in the 500-tweet sample.
impeachment_sources <- impeachment_tweets_raw %>%
  count(source) %>%
  rename(count = n)

# Keep only the sources that account for more than 11 tweets.
impeachment_sources <- impeachment_sources %>%
  filter(count > 11)

Source data for #impeachment tweets. No shock here: the majority are from cell phones.

# Donut chart of where the #impeachment tweets were posted from.
# Each source becomes one ring segment spanning [ymin, ymax] on a 0-1 scale.
data <- data.frame(
  category = impeachment_sources$source,
  count = impeachment_sources$count)
data$fraction <- data$count / sum(data$count)
data$percentage <- data$fraction * 100
data$ymax <- cumsum(data$fraction)
data$ymin <- c(0, head(data$ymax, n = -1))

# NOTE(review): rows 1 and 6 are dropped by position, after the fractions
# were computed. Which sources these are depends on the data pulled, and the
# ring will show gaps where they were - confirm this is intended.
data <- data[-c(1, 6), ]

# Legend label: source name plus its share rounded to one decimal place.
data$Source <- sprintf("%s %.1f%%", as.character(data$category),
                       data$percentage)

ggplot(data, aes(ymax = ymax, ymin = ymin, xmax = 4, xmin = 3,
                 fill = Source)) +
  geom_rect() +
  # polar y turns the stacked rectangles into ring segments
  coord_polar(theta = "y") +
  # x limits starting below xmin leave the hole in the middle
  xlim(c(2, 4)) +
  theme_void() +
  theme(legend.position = "right")

# Download up to 3200 timeline tweets for each account (twitteR) and convert
# the returned status objects into one tibble per account.
# as_tibble() replaces the deprecated tbl_df().
mconnell_tweets <- userTimeline("@senatemajldr", n = 3200)
mconnell_tweets_df <- as_tibble(map_df(mconnell_tweets, as.data.frame))

pelosi_tweets <- userTimeline("@SpeakerPelosi", n = 3200)
pelosi_tweets_df <- as_tibble(map_df(pelosi_tweets, as.data.frame))

I wanted to take the temperature of the talk on either side of the aisle. Here are 20 of the most frequently tweeted words - Mitch McConnell for the Republicans.

# Word frequencies for the McConnell timeline.
tweets <- mconnell_tweets_df

# Token separator: any character that is not a letter, digit, "#", "@" or
# apostrophe, plus any apostrophe that is not followed by one of those.
reg <- "([^A-Za-z\\d#@']|'(?![A-Za-z\\d#@]))"

tweet_words <- tweets %>%
  # skip tweets whose text starts with a double quote
  filter(!str_detect(text, '^"')) %>%
  # strip t.co links and the HTML-escaped ampersand
  mutate(text = str_replace_all(text, "https://t.co/[A-Za-z\\d]+|&amp;", "")) %>%
  unnest_tokens(word, text, token = "regex", pattern = reg) %>%
  # drop stop words, then tokens that contain no lower-case letter
  filter(!(word %in% stop_words$word)) %>%
  filter(str_detect(word, "[a-z]"))

# One row per word with its frequency, most frequent first.
tweet_words <- tweet_words %>%
  count(word) %>%
  transmute(word, freq = n) %>%
  arrange(desc(freq))

# Table of the 20 most frequent words.
kable(head(tweet_words, 20), row.names = TRUE)
word freq
1 democrats 133
2 senate 131
3 american 97
4 @potus 93
5 #taxreform 91
6 #senate 75
7 people 49
8 bill 47
9 judge 46
10 house 45
11 president 43
12 congress 41
13 funding 40
14 kavanaugh 39
15 nation 39
16 americans 38
17 time 38
18 act 37
19 tax 37
20 bipartisan 36

The sentiment behind Senate Majority Leader McConnell's tweets

# Attach the Bing sentiment to each counted word. The join key is given
# explicitly, as in the other joins in this script.
mconnell_sentiment <- tweet_words %>%
  inner_join(get_sentiments("bing"), by = "word")

# Faceted bar chart of the 10 most frequent words per sentiment.
mconnell_sentiment %>%
  group_by(sentiment) %>%
  top_n(10, freq) %>%
  ungroup() %>%
  # order the bars by frequency instead of alphabetically
  mutate(word = reorder(word, freq)) %>%
  ggplot(aes(word, freq, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  coord_flip()

Here are 20 of the most frequently tweeted words - Nancy Pelosi for the Democrats.

# Word frequencies for the Pelosi timeline.
tweets <- pelosi_tweets_df

# Token separator: any character that is not a letter, digit, "#", "@" or
# apostrophe, plus any apostrophe that is not followed by one of those.
reg <- "([^A-Za-z\\d#@']|'(?![A-Za-z\\d#@]))"

tweet_words <- tweets %>%
  # skip tweets whose text starts with a double quote
  filter(!str_detect(text, '^"')) %>%
  # strip t.co links and the HTML-escaped ampersand
  mutate(text = str_replace_all(text, "https://t.co/[A-Za-z\\d]+|&amp;", "")) %>%
  unnest_tokens(word, text, token = "regex", pattern = reg) %>%
  # drop stop words, then tokens that contain no lower-case letter
  filter(!(word %in% stop_words$word)) %>%
  filter(str_detect(word, "[a-z]"))

# One row per word with its frequency, most frequent first.
tweet_words <- tweet_words %>%
  count(word) %>%
  transmute(word, freq = n) %>%
  arrange(desc(freq))

# Table of the 20 most frequent words.
kable(head(tweet_words, 20), row.names = TRUE)
word freq
1 house 34
2 president 33
3 @realdonaldtrump 31
4 americans 22
5 trump 20
6 tune 20
7 act 19
8 bipartisan 18
9 @senatemajldr 17
10 america 16
11 people 16
12 american 15
13 delegation 14
14 passed 13
15 @housedemocrats 12
16 country 12
17 republicans 12
18 legislation 11
19 live 11
20 rights 11

The sentiment behind House Speaker Pelosi's tweets

# Attach the Bing sentiment to each counted word. The join key is given
# explicitly, as in the other joins in this script.
pelosi_sentiment <- tweet_words %>%
  inner_join(get_sentiments("bing"), by = "word")

# Faceted bar chart of the 10 most frequent words per sentiment.
pelosi_sentiment %>%
  group_by(sentiment) %>%
  top_n(10, freq) %>%
  ungroup() %>%
  # order the bars by frequency instead of alphabetically
  mutate(word = reorder(word, freq)) %>%
  ggplot(aes(word, freq, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  coord_flip()

Overall the data came out about as I would have expected, with the exception of the unique locations. I know how to use the round function, but I had trouble working out the errors that came up when data other than numeric data was also present. Overall, pulling data from Twitter was easier than I expected, but I didn’t always have a good sense of the best way to use the data I pulled for different graphics. It is a small miracle I got this done: my dog (no sh*t) stepped on my keyboard after I had used the undo key to run very far back, and I lost almost everything. Fortunately, after 2 days I had done enough trial and error that the redo went faster.