knitr::opts_chunk$set(echo = TRUE)
# load twitter libraries - rtweet is now recommended over twitteR, but twitteR is still used below for searchTwitter() and userTimeline()
library(rtweet)
# plotting and pipes - tidyverse!
library(ggplot2)
library(dplyr)
# text mining library
library(tidytext)
library(twitteR)
library(ROAuth)
library(tm)
library(wordcloud)
library(kableExtra)
library(tidyverse)
library(igraph)
library(stringr)
library(knitr)
library(ggpubr)
# whatever name you assigned to your created app
appname <- "brett_vis_data"
## api key (example below is not a real key)
key <- "2U3TnrUD3MtbBoC0qXvmI1nBI"
## api secret (example below is not a real key)
secret <- "IScwTICj9sG5rXD7VsZPt53ZLvJPIa86m4PVGfpyIF2rtmzJfO"
## access token and secret (examples below are not real credentials)
access_token <- "1089208256717877250-2b2rHhrFZxAnDk5A2KQbyteOsVoi6Y"
access_secret <- "XLc5Hvqbc2Qf9RXf3rwhnJAw3qBXo4lVkMzlfUXhC74UJ"
# create token named "twitter_token", reusing the credential variables above
twitter_token <- create_token(
  app = appname,
  consumer_key = key,
  consumer_secret = secret,
  access_token = access_token,
  access_secret = access_secret)
# authenticate the twitteR package with the same credentials
setup_twitter_oauth(key, secret, access_token, access_secret)
## [1] "Using direct authentication"
impeachment_tweets_raw <- search_tweets(q = "#impeachment",
n = 500)
# keep only the screen name and tweet text, then view the first 10 rows
impeachment_tweets <- impeachment_tweets_raw %>%
select(screen_name,text)
impeachment_head <- head(impeachment_tweets, n = 10)
impeachment_retweets_raw <- search_tweets("#impeachment", n = 10000,
include_rts = FALSE)
# view top 10 rows of data
impeachment_retweets <- impeachment_retweets_raw %>%
select(screen_name,text)
head(impeachment_retweets, n = 10)
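The standard search API caps a single call at 18,000 tweets per 15-minute window; for larger pulls, rtweet can wait out the rate limit for you. A sketch (the 50,000 figure is just an illustration):
big_pull <- search_tweets("#impeachment", n = 50000, # illustrative size
                          include_rts = FALSE,
                          retryonratelimit = TRUE) # sleep through rate-limit windows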
## search for 10000 tweets using the impeachment hashtag
rt <- rtweet::search_tweets(
"#impeachment", n = 10000, include_rts = FALSE)
users_data(rt) %>% dplyr::glimpse(78)
## Observations: 6,490
## Variables: 20
## $ user_id <chr> "62374483", "62374483", "62374483", "6237448…
## $ screen_name <chr> "Horn_Sannity", "Horn_Sannity", "Horn_Sannit…
## $ name <chr> "Horn Sannity", "Horn Sannity", "Horn Sannit…
## $ location <chr> "God's Green Earth", "God's Green Earth", "G…
## $ description <chr> "Keeping an eye on things. Fighting for Sane…
## $ url <chr> NA, NA, NA, NA, NA, "https://t.co/o67L88tsic…
## $ protected <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FA…
## $ followers_count <int> 3235, 3235, 3235, 3235, 662, 8911, 8911, 891…
## $ friends_count <int> 4163, 4163, 4163, 4163, 2016, 3390, 3390, 33…
## $ listed_count <int> 82, 82, 82, 82, 0, 326, 326, 326, 326, 326, …
## $ statuses_count <int> 67107, 67107, 67107, 67107, 2797, 168597, 16…
## $ favourites_count <int> 32287, 32287, 32287, 32287, 5745, 802, 802, …
## $ account_created_at <dttm> 2009-08-02 23:12:16, 2009-08-02 23:12:16, 2…
## $ verified <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FA…
## $ profile_url <chr> NA, NA, NA, NA, NA, "https://t.co/o67L88tsic…
## $ profile_expanded_url <chr> NA, NA, NA, NA, NA, "https://www.Conservativ…
## $ account_lang <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ profile_banner_url <chr> "https://pbs.twimg.com/profile_banners/62374…
## $ profile_background_url <chr> "http://abs.twimg.com/images/themes/theme1/b…
## $ profile_image_url <chr> "http://pbs.twimg.com/profile_images/3507966…
# pull the user-level data out of the search results before using it below
users <- users_data(rt)
length(unique(users$location))
## [1] 361
users %>%
count(location, sort = TRUE) %>%
mutate(location = reorder(location, n)) %>%
na.omit() %>%
top_n(5) %>%
ggplot(aes(x = location, y = n, fill = n)) + # fill = "n" would map a constant string, not the count
geom_col() +
coord_flip() +
labs(x = "Location",
y = "Count",
title = "Twitter users - unique locations")
tweets_cloud <- searchTwitter("#impeachment", n = 1000, lang = "en")
tweets.text <- sapply(tweets_cloud, function(x) x$getText())
# Convert all text to lower case first, so the patterns below match consistently
tweets.text <- tolower(tweets.text)
# Remove links (before stripping punctuation, while the URLs are still intact)
tweets.text <- gsub("http\\S+", "", tweets.text)
# Remove @UserName mentions
tweets.text <- gsub("@\\w+", "", tweets.text)
# Remove the retweet marker "rt" as a standalone word only
# (a bare gsub("rt", "", ...) would also mangle words like "court")
tweets.text <- gsub("\\brt\\b", "", tweets.text)
# Remove punctuation
tweets.text <- gsub("[[:punct:]]", "", tweets.text)
# Collapse runs of spaces and tabs into a single space
tweets.text <- gsub("[ \t]{2,}", " ", tweets.text)
# Remove blank spaces at the beginning and end
tweets.text <- gsub("^ +| +$", "", tweets.text)
wordcloud(tweets.text, min.freq = 2, scale = c(7, 0.5),
          colors = brewer.pal(8, "Dark2"), random.color = TRUE,
          random.order = FALSE, max.words = 150)
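wordcloud() will tokenize and count the raw strings internally; an equivalent route that is easier to inspect (a sketch using the tidytext tools already loaded) is to build the frequency table yourself and pass it in:
word_counts <- tibble(text = tweets.text) %>%
  unnest_tokens(word, text) %>% # one word per row
  anti_join(stop_words, by = "word") %>% # drop common English stop words
  count(word, sort = TRUE)
wordcloud(words = word_counts$word, freq = word_counts$n,
          min.freq = 2, max.words = 150,
          colors = brewer.pal(8, "Dark2"), random.order = FALSE)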
# Restructure the original (non-retweet) tweets into one-token-per-row format
tidy_impeachment_tweets <- rt %>%
filter(is_retweet == FALSE) %>% # original tweets only
select(status_id,
text) %>% # variables of interest
unnest_tokens(word, text) # splits the text column into one word per row
view(tidy_impeachment_tweets)
stop_words # tidytext's built-in stop word lexicons
my_stop_words <- tibble(
word = c(
"https",
"t.co",
"rt",
"amp",
"rstats",
"gt"
),
lexicon = "twitter"
)
# Combine the built-in stop words with our custom ones
all_stop_words <- stop_words %>%
bind_rows(my_stop_words) # stacks the two data frames
# Let's see if it worked
view(all_stop_words)
# Remove purely numeric tokens: as.numeric() returns NA for non-numbers,
# and suppressWarnings() silences the coercion warnings that produces
no_numbers <- tidy_impeachment_tweets %>%
filter(is.na(suppressWarnings(as.numeric(word))))
no_stop_words <- no_numbers %>%
anti_join(all_stop_words, by = "word")
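A quick sanity check that the cleaning worked as intended is to look at the most frequent remaining tokens (just a spot check, not part of the analysis):
no_stop_words %>%
  count(word, sort = TRUE) %>% # most frequent tokens after cleaning
  head(10)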
nrc <- get_sentiments("nrc") # get specific sentiment lexicons in a tidy format
view(nrc)
nrc_words <- no_stop_words %>%
inner_join(nrc, by="word")
view(nrc_words)
pie_words <- nrc_words %>%
group_by(sentiment) %>% # group by sentiment type
tally() %>% # counts the number of rows per sentiment
arrange(desc(n)) # arrange sentiments in descending order of frequency
ggpubr::ggpie(pie_words, "n", label = "sentiment",
fill = "sentiment", color = "white",
palette = "Spectral")
impeachment_sources <- impeachment_tweets_raw %>%
select(source) %>%
group_by(source) %>%
summarize(count = n())
impeachment_sources <- subset(impeachment_sources, count > 11)
data <- data.frame(
category = impeachment_sources$source,
count = impeachment_sources$count)
# drop the unwanted source rows *before* computing fractions, otherwise the
# cumulative ymax/ymin values leave gaps in the donut
data <- data[-c(1, 6), ]
data$fraction = data$count / sum(data$count)
data$percentage = data$fraction * 100
data$ymax = cumsum(data$fraction)
data$ymin = c(0, head(data$ymax, n = -1))
data$Source <- paste0(data$category, " ", round(data$percentage, 1), "%")
ggplot(data, aes(ymax = ymax, ymin = ymin, xmax = 4, xmin = 3, fill = Source)) +
geom_rect() +
coord_polar(theta = "y") + # wraps the stacked rectangles into a ring
xlim(c(2, 4)) + # the empty 2-3 band creates the donut hole
theme_void() +
theme(legend.position = "right")
mconnell_tweets <- userTimeline("@senatemajldr", n = 3200)
mconnell_tweets_df <- as_tibble(map_df(mconnell_tweets, as.data.frame)) # tbl_df() is deprecated in favor of as_tibble()
pelosi_tweets <- userTimeline("@SpeakerPelosi", n = 3200)
pelosi_tweets_df <- as_tibble(map_df(pelosi_tweets, as.data.frame))
tweets <- mconnell_tweets_df
# tokens are runs of letters/digits plus #, @ and internal apostrophes,
# so hashtags and @mentions survive tokenization intact
reg <- "([^A-Za-z\\d#@']|'(?![A-Za-z\\d#@]))"
tweet_words <- tweets %>%
filter(!str_detect(text, '^"')) %>% # drop quoted (manual) retweets
mutate(text = str_replace_all(text, "https://t.co/[A-Za-z\\d]+|&amp;", "")) %>% # strip links and HTML ampersands
unnest_tokens(word, text, token = "regex", pattern = reg) %>%
filter(!word %in% stop_words$word,
str_detect(word, "[a-z]"))
tweet_words <- tweet_words %>%
group_by(word) %>%
tally() %>%
mutate(freq = n) %>%
select(word, freq) %>%
arrange(desc(freq))
tweet_words %>% head(20) %>% kable(row.names=TRUE)
| | word | freq |
|---|---|---|
| 1 | democrats | 133 |
| 2 | senate | 131 |
| 3 | american | 97 |
| 4 | @potus | 93 |
| 5 | #taxreform | 91 |
| 6 | #senate | 75 |
| 7 | people | 49 |
| 8 | bill | 47 |
| 9 | judge | 46 |
| 10 | house | 45 |
| 11 | president | 43 |
| 12 | congress | 41 |
| 13 | funding | 40 |
| 14 | kavanaugh | 39 |
| 15 | nation | 39 |
| 16 | americans | 38 |
| 17 | time | 38 |
| 18 | act | 37 |
| 19 | tax | 37 |
| 20 | bipartisan | 36 |
mconnell_sentiment <- tweet_words %>%
inner_join(get_sentiments("bing"), by = "word")
mconnell_sentiment %>%
group_by(sentiment) %>%
top_n(10, freq) %>%
arrange(freq) %>%
ungroup() %>%
mutate(word = reorder(word, freq)) %>%
ggplot(aes(word, freq, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
coord_flip()
tweets <- pelosi_tweets_df
reg <- "([^A-Za-z\\d#@']|'(?![A-Za-z\\d#@]))" # same tokenization pattern as above
tweet_words <- tweets %>%
filter(!str_detect(text, '^"')) %>% # drop quoted (manual) retweets
mutate(text = str_replace_all(text, "https://t.co/[A-Za-z\\d]+|&amp;", "")) %>% # strip links and HTML ampersands
unnest_tokens(word, text, token = "regex", pattern = reg) %>%
filter(!word %in% stop_words$word,
str_detect(word, "[a-z]"))
tweet_words <- tweet_words %>%
group_by(word) %>%
tally() %>%
mutate(freq = n) %>%
select(word, freq) %>%
arrange(desc(freq))
tweet_words %>% head(20) %>% kable(row.names=TRUE)
| | word | freq |
|---|---|---|
| 1 | house | 34 |
| 2 | president | 33 |
| 3 | @realdonaldtrump | 31 |
| 4 | americans | 22 |
| 5 | trump | 20 |
| 6 | tune | 20 |
| 7 | act | 19 |
| 8 | bipartisan | 18 |
| 9 | @senatemajldr | 17 |
| 10 | america | 16 |
| 11 | people | 16 |
| 12 | american | 15 |
| 13 | delegation | 14 |
| 14 | passed | 13 |
| 15 | @housedemocrats | 12 |
| 16 | country | 12 |
| 17 | republicans | 12 |
| 18 | legislation | 11 |
| 19 | live | 11 |
| 20 | rights | 11 |
pelosi_sentiment <- tweet_words %>%
inner_join(get_sentiments("bing"), by = "word")
pelosi_sentiment %>%
group_by(sentiment) %>%
top_n(10, freq) %>%
arrange(freq) %>%
ungroup() %>%
mutate(word = reorder(word, freq)) %>%
ggplot(aes(word, freq, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
coord_flip()
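Since both sentiment frames have the same columns, a side-by-side tally (a sketch, assuming raw word counts rather than proportions are what you want to compare) makes the two accounts directly comparable:
bind_rows(mconnell = mconnell_sentiment,
          pelosi = pelosi_sentiment,
          .id = "account") %>%
  group_by(account, sentiment) %>%
  summarize(total = sum(freq), .groups = "drop") # total sentiment-word occurrences per account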
Overall the data came out about as I expected, with the exception of the unique locations. I know how to use the round() function, but I had trouble working out the errors caused by non-numeric data being mixed in with the numeric data. Pulling data from Twitter was easier than I expected, but I didn't always have a good sense of the best way to use the data I pulled for the different graphics. It is a small miracle I got this done: my dog (no joke) stepped on my keyboard after I had used the undo key to step very far back, and I lost almost everything. Fortunately, after two days of trial and error, redoing the work went much faster.