library(rtweet)
library(tidytext)
library(stringr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Twitter API credentials.
# Secrets are read from environment variables (for example set in ~/.Renviron)
# instead of being written into the script, so they are never committed or
# published with the analysis. Any keys that were previously hard-coded in this
# file should be treated as leaked and regenerated.
app <- "Ngolaz"
consumer_key <- Sys.getenv("TWITTER_CONSUMER_KEY")
consumer_secret <- Sys.getenv("TWITTER_CONSUMER_SECRET")
access_token <- Sys.getenv("TWITTER_ACCESS_TOKEN")
access_secret <- Sys.getenv("TWITTER_ACCESS_SECRET")

# Sys.getenv() returns "" for unset variables; stop early with a clear message
# rather than building a token from empty strings.
if (!all(nzchar(c(consumer_key, consumer_secret, access_token, access_secret)))) {
  stop(
    "Twitter API credentials are missing: set TWITTER_CONSUMER_KEY, ",
    "TWITTER_CONSUMER_SECRET, TWITTER_ACCESS_TOKEN and TWITTER_ACCESS_SECRET.",
    call. = FALSE
  )
}

# Build the token used for the searches below.
my_token <- create_token(
  app = app,
  consumer_key = consumer_key,
  consumer_secret = consumer_secret,
  access_token = access_token,
  access_secret = access_secret
)

# Print whether the token just created is identical to the one get_token()
# returns.
identical(my_token, get_token())
## [1] FALSE
# Number of tweets requested from each hashtag search in this script.
num_tweets <- 12000

# Search for tweets tagged #maine, leaving retweets out, and preview the
# first rows of the result.
mt <- search_tweets(q = "#maine", n = num_tweets, include_rts = FALSE)
head(mt)
## # A tibble: 6 x 90
## user_id status_id created_at screen_name text source
## <chr> <chr> <dttm> <chr> <chr> <chr>
## 1 216332~ 11995384~ 2019-11-27 04:00:27 conpsweeney "#Ma~ Hoots~
## 2 216332~ 11969915~ 2019-11-20 03:20:06 conpsweeney "#Ma~ Hoots~
## 3 216332~ 11979479~ 2019-11-22 18:40:11 conpsweeney The ~ Hoots~
## 4 216332~ 11965134~ 2019-11-18 19:40:08 conpsweeney Dark~ Hoots~
## 5 216332~ 11964228~ 2019-11-18 13:40:08 conpsweeney Dark~ Hoots~
## 6 216332~ 11972434~ 2019-11-20 20:00:49 conpsweeney "#Ma~ Hoots~
## # ... with 84 more variables: display_text_width <dbl>,
## # reply_to_status_id <chr>, reply_to_user_id <chr>,
## # reply_to_screen_name <chr>, is_quote <lgl>, is_retweet <lgl>,
## # favorite_count <int>, retweet_count <int>, quote_count <int>,
## # reply_count <int>, hashtags <list>, symbols <list>, urls_url <list>,
## # urls_t.co <list>, urls_expanded_url <list>, media_url <list>,
## # media_t.co <list>, media_expanded_url <list>, media_type <list>,
## # ext_media_url <list>, ext_media_t.co <list>,
## # ext_media_expanded_url <list>, ext_media_type <chr>,
## # mentions_user_id <list>, mentions_screen_name <list>, lang <chr>,
## # quoted_status_id <chr>, quoted_text <chr>, quoted_created_at <dttm>,
## # quoted_source <chr>, quoted_favorite_count <int>,
## # quoted_retweet_count <int>, quoted_user_id <chr>,
## # quoted_screen_name <chr>, quoted_name <chr>,
## # quoted_followers_count <int>, quoted_friends_count <int>,
## # quoted_statuses_count <int>, quoted_location <chr>,
## # quoted_description <chr>, quoted_verified <lgl>,
## # retweet_status_id <chr>, retweet_text <chr>,
## # retweet_created_at <dttm>, retweet_source <chr>,
## # retweet_favorite_count <int>, retweet_retweet_count <int>,
## # retweet_user_id <chr>, retweet_screen_name <chr>, retweet_name <chr>,
## # retweet_followers_count <int>, retweet_friends_count <int>,
## # retweet_statuses_count <int>, retweet_location <chr>,
## # retweet_description <chr>, retweet_verified <lgl>, place_url <chr>,
## # place_name <chr>, place_full_name <chr>, place_type <chr>,
## # country <chr>, country_code <chr>, geo_coords <list>,
## # coords_coords <list>, bbox_coords <list>, status_url <chr>,
## # name <chr>, location <chr>, description <chr>, url <chr>,
## # protected <lgl>, followers_count <int>, friends_count <int>,
## # listed_count <int>, statuses_count <int>, favourites_count <int>,
## # account_created_at <dttm>, verified <lgl>, profile_url <chr>,
## # profile_expanded_url <chr>, account_lang <lgl>,
## # profile_banner_url <chr>, profile_background_url <chr>,
## # profile_image_url <chr>
# Tweets per posting client (`source`) for the #maine search, with each
# client's share of all returned tweets, ordered from most to least used.
me_platform <- mt %>%
  count(source) %>%
  mutate(percent_of_tweets = n / sum(n)) %>%
  arrange(desc(n))

# Show the ten most used clients.
head(me_platform, 10)
## # A tibble: 10 x 3
## source n percent_of_tweets
## <chr> <int> <dbl>
## 1 Hootsuite Inc. 808 0.254
## 2 Twitter Web App 557 0.175
## 3 Twitter for iPhone 379 0.119
## 4 Twitter for Android 263 0.0828
## 5 TweetDeck 189 0.0595
## 6 Instagram 177 0.0557
## 7 Tweet Suite 170 0.0535
## 8 Buffer 86 0.0271
## 9 IFTTT 52 0.0164
## 10 Twitter Web Client 52 0.0164
# Ten most active accounts in the #maine search: tweet count per screen name
# and that account's share of all returned tweets.
mt %>%
  count(screen_name, sort = TRUE) %>%
  mutate(percent_of_tweets = n / sum(n)) %>%
  head(10)
## # A tibble: 10 x 3
## screen_name n percent_of_tweets
## <chr> <int> <dbl>
## 1 conpsweeney 378 0.119
## 2 jhhayman 186 0.0586
## 3 MaineTweetz 170 0.0535
## 4 BasketIsOysters 113 0.0356
## 5 rick03907 55 0.0173
## 6 AltThisMoFo 51 0.0161
## 7 OnlineSentinel 48 0.0151
## 8 melivingcom 44 0.0139
## 9 PulpNews 43 0.0135
## 10 yuckf001 38 0.0120
# Token separator for unnest_tokens(): any character that is not a letter,
# digit, '#', '@' or apostrophe, plus any apostrophe that is not followed by
# one of those characters. Hashtags, mentions and contractions therefore stay
# together as single tokens.
reg <- "([^A-Za-z\\d#@']|'(?![A-Za-z\\d#@]))"

# One row per (tweet, word) for the #maine tweets.
maine_words <- mt %>%
  select(status_id, text) %>%
  # Skip tweets whose text begins with a double quote.
  filter(!str_detect(text, '^"')) %>%
  # Remove t.co short links and HTML-escaped ampersands. The entity has to be
  # matched as a whole ("&amp;"): deleting only "&" leaves "amp;" behind, which
  # then shows up as a bogus word "amp". A bare "&" needs no special handling
  # because the tokenizer pattern already splits on it.
  mutate(text = str_replace_all(text, "https://t\\.co/[A-Za-z\\d]+|&amp;", "")) %>%
  unnest_tokens(word, text, token = "regex", pattern = reg) %>%
  # Drop stop words and tokens that contain no lowercase letter.
  filter(
    !word %in% stop_words$word,
    str_detect(word, "[a-z]")
  )

# Most frequent words, most common first. `n` is passed explicitly so top_n()
# does not have to guess the ranking column; ties at the cut-off are kept.
maine_words %>%
  count(word, sort = TRUE) %>%
  top_n(20, n)
## Selecting by n
## # A tibble: 20 x 2
## word n
## <chr> <int>
## 1 #maine 3163
## 2 girl 541
## 3 #murdermystery 368
## 4 maine 328
## 5 bridge 275
## 6 glass 275
## 7 @pressherald 232
## 8 including 207
## 9 books 204
## 10 author 201
## 11 challenge 199
## 12 sets 199
## 13 obsession 197
## 14 shown 197
## 15 @jhhayman 196
## 16 deft 196
## 17 earlier 196
## 18 fatal 196
## 19 plotting 196
## 20 prowess 196
# NRC lexicon trimmed to its word/sentiment pairs; a word can appear on
# several rows, one per sentiment it is tagged with.
nrc <- select(get_sentiments("nrc"), word, sentiment)
head(nrc)
## # A tibble: 6 x 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
# Keep only the Maine words that occur in the NRC lexicon; a word yields one
# row for every sentiment the lexicon assigns to it.
maine_words_sentiments <- inner_join(maine_words, nrc, by = "word")

# Number of matched rows per sentiment, largest first.
maine_words_sentiments %>%
  count(sentiment, sort = TRUE)
## # A tibble: 10 x 2
## sentiment n
## <chr> <int>
## 1 positive 2720
## 2 negative 1601
## 3 trust 1504
## 4 anticipation 1205
## 5 joy 1110
## 6 fear 1098
## 7 anger 1061
## 8 sadness 970
## 9 surprise 469
## 10 disgust 394
# Ids of tweets containing at least one word tagged "positive".
pos_tw_ids <- distinct(
  filter(maine_words_sentiments, sentiment == "positive"),
  status_id
)

# Text of the first ten such tweets.
inner_join(mt, pos_tw_ids, by = "status_id") %>%
  select(text) %>%
  head(10)
## # A tibble: 10 x 1
## text
## <chr>
## 1 "#Maine author @jhhayman has shown his deft plotting prowess in earlier~
## 2 "#Maine author @jhhayman has shown his deft plotting prowess in earlier~
## 3 "#Maine author @jhhayman has shown his deft plotting prowess in earlier~
## 4 "#Maine author @jhhayman has shown his deft plotting prowess in earlier~
## 5 "#Maine author @jhhayman has shown his deft plotting prowess in earlier~
## 6 "#Maine author @jhhayman has shown his deft plotting prowess in earlier~
## 7 "#Maine author @jhhayman has shown his deft plotting prowess in earlier~
## 8 "#Maine author @jhhayman has shown his deft plotting prowess in earlier~
## 9 "#Maine author @jhhayman has shown his deft plotting prowess in earlier~
## 10 "#Maine author @jhhayman has shown his deft plotting prowess in earlier~
# Distinct (tweet id, word) pairs where the word is tagged "disgust".
disg_tw_ids <- distinct(
  filter(maine_words_sentiments, sentiment == "disgust"),
  status_id, word
)

# First ten matches, showing the tweet text next to the triggering word.
inner_join(mt, disg_tw_ids, by = "status_id") %>%
  select(text, word) %>%
  head(10)
## # A tibble: 10 x 2
## text word
## <chr> <chr>
## 1 The Cutting tells about a #serialkiller dealing with #illegalorg~ cutti~
## 2 The Cutting tells about a #serialkiller dealing with #illegalorg~ cutti~
## 3 The Cutting tells about a #serialkiller dealing with #illegalorg~ cutti~
## 4 The Cutting tells about a #serialkiller dealing with #illegalorg~ cutti~
## 5 The Cutting tells about a #serialkiller dealing with #illegalorg~ cutti~
## 6 The Cutting tells about a #serialkiller dealing with #illegalorg~ cutti~
## 7 The Cutting tells about a #serialkiller dealing with #illegalorg~ cutti~
## 8 The Cutting tells about a #serialkiller dealing with #illegalorg~ cutti~
## 9 The Cutting tells about a #serialkiller dealing with #illegalorg~ cutti~
## 10 The Cutting tells about a #serialkiller dealing with #illegalorg~ cutti~
# Same search as for Maine, this time for the #Vermont hashtag.
vt <- search_tweets(q = "#Vermont", n = num_tweets, include_rts = FALSE)

# Tweets per posting client and each client's share of the Vermont tweets,
# ordered from most to least used.
vt_platform <- vt %>%
  count(source) %>%
  mutate(percent_of_tweets = n / sum(n)) %>%
  arrange(desc(n))
# Extract the words from the Vermont tweets and join them to the NRC
# sentiment words.
vt_words <- vt %>%
  select(status_id, text) %>%
  # Skip tweets whose text begins with a double quote.
  filter(!str_detect(text, '^"')) %>%
  # Remove t.co short links and HTML-escaped ampersands. The entity has to be
  # matched as a whole ("&amp;"): deleting only "&" leaves "amp;" behind, which
  # then shows up as a bogus word "amp". A bare "&" needs no special handling
  # because the tokenizer pattern already splits on it.
  mutate(text = str_replace_all(text, "https://t\\.co/[A-Za-z\\d]+|&amp;", "")) %>%
  unnest_tokens(word, text, token = "regex", pattern = reg) %>%
  # Drop stop words and tokens that contain no lowercase letter.
  filter(
    !word %in% stop_words$word,
    str_detect(word, "[a-z]")
  )

# One row per (tweet, word, sentiment) for words found in the lexicon.
vt_words_sentiments <- vt_words %>%
  inner_join(nrc, by = "word")
# Label every table with the state it came from so the two searches can be
# stacked and compared.
me_platform <- mutate(me_platform, state = "Maine")
vt_platform <- mutate(vt_platform, state = "Vermont")
maine_words_sentiments <- mutate(maine_words_sentiments, state = "Maine")
vt_words_sentiments <- mutate(vt_words_sentiments, state = "Vermont")

# Stack the Maine rows on top of the Vermont rows.
platform <- bind_rows(me_platform, vt_platform)
words_sentiments <- bind_rows(maine_words_sentiments, vt_words_sentiments)
# Clients to compare between the two states.
pf <- c(
  "Twitter Web Client",
  "Twitter for iPhone",
  "Instagram",
  "Hootsuite Inc.",
  "Post Planner Inc."
)
pf_df <- filter(platform, source %in% pf)

# Side-by-side bars: each client's share of tweets, one bar per state.
ggplot(pf_df, aes(x = source, y = percent_of_tweets, fill = state)) +
  geom_col(position = "dodge") +
  labs(x = "Platform", y = "Percent of tweets") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Rows per (state, sentiment), then each sentiment's share within its own
# state. The grouping by state is what makes sum(n) a per-state total.
sent_df <- words_sentiments %>%
  count(state, sentiment) %>%
  group_by(state) %>%
  mutate(frequency = n / sum(n))
# Side-by-side bars comparing how the sentiment-tagged words split across
# sentiments in each state.
# `frequency` is a sentiment's share of the sentiment-tagged words within a
# state, not a share of tweets, so the y axis is labelled accordingly (the
# earlier "Percent of tweets" label was carried over from the platform plot).
ggplot(sent_df, aes(x = sentiment, y = frequency, fill = state)) +
  geom_bar(stat = "identity", position = "dodge") +
  xlab("Sentiment") +
  ylab("Share of sentiment-tagged words") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
