library(rtweet)
library(tidytext)
library(stringr)
library(dplyr) 
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
app <- "Ngolaz"
# API keys redacted; store your own credentials in ~/.Renviron under these
# (illustrative) variable names rather than hard-coding them in the script
consumer_key <- Sys.getenv("TWITTER_CONSUMER_KEY")
consumer_secret <- Sys.getenv("TWITTER_CONSUMER_SECRET")
access_token <- Sys.getenv("TWITTER_ACCESS_TOKEN")
access_secret <- Sys.getenv("TWITTER_ACCESS_SECRET")
my_token <- create_token(app = app,
            consumer_key = consumer_key,
            consumer_secret = consumer_secret,
            access_token = access_token,
            access_secret = access_secret)
identical(my_token, get_token())
## [1] FALSE
num_tweets <- 12000
mt <- search_tweets('#maine', n = num_tweets, include_rts = FALSE)
head(mt)
## # A tibble: 6 x 90
##   user_id status_id created_at          screen_name text  source
##   <chr>   <chr>     <dttm>              <chr>       <chr> <chr> 
## 1 216332~ 11995384~ 2019-11-27 04:00:27 conpsweeney "#Ma~ Hoots~
## 2 216332~ 11969915~ 2019-11-20 03:20:06 conpsweeney "#Ma~ Hoots~
## 3 216332~ 11979479~ 2019-11-22 18:40:11 conpsweeney The ~ Hoots~
## 4 216332~ 11965134~ 2019-11-18 19:40:08 conpsweeney Dark~ Hoots~
## 5 216332~ 11964228~ 2019-11-18 13:40:08 conpsweeney Dark~ Hoots~
## 6 216332~ 11972434~ 2019-11-20 20:00:49 conpsweeney "#Ma~ Hoots~
## # ... with 84 more variables: display_text_width <dbl>,
## #   reply_to_status_id <chr>, reply_to_user_id <chr>,
## #   reply_to_screen_name <chr>, is_quote <lgl>, is_retweet <lgl>,
## #   favorite_count <int>, retweet_count <int>, quote_count <int>,
## #   reply_count <int>, hashtags <list>, symbols <list>, urls_url <list>,
## #   urls_t.co <list>, urls_expanded_url <list>, media_url <list>,
## #   media_t.co <list>, media_expanded_url <list>, media_type <list>,
## #   ext_media_url <list>, ext_media_t.co <list>,
## #   ext_media_expanded_url <list>, ext_media_type <chr>,
## #   mentions_user_id <list>, mentions_screen_name <list>, lang <chr>,
## #   quoted_status_id <chr>, quoted_text <chr>, quoted_created_at <dttm>,
## #   quoted_source <chr>, quoted_favorite_count <int>,
## #   quoted_retweet_count <int>, quoted_user_id <chr>,
## #   quoted_screen_name <chr>, quoted_name <chr>,
## #   quoted_followers_count <int>, quoted_friends_count <int>,
## #   quoted_statuses_count <int>, quoted_location <chr>,
## #   quoted_description <chr>, quoted_verified <lgl>,
## #   retweet_status_id <chr>, retweet_text <chr>,
## #   retweet_created_at <dttm>, retweet_source <chr>,
## #   retweet_favorite_count <int>, retweet_retweet_count <int>,
## #   retweet_user_id <chr>, retweet_screen_name <chr>, retweet_name <chr>,
## #   retweet_followers_count <int>, retweet_friends_count <int>,
## #   retweet_statuses_count <int>, retweet_location <chr>,
## #   retweet_description <chr>, retweet_verified <lgl>, place_url <chr>,
## #   place_name <chr>, place_full_name <chr>, place_type <chr>,
## #   country <chr>, country_code <chr>, geo_coords <list>,
## #   coords_coords <list>, bbox_coords <list>, status_url <chr>,
## #   name <chr>, location <chr>, description <chr>, url <chr>,
## #   protected <lgl>, followers_count <int>, friends_count <int>,
## #   listed_count <int>, statuses_count <int>, favourites_count <int>,
## #   account_created_at <dttm>, verified <lgl>, profile_url <chr>,
## #   profile_expanded_url <chr>, account_lang <lgl>,
## #   profile_banner_url <chr>, profile_background_url <chr>,
## #   profile_image_url <chr>
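#summarize the #maine tweets by posting platform (source)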
me_platform <- mt %>% group_by(source) %>% 
  summarize(n = n()) %>% 
  mutate(percent_of_tweets = n/sum(n)) %>%
  arrange(desc(n))
me_platform %>% slice(1:10)
## # A tibble: 10 x 3
##    source                  n percent_of_tweets
##    <chr>               <int>             <dbl>
##  1 Hootsuite Inc.        808            0.254 
##  2 Twitter Web App       557            0.175 
##  3 Twitter for iPhone    379            0.119 
##  4 Twitter for Android   263            0.0828
##  5 TweetDeck             189            0.0595
##  6 Instagram             177            0.0557
##  7 Tweet Suite           170            0.0535
##  8 Buffer                 86            0.0271
##  9 IFTTT                  52            0.0164
## 10 Twitter Web Client     52            0.0164
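# optional shortcut (same result, shown here as a sketch): dplyr::count()
# collapses the group_by()/summarize()/arrange() steps into a single call
mt %>% count(source, sort = TRUE) %>% mutate(percent_of_tweets = n / sum(n))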
mt %>%
  group_by(screen_name) %>% 
  summarize(n = n()) %>%
  mutate(percent_of_tweets = n/sum(n)) %>%
  arrange(desc(n)) %>% 
  slice(1:10)
## # A tibble: 10 x 3
##    screen_name         n percent_of_tweets
##    <chr>           <int>             <dbl>
##  1 conpsweeney       378            0.119 
##  2 jhhayman          186            0.0586
##  3 MaineTweetz       170            0.0535
##  4 BasketIsOysters   113            0.0356
##  5 rick03907          55            0.0173
##  6 AltThisMoFo        51            0.0161
##  7 OnlineSentinel     48            0.0151
##  8 melivingcom        44            0.0139
##  9 PulpNews           43            0.0135
## 10 yuckf001           38            0.0120
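#tokenize the tweet text with a custom regex that keeps #hashtags and @mentions intact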
reg <- "([^A-Za-z\\d#@']|'(?![A-Za-z\\d#@]))"
maine_words <- mt %>% select(status_id, text) %>%
  filter(!str_detect(text, '^"')) %>%
  mutate(text = str_replace_all(text, "https://t.co/[A-Za-z\\d]+|&amp;", "")) %>%
  unnest_tokens(word, text, token = "regex", pattern = reg) %>%
  filter(!word %in% stop_words$word,
         str_detect(word, "[a-z]"))
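# quick sanity check on the tokenizer (illustrative example, not part of the analysis):
# splitting on `reg` keeps "#maine" and "@pressherald" as single tokens
tibble(text = "Loving the coast of #Maine, thanks @pressherald!") %>%
  unnest_tokens(word, text, token = "regex", pattern = reg)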

maine_words %>% 
  group_by(word) %>% 
  summarize(n = n()) %>% 
  arrange(desc(n)) %>% 
  top_n(20)
## Selecting by n
## # A tibble: 20 x 2
##    word               n
##    <chr>          <int>
##  1 #maine          3163
##  2 girl             541
##  3 #murdermystery   368
##  4 maine            328
##  5 bridge           275
##  6 glass            275
##  7 @pressherald     232
##  8 including        207
##  9 books            204
## 10 author           201
## 11 challenge        199
## 12 sets             199
## 13 obsession        197
## 14 shown            197
## 15 @jhhayman        196
## 16 deft             196
## 17 earlier          196
## 18 fatal            196
## 19 plotting         196
## 20 prowess          196
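#load the NRC lexicon of word-sentiment associations from tidytext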
nrc <- get_sentiments("nrc") %>%
  select(word, sentiment)
head(nrc)
## # A tibble: 6 x 2
##   word      sentiment
##   <chr>     <chr>    
## 1 abacus    trust    
## 2 abandon   fear     
## 3 abandon   negative 
## 4 abandon   sadness  
## 5 abandoned anger    
## 6 abandoned fear
maine_words_sentiments <- maine_words %>% inner_join(nrc, by = "word")

maine_words_sentiments %>% 
  group_by(sentiment) %>% 
  summarize(n = n()) %>% 
  arrange(desc(n))
## # A tibble: 10 x 2
##    sentiment        n
##    <chr>        <int>
##  1 positive      2720
##  2 negative      1601
##  3 trust         1504
##  4 anticipation  1205
##  5 joy           1110
##  6 fear          1098
##  7 anger         1061
##  8 sadness        970
##  9 surprise       469
## 10 disgust        394
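#peek at a few tweets that contributed positive sentiment words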
pos_tw_ids <- maine_words_sentiments %>% filter(sentiment == "positive") %>% distinct(status_id)
mt %>% inner_join(pos_tw_ids, by = "status_id") %>% select(text) %>% slice(1:10)
## # A tibble: 10 x 1
##    text                                                                    
##    <chr>                                                                   
##  1 "#Maine author @jhhayman has shown his deft plotting prowess in earlier~
##  2 "#Maine author @jhhayman has shown his deft plotting prowess in earlier~
##  3 "#Maine author @jhhayman has shown his deft plotting prowess in earlier~
##  4 "#Maine author @jhhayman has shown his deft plotting prowess in earlier~
##  5 "#Maine author @jhhayman has shown his deft plotting prowess in earlier~
##  6 "#Maine author @jhhayman has shown his deft plotting prowess in earlier~
##  7 "#Maine author @jhhayman has shown his deft plotting prowess in earlier~
##  8 "#Maine author @jhhayman has shown his deft plotting prowess in earlier~
##  9 "#Maine author @jhhayman has shown his deft plotting prowess in earlier~
## 10 "#Maine author @jhhayman has shown his deft plotting prowess in earlier~
disg_tw_ids <- maine_words_sentiments %>% filter(sentiment == "disgust") %>% distinct(status_id, word)
mt %>% inner_join(disg_tw_ids, by = "status_id") %>% select(text, word) %>% slice(1:10)
## # A tibble: 10 x 2
##    text                                                              word  
##    <chr>                                                             <chr> 
##  1 The Cutting tells about a #serialkiller dealing with #illegalorg~ cutti~
##  2 The Cutting tells about a #serialkiller dealing with #illegalorg~ cutti~
##  3 The Cutting tells about a #serialkiller dealing with #illegalorg~ cutti~
##  4 The Cutting tells about a #serialkiller dealing with #illegalorg~ cutti~
##  5 The Cutting tells about a #serialkiller dealing with #illegalorg~ cutti~
##  6 The Cutting tells about a #serialkiller dealing with #illegalorg~ cutti~
##  7 The Cutting tells about a #serialkiller dealing with #illegalorg~ cutti~
##  8 The Cutting tells about a #serialkiller dealing with #illegalorg~ cutti~
##  9 The Cutting tells about a #serialkiller dealing with #illegalorg~ cutti~
## 10 The Cutting tells about a #serialkiller dealing with #illegalorg~ cutti~
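# the first rows are one account repeating the same promotional tweet; dropping
# duplicate texts (optional) surfaces more variety
mt %>% inner_join(disg_tw_ids, by = "status_id") %>%
  distinct(text, .keep_all = TRUE) %>% select(text, word) %>% slice(1:10)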
vt <- search_tweets('#Vermont', n = num_tweets, include_rts = FALSE)

#show the platform
vt_platform <- vt %>% group_by(source) %>% 
                summarize(n = n()) %>% 
                mutate(percent_of_tweets = n / sum(n)) %>% 
                arrange(desc(n))

#extract the words and join to nrc sentiment words
vt_words <- vt %>% select(status_id, text) %>%
  filter(!str_detect(text, '^"')) %>%
  mutate(text = str_replace_all(text, "https://t.co/[A-Za-z\\d]+|&amp;", "")) %>%
  unnest_tokens(word, text, token = "regex", pattern = reg) %>%
  filter(!word %in% stop_words$word,
         str_detect(word, "[a-z]"))
vt_words_sentiments <- vt_words %>% inner_join(nrc, by = "word")
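#label each table with its state so the two hashtags can be combined and compared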
me_platform$state <- "Maine"
vt_platform$state <- "Vermont"
maine_words_sentiments$state <- "Maine"
vt_words_sentiments$state <- "Vermont"
platform <- rbind(me_platform, vt_platform)
words_sentiments <- rbind(maine_words_sentiments, vt_words_sentiments)
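# (bind_rows() from dplyr would also work here and is more forgiving if the column sets ever differ)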
pf <- c("Twitter Web Client", "Twitter for iPhone", "Instagram", "Hootsuite Inc.", "Post Planner Inc.")
pf_df <- platform %>% filter(source %in% pf)
ggplot(pf_df, aes(x = source, y = percent_of_tweets, fill = state)) + 
  geom_bar(stat = "identity", position = "dodge") +
  xlab("Platform") +
  ylab("Percent of tweets") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

sent_df <- words_sentiments %>% 
  group_by(state, sentiment) %>% 
  summarize(n = n()) %>%
  mutate(frequency = n/sum(n))

ggplot(sent_df, aes(x = sentiment, y = frequency, fill = state)) + 
  geom_bar(stat = "identity", position = "dodge") +
  xlab("Sentiment") +
  ylab("Share of sentiment words") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))