library(twitteR); library(ROAuth)
library(tidyverse); library(lubridate)
library(tidytext)
# ---- Load the tweet data ----
# Prefer the cached CSV export. Previously the script always ran a live
# 100000-tweet Twitter search and then immediately overwrote its result with
# the CSV contents, so the network call (and the credentials it needs) was
# required even though its output was discarded.
tweets_csv <- 'starbucks_tweets.csv'
if (file.exists(tweets_csv)) {
  df <- read_csv(tweets_csv)
} else {
  # Fallback: no cached file, so pull the tweets from the search API and add
  # a calendar-date column derived from the `created` timestamp.
  tw <- searchTwitter('starbucks coffee', n=100000)
  df <- twListToDF(tw) %>%
    mutate(Date = date(created))
}
## Rows: 16745 Columns: 17
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (4): text, replyToSN, statusSource, screenName
## dbl (7): favoriteCount, replyToSID, id, replyToUID, retweetCount, longitude...
## lgl (4): favorited, truncated, isRetweet, retweeted
## dttm (1): created
## date (1): Date
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print the tibble to preview its first rows and column types.
df
## # A tibble: 16,745 x 17
## text favor~1 favor~2 reply~3 created trunc~4 replyT~5 id
## <chr> <lgl> <dbl> <chr> <dttm> <lgl> <dbl> <dbl>
## 1 "@peele~ FALSE 0 peeler~ 2022-10-01 05:45:31 FALSE 1.58e18 1.58e18
## 2 "coffee~ FALSE 0 <NA> 2022-10-01 05:44:22 FALSE NA 1.58e18
## 3 "RT @Mo~ FALSE 0 <NA> 2022-10-01 05:43:38 FALSE NA 1.58e18
## 4 "RT @sk~ FALSE 0 <NA> 2022-10-01 05:41:57 FALSE NA 1.58e18
## 5 "@theho~ FALSE 0 thehof~ 2022-10-01 05:40:29 FALSE 1.58e18 1.58e18
## 6 "RT @th~ FALSE 0 <NA> 2022-10-01 05:40:27 FALSE NA 1.58e18
## 7 "@Xia_L~ FALSE 0 Xia_La~ 2022-10-01 05:38:19 TRUE 1.58e18 1.58e18
## 8 "RT @Kr~ FALSE 0 <NA> 2022-10-01 05:37:42 FALSE NA 1.58e18
## 9 "RT @Ke~ FALSE 0 <NA> 2022-10-01 05:36:34 FALSE NA 1.58e18
## 10 "We wil~ FALSE 0 <NA> 2022-10-01 05:36:26 TRUE NA 1.58e18
## # ... with 16,735 more rows, 9 more variables: replyToUID <dbl>,
## # statusSource <chr>, screenName <chr>, retweetCount <dbl>, isRetweet <lgl>,
## # retweeted <lgl>, longitude <dbl>, latitude <dbl>, Date <date>, and
## # abbreviated variable names 1: favorited, 2: favoriteCount, 3: replyToSN,
## # 4: truncated, 5: replyToSID
# Extract the tweet bodies (the `text` column) as a standalone vector.
tweets_text <- df[["text"]]
# Show how many tweets were extracted.
length(tweets_text)
## [1] 16745
# One row per tweet, numbered by position. seq_along() derives the index from
# the data itself instead of a hard-coded row count, so this keeps working
# when the number of tweets changes (including zero tweets).
tweets_tbl <- tibble(line = seq_along(tweets_text), text = tweets_text)

# Custom stop-word list, tagged with the lexicon name "twitter".
# NOTE: it is only defined here; the tokenisation step below does not apply it.
my_stop_words <- tibble(
  word = c("https", "t.co", "rt", "amp", "rstats", "gt"),
  lexicon = "twitter"
)

# Tokenise: split each tweet's text into one row per token in column `word`.
tidy_words <- tweets_tbl %>%
  unnest_tokens(word, text)

# Token frequencies, most common first.
tidy_words %>%
  count(word, sort = TRUE)
## # A tibble: 22,086 x 2
## word n
## <chr> <int>
## 1 u 72066
## 2 starbucks 14091
## 3 coffee 13594
## 4 rt 9151
## 5 https 7070
## 6 t.co 6993
## 7 the 6728
## 8 i 5972
## 9 to 5767
## 10 a 5335
## # ... with 22,076 more rows
# Extra terms to score as positive in addition to the Bing lexicon.
buzcom <- c('simple', 'minimalist', 'visible', 'direct')
# One 'positive' label per extra term. The length is derived from buzcom so
# the two vectors cannot drift out of sync when terms are added or removed.
pos <- rep('positive', length(buzcom))

# Positive-only lexicon: Bing entries plus the extra terms above.
positive <- get_sentiments('bing') %>%
  add_row(word = buzcom, sentiment = pos) %>%
  filter(sentiment == 'positive')

# Keep only tokens that appear in the positive lexicon and rank them by
# frequency. The join key is left implicit (the shared `word` column).
tidy_words %>%
  semi_join(positive) %>%
  count(word, sort = TRUE)
## Joining, by = "word"
## # A tibble: 446 x 2
## word n
## <chr> <int>
## 1 like 945
## 2 support 793
## 3 better 660
## 4 premier 596
## 5 greatest 591
## 6 sweet 570
## 7 free 373
## 8 led 363
## 9 happy 360
## 10 win 340
## # ... with 436 more rows
# Full sentiment lexicon: Bing entries plus the custom positive terms.
# distinct() drops exact duplicate (word, sentiment) rows, so a custom term
# that already exists in Bing with the same label is not matched twice by the
# inner join below (which would double its count).
bing <- get_sentiments('bing') %>%
  add_row(word = buzcom, sentiment = pos) %>%
  distinct()

# Count how often each lexicon word occurs in the tweets, per sentiment label,
# most frequent first. The join key is left implicit (the shared `word` column).
bing_word_counts <- tidy_words %>%
  inner_join(bing) %>%
  count(word, sentiment, sort = TRUE)
## Joining, by = "word"
# Horizontal bar chart of the words occurring more than 150 times.
# Counts for negative words are negated so they extend in the opposite
# direction from positive ones; bars are ordered by the signed count.
bing_word_counts %>%
  filter(n > 150) %>%
  mutate(
    n = ifelse(sentiment == "negative", -n, n),
    word = reorder(word, n)
  ) %>%
  ggplot(mapping = aes(x = word, y = n, fill = sentiment)) +
  geom_col() +
  coord_flip() +
  labs(y = "Contribution to sentiment")