## [1] "Using direct authentication"
tweets %>%
count(source, hour = hour(with_tz(created, "EST"))) %>%
mutate(percent = n / sum(n)) %>%
ggplot(aes(hour, percent, color = source)) +
geom_line() +
scale_y_continuous(labels = percent_format()) +
labs(x = "Hour of day (EST)",
y = "% of tweets",
color = "")

tweet_picture_counts <- tweets %>%
filter(!str_detect(text, '^"')) %>%
count(source,
picture = ifelse(str_detect(text, "t.co"),
"Picture/link", "No picture/link"))
ggplot(tweet_picture_counts, aes(source, n, fill = picture)) +
geom_bar(stat = "identity", position = "dodge") +
labs(x = "", y = "Number of tweets", fill = "")

spr <- tweet_picture_counts %>%
spread(source, n) %>%
mutate_each(funs(. / sum(.)), Android, iPhone)
rr <- spr$iPhone[2] / spr$Android[2]
library(tidytext)
reg <- "([^A-Za-z\\d#@']|'(?![A-Za-z\\d#@]))"
tweet_words <- tweets %>%
filter(!str_detect(text, '^"')) %>%
mutate(text = str_replace_all(text, "https://t.co/[A-Za-z\\d]+|&", "")) %>%
unnest_tokens(word, text, token = "regex", pattern = reg) %>%
filter(!word %in% stop_words$word,
str_detect(word, "[a-z]"))
android_iphone_ratios <- tweet_words %>%
count(word, source) %>%
filter(sum(n) >= 5) %>%
spread(source, n, fill = 0) %>%
ungroup() %>%
mutate_each(funs((. + 1) / sum(. + 1)), -word) %>%
mutate(logratio = log2(Android / iPhone)) %>%
arrange(desc(logratio))
tweet_words %>%
count(word, sort = TRUE) %>%
head(20) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n)) +
geom_bar(stat = "identity") +
ylab("Occurrences") +
coord_flip()

android_iphone_ratios %>%
group_by(logratio > 0) %>%
top_n(15, abs(logratio)) %>%
ungroup() %>%
mutate(word = reorder(word, logratio)) %>%
ggplot(aes(word, logratio, fill = logratio < 0)) +
geom_bar(stat = "identity") +
coord_flip() +
ylab("Android / iPhone log ratio") +
scale_fill_manual(name = "", labels = c("Android", "iPhone"),
values = c("red", "lightblue"))

library(stringr)
tweets %>%
count(source,
quoted = ifelse(str_detect(text, '^"'), "Quoted", "Not quoted")) %>%
ggplot(aes(source, n, fill = quoted)) +
geom_bar(stat = "identity", position = "dodge") +
labs(x = "", y = "Number of tweets", fill = "") +
ggtitle('Whether tweets start with a quotation mark (")')

nrc <- sentiments %>%
filter(lexicon == "nrc") %>%
dplyr::select(word, sentiment)
sources <- tweet_words %>%
group_by(source) %>%
mutate(total_words = n()) %>%
ungroup() %>%
distinct(id, source, total_words)
by_source_sentiment <- tweet_words %>%
inner_join(nrc, by = "word") %>%
count(sentiment, id) %>%
ungroup() %>%
complete(sentiment, id, fill = list(n = 0)) %>%
inner_join(sources) %>%
group_by(source, sentiment, total_words) %>%
summarize(words = sum(n)) %>%
ungroup()
head(by_source_sentiment)
## Source: local data frame [6 x 4]
##
## source sentiment total_words words
## <chr> <chr> <int> <dbl>
## 1 Android anger 3555 225
## 2 Android anticipation 3555 239
## 3 Android disgust 3555 160
## 4 Android fear 3555 224
## 5 Android joy 3555 172
## 6 Android negative 3555 411
library(broom)
sentiment_differences <- by_source_sentiment %>%
group_by(sentiment) %>%
do(tidy(poisson.test(.$words, .$total_words)))
sentiment_differences
## Source: local data frame [10 x 9]
## Groups: sentiment [10]
##
## sentiment estimate statistic p.value parameter conf.low
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 anger 1.9351266 225 1.305956e-07 180.6824 1.4930214
## 2 anticipation 1.0213836 239 8.787136e-01 236.9605 0.8328856
## 3 disgust 2.5019818 160 7.939486e-09 120.8499 1.7822836
## 4 fear 1.3884872 224 4.526456e-03 198.4544 1.1011075
## 5 joy 1.0758522 172 5.856393e-01 167.0572 0.8420116
## 6 negative 1.8727583 411 8.522311e-12 332.9295 1.5504080
## 7 positive 1.1450491 451 8.147825e-02 427.7137 0.9827374
## 8 sadness 2.2312317 227 5.513881e-10 175.9432 1.6997497
## 9 surprise 0.9576843 103 8.185967e-01 104.8550 0.7035451
## 10 trust 1.0598673 305 5.553635e-01 297.9778 0.8833702
## Variables not shown: conf.high <dbl>, method <fctr>, alternative <fctr>.
library(scales)
sentiment_differences %>%
ungroup() %>%
mutate(sentiment = reorder(sentiment, estimate)) %>%
mutate_each(funs(. - 1), estimate, conf.low, conf.high) %>%
ggplot(aes(estimate, sentiment)) +
geom_point() +
geom_errorbarh(aes(xmin = conf.low, xmax = conf.high)) +
scale_x_continuous(labels = percent_format()) +
labs(x = "% increase in Android relative to iPhone",
y = "Sentiment")

android_iphone_ratios %>%
inner_join(nrc, by = "word") %>%
filter(!sentiment %in% c("positive", "negative")) %>%
mutate(sentiment = reorder(sentiment, -logratio),
word = reorder(word, -logratio)) %>%
group_by(sentiment) %>%
top_n(10, abs(logratio)) %>%
ungroup() %>%
ggplot(aes(word, logratio, fill = logratio < 0)) +
facet_wrap(~ sentiment, scales = "free", nrow = 2) +
geom_bar(stat = "identity") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
labs(x = "", y = "Android / iPhone log ratio") +
scale_fill_manual(name = "", labels = c("Android", "iPhone"),
values = c("red", "lightblue"))
