load tweets and convert to data frame
# Request up to 3200 statuses from the @realDonaldTrump timeline.
trump_tweets <- userTimeline("realDonaldTrump", n = 3200)

# Turn each returned status into a one-row data frame, row-bind them all,
# and wrap the result as a tbl_df.
trump_tweets_df <- trump_tweets %>%
  map_df(as.data.frame) %>%
  tbl_df()
extract tweet source from the data frame
# Keep the columns used downstream and reduce the raw `statusSource` value
# to the device name captured after "Twitter for ". Rows whose source is
# neither iPhone nor Android are discarded.
tweets <- trump_tweets_df %>%
  select(id, statusSource, text, created) %>%
  extract(statusSource, into = "source", regex = "Twitter for (.*?)<") %>%
  filter(source %in% c("iPhone", "Android"))

# Preview the first ten rows.
tweets %>%
  head(10) %>%
  kable()
| id | source | text | created |
|---|---|---|---|
| 797455295928791040 | Android | This will prove to be a great time in the lives of ALL Americans. We will unite and we will win, win, win! | 2016-11-12 15:05:33 |
| 797098212599496704 | iPhone | Today we express our deepest gratitude to all those who have served in our armed forces. #ThankAVet https://t.co/wPk7QWpK8Z | 2016-11-11 15:26:37 |
| 797069763801387008 | Android | Busy day planned in New York. Will soon be making some very important decisions on the people who will be running our government! | 2016-11-11 13:33:35 |
| 797034721075228672 | Android | Love the fact that the small groups of protesters last night have passion for our great country. We will all come together and be proud! | 2016-11-11 11:14:20 |
| 796900183955095552 | Android | Just had a very open and successful presidential election. Now professional protesters, incited by the media, are protesting. Very unfair! | 2016-11-11 02:19:44 |
| 796897928048766976 | Android | A fantastic day in D.C. Met with President Obama for first time. Really good meeting, great chemistry. Melania liked Mrs. O a lot! | 2016-11-11 02:10:46 |
| 796797436752707585 | iPhone | Happy 241st birthday to the U.S. Marine Corps! Thank you for your service!! https://t.co/Lz2dhrXzo4 | 2016-11-10 19:31:27 |
| 796315640307060738 | Android | Such a beautiful and important evening! The forgotten man and woman will never be forgotten again. We will all come together as never before | 2016-11-09 11:36:58 |
| 796182637622816768 | iPhone | Watching the returns at 9:45pm. #ElectionNight #MAGA<ed><U+00A0> | |
| 796126077647196160 | iPhone | Still time to #VoteTrump! #iVoted #ElectionNight https://t.co/UZtYAY1Ba6 | 2016-11-08 23:03:42 |
chart by time of day
# Share of each device's tweets posted in each hour of the day.
# `created` is shifted to the "EST" time zone before the hour is extracted.
tweets %>%
  count(source, hour = hour(with_tz(created, "EST"))) %>%
  # Group explicitly so `percent` is a share within each device. Without
  # this the result depends on whether count() leaves its output grouped,
  # which differs between dplyr versions.
  group_by(source) %>%
  mutate(percent = n / sum(n)) %>%
  ungroup() %>%
  ggplot(aes(hour, percent, color = source)) +
  geom_line() +
  scale_y_continuous(labels = percent_format()) +
  labs(x = "Hour of the day (EST)", y = "% of tweets", color = "")
chart by picture/link
# Count tweets per device, split by whether the text contains a t.co link.
# Tweets whose text starts with a double quote are excluded first.
tweet_picture_counts <- tweets %>%
  filter(!str_detect(text, '^"')) %>%
  count(
    source,
    # fixed() matches the literal string "t.co". As a plain regex the "."
    # is a wildcard, so ordinary text such as "great country" ("t co")
    # would be misclassified as containing a link.
    picture = ifelse(
      str_detect(text, fixed("t.co")),
      "Picture/link",
      "No picture/link"
    )
  )

# Side-by-side bars: number of tweets with and without a picture/link.
ggplot(tweet_picture_counts, aes(source, n, fill = picture)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(x = "", y = "Number of tweets", fill = "")
# Token delimiter: any character that cannot belong to a word, hashtag or
# @mention, or an apostrophe that is NOT followed by such a character.
# The second alternative must be a negative lookahead "(?!...)"; the earlier
# spelling "('?!...)" parsed as an ordinary group containing a literal "!",
# so apostrophes at word edges were never treated as delimiters.
reg <- "([^A-Za-z\\d#@']|'(?![A-Za-z\\d#@]))"

# One row per word per tweet.
tweet_words <- tweets %>%
  # Drop tweets whose text starts with a double quote.
  filter(!str_detect(text, '^"')) %>%
  # Remove t.co links and the HTML entity for "&". Stripping only the bare
  # "&" would leave "amp;" behind and produce a bogus "amp" token.
  mutate(
    text = str_replace_all(text, "https://t\\.co/[A-Za-z\\d]+|&amp;", "")
  ) %>%
  unnest_tokens(word, text, token = "regex", pattern = reg) %>%
  # Discard stop words and tokens that contain no lowercase letter.
  filter(!word %in% stop_words$word, str_detect(word, "[a-z]"))

kable(head(tweet_words))
| id | source | created | word |
|---|---|---|---|
| 764045345332396032 | Android | 2016-08-12 10:26:20 | ratings |
| 764045345332396032 | Android | 2016-08-12 10:26:20 | challenged |
| 764045345332396032 | Android | 2016-08-12 10:26:20 | @cnn |
| 764045345332396032 | Android | 2016-08-12 10:26:20 | reports |
| 764045345332396032 | Android | 2016-08-12 10:26:20 | call |
| 764045345332396032 | Android | 2016-08-12 10:26:20 | president |
# Log2 ratio of Android to iPhone usage for every word used at least five
# times across both devices.
android_iphone_ratios <- tweet_words %>%
  count(word, source) %>%
  # Group by word explicitly so sum(n) is the per-word total. On ungrouped
  # data sum(n) is the grand total and the filter would keep every word.
  group_by(word) %>%
  filter(sum(n) >= 5) %>%
  ungroup() %>%
  # One row per word with an Android and an iPhone count column.
  spread(source, n, fill = 0) %>%
  mutate(
    # Add 1 to every count before normalising each column to proportions,
    # so a word absent from one device does not yield a zero or infinite
    # ratio.
    Android = (Android + 1) / sum(Android + 1),
    iPhone = (iPhone + 1) / sum(iPhone + 1),
    logratio = log2(Android / iPhone)
  ) %>%
  arrange(desc(logratio))

# `digits` belongs to kable(); passed to head() it is silently ignored.
kable(head(android_iphone_ratios), digits = 2)
| word | Android | iPhone | logratio |
|---|---|---|---|
| exciting | 0.0183486 | 0.0009116 | 4.331163 |
| wow | 0.0214067 | 0.0018232 | 3.553556 |
| paul | 0.0183486 | 0.0018232 | 3.331164 |
| ryan | 0.0183486 | 0.0018232 | 3.331164 |
| @cnn | 0.0214067 | 0.0027347 | 2.968593 |
| media | 0.0214067 | 0.0036463 | 2.553556 |
Visualization of most common words
# Horizontal bar chart of the 15 largest-magnitude log ratios on each side
# of zero. Bars are filled by the sign of the log ratio.
android_iphone_ratios %>%
  group_by(logratio > 0) %>%
  top_n(15, abs(logratio)) %>%
  ungroup() %>%
  # Order the bars by log ratio.
  mutate(word = reorder(word, logratio)) %>%
  ggplot(aes(x = word, y = logratio, fill = logratio < 0)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(
    name = "",
    labels = c("Android", "iPhone"),
    values = c("red", "lightblue")
  ) +
  labs(y = "Android/ iPhone log ratio") +
  coord_flip()
## Warning: Stacking not well defined when ymin != 0
Sentiments
# Word-to-sentiment lookup restricted to rows whose lexicon is "nrc".
nrc <- sentiments %>%
  filter(lexicon == "nrc") %>%
  select(word, sentiment)

# Preview the first rows of the lookup.
nrc %>%
  head() %>%
  kable()
| word | sentiment |
|---|---|
| abacus | trust |
| abandon | fear |
| abandon | negative |
| abandon | sadness |
| abandoned | anger |
| abandoned | fear |
Word Count
# One row per tweet id, carrying its device and the total number of word
# rows that `tweet_words` holds for that device.
sources <- tweet_words %>%
  group_by(source) %>%
  mutate(total_words = n()) %>%
  ungroup() %>%
  select(id, source, total_words) %>%
  distinct()
iPhone
# Number of words per sentiment in iPhone tweets, alongside the total
# number of iPhone words.
by_source_sentiment_iPh <- tweet_words %>%
  inner_join(nrc, by = "word") %>%
  count(sentiment, id) %>%
  ungroup() %>%
  # Add a zero-count row for every sentiment/tweet pair that did not occur.
  complete(sentiment, id, fill = list(n = 0)) %>%
  inner_join(sources) %>%
  filter(source == "iPhone") %>%
  group_by(source, sentiment, total_words) %>%
  summarise(words = sum(n)) %>%
  ungroup()
## Joining, by = "id"
# Percentage of all iPhone words that carry each sentiment. The ratio must
# be words / total_words; the inverse gives "words per sentiment word".
kable(
  head(by_source_sentiment_iPh) %>%
    mutate(percent_sentiment = 100 * words / total_words),
  digits = 2
)
| source | sentiment | total_words | words | percent_sentiment |
|---|---|---|---|---|
| iPhone | anger | 2212 | 90 | 24.58 |
| iPhone | anticipation | 2212 | 151 | 14.65 |
| iPhone | disgust | 2212 | 42 | 52.67 |
| iPhone | fear | 2212 | 113 | 19.58 |
| iPhone | joy | 2212 | 99 | 22.34 |
| iPhone | negative | 2212 | 150 | 14.75 |
Android
# Number of words per sentiment in Android tweets, alongside the total
# number of Android words.
by_source_sentiment_An <- tweet_words %>%
  inner_join(nrc, by = "word") %>%
  count(sentiment, id) %>%
  ungroup() %>%
  # Add a zero-count row for every sentiment/tweet pair that did not occur.
  complete(sentiment, id, fill = list(n = 0)) %>%
  inner_join(sources) %>%
  filter(source == "Android") %>%
  group_by(source, sentiment, total_words) %>%
  summarise(words = sum(n)) %>%
  ungroup()
## Joining, by = "id"
# Percentage of all Android words that carry each sentiment. The ratio must
# be words / total_words; the inverse gives "words per sentiment word".
kable(
  head(by_source_sentiment_An) %>%
    mutate(percent_sentiment = 100 * words / total_words),
  digits = 2
)
| source | sentiment | total_words | words | percent_sentiment |
|---|---|---|---|---|
| Android | anger | 536 | 31 | 17.29 |
| Android | anticipation | 536 | 41 | 13.07 |
| Android | disgust | 536 | 22 | 24.36 |
| Android | fear | 536 | 22 | 24.36 |
| Android | joy | 536 | 28 | 19.14 |
| Android | negative | 536 | 47 | 11.40 |
Join words to sentiments
# For each device and sentiment: the number of matching words (`words`)
# next to the device's total word count (`total_words`).
by_source_sentiment <- tweet_words %>%
  inner_join(nrc, by = "word") %>%
  count(sentiment, id) %>%
  ungroup() %>%
  # Add a zero-count row for every sentiment/tweet pair that did not occur.
  complete(sentiment, id, fill = list(n = 0)) %>%
  inner_join(sources) %>%
  group_by(source, sentiment, total_words) %>%
  summarise(words = sum(n)) %>%
  ungroup()
## Joining, by = "id"
by_source_sentiment %>%
  head() %>%
  kable()
| source | sentiment | total_words | words |
|---|---|---|---|
| Android | anger | 536 | 31 |
| Android | anticipation | 536 | 41 |
| Android | disgust | 536 | 22 |
| Android | fear | 536 | 22 |
| Android | joy | 536 | 28 |
| Android | negative | 536 | 47 |
Likelihood of a difference in word use between phones
# For every sentiment, run poisson.test() on the per-device word counts,
# using each device's total word count as the second argument, and tidy
# the result into one row.
# NOTE(review): which device ends up as the numerator depends on the row
# order of `by_source_sentiment` within each sentiment -- confirm it is
# Android first, as the plot label below assumes.
sentiment_differences <- by_source_sentiment %>%
  group_by(sentiment) %>%
  do(tidy(poisson.test(.$words, .$total_words)))

sentiment_differences %>%
  kable(digits = 2)
| sentiment | estimate | statistic | p.value | parameter | conf.low | conf.high | method | alternative |
|---|---|---|---|---|---|---|---|---|
| anger | 1.42 | 31 | 0.11 | 23.60 | 0.91 | 2.16 | Comparison of Poisson rates | two.sided |
| anticipation | 1.12 | 41 | 0.52 | 37.45 | 0.77 | 1.59 | Comparison of Poisson rates | two.sided |
| disgust | 2.16 | 22 | 0.01 | 12.48 | 1.23 | 3.70 | Comparison of Poisson rates | two.sided |
| fear | 0.80 | 22 | 0.39 | 26.33 | 0.48 | 1.28 | Comparison of Poisson rates | two.sided |
| joy | 1.17 | 28 | 0.50 | 24.77 | 0.74 | 1.79 | Comparison of Poisson rates | two.sided |
| negative | 1.29 | 47 | 0.13 | 38.43 | 0.91 | 1.81 | Comparison of Poisson rates | two.sided |
| positive | 1.16 | 65 | 0.30 | 57.74 | 0.87 | 1.54 | Comparison of Poisson rates | two.sided |
| sadness | 1.25 | 24 | 0.32 | 20.09 | 0.76 | 2.00 | Comparison of Poisson rates | two.sided |
| surprise | 1.27 | 21 | 0.35 | 17.36 | 0.74 | 2.11 | Comparison of Poisson rates | two.sided |
| trust | 1.15 | 45 | 0.38 | 40.18 | 0.81 | 1.61 | Comparison of Poisson rates | two.sided |
Visualization of word use likelihood
# Point estimate and confidence interval of the Android/iPhone rate ratio
# per sentiment, plotted as a relative increase.
sentiment_differences %>%
  ungroup() %>%
  mutate(
    # Order the sentiments along the axis by their estimate.
    sentiment = reorder(sentiment, estimate),
    # Shift the ratios so that 0 means "no difference". Written as a plain
    # mutate() in place of the deprecated mutate_each(funs(...)).
    estimate = estimate - 1,
    conf.low = conf.low - 1,
    conf.high = conf.high - 1
  ) %>%
  ggplot(aes(estimate, sentiment)) +
  geom_point() +
  geom_errorbarh(aes(xmin = conf.low, xmax = conf.high)) +
  scale_x_continuous(labels = percent_format()) +
  labs(x = "% increase in Android relative to iPhone", y = "Sentiment")
Most common words with their associated sentiments
# For each sentiment other than positive/negative, show the ten words with
# the largest absolute Android/iPhone log ratio, one facet per sentiment.
android_iphone_ratios %>%
  inner_join(nrc, by = "word") %>%
  filter(!sentiment %in% c("positive", "negative")) %>%
  # Order both facets and bars by decreasing log ratio.
  mutate(
    sentiment = reorder(sentiment, -logratio),
    word = reorder(word, -logratio)
  ) %>%
  group_by(sentiment) %>%
  top_n(10, abs(logratio)) %>%
  ungroup() %>%
  ggplot(aes(x = word, y = logratio, fill = logratio < 0)) +
  geom_bar(stat = "identity") +
  facet_wrap(~sentiment, scales = "free", nrow = 2) +
  scale_fill_manual(
    name = "",
    labels = c("Android", "iPhone"),
    values = c("red", "lightblue")
  ) +
  labs(x = "", y = "Android / iPhone log ratio") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
## Warning: Stacking not well defined when ymin != 0