Getting the data and distribution of tweets

Accessing Data from Twitter API using R (part1)

consumer_key <- ""
consumer_secret <- ""
access_token <- ""
access_secret <- ""
setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_secret)
## [1] "Using direct authentication"
trump <- userTimeline("realDonaldTrump", n=3200, includeRts=T)
obama <- userTimeline("BarackObama", n=3200, includeRts=T)
president.obama <- userTimeline("POTUS44", n=3200, includeRts=T)
length(trump)
## [1] 3195
length(obama)
## [1] 3196
length(president.obama)
## [1] 352
df.trump <- twListToDF(trump)
df.obama <- twListToDF(obama)
df.president.obama <- twListToDF(president.obama)
dim(df.trump)
## [1] 3195   16
dim(df.obama)
## [1] 3196   16
dim(df.president.obama)
## [1] 352  16
colnames(df.trump)
##  [1] "text"          "favorited"     "favoriteCount" "replyToSN"    
##  [5] "created"       "truncated"     "replyToSID"    "id"           
##  [9] "replyToUID"    "statusSource"  "screenName"    "retweetCount" 
## [13] "isRetweet"     "retweeted"     "longitude"     "latitude"
tweets <- bind_rows(
  df.trump %>% filter(isRetweet==F) %>%
    select(text, screenName, created, retweetCount, favoriteCount),
  df.obama %>% filter(isRetweet==F) %>%
    select(text, screenName, created, retweetCount, favoriteCount),
  df.president.obama %>% filter(isRetweet==F) %>%
    select(text, screenName, created, retweetCount, favoriteCount))
ggplot(tweets, aes(x = created, fill = screenName)) +
  geom_histogram(position = "identity", bins = 50, show.legend = FALSE) +
  facet_wrap(~screenName, ncol = 1, scales = "free_y") + 
  ggtitle("Tweet Activity (Adaptive y-axis)")

ggplot(tweets, aes(x = created, fill = screenName)) +
  geom_histogram(position = "identity", bins = 50, show.legend = FALSE) +
  facet_wrap(~screenName, ncol = 1) + 
  ggtitle("Tweet Activity (Fixed y-axis)")

## Word frequencies

# replace_reg <- "https://t.co/[A-Za-z\\d]+|http://[A-Za-z\\d]+|&amp;|&lt;|&gt;|RT|https"
replace_reg <- "http[s]?://[A-Za-z\\d/\\.]+|&amp;|&lt;|&gt;"
unnest_reg <- "([^A-Za-z_\\d#@']|'(?![A-Za-z_\\d#@]))"
tidy_tweets <- tweets %>% 
  filter(!str_detect(text, "^RT")) %>%
  mutate(text = str_replace_all(text, replace_reg, "")) %>%
  mutate(id = row_number()) %>%
  unnest_tokens(word, text, token = "regex", pattern = unnest_reg) %>%
  filter(!word %in% stop_words$word, str_detect(word, "[a-z]"))

frequency <- tidy_tweets %>% 
  group_by(screenName) %>% 
  count(word, sort = TRUE) %>% 
  left_join(tidy_tweets %>% 
              group_by(screenName) %>% 
              summarise(total = n()), by="screenName") %>%
  mutate(freq = n/total)
frequency
## # A tibble: 10,349 x 5
## # Groups:   screenName [3]
##    screenName      word              n total    freq
##    <chr>           <chr>         <int> <int>   <dbl>
##  1 BarackObama     president      1165 20498 0.0568 
##  2 BarackObama     obama          1029 20498 0.0502 
##  3 BarackObama     #actonclimate   270 20498 0.0132 
##  4 realDonaldTrump people          218 23417 0.00931
##  5 BarackObama     change          215 20498 0.0105 
##  6 realDonaldTrump news            204 23417 0.00871
##  7 BarackObama     climate         189 20498 0.00922
##  8 realDonaldTrump fake            177 23417 0.00756
##  9 realDonaldTrump america         167 23417 0.00713
## 10 BarackObama     watch           164 20498 0.00800
## # ... with 10,339 more rows
frequency.spread <- frequency %>% 
  select(screenName, word, freq) %>% 
  spread(screenName, freq) %>%
  arrange(desc(BarackObama), desc(realDonaldTrump))
frequency.spread
## # A tibble: 7,567 x 4
##    word          BarackObama   POTUS44 realDonaldTrump
##    <chr>               <dbl>     <dbl>           <dbl>
##  1 president         0.0568   0.00281         0.00615 
##  2 obama             0.0502   0.000351        0.00252 
##  3 #actonclimate     0.0132   0.00105        NA       
##  4 change            0.0105   0.00632         0.000897
##  5 climate           0.00922  0.00386        NA       
##  6 watch             0.00800 NA               0.00218 
##  7 health            0.00766  0.00597         0.000342
##  8 time              0.00737  0.00492         0.00538 
##  9 #doyourjob        0.00717 NA              NA       
## 10 americans         0.00698  0.00738         0.00132 
## # ... with 7,557 more rows

7.3 Comparing word usage

word_ratios <- tidy_tweets %>%
  filter(screenName != "POTUS44") %>%
  filter(!str_detect(word, "^@")) %>%
  count(word, screenName) %>%
  filter(sum(n) >= 10) %>%
  ungroup() %>%
  spread(screenName, n, fill = 0) %>%
  mutate_if(is.numeric, funs((. + 1) / sum(. + 1))) %>%
  mutate(logratio = log(realDonaldTrump / BarackObama)) %>%
  arrange(desc(logratio))
word_ratios %>% 
  arrange(abs(logratio))
## # A tibble: 6,803 x 4
##    word      BarackObama realDonaldTrump logratio
##    <chr>           <dbl>           <dbl>    <dbl>
##  1 save         0.000407        0.000406 -0.00390
##  2 level        0.000704        0.000710  0.00918
##  3 political    0.000667        0.000676  0.0145 
##  4 remarks      0.000630        0.000642  0.0203 
##  5 decades      0.000296        0.000304  0.0269 
##  6 november     0.000296        0.000304  0.0269 
##  7 happy        0.00178         0.00172  -0.0303 
##  8 american     0.00367         0.00355  -0.0321 
##  9 christmas    0.000259        0.000270  0.0426 
## 10 cost         0.000259        0.000270  0.0426 
## # ... with 6,793 more rows
word_ratios %>%
  group_by(logratio < 0) %>%
  top_n(15, abs(logratio)) %>%
  ungroup() %>%
  mutate(word = reorder(word, logratio)) %>%
  ggplot(aes(word, logratio, fill = logratio < 0)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  ylab("log odds ratio (realDonaldTrump/BarackObama)") +
  scale_fill_discrete(name = "", labels = c("realDonaldTrump", "BarackObama"))

# Without hashtags
word_ratios %>%
  filter(!str_detect(word, "^#")) %>%
  group_by(logratio < 0) %>%
  top_n(15, abs(logratio)) %>%
  ungroup() %>%
  mutate(word = reorder(word, logratio)) %>%
  ggplot(aes(word, logratio, fill = logratio < 0)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  ylab("log odds ratio (realDonaldTrump/BarackObama)") +
  scale_fill_discrete(name = "", labels = c("realDonaldTrump", "BarackObama"))

7.4 Changes in word use

words_by_time <- tidy_tweets %>%
  filter(!str_detect(word, "^@")) %>%
  mutate(time_floor = floor_date(created, unit = "1 month")) %>%
  count(time_floor, screenName, word) %>%
  ungroup() %>%
  group_by(screenName, time_floor) %>%
  mutate(time_total = sum(n)) %>%
  group_by(word) %>%
  mutate(word_total = sum(n)) %>%
  ungroup() %>%
  rename(count = n) %>%
  filter(word_total > 30)
words_by_time 
## # A tibble: 6,270 x 6
##    time_floor          screenName  word        count time_total word_total
##    <dttm>              <chr>       <chr>       <int>      <int>      <int>
##  1 2014-08-01 00:00:00 BarackObama #actonclim…     1         91        273
##  2 2014-08-01 00:00:00 BarackObama address         1         91        108
##  3 2014-08-01 00:00:00 BarackObama affordable      1         91         41
##  4 2014-08-01 00:00:00 BarackObama agree           1         91         75
##  5 2014-08-01 00:00:00 BarackObama america         2         91        303
##  6 2014-08-01 00:00:00 BarackObama americans       1         91        195
##  7 2014-08-01 00:00:00 BarackObama care            1         91        134
##  8 2014-08-01 00:00:00 BarackObama celebrate       1         91         33
##  9 2014-08-01 00:00:00 BarackObama change          1         91        254
## 10 2014-08-01 00:00:00 BarackObama check           1         91         72
## # ... with 6,260 more rows
nested_data <- words_by_time %>%
  filter(screenName != "BarackObama" | time_floor < ymd("20170101")) %>% 
  filter(screenName != "POTUS44") %>%
  nest(-word, -screenName) 
nested_data
## # A tibble: 473 x 3
##    screenName  word          data             
##    <chr>       <chr>         <list>           
##  1 BarackObama #actonclimate <tibble [28 × 4]>
##  2 BarackObama address       <tibble [28 × 4]>
##  3 BarackObama affordable    <tibble [14 × 4]>
##  4 BarackObama agree         <tibble [21 × 4]>
##  5 BarackObama america       <tibble [22 × 4]>
##  6 BarackObama americans     <tibble [27 × 4]>
##  7 BarackObama care          <tibble [23 × 4]>
##  8 BarackObama celebrate     <tibble [14 × 4]>
##  9 BarackObama change        <tibble [27 × 4]>
## 10 BarackObama check         <tibble [25 × 4]>
## # ... with 463 more rows
nested_models <- nested_data %>%
  mutate(models = map(data, ~ glm(
    cbind(count, time_total) ~ time_floor, ., 
  family = "binomial")))
nested_models
## # A tibble: 473 x 4
##    screenName  word          data              models   
##    <chr>       <chr>         <list>            <list>   
##  1 BarackObama #actonclimate <tibble [28 × 4]> <S3: glm>
##  2 BarackObama address       <tibble [28 × 4]> <S3: glm>
##  3 BarackObama affordable    <tibble [14 × 4]> <S3: glm>
##  4 BarackObama agree         <tibble [21 × 4]> <S3: glm>
##  5 BarackObama america       <tibble [22 × 4]> <S3: glm>
##  6 BarackObama americans     <tibble [27 × 4]> <S3: glm>
##  7 BarackObama care          <tibble [23 × 4]> <S3: glm>
##  8 BarackObama celebrate     <tibble [14 × 4]> <S3: glm>
##  9 BarackObama change        <tibble [27 × 4]> <S3: glm>
## 10 BarackObama check         <tibble [25 × 4]> <S3: glm>
## # ... with 463 more rows
slopes <- nested_models %>%
  unnest(map(models, tidy)) %>%
  filter(term == "time_floor") %>%
  mutate(adjusted.p.value = p.adjust(p.value))
top_slopes <- slopes %>% 
  filter(adjusted.p.value < 0.1)
top_slopes
## # A tibble: 18 x 8
##    screenName   word      term    estimate std.error statistic     p.value
##    <chr>        <chr>     <chr>      <dbl>     <dbl>     <dbl>       <dbl>
##  1 BarackObama  health    time_…  -2.64e⁻⁸   4.90e⁻⁹     -5.39    7.01e⁻ ⁸
##  2 BarackObama  obama     time_…  -1.27e⁻⁸   1.71e⁻⁹     -7.46    8.98e⁻¹⁴
##  3 BarackObama  president time_…  -1.37e⁻⁸   1.62e⁻⁹     -8.45    2.88e⁻¹⁷
##  4 BarackObama  watch     time_…  -2.21e⁻⁸   4.57e⁻⁹     -4.84    1.28e⁻ ⁶
##  5 BarackObama  join      time_…   2.91e⁻⁸   6.60e⁻⁹      4.41    1.02e⁻ ⁵
##  6 BarackObama  leaders   time_…   6.17e⁻⁸   9.13e⁻⁹      6.76    1.42e⁻¹¹
##  7 BarackObama  court     time_…   3.79e⁻⁸   8.51e⁻⁹      4.45    8.48e⁻ ⁶
##  8 BarackObama  fair      time_…   2.63e⁻⁸   5.80e⁻⁹      4.52    6.09e⁻ ⁶
##  9 BarackObama  supreme   time_…   3.46e⁻⁸   8.16e⁻⁹      4.23    2.31e⁻ ⁵
## 10 BarackObama  immigrat… time_…  -5.76e⁻⁸   1.49e⁻⁸     -3.88    1.06e⁻ ⁴
## 11 BarackObama  #immigra… time_…  -8.53e⁻⁸   1.56e⁻⁸     -5.47    4.49e⁻ ⁸
## 12 BarackObama  senate    time_…   4.39e⁻⁸   7.58e⁻⁹      5.79    7.15e⁻ ⁹
## 13 realDonaldT… election  time_…  -3.59e⁻⁸   8.63e⁻⁹     -4.16    3.24e⁻ ⁵
## 14 realDonaldT… jobs      time_…  -3.01e⁻⁸   7.97e⁻⁹     -3.78    1.56e⁻ ⁴
## 15 realDonaldT… watch     time_…  -7.11e⁻⁸   1.38e⁻⁸     -5.17    2.39e⁻ ⁷
## 16 realDonaldT… win       time_…  -4.66e⁻⁸   1.12e⁻⁸     -4.16    3.17e⁻ ⁵
## 17 realDonaldT… tax       time_…   4.18e⁻⁸   8.67e⁻⁹      4.82    1.45e⁻ ⁶
## 18 realDonaldT… cuts      time_…   5.62e⁻⁸   1.44e⁻⁸      3.90    9.79e⁻ ⁵
## # ... with 1 more variable: adjusted.p.value <dbl>
words_by_time %>%
  inner_join(top_slopes, by = c("word", "screenName")) %>%
  filter(screenName == "realDonaldTrump") %>%
  ggplot(aes(time_floor, count/time_total, color = word)) +
  geom_line(size = 1.3) +
  labs(x = NULL, y = "Word frequency")

words_by_time %>% filter(time_floor < ymd("20170101")) %>%
  inner_join(top_slopes, by = c("word", "screenName")) %>%
  filter(screenName == "BarackObama") %>%
  ggplot(aes(time_floor, count/time_total, color = word)) +
  geom_line(size = 1.3) +
  labs(x = NULL, y = "Word frequency")

7.5 Favorites and retweets

totals <- tweets %>%
  group_by(screenName) %>% 
  summarise(total_rts = sum(retweetCount)) 
totals
## # A tibble: 3 x 2
##   screenName      total_rts
##   <chr>               <dbl>
## 1 BarackObama      16775202
## 2 POTUS44           7772417
## 3 realDonaldTrump  57331788
word_by_rts <- tidy_tweets %>% 
  group_by(id, word, screenName) %>% 
  summarise(rts = first(retweetCount)) %>% 
  group_by(screenName, word) %>% 
  summarise(retweetCount = median(rts), uses = n()) %>%
  left_join(totals, by="screenName") %>%
  filter(retweetCount != 0) %>%
  ungroup()

word_by_rts %>% 
  filter(uses >= 5) %>%
  arrange(desc(retweetCount))
## # A tibble: 1,831 x 5
##    screenName      word       retweetCount  uses total_rts
##    <chr>           <chr>             <dbl> <int>     <dbl>
##  1 BarackObama     victims          144338     5  16775202
##  2 POTUS44         michelle         109332     5   7772417
##  3 BarackObama     michelle          65371    11  16775202
##  4 BarackObama     thinking          53721     7  16775202
##  5 realDonaldTrump jong              48988     6  57331788
##  6 realDonaldTrump kim               48988     6  57331788
##  7 realDonaldTrump christmas         47429     7  57331788
##  8 realDonaldTrump jail              42338     5  57331788
##  9 realDonaldTrump bless             41404     9  57331788
## 10 realDonaldTrump protesters        41200     5  57331788
## # ... with 1,821 more rows
word_by_rts %>%
  filter(uses >= 5) %>%
  group_by(screenName) %>%
  top_n(10, retweetCount) %>%
  arrange(retweetCount) %>%
  ungroup() %>%
  mutate(word = reorder(paste(word, screenName, sep = "__"), retweetCount)) %>%
  ungroup() %>%
  ggplot(aes(word, retweetCount, fill = screenName)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ screenName, scales = "free_y", ncol = 1) +
  scale_x_discrete(labels = function(x) gsub("__.+$","", x)) +  
  coord_flip() +
  labs(x = NULL, 
       y = "Median # of retweetCount for tweets containing each word")

totals <- tweets %>%
  group_by(screenName) %>% 
  summarise(total_rts = sum(favoriteCount)) 
totals
## # A tibble: 3 x 2
##   screenName      total_rts
##   <chr>               <dbl>
## 1 BarackObama      60765485
## 2 POTUS44          19218549
## 3 realDonaldTrump 242484336
word_by_favs <- tidy_tweets %>% 
  group_by(id, word, screenName) %>% 
  summarise(favs = first(favoriteCount)) %>% 
  group_by(screenName, word) %>% 
  summarise(favoriteCount = median(favs), uses = n()) %>%
  left_join(totals, by="screenName") %>%
  filter(favoriteCount != 0) %>%
  ungroup()
word_by_favs %>%
  filter(uses >= 5) %>%
  group_by(screenName) %>%
  top_n(10, favoriteCount) %>%
  arrange(favoriteCount) %>%
  ungroup() %>%
  mutate(word = reorder(paste(word, screenName, sep = "__"), favoriteCount)) %>%
  ungroup() %>%
  ggplot(aes(word, favoriteCount, fill = screenName)) +
  geom_col(show.legend = FALSE) +
  scale_x_discrete(labels = function(x) gsub("__.+$","", x)) +  
  facet_wrap(~ screenName, scales = "free_y", ncol = 1) +
  coord_flip() +
  labs(x = NULL, 
       y = "Median # of favoriteCount for tweets containing each word")