Accessing Data from Twitter API using R (part1)
consumer_key <- ""
consumer_secret <- ""
access_token <- ""
access_secret <- ""
setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_secret)
## [1] "Using direct authentication"
trump <- userTimeline("realDonaldTrump", n=3200, includeRts=T)
obama <- userTimeline("BarackObama", n=3200, includeRts=T)
president.obama <- userTimeline("POTUS44", n=3200, includeRts=T)
length(trump)
## [1] 3195
length(obama)
## [1] 3196
length(president.obama)
## [1] 352
df.trump <- twListToDF(trump)
df.obama <- twListToDF(obama)
df.president.obama <- twListToDF(president.obama)
dim(df.trump)
## [1] 3195 16
dim(df.obama)
## [1] 3196 16
dim(df.president.obama)
## [1] 352 16
colnames(df.trump)
## [1] "text" "favorited" "favoriteCount" "replyToSN"
## [5] "created" "truncated" "replyToSID" "id"
## [9] "replyToUID" "statusSource" "screenName" "retweetCount"
## [13] "isRetweet" "retweeted" "longitude" "latitude"
tweets <- bind_rows(
df.trump %>% filter(isRetweet==F) %>%
select(text, screenName, created, retweetCount, favoriteCount),
df.obama %>% filter(isRetweet==F) %>%
select(text, screenName, created, retweetCount, favoriteCount),
df.president.obama %>% filter(isRetweet==F) %>%
select(text, screenName, created, retweetCount, favoriteCount))
ggplot(tweets, aes(x = created, fill = screenName)) +
geom_histogram(position = "identity", bins = 50, show.legend = FALSE) +
facet_wrap(~screenName, ncol = 1, scales = "free_y") +
ggtitle("Tweet Activity (Adaptive y-axis)")
ggplot(tweets, aes(x = created, fill = screenName)) +
geom_histogram(position = "identity", bins = 50, show.legend = FALSE) +
facet_wrap(~screenName, ncol = 1) +
ggtitle("Tweet Activity (Fixed y-axis)")
## Word frequencies
# replace_reg <- "https://t.co/[A-Za-z\\d]+|http://[A-Za-z\\d]+|&|<|>|RT|https"
replace_reg <- "http[s]?://[A-Za-z\\d/\\.]+|&|<|>"
unnest_reg <- "([^A-Za-z_\\d#@']|'(?![A-Za-z_\\d#@]))"
tidy_tweets <- tweets %>%
filter(!str_detect(text, "^RT")) %>%
mutate(text = str_replace_all(text, replace_reg, "")) %>%
mutate(id = row_number()) %>%
unnest_tokens(word, text, token = "regex", pattern = unnest_reg) %>%
filter(!word %in% stop_words$word, str_detect(word, "[a-z]"))
frequency <- tidy_tweets %>%
group_by(screenName) %>%
count(word, sort = TRUE) %>%
left_join(tidy_tweets %>%
group_by(screenName) %>%
summarise(total = n()), by="screenName") %>%
mutate(freq = n/total)
frequency
## # A tibble: 10,349 x 5
## # Groups: screenName [3]
## screenName word n total freq
## <chr> <chr> <int> <int> <dbl>
## 1 BarackObama president 1165 20498 0.0568
## 2 BarackObama obama 1029 20498 0.0502
## 3 BarackObama #actonclimate 270 20498 0.0132
## 4 realDonaldTrump people 218 23417 0.00931
## 5 BarackObama change 215 20498 0.0105
## 6 realDonaldTrump news 204 23417 0.00871
## 7 BarackObama climate 189 20498 0.00922
## 8 realDonaldTrump fake 177 23417 0.00756
## 9 realDonaldTrump america 167 23417 0.00713
## 10 BarackObama watch 164 20498 0.00800
## # ... with 10,339 more rows
frequency.spread <- frequency %>%
select(screenName, word, freq) %>%
spread(screenName, freq) %>%
arrange(desc(BarackObama), desc(realDonaldTrump))
frequency.spread
## # A tibble: 7,567 x 4
## word BarackObama POTUS44 realDonaldTrump
## <chr> <dbl> <dbl> <dbl>
## 1 president 0.0568 0.00281 0.00615
## 2 obama 0.0502 0.000351 0.00252
## 3 #actonclimate 0.0132 0.00105 NA
## 4 change 0.0105 0.00632 0.000897
## 5 climate 0.00922 0.00386 NA
## 6 watch 0.00800 NA 0.00218
## 7 health 0.00766 0.00597 0.000342
## 8 time 0.00737 0.00492 0.00538
## 9 #doyourjob 0.00717 NA NA
## 10 americans 0.00698 0.00738 0.00132
## # ... with 7,557 more rows
word_ratios <- tidy_tweets %>%
filter(screenName != "POTUS44") %>%
filter(!str_detect(word, "^@")) %>%
count(word, screenName) %>%
filter(sum(n) >= 10) %>%
ungroup() %>%
spread(screenName, n, fill = 0) %>%
mutate_if(is.numeric, funs((. + 1) / sum(. + 1))) %>%
mutate(logratio = log(realDonaldTrump / BarackObama)) %>%
arrange(desc(logratio))
word_ratios %>%
arrange(abs(logratio))
## # A tibble: 6,803 x 4
## word BarackObama realDonaldTrump logratio
## <chr> <dbl> <dbl> <dbl>
## 1 save 0.000407 0.000406 -0.00390
## 2 level 0.000704 0.000710 0.00918
## 3 political 0.000667 0.000676 0.0145
## 4 remarks 0.000630 0.000642 0.0203
## 5 decades 0.000296 0.000304 0.0269
## 6 november 0.000296 0.000304 0.0269
## 7 happy 0.00178 0.00172 -0.0303
## 8 american 0.00367 0.00355 -0.0321
## 9 christmas 0.000259 0.000270 0.0426
## 10 cost 0.000259 0.000270 0.0426
## # ... with 6,793 more rows
word_ratios %>%
group_by(logratio < 0) %>%
top_n(15, abs(logratio)) %>%
ungroup() %>%
mutate(word = reorder(word, logratio)) %>%
ggplot(aes(word, logratio, fill = logratio < 0)) +
geom_col(show.legend = FALSE) +
coord_flip() +
ylab("log odds ratio (realDonaldTrump/BarackObama)") +
scale_fill_discrete(name = "", labels = c("realDonaldTrump", "BarackObama"))
# Without hashtags
word_ratios %>%
filter(!str_detect(word, "^#")) %>%
group_by(logratio < 0) %>%
top_n(15, abs(logratio)) %>%
ungroup() %>%
mutate(word = reorder(word, logratio)) %>%
ggplot(aes(word, logratio, fill = logratio < 0)) +
geom_col(show.legend = FALSE) +
coord_flip() +
ylab("log odds ratio (realDonaldTrump/BarackObama)") +
scale_fill_discrete(name = "", labels = c("realDonaldTrump", "BarackObama"))
words_by_time <- tidy_tweets %>%
filter(!str_detect(word, "^@")) %>%
mutate(time_floor = floor_date(created, unit = "1 month")) %>%
count(time_floor, screenName, word) %>%
ungroup() %>%
group_by(screenName, time_floor) %>%
mutate(time_total = sum(n)) %>%
group_by(word) %>%
mutate(word_total = sum(n)) %>%
ungroup() %>%
rename(count = n) %>%
filter(word_total > 30)
words_by_time
## # A tibble: 6,270 x 6
## time_floor screenName word count time_total word_total
## <dttm> <chr> <chr> <int> <int> <int>
## 1 2014-08-01 00:00:00 BarackObama #actonclim… 1 91 273
## 2 2014-08-01 00:00:00 BarackObama address 1 91 108
## 3 2014-08-01 00:00:00 BarackObama affordable 1 91 41
## 4 2014-08-01 00:00:00 BarackObama agree 1 91 75
## 5 2014-08-01 00:00:00 BarackObama america 2 91 303
## 6 2014-08-01 00:00:00 BarackObama americans 1 91 195
## 7 2014-08-01 00:00:00 BarackObama care 1 91 134
## 8 2014-08-01 00:00:00 BarackObama celebrate 1 91 33
## 9 2014-08-01 00:00:00 BarackObama change 1 91 254
## 10 2014-08-01 00:00:00 BarackObama check 1 91 72
## # ... with 6,260 more rows
nested_data <- words_by_time %>%
filter(screenName != "BarackObama" | time_floor < ymd("20170101")) %>%
filter(screenName != "POTUS44") %>%
nest(-word, -screenName)
nested_data
## # A tibble: 473 x 3
## screenName word data
## <chr> <chr> <list>
## 1 BarackObama #actonclimate <tibble [28 × 4]>
## 2 BarackObama address <tibble [28 × 4]>
## 3 BarackObama affordable <tibble [14 × 4]>
## 4 BarackObama agree <tibble [21 × 4]>
## 5 BarackObama america <tibble [22 × 4]>
## 6 BarackObama americans <tibble [27 × 4]>
## 7 BarackObama care <tibble [23 × 4]>
## 8 BarackObama celebrate <tibble [14 × 4]>
## 9 BarackObama change <tibble [27 × 4]>
## 10 BarackObama check <tibble [25 × 4]>
## # ... with 463 more rows
nested_models <- nested_data %>%
mutate(models = map(data, ~ glm(
cbind(count, time_total) ~ time_floor, .,
family = "binomial")))
nested_models
## # A tibble: 473 x 4
## screenName word data models
## <chr> <chr> <list> <list>
## 1 BarackObama #actonclimate <tibble [28 × 4]> <S3: glm>
## 2 BarackObama address <tibble [28 × 4]> <S3: glm>
## 3 BarackObama affordable <tibble [14 × 4]> <S3: glm>
## 4 BarackObama agree <tibble [21 × 4]> <S3: glm>
## 5 BarackObama america <tibble [22 × 4]> <S3: glm>
## 6 BarackObama americans <tibble [27 × 4]> <S3: glm>
## 7 BarackObama care <tibble [23 × 4]> <S3: glm>
## 8 BarackObama celebrate <tibble [14 × 4]> <S3: glm>
## 9 BarackObama change <tibble [27 × 4]> <S3: glm>
## 10 BarackObama check <tibble [25 × 4]> <S3: glm>
## # ... with 463 more rows
slopes <- nested_models %>%
unnest(map(models, tidy)) %>%
filter(term == "time_floor") %>%
mutate(adjusted.p.value = p.adjust(p.value))
top_slopes <- slopes %>%
filter(adjusted.p.value < 0.1)
top_slopes
## # A tibble: 18 x 8
## screenName word term estimate std.error statistic p.value
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 BarackObama health time_… -2.64e⁻⁸ 4.90e⁻⁹ -5.39 7.01e⁻ ⁸
## 2 BarackObama obama time_… -1.27e⁻⁸ 1.71e⁻⁹ -7.46 8.98e⁻¹⁴
## 3 BarackObama president time_… -1.37e⁻⁸ 1.62e⁻⁹ -8.45 2.88e⁻¹⁷
## 4 BarackObama watch time_… -2.21e⁻⁸ 4.57e⁻⁹ -4.84 1.28e⁻ ⁶
## 5 BarackObama join time_… 2.91e⁻⁸ 6.60e⁻⁹ 4.41 1.02e⁻ ⁵
## 6 BarackObama leaders time_… 6.17e⁻⁸ 9.13e⁻⁹ 6.76 1.42e⁻¹¹
## 7 BarackObama court time_… 3.79e⁻⁸ 8.51e⁻⁹ 4.45 8.48e⁻ ⁶
## 8 BarackObama fair time_… 2.63e⁻⁸ 5.80e⁻⁹ 4.52 6.09e⁻ ⁶
## 9 BarackObama supreme time_… 3.46e⁻⁸ 8.16e⁻⁹ 4.23 2.31e⁻ ⁵
## 10 BarackObama immigrat… time_… -5.76e⁻⁸ 1.49e⁻⁸ -3.88 1.06e⁻ ⁴
## 11 BarackObama #immigra… time_… -8.53e⁻⁸ 1.56e⁻⁸ -5.47 4.49e⁻ ⁸
## 12 BarackObama senate time_… 4.39e⁻⁸ 7.58e⁻⁹ 5.79 7.15e⁻ ⁹
## 13 realDonaldT… election time_… -3.59e⁻⁸ 8.63e⁻⁹ -4.16 3.24e⁻ ⁵
## 14 realDonaldT… jobs time_… -3.01e⁻⁸ 7.97e⁻⁹ -3.78 1.56e⁻ ⁴
## 15 realDonaldT… watch time_… -7.11e⁻⁸ 1.38e⁻⁸ -5.17 2.39e⁻ ⁷
## 16 realDonaldT… win time_… -4.66e⁻⁸ 1.12e⁻⁸ -4.16 3.17e⁻ ⁵
## 17 realDonaldT… tax time_… 4.18e⁻⁸ 8.67e⁻⁹ 4.82 1.45e⁻ ⁶
## 18 realDonaldT… cuts time_… 5.62e⁻⁸ 1.44e⁻⁸ 3.90 9.79e⁻ ⁵
## # ... with 1 more variable: adjusted.p.value <dbl>
words_by_time %>%
inner_join(top_slopes, by = c("word", "screenName")) %>%
filter(screenName == "realDonaldTrump") %>%
ggplot(aes(time_floor, count/time_total, color = word)) +
geom_line(size = 1.3) +
labs(x = NULL, y = "Word frequency")
words_by_time %>% filter(time_floor < ymd("20170101")) %>%
inner_join(top_slopes, by = c("word", "screenName")) %>%
filter(screenName == "BarackObama") %>%
ggplot(aes(time_floor, count/time_total, color = word)) +
geom_line(size = 1.3) +
labs(x = NULL, y = "Word frequency")
totals <- tweets %>%
group_by(screenName) %>%
summarise(total_rts = sum(retweetCount))
totals
## # A tibble: 3 x 2
## screenName total_rts
## <chr> <dbl>
## 1 BarackObama 16775202
## 2 POTUS44 7772417
## 3 realDonaldTrump 57331788
word_by_rts <- tidy_tweets %>%
group_by(id, word, screenName) %>%
summarise(rts = first(retweetCount)) %>%
group_by(screenName, word) %>%
summarise(retweetCount = median(rts), uses = n()) %>%
left_join(totals, by="screenName") %>%
filter(retweetCount != 0) %>%
ungroup()
word_by_rts %>%
filter(uses >= 5) %>%
arrange(desc(retweetCount))
## # A tibble: 1,831 x 5
## screenName word retweetCount uses total_rts
## <chr> <chr> <dbl> <int> <dbl>
## 1 BarackObama victims 144338 5 16775202
## 2 POTUS44 michelle 109332 5 7772417
## 3 BarackObama michelle 65371 11 16775202
## 4 BarackObama thinking 53721 7 16775202
## 5 realDonaldTrump jong 48988 6 57331788
## 6 realDonaldTrump kim 48988 6 57331788
## 7 realDonaldTrump christmas 47429 7 57331788
## 8 realDonaldTrump jail 42338 5 57331788
## 9 realDonaldTrump bless 41404 9 57331788
## 10 realDonaldTrump protesters 41200 5 57331788
## # ... with 1,821 more rows
word_by_rts %>%
filter(uses >= 5) %>%
group_by(screenName) %>%
top_n(10, retweetCount) %>%
arrange(retweetCount) %>%
ungroup() %>%
mutate(word = reorder(paste(word, screenName, sep = "__"), retweetCount)) %>%
ungroup() %>%
ggplot(aes(word, retweetCount, fill = screenName)) +
geom_col(show.legend = FALSE) +
facet_wrap(~ screenName, scales = "free_y", ncol = 1) +
scale_x_discrete(labels = function(x) gsub("__.+$","", x)) +
coord_flip() +
labs(x = NULL,
y = "Median # of retweetCount for tweets containing each word")
totals <- tweets %>%
group_by(screenName) %>%
summarise(total_rts = sum(favoriteCount))
totals
## # A tibble: 3 x 2
## screenName total_rts
## <chr> <dbl>
## 1 BarackObama 60765485
## 2 POTUS44 19218549
## 3 realDonaldTrump 242484336
word_by_favs <- tidy_tweets %>%
group_by(id, word, screenName) %>%
summarise(favs = first(favoriteCount)) %>%
group_by(screenName, word) %>%
summarise(favoriteCount = median(favs), uses = n()) %>%
left_join(totals, by="screenName") %>%
filter(favoriteCount != 0) %>%
ungroup()
word_by_favs %>%
filter(uses >= 5) %>%
group_by(screenName) %>%
top_n(10, favoriteCount) %>%
arrange(favoriteCount) %>%
ungroup() %>%
mutate(word = reorder(paste(word, screenName, sep = "__"), favoriteCount)) %>%
ungroup() %>%
ggplot(aes(word, favoriteCount, fill = screenName)) +
geom_col(show.legend = FALSE) +
scale_x_discrete(labels = function(x) gsub("__.+$","", x)) +
facet_wrap(~ screenName, scales = "free_y", ncol = 1) +
coord_flip() +
labs(x = NULL,
y = "Median # of favoriteCount for tweets containing each word")