df <- read_rds(path = here::here("data_twitter", "data_processed", "covid_19.rds"))
total_tweets <- df %>%
count()
n_users <- df %>%
distinct(user_id) %>%
count()
Below are the basic statistics (total number of tweets, number of unique users and date):
Number of total tweets: 677001. Number of diferent users: 110698.
Distribution of bots/no_bots/unknown:
df %>%
distinct(user_id, .keep_all = T) %>%
count(is_bot) %>%
mutate(perct = n / sum(n)) %>%
gt() %>%
tab_options(
heading.title.font.size = 13,
column_labels.font.size = 11,
table.font.size = 11
) %>%
tab_header(
title = "Distribution of bots/no_bots/unknown:"
) %>%
fmt_number(
columns = vars(n),
decimals = 0
) %>%
fmt_percent(
columns = vars(perct),
decimals = 2,
)
| Distribution of bots/no_bots/unknown: | ||
|---|---|---|
| is_bot | n | perct |
| Bot | 601 | 0.54% |
| Bot_official | 5 | 0.00% |
| No Bot | 88,767 | 80.19% |
| Unknown | 21,325 | 19.26% |
Dates:
min(df$created_at)
## [1] "2020-03-16 00:51:30 UTC"
max(df$created_at)
## [1] "2020-03-23 18:45:30 UTC"
df %>%
mutate(user_id = as.character(user_id)) %>%
count(user_name, user_id, sort = T) %>%
slice(1:20) %>%
gt() %>%
tab_options(
heading.title.font.size = 13,
column_labels.font.size = 11,
table.font.size = 11
) %>%
tab_header(
title = "# of different tweets by user"
) %>%
fmt_number(
columns = vars(n),
decimals = 0
)
| # of different tweets by user | ||
|---|---|---|
| user_name | user_id | n |
| Coronavirus bot | 1235367745539248128 | 2,477 |
| covid19bot | 1234752234124124160 | 2,040 |
| Cyber Security Feed | 1131854274223366144 | 940 |
| 🚨#CoronaCamps R THE PLAN🚨NOT A DRILL🌹#NotMeUs | 1170258383506747392 | 645 |
| COVID-19 Real Time Numbers | 1237834300420173824 | 540 |
| NaFe065 | 1226454737542930432 | 481 |
| The Quint | 2982269822 | 450 |
| Hakkı Art. | 3086632433 | 439 |
| Sally Deal | 817825066993860608 | 392 |
| Shakthi Vadakkepat | 18936284 | 374 |
| NewsOnePlace.com | 3534222021 | 370 |
| A V | 94979487 | 353 |
| Hindusherni_ 🇮🇳 | 1236134854107860992 | 335 |
| Security Testing | 710123736175783936 | 317 |
| Real Talk with JAM Podcast | 1214084640916426752 | 305 |
| Mark Dominic | 856969871023890432 | 293 |
| Hindustan Times | 36327407 | 284 |
| IMRAN KHAN® NIYAZI™ SELECTED© | 788432343765626880 | 281 |
| Tonya | 287394890 | 276 |
| and the livin's easy | 132535895 | 269 |
df %>%
select(user_name, user_followers_count) %>%
arrange(desc(user_followers_count)) %>%
distinct(user_name, .keep_all = T) %>%
slice(1:20) %>%
gt() %>%
tab_options(
heading.title.font.size = 13,
column_labels.font.size = 11,
table.font.size = 11
) %>%
tab_header(
title = "Firts 20th users by # of followers"
) %>%
fmt_number(
columns = vars(user_followers_count),
decimals = 0
)
| Firts 20th users by # of followers | |
|---|---|
| user_name | user_followers_count |
| CGTN | 14,050,164 |
| Rachel Maddow MSNBC | 9,930,284 |
| Hindustan Times | 7,173,683 |
| People's Daily, China | 7,097,825 |
| Lonely Planet | 6,306,359 |
| ESPNcricinfo | 5,872,015 |
| Department of State | 5,690,004 |
| UK Prime Minister | 5,608,948 |
| American Red Cross | 5,320,293 |
| Alfie Deyes | 5,203,717 |
| MTV NEWS | 5,142,713 |
| ABS-CBN News Channel | 4,859,740 |
| Filmfare | 4,810,087 |
| Dr. Mehmet Oz | 4,082,376 |
| The Indian Express | 3,365,718 |
| Nic Nemeth | 2,842,804 |
| Kal Naga - أبوالنجا | 2,821,111 |
| Bakhtawar B-Zardari | 2,770,820 |
| Sahara Reporters | 2,697,417 |
| Naveen Patnaik | 2,647,178 |
library(scales)
df %>%
select(created_at) %>%
mutate(round_created_at_tweet = round_date(created_at, unit = "hour")) %>%
count(round_created_at_tweet) %>%
# filter(round_created_at_tweet > as.Date("2020-02-01")) %>% THIS HAS BEEN DONE PREVIOSLY
ggplot(aes(x = round_created_at_tweet, y = n)) +
geom_line() +
labs(
title = "",
x = "Date",
y = "# of tweets"
) +
scale_x_datetime() +
theme_light() +
theme(legend.position = "bottom") +
scale_color_brewer(palette = "Set1")
df %>%
filter(is_bot != "Bot_official") %>%
# select(is_bot, created_at) %>%
mutate(round_created_at_tweet = round_date(created_at, unit = "hour")) %>%
count(is_bot, round_created_at_tweet) %>%
ggplot(aes(x = round_created_at_tweet, y = n, color = is_bot)) +
geom_line() +
labs(
title = "",
x = "Date",
y = "# of tweets"
) +
scale_x_datetime() +
theme_light() +
theme(legend.position = "bottom") +
# facet_wrap(~ is_bot, scales = "free", ncol = 2) +
scale_color_brewer(palette = "Set1")
Distribution of probability of being a bot:
df %>%
select(prob_bot) %>%
ggplot(aes(x = prob_bot)) +
geom_histogram(bins = 10)
library(tidytext)
remove_reg <- "&|<|>"
remove_urls <- "http"
df %>%
mutate(
text = str_remove_all(text, remove_reg),
text = str_replace_all(text, "(?<=\\S)#", " #")
) %>%
unnest_tokens(word, text, token = "tweets") %>%
filter(
!word %in% stop_words$word,
!word %in% str_detect(word, remove_urls),
!word %in% str_remove_all(stop_words$word, "'"),
!word %in% c(
"rt", "#covid_19", "#covid19", "#covid", "#covid19esp",
"#coronavirus", "#coronavirusesp"
),
!word %in% c("coronavirus", "covid", "covid-19", "covid19"),
str_detect(word, "[a-z]")
) -> tidy_df
tidy_df %>%
group_by(is_bot) %>%
count(word, sort = T) %>%
ungroup() %>%
left_join(tidy_df %>%
group_by(is_bot) %>%
summarise(total = n())) %>%
mutate(freq = n / total) -> tidy_freq
df %>%
filter(!str_detect(text, "^RT")) %>%
mutate(
text = str_remove_all(text, remove_reg),
text = str_remove_all(text, "#"),
text = str_remove_all(text, "countryregion"),
text = str_remove_all(text, "ocoronavirus")
) %>%
unnest_tokens(word, text, token = "tweets", strip_url = TRUE) %>%
filter(
!word %in% stop_words$word,
!word %in% str_remove_all(stop_words$word, "'"),
!word %in% c("rt", "covid_19", "covid19", "covid", "covid19esp", "coronavirus", "coronavirusesp", "countryregion", "ocoronavirus", "gcoronavirus"),
str_detect(word, "[a-z]")
) %>%
count(is_bot, word, sort = T) %>%
group_by(is_bot) %>%
# summarise(n = n()) %>%
slice_max(n, n = 10) %>%
# arrange(n) %>%
ungroup() %>%
# mutate(word = fct_infreq(word, ordered = T)) %>%
ggplot(aes(reorder_within(word, n, is_bot), n, fill = is_bot)) +
geom_col(show.legend = F) +
scale_x_reordered() +
facet_wrap(~is_bot, scales = "free", ncol = 2) +
coord_flip()
Latent Dirichlet allocation (LDA) is a particularly popular method for fitting a topic model. It treats each document as a mixture of topics, and each topic as a mixture of words. This allows documents to “overlap” each other in terms of content, rather than being separated into discrete groups, in a way that mirrors typical use of natural language:
lda_tweets <- read_rds(here::here(
"data_twitter",
"data_processed", "lda_bot.rds"
))
topic_tweets <- tidy(lda_tweets, matrix = "gamma")
topic_tweets %>%
distinct(document) %>%
nrow() -> n_total
topic_tweets %>%
rename("id" = "document") %>%
arrange(id) %>%
group_by(id) %>%
top_n(1, wt = gamma) %>%
ungroup() %>%
count(topic) %>%
mutate(
n_total = n_total,
percentage = n / n_total
) %>%
arrange(desc(percentage)) %>%
filter(percentage > .05) %>%
mutate(
curly_name = paste0(
"Topic ",
topic, " (",
round(percentage * 100, 2), "%)"
),
curly_name = as_factor(curly_name)
) -> cmn_topics
topic_tweets <- tidy(lda_tweets)
tw_top_terms <- topic_tweets %>%
inner_join(cmn_topics %>% select(topic, curly_name)) %>%
group_by(topic) %>%
top_n(20, beta) %>%
ungroup() %>%
arrange(topic, -beta)
tw_top_terms %>%
mutate(
term = reorder_within(term, beta, topic)
) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = F) +
facet_wrap(~curly_name, scales = "free", ncol = 3) +
coord_flip() +
scale_x_reordered()
lda_tweets <- read_rds(here::here(
"data_twitter",
"data_processed", "lda_no_bot.rds"
))
topic_tweets <- tidy(lda_tweets, matrix = "gamma")
topic_tweets %>%
distinct(document) %>%
nrow() -> n_total
topic_tweets %>%
rename("id" = "document") %>%
arrange(id) %>%
group_by(id) %>%
top_n(1, wt = gamma) %>%
ungroup() %>%
count(topic) %>%
mutate(
n_total = n_total,
percentage = n / n_total
) %>%
arrange(desc(percentage)) %>%
filter(percentage > .05) %>%
mutate(
curly_name = paste0(
"Topic ",
topic, " (",
round(percentage * 100, 2), "%)"
),
curly_name = as_factor(curly_name)
) -> cmn_topics
topic_tweets <- tidy(lda_tweets)
tw_top_terms <- topic_tweets %>%
inner_join(cmn_topics %>% select(topic, curly_name)) %>%
group_by(topic) %>%
top_n(20, beta) %>%
ungroup() %>%
arrange(topic, -beta)
tw_top_terms %>%
mutate(
term = reorder_within(term, beta, topic)
) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = F) +
facet_wrap(~curly_name, scales = "free", ncol = 3) +
coord_flip() +
scale_x_reordered()
lda_tweets <- read_rds(here::here(
"data_twitter",
"data_processed", "lda_unknown.rds"
))
topic_tweets <- tidy(lda_tweets, matrix = "gamma")
topic_tweets %>%
distinct(document) %>%
nrow() -> n_total
topic_tweets %>%
rename("id" = "document") %>%
arrange(id) %>%
group_by(id) %>%
top_n(1, wt = gamma) %>%
ungroup() %>%
count(topic) %>%
mutate(
n_total = n_total,
percentage = n / n_total
) %>%
arrange(desc(percentage)) %>%
filter(percentage > .05) %>%
mutate(
curly_name = paste0(
"Topic ",
topic, " (",
round(percentage * 100, 2), "%)"
),
curly_name = as_factor(curly_name)
) -> cmn_topics
topic_tweets <- tidy(lda_tweets)
tw_top_terms <- topic_tweets %>%
inner_join(cmn_topics %>% select(topic, curly_name)) %>%
group_by(topic) %>%
top_n(20, beta) %>%
ungroup() %>%
arrange(topic, -beta)
tw_top_terms %>%
mutate(
term = reorder_within(term, beta, topic)
) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = F) +
facet_wrap(~curly_name, scales = "free", ncol = 3) +
coord_flip() +
scale_x_reordered()
lda_tweets <- read_rds(here::here(
"data_twitter",
"data_processed", "lda_botofficial.rds"
))
topic_tweets <- tidy(lda_tweets, matrix = "gamma")
topic_tweets %>%
distinct(document) %>%
nrow() -> n_total
topic_tweets %>%
rename("id" = "document") %>%
arrange(id) %>%
group_by(id) %>%
top_n(1, wt = gamma) %>%
ungroup() %>%
count(topic) %>%
mutate(
n_total = n_total,
percentage = n / n_total
) %>%
arrange(desc(percentage)) %>%
filter(percentage > .05) %>%
mutate(
curly_name = paste0(
"Topic ",
topic, " (",
round(percentage * 100, 2), "%)"
),
curly_name = as_factor(curly_name)
) -> cmn_topics
topic_tweets <- tidy(lda_tweets)
tw_top_terms <- topic_tweets %>%
inner_join(cmn_topics %>% select(topic, curly_name)) %>%
group_by(topic) %>%
top_n(20, beta) %>%
ungroup() %>%
arrange(topic, -beta)
tw_top_terms %>%
mutate(
term = reorder_within(term, beta, topic)
) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = F) +
facet_wrap(~curly_name, scales = "free", ncol = 3) +
coord_flip() +
scale_x_reordered()
library(tidytext)
library(tidyverse)
remove_reg <- "&|<|>"
remove_urls <- "http"
df %>%
filter(is_bot != "Bot_official") %>%
mutate(
text = str_remove_all(text, remove_reg),
text = str_replace_all(text, "(?<=\\S)#", " #")
) %>%
unnest_tokens(word, text, token = "tweets") %>%
filter(
!word %in% stop_words$word,
!word %in% str_detect(word, remove_urls),
!word %in% str_remove_all(stop_words$word, "'"),
!word %in% c("rt", "#covid_19", "#covid19", "#covid", "#covid19esp", "#coronavirus", "#coronavirusesp"),
str_detect(word, "[a-z]")
) -> tidy_df
tidy_df %>%
select(is_bot, id, word, created_at) %>%
mutate(id = as.character(id)) %>%
inner_join(get_sentiments("afinn")) %>%
mutate(round_created_at_tweet = lubridate::round_date(created_at, unit = "day")) %>%
group_by(id) %>%
mutate(sentiment = mean(value)) %>%
ungroup() %>%
group_by(is_bot, round_created_at_tweet) %>%
mutate(sentiment_per_day = mean(sentiment)) -> foo
library(scales)
foo %>%
ggplot(aes(x = round_created_at_tweet, y = sentiment_per_day, colour = is_bot)) +
geom_line() +
scale_x_datetime() +
theme_light() +
theme(legend.position = "bottom") +
scale_color_brewer(palette = "Set1")