### replicated from: https://www.jtimm.net/2020/05/26/corp-comp-ling-covid19/
#setwd("J:/COVID_Twitter/11")
library(data.table)
dat <- fread("J:/COVID_Twitter/11/2020-11-01_clean-dataset.csv")
names(dat)
## [1] "coordinates" "created_at"
## [3] "hashtags" "media"
## [5] "urls" "favorite_count"
## [7] "id" "in_reply_to_screen_name"
## [9] "in_reply_to_status_id" "in_reply_to_user_id"
## [11] "lang" "place"
## [13] "possibly_sensitive" "retweet_count"
## [15] "reweet_id" "retweet_screen_name"
## [17] "source" "text"
## [19] "tweet_url" "user_created_at"
## [21] "user_screen_name" "user_default_profile_image"
## [23] "user_description" "user_favourites_count"
## [25] "user_followers_count" "user_friends_count"
## [27] "user_listed_count" "user_location"
## [29] "user_name" "user_screen_name"
## [31] "user_statuses_count" "user_time_zone"
## [33] "user_urls" "user_verified"
## Keep a manageable sample: the first 20,000 tweets
dat <- head(dat, 20000)
dim(dat)
## [1] 20000 34
library(tidyverse)  # attaches dplyr, ggplot2, etc.
dictionary <- readxl::read_xlsx('J:/COVID_Twitter/covid_glossary_w_variants.xlsx') %>%
  filter(category != 'race-ethnicity') %>%
  ungroup()
unique(dictionary$category)
## [1] "cv" "interventions" "medical_response"
## [4] "prevention" "socio-political" "spread_of_disease"
## [7] "transmission"
dictionary %>%
  filter(descriptor_name %in% c('antiviral', 'hand-hygiene')) %>%
  group_by(category, descriptor_name) %>%
  summarize(term_names = paste0(term_name, collapse = ' | ')) %>%
  DT::datatable(rownames = FALSE,
                options = list(dom = 't', scrollX = TRUE))
data.frame(tweets = format(nrow(dat), big.mark = ','),
           tokens = format(sum(tokenizers::count_words(dat$text)),
                           big.mark = ',')) %>%
  knitr::kable()
## Tokenize with the Penn Treebank tokenizer, then insert a space between
## alphanumerics and adjacent punctuation (in both directions), so that
## punctuation -- commas, hyphens, etc. -- stands alone as its own token
t1 <- tokenizers::tokenize_ptb(dat$text, lowercase = TRUE)
t2 <- lapply(t1, gsub,
             pattern = '([a-z0-9])([[:punct:]])',
             replacement = '\\1 \\2')
t3 <- lapply(t2, gsub,
             pattern = '([[:punct:]])([a-z0-9])',
             replacement = '\\1 \\2')
t4 <- lapply(t3, paste0, collapse = ' ')
## Re-build: one space-delimited string per tweet
dat$word_text <- unlist(t4)
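## Spot-check the pre-tokenized text: punctuation should now stand alone
## (a quick check, not in the original post; output depends on the sample):
dat$word_text[1:2]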
mo <- text2vec::itoken(dat$word_text,
                       preprocessor = tolower,
                       tokenizer = text2vec::space_tokenizer,
                       n_chunks = 1,
                       ids = dat$id)
vocab <- text2vec::create_vocabulary(mo, stopwords = character(0)) #tm::stopwords()
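## Peek at the most frequent tokens (a sanity check, not in the original
## post; counts will vary with the sample):
head(vocab[order(-vocab$term_count), c('term', 'term_count')])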
## Dictionary entries containing a space are multiword expressions
multi_word_expressions <- subset(dictionary, grepl(' ', term_name))
sep <- ' '  # phrase separator -- with ' ', merged phrases keep their spaces
## Terms composed of three or more words
mas_que_dos <- subset(multi_word_expressions, grepl(' [a-z0-9]* ', term_name))
new_two_grams <- lapply(mas_que_dos$term_name, function(x) {
  regmatches(x,
             gregexpr("[^ ]+ [^ ]+", # sep = ' '
                      x,
                      perl = TRUE))[[1]] }) %>%
  unlist() %>%
  unique()
multi_word_expressions_replace <- gsub(' ', sep, multi_word_expressions$term_name)
multi_word_expressions_replace <- c(multi_word_expressions_replace,
                                    new_two_grams)
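## For intuition: gregexpr() pulls non-overlapping word pairs, so a
## four-word term like "stay at home order" (a made-up entry) contributes
## the two-grams "stay at" and "home order":
regmatches("stay at home order",
           gregexpr("[^ ]+ [^ ]+", "stay at home order", perl = TRUE))[[1]]
## [1] "stay at"    "home order"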
## Hack: write the dictionary phrases directly into the Collocations
## model's private phrase slot, then merge them in the token stream
model <- text2vec::Collocations$new(vocabulary = vocab, sep = sep)
model$.__enclos_env__$private$phrases <- multi_word_expressions_replace
it_phrases <- model$transform(mo)
term_vocab <- text2vec::create_vocabulary(it_phrases)
term_vocab1 <- text2vec::prune_vocabulary(term_vocab, term_count_min = 2)
## subset() strips the vocabulary attributes text2vec needs downstream,
## so stash them first and restore after filtering
ats <- attributes(term_vocab1)
term_vocab2 <- subset(term_vocab1, grepl('^[A-Za-z]', term) & nchar(term) > 2)
t2v_vocab <- term_vocab2
attributes(t2v_vocab) <- ats
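## Spot-check that multiword phrases survived as single vocabulary entries
## (a quick check, not in the original post):
head(subset(term_vocab2, grepl(' ', term)))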
term_freqs <- term_vocab2 %>%
  left_join(dictionary, by = c('term' = 'term_name'))

descriptor_freq <- term_freqs %>%
  group_by(category, descriptor_name) %>%
  summarize(term_freq = sum(term_count)) %>%
  filter(!is.na(descriptor_name))
term_freqs %>%
  filter(descriptor_name %in% c('social-distancing', 'front-line-workers',
                                'flatten-the-curve')) %>%
  arrange(desc(term_count)) %>%
  mutate(tf = paste0(term, ' (', term_count, ')')) %>%
  group_by(descriptor_name) %>%
  summarize(relative = paste0(tf, collapse = ' | ')) %>%
  DT::datatable(rownames = FALSE,
                options = list(dom = 't', scrollX = TRUE))
vectorizer <- text2vec::vocab_vectorizer(t2v_vocab)
dtm0 <- text2vec::create_dtm(it_phrases, vectorizer)
dtm1 <- lexvarsdatr::lvdr_get_closest(dtm0) ## other ways --
colnames(dtm1) <- c('doc_id', 'term_name', 'count')
dtm2 <- dtm1 %>% inner_join(dictionary)
## ids must be character to match the dtm doc_ids in the join below
dat$id <- as.character(dat$id)
dtm3 <- dtm2 %>%
  ## Aggregate -- to descriptor_name
  group_by(doc_id, descriptor_name, category) %>%
  summarize(count = sum(count)) %>%
  ungroup() %>%
  left_join(dat %>% select(id, user_name, created_at) %>%
              mutate(user_name = toupper(user_name)),
            by = c('doc_id' = 'id'))
## Terms without a dictionary match map to themselves, category 'other'
term_vocab3 <- term_vocab2 %>%
  rename(term_name = term) %>%
  left_join(dictionary) %>%
  mutate(descriptor_name = ifelse(is.na(descriptor_name),
                                  term_name,
                                  descriptor_name),
         category = ifelse(is.na(category),
                           'other',
                           category)) %>%
  arrange(term_name)
tcm <- text2vec::create_tcm(it = it_phrases,
                            vectorizer = vectorizer,
                            skip_grams_window = 5L)
## Sort rows & columns so they align with term_vocab3 (arranged by term)
tcm <- tcm[, order(colnames(tcm))]
tcm <- tcm[order(rownames(tcm)), ]
tcm1 <- lexvarsdatr::lvdr_aggregate_matrix(tfm = tcm,
                                           group = term_vocab3$descriptor_name,
                                           fun = 'sum')
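## tcm1 should now be descriptor-by-descriptor (one row/column per unique
## descriptor_name); a quick dimension check, not in the original post:
dim(tcm1)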
set.seed(99)
glove <- text2vec::GlobalVectors$new(rank = 128,
                                     #vocabulary = row.names(tcm1),
                                     x_max = 10)
wv_main <- glove$fit_transform(tcm1,
                               n_iter = 10,
                               convergence_tol = 0.01)
## INFO [17:53:17.861] epoch 1, loss 0.1270
## INFO [17:53:18.072] epoch 2, loss 0.0693
## INFO [17:53:18.263] epoch 3, loss 0.0426
## INFO [17:53:18.450] epoch 4, loss 0.0241
## INFO [17:53:18.635] epoch 5, loss 0.0183
## INFO [17:53:18.819] epoch 6, loss 0.0146
## INFO [17:53:19.012] epoch 7, loss 0.0120
## INFO [17:53:19.197] epoch 8, loss 0.0100
## INFO [17:53:19.375] epoch 9, loss 0.0084
## INFO [17:53:19.566] epoch 10, loss 0.0072
## Sum the main and context embeddings into the final word vectors
wv_context <- glove$components
glove_vectors <- wv_main + t(wv_context)
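## An alternative neighbor lookup using text2vec alone (shown for
## comparison with lvdr_quick_cosine below; assumes 'vaccine' made it
## into the pruned vocabulary):
sims <- text2vec::sim2(x = glove_vectors,
                       y = glove_vectors['vaccine', , drop = FALSE],
                       method = 'cosine', norm = 'l2')
head(sort(sims[, 1], decreasing = TRUE), 10)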
eg_terms <- c('stay-at-home', 'outbreak',
              'front-line-workers', 'vaccine',
              'relief')
## Modified @ 5.28.20
# x <- lapply(eg_terms,
# LSAfun::neighbors,
# glove_vectors,
# n = 10)
# names(x) <- eg_terms
nns <- lapply(eg_terms,
              lexvarsdatr::lvdr_quick_cosine,
              tfm = glove_vectors,
              include_targ = TRUE) %>%
  bind_rows()
set.seed(99)
keeps <- descriptor_freq %>% filter(term_freq > 30)
glove1 <- glove_vectors[rownames(glove_vectors) %in%
                          unique(keeps$descriptor_name), ]
sim_mat <- text2vec::sim2(glove1,
                          method = "cosine",
                          norm = "l2")
# data set too small for tSNE -- use classical MDS (cmdscale) instead
y1 <- cmdscale(1 - sim_mat, eig = TRUE, k = 2)$points %>%
  data.frame() %>%
  mutate(descriptor_name = rownames(sim_mat)) %>%
  left_join(dictionary %>% distinct(descriptor_name, category))
y1 %>%
  ggplot(aes(X1, X2, label = descriptor_name)) +
  geom_point(aes(color = category), size = 3.5) +
  ggrepel::geom_text_repel(data = y1,
                           nudge_y = 0.025,
                           segment.color = "grey80",
                           direction = "y",
                           hjust = 0,
                           size = 3) +
  ggthemes::scale_colour_stata() +
  theme_minimal() +
  #theme_classic() +
  theme(legend.position = "bottom",
        plot.title = element_text(size = 14)) +
  labs(title = "COVID19-related concepts in 2D semantic space")

network <- tcm1 %>%
  lexvarsdatr::lvdr_calc_ppmi(make_symmetric = TRUE) %>%
  lexvarsdatr::lvdr_extract_network(
    target = c('contact-tracing',
               'flatten-the-curve',
               'return-to-work',
               'social-distancing',
               #'remote-learning',
               'drive-through-testing'),
    n = 20)
set.seed(66)
network %>%
  tidygraph::as_tbl_graph() %>%
  ggraph::ggraph() +
  ggraph::geom_edge_link(color = 'darkgray') +
  ggraph::geom_node_point(aes(size = value,
                              color = term,
                              shape = group)) +
  ggraph::geom_node_text(aes(label = toupper(label),
                             filter = group == 'term'),
                         repel = TRUE, size = 4) +
  ggraph::geom_node_text(aes(label = tolower(label),
                             filter = group == 'feature'),
                         repel = TRUE, size = 3) +
  ggthemes::scale_color_stata() +
  theme_minimal() +
  ggtitle('A COVID19 co-occurrence network') +
  theme(legend.position = "none",
        plot.title = element_text(size = 14))
