### replicated from: https://www.jtimm.net/2020/05/26/corp-comp-ling-covid19/
#setwd("J:/COVID_Twitter/11")
library(data.table)
dat <- fread("J:/COVID_Twitter/11/2020-11-01_clean-dataset.csv")
names(dat)
##  [1] "coordinates"                "created_at"                
##  [3] "hashtags"                   "media"                     
##  [5] "urls"                       "favorite_count"            
##  [7] "id"                         "in_reply_to_screen_name"   
##  [9] "in_reply_to_status_id"      "in_reply_to_user_id"       
## [11] "lang"                       "place"                     
## [13] "possibly_sensitive"         "retweet_count"             
## [15] "reweet_id"                  "retweet_screen_name"       
## [17] "source"                     "text"                      
## [19] "tweet_url"                  "user_created_at"           
## [21] "user_screen_name"           "user_default_profile_image"
## [23] "user_description"           "user_favourites_count"     
## [25] "user_followers_count"       "user_friends_count"        
## [27] "user_listed_count"          "user_location"             
## [29] "user_name"                  "user_screen_name"          
## [31] "user_statuses_count"        "user_time_zone"            
## [33] "user_urls"                  "user_verified"
dat= head(dat,20000)
dim(dat)
## [1] 20000    34
library(tidyverse)
library(dplyr)
dictionary <-  readxl::read_xlsx ('J:/COVID_Twitter/covid_glossary_w_variants.xlsx') %>%
  filter(category != 'race-ethnicity') %>%
  ungroup()
unique(dictionary$category)
## [1] "cv"                "interventions"     "medical_response" 
## [4] "prevention"        "socio-political"   "spread_of_disease"
## [7] "transmission"
dictionary %>% 
  filter(descriptor_name %in% c('antiviral', 'hand-hygiene')) %>%
  group_by(category, descriptor_name) %>%
  summarize(term_names = paste0(term_name, collapse = ' | '))  %>%
  DT::datatable(rownames = FALSE, options = list(dom = 't',
                                                 scrollX = TRUE))
data.frame(tweets = format(nrow(dat), big.mark = ','), 
           tokens = format(sum(tokenizers::count_words(dat$text)), 
                           big.mark = ',')) %>%
  knitr::kable()
tweets tokens
20,000 505,372
t1 <- tokenizers::tokenize_ptb(dat$text, lowercase = TRUE)
t2 <- lapply(t1, gsub, 
             pattern = '([a-z0-9])([[:punct:]])', 
             replacement = '\\1 \\2') 
t3 <- lapply(t2, gsub, 
             pattern = '([[:punct:]])([a-z0-9])', 
             replacement = '\\1 \\2') 

t4 <- lapply(t3, paste0, collapse = ' ')
## Re-build
dat$word_text <- unlist(t4)
mo <- text2vec::itoken(dat$word_text, 
                       preprocessor = tolower,
                       tokenizer = text2vec::space_tokenizer, 
                       n_chunks = 1,
                       ids = dat$id) 

vocab <- text2vec::create_vocabulary(mo, stopwords = character(0)) #tm::stopwords()
multi_word_expressions <- subset(dictionary, grepl(' ', term_name))
sep = ' '
mas_que_dos <- subset(multi_word_expressions, grepl(' [a-z0-9]* ', term_name))

new_two_grams <- lapply(mas_que_dos$term_name, function(x) {
  regmatches(x, 
             gregexpr("[^ ]+ [^ ]+", # sep = ' '
                      x, 
                      perl=TRUE)
  )[[1]] }) %>%
  unlist() %>%
  unique()

multi_word_expressions_replace <- gsub(' ', sep, multi_word_expressions$term_name)
multi_word_expressions_replace <- c(multi_word_expressions_replace,
                                    new_two_grams )

model <- text2vec::Collocations$new(vocabulary = vocab, sep = sep) 
model$.__enclos_env__$private$phrases <- multi_word_expressions_replace

it_phrases <- model$transform(mo) 
term_vocab <- text2vec::create_vocabulary(it_phrases) 
term_vocab1 <- text2vec::prune_vocabulary(term_vocab, term_count_min = 2)


ats <- attributes(term_vocab1)
term_vocab2 <- subset(term_vocab1, grepl('^[A-Za-z]', term) & nchar(term) > 2)
t2v_vocab <- term_vocab2
attributes(t2v_vocab) <- ats


term_freqs <- term_vocab2 %>%
  left_join(dictionary , by = c('term' = 'term_name'))

descriptor_freq <- term_freqs %>%
  group_by(category, descriptor_name) %>%
  summarize(term_freq = sum(term_count)) %>%
  filter(!is.na(descriptor_name))
term_freqs %>%
  filter(descriptor_name %in% c('social-distancing', 'front-line-workers',
                                'flatten-the-curve')) %>%
  arrange(desc(term_count)) %>%
  mutate(tf = paste0(term, ' (', term_count, ')')) %>%
  group_by(descriptor_name) %>%
  summarize(relative = paste0(tf, collapse = ' | ')) %>%
  DT::datatable(rownames = FALSE, options = list(dom = 't',
                                                 scrollX = TRUE))
vectorizer <- text2vec::vocab_vectorizer(t2v_vocab)
dtm0 <- text2vec::create_dtm(it_phrases, vectorizer)

dtm1 <- lexvarsdatr::lvdr_get_closest(dtm0) ## other ways -- 
colnames(dtm1) <- c('doc_id', 'term_name', 'count')
dtm2 <- dtm1 %>% inner_join(dictionary)

dat$id=as.character(dat$id)
dtm3 <- dtm2 %>%
  ## Aggregate -- to descriptor_name
  group_by(doc_id, descriptor_name, category) %>%
  summarize(count = sum(count)) %>%
  ungroup() %>%
  
  left_join(dat %>% select(id, user_name, created_at) %>%
              mutate(user_name = toupper(user_name)), by = c('doc_id' = 'id')) 
term_vocab3 <- term_vocab2 %>%
  rename(term_name = term) %>%
  left_join(dictionary) %>%
  mutate(descriptor_name = ifelse(is.na(descriptor_name), 
                                  term_name, 
                                  descriptor_name),
         category = ifelse(is.na(category), 
                           'other', 
                           category)) %>%
  arrange(term_name)

tcm <- text2vec::create_tcm(it = it_phrases,
                            vectorizer = vectorizer,
                            skip_grams_window = 5L)
tcm <- tcm[, order(colnames(tcm))]
tcm <- tcm[order(rownames(tcm)), ]

tcm1 <- lexvarsdatr::lvdr_aggregate_matrix(tfm = tcm, 
                                           group = term_vocab3$descriptor_name, 
                                           fun = 'sum')


set.seed(99)
glove <- text2vec::GlobalVectors$new(rank = 128, 
                                     #vocabulary = row.names(tcm1), 
                                     x_max = 10)

wv_main <- glove$fit_transform(tcm1, 
                               n_iter = 10, 
                               convergence_tol = 0.01)
## INFO  [17:53:17.861] epoch 1, loss 0.1270 
## INFO  [17:53:18.072] epoch 2, loss 0.0693 
## INFO  [17:53:18.263] epoch 3, loss 0.0426 
## INFO  [17:53:18.450] epoch 4, loss 0.0241 
## INFO  [17:53:18.635] epoch 5, loss 0.0183 
## INFO  [17:53:18.819] epoch 6, loss 0.0146 
## INFO  [17:53:19.012] epoch 7, loss 0.0120 
## INFO  [17:53:19.197] epoch 8, loss 0.0100 
## INFO  [17:53:19.375] epoch 9, loss 0.0084 
## INFO  [17:53:19.566] epoch 10, loss 0.0072
wv_context <- glove$components
glove_vectors <- wv_main + t(wv_context)

eg_terms <- c('stay-at-home', 'outbreak', 
              'front-line-workers', 'vaccine',
              'relief' )

## Modified @ 5.28.20 
# x <- lapply(eg_terms, 
#             LSAfun::neighbors, 
#             glove_vectors, 
#             n = 10)
# names(x) <- eg_terms

nns <- lapply(eg_terms, 
              lexvarsdatr::lvdr_quick_cosine,
              tfm = glove_vectors,
              include_targ = T) %>%
  bind_rows()
set.seed(99)
keeps <- descriptor_freq %>% filter(term_freq > 30)
glove1 <- glove_vectors[rownames(glove_vectors) %in% 
                          unique(keeps$descriptor_name),]
sim_mat <- text2vec::sim2(glove1, 
                          method = "cosine", 
                          norm = "l2")

# data set too small for tSNE ---

y1 <- cmdscale(1-sim_mat, eig = TRUE, k = 2)$points %>% 
  data.frame() %>%
  mutate (descriptor_name = rownames(sim_mat)) %>%
  left_join(dictionary %>% distinct(descriptor_name, category)) 
y1 %>% 
  ggplot(aes(X1,X2, label = descriptor_name)) +
  geom_point(aes(color = category), size = 3.5) +
  ggrepel::geom_text_repel(
    data  = y1,
    nudge_y =  0.025,
    segment.color = "grey80",
    direction = "y",
    hjust = 0, 
    size = 3 ) +
  ggthemes::scale_colour_stata() + 
  theme_minimal() +
  #theme_classic() +
  theme(legend.position = "bottom",
        plot.title = element_text(size=14))+ 
  labs(title="COVID19-related concepts in 2D semantic space")

network <- tcm1 %>% 
  lexvarsdatr::lvdr_calc_ppmi(make_symmetric = TRUE) %>%
  lexvarsdatr::lvdr_extract_network (
    target = c('contact-tracing', 
               'flatten-the-curve',
               'return-to-work',
               'social-distancing',
               #'remote-learning',
               'drive-through-testing'), 
    n = 20)


set.seed(66)
network %>%
  tidygraph::as_tbl_graph() %>%
  ggraph::ggraph() +
  
  ggraph::geom_edge_link(color = 'darkgray') + 
  ggraph::geom_node_point(aes(size = value, 
                              color = term,
                              shape = group)) +
  
  ggraph::geom_node_text(aes(label = toupper(label), 
                             filter = group == 'term'), 
                         repel = TRUE, size = 4) +
  
  ggraph::geom_node_text(aes(label = tolower(label), 
                             filter = group == 'feature'), 
                         repel = TRUE, size = 3) +
  ggthemes::scale_color_stata()+
  theme_minimal() +
  ggtitle('A COVID19 co-occurrence network') +
  theme(legend.position = "none",
        plot.title = element_text(size=14))