# WordSim-353 ratings, one row per word pair. The path is resolved with
# here(), like every other input in this script, instead of a
# machine-specific absolute path.
WORDSIM_DATA <- here("exploratory_studies/13_europarl/other_data/wordsim353.csv")
wordsim <- read_csv(WORDSIM_DATA) %>%
  janitor::clean_names() %>%
  # Rename so the pair keys match the SimLex column names used below.
  rename(word1 = word_1,
         word2 = word_2,
         wordsim353 = human_mean)

# SimLex-999 ratings. clean_names() turns `SimLex999` into `sim_lex999`.
# The full join keeps every pair that occurs in either benchmark, so a pair
# rated in only one of them has NA in the other benchmark's column.
SIMLEX_DATA <- here("exploratory_studies/13_europarl/other_data/SimLex-999.txt")
simlex <- read_tsv(SIMLEX_DATA) %>%
  select(word1, word2, SimLex999) %>%
  janitor::clean_names() %>%
  full_join(wordsim, by = c("word1", "word2"))


# Paths to the four English embedding files (.vec) compared below. Going by
# the file names they cross two vector sizes (300d / 200d) with two training
# lengths (5ep / 15ep, presumably epochs -- confirm against the training run).
mod300_5 <- here(
  "exploratory_studies/15_udbank/ud_en_models/ud_corpus_en_300d_5ep.vec"
)
mod300_15 <- here(
  "exploratory_studies/15_udbank/ud_en_models/ud_corpus_en_300d_15ep.vec"
)
mod200_5 <- here(
  "exploratory_studies/15_udbank/ud_en_models/ud_corpus_en_200d_5ep.vec"
)
mod200_15 <- here(
  "exploratory_studies/15_udbank/ud_en_models/ud_corpus_en_200d_15ep.vec"
)


#' Pairwise cosine similarities between benchmark words under one model
#'
#' Reads an embedding file, keeps only the words that occur in `this_simlex`,
#' and returns the cosine similarity of every ordered pair of kept words
#' (self-pairs included) in long format.
#'
#' @param model_path Path to the embedding file. Its first line is skipped
#'   (assumed to be a header line -- TODO confirm against the .vec files);
#'   each remaining line must hold a word followed by `dim` values.
#' @param dim Number of embedding values per word in the file.
#' @param this_simlex Data frame with `word1` and `word2` columns naming the
#'   words to keep.
#' @param model_name Label stored in the `model` column of the result.
#' @return A data frame with columns `word1`, `word2`, `cos_dist` and `model`,
#'   one row per ordered pair of kept words. Despite its name, `cos_dist`
#'   holds the cosine *similarity* returned by `coop::cosine()` (larger means
#'   more similar); the column name is kept because downstream code uses it.
#'   If no word of `this_simlex` is found in the file, a warning is raised and
#'   a zero-row data frame with the same columns is returned.
get_simplex_word_dists <- function(model_path, dim, this_simlex, model_name){
  target_words <- unique(c(this_simlex$word1, this_simlex$word2))

  current_model <- fread(
    model_path,
    header = FALSE,
    skip = 1,
    quote = "",
    encoding = "UTF-8",
    data.table = TRUE,
    # Column 1 is the word; the embedding values are named V2 .. V(dim + 1).
    col.names = c("word", paste0("V", seq_len(dim) + 1L))
  ) %>%
    filter(word %in% target_words)

  if (nrow(current_model) == 0) {
    warning("No words from `this_simlex` were found in ", model_path,
            call. = FALSE)
    return(data.frame(word1 = character(0),
                      word2 = character(0),
                      cos_dist = numeric(0),
                      model = character(0),
                      stringsAsFactors = FALSE))
  }

  words <- current_model$word
  # select() drops the word column the same way for data.tables and data
  # frames, so this does not rely on data.table's `[, -1]` semantics.
  embeddings <- as.matrix(select(current_model, -word))

  # coop::cosine() compares the columns of its input, hence the transpose:
  # each word becomes one column.
  word_word_sims <- coop::cosine(t(embeddings))

  # Unroll the square matrix column by column: word1 (the row) varies
  # fastest, word2 (the column) slowest.
  data.frame(
    word1 = rep(words, times = length(words)),
    word2 = rep(words, each = length(words)),
    cos_dist = as.vector(word_word_sims),
    model = model_name,
    stringsAsFactors = FALSE
  )
}


# Pairwise word similarities under each of the four embedding models.
mod1 <- get_simplex_word_dists(mod300_5, 300, simlex, "300_5")
mod2 <- get_simplex_word_dists(mod300_15, 300, simlex, "300_15")
mod3 <- get_simplex_word_dists(mod200_5, 200, simlex, "200_5")
mod4 <- get_simplex_word_dists(mod200_15, 200, simlex, "200_15")

# Attach each model's values to the benchmark pairs and stack the four
# results in model order. The join keys are spelled out rather than guessed
# from shared column names. Pairs without a model value (at least one word
# missing from that model) come back with NA `cos_dist` and are dropped.
simlex_distances <- list(mod1, mod2, mod3, mod4) %>%
  map(~ left_join(simlex, .x, by = c("word1", "word2"))) %>%
  bind_rows() %>%
  filter(!is.na(cos_dist))

# SimLex-999 rating against the model's `cos_dist` value for every benchmark
# pair, one panel per model, with a linear fit drawn over the points.
simlex_distances %>%
  ggplot(aes(x = cos_dist, y = sim_lex999)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm") +
  facet_wrap(~ model) +
  theme_classic()

# Spearman rank correlation between the SimLex-999 rating and `cos_dist`,
# computed separately for each model and printed as one table row per model.
# cor.test() only uses complete pairs, so rows without a SimLex-999 rating
# do not enter the test.
simlex_distances %>%
  group_by(model) %>%
  nest() %>%
  mutate(temp = map(
    data,
    ~ tidy(cor.test(.x$sim_lex999, .x$cos_dist, method = "spearman"))
  )) %>%
  select(-data) %>%
  # Name the list-column explicitly: bare unnest() is deprecated in tidyr 1.0+.
  unnest(temp) %>%
  ungroup() %>%
  kable()
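# Output:
# | model  |  estimate | statistic |   p.value | method                          | alternative |
# |:-------|----------:|----------:|----------:|:--------------------------------|:------------|
# | 300_5  | 0.0303871 |  38328159 | 0.4504484 | Spearman's rank correlation rho | two.sided   |
# | 300_15 | 0.0670507 |  36878870 | 0.0955728 | Spearman's rank correlation rho | two.sided   |
# | 200_5  | 0.0358485 |  38112274 | 0.3732602 | Spearman's rank correlation rho | two.sided   |
# | 200_15 | 0.0850264 |  36168304 | 0.0344311 | Spearman's rank correlation rho | two.sided   |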

# WordSim-353 ----

# WordSim-353 rating against the model's `cos_dist` value for every benchmark
# pair, one panel per model, with a linear fit drawn over the points.
simlex_distances %>%
  ggplot(aes(x = cos_dist, y = wordsim353)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm") +
  facet_wrap(~ model) +
  theme_classic()

# Spearman rank correlation between the WordSim-353 rating and `cos_dist`,
# computed separately for each model and printed as one table row per model.
# cor.test() only uses complete pairs, so rows without a WordSim-353 rating
# do not enter the test.
simlex_distances %>%
  group_by(model) %>%
  nest() %>%
  mutate(temp = map(
    data,
    ~ tidy(cor.test(.x$wordsim353, .x$cos_dist, method = "spearman"))
  )) %>%
  select(-data) %>%
  # Name the list-column explicitly: bare unnest() is deprecated in tidyr 1.0+.
  unnest(temp) %>%
  ungroup() %>%
  kable()
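# Output:
# | model  |  estimate | statistic |   p.value | method                          | alternative |
# |:-------|----------:|----------:|----------:|:--------------------------------|:------------|
# | 300_5  | 0.1837365 |   1314645 | 0.0071732 | Spearman's rank correlation rho | two.sided   |
# | 300_15 | 0.3198928 |   1095356 | 0.0000019 | Spearman's rank correlation rho | two.sided   |
# | 200_5  | 0.1838892 |   1314399 | 0.0071245 | Spearman's rank correlation rho | two.sided   |
# | 200_15 | 0.3272072 |   1083576 | 0.0000010 | Spearman's rank correlation rho | two.sided   |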