# WordSim-353 human similarity ratings.
# The path is resolved with here(), like the other data files in this script,
# rather than through a user-specific absolute path, so the script runs from
# any checkout of the project.
WORDSIM_DATA <- here("exploratory_studies/13_europarl/other_data/wordsim353.csv")
# Normalise the column names so the word columns line up with SimLex-999
# (word1 / word2) and the rating column is named after its dataset.
wordsim <- read_csv(WORDSIM_DATA) %>%
  janitor::clean_names() %>%
  rename(word1 = word_1,
         word2 = word_2,
         wordsim353 = human_mean)

# SimLex-999 ratings, combined with the WordSim-353 ratings into one table of
# word pairs. The full join keeps pairs that occur in only one of the two
# datasets; the rating from the other dataset is then NA.
SIMLEX_DATA <- here("exploratory_studies/13_europarl/other_data/SimLex-999.txt")
simlex <- read_tsv(SIMLEX_DATA) %>%
  select(word1, word2, SimLex999) %>%
  janitor::clean_names() %>%
  # Explicit keys: the join no longer depends on which columns the two tables
  # happen to share, and dplyr's "Joining, by = ..." message is not emitted.
  full_join(wordsim, by = c("word1", "word2"))


# Directory holding the trained embedding files that are evaluated below.
MODEL_PATHS <- here("exploratory_studies/15_udbank/ud_en_models/")


#' Attach embedding-based cosine values to a table of word pairs.
#'
#' Reads one word-embedding text file, keeps the vectors of the words that
#' occur in `this_simlex`, computes the cosine between every pair of retained
#' vectors, and left-joins those values onto `this_simlex`.
#'
#' @param model_path Path to an embedding file with one word per line followed
#'   by its vector components. The first line of the file is skipped
#'   (presumably a "<n_words> <n_dims>" header -- confirm against the files).
#' @param this_simlex Data frame with character columns `word1` and `word2`.
#' @return `this_simlex` with two extra columns: `cos_dist`, the value of
#'   `coop::cosine()` for the pair (a cosine *similarity*, despite the column
#'   name), and `model`, the file name without the "ud_corpus_en_" prefix and
#'   the ".vec" extension. Both are `NA` for pairs in which at least one word
#'   has no vector in the model.
get_simplex_word_dists <- function(model_path, this_simlex){

  # Literal (fixed) matching: as a regular expression ".vec" would also match
  # any character followed by "vec" inside the model name.
  model_name <- sub("ud_corpus_en_", "", basename(model_path), fixed = TRUE)
  model_name <- sub(".vec", "", model_name, fixed = TRUE)

  # The vector width comes from the file itself instead of being guessed from
  # "150"/"200"/"300" appearing somewhere in the path, so any dimensionality
  # is accepted and digits in directory names cannot mislabel the columns.
  current_model <- data.table::fread(
    model_path,
    header = FALSE,
    skip = 1,
    quote = "",
    encoding = "UTF-8",
    data.table = FALSE)
  names(current_model)[1] <- "word"

  # Only words that appear in at least one pair are needed.
  is_needed <- current_model$word %in% c(this_simlex$word1, this_simlex$word2)
  current_model <- current_model[is_needed, , drop = FALSE]

  if (nrow(current_model) == 0) {
    # Nothing can be scored; return the same shape an all-unmatched join has.
    this_simlex$cos_dist <- rep(NA_real_, nrow(this_simlex))
    this_simlex$model <- rep(NA_character_, nrow(this_simlex))
    return(this_simlex)
  }

  word_vectors <- as.matrix(current_model[, -1, drop = FALSE])
  storage.mode(word_vectors) <- "double"

  # coop::cosine() compares the columns of its input, so the word-by-dimension
  # matrix is transposed to obtain a word-by-word matrix.
  word_word_dists <- coop::cosine(t(word_vectors))

  # Long format, one row per ordered word pair. as.vector() unrolls the matrix
  # column by column, so the row word (word1) varies fastest.
  words <- current_model$word
  long_word_word_dists <- data.frame(
    word1 = rep(words, times = length(words)),
    word2 = rep(words, each = length(words)),
    cos_dist = as.vector(word_word_dists),
    model = model_name,
    stringsAsFactors = FALSE)

  dplyr::left_join(this_simlex, long_word_word_dists,
                   by = c("word1", "word2"))
}

# Every file in MODEL_PATHS is treated as an embedding model.
# NOTE(review): there is no `pattern` filter, so any other file placed in that
# directory is also passed to get_simplex_word_dists().
all_files <- list.files(MODEL_PATHS, full.names = TRUE)

# Score the word pairs under each model, stack the per-model tables, and keep
# only the pairs for which a cosine value is available.
simlex_distances <- map_df(all_files, get_simplex_word_dists, simlex) %>%
  filter(!is.na(cos_dist))

# SimLex-999 ----

# SimLex-999 rating against embedding cosine, one panel per model, with a
# linear fit drawn over the semi-transparent points.
ggplot(data = simlex_distances,
       mapping = aes(x = cos_dist, y = sim_lex999)) +
  geom_point(alpha = .2) +
  geom_smooth(method = "lm") +
  facet_wrap(~model) +
  theme_classic()

# Spearman correlation between the SimLex-999 ratings and the embedding cosine
# values, computed separately for each model and printed as one table.
simlex_distances %>%
  group_by(model) %>%
  nest() %>%
  mutate(temp = map(
    data,
    ~tidy(cor.test(.$sim_lex999, .$cos_dist, method = "spearman"))
  )) %>%
  select(-data) %>%
  # Name the list-column explicitly: a bare unnest() is deprecated.
  unnest(temp) %>%
  ungroup() %>%
  kable()
# | model               | estimate  | statistic | p.value   | method                          | alternative |
# |:--------------------|----------:|----------:|----------:|:--------------------------------|:------------|
# | 150d_15ep_noSubword | 0.1240710 | 34624894  | 0.0019843 | Spearman's rank correlation rho | two.sided   |
# | 150d_15ep           | 0.0572273 | 37267182  | 0.1550011 | Spearman's rank correlation rho | two.sided   |
# | 200d_15ep_10neg     | 0.0611699 | 37111334  | 0.1284514 | Spearman's rank correlation rho | two.sided   |
# | 200d_15ep_noSubword | 0.1085676 | 35237734  | 0.0068579 | Spearman's rank correlation rho | two.sided   |
# | 200d_15ep           | 0.0850264 | 36168304  | 0.0344311 | Spearman's rank correlation rho | two.sided   |
# | 200d_30ep           | 0.0563761 | 37300829  | 0.1612437 | Spearman's rank correlation rho | two.sided   |
# | 200d_50ep           | 0.0406074 | 37924158  | 0.3131312 | Spearman's rank correlation rho | two.sided   |
# | 200d_5ep            | 0.0358485 | 38112274  | 0.3732602 | Spearman's rank correlation rho | two.sided   |
# | 300d_15ep           | 0.0670507 | 36878870  | 0.0955728 | Spearman's rank correlation rho | two.sided   |
# | 300d_5ep            | 0.0303871 | 38328159  | 0.4504484 | Spearman's rank correlation rho | two.sided   |

# WordSim-353 ----

# WordSim-353 rating against embedding cosine, one panel per model, with a
# linear fit drawn over the semi-transparent points.
ggplot(data = simlex_distances,
       mapping = aes(x = cos_dist, y = wordsim353)) +
  geom_point(alpha = .2) +
  geom_smooth(method = "lm") +
  facet_wrap(~model) +
  theme_classic()

# Spearman correlation between the WordSim-353 ratings and the embedding
# cosine values, computed separately for each model and printed as one table.
simlex_distances %>%
  group_by(model) %>%
  nest() %>%
  mutate(temp = map(
    data,
    ~tidy(cor.test(.$wordsim353, .$cos_dist, method = "spearman"))
  )) %>%
  select(-data) %>%
  # Name the list-column explicitly: a bare unnest() is deprecated.
  unnest(temp) %>%
  ungroup() %>%
  kable()
# | model               | estimate  | statistic | p.value   | method                          | alternative |
# |:--------------------|----------:|----------:|----------:|:--------------------------------|:------------|
# | 150d_15ep_noSubword | 0.3074049 | 1115469   | 0.0000048 | Spearman's rank correlation rho | two.sided   |
# | 150d_15ep           | 0.3275798 | 1082976   | 0.0000010 | Spearman's rank correlation rho | two.sided   |
# | 200d_15ep_10neg     | 0.3709460 | 1013132   | 0.0000000 | Spearman's rank correlation rho | two.sided   |
# | 200d_15ep_noSubword | 0.3193961 | 1096156   | 0.0000019 | Spearman's rank correlation rho | two.sided   |
# | 200d_15ep           | 0.3272072 | 1083576   | 0.0000010 | Spearman's rank correlation rho | two.sided   |
# | 200d_30ep           | 0.3551914 | 1038505   | 0.0000001 | Spearman's rank correlation rho | two.sided   |
# | 200d_50ep           | 0.3157078 | 1102096   | 0.0000026 | Spearman's rank correlation rho | two.sided   |
# | 200d_5ep            | 0.1838892 | 1314399   | 0.0071245 | Spearman's rank correlation rho | two.sided   |
# | 300d_15ep           | 0.3198928 | 1095356   | 0.0000019 | Spearman's rank correlation rho | two.sided   |
# | 300d_5ep            | 0.1837365 | 1314645   | 0.0071732 | Spearman's rank correlation rho | two.sided   |