# Load the two human similarity benchmarks and combine them into a single
# table keyed by word pair (word1, word2).

# WordSim-353. The path is resolved with here(), like the other inputs, so the
# script does not depend on one user's absolute directory layout.
WORDSIM_DATA <- here(
  "exploratory_studies/13_europarl/other_data/wordsim353.csv"
)
# After clean_names() the columns are word_1 / word_2 / human_mean; rename them
# so the pair columns line up with SimLex and the rating names its source.
wordsim <- read_csv(WORDSIM_DATA) %>%
  janitor::clean_names() %>%
  rename(
    word1 = word_1,
    word2 = word_2,
    wordsim353 = human_mean
  )

# SimLex-999. Keep only the pair and its rating; clean_names() turns
# `SimLex999` into `sim_lex999`.
SIMLEX_DATA <- here("exploratory_studies/13_europarl/other_data/SimLex-999.txt")
# Full join: a pair present in only one benchmark is kept, with NA in the
# other benchmark's rating column.
simlex <- read_tsv(SIMLEX_DATA) %>%
  select(word1, word2, SimLex999) %>%
  janitor::clean_names() %>%
  full_join(wordsim, by = c("word1", "word2"))

# Directory holding the trained embedding models to evaluate.
MODEL_PATHS <- here("exploratory_studies/15_udbank/ud_en_models/")
#' Look up model cosine values for every benchmark word pair
#'
#' Reads one embedding file, keeps the vectors of words that occur in the
#' benchmark, computes all pairwise cosines among them, and attaches the
#' matching value to each benchmark pair.
#'
#' @param model_path Path to a text embedding file readable by `fread()`.
#'   The first line is skipped; every other line is a word followed by its
#'   vector components (presumably fastText `.vec` output -- confirm). The
#'   number of dimensions is taken from the file itself.
#' @param this_simlex Data frame of word pairs with `word1` and `word2`
#'   columns.
#' @return `this_simlex` with two added columns: `cos_dist`, the value from
#'   `coop::cosine()` for the pair (NA when either word is missing from the
#'   model), and `model`, the file name without the `ud_corpus_en_` prefix and
#'   `.vec` extension.
get_simplex_word_dists <- function(model_path, this_simlex) {
  # Escape the dot and anchor at the end so only a literal ".vec" extension
  # is stripped.
  model_name <- basename(model_path) %>%
    str_remove("ud_corpus_en_") %>%
    str_remove("\\.vec$")

  target_words <- unique(c(this_simlex$word1, this_simlex$word2))

  # Let fread() infer the number of columns instead of guessing the embedding
  # size from the path; headerless columns come back as V1, V2, ...
  current_model <- fread(
    model_path,
    header = FALSE,
    skip = 1,
    quote = "",
    encoding = "UTF-8",
    data.table = TRUE
  )
  data.table::setnames(current_model, 1L, "word")
  current_model <- current_model %>%
    filter(word %in% target_words)

  # Transpose so each word is a column before calling coop::cosine(); the
  # result is a word-by-word matrix.
  word_word_dists <- coop::cosine(t(as.matrix(current_model[, -1])))

  wide_word_word_dists <- word_word_dists %>%
    as.data.frame() %>%
    mutate(word1 = current_model$word) %>%
    select(word1, everything())
  names(wide_word_word_dists) <- c("word1", current_model$word)

  # One row per (word1, word2) combination, tagged with the model label.
  long_word_word_dists <- wide_word_word_dists %>%
    gather("word2", "cos_dist", -word1) %>%
    select(word1, word2, everything()) %>%
    mutate(model = model_name)

  this_simlex %>%
    left_join(long_word_word_dists, by = c("word1", "word2"))
}
# Score every model file in MODEL_PATHS against the benchmark pairs and stack
# the per-model results. Pairs with no cosine value for a model (cos_dist is
# NA after the join) are dropped.
all_files <- list.files(MODEL_PATHS, full.names = TRUE)
simlex_distances <- map_df(all_files, get_simplex_word_dists, simlex) %>%
  filter(!is.na(cos_dist))
SimLex-999
# SimLex-999 ratings against model cosine values, one panel per model.
# Rows without a SimLex rating (NA in sim_lex999) are dropped explicitly
# instead of leaving ggplot2 to discard them with a warning.
simlex_distances %>%
  filter(!is.na(sim_lex999)) %>%
  ggplot(aes(x = cos_dist, y = sim_lex999)) +
  facet_wrap(~model) +
  geom_point(alpha = .2) +
  geom_smooth(method = "lm") +
  theme_classic()

# Spearman rank correlation between SimLex-999 ratings and cosine values,
# computed separately for each model and shown as one table row per model.
simlex_distances %>%
  group_by(model) %>%
  nest() %>%
  mutate(temp = map(
    data,
    ~ tidy(cor.test(.$sim_lex999, .$cos_dist, method = "spearman"))
  )) %>%
  select(-data) %>%
  # Name the list-column: a bare unnest() is deprecated in tidyr >= 1.0.
  unnest(temp) %>%
  ungroup() %>%
  kable()
| model | estimate | statistic | p.value | method | alternative |
|:--|--:|--:|--:|:--|:--|
| 150d_15ep_noSubword | 0.1240710 | 34624894 | 0.0019843 | Spearman’s rank correlation rho | two.sided |
| 150d_15ep | 0.0572273 | 37267182 | 0.1550011 | Spearman’s rank correlation rho | two.sided |
| 200d_15ep_10neg | 0.0611699 | 37111334 | 0.1284514 | Spearman’s rank correlation rho | two.sided |
| 200d_15ep_noSubword | 0.1085676 | 35237734 | 0.0068579 | Spearman’s rank correlation rho | two.sided |
| 200d_15ep | 0.0850264 | 36168304 | 0.0344311 | Spearman’s rank correlation rho | two.sided |
| 200d_30ep | 0.0563761 | 37300829 | 0.1612437 | Spearman’s rank correlation rho | two.sided |
| 200d_50ep | 0.0406074 | 37924158 | 0.3131312 | Spearman’s rank correlation rho | two.sided |
| 200d_5ep | 0.0358485 | 38112274 | 0.3732602 | Spearman’s rank correlation rho | two.sided |
| 300d_15ep | 0.0670507 | 36878870 | 0.0955728 | Spearman’s rank correlation rho | two.sided |
| 300d_5ep | 0.0303871 | 38328159 | 0.4504484 | Spearman’s rank correlation rho | two.sided |
WordSim-353
# WordSim-353 ratings against model cosine values, one panel per model.
# Rows without a WordSim rating (NA in wordsim353) are dropped explicitly
# instead of leaving ggplot2 to discard them with a warning.
simlex_distances %>%
  filter(!is.na(wordsim353)) %>%
  ggplot(aes(x = cos_dist, y = wordsim353)) +
  facet_wrap(~model) +
  geom_point(alpha = .2) +
  geom_smooth(method = "lm") +
  theme_classic()

# Spearman rank correlation between WordSim-353 ratings and cosine values,
# computed separately for each model and shown as one table row per model.
simlex_distances %>%
  group_by(model) %>%
  nest() %>%
  mutate(temp = map(
    data,
    ~ tidy(cor.test(.$wordsim353, .$cos_dist, method = "spearman"))
  )) %>%
  select(-data) %>%
  # Name the list-column: a bare unnest() is deprecated in tidyr >= 1.0.
  unnest(temp) %>%
  ungroup() %>%
  kable()
| model | estimate | statistic | p.value | method | alternative |
|:--|--:|--:|--:|:--|:--|
| 150d_15ep_noSubword | 0.3074049 | 1115469 | 0.0000048 | Spearman’s rank correlation rho | two.sided |
| 150d_15ep | 0.3275798 | 1082976 | 0.0000010 | Spearman’s rank correlation rho | two.sided |
| 200d_15ep_10neg | 0.3709460 | 1013132 | 0.0000000 | Spearman’s rank correlation rho | two.sided |
| 200d_15ep_noSubword | 0.3193961 | 1096156 | 0.0000019 | Spearman’s rank correlation rho | two.sided |
| 200d_15ep | 0.3272072 | 1083576 | 0.0000010 | Spearman’s rank correlation rho | two.sided |
| 200d_30ep | 0.3551914 | 1038505 | 0.0000001 | Spearman’s rank correlation rho | two.sided |
| 200d_50ep | 0.3157078 | 1102096 | 0.0000026 | Spearman’s rank correlation rho | two.sided |
| 200d_5ep | 0.1838892 | 1314399 | 0.0071245 | Spearman’s rank correlation rho | two.sided |
| 300d_15ep | 0.3198928 | 1095356 | 0.0000019 | Spearman’s rank correlation rho | two.sided |
| 300d_5ep | 0.1837365 | 1314645 | 0.0071732 | Spearman’s rank correlation rho | two.sided |