# WordSim-353 human similarity ratings.
# The path is resolved from the project root with here(), like every other
# data path in this script, so the analysis does not depend on one user's
# home directory layout.
WORDSIM_DATA <- here(
  "exploratory_studies/13_europarl/other_data/wordsim353.csv"
)

# After clean_names() the word and rating columns are renamed so the word
# columns line up with the `word1` / `word2` columns used by the SimLex data.
wordsim <- read_csv(WORDSIM_DATA) %>%
  janitor::clean_names() %>%
  rename(
    word1 = word_1,
    word2 = word_2,
    wordsim353 = human_mean
  )
# SimLex-999 ratings (tab-separated), combined with the WordSim-353 ratings.
SIMLEX_DATA <- here("exploratory_studies/13_europarl/other_data/SimLex-999.txt")

# Keep only the word pair and its SimLex rating; clean_names() turns
# `SimLex999` into `sim_lex999`. The full join keeps pairs that occur in
# either benchmark, leaving NA for the rating a pair does not have.
# The join keys are spelled out so the join cannot silently pick up another
# shared column (assumes `word1`/`word2` are the only shared columns, which is
# what the implicit join relied on -- TODO confirm against the CSV header).
simlex <- read_tsv(SIMLEX_DATA) %>%
  select(word1, word2, SimLex999) %>%
  janitor::clean_names() %>%
  full_join(wordsim, by = c("word1", "word2"))
# Paths to the four trained embedding files, resolved from the project root.
# The variable suffix mirrors the file name: <dimensions>_<N> for the
# "<dimensions>d_<N>ep" file (presumably N = training epochs -- TODO confirm).
mod300_5 <- here(
  "exploratory_studies/15_udbank/ud_en_models/ud_corpus_en_300d_5ep.vec"
)
mod300_15 <- here(
  "exploratory_studies/15_udbank/ud_en_models/ud_corpus_en_300d_15ep.vec"
)
mod200_5 <- here(
  "exploratory_studies/15_udbank/ud_en_models/ud_corpus_en_200d_5ep.vec"
)
mod200_15 <- here(
  "exploratory_studies/15_udbank/ud_en_models/ud_corpus_en_200d_15ep.vec"
)
# Compute pairwise cosine values between the word vectors of one embedding
# model, restricted to the words that occur in a similarity benchmark.
#
# Args:
#   model_path: path to a text vector file with one word per row followed by
#     its vector components; the first line of the file is skipped.
#   dim: number of vector components per word (a single whole number >= 1).
#   this_simlex: data frame with columns `word1` and `word2`; only model words
#     that appear in either column are kept.
#   model_name: label stored in the `model` column of the result.
#
# Returns:
#   A long data frame with one row per ordered pair of retained words
#   (including each word paired with itself) and the columns `word1`, `word2`,
#   `cos_dist` and `model`. NOTE: despite its name, `cos_dist` holds the value
#   returned by coop::cosine(), which is a cosine *similarity*, not a distance.
get_simplex_word_dists <- function(model_path, dim, this_simlex, model_name) {
  stopifnot(is.numeric(dim), length(dim) == 1L, dim >= 1, dim %% 1 == 0)

  benchmark_words <- c(this_simlex$word1, this_simlex$word2)
  # Column labels V2 .. V(dim + 1); paste0() is vectorised, so no lapply().
  vector_cols <- paste0("V", seq_len(dim) + 1L)

  current_model <- fread(
    model_path,
    header = FALSE,
    skip = 1,
    quote = "",
    encoding = "UTF-8",
    data.table = TRUE,
    col.names = c("word", vector_cols)
  ) %>%
    filter(word %in% benchmark_words)

  # coop::cosine() compares the columns of a matrix, so the word-by-component
  # table is transposed to put one word in each column.
  word_word_dists <- coop::cosine(t(as.matrix(current_model[, -1])))

  # Square word-by-word table: the row word goes in `word1`, and every other
  # column is labelled with its column word.
  wide_word_word_dists <- word_word_dists %>%
    as.data.frame() %>%
    mutate(word1 = current_model$word) %>%
    select(word1, everything())
  names(wide_word_word_dists) <- c("word1", current_model$word)

  # Reshape to one row per (word1, word2) pair and tag with the model label.
  gather(wide_word_word_dists, "word2", "cos_dist", -word1) %>%
    select(word1, word2, everything()) %>%
    mutate(model = model_name)
}
# Pairwise cosine values for each of the four embedding models.
mod1 <- get_simplex_word_dists(mod300_5, 300, simlex, "300_5")
mod2 <- get_simplex_word_dists(mod300_15, 300, simlex, "300_15")
mod3 <- get_simplex_word_dists(mod200_5, 200, simlex, "200_5")
mod4 <- get_simplex_word_dists(mod200_15, 200, simlex, "200_15")

# Attach each model's cosine values to the benchmark word pairs and stack the
# four results (in model order). The join keys are given explicitly instead of
# being guessed from shared column names. Pairs with no cosine value in a
# model (a word missing from that model) are dropped.
simlex_distances <- list(mod1, mod2, mod3, mod4) %>%
  lapply(function(model_dists) {
    left_join(simlex, model_dists, by = c("word1", "word2"))
  }) %>%
  bind_rows() %>%
  filter(!is.na(cos_dist))
# Model cosine value against the SimLex-999 rating, one panel per model, with
# a linear fit drawn over the points.
simlex_distances %>%
  ggplot(aes(x = cos_dist, y = sim_lex999)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm") +
  facet_wrap(~model) +
  theme_classic()

# Spearman rank correlation between the SimLex-999 rating and the model cosine
# value, computed separately for each model and printed as one table row per
# model.
simlex_distances %>%
  group_by(model) %>%
  nest() %>%
  mutate(temp = map(
    data,
    ~ tidy(cor.test(.x$sim_lex999, .x$cos_dist, method = "spearman"))
  )) %>%
  select(-data) %>%
  # Name the list-column explicitly: calling unnest() with no columns is
  # deprecated in current tidyr.
  unnest(temp) %>%
  kable()
| model | estimate | statistic | p.value | method | alternative |
|:------|---------:|----------:|--------:|:-------|:------------|
| 300_5 | 0.0303871 | 38328159 | 0.4504484 | Spearman’s rank correlation rho | two.sided |
| 300_15 | 0.0670507 | 36878870 | 0.0955728 | Spearman’s rank correlation rho | two.sided |
| 200_5 | 0.0358485 | 38112274 | 0.3732602 | Spearman’s rank correlation rho | two.sided |
| 200_15 | 0.0850264 | 36168304 | 0.0344311 | Spearman’s rank correlation rho | two.sided |
Word Sim 353
# Model cosine value against the WordSim-353 rating, one panel per model, with
# a linear fit drawn over the points.
simlex_distances %>%
  ggplot(aes(x = cos_dist, y = wordsim353)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm") +
  facet_wrap(~model) +
  theme_classic()

# Spearman rank correlation between the WordSim-353 rating and the model
# cosine value, computed separately for each model and printed as one table
# row per model.
simlex_distances %>%
  group_by(model) %>%
  nest() %>%
  mutate(temp = map(
    data,
    ~ tidy(cor.test(.x$wordsim353, .x$cos_dist, method = "spearman"))
  )) %>%
  select(-data) %>%
  # Name the list-column explicitly: calling unnest() with no columns is
  # deprecated in current tidyr.
  unnest(temp) %>%
  kable()
| model | estimate | statistic | p.value | method | alternative |
|:------|---------:|----------:|--------:|:-------|:------------|
| 300_5 | 0.1837365 | 1314645 | 0.0071732 | Spearman’s rank correlation rho | two.sided |
| 300_15 | 0.3198928 | 1095356 | 0.0000019 | Spearman’s rank correlation rho | two.sided |
| 200_5 | 0.1838892 | 1314399 | 0.0071245 | Spearman’s rank correlation rho | two.sided |
| 200_15 | 0.3272072 | 1083576 | 0.0000010 | Spearman’s rank correlation rho | two.sided |