# Human similarity norms ----

# WordSim-353 pair ratings. The path is resolved with here() so it follows the
# project root, the same way the SimLex-999 file in this directory is located.
# NOTE(review): assumes the here() root is the IATLANG project directory that
# the previous absolute path pointed into -- confirm.
WORDSIM_DATA <- here("exploratory_studies/13_europarl/other_data/wordsim353.csv")
wordsim <- read_csv(WORDSIM_DATA) %>%
  janitor::clean_names() %>%
  rename(
    word1 = word_1,
    word2 = word_2,
    wordsim353 = human_mean
  )

# SimLex-999 pair ratings; only the word pair and its rating are kept.
SIMLEX_DATA <- here("exploratory_studies/13_europarl/other_data/SimLex-999.txt")

# Despite its name, `simlex` holds the union of both norm sets: the full join
# keeps every pair found in either source, with NA for a rating a pair lacks.
# The join keys are spelled out so the match never depends on whichever
# columns the two files happen to share.
simlex <- read_tsv(SIMLEX_DATA) %>%
  select(word1, word2, SimLex999) %>%
  janitor::clean_names() %>% # SimLex999 becomes sim_lex999
  full_join(wordsim, by = c("word1", "word2"))
# Embedding models ----
UNTRANS_MODEL <- "/Users/mollylewis/Downloads/europarl_models/untrans_europarl_5ep_en.txt.vec"
TRANS_MODEL <- "/Users/mollylewis/Downloads/europarl_models/trans_f_europarl_5ep_en.txt.vec"
WIKI_MODEL <- "/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/exploratory_analyses/0_exploration/wiki.en.vec"
COCA_MODEL <- "/Users/mollylewis/Desktop/coca_all.vec"

#' Pairwise cosine values between the target words found in one model file
#'
#' Reads a headerless embedding text file (one word followed by `n_dims`
#' numeric columns per row), keeps the rows whose word is in `target_words`,
#' and computes the cosine between every pair of retained word vectors.
#'
#' @param model_path Path to the embedding file.
#' @param target_words Character vector of words to keep.
#' @param n_dims Number of embedding dimensions per word (default 300).
#' @return A data.frame with one row per ordered word pair (self-pairs
#'   included) and columns `word1`, `word2`, `cos_dist`. The column is called
#'   `cos_dist` because later code refers to it by that name, but it holds the
#'   value returned by `coop::cosine()`, i.e. cosine similarity.
load_pairwise_cosines <- function(model_path, target_words, n_dims = 300L) {
  embeddings <- fread(
    model_path,
    header = FALSE,
    skip = 1, # first line is not a word vector (presumably the .vec header)
    quote = "",
    encoding = "UTF-8",
    data.table = TRUE,
    col.names = c("word", paste0("V", seq_len(n_dims) + 1L))
  ) %>%
    filter(word %in% target_words)

  # coop::cosine() compares the columns of its input, so the word-by-dimension
  # matrix is transposed to put one word in each column.
  similarity <- coop::cosine(t(as.matrix(embeddings[, -1])))

  # Unroll the square matrix column by column: word1 indexes the row and
  # word2 the column, so word1 cycles fastest. This is the same row order the
  # previous wide-then-gather reshaping produced.
  words <- embeddings$word
  n_words <- length(words)
  data.frame(
    word1 = rep(words, times = n_words),
    word2 = rep(words, each = n_words),
    cos_dist = as.vector(similarity),
    stringsAsFactors = FALSE
  )
}

# Every word that occurs in either position of any rated pair.
norm_words <- unique(c(simlex$word1, simlex$word2))

long_word_word_dists_untranslated <- load_pairwise_cosines(UNTRANS_MODEL, norm_words)
long_word_word_dists_translated <- load_pairwise_cosines(TRANS_MODEL, norm_words)
long_word_word_dists_wiki <- load_pairwise_cosines(WIKI_MODEL, norm_words)
long_word_word_dists_coca <- load_pairwise_cosines(COCA_MODEL, norm_words)
# Attach each model's pairwise cosine values to the human-rated pairs and
# stack the four results, tagging every row with the model it came from.
# Pairs a model has no value for are kept with NA in cos_dist (left join).
pair_dists_by_model <- list(
  untranslated = long_word_word_dists_untranslated,
  translated = long_word_word_dists_translated,
  wiki = long_word_word_dists_wiki,
  coca = long_word_word_dists_coca
)

simlex_distances <- lapply(names(pair_dists_by_model), function(model_name) {
  simlex %>%
    # Explicit keys: the match is on the word pair only.
    left_join(pair_dists_by_model[[model_name]], by = c("word1", "word2")) %>%
    mutate(model = model_name)
}) %>%
  bind_rows()
Simlex 999
# SimLex-999 rating against model cosine value, one panel per embedding
# model, with a linear fit drawn over the points.
simlex_distances %>%
  ggplot(aes(x = cos_dist, y = sim_lex999)) +
  geom_point(alpha = .2) +
  geom_smooth(method = "lm") +
  facet_wrap(~model) +
  theme_classic()

# Spearman correlation between SimLex-999 ratings and model cosine values,
# computed separately for each embedding model and shown as one table row
# per model.
simlex_distances %>%
  group_by(model) %>%
  nest() %>%
  mutate(
    temp = map(
      data,
      ~ tidy(cor.test(.x$sim_lex999, .x$cos_dist, method = "spearman"))
    )
  ) %>%
  select(-data) %>%
  # Name the list-column to expand; a bare unnest() is deprecated in current
  # tidyr, while this form works in both old and new versions.
  unnest(temp) %>%
  kable()
| model | estimate | statistic | p.value | method | alternative |
|:---|---:|---:|---:|:---|:---|
| untranslated | 0.2529978 | 60225292 | 0 | Spearman’s rank correlation rho | two.sided |
| translated | 0.3257702 | 77363878 | 0 | Spearman’s rank correlation rho | two.sided |
| wiki | 0.3803332 | 102968179 | 0 | Spearman’s rank correlation rho | two.sided |
| coca | 0.3476024 | 108406950 | 0 | Spearman’s rank correlation rho | two.sided |
Word Sim 353
# WordSim-353 rating against model cosine value, one panel per embedding
# model, with a linear fit drawn over the points.
simlex_distances %>%
  ggplot(aes(x = cos_dist, y = wordsim353)) +
  geom_point(alpha = .2) +
  geom_smooth(method = "lm") +
  facet_wrap(~model) +
  theme_classic()

# Spearman correlation between WordSim-353 ratings and model cosine values,
# computed separately for each embedding model and shown as one table row
# per model.
simlex_distances %>%
  group_by(model) %>%
  nest() %>%
  mutate(
    temp = map(
      data,
      ~ tidy(cor.test(.x$wordsim353, .x$cos_dist, method = "spearman"))
    )
  ) %>%
  select(-data) %>%
  # Name the list-column to expand; a bare unnest() is deprecated in current
  # tidyr, while this form works in both old and new versions.
  unnest(temp) %>%
  kable()
| model | estimate | statistic | p.value | method | alternative |
|:---|---:|---:|---:|:---|:---|
| untranslated | 0.4618351 | 1613101 | 0 | Spearman’s rank correlation rho | two.sided |
| translated | 0.4662047 | 2037854 | 0 | Spearman’s rank correlation rho | two.sided |
| wiki | 0.7322051 | 1677960 | 0 | Spearman’s rank correlation rho | two.sided |
| coca | 0.6652850 | 2097270 | 0 | Spearman’s rank correlation rho | two.sided |
Language-wise correlations
Simlex999
# One row per human-rated word pair with the cosine value from each of the
# four models side by side (NA where a model has no value for the pair).
model_dists <- long_word_word_dists_translated %>%
  # Right join keeps every rated pair; keys are explicit, matching the three
  # joins below.
  right_join(simlex, by = c("word1", "word2")) %>%
  rename(cos_dist_trans = cos_dist) %>%
  left_join(
    long_word_word_dists_untranslated %>%
      rename(cos_dist_untrans = cos_dist),
    by = c("word1", "word2")
  ) %>%
  left_join(
    long_word_word_dists_wiki %>%
      rename(cos_dist_wiki = cos_dist),
    by = c("word1", "word2")
  ) %>%
  left_join(
    long_word_word_dists_coca %>%
      rename(cos_dist_coca = cos_dist),
    by = c("word1", "word2")
  ) %>%
  # A pair with a SimLex-999 rating is labelled "sim_lex999" even if it also
  # has a WordSim-353 rating, because the first matching condition wins.
  mutate(
    human_norm = case_when(
      !is.na(sim_lex999) ~ "sim_lex999",
      TRUE ~ "wordsim353"
    )
  )
# Pearson correlations between the models' cosine values, restricted to pairs
# that have a SimLex-999 rating. Columns are referenced through data masking
# rather than `.$`, and `use` is spelled out in full instead of relying on
# partial matching of "complete". With "complete.obs", each correlation drops
# the pairs where either of its two models has no value.
model_dists %>%
  filter(!is.na(sim_lex999)) %>%
  summarize(
    trans_untrans = cor(cos_dist_untrans, cos_dist_trans, use = "complete.obs"),
    trans_coca = cor(cos_dist_coca, cos_dist_trans, use = "complete.obs"),
    trans_wiki = cor(cos_dist_wiki, cos_dist_trans, use = "complete.obs"),
    untrans_coca = cor(cos_dist_untrans, cos_dist_coca, use = "complete.obs"),
    untrans_wiki = cor(cos_dist_untrans, cos_dist_wiki, use = "complete.obs"),
    coca_wiki = cor(cos_dist_coca, cos_dist_wiki, use = "complete.obs")
  ) %>%
  kable()
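| trans_untrans | trans_coca | trans_wiki | untrans_coca | untrans_wiki | coca_wiki |
|---:|---:|---:|---:|---:|---:|
| 0.8137307 | 0.5938353 | 0.6160205 | 0.5408185 | 0.5729829 | 0.8183628 |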
wordsim353
# Pearson correlations between the models' cosine values, restricted to pairs
# that have a WordSim-353 rating. Columns are referenced through data masking
# rather than `.$`, and `use` is spelled out in full instead of relying on
# partial matching of "complete". With "complete.obs", each correlation drops
# the pairs where either of its two models has no value.
model_dists %>%
  filter(!is.na(wordsim353)) %>%
  summarize(
    trans_untrans = cor(cos_dist_untrans, cos_dist_trans, use = "complete.obs"),
    trans_coca = cor(cos_dist_coca, cos_dist_trans, use = "complete.obs"),
    trans_wiki = cor(cos_dist_wiki, cos_dist_trans, use = "complete.obs"),
    untrans_coca = cor(cos_dist_untrans, cos_dist_coca, use = "complete.obs"),
    untrans_wiki = cor(cos_dist_untrans, cos_dist_wiki, use = "complete.obs"),
    coca_wiki = cor(cos_dist_coca, cos_dist_wiki, use = "complete.obs")
  ) %>%
  kable()
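| trans_untrans | trans_coca | trans_wiki | untrans_coca | untrans_wiki | coca_wiki |
|---:|---:|---:|---:|---:|---:|
| 0.8727768 | 0.6285733 | 0.6340114 | 0.60604 | 0.6311732 | 0.8510588 |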