Read in the words and word vectors for one language, compute all pairwise distances between words, and save the result to a feather file.
# Get within-language pairwise word distances.
#
# Args:
#   d: data frame with a `word` column holding the word labels; every other
#      column is passed to philentropy as part of the word vector
#      (assumed numeric -- TODO confirm against the model files).
#
# Returns:
#   A long table with columns `w1`, `w2` and `cos_dist`: one row per unordered
#   pair of distinct words, where `w1` is the pmin() and `w2` the pmax() of
#   the two words.
#
# NOTE(review): philentropy documents its "cosine" method as
# sum(P * Q) / (sqrt(sum(P^2)) * sqrt(sum(Q^2))), i.e. a similarity, so a
# larger `cos_dist` would mean *closer* words -- confirm before interpreting
# the column as a distance.
# NOTE(review): philentropy::distance() is documented to return a single value
# (not a matrix) when given exactly two rows; this function presumably needs
# at least three words -- verify.
get_pairwise_dist_beween_words <- function(d){
  words <- d$word

  # Drop the label column by name rather than by position, so the numeric
  # matrix is correct wherever `word` sits in `d` (it is looked up by name
  # just above anyway).
  model_matrix <- d %>%
    select(-word) %>%
    as.matrix()

  # square word-by-word matrix, plus a column carrying the row's word
  word_word_dists <- philentropy::distance(model_matrix,
                                           method = "cosine") %>%
    as.data.frame() %>%
    mutate(w1 = words)
  colnames(word_word_dists) <- c(words, "w1")

  # one row per (w1, w2) cell of the square matrix
  long_word_word_dists <- gather(word_word_dists, "w2", "cos_dist", -w1)

  # this is a data.table way of getting unique pairs that seems fast:
  # put each pair into a canonical order, keep the first row per pair,
  # then drop the self-pairs from the diagonal
  long_word_word_dists_f <- unique(
    as.data.table(long_word_word_dists)[
      , c("w1", "w2") := list(pmin(w1, w2), pmax(w1, w2))
    ],
    by = c("w1", "w2")
  ) %>%
    filter(w1 != w2)

  long_word_word_dists_f
}
# Wrapper for one language: read its words and word vectors, compute the
# pairwise word distances, and write them out as a feather file.
#
# Args:
#   lang: language code used in the input and output file names.
save_pairwise_dists <- function(lang){
  print(paste0("====== ", lang, " ======"))

  path <- "../../../data/processed/models/L1_high_low_models/"
  vecs_file <- paste0(path, "wordvecs_", lang, "-low.model.feather")
  words_file <- paste0(path, "words_", lang, "-low.model.feather")
  out_file <- paste0("../../../data/processed/pairwise_word_dists/low/pairwise_word_", lang, "-low.feather")

  # read in wordvecs, then words (the word column is stored under the name `0`)
  vectors <- read_feather(vecs_file)
  word_labels <- rename(read_feather(words_file), word = `0`)

  # labels first, so `word` is the leading column of the combined table
  d <- bind_cols(word_labels, vectors)

  write_feather(get_pairwise_dist_beween_words(d), out_file)
}
# get list of languages: characters 1-3 of each file name in the models folder
files <- list.files(path = "../../../data/raw/models/L1_models/")
LANGS <- str_sub(files, start = 1, end = 3)

# do the thing (left disabled)
# walk(LANGS, save_pairwise_dists)
# Get the correlation between two languages' pairwise word distances.
#
# Args:
#   lang1, lang2: language codes used in the
#     "pairwise_word_<lang>-low.feather" file names.
#
# Returns:
#   A data frame with `lang1`, `lang2`, followed by the tidy()-ed cor.test()
#   columns with `method` and `alternative` removed.
get_pairwise_country_word_pairwise_correlations <- function(lang1, lang2){
  print(paste0("====== ", lang1, " ", lang2, " ======"))

  # path of the saved pairwise-distance file for one language
  dists_file <- function(lang){
    paste0("../../../data/processed/pairwise_word_dists/low/pairwise_word_", lang, "-low.feather")
  }

  # read in two sets of pairwise dists, giving each distance column its own name
  pairiwise_dists1 <- read_feather(dists_file(lang1)) %>%
    rename(l1_cos = "cos_dist")
  pairiwise_dists2 <- read_feather(dists_file(lang2)) %>%
    rename(l2_cos = "cos_dist")

  # inner join two sets of distances; the word-pair key is named explicitly so
  # the join does not depend on whichever column names the files happen to share
  merged_dists <- merge(as.data.table(pairiwise_dists1),
                        as.data.table(pairiwise_dists2),
                        by = c("w1", "w2"),
                        all = FALSE)

  # get correlation between two sets of distances
  cor_df <- cor.test(merged_dists$l1_cos, merged_dists$l2_cos) %>%
    tidy() %>%
    mutate(lang1 = lang1,
           lang2 = lang2) %>%
    select(-method, -alternative) %>%
    select(lang1, lang2, everything())

  cor_df
}
# get list of language pairs: every ordered combination of two different
# languages (both A-B and B-A are kept), as character columns
unique_lang_pairs <- expand.grid(lang1 = LANGS,
                                 lang2 = LANGS,
                                 stringsAsFactors = FALSE) %>%
  filter(lang1 != lang2)

# do the thing (left disabled)
# all_corrs <- pmap_df(unique_lang_pairs, get_pairwise_country_word_pairwise_correlations)
# write_feather(all_corrs, "../../../data/processed/pairwise_country_distances/low_model_xling_distances_from_word_dists.feather")
Distances between languages (based on pairwise word distances)
# load the saved language-pair correlations
all_corrs <- read_feather("../../../data/processed/pairwise_country_distances/low_model_xling_distances_from_word_dists.feather")

# reshape the first three columns to wide form: one row per lang1, one column
# per lang2, cells holding `estimate`
wide_corrs <- spread(select(all_corrs, 1:3), lang2, estimate)

# matrix of estimates, with the languages as row names
all_corrs_mat <- as.matrix(wide_corrs[, -1])
rownames(all_corrs_mat) <- wide_corrs$lang1

# distances between the rows of the matrix (dist() defaults)
dist_matrix <- dist(all_corrs_mat)

# print the distances as a table, blanking the redundant upper triangle
dist_matrix_kable <- as.matrix(dist_matrix)
is.na(dist_matrix_kable) <- upper.tri(dist_matrix_kable)
kable(as.data.frame(dist_matrix_kable), digits = 2)
| ARA | BEN | BUL | CHI | DUT | ENG | FAS | FRE | GER | GRE | GUJ | HIN | IBO | IND | ITA | JPN | KAN | KOR | MAL | MAR | NEP | PAN | POL | POR | RUM | RUS | SPA | TAM | TEL | TGL | THA | TUR | URD | VIE | YOR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ARA | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| BEN | 0.20 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| BUL | 0.15 | 0.16 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| CHI | 0.21 | 0.31 | 0.32 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| DUT | 0.42 | 0.33 | 0.40 | 0.39 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| ENG | 0.26 | 0.14 | 0.15 | 0.41 | 0.40 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| FAS | 0.09 | 0.20 | 0.20 | 0.14 | 0.39 | 0.29 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| FRE | 0.10 | 0.18 | 0.12 | 0.23 | 0.38 | 0.24 | 0.12 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| GER | 0.27 | 0.17 | 0.19 | 0.35 | 0.25 | 0.20 | 0.27 | 0.21 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| GRE | 0.17 | 0.11 | 0.11 | 0.28 | 0.31 | 0.18 | 0.18 | 0.11 | 0.12 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| GUJ | 0.20 | 0.22 | 0.29 | 0.15 | 0.37 | 0.34 | 0.14 | 0.23 | 0.31 | 0.24 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| HIN | 0.27 | 0.11 | 0.20 | 0.39 | 0.35 | 0.11 | 0.29 | 0.26 | 0.20 | 0.18 | 0.30 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| IBO | 0.20 | 0.13 | 0.20 | 0.24 | 0.30 | 0.23 | 0.17 | 0.18 | 0.19 | 0.14 | 0.18 | 0.20 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| IND | 0.18 | 0.26 | 0.28 | 0.07 | 0.38 | 0.37 | 0.10 | 0.20 | 0.32 | 0.24 | 0.12 | 0.34 | 0.20 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| ITA | 0.09 | 0.20 | 0.16 | 0.19 | 0.37 | 0.27 | 0.09 | 0.06 | 0.23 | 0.14 | 0.20 | 0.28 | 0.18 | 0.16 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| JPN | 0.25 | 0.33 | 0.36 | 0.06 | 0.40 | 0.44 | 0.17 | 0.26 | 0.37 | 0.31 | 0.17 | 0.42 | 0.27 | 0.10 | 0.23 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| KAN | 0.35 | 0.19 | 0.31 | 0.38 | 0.21 | 0.26 | 0.33 | 0.31 | 0.19 | 0.23 | 0.30 | 0.18 | 0.21 | 0.35 | 0.33 | 0.40 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| KOR | 0.22 | 0.32 | 0.34 | 0.04 | 0.41 | 0.43 | 0.15 | 0.24 | 0.36 | 0.29 | 0.17 | 0.40 | 0.25 | 0.08 | 0.20 | 0.06 | 0.40 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| MAL | 0.27 | 0.11 | 0.20 | 0.40 | 0.35 | 0.09 | 0.30 | 0.25 | 0.18 | 0.17 | 0.31 | 0.06 | 0.20 | 0.35 | 0.28 | 0.42 | 0.18 | 0.41 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| MAR | 0.25 | 0.07 | 0.21 | 0.34 | 0.30 | 0.16 | 0.25 | 0.23 | 0.18 | 0.16 | 0.25 | 0.08 | 0.15 | 0.29 | 0.25 | 0.36 | 0.15 | 0.35 | 0.10 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| NEP | 0.22 | 0.22 | 0.30 | 0.15 | 0.34 | 0.34 | 0.15 | 0.23 | 0.30 | 0.24 | 0.06 | 0.31 | 0.16 | 0.12 | 0.21 | 0.16 | 0.29 | 0.16 | 0.31 | 0.24 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| PAN | 0.18 | 0.12 | 0.22 | 0.23 | 0.35 | 0.25 | 0.15 | 0.19 | 0.24 | 0.17 | 0.12 | 0.21 | 0.11 | 0.18 | 0.19 | 0.25 | 0.24 | 0.24 | 0.21 | 0.15 | 0.12 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| POL | 0.12 | 0.15 | 0.09 | 0.27 | 0.36 | 0.19 | 0.16 | 0.07 | 0.17 | 0.08 | 0.24 | 0.22 | 0.17 | 0.23 | 0.10 | 0.30 | 0.29 | 0.28 | 0.21 | 0.21 | 0.25 | 0.19 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| POR | 0.11 | 0.18 | 0.11 | 0.26 | 0.37 | 0.22 | 0.15 | 0.05 | 0.19 | 0.11 | 0.25 | 0.25 | 0.18 | 0.23 | 0.09 | 0.30 | 0.31 | 0.27 | 0.24 | 0.22 | 0.26 | 0.20 | 0.06 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| RUM | 0.22 | 0.16 | 0.10 | 0.37 | 0.36 | 0.13 | 0.26 | 0.17 | 0.13 | 0.12 | 0.32 | 0.18 | 0.22 | 0.33 | 0.21 | 0.40 | 0.27 | 0.38 | 0.16 | 0.20 | 0.33 | 0.25 | 0.12 | 0.15 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| RUS | 0.09 | 0.19 | 0.10 | 0.28 | 0.44 | 0.21 | 0.16 | 0.09 | 0.24 | 0.15 | 0.26 | 0.25 | 0.21 | 0.25 | 0.12 | 0.32 | 0.35 | 0.29 | 0.24 | 0.25 | 0.28 | 0.22 | 0.10 | 0.09 | 0.17 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| SPA | 0.10 | 0.16 | 0.09 | 0.27 | 0.38 | 0.20 | 0.15 | 0.06 | 0.19 | 0.10 | 0.24 | 0.23 | 0.17 | 0.23 | 0.10 | 0.30 | 0.31 | 0.28 | 0.22 | 0.21 | 0.25 | 0.19 | 0.06 | 0.05 | 0.14 | 0.07 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA |
| TAM | 0.28 | 0.11 | 0.22 | 0.38 | 0.31 | 0.14 | 0.28 | 0.25 | 0.16 | 0.17 | 0.29 | 0.08 | 0.18 | 0.33 | 0.27 | 0.40 | 0.14 | 0.39 | 0.07 | 0.08 | 0.29 | 0.20 | 0.22 | 0.24 | 0.18 | 0.26 | 0.23 | 0.00 | NA | NA | NA | NA | NA | NA | NA |
| TEL | 0.23 | 0.19 | 0.29 | 0.19 | 0.31 | 0.32 | 0.17 | 0.24 | 0.28 | 0.22 | 0.08 | 0.27 | 0.16 | 0.16 | 0.22 | 0.21 | 0.26 | 0.20 | 0.28 | 0.21 | 0.07 | 0.10 | 0.25 | 0.26 | 0.32 | 0.28 | 0.25 | 0.26 | 0.00 | NA | NA | NA | NA | NA | NA |
| TGL | 0.23 | 0.10 | 0.16 | 0.34 | 0.29 | 0.13 | 0.24 | 0.19 | 0.10 | 0.10 | 0.28 | 0.12 | 0.16 | 0.30 | 0.22 | 0.36 | 0.17 | 0.35 | 0.10 | 0.11 | 0.27 | 0.19 | 0.15 | 0.18 | 0.12 | 0.21 | 0.17 | 0.10 | 0.25 | 0.00 | NA | NA | NA | NA | NA |
| THA | 0.17 | 0.29 | 0.30 | 0.07 | 0.42 | 0.39 | 0.11 | 0.21 | 0.35 | 0.27 | 0.15 | 0.37 | 0.23 | 0.06 | 0.17 | 0.10 | 0.39 | 0.07 | 0.38 | 0.33 | 0.16 | 0.22 | 0.24 | 0.24 | 0.35 | 0.25 | 0.24 | 0.37 | 0.20 | 0.33 | 0.00 | NA | NA | NA | NA |
| TUR | 0.14 | 0.24 | 0.25 | 0.09 | 0.37 | 0.34 | 0.06 | 0.15 | 0.29 | 0.21 | 0.13 | 0.33 | 0.18 | 0.07 | 0.12 | 0.13 | 0.34 | 0.10 | 0.33 | 0.28 | 0.14 | 0.17 | 0.19 | 0.18 | 0.30 | 0.20 | 0.18 | 0.32 | 0.16 | 0.27 | 0.08 | 0.00 | NA | NA | NA |
| URD | 0.20 | 0.05 | 0.16 | 0.31 | 0.34 | 0.14 | 0.20 | 0.18 | 0.19 | 0.12 | 0.22 | 0.11 | 0.14 | 0.26 | 0.20 | 0.34 | 0.21 | 0.32 | 0.12 | 0.08 | 0.22 | 0.12 | 0.16 | 0.18 | 0.17 | 0.19 | 0.16 | 0.13 | 0.20 | 0.12 | 0.29 | 0.24 | 0.00 | NA | NA |
| VIE | 0.10 | 0.24 | 0.23 | 0.13 | 0.43 | 0.32 | 0.06 | 0.15 | 0.30 | 0.22 | 0.16 | 0.32 | 0.20 | 0.10 | 0.11 | 0.17 | 0.37 | 0.14 | 0.32 | 0.29 | 0.18 | 0.18 | 0.18 | 0.18 | 0.29 | 0.17 | 0.17 | 0.32 | 0.20 | 0.28 | 0.09 | 0.08 | 0.24 | 0.00 | NA |
| YOR | 0.19 | 0.12 | 0.19 | 0.25 | 0.31 | 0.21 | 0.17 | 0.18 | 0.20 | 0.14 | 0.18 | 0.19 | 0.04 | 0.21 | 0.18 | 0.27 | 0.21 | 0.26 | 0.19 | 0.14 | 0.17 | 0.12 | 0.17 | 0.19 | 0.21 | 0.20 | 0.17 | 0.18 | 0.16 | 0.15 | 0.24 | 0.19 | 0.13 | 0.21 | 0 |
# hierarchical clustering of the language distances, drawn as a dendrogram
dist_matrix %>%
  hclust() %>%
  ggdendro::ggdendrogram() +
  ggtitle("Language-pairwise correlations between word distances (low models)")