This analysis asks whether the distance between two words in language A is correlated with the distance between the same two words in language B. Here we look at English, German, Hindi, Korean, and Turkish for the “broad academics” prompt (“VC079857”).
Set params.
PROMPT <- "VC079857"
LANGS <- c("ENG", "GER", "HIN", "TUR", "KOR")
Read in model data.
all_models <- read_feather("data/all_langs_word_vectors.feather") %>%
  select(L1_code, word, type, translation, everything())
For each language, we calculate the distance between word_x and word_y for all word pairs (n = 10000). We then look at the correlation of these distances across the languages' semantic space models. Prediction: languages that are more closely related should have semantic spaces that are more correlated.
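The helper get_pairwise_dist_beween_words used below is defined elsewhere in the project. As a rough, hypothetical sketch of what it is assumed to do (all pairwise distances between one language's word vectors), assuming the embedding dimensions are the numeric columns of all_models and using Euclidean distance:
# Hypothetical sketch only; the project's real helper may use a different
# distance metric or column layout.
get_pairwise_dist_beween_words_sketch <- function(lang, model_df) {
  lang_df <- filter(model_df, L1_code == lang)
  vecs <- as.matrix(select_if(lang_df, is.numeric))
  rownames(vecs) <- lang_df$word
  as.data.frame(as.matrix(dist(vecs))) %>%   # n x n matrix of pairwise distances
    rownames_to_column("w1") %>%
    gather("w2", "dist", -w1) %>%            # long form: one row per word pair
    mutate(L1_code = lang)
}
A vocabulary of 100 prompt words per language would yield the 10,000 pairs (self-pairs included) mentioned above.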
all_prompt_dists <- map_df(LANGS,
                           get_pairwise_dist_beween_words,
                           all_models)
dist_matrix <- spread(all_prompt_dists, L1_code, dist) %>%
  #filter(w1 != w2) %>%
  select(-1:-2) %>%   # drop the w1/w2 columns, leaving one distance column per language
  as.matrix()
prompt_word_dist_cor_mat <- cor(dist_matrix, use = "pairwise.complete.obs")
kable(prompt_word_dist_cor_mat)
|  | ENG | GER | HIN | KOR | TUR |
|---|---|---|---|---|---|
| ENG | 1.0000000 | 0.7610061 | 0.6050708 | 0.5214087 | 0.5091490 |
| GER | 0.7610061 | 1.0000000 | 0.5300831 | 0.5568331 | 0.5102290 |
| HIN | 0.6050708 | 0.5300831 | 1.0000000 | 0.5458384 | 0.5885479 |
| KOR | 0.5214087 | 0.5568331 | 0.5458384 | 1.0000000 | 0.5037011 |
| TUR | 0.5091490 | 0.5102290 | 0.5885479 | 0.5037011 | 1.0000000 |
Consistent with the prediction, the two most closely related languages, English and German, show the most correlated word-word distances (r = .76), while the remaining pairs fall between roughly .50 and .61. Do our previous measures of language similarity correlate with the word-word distance correlations?
Read in all distance measures based on document vectors, and combine the document measures with the word-word distance correlations (each data point is a language pair).
files <- list.files("../../all_data/pairwise_country_distances/")
# Each file is a square language-by-language distance matrix; recover the row
# labels from the column names and the measure name from the file name.
all_doc_distances <- map_df(paste0("../../all_data/pairwise_country_distances/", files), function(x){
  df <- read_csv(x)
  mutate(df, measure = x, L1_code_1 = names(df))
}) %>%
  rowwise() %>%
  mutate(measure = unlist(str_split(measure, "distances/"))[2],
         measure = unlist(str_split(measure, "_distances.csv"))[1])
all_doc_distances_long <- all_doc_distances %>%
  gather("L1_code_2", "dist", c(-measure, -L1_code_1)) %>%
  distinct(measure, L1_code_1, L1_code_2, .keep_all = TRUE)
all_dists_with_prompt <- prompt_word_dist_cor_mat %>%
  as.data.frame() %>%
  rownames_to_column(var = "L1_code_1") %>%
  mutate(measure = "word_word_distance") %>%
  gather("L1_code_2", "dist", c(-measure, -L1_code_1)) %>%
  select(measure, everything()) %>%
  bind_rows(all_doc_distances_long) %>%
  spread(measure, dist) %>%
  filter(L1_code_1 %in% LANGS,
         L1_code_2 %in% LANGS) %>%
  filter(L1_code_1 != L1_code_2) %>%
  na.omit()
correlate(all_dists_with_prompt[, -1:-2], use = "pairwise.complete.obs") %>%
  kable()
| rowname | 2D_b_w | 2D_centroid | 2D_p_overlap | HD_centroid | word_word_distance |
|---|---|---|---|---|---|
| 2D_b_w | NA | -0.1455326 | 0.9559525 | 0.6901092 | 0.6031255 |
| 2D_centroid | -0.1455326 | NA | 0.0333834 | -0.1087799 | 0.4477248 |
| 2D_p_overlap | 0.9559525 | 0.0333834 | NA | 0.6947577 | 0.6214977 |
| HD_centroid | 0.6901092 | -0.1087799 | 0.6947577 | NA | 0.5330626 |
| word_word_distance | 0.6031255 | 0.4477248 | 0.6214977 | 0.5330626 | NA |
Overall language essay centroid distance vs. word-word distance
#ggplot(all_dists_with_prompt, aes(x = `2D_centroid`,
# y = word_word_distance)) +
# geom_label(aes(label = paste0(L1_code_1, "_", L1_code_2))) +
# geom_smooth(method = "lm") +
# theme_bw()
ggplot(all_dists_with_prompt, aes(x = `HD_centroid`,
                                  y = word_word_distance)) +
  geom_label(aes(label = paste0(L1_code_1, "_", L1_code_2))) +
  geom_smooth(method = "lm") +
  theme_bw()
Overlap measures vs. word-word distance
ggplot(all_dists_with_prompt, aes(x = `2D_p_overlap`,
                                  y = word_word_distance)) +
  geom_label(aes(label = paste0(L1_code_1, "_", L1_code_2))) +
  geom_smooth(method = "lm") +
  theme_bw()
ggplot(all_dists_with_prompt, aes(x = `2D_b_w`,
                                  y = word_word_distance)) +
  geom_label(aes(label = paste0(L1_code_1, "_", L1_code_2))) +
  geom_smooth(method = "lm") +
  theme_bw()
Suggestive, but we need more data.
Word pairs with the largest residuals (for each language pair), i.e., the pairs whose distance in one language is least well predicted by their distance in the other.
dist_matrix_long_form <- spread(all_prompt_dists, L1_code, dist)
combos <- expand.grid(LANGS, LANGS) %>%
  filter(Var1 != Var2)
resids <- map2_df(combos$Var1, combos$Var2,
                  get_lang_pairwise_residuals, dist_matrix_long_form)
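get_lang_pairwise_residuals is another project helper. One plausible implementation, assuming it regresses one language's word-word distances on the other's and returns a residual for each word pair (the real helper may scale the distances or differ in other details):
# Hypothetical sketch only; the project's real helper may differ.
get_lang_pairwise_residuals_sketch <- function(lang1, lang2, dist_long) {
  lang1 <- as.character(lang1)
  lang2 <- as.character(lang2)
  pair_df <- data.frame(w1 = dist_long$w1,
                        w2 = dist_long$w2,
                        d1 = dist_long[[lang1]],
                        d2 = dist_long[[lang2]]) %>%
    na.omit()
  # Predict lang1's distances from lang2's; a large residual flags a word pair
  # whose relative distance disagrees across the two languages.
  fit <- lm(d1 ~ d2, data = pair_df)
  transmute(pair_df,
            lang1 = lang1, lang2 = lang2, w1, w2,
            resid = residuals(fit))
}
map2_df then stacks one such data frame for every ordered language pair in combos.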
crit_resids <- resids %>%
  rowwise() %>%
  mutate(wpair = get_unique_relation_id(w1, w2)) %>%
  mutate(lpair = get_unique_relation_id(lang1, lang2)) %>%
  distinct(lpair, wpair, .keep_all = TRUE) %>%
  group_by(lang1, lang2) %>%
  arrange(-abs(resid)) %>%   # largest absolute residuals first
  slice(1:5) %>%
  mutate(n = 1:n())
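get_unique_relation_id is a small project helper used here to de-duplicate pairs; presumably it returns an order-invariant key so that (w1, w2) and (w2, w1), or (lang1, lang2) and (lang2, lang1), map to the same identifier. A hypothetical stand-in:
# Hypothetical sketch only; the project's real helper may differ.
get_unique_relation_id_sketch <- function(a, b) {
  paste(sort(c(as.character(a), as.character(b))), collapse = "_")
}
Because such a helper works on scalars rather than vectors, the pipeline above wraps the calls in rowwise().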
crit_resids %>%
  select(lang1, lang2, n, w1, w2, resid) %>%
  kable()
| lang1 | lang2 | n | w1 | w2 | resid |
|---|---|---|---|---|---|
| GER | ENG | 1 | life | live | -5.161712 |
| GER | ENG | 2 | broad | wide | -4.174902 |
| GER | ENG | 3 | issue | problem | -4.121196 |
| GER | ENG | 4 | kind | thing | 2.019841 |
| GER | ENG | 5 | human | people | -2.015536 |
| HIN | ENG | 1 | expert | specialize | -5.273020 |
| HIN | ENG | 2 | opinion | view | -5.178612 |
| HIN | ENG | 3 | specialist | specialize | -4.938001 |
| HIN | ENG | 4 | expert | specialist | -4.585205 |
| HIN | ENG | 5 | subject | topic | -4.460329 |
| HIN | GER | 1 | opinion | view | -5.445078 |
| HIN | GER | 2 | expert | specialize | -5.111721 |
| HIN | GER | 3 | subject | topic | -5.096890 |
| HIN | GER | 4 | job | work | -4.998181 |
| HIN | GER | 5 | specialist | specialize | -4.593314 |
| TUR | ENG | 1 | area | field | -5.999214 |
| TUR | ENG | 2 | issue | topic | -5.697727 |
| TUR | ENG | 3 | conclusion | result | -5.581131 |
| TUR | ENG | 4 | issue | subject | -5.472325 |
| TUR | ENG | 5 | business | work | -5.464909 |
| TUR | GER | 1 | issue | subject | -5.809935 |
| TUR | GER | 2 | business | work | -5.769715 |
| TUR | GER | 3 | business | job | -5.708771 |
| TUR | GER | 4 | case | situation | -5.690853 |
| TUR | GER | 5 | area | field | -5.643414 |
| TUR | HIN | 1 | area | field | -6.165838 |
| TUR | HIN | 2 | broad | wide | -5.891411 |
| TUR | HIN | 3 | information | knowledge | -5.436387 |
| TUR | HIN | 4 | case | situation | -5.426014 |
| TUR | HIN | 5 | conclusion | result | -5.275661 |
| KOR | ENG | 1 | day | job | -5.146330 |
| KOR | ENG | 2 | talk | topic | -5.133887 |
| KOR | ENG | 3 | academic | student | -4.798137 |
| KOR | ENG | 4 | broad | wide | -4.660774 |
| KOR | ENG | 5 | expert | specialist | -4.625501 |
| KOR | GER | 1 | day | job | -5.420826 |
| KOR | GER | 2 | academic | student | -5.396780 |
| KOR | GER | 3 | research | study | -4.787820 |
| KOR | GER | 4 | problem | question | -4.437232 |
| KOR | GER | 5 | expert | specialist | -4.244000 |
| KOR | HIN | 1 | problem | question | -5.759544 |
| KOR | HIN | 2 | broad | wide | -5.469397 |
| KOR | HIN | 3 | talk | topic | -5.097140 |
| KOR | HIN | 4 | academic | student | -4.870147 |
| KOR | HIN | 5 | day | job | -4.770451 |
| KOR | TUR | 1 | talk | topic | -5.274550 |
| KOR | TUR | 2 | day | job | -5.162980 |
| KOR | TUR | 3 | academic | student | -4.962353 |
| KOR | TUR | 4 | problem | question | -4.933043 |
| KOR | TUR | 5 | issue | subject | 3.567297 |
These are mostly English synonyms (e.g., issue/problem, broad/wide). Next, we look at the word pairs with the smallest residuals, i.e., those whose relative distances agree best across the two languages.
crit_resids <- resids %>%
  rowwise() %>%
  mutate(wpair = get_unique_relation_id(w1, w2)) %>%
  mutate(lpair = get_unique_relation_id(lang1, lang2)) %>%
  distinct(lpair, wpair, .keep_all = TRUE) %>%
  group_by(lang1, lang2) %>%
  arrange(abs(resid)) %>%   # smallest absolute residuals first
  slice(1:5) %>%
  mutate(n = 1:n())
crit_resids %>%
  select(lang1, lang2, n, w1, w2, resid) %>%
  kable()
| lang1 | lang2 | n | w1 | w2 | resid |
|---|---|---|---|---|---|
| GER | ENG | 1 | business | day | -0.0000865 |
| GER | ENG | 2 | expert | today | 0.0000994 |
| GER | ENG | 3 | academic | time | -0.0001373 |
| GER | ENG | 4 | computer | human | -0.0003691 |
| GER | ENG | 5 | concentrate | hand | 0.0004485 |
| HIN | ENG | 1 | easy | society | -0.0002624 |
| HIN | ENG | 2 | study | understand | -0.0005094 |
| HIN | ENG | 3 | choice | specialization | -0.0007034 |
| HIN | ENG | 4 | good | skill | -0.0008677 |
| HIN | ENG | 5 | conclusion | situation | -0.0009027 |
| HIN | GER | 1 | great | student | -0.0002246 |
| HIN | GER | 2 | choose | future | -0.0007646 |
| HIN | GER | 3 | broad | focus | -0.0009248 |
| HIN | GER | 4 | knowledge | live | -0.0011795 |
| HIN | GER | 5 | great | life | -0.0012007 |
| TUR | ENG | 1 | learn | student | -0.0000323 |
| TUR | ENG | 2 | day | instance | -0.0002132 |
| TUR | ENG | 3 | knowledge | subject | 0.0008540 |
| TUR | ENG | 4 | deep | question | 0.0009334 |
| TUR | ENG | 5 | job | knowledge | 0.0011320 |
| TUR | GER | 1 | feel | university | 0.0002077 |
| TUR | GER | 2 | information | question | -0.0002385 |
| TUR | GER | 3 | great | idea | -0.0002898 |
| TUR | GER | 4 | aspect | world | -0.0004491 |
| TUR | GER | 5 | company | topic | 0.0005529 |
| TUR | HIN | 1 | computer | subject | -0.0005786 |
| TUR | HIN | 2 | computer | topic | -0.0005786 |
| TUR | HIN | 3 | order | school | 0.0006069 |
| TUR | HIN | 4 | computer | easy | -0.0008794 |
| TUR | HIN | 5 | business | future | 0.0009191 |
| KOR | ENG | 1 | case | experience | -0.0001855 |
| KOR | ENG | 2 | experience | make | 0.0002437 |
| KOR | ENG | 3 | education | today | -0.0002646 |
| KOR | ENG | 4 | good | order | 0.0002882 |
| KOR | ENG | 5 | area | today | 0.0005000 |
| KOR | GER | 1 | advantage | specific | 0.0001099 |
| KOR | GER | 2 | day | level | -0.0003274 |
| KOR | GER | 3 | engineer | level | -0.0004187 |
| KOR | GER | 4 | academic | make | 0.0009453 |
| KOR | GER | 5 | student | world | 0.0010660 |
| KOR | HIN | 1 | choice | general | -0.0008783 |
| KOR | HIN | 2 | change | talk | 0.0009510 |
| KOR | HIN | 3 | change | people | 0.0009848 |
| KOR | HIN | 4 | expert | reason | -0.0011488 |
| KOR | HIN | 5 | reason | specialist | -0.0011488 |
| KOR | TUR | 1 | find | view | -0.0006536 |
| KOR | TUR | 2 | question | school | -0.0008498 |
| KOR | TUR | 3 | give | mind | 0.0011447 |
| KOR | TUR | 4 | gain | information | 0.0012216 |
| KOR | TUR | 5 | field | find | -0.0014194 |
These pairs are surprising and, interestingly, they differ across languages.