This analysis asks whether the distance between two words in language A is correlated with the distance between the same two words in language B. Here we look at English, German, Hindi, Korean, and Turkish for the “broad academics” prompt (“VC079857”).

Set params.

# ID of the essay prompt analysed here.
PROMPT <- "VC079857"
# L1 codes of the languages whose word-word distances are compared.
LANGS <- c(
  "ENG", "GER", "HIN", "TUR", "KOR"
)

Read in model data.

# Load the word vectors for all languages and move the identifying columns
# (language code, word, type, translation) to the front.
all_models <- "data/all_langs_word_vectors.feather" %>%
  read_feather() %>%
  select(L1_code, word, type, translation, everything())

Within-language word-word distance for all languages

For each word-word pair, we calculate the distance from word_x to word_y, for all word pairs (n = 10000). We then look at the correlation of these distances across semantic space models. Prediction: Languages that are more closely related should have semantic spaces that are more correlated.

# Run the pairwise-distance helper once per language and row-bind the results
# into a single long data frame.
all_prompt_dists <- map_df(LANGS, get_pairwise_dist_beween_words, all_models)

# Reshape to one distance column per language, then drop the first two
# columns (presumably the word identifiers w1/w2 -- confirm against the
# helper's output) so that only the numeric distance columns remain.
dist_matrix <- all_prompt_dists %>%
  spread(L1_code, dist) %>%
  # filter(w1 != w2) %>%
  select(-1:-2) %>%
  as.matrix()

Correlations of word-word distance across languages.

# Language-by-language correlation of word-word distances; each pair of
# columns is correlated over the rows where both are non-missing.
prompt_word_dist_cor_mat <- dist_matrix %>%
  cor(use = "pairwise.complete.obs")

kable(prompt_word_dist_cor_mat)
ENG GER HIN KOR TUR
ENG 1.0000000 0.7610061 0.6050708 0.5214087 0.5091490
GER 0.7610061 1.0000000 0.5300831 0.5568331 0.5102290
HIN 0.6050708 0.5300831 1.0000000 0.5458384 0.5885479
KOR 0.5214087 0.5568331 0.5458384 1.0000000 0.5037011
TUR 0.5091490 0.5102290 0.5885479 0.5037011 1.0000000

Essay measure of language similarity vs. word-word distance correlations

Do our previous measures of language similarity correlate with word-word distance correlations?

Read in all distance measures based on document vectors, and combine these document measures with the word-word distance correlations (each data point is a language pair).

# Directory holding one pairwise-distance CSV per document-based measure;
# files are expected to be named "<measure>_distances.csv".
doc_dist_dir <- "../../all_data/pairwise_country_distances/"
files <- list.files(doc_dist_dir)

# Read every distance file and tag its rows with the measure name and the
# row's language code.
all_doc_distances <- map_df(files, function(x) {
  df <- read_csv(paste0(doc_dist_dir, x))
  # The measure name is the file name up to the literal "_distances.csv".
  # It is constant within a file, so it is computed once here instead of
  # row by row under rowwise(), which also left the result row-wise grouped.
  # fixed = TRUE keeps the "." from being treated as a regex wildcard.
  measure_name <- strsplit(x, "_distances.csv", fixed = TRUE)[[1]][1]
  # NOTE(review): assumes each CSV is a square matrix whose rows are in the
  # same order as its column headers -- confirm against the data files.
  mutate(df, measure = measure_name, L1_code_1 = names(df))
})

# Long format: one row per (measure, language 1, language 2) with its
# distance, keeping only the first row for any repeated combination.
all_doc_distances_long <- distinct(
  gather(all_doc_distances, "L1_code_2", "dist", c(-measure, -L1_code_1)),
  measure, L1_code_1, L1_code_2,
  .keep_all = TRUE
)

# Turn the word-word correlation matrix into the same long layout as the
# document measures, stack the two, and spread back out so that each row is
# a language pair and each column is one measure.
all_dists_with_prompt <- prompt_word_dist_cor_mat %>%
  as.data.frame() %>%
  rownames_to_column(var = "L1_code_1") %>%
  gather("L1_code_2", "dist", -L1_code_1) %>%
  mutate(measure = "word_word_distance") %>%
  select(measure, everything()) %>%
  bind_rows(all_doc_distances_long) %>%
  spread(measure, dist) %>%
  # Keep only pairs of two different languages from LANGS.
  filter(
    L1_code_1 %in% LANGS,
    L1_code_2 %in% LANGS,
    L1_code_1 != L1_code_2
  ) %>%
  # Drop pairs that are missing any measure.
  na.omit()

Correlations between document measures (from previous analyses) and word-word distances

# Correlate every measure with every other; the first two columns are the
# language codes, so they are excluded.
all_dists_with_prompt %>%
  select(-1:-2) %>%
  correlate(use = "pairwise.complete.obs") %>%
  kable()
rowname 2D_b_w 2D_centroid 2D_p_overlap HD_centroid word_word_distance
2D_b_w NA -0.1455326 0.9559525 0.6901092 0.6031255
2D_centroid -0.1455326 NA 0.0333834 -0.1087799 0.4477248
2D_p_overlap 0.9559525 0.0333834 NA 0.6947577 0.6214977
HD_centroid 0.6901092 -0.1087799 0.6947577 NA 0.5330626
word_word_distance 0.6031255 0.4477248 0.6214977 0.5330626 NA

Correlation plots

Overall language essay centroid distance vs. word-word distance

# Disabled: the same plot with `2D_centroid` on the x-axis.
# ggplot(all_dists_with_prompt, aes(x = `2D_centroid`,
#   y = word_word_distance)) +
#   geom_label(aes(label = paste0(L1_code_1, "_", L1_code_2))) +
#   geom_smooth(method = "lm") +
#   theme_bw()

# HD_centroid vs. word-word distance correlation; each point is labelled with
# its language pair, and the line is a linear-model fit.
ggplot(
  data = all_dists_with_prompt,
  mapping = aes(x = HD_centroid, y = word_word_distance)
) +
  geom_label(
    mapping = aes(label = paste(L1_code_1, L1_code_2, sep = "_"))
  ) +
  geom_smooth(method = "lm") +
  theme_bw()

Overlap measures vs. word-word distance

# 2D_p_overlap vs. word-word distance correlation; each point is labelled
# with its language pair, and the line is a linear-model fit.
ggplot(
  data = all_dists_with_prompt,
  mapping = aes(x = `2D_p_overlap`, y = word_word_distance)
) +
  geom_label(
    mapping = aes(label = paste(L1_code_1, L1_code_2, sep = "_"))
  ) +
  geom_smooth(method = "lm") +
  theme_bw()

# 2D_b_w vs. word-word distance correlation; each point is labelled with its
# language pair, and the line is a linear-model fit.
ggplot(
  data = all_dists_with_prompt,
  mapping = aes(x = `2D_b_w`, y = word_word_distance)
) +
  geom_label(
    mapping = aes(label = paste(L1_code_1, L1_code_2, sep = "_"))
  ) +
  geom_smooth(method = "lm") +
  theme_bw()

Suggestive, but we need more data.

Most different vs. most similar words

(for each language pair)

# Despite the name, this is the wide layout: one distance column per
# language, with the identifier columns kept.
dist_matrix_long_form <- all_prompt_dists %>%
  spread(L1_code, dist)

# Every ordered pair of two different languages.
combos <- filter(expand.grid(LANGS, LANGS), Var1 != Var2)

# Run the residual helper for each language pair and row-bind the results.
resids <- map2_df(
  combos$Var1, combos$Var2,
  get_lang_pairwise_residuals,
  dist_matrix_long_form
)

Five most different words for each language pair

# For each language pair, keep the five word pairs with the largest absolute
# residual and number them 1-5.
crit_resids <- resids %>%
  rowwise() %>%
  # Ids for the word pair and the language pair, used for de-duplication
  # (presumably order-independent, so A-B and B-A collapse -- confirm
  # against get_unique_relation_id).
  mutate(
    wpair = get_unique_relation_id(w1, w2),
    lpair = get_unique_relation_id(lang1, lang2)
  ) %>%
  distinct(lpair, wpair, .keep_all = TRUE) %>%
  group_by(lang1, lang2) %>%
  arrange(desc(abs(resid))) %>%
  slice(1:5) %>%
  mutate(n = row_number())

kable(select(crit_resids, lang1, lang2, n, w1, w2, resid))
lang1 lang2 n w1 w2 resid
GER ENG 1 life live -5.161712
GER ENG 2 broad wide -4.174902
GER ENG 3 issue problem -4.121196
GER ENG 4 kind thing 2.019841
GER ENG 5 human people -2.015536
HIN ENG 1 expert specialize -5.273020
HIN ENG 2 opinion view -5.178612
HIN ENG 3 specialist specialize -4.938001
HIN ENG 4 expert specialist -4.585205
HIN ENG 5 subject topic -4.460329
HIN GER 1 opinion view -5.445078
HIN GER 2 expert specialize -5.111721
HIN GER 3 subject topic -5.096890
HIN GER 4 job work -4.998181
HIN GER 5 specialist specialize -4.593314
TUR ENG 1 area field -5.999214
TUR ENG 2 issue topic -5.697727
TUR ENG 3 conclusion result -5.581131
TUR ENG 4 issue subject -5.472325
TUR ENG 5 business work -5.464909
TUR GER 1 issue subject -5.809935
TUR GER 2 business work -5.769715
TUR GER 3 business job -5.708771
TUR GER 4 case situation -5.690853
TUR GER 5 area field -5.643414
TUR HIN 1 area field -6.165838
TUR HIN 2 broad wide -5.891411
TUR HIN 3 information knowledge -5.436387
TUR HIN 4 case situation -5.426014
TUR HIN 5 conclusion result -5.275661
KOR ENG 1 day job -5.146330
KOR ENG 2 talk topic -5.133887
KOR ENG 3 academic student -4.798137
KOR ENG 4 broad wide -4.660774
KOR ENG 5 expert specialist -4.625501
KOR GER 1 day job -5.420826
KOR GER 2 academic student -5.396780
KOR GER 3 research study -4.787820
KOR GER 4 problem question -4.437232
KOR GER 5 expert specialist -4.244000
KOR HIN 1 problem question -5.759544
KOR HIN 2 broad wide -5.469397
KOR HIN 3 talk topic -5.097140
KOR HIN 4 academic student -4.870147
KOR HIN 5 day job -4.770451
KOR TUR 1 talk topic -5.274550
KOR TUR 2 day job -5.162980
KOR TUR 3 academic student -4.962353
KOR TUR 4 problem question -4.933043
KOR TUR 5 issue subject 3.567297

These are mostly English synonyms.

Five most similar words for each language pair

# For each language pair, keep the five word pairs with the smallest absolute
# residual and number them 1-5.
crit_resids <- resids %>%
  rowwise() %>%
  # Ids for the word pair and the language pair, used for de-duplication
  # (presumably order-independent, so A-B and B-A collapse -- confirm
  # against get_unique_relation_id).
  mutate(
    wpair = get_unique_relation_id(w1, w2),
    lpair = get_unique_relation_id(lang1, lang2)
  ) %>%
  distinct(lpair, wpair, .keep_all = TRUE) %>%
  group_by(lang1, lang2) %>%
  arrange(abs(resid)) %>%
  slice(1:5) %>%
  mutate(n = row_number())

kable(select(crit_resids, lang1, lang2, n, w1, w2, resid))
lang1 lang2 n w1 w2 resid
GER ENG 1 business day -0.0000865
GER ENG 2 expert today 0.0000994
GER ENG 3 academic time -0.0001373
GER ENG 4 computer human -0.0003691
GER ENG 5 concentrate hand 0.0004485
HIN ENG 1 easy society -0.0002624
HIN ENG 2 study understand -0.0005094
HIN ENG 3 choice specialization -0.0007034
HIN ENG 4 good skill -0.0008677
HIN ENG 5 conclusion situation -0.0009027
HIN GER 1 great student -0.0002246
HIN GER 2 choose future -0.0007646
HIN GER 3 broad focus -0.0009248
HIN GER 4 knowledge live -0.0011795
HIN GER 5 great life -0.0012007
TUR ENG 1 learn student -0.0000323
TUR ENG 2 day instance -0.0002132
TUR ENG 3 knowledge subject 0.0008540
TUR ENG 4 deep question 0.0009334
TUR ENG 5 job knowledge 0.0011320
TUR GER 1 feel university 0.0002077
TUR GER 2 information question -0.0002385
TUR GER 3 great idea -0.0002898
TUR GER 4 aspect world -0.0004491
TUR GER 5 company topic 0.0005529
TUR HIN 1 computer subject -0.0005786
TUR HIN 2 computer topic -0.0005786
TUR HIN 3 order school 0.0006069
TUR HIN 4 computer easy -0.0008794
TUR HIN 5 business future 0.0009191
KOR ENG 1 case experience -0.0001855
KOR ENG 2 experience make 0.0002437
KOR ENG 3 education today -0.0002646
KOR ENG 4 good order 0.0002882
KOR ENG 5 area today 0.0005000
KOR GER 1 advantage specific 0.0001099
KOR GER 2 day level -0.0003274
KOR GER 3 engineer level -0.0004187
KOR GER 4 academic make 0.0009453
KOR GER 5 student world 0.0010660
KOR HIN 1 choice general -0.0008783
KOR HIN 2 change talk 0.0009510
KOR HIN 3 change people 0.0009848
KOR HIN 4 expert reason -0.0011488
KOR HIN 5 reason specialist -0.0011488
KOR TUR 1 find view -0.0006536
KOR TUR 2 question school -0.0008498
KOR TUR 3 give mind 0.0011447
KOR TUR 4 gain information 0.0012216
KOR TUR 5 field find -0.0014194

These are surprising and - interestingly - different across languages.