This analysis asks whether the distance between two words in language A is correlated with the distance between the same two words in language B. Here we look at English, German, Hindi, Korean, and Turkish for the “broad academics” prompt (“VC079857”).
Set params.
PROMPT <- "VC079857"
LANGS <- c("ENG", "GER", "HIN", "TUR", "KOR")
Read in model data.
all_models <- read_feather("data/all_langs_word_vectors.feather") %>%
  select(L1_code, word, type, translation, everything())
For each language, we calculate the distance between word_x and word_y for all word pairs (n = 10000). We then look at the correlation of these distances across the languages' semantic space models. Prediction: languages that are more closely related should have semantic spaces that are more correlated.
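The helper get_pairwise_dist_beween_words used below is defined elsewhere in the project. As a rough, hypothetical sketch of what it is assumed to do (all pairwise distances between one language's word vectors), assuming the embedding dimensions are the numeric columns of all_models and using Euclidean distance:
# Hypothetical sketch only; the project's real helper may use a different
# distance metric or column layout.
get_pairwise_dist_beween_words_sketch <- function(lang, model_df) {
  lang_df <- filter(model_df, L1_code == lang)
  vecs <- as.matrix(select_if(lang_df, is.numeric))
  rownames(vecs) <- lang_df$word
  as.data.frame(as.matrix(dist(vecs))) %>%   # n x n matrix of pairwise distances
    rownames_to_column("w1") %>%
    gather("w2", "dist", -w1) %>%            # long form: one row per word pair
    mutate(L1_code = lang)
}
A vocabulary of 100 prompt words per language would yield the 10,000 pairs (self-pairs included) mentioned above.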
all_prompt_dists <- map_df(LANGS,
                           get_pairwise_dist_beween_words,
                           all_models)
dist_matrix <- spread(all_prompt_dists, L1_code, dist) %>%
  #filter(w1 != w2) %>%
  select(-1:-2) %>%   # drop the w1/w2 columns, leaving one distance column per language
  as.matrix()
prompt_word_dist_cor_mat <- cor(dist_matrix, use = "pairwise.complete.obs")
kable(prompt_word_dist_cor_mat)
|  | ENG | GER | HIN | KOR | TUR |
|---|---|---|---|---|---|
| ENG | 1.0000000 | 0.7610061 | 0.6050708 | 0.5214087 | 0.5091490 |
| GER | 0.7610061 | 1.0000000 | 0.5300831 | 0.5568331 | 0.5102290 |
| HIN | 0.6050708 | 0.5300831 | 1.0000000 | 0.5458384 | 0.5885479 |
| KOR | 0.5214087 | 0.5568331 | 0.5458384 | 1.0000000 | 0.5037011 |
| TUR | 0.5091490 | 0.5102290 | 0.5885479 | 0.5037011 | 1.0000000 |
Consistent with the prediction, the two most closely related languages, English and German, show the most correlated word-word distances (r = .76), while the remaining pairs fall between roughly .50 and .61. Do our previous measures of language similarity correlate with the word-word distance correlations?
Read in all distance measures based on document vectors, and combine the document measures with the word-word distance correlations (each data point is a language pair).
files <- list.files("../../all_data/pairwise_country_distances/")
# Each file is a square language-by-language distance matrix; recover the row
# labels from the column names and the measure name from the file name.
all_doc_distances <- map_df(paste0("../../all_data/pairwise_country_distances/", files), function(x){
  df <- read_csv(x)
  mutate(df, measure = x, L1_code_1 = names(df))
}) %>%
  rowwise() %>%
  mutate(measure = unlist(str_split(measure, "distances/"))[2],
         measure = unlist(str_split(measure, "_distances.csv"))[1])
all_doc_distances_long <- all_doc_distances %>%
  gather("L1_code_2", "dist", c(-measure, -L1_code_1)) %>%
  distinct(measure, L1_code_1, L1_code_2, .keep_all = TRUE)
all_dists_with_prompt <- prompt_word_dist_cor_mat %>%
  as.data.frame() %>%
  rownames_to_column(var = "L1_code_1") %>%
  mutate(measure = "word_word_distance") %>%
  gather("L1_code_2", "dist", c(-measure, -L1_code_1)) %>%
  select(measure, everything()) %>%
  bind_rows(all_doc_distances_long) %>%
  spread(measure, dist) %>%
  filter(L1_code_1 %in% LANGS,
         L1_code_2 %in% LANGS) %>%
  filter(L1_code_1 != L1_code_2) %>%
  na.omit()
correlate(all_dists_with_prompt[, -1:-2], use = "pairwise.complete.obs") %>%
  kable()
| rowname | 2D_b_w | 2D_centroid | 2D_p_overlap | HD_centroid | word_word_distance |
|---|---|---|---|---|---|
| 2D_b_w | NA | -0.1455326 | 0.9559525 | 0.6901092 | 0.6031255 |
| 2D_centroid | -0.1455326 | NA | 0.0333834 | -0.1087799 | 0.4477248 |
| 2D_p_overlap | 0.9559525 | 0.0333834 | NA | 0.6947577 | 0.6214977 |
| HD_centroid | 0.6901092 | -0.1087799 | 0.6947577 | NA | 0.5330626 |
| word_word_distance | 0.6031255 | 0.4477248 | 0.6214977 | 0.5330626 | NA |
Overall language essay centroid distance vs. word-word distance
#ggplot(all_dists_with_prompt, aes(x = `2D_centroid`,
# y = word_word_distance)) +
# geom_label(aes(label = paste0(L1_code_1, "_", L1_code_2))) +
# geom_smooth(method = "lm") +
# theme_bw()
ggplot(all_dists_with_prompt, aes(x = `HD_centroid`,
                                  y = word_word_distance)) +
  geom_label(aes(label = paste0(L1_code_1, "_", L1_code_2))) +
  geom_smooth(method = "lm") +
  theme_bw()
Overlap measures vs. word-word distance
ggplot(all_dists_with_prompt, aes(x = `2D_p_overlap`,
                                  y = word_word_distance)) +
  geom_label(aes(label = paste0(L1_code_1, "_", L1_code_2))) +
  geom_smooth(method = "lm") +
  theme_bw()
ggplot(all_dists_with_prompt, aes(x = `2D_b_w`,
                                  y = word_word_distance)) +
  geom_label(aes(label = paste0(L1_code_1, "_", L1_code_2))) +
  geom_smooth(method = "lm") +
  theme_bw()
Suggestive, but we need more data.
Word pairs with the largest residuals (for each language pair), i.e., the pairs whose distance in one language is least well predicted by their distance in the other.
dist_matrix_long_form <- spread(all_prompt_dists, L1_code, dist)
combos <- expand.grid(LANGS, LANGS) %>%
  filter(Var1 != Var2)
resids <- map2_df(combos$Var1, combos$Var2,
                  get_lang_pairwise_residuals, dist_matrix_long_form)
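get_lang_pairwise_residuals is another project helper. One plausible implementation, assuming it regresses one language's word-word distances on the other's and returns a residual for each word pair (the real helper may scale the distances or differ in other details):
# Hypothetical sketch only; the project's real helper may differ.
get_lang_pairwise_residuals_sketch <- function(lang1, lang2, dist_long) {
  lang1 <- as.character(lang1)
  lang2 <- as.character(lang2)
  pair_df <- data.frame(w1 = dist_long$w1,
                        w2 = dist_long$w2,
                        d1 = dist_long[[lang1]],
                        d2 = dist_long[[lang2]]) %>%
    na.omit()
  # Predict lang1's distances from lang2's; a large residual flags a word pair
  # whose relative distance disagrees across the two languages.
  fit <- lm(d1 ~ d2, data = pair_df)
  transmute(pair_df,
            lang1 = lang1, lang2 = lang2, w1, w2,
            resid = residuals(fit))
}
map2_df then stacks one such data frame for every ordered language pair in combos.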
crit_resids <- resids %>%
  rowwise() %>%
  mutate(wpair = get_unique_relation_id(w1, w2)) %>%
  mutate(lpair = get_unique_relation_id(lang1, lang2)) %>%
  distinct(lpair, wpair, .keep_all = TRUE) %>%
  group_by(lang1, lang2) %>%
  arrange(-abs(resid)) %>%   # largest absolute residuals first
  slice(1:5) %>%
  mutate(n = 1:n())
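get_unique_relation_id is a small project helper used here to de-duplicate pairs; presumably it returns an order-invariant key so that (w1, w2) and (w2, w1), or (lang1, lang2) and (lang2, lang1), map to the same identifier. A hypothetical stand-in:
# Hypothetical sketch only; the project's real helper may differ.
get_unique_relation_id_sketch <- function(a, b) {
  paste(sort(c(as.character(a), as.character(b))), collapse = "_")
}
Because such a helper works on scalars rather than vectors, the pipeline above wraps the calls in rowwise().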
crit_resids %>%
  select(lang1, lang2, n, w1, w2, resid) %>%
  kable()
| lang1 | lang2 | n | w1 | w2 | resid |
|---|---|---|---|---|---|
| GER | ENG | 1 | life | live | -5.161712 |
| GER | ENG | 2 | broad | wide | -4.174902 |
| GER | ENG | 3 | issue | problem | -4.121196 |
| GER | ENG | 4 | kind | thing | 2.019841 |
| GER | ENG | 5 | human | people | -2.015536 |
| HIN | ENG | 1 | expert | specialize | -5.273020 |
| HIN | ENG | 2 | opinion | view | -5.178612 |
| HIN | ENG | 3 | specialist | specialize | -4.938001 |
| HIN | ENG | 4 | expert | specialist | -4.585205 |
| HIN | ENG | 5 | subject | topic | -4.460329 |
| HIN | GER | 1 | opinion | view | -5.445078 |
| HIN | GER | 2 | expert | specialize | -5.111721 |
| HIN | GER | 3 | subject | topic | -5.096890 |
| HIN | GER | 4 | job | work | -4.998181 |
| HIN | GER | 5 | specialist | specialize | -4.593314 |
| TUR | ENG | 1 | area | field | -5.999214 |
| TUR | ENG | 2 | issue | topic | -5.697727 |
| TUR | ENG | 3 | conclusion | result | -5.581131 |
| TUR | ENG | 4 | issue | subject | -5.472325 |
| TUR | ENG | 5 | business | work | -5.464909 |
| TUR | GER | 1 | issue | subject | -5.809935 |
| TUR | GER | 2 | business | work | -5.769715 |
| TUR | GER | 3 | business | job | -5.708771 |
| TUR | GER | 4 | case | situation | -5.690853 |
| TUR | GER | 5 | area | field | -5.643414 |
| TUR | HIN | 1 | area | field | -6.165838 |
| TUR | HIN | 2 | broad | wide | -5.891411 |
| TUR | HIN | 3 | information | knowledge | -5.436387 |
| TUR | HIN | 4 | case | situation | -5.426014 |
| TUR | HIN | 5 | conclusion | result | -5.275661 |
| KOR | ENG | 1 | day | job | -5.146330 |
| KOR | ENG | 2 | talk | topic | -5.133887 |
| KOR | ENG | 3 | academic | student | -4.798137 |
| KOR | ENG | 4 | broad | wide | -4.660774 |
| KOR | ENG | 5 | expert | specialist | -4.625501 |
| KOR | GER | 1 | day | job | -5.420826 |
| KOR | GER | 2 | academic | student | -5.396780 |
| KOR | GER | 3 | research | study | -4.787820 |
| KOR | GER | 4 | problem | question | -4.437232 |
| KOR | GER | 5 | expert | specialist | -4.244000 |
| KOR | HIN | 1 | problem | question | -5.759544 |
| KOR | HIN | 2 | broad | wide | -5.469397 |
| KOR | HIN | 3 | talk | topic | -5.097140 |
| KOR | HIN | 4 | academic | student | -4.870147 |
| KOR | HIN | 5 | day | job | -4.770451 |
| KOR | TUR | 1 | talk | topic | -5.274550 |
| KOR | TUR | 2 | day | job | -5.162980 |
| KOR | TUR | 3 | academic | student | -4.962353 |
| KOR | TUR | 4 | problem | question | -4.933043 |
| KOR | TUR | 5 | issue | subject | 3.567297 |
These are mostly English synonyms (e.g., issue/problem, broad/wide). Next, we look at the word pairs with the smallest residuals, i.e., those whose relative distances agree best across the two languages.
crit_resids <- resids %>%
  rowwise() %>%
  mutate(wpair = get_unique_relation_id(w1, w2)) %>%
  mutate(lpair = get_unique_relation_id(lang1, lang2)) %>%
  distinct(lpair, wpair, .keep_all = TRUE) %>%
  group_by(lang1, lang2) %>%
  arrange(abs(resid)) %>%   # smallest absolute residuals first
  slice(1:5) %>%
  mutate(n = 1:n())
crit_resids %>%
  select(lang1, lang2, n, w1, w2, resid) %>%
  kable()
| lang1 | lang2 | n | w1 | w2 | resid |
|---|---|---|---|---|---|
| GER | ENG | 1 | business | day | -0.0000865 |
| GER | ENG | 2 | expert | today | 0.0000994 |
| GER | ENG | 3 | academic | time | -0.0001373 |
| GER | ENG | 4 | computer | human | -0.0003691 |
| GER | ENG | 5 | concentrate | hand | 0.0004485 |
| HIN | ENG | 1 | easy | society | -0.0002624 |
| HIN | ENG | 2 | study | understand | -0.0005094 |
| HIN | ENG | 3 | choice | specialization | -0.0007034 |
| HIN | ENG | 4 | good | skill | -0.0008677 |
| HIN | ENG | 5 | conclusion | situation | -0.0009027 |
| HIN | GER | 1 | great | student | -0.0002246 |
| HIN | GER | 2 | choose | future | -0.0007646 |
| HIN | GER | 3 | broad | focus | -0.0009248 |
| HIN | GER | 4 | knowledge | live | -0.0011795 |
| HIN | GER | 5 | great | life | -0.0012007 |
| TUR | ENG | 1 | learn | student | -0.0000323 |
| TUR | ENG | 2 | day | instance | -0.0002132 |
| TUR | ENG | 3 | knowledge | subject | 0.0008540 |
| TUR | ENG | 4 | deep | question | 0.0009334 |
| TUR | ENG | 5 | job | knowledge | 0.0011320 |
| TUR | GER | 1 | feel | university | 0.0002077 |
| TUR | GER | 2 | information | question | -0.0002385 |
| TUR | GER | 3 | great | idea | -0.0002898 |
| TUR | GER | 4 | aspect | world | -0.0004491 |
| TUR | GER | 5 | company | topic | 0.0005529 |
| TUR | HIN | 1 | computer | subject | -0.0005786 |
| TUR | HIN | 2 | computer | topic | -0.0005786 |
| TUR | HIN | 3 | order | school | 0.0006069 |
| TUR | HIN | 4 | computer | easy | -0.0008794 |
| TUR | HIN | 5 | business | future | 0.0009191 |
| KOR | ENG | 1 | case | experience | -0.0001855 |
| KOR | ENG | 2 | experience | make | 0.0002437 |
| KOR | ENG | 3 | education | today | -0.0002646 |
| KOR | ENG | 4 | good | order | 0.0002882 |
| KOR | ENG | 5 | area | today | 0.0005000 |
| KOR | GER | 1 | advantage | specific | 0.0001099 |
| KOR | GER | 2 | day | level | -0.0003274 |
| KOR | GER | 3 | engineer | level | -0.0004187 |
| KOR | GER | 4 | academic | make | 0.0009453 |
| KOR | GER | 5 | student | world | 0.0010660 |
| KOR | HIN | 1 | choice | general | -0.0008783 |
| KOR | HIN | 2 | change | talk | 0.0009510 |
| KOR | HIN | 3 | change | people | 0.0009848 |
| KOR | HIN | 4 | expert | reason | -0.0011488 |
| KOR | HIN | 5 | reason | specialist | -0.0011488 |
| KOR | TUR | 1 | find | view | -0.0006536 |
| KOR | TUR | 2 | question | school | -0.0008498 |
| KOR | TUR | 3 | give | mind | 0.0011447 |
| KOR | TUR | 4 | gain | information | 0.0012216 |
| KOR | TUR | 5 | field | find | -0.0014194 |
These pairs are surprising and, interestingly, they differ across languages.