L2ETS Study 2 analyses - xling correlations of pairwise distances

For each language, get distance between each word pair
For each language pair, get correlations between pairwise word distances
Similarity matrix between languages

For each language, get distance between each word pair

For one language at a time, read in the words and their word vectors, compute the distance between every pair of words, and save the result to a feather file.

# get within-language pairwise word distance
#
# Args:
#   d: data frame with a `word` column; every other column is treated as one
#      dimension of the word vector.
#
# Returns:
#   A table with columns `w1`, `w2` and `cos_dist`, holding one row per
#   unordered pair of distinct words (`w1` sorts before `w2`).
#
# NOTE(review): philentropy documents its "cosine" method as
# sum(P * Q) / sqrt(sum(P^2) * sum(Q^2)), which looks like a similarity rather
# than a distance -- confirm that `cos_dist` is meant to hold that value.
get_pairwise_dist_beween_words <- function(d){
  stopifnot("word" %in% names(d))

  # drop the word column by name (not by position) so that the matrix and
  # `words` stay consistent whatever the column order of `d` is
  model_matrix <- d %>%
    select(-word) %>%
    as.matrix()

  words <- d$word

  # square word-by-word matrix; rows are labelled through the `w1` column
  word_word_dists <- philentropy::distance(model_matrix,
                                           method = "cosine") %>%
    as.data.frame() %>%
    mutate(w1 = words)

  colnames(word_word_dists) <- c(words, "w1")

  # long format: one row per (w1, w2) cell of the square matrix
  long_word_word_dists <- gather(word_word_dists, "w2", "cos_dist", -w1)

  # this is a data.table way of getting unique pairs that seems fast:
  # order the two words within each row so that (a, b) and (b, a) collapse
  # onto the same key, then keep the first row for each key
  pair_dt <- as.data.table(long_word_word_dists)
  pair_dt[, c("w1", "w2") := list(pmin(w1, w2), pmax(w1, w2))]

  # drop the diagonal (a word paired with itself)
  long_word_word_dists_f <- unique(pair_dt, by = c("w1", "w2")) %>%
    filter(w1 != w2)

  long_word_word_dists_f
}

# wrapper function for reading and writing
#
# For one language: reads its words and word vectors from feather files,
# computes the pairwise word distances, and writes them to a feather file.
#   lang: language code inserted into the input and output file names
save_pairwise_dists <- function(lang){

  print(paste0("====== ", lang, " ======"))

  # input files for this language
  path <- "../../../data/processed/models/L1_high_low_models/"
  wordvecs_file <- paste0(path, "wordvecs_", lang, "-low.model.feather")
  words_file <- paste0(path, "words_", lang, "-low.model.feather")

  wordvecs <- read_feather(wordvecs_file)
  # the words are stored in a column named `0`
  words <- rename(read_feather(words_file), word = `0`)

  # words go first, followed by the vector columns
  d <- bind_cols(words, wordvecs)

  out_file <- paste0("../../../data/processed/pairwise_word_dists/low/pairwise_word_", lang, "-low.feather")
  write_feather(get_pairwise_dist_beween_words(d), out_file)
}

# get list of languages: the code is the first three characters of each
# file name in the models directory
# NOTE(review): the codes are not de-duplicated -- confirm the directory holds
# exactly one file per language
files <- list.files("../../../data/raw/models/L1_models/")
LANGS <- str_sub(files, start = 1, end = 3)

# do the thing
# walk(LANGS, save_pairwise_dists)

For each language pair, get correlations between pairwise word distances

# get correlations between countries
#
# Reads the saved pairwise word distances of two languages, keeps the word
# pairs present in both, and correlates the two sets of distances.
#
# Args:
#   lang1, lang2: language codes used to build the feather file names.
#
# Returns:
#   The tidied cor.test() result with `lang1` and `lang2` as the first two
#   columns and the `method` and `alternative` columns dropped.
get_pairwise_country_word_pairwise_correlations <- function(lang1, lang2){
  print(paste0("====== ", lang1, " ", lang2, " ======"))

  # read in two sets of pairwise dists, giving each distance column its own
  # name so that both survive the join
  pairwise_dists1 <- read_feather(paste0("../../../data/processed/pairwise_word_dists/low/pairwise_word_", lang1, "-low.feather")) %>%
    rename(l1_cos = "cos_dist")
  pairwise_dists2 <- read_feather(paste0("../../../data/processed/pairwise_word_dists/low/pairwise_word_", lang2, "-low.feather")) %>%
    rename(l2_cos = "cos_dist")

  # inner join two sets of distances on the word pair; the key is spelled out
  # rather than left to merge()'s default of "all shared column names"
  merged_dists <- merge(as.data.table(pairwise_dists1),
                        as.data.table(pairwise_dists2),
                        by = c("w1", "w2"),
                        all = FALSE)

  # get correlation between two sets of distances
  cor_df <- cor.test(merged_dists$l1_cos, merged_dists$l2_cos) %>%
    tidy() %>%
    mutate(lang1 = lang1,
           lang2 = lang2) %>%
    select(-method, -alternative) %>%
    select(lang1, lang2, everything())

  cor_df
}

# get list of ordered pairs of distinct languages
# (each pair appears in both orders, e.g. A-B and B-A; only A-A is removed)
unique_lang_pairs <- expand.grid(LANGS, LANGS) %>%
  mutate_all(as.character) %>%
  rename(lang1 = Var1, lang2 = Var2) %>%
  filter(lang1 != lang2)

# do the thing
# all_corrs <- pmap_df(unique_lang_pairs, get_pairwise_country_word_pairwise_correlations)

# write_feather(all_corrs, "../../../data/processed/pairwise_country_distances/low_model_xling_distances_from_word_dists.feather")

Similarity matrix between languages

(based on pairwise word distances)

# load the saved language-pair results
all_corrs <- read_feather("../../../data/processed/pairwise_country_distances/low_model_xling_distances_from_word_dists.feather")

# keep the first three columns and reshape to wide: one row per lang1,
# one column per lang2, cells filled with `estimate`
wide_corrs <- spread(select(all_corrs, 1:3), lang2, estimate)

# matrix of the lang2 columns, with lang1 as row names
all_corrs_mat <- as.matrix(wide_corrs[, -1])
rownames(all_corrs_mat) <- wide_corrs$lang1

# distances between the rows of the matrix (dist() with its defaults)
dist_matrix <- dist(all_corrs_mat)

# printable version: full square matrix with the upper triangle blanked out
dist_matrix_kable <- as.matrix(dist_matrix)
is.na(dist_matrix_kable) <- upper.tri(dist_matrix_kable)
kable(as.data.frame(dist_matrix_kable), digits = 2)

	ARA	BEN	BUL	CHI	DUT	ENG	FAS	FRE	GER	GRE	GUJ	HIN	IBO	IND	ITA	JPN	KAN	KOR	MAL	MAR	NEP	PAN	POL	POR	RUM	RUS	SPA	TAM	TEL	TGL	THA	TUR	URD	VIE	YOR
ARA	0.00	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA
BEN	0.20	0.00	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA
BUL	0.15	0.16	0.00	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA
CHI	0.21	0.31	0.32	0.00	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA
DUT	0.42	0.33	0.40	0.39	0.00	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA
ENG	0.26	0.14	0.15	0.41	0.40	0.00	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA
FAS	0.09	0.20	0.20	0.14	0.39	0.29	0.00	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA
FRE	0.10	0.18	0.12	0.23	0.38	0.24	0.12	0.00	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA
GER	0.27	0.17	0.19	0.35	0.25	0.20	0.27	0.21	0.00	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA
GRE	0.17	0.11	0.11	0.28	0.31	0.18	0.18	0.11	0.12	0.00	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA
GUJ	0.20	0.22	0.29	0.15	0.37	0.34	0.14	0.23	0.31	0.24	0.00	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA
HIN	0.27	0.11	0.20	0.39	0.35	0.11	0.29	0.26	0.20	0.18	0.30	0.00	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA
IBO	0.20	0.13	0.20	0.24	0.30	0.23	0.17	0.18	0.19	0.14	0.18	0.20	0.00	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA
IND	0.18	0.26	0.28	0.07	0.38	0.37	0.10	0.20	0.32	0.24	0.12	0.34	0.20	0.00	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA
ITA	0.09	0.20	0.16	0.19	0.37	0.27	0.09	0.06	0.23	0.14	0.20	0.28	0.18	0.16	0.00	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA
JPN	0.25	0.33	0.36	0.06	0.40	0.44	0.17	0.26	0.37	0.31	0.17	0.42	0.27	0.10	0.23	0.00	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA
KAN	0.35	0.19	0.31	0.38	0.21	0.26	0.33	0.31	0.19	0.23	0.30	0.18	0.21	0.35	0.33	0.40	0.00	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA
KOR	0.22	0.32	0.34	0.04	0.41	0.43	0.15	0.24	0.36	0.29	0.17	0.40	0.25	0.08	0.20	0.06	0.40	0.00	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA
MAL	0.27	0.11	0.20	0.40	0.35	0.09	0.30	0.25	0.18	0.17	0.31	0.06	0.20	0.35	0.28	0.42	0.18	0.41	0.00	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA
MAR	0.25	0.07	0.21	0.34	0.30	0.16	0.25	0.23	0.18	0.16	0.25	0.08	0.15	0.29	0.25	0.36	0.15	0.35	0.10	0.00	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA
NEP	0.22	0.22	0.30	0.15	0.34	0.34	0.15	0.23	0.30	0.24	0.06	0.31	0.16	0.12	0.21	0.16	0.29	0.16	0.31	0.24	0.00	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA
PAN	0.18	0.12	0.22	0.23	0.35	0.25	0.15	0.19	0.24	0.17	0.12	0.21	0.11	0.18	0.19	0.25	0.24	0.24	0.21	0.15	0.12	0.00	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA
POL	0.12	0.15	0.09	0.27	0.36	0.19	0.16	0.07	0.17	0.08	0.24	0.22	0.17	0.23	0.10	0.30	0.29	0.28	0.21	0.21	0.25	0.19	0.00	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA
POR	0.11	0.18	0.11	0.26	0.37	0.22	0.15	0.05	0.19	0.11	0.25	0.25	0.18	0.23	0.09	0.30	0.31	0.27	0.24	0.22	0.26	0.20	0.06	0.00	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA
RUM	0.22	0.16	0.10	0.37	0.36	0.13	0.26	0.17	0.13	0.12	0.32	0.18	0.22	0.33	0.21	0.40	0.27	0.38	0.16	0.20	0.33	0.25	0.12	0.15	0.00	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA
RUS	0.09	0.19	0.10	0.28	0.44	0.21	0.16	0.09	0.24	0.15	0.26	0.25	0.21	0.25	0.12	0.32	0.35	0.29	0.24	0.25	0.28	0.22	0.10	0.09	0.17	0.00	NA	NA	NA	NA	NA	NA	NA	NA	NA
SPA	0.10	0.16	0.09	0.27	0.38	0.20	0.15	0.06	0.19	0.10	0.24	0.23	0.17	0.23	0.10	0.30	0.31	0.28	0.22	0.21	0.25	0.19	0.06	0.05	0.14	0.07	0.00	NA	NA	NA	NA	NA	NA	NA	NA
TAM	0.28	0.11	0.22	0.38	0.31	0.14	0.28	0.25	0.16	0.17	0.29	0.08	0.18	0.33	0.27	0.40	0.14	0.39	0.07	0.08	0.29	0.20	0.22	0.24	0.18	0.26	0.23	0.00	NA	NA	NA	NA	NA	NA	NA
TEL	0.23	0.19	0.29	0.19	0.31	0.32	0.17	0.24	0.28	0.22	0.08	0.27	0.16	0.16	0.22	0.21	0.26	0.20	0.28	0.21	0.07	0.10	0.25	0.26	0.32	0.28	0.25	0.26	0.00	NA	NA	NA	NA	NA	NA
TGL	0.23	0.10	0.16	0.34	0.29	0.13	0.24	0.19	0.10	0.10	0.28	0.12	0.16	0.30	0.22	0.36	0.17	0.35	0.10	0.11	0.27	0.19	0.15	0.18	0.12	0.21	0.17	0.10	0.25	0.00	NA	NA	NA	NA	NA
THA	0.17	0.29	0.30	0.07	0.42	0.39	0.11	0.21	0.35	0.27	0.15	0.37	0.23	0.06	0.17	0.10	0.39	0.07	0.38	0.33	0.16	0.22	0.24	0.24	0.35	0.25	0.24	0.37	0.20	0.33	0.00	NA	NA	NA	NA
TUR	0.14	0.24	0.25	0.09	0.37	0.34	0.06	0.15	0.29	0.21	0.13	0.33	0.18	0.07	0.12	0.13	0.34	0.10	0.33	0.28	0.14	0.17	0.19	0.18	0.30	0.20	0.18	0.32	0.16	0.27	0.08	0.00	NA	NA	NA
URD	0.20	0.05	0.16	0.31	0.34	0.14	0.20	0.18	0.19	0.12	0.22	0.11	0.14	0.26	0.20	0.34	0.21	0.32	0.12	0.08	0.22	0.12	0.16	0.18	0.17	0.19	0.16	0.13	0.20	0.12	0.29	0.24	0.00	NA	NA
VIE	0.10	0.24	0.23	0.13	0.43	0.32	0.06	0.15	0.30	0.22	0.16	0.32	0.20	0.10	0.11	0.17	0.37	0.14	0.32	0.29	0.18	0.18	0.18	0.18	0.29	0.17	0.17	0.32	0.20	0.28	0.09	0.08	0.24	0.00	NA
YOR	0.19	0.12	0.19	0.25	0.31	0.21	0.17	0.18	0.20	0.14	0.18	0.19	0.04	0.21	0.18	0.27	0.21	0.26	0.19	0.14	0.17	0.12	0.17	0.19	0.21	0.20	0.17	0.18	0.16	0.15	0.24	0.19	0.13	0.21	0

# hierarchical clustering of the languages, drawn as a dendrogram
dist_matrix %>%
  hclust() %>%
  ggdendro::ggdendrogram() +
  ggtitle("Language-pairwise correlations between word distances (low models)")

L2ETS Study 2 analyses - xling correlations of pairwise distances

low scoring essay models

Molly Lewis

2017-11-27

For each language, get distance between each word pair

For each language pair, get correlations between pairwise word distances

Similarity matrix between languages