In this analysis, I ask whether the words that differ across languages vary by language pair.

To answer this, for each language pair, I rank order the words by their residuals (low rank ordering means high residual -> dissimilar), and then divide the ranks into deciles. Then, for each decile, I look at the Spearman correlation of this rank order across language pairs.

# Three-letter codes of the 35 languages included in the analysis.
LANGS <- c(
  "ARA", "BEN", "BUL", "CHI", "DUT", "ENG", "FAS", "FRE", "GER", "GRE",
  "GUJ", "HIN", "IBO", "IND", "ITA", "JPN", "KAN", "KOR", "MAL", "MAR",
  "NEP", "PAN", "POL", "POR", "RUM", "RUS", "SPA", "TAM", "TEL", "TGL",
  "THA", "TUR", "URD", "VIE", "YOR"
)

# Directory the word-level files are read from. The trailing slash matters:
# file paths are formed by pasting a file name directly onto this prefix.
PATH_PREFIX_READ <- "/Volumes/wilbur_the_great/divergent_by_lang/many/"

# NOTE(review): not referenced in the visible code -- presumably a plot font
# size; confirm before removing.
FS <- 10
# Every ordered pair of two *different* languages, as a two-column table
# (lang1, lang2) of character codes.
lang_pairs <- expand.grid(LANGS, LANGS)
lang_pairs <- rename(lang_pairs, lang1 = Var1, lang2 = Var2)
# Drop the diagonal: a language is never paired with itself.
lang_pairs <- filter(lang_pairs, lang1 != lang2)
# expand.grid() returns factor columns by default; store plain strings.
lang_pairs <- mutate_all(lang_pairs, as.character)
# Read every file found in PATH_PREFIX_READ and stack the results row-wise.
# Rows from each file are tagged with `lang_group`, the first three characters
# of that file's name, and `lang_group` is then moved to the first column.
all_words <- PATH_PREFIX_READ %>%
  list.files() %>%
  map_df(function(file_name) {
    file_words <- read_feather(paste0(PATH_PREFIX_READ, file_name))
    mutate(file_words, lang_group = substr(file_name, 1, 3))
  }) %>%
  select(lang_group, everything())

# Bin each language pair's words into deciles of `word_rank`.
#   1. Build a "<lang1>_<lang2>" key and group on it.
#   2. Within each pair, `ranking_group` is the decile (1-10) of `word_rank`.
#   3. After ungrouping, append the decile to the key, so `lang_pair` ends up
#      as "<lang1>_<lang2>_<decile>" -- one identifier per pair-by-decile cell.
all_words_with_group_rank <- all_words %>%
  group_by(lang_pair = paste0(lang1, "_", lang2)) %>%
  mutate(ranking_group = ntile(word_rank, n = 10)) %>%
  ungroup() %>%
  mutate(lang_pair = paste0(lang_pair, "_", ranking_group))

# Spearman correlation of `word_rank` between every pair of `lang_pair` keys
# (language pair plus decile suffix), matching observations on `total.word`.
# NOTE(review): widyr's default `upper = TRUE` appears to return each pair in
# both orders (A-B and B-A) -- confirm, since that would double-count every
# correlation in any downstream summary.
all_corrs <- pairwise_cor(
  all_words_with_group_rank,
  item = lang_pair,       # the things being compared
  feature = total.word,   # the feature linking observations across items
  value = word_rank,      # the value that is correlated
  method = "spearman",
  maximum_size = NULL
)

# Split each "<lang1>_<lang2>_<decile>" key back into its three parts for both
# sides of the comparison, then keep only comparisons made within the same
# decile (g_1 and g_2 are the decile labels of item1 and item2).
all_corrs_tidy <- all_corrs %>%
  separate(col = item1, into = c("l1_1", "l2_1", "g_1"), sep = "_") %>%
  separate(col = item2, into = c("l1_2", "l2_2", "g_2"), sep = "_") %>%
  filter(g_1 == g_2)

# Mean correlation per decile with bootstrapped confidence limits.
#
# Correlations are averaged on the Fisher z scale (atanh) and the summary is
# mapped back with tanh, so `mean`, `ci_lower` and `ci_upper` are on the
# correlation scale. The previous log() transform is undefined for negative
# correlations: those became NaN and were then silently discarded by
# `na.rm = TRUE`, biasing the mean upward. atanh() is defined on the whole
# open interval (-1, 1); only a correlation of exactly +/-1 maps to +/-Inf.
# `na.rm = TRUE` is kept for correlations that are genuinely missing.
means <- all_corrs_tidy %>%
  mutate(correlation = atanh(correlation)) %>%
  group_by(g_2) %>%
  multi_boot_standard(col = "correlation", na.rm = TRUE) %>%
  mutate(
    mean = tanh(mean),
    ci_lower = tanh(ci_lower),
    ci_upper = tanh(ci_upper)
  )

# Point-range plot of the per-decile summary in `means`: one point per decile
# at `mean`, with a bar from `ci_lower` to `ci_upper`.
# `g_2` is a character decile label, so it is converted to numeric for the
# x axis. Breaks are taken from the deciles actually present, so every decile
# gets its own integer tick (default continuous breaks need not fall on
# integers), and the axis gets a readable name instead of the raw expression.
ggplot(means, aes(x = as.numeric(g_2), y = mean)) +
  geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper), size = .2) +
  scale_x_continuous(
    name = "Decile of word rank",
    breaks = sort(unique(as.numeric(means$g_2)))
  ) +
  ggtitle("Mean correlation by decile") +
  theme_bw()