Read in all four distance measures (centroid vs. word distances, each for the low and high score groups)

# Word-level pairwise distances (low and high files): keep only the language
# pair, the estimate, and the two labelling columns.
low_word_dists <- select(
  read_csv("paper_data/word_pairwise_distances/xling_corr_pairwise_distances_low.csv"),
  lang1, lang2, estimate, score_group, measure
)
high_word_dists <- select(
  read_csv("paper_data/word_pairwise_distances/xling_corr_pairwise_distances_high.csv"),
  lang1, lang2, estimate, score_group, measure
)

# Centroid distances (low and high files): rename `cos_dist` to `estimate` so
# the value column has the same name as in the word-distance tables.
low_centroid_dists <- rename(
  read_csv("paper_data/centroid_distances/HD_centroid_distances_low.csv"),
  estimate = cos_dist
)
high_centroid_dists <- rename(
  read_csv("paper_data/centroid_distances/HD_centroid_distances_high.csv"),
  estimate = cos_dist
)

# Stack the four tables into one long table and turn every character column
# into a factor.
all_dists_long <- mutate_if(
  bind_rows(
    low_word_dists,
    high_word_dists,
    low_centroid_dists,
    high_centroid_dists
  ),
  is.character,
  as.factor
)

Pairwise correlations between low and high groups for two measures

# Reshape to wide format: one row per language pair (`id` = lang1_lang2) and
# one column per measure x score-group combination, holding the estimate.
all_dists_wide <- all_dists_long %>%
  unite("id", lang1, lang2) %>%
  unite("measure", measure, score_group) %>%
  spread(measure, estimate)

# Correlate every pair of distance columns. The first column is dropped so
# that only the numeric distance columns enter the correlation.
corr_mat <- cor(all_dists_wide[, -1],
                use = "pairwise.complete.obs")

# Matrix of p-values for the same pairwise correlations.
p.mat <- cor.mtest(all_dists_wide[, -1],
                   conf.level = .95,
                   use = "pairwise.complete.obs")$p

# Diverging palette; rev() flips the red -> white -> blue ramp.
cols <- rev(colorRampPalette(c("red", "white", "blue"))(100))

# `sig.level` is a p-value cutoff, not a confidence level: cells whose p-value
# exceeds it are treated as insignificant and, with insig = "blank", hidden.
# The 95% confidence level used above therefore corresponds to sig.level = .05
# (a value of .95 would only blank cells with p > .95).
corrplot(corr_mat, method = "color", col = cols,
         type = "full", order = "original", number.cex = .7,
         addCoef.col = "black",
         p.mat = p.mat, sig.level = .05, insig = "blank",
         tl.col = "black", tl.srt = 90,
         diag = FALSE)

All four distance measures are highly correlated with each other.

Correlations between distance measures in high and low groups

# Scatter plot of the high-group estimate against the low-group estimate,
# one panel per measure, with a linear fit overlaid. Spreading `score_group`
# produces the `low` and `high` columns that are mapped to the axes.
ggplot(
  spread(all_dists_long, score_group, estimate),
  aes(x = low, y = high)
) +
  geom_point(size = .8) +
  geom_smooth(method = "lm") +
  labs(x = "low distance measure", y = "high distance measure") +
  facet_wrap(~measure, scales = "free") +
  theme_classic()

Dendrograms

# Draw a dendrogram for one measure / score-group combination.
#
# Arguments:
#   group        value of `score_group` to keep
#   this_measure value of `measure` to keep
#   df           long table with columns lang1, lang2, estimate,
#                score_group and measure
#
# Returns the ggdendro plot, titled "<this_measure> and <group>".
make_dendogram_from_long_data <- function(group, this_measure, df){
  # Restrict to the requested combination, then drop the two label columns.
  selected_rows <- filter(df, measure == this_measure, score_group == group)
  pair_estimates <- select(selected_rows, -score_group, -measure)

  # One column per lang1 value; the first column identifies each row.
  lang_table <- spread(pair_estimates, lang1, estimate)
  lang_mat <- as.matrix(lang_table[,-1])
  rownames(lang_mat) <- unlist(lang_table[,1])

  # Cluster the rows hierarchically on the distances between them.
  row_dists <- dist(lang_mat)
  tree <- hclust(row_dists)

  plot_title <- paste0(this_measure, " and ", group)
  ggdendro::ggdendrogram(tree) +
    ggtitle(plot_title)
}


# Every score-group x measure combination, one row each.
combos <- cross_df(list(
  group = c("low", "high"),
  this_measure = c("corr_word_dists", "cos_centroid_dists")
))

# One dendrogram per combination; the long table is forwarded to every call
# as the `df` argument.
map2(
  .x = combos$group,
  .y = combos$this_measure,
  .f = make_dendogram_from_long_data,
  df = all_dists_long
)
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]