Read in all four distance measures (centroid vs. word distances x low vs. high)
# Word-level pairwise distances for the "low" and "high" files: keep only the
# language pair, the estimate, and the two labelling columns.
low_word_dists <- select(
  read_csv("paper_data/word_pairwise_distances/xling_corr_pairwise_distances_low.csv"),
  lang1, lang2, estimate, score_group, measure
)
high_word_dists <- select(
  read_csv("paper_data/word_pairwise_distances/xling_corr_pairwise_distances_high.csv"),
  lang1, lang2, estimate, score_group, measure
)

# Centroid distances for the "low" and "high" files: rename `cos_dist` to
# `estimate` so the value column has the same name as in the word-level tables.
low_centroid_dists <- rename(
  read_csv("paper_data/centroid_distances/HD_centroid_distances_low.csv"),
  estimate = cos_dist
)
high_centroid_dists <- rename(
  read_csv("paper_data/centroid_distances/HD_centroid_distances_high.csv"),
  estimate = cos_dist
)
# Stack the four distance tables into one long data frame, then convert every
# character column to a factor.
all_dists_long <- bind_rows(
  low_word_dists,
  high_word_dists,
  low_centroid_dists,
  high_centroid_dists
) %>%
  mutate_if(is.character, as.factor)
# Reshape to wide format: one row per language pair ("id" = lang1_lang2) and
# one column per measure/score-group combination ("<measure>_<score_group>"),
# holding the corresponding estimate.
all_dists_wide <- all_dists_long %>%
  unite("id", lang1, lang2) %>%
  unite("measure", measure, score_group) %>%
  spread(measure, estimate)
# Correlations between all measure columns. Column 1 is the language-pair
# "id" and is dropped; missing values are handled pairwise.
corr_mat <- cor(
  x = all_dists_wide[, -1],
  use = "pairwise.complete.obs"
)

# Matrix of p-values for the same pairwise correlations.
# NOTE(review): `use` is forwarded through `...` to cor.test(), which does not
# appear to have a `use` argument, so it is presumably ignored here -- confirm.
p.mat <- cor.mtest(
  mat = all_dists_wide[, -1],
  conf.level = .95,
  use = "pairwise.complete.obs"
)[["p"]]
# 100-step diverging palette; after rev() the ramp runs blue -> white -> red.
cols <- rev(colorRampPalette(c("red", "white", "blue"))(100))

# Heatmap of the correlation matrix with the coefficients printed in each cell.
# `sig.level` is the p-value cutoff: with insig = "blank", cells whose p-value
# in `p.mat` exceeds it are left blank. It has to be .05 (the complement of the
# .95 confidence level used for the tests); a value of .95 would only hide
# cells with p > .95 and display virtually every correlation as significant.
corrplot(corr_mat, method = "color", col = cols,
         type = "full", order = "original", number.cex = .7,
         addCoef.col = "black",
         p.mat = p.mat, sig.level = .05, insig = "blank",
         tl.col = "black", tl.srt = 90,
         diag = FALSE)
They are all highly correlated with each other.
# Scatter plot of the "high" estimate against the "low" estimate, one panel per
# measure (each with its own axis scales) and a linear fit overlaid.
all_dists_long %>%
  spread(key = score_group, value = estimate) %>%
  ggplot(mapping = aes(x = low, y = high)) +
  geom_point(size = .8) +
  geom_smooth(method = "lm") +
  labs(x = "low distance measure",
       y = "high distance measure") +
  facet_wrap(~measure, scales = "free") +
  theme_classic()
# Draw a dendrogram of languages for one measure / score-group combination.
#
# Arguments:
#   group        value of `score_group` to keep (called with "low" / "high").
#   this_measure value of `measure` to keep.
#   df           long data frame with columns lang1, lang2, estimate,
#                score_group and measure.
#
# Returns the ggdendro::ggdendrogram() plot with a "<measure> and <group>"
# title added.
make_dendogram_from_long_data <- function(group, this_measure, df){
  # Keep the requested subset and spread it so that each lang1 value becomes a
  # column of estimates.
  wide_dists <- df %>%
    filter(measure == this_measure,
           score_group == group) %>%
    select(-score_group, -measure) %>%
    spread(lang1, estimate)

  # The first column supplies the row labels (presumably lang2 -- confirm that
  # the centroid files carry no extra columns); the rest form the matrix.
  lang_matrix <- as.matrix(wide_dists[, -1])
  rownames(lang_matrix) <- unlist(wide_dists[, 1])

  # NOTE(review): dist() is applied to the rows of a table that already holds
  # pairwise distances, so the clustering is on distances between distance
  # profiles -- confirm this is intended.
  lang_tree <- hclust(dist(lang_matrix))

  plot_title <- paste0(this_measure, " and ", group)
  ggdendro::ggdendrogram(lang_tree) +
    ggtitle(plot_title)
}
# Every combination of score group and measure (2 x 2 = 4 rows).
combos <- cross_df(list(
  group = c("low", "high"),
  this_measure = c("corr_word_dists", "cos_centroid_dists")
))

# One dendrogram per combination; the long data frame is forwarded unchanged to
# each call as `df`.
map2(
  .x = combos$group,
  .y = combos$this_measure,
  .f = make_dendogram_from_long_data,
  df = all_dists_long
)
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]