# Load the cluster distance table and expose its `lang` column under the
# name `L1_code`, which is the name the rest of this script refers to.
clust_dists <- rename(
  read_feather("../../../../data/processed/clusters/low/imposed/low_all_cluster_models_imposed_all_dists.feather"),
  L1_code = "lang"
)

# Add the ratio of within-cluster to across-cluster distance for every row.
clust_dists_c <- mutate(
  clust_dists,
  within_to_across = within_cluster_distance / across_cluster_distance
)
# Histogram of across-cluster distances, one panel per language. The fill
# colour duplicates the facet variable, so the legend is suppressed.
ggplot(
  data = clust_dists_c,
  mapping = aes(x = across_cluster_distance, fill = L1_code)
) +
  geom_histogram() +
  facet_wrap(~L1_code) +
  theme_bw() +
  theme(legend.position = "none") +
  ggtitle("Sum absolute across cluster distances")
# Histogram of within-cluster distances, one panel per language. The fill
# colour duplicates the facet variable, so the legend is suppressed.
ggplot(
  data = clust_dists_c,
  mapping = aes(x = within_cluster_distance, fill = L1_code)
) +
  geom_histogram() +
  facet_wrap(~L1_code) +
  theme_bw() +
  theme(legend.position = "none") +
  ggtitle("Sum absolute within cluster distances")
# Histogram of the within/across distance ratio, one panel per language. The
# fill colour duplicates the facet variable, so the legend is suppressed.
ggplot(
  data = clust_dists_c,
  mapping = aes(x = within_to_across, fill = L1_code)
) +
  geom_histogram() +
  facet_wrap(~L1_code) +
  theme_bw() +
  theme(legend.position = "none") +
  ggtitle("Sum absolute within to across")
Get country pairwise correlations between word distances (ratio, within, and across)
#' Correlate one distance measure between two languages
#'
#' Restricts `d` to the rows of each language, pairs those rows on `word`,
#' and runs `cor.test()` on the paired values of `dist_col`.
#'
#' @param lang1,lang2 Language codes, compared against `d$L1_code`.
#' @param d Data frame containing the columns `L1_code`, `word` and
#'   `dist_col`.
#' @param dist_col Name (single string) of the distance column to correlate.
#'   Defaults to `"across_cluster_distance"`, the column this function used
#'   to hard-code, so existing three-argument calls behave as before. Pass
#'   e.g. `"within_cluster_distance"` or `"within_to_across"` to correlate a
#'   different measure without editing the function.
#' @return The tidied `cor.test()` result with `lang1` and `lang2` as the
#'   leading columns and the `method` and `alternative` columns removed.
get_pairwise_cluster_distance_corrs_correlations <- function(
    lang1, lang2, d, dist_col = "across_cluster_distance") {
  # Fail early with a clear message instead of an opaque cor.test() error.
  stopifnot(
    is.character(dist_col),
    length(dist_col) == 1L,
    dist_col %in% names(d)
  )

  # Progress marker; one line is printed per language pair.
  print(paste0("====== ", lang1, " ", lang2, " ======"))

  lang1_data <- filter(d, L1_code == lang1)
  lang2_data <- filter(d, L1_code == lang2)

  # Keep only words present for both languages. Every non-key column exists
  # in both inputs, so the join suffixes them with `.x` (lang1) / `.y` (lang2).
  both_langs <- inner_join(lang1_data, lang2_data, by = "word")

  # get correlation between two sets of distances
  cor.test(
    both_langs[[paste0(dist_col, ".x")]],
    both_langs[[paste0(dist_col, ".y")]]
  ) %>%
    tidy() %>%
    mutate(
      lang1 = lang1,
      lang2 = lang2
    ) %>%
    select(-method, -alternative) %>%
    select(lang1, lang2, everything())
}
# get list of unique country pairs

# Full LANGS x LANGS grid as character columns `Var1` / `Var2`. It is no
# longer needed to build the pair list below, but is still defined here in
# case other parts of the file refer to it.
all_lang_pairs <- expand.grid(
  Var1 = as.character(LANGS),
  Var2 = as.character(LANGS),
  stringsAsFactors = FALSE
)

# Every unordered pair of distinct languages, exactly once. Within a pair the
# two codes are sorted so that `lang1` precedes `lang2`. combn() enumerates
# the pairs directly, in the same order the previous grid -> row-wise sort ->
# unique() approach produced them, and the columns are always plain character
# vectors regardless of R version.
unique_lang_pairs <- local({
  codes <- unique(as.character(LANGS))
  if (length(codes) < 2L) {
    # combn() errors when asked for pairs from fewer than two items; the
    # correct answer in that case is simply "no pairs".
    data.frame(
      lang1 = character(0),
      lang2 = character(0),
      stringsAsFactors = FALSE
    )
  } else {
    pairs <- combn(codes, 2L, FUN = sort)  # 2 x n_pairs character matrix
    data.frame(
      lang1 = pairs[1L, ],
      lang2 = pairs[2L, ],
      stringsAsFactors = FALSE
    )
  }
})
# Run the pairwise correlation for every unique language pair and row-bind
# the per-pair results into a single data frame.
all_corrs <- map2_df(
  .x = unique_lang_pairs$lang1,
  .y = unique_lang_pairs$lang2,
  .f = get_pairwise_cluster_distance_corrs_correlations,
  d = clust_dists_c
)
#write_feather(all_corrs, "../../../../data/processed/pairwise_country_distances/low_model_xling_distances_from_cluster_word_across_dists.feather")
# Load the saved pairwise correlation tables for the three distance measures
# and label each table with the measure it holds.
acrossds <- mutate(
  read_feather("../../../../data/processed/pairwise_country_distances/low_model_xling_distances_from_cluster_word_across_dists.feather"),
  measure = "across"
)
withinds <- mutate(
  read_feather("../../../../data/processed/pairwise_country_distances/low_model_xling_distances_from_cluster_word_within_dists.feather"),
  measure = "within"
)
ratiods <- mutate(
  read_feather("../../../../data/processed/pairwise_country_distances/low_model_xling_distances_from_cluster_word_withinvacross_dists.feather"),
  measure = "ratio"
)

# Stack the three tables; `measure` identifies where each row came from.
allds <- bind_rows(acrossds, withinds, ratiods)
# Histogram of the pairwise correlation estimates, one panel per measure.
# Fill colour duplicates the facet variable, so the legend is suppressed.
ggplot(data = allds, mapping = aes(x = estimate)) +
  geom_histogram(mapping = aes(fill = measure)) +
  facet_wrap(~measure) +
  theme_bw() +
  theme(legend.position = "none")
Ratio dendrogram
# Reshape the "ratio" correlations to wide form: one row per lang1, one
# column per lang2, cells holding the correlation estimate.
# select(1:3) is positional; the lines below need those three columns to be
# lang1, lang2 and estimate.
wide_corrs <- spread(
  select(filter(allds, measure == "ratio"), 1:3),
  lang2,
  estimate
)

# Drop the lang1 column from the matrix body and use it as row names instead.
all_corrs_mat <- as.matrix(wide_corrs[, -1])
rownames(all_corrs_mat) <- wide_corrs$lang1

# NOTE(review): if the table holds each unordered language pair only once
# (as the pair list built earlier in this file does), this matrix is
# triangular with NAs and dist() compares rows over their shared non-missing
# columns only -- confirm that is the intended input for the clustering.
dist_matrix <- dist(all_corrs_mat)

# Cluster the rows (hclust defaults) and draw the tree.
dist_matrix %>%
  hclust() %>%
  ggdendro::ggdendrogram() +
  ggtitle("Ratio distances")
Within dendrogram
# Reshape the "within" correlations to wide form: one row per lang1, one
# column per lang2, cells holding the correlation estimate.
# select(1:3) is positional; the lines below need those three columns to be
# lang1, lang2 and estimate.
wide_corrs <- spread(
  select(filter(allds, measure == "within"), 1:3),
  lang2,
  estimate
)

# Drop the lang1 column from the matrix body and use it as row names instead.
all_corrs_mat <- as.matrix(wide_corrs[, -1])
rownames(all_corrs_mat) <- wide_corrs$lang1

# NOTE(review): if the table holds each unordered language pair only once
# (as the pair list built earlier in this file does), this matrix is
# triangular with NAs and dist() compares rows over their shared non-missing
# columns only -- confirm that is the intended input for the clustering.
dist_matrix <- dist(all_corrs_mat)

# Cluster the rows (hclust defaults) and draw the tree.
dist_matrix %>%
  hclust() %>%
  ggdendro::ggdendrogram() +
  ggtitle("Within distances")
Across dendrogram
# Reshape the "across" correlations to wide form: one row per lang1, one
# column per lang2, cells holding the correlation estimate.
# select(1:3) is positional; the lines below need those three columns to be
# lang1, lang2 and estimate.
wide_corrs <- spread(
  select(filter(allds, measure == "across"), 1:3),
  lang2,
  estimate
)

# Drop the lang1 column from the matrix body and use it as row names instead.
all_corrs_mat <- as.matrix(wide_corrs[, -1])
rownames(all_corrs_mat) <- wide_corrs$lang1

# NOTE(review): if the table holds each unordered language pair only once
# (as the pair list built earlier in this file does), this matrix is
# triangular with NAs and dist() compares rows over their shared non-missing
# columns only -- confirm that is the intended input for the clustering.
dist_matrix <- dist(all_corrs_mat)

# Cluster the rows (hclust defaults) and draw the tree.
dist_matrix %>%
  hclust() %>%
  ggdendro::ggdendrogram() +
  ggtitle("Across distances")