Clustering distance correlations

Distributions of word distances based on clusters
Distributions of correlations
Dendrograms

Distributions of word distances based on clusters

# Load the cluster distance table and expose its language column as L1_code.
clust_dists <- rename(
  read_feather("../../../../data/processed/clusters/low/imposed/low_all_cluster_models_imposed_all_dists.feather"),
  L1_code = "lang"
)

# Add the ratio of within-cluster to across-cluster distance for every row.
clust_dists_c <- mutate(
  clust_dists,
  within_to_across = within_cluster_distance / across_cluster_distance
)


# Histogram of across-cluster distances, one panel per language.
ggplot(clust_dists_c, aes(x = across_cluster_distance, fill = L1_code)) +
  geom_histogram() +
  facet_wrap(~L1_code) +
  labs(title = "Sum absolute across cluster distances") +
  theme_bw() +
  theme(legend.position = "none")

# Histogram of within-cluster distances, one panel per language.
ggplot(clust_dists_c, aes(x = within_cluster_distance, fill = L1_code)) +
  geom_histogram() +
  facet_wrap(~L1_code) +
  labs(title = "Sum absolute within cluster distances") +
  theme_bw() +
  theme(legend.position = "none")

# Histogram of the within/across distance ratio, one panel per language.
ggplot(clust_dists_c, aes(x = within_to_across, fill = L1_code)) +
  geom_histogram() +
  facet_wrap(~L1_code) +
  labs(title = "Sum absolute within to across") +
  theme_bw() +
  theme(legend.position = "none")

Get country pairwise correlations between word distances (ratio, within, and across)

#' Correlate two languages' word-level cluster distances
#'
#' Filters `d` to the rows of each language, joins the two subsets on `word`,
#' and runs `cor.test()` on the selected distance column of the joined rows.
#'
#' @param lang1,lang2 Values of `L1_code` identifying the two languages.
#' @param d Data frame with at least the columns `L1_code`, `word`, and the
#'   column named by `measure`.
#' @param measure Name of the column in `d` to correlate. Defaults to
#'   `"across_cluster_distance"` (the column this function originally
#'   hard-coded); pass e.g. `"within_cluster_distance"` or
#'   `"within_to_across"` to correlate a different measure.
#' @return A one-row data frame with `lang1`, `lang2`, followed by the tidied
#'   `cor.test()` output minus its `method` and `alternative` columns.
get_pairwise_cluster_distance_corrs_correlations <- function(
    lang1, lang2, d, measure = "across_cluster_distance") {
  # Fail early with a clear message instead of correlating NULL vectors.
  stopifnot(
    is.character(measure),
    length(measure) == 1L,
    measure %in% names(d)
  )

  # Progress marker: one line per language pair.
  print(paste0("====== ", lang1, " ", lang2, " ======"))

  lang1_data <- filter(d, L1_code == lang1)
  lang2_data <- filter(d, L1_code == lang2)

  # Only words present for both languages contribute; shared non-key columns
  # get the ".x" (lang1) and ".y" (lang2) suffixes.
  both_langs <- inner_join(
    lang1_data, lang2_data,
    by = "word", suffix = c(".x", ".y")
  )

  # Correlation between the two languages' distances for the chosen measure.
  cor.test(
    both_langs[[paste0(measure, ".x")]],
    both_langs[[paste0(measure, ".y")]]
  ) %>%
    tidy() %>%
    mutate(lang1 = lang1,
           lang2 = lang2) %>%
    select(-method, -alternative) %>%
    select(lang1, lang2, everything())
}

# Full grid of ordered language pairs (including self-pairs), as character
# columns Var1 and Var2.
all_lang_pairs <- expand.grid(
  Var1 = as.character(LANGS),
  Var2 = as.character(LANGS),
  stringsAsFactors = FALSE
)

# Get the list of unique language pairs: every unordered pair of distinct
# languages exactly once, with lang1 sorting before lang2. combn() on the
# sorted unique codes produces these directly, without building the full grid
# and de-duplicating it, and the columns are character in every R version.
unique_lang_pairs <- local({
  lang_codes <- sort(unique(as.character(LANGS)))

  if (length(lang_codes) < 2) {
    # combn() errors when asked for pairs from fewer than two items; there
    # are simply no pairs in that case.
    data.frame(
      lang1 = character(0),
      lang2 = character(0),
      stringsAsFactors = FALSE
    )
  } else {
    pair_mat <- combn(lang_codes, 2)
    data.frame(
      lang1 = pair_mat[1, ],
      lang2 = pair_mat[2, ],
      stringsAsFactors = FALSE
    )
  }
})

# Run the pairwise correlation for every unique language pair and row-bind
# the one-row results into a single data frame.
all_corrs <- map2_df(
  .x = unique_lang_pairs$lang1,
  .y = unique_lang_pairs$lang2,
  .f = get_pairwise_cluster_distance_corrs_correlations,
  d = clust_dists_c
)

#write_feather(all_corrs, "../../../../data/processed/pairwise_country_distances/low_model_xling_distances_from_cluster_word_across_dists.feather")

Distributions of correlations

# Read the saved pairwise correlations for each distance measure and label
# every table with the measure it holds.
acrossds <- mutate(
  read_feather("../../../../data/processed/pairwise_country_distances/low_model_xling_distances_from_cluster_word_across_dists.feather"),
  measure = "across"
)

withinds <- mutate(
  read_feather("../../../../data/processed/pairwise_country_distances/low_model_xling_distances_from_cluster_word_within_dists.feather"),
  measure = "within"
)

ratiods <- mutate(
  read_feather("../../../../data/processed/pairwise_country_distances/low_model_xling_distances_from_cluster_word_withinvacross_dists.feather"),
  measure = "ratio"
)

# Stack the three tables (across, then within, then ratio).
allds <- bind_rows(acrossds, withinds, ratiods)

# Histogram of the correlation estimates, one panel per distance measure.
allds %>%
  ggplot(aes(x = estimate, fill = measure)) +
  geom_histogram() +
  facet_wrap(~measure) +
  theme_bw() +
  theme(legend.position = "none")

Dendrograms

Ratio dendrogram

# Ratio dendrogram: cluster languages on their profiles of pairwise
# correlations (Euclidean distance between rows of the correlation matrix).
#
# The pair-generation code in this file produces each unordered language pair
# only once (lang1 sorted before lang2). Spreading that directly gives a
# triangular, NA-padded matrix in which the last language never appears as a
# row, so it would be missing from the dendrogram. Mirror the pairs first so
# every language gets a full row; distinct() makes this a no-op if the stored
# table already holds both directions.
wide_corrs <- allds %>%
  filter(measure == "ratio") %>%
  select(lang1, lang2, estimate) %>%
  mutate(lang1 = as.character(lang1),
         lang2 = as.character(lang2)) %>%
  bind_rows(select(., lang1 = lang2, lang2 = lang1, estimate)) %>%
  distinct(lang1, lang2, .keep_all = TRUE) %>%
  spread(lang2, estimate)

all_corrs_mat <- as.matrix(wide_corrs[, -1])
rownames(all_corrs_mat) <- wide_corrs$lang1

# Put columns in the same order as rows, then fill in the self-correlations.
all_corrs_mat <- all_corrs_mat[, rownames(all_corrs_mat), drop = FALSE]
diag(all_corrs_mat) <- 1

dist_matrix <- dist(all_corrs_mat)

ggdendro::ggdendrogram(hclust(dist_matrix)) +
  ggtitle("Ratio distances")

Within dendrogram

# Within dendrogram: cluster languages on their profiles of pairwise
# correlations (Euclidean distance between rows of the correlation matrix).
#
# The pair-generation code in this file produces each unordered language pair
# only once (lang1 sorted before lang2). Spreading that directly gives a
# triangular, NA-padded matrix in which the last language never appears as a
# row, so it would be missing from the dendrogram. Mirror the pairs first so
# every language gets a full row; distinct() makes this a no-op if the stored
# table already holds both directions.
wide_corrs <- allds %>%
  filter(measure == "within") %>%
  select(lang1, lang2, estimate) %>%
  mutate(lang1 = as.character(lang1),
         lang2 = as.character(lang2)) %>%
  bind_rows(select(., lang1 = lang2, lang2 = lang1, estimate)) %>%
  distinct(lang1, lang2, .keep_all = TRUE) %>%
  spread(lang2, estimate)

all_corrs_mat <- as.matrix(wide_corrs[, -1])
rownames(all_corrs_mat) <- wide_corrs$lang1

# Put columns in the same order as rows, then fill in the self-correlations.
all_corrs_mat <- all_corrs_mat[, rownames(all_corrs_mat), drop = FALSE]
diag(all_corrs_mat) <- 1

dist_matrix <- dist(all_corrs_mat)

ggdendro::ggdendrogram(hclust(dist_matrix)) +
  ggtitle("Within distances")

Across dendrogram

# Across dendrogram: cluster languages on their profiles of pairwise
# correlations (Euclidean distance between rows of the correlation matrix).
#
# The pair-generation code in this file produces each unordered language pair
# only once (lang1 sorted before lang2). Spreading that directly gives a
# triangular, NA-padded matrix in which the last language never appears as a
# row, so it would be missing from the dendrogram. Mirror the pairs first so
# every language gets a full row; distinct() makes this a no-op if the stored
# table already holds both directions.
wide_corrs <- allds %>%
  filter(measure == "across") %>%
  select(lang1, lang2, estimate) %>%
  mutate(lang1 = as.character(lang1),
         lang2 = as.character(lang2)) %>%
  bind_rows(select(., lang1 = lang2, lang2 = lang1, estimate)) %>%
  distinct(lang1, lang2, .keep_all = TRUE) %>%
  spread(lang2, estimate)

all_corrs_mat <- as.matrix(wide_corrs[, -1])
rownames(all_corrs_mat) <- wide_corrs$lang1

# Put columns in the same order as rows, then fill in the self-correlations.
all_corrs_mat <- all_corrs_mat[, rownames(all_corrs_mat), drop = FALSE]
diag(all_corrs_mat) <- 1

dist_matrix <- dist(all_corrs_mat)

ggdendro::ggdendrogram(hclust(dist_matrix)) +
  ggtitle("Across distances")

Clustering distance correlations

low scoring essay models

Molly Lewis

2017-12-08

Distributions of word distances based on clusters

Distributions of correlations

Dendrograms