# Location of the CSV holding both distance measures.
INPATH <- "all_dists.csv"

# Distance table used by every analysis below.
full_df <- read_csv(file = INPATH)

The two sets of distances are weakly, but reliably, correlated.

# Pearson correlation between the two distance measures over every row.
cor.test(x = full_df$embedding_cos_dist, y = full_df$rank_cos_dist)
## 
##  Pearson's product-moment correlation
## 
## data:  full_df$embedding_cos_dist and full_df$rank_cos_dist
## t = 30.91, df = 87498, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.09737088 0.11047956
## sample estimates:
##       cor 
## 0.1039297
# Average both distance measures within each (cluster1, cluster2) pair, then
# test whether the pair-level means are correlated.
# summarize() only peels off the last grouping level, so without ungroup() the
# result would stay grouped by cluster1 and later verbs on mean_full would
# silently run per group.
mean_full <- full_df %>%
  group_by(cluster1, cluster2) %>%
  summarize(mean_embedding = mean(embedding_cos_dist),
            rank_cos_dist = mean(rank_cos_dist)) %>%
  ungroup()

cor.test(mean_full$mean_embedding, mean_full$rank_cos_dist)
## 
##  Pearson's product-moment correlation
## 
## data:  mean_full$mean_embedding and mean_full$rank_cos_dist
## t = 5.2506, df = 2498, p-value = 1.644e-07
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.06554401 0.14309490
## sample estimates:
##       cor 
## 0.1044783

Cluster labels

# Raw cluster file, read unchanged.
labels_raw <- read_csv(file = "full_dim_all_50.csv")

# Cluster -> label lookup, built from the rows whose label is not missing.
labels <- labels_raw %>%
  select(label, cluster) %>%
  filter(!is.na(label))

# Give every word its cluster's label: drop the original label column and
# bring the label back through the lookup.
tidy_labels <- left_join(select(labels_raw, -label), labels, by = "cluster") %>%
  select(label, word, cluster)

DT::datatable(tidy_labels)

Look at which clusters are weird for each language

# Correlation (and its p-value) between the two distance measures for every
# cluster1 x lang combination, with the cluster's label attached.
# summarize() replaces the superseded do() + bare unnest() combination. The
# explicit ungroup() guarantees an ungrouped result, so later slice() calls
# operate on the whole table rather than per group.
corrs <- full_df %>%
  group_by(cluster1, lang) %>%
  summarize(corr = cor(embedding_cos_dist, rank_cos_dist),
            corr.p = cor.test(embedding_cos_dist, rank_cos_dist)$p.value) %>%
  ungroup() %>%
  left_join(labels, by = c("cluster1" = "cluster"))

The 100 most negatively correlated cluster x language pairs

# Keep the 100 most negative correlations across all cluster x language pairs.
# slice() works per group, so any grouping left on corrs is removed first;
# otherwise the cut-off would be 100 rows per group instead of 100 overall.
deviant_cluster_langs <- corrs %>%
  ungroup() %>%
  filter(corr < 0) %>%
  arrange(corr) %>%
  slice(1:100) %>%
  data.frame()

# Number of selected cluster x language pairs per cluster label.
kable(count(deviant_cluster_langs, label))
label n
cognitive_verbs2 1
community 30
money_value 10
speed_accuracy 35
time 24
# Interactive table of the selected cluster x language pairs.
deviant_cluster_langs %>%
  DT::datatable()

Top 5 cluster pairs for each “weird” cluster x lang

# For every selected cluster x language pair, pull its rows from the full
# distance table, label both clusters, and compute the signed gap between the
# two distance measures. Rows are ordered by decreasing absolute gap.
# The join keys are spelled out (the columns corrs was grouped by) instead of
# relying on a natural join over whatever column names happen to match.
# The former group_by()/ungroup() pair around arrange() is dropped because
# arrange() ignores grouping by default, so it had no effect.
max_diffs <- full_df %>%
  right_join(select(deviant_cluster_langs, -corr, -corr.p),
             by = c("cluster1", "lang")) %>%
  rename(label1 = label) %>%
  left_join(labels, by = c("cluster2" = "cluster")) %>%
  rename(label2 = label) %>%
  mutate(dist_diff = embedding_cos_dist - rank_cos_dist) %>%
  select(lang, label1, label2, dist_diff) %>%
  arrange(-abs(dist_diff))

# Five largest absolute gaps within each label1 x lang combination.
max_diffs %>%
  arrange(label1, lang, -abs(dist_diff)) %>%
  group_by(label1, lang) %>%
  slice(1:5) %>%
  data.frame() %>%
  DT::datatable()