# Path of the CSV holding the distance table analysed in this document.
INPATH <- "all_dists.csv"

# Parse the CSV into `full_df`; the later steps read its
# `embedding_cos_dist` and `rank_cos_dist` columns.
full_df <- read_csv(file = INPATH)
The two sets of distances are weakly, but reliably, correlated.
# Correlation test between the two distance measures across all rows.
# The argument expressions are kept verbatim because cor.test() echoes them
# in the "data:" line of its printed result.
cor.test(
  x = full_df$embedding_cos_dist,
  y = full_df$rank_cos_dist
)
##
## Pearson's product-moment correlation
##
## data: full_df$embedding_cos_dist and full_df$rank_cos_dist
## t = 30.91, df = 87498, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.09737088 0.11047956
## sample estimates:
## cor
## 0.1039297
# Average both distance measures within each (cluster1, cluster2) pair.
# Output column names are unchanged: `mean_embedding` and `rank_cos_dist`
# (the latter now holds the per-pair mean, not the raw value).
mean_full <- full_df %>%
  group_by(cluster1, cluster2) %>%
  summarize(
    mean_embedding = mean(embedding_cos_dist),
    rank_cos_dist = mean(rank_cos_dist)
  ) %>%
  # summarize() drops only the last grouping variable, so without this the
  # result would stay grouped by cluster1 and leak into later verbs.
  ungroup()

# Correlation test on the pair-level means.
cor.test(mean_full$mean_embedding, mean_full$rank_cos_dist)
##
## Pearson's product-moment correlation
##
## data: mean_full$mean_embedding and mean_full$rank_cos_dist
## t = 5.2506, df = 2498, p-value = 1.644e-07
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.06554401 0.14309490
## sample estimates:
## cor
## 0.1044783
# Raw cluster file; the steps below use its `label`, `word` and `cluster`
# columns.
labels_raw <- read_csv(file = "full_dim_all_50.csv")

# Cluster -> label lookup, built from the rows whose label is not missing.
labels <- labels_raw %>%
  select(label, cluster) %>%
  filter(!is.na(label))

# Drop the original label column and re-attach labels through the lookup,
# so each row gets whatever label is recorded for its cluster.
tidy_labels <- left_join(
  select(labels_raw, -label),
  labels,
  by = "cluster"
) %>%
  select(label, word, cluster)

# Interactive table of the labelled words.
tidy_labels %>%
  DT::datatable()
# Per (cluster1, lang) correlation between the two distance measures, with
# the p-value of the corresponding correlation test, joined to the cluster
# label of cluster1.
#
# summarize() replaces the superseded do() + argument-less unnest() pair:
# both expressions return one scalar per group, so no list-columns (and no
# unnesting step) are needed. The resulting columns are the same:
# cluster1, lang, corr, corr.p, label.
corrs <- full_df %>%
  group_by(cluster1, lang) %>%
  summarize(
    corr = cor(embedding_cos_dist, rank_cos_dist),
    corr.p = cor.test(embedding_cos_dist, rank_cos_dist)$p.value
  ) %>%
  # summarize() leaves the result grouped by cluster1; return a plain table.
  ungroup() %>%
  left_join(labels, by = c("cluster1" = "cluster"))
# For the rows matched by `deviant_cluster_langs`, compute the signed gap
# between the two distance measures and order rows by the size of that gap,
# largest first.
max_diffs <- full_df %>%
  # NOTE(review): no `by` is given, so the join uses every column the two
  # tables share; `deviant_cluster_langs` is defined elsewhere -- confirm
  # the intended keys and make them explicit.
  right_join(deviant_cluster_langs %>% select(-corr, -corr.p)) %>%
  # The `label` column present after the join becomes label1; the label
  # looked up for cluster2 becomes label2.
  rename(label1 = label) %>%
  left_join(labels, by = c("cluster2" = "cluster")) %>%
  rename(label2 = label) %>%
  mutate(dist_diff = embedding_cos_dist - rank_cos_dist) %>%
  select(lang, label1, label2, dist_diff) %>%
  # arrange() ignores grouping by default, so the former
  # group_by(lang, label1, label2) / ungroup() wrapper had no effect on the
  # ordering and has been removed; the sort is global, as before.
  arrange(desc(abs(dist_diff)))
# Show, for every (label1, lang) combination, the (up to) five rows with the
# largest absolute distance gap as an interactive table.
max_diffs %>%
  arrange(label1, lang, desc(abs(dist_diff))) %>%
  group_by(label1, lang) %>%
  # Rows are already sorted within each group, so the first five are the
  # largest gaps.
  slice(seq_len(5)) %>%
  data.frame() %>%
  DT::datatable()