In the previous analysis, I looked at the correlation between rank values for each family and each cluster and identified anomolous family-cluster pairs (cases where the correlation was relatively more negative).
In this analysis, I go back to the original word embeddings and caluclate a centroid for the clusters determined by the tensor analysis, and look at the distribution of other centroids around the target.
Each plot below takes an anomalous cluster, and shows the 10 closest word to that cluster for each language family. The goal is to understand why an anomalous language is anamoulous.
The language-family mappings and the word-cluster mappings are at the bottom.
# Read in all centroids
CENTROID_PATH <- "scripts/5_clustering_embeddings/data/embedding_cluster_centroids_by_fam_"
FAMS <- c("Afro-Asiatic","Indo-European","Sino-Tibetan","Atlantic-Congo",
"Austronesian", "Japonic","Dravidian","Korean","TaiKadai", "Turkic","Austroasiatic")
centroids <- map_df(FAMS, function(x,y){
path <- paste0(y, x, ".csv")
read_csv(path) %>%
mutate(family = x) %>%
select(family, cluster, everything())
}, CENTROID_PATH)
# Merge in cluster labels
CLUSTER_PATH <- "fam_sub_corrs_labeled.csv"
cluster_labs <- read_csv(CLUSTER_PATH) %>%
select(cluster, label) %>%
rename(cluster_label = label) %>%
distinct(cluster, cluster_label)
centroids_with_labs <- centroids %>%
left_join(cluster_labs)
# Do, tsne on embedding cluster centroids
# get tsne coordinates
tsne_outF = Rtsne::Rtsne(centroids_with_labs[,-1:-2],
check_duplicates = F)
tsne_dimsF <- tsne_outF$Y %>%
as.data.frame() %>%
rename(tsne_X = V1,
tsne_Y = V2) %>%
bind_cols(centroids_with_labs %>% select(family, cluster_label))
PAIRWISE_PATH <- "scripts/5_clustering_embeddings/data/all_centroid_pairwise_dists.csv"
pairwise_dists <- read_csv(PAIRWISE_PATH)
CLUSTER = "time"
### scatterplot
dists <- pairwise_dists %>%
filter(cluster1 == CLUSTER) %>%
arrange(family, -cos_dist) %>%
group_by(family) %>%
slice(1:11) %>%
mutate(rank = 1:11)
dists %>%
left_join(tsne_dimsF, by = c("family", "cluster2" = "cluster_label")) %>%
mutate(word_type = ifelse(cluster2 == CLUSTER, "target_cluster", "similiar_cluster")) %>%
ggplot(aes(x = tsne_X, y = tsne_Y)) +
geom_text(aes(label = cluster2, color = word_type), size = 2) +
facet_wrap(~ family, ncol = 5) +
ggtitle(toupper(CLUSTER)) +
theme_classic() +
theme(legend.position = "none")
dists %>%
filter(cluster1 != cluster2) %>%
ggplot(aes(x = family, y = rank, fill = factor(cluster2), alpha = cos_dist)) +
geom_tile() +
geom_text(aes(label = cluster2), size = 1.3) +
theme_classic() +
ggtitle(toupper(CLUSTER)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
legend.position = "none")
CLUSTER = "alone_vs_groups"
dists <- pairwise_dists %>%
filter(cluster1 == CLUSTER) %>%
arrange(family, -cos_dist) %>%
group_by(family) %>%
slice(1:11) %>%
mutate(rank = 1:11)
dists %>%
filter(cluster1 != cluster2) %>%
ggplot(aes(x = family, y = rank, fill = factor(cluster2), alpha = cos_dist)) +
geom_tile() +
geom_text(aes(label= cluster2), size = 1.3) +
theme_classic() +
ggtitle(toupper(CLUSTER)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
legend.position = "none")
knowledge, large conflict more, community more commerce less
CLUSTER = "leisure_activities"
dists <- pairwise_dists %>%
filter(cluster1 == CLUSTER) %>%
arrange(family, -cos_dist) %>%
group_by(family) %>%
slice(1:11) %>%
mutate(rank = 1:11)
dists %>%
filter(cluster1 != cluster2) %>%
ggplot(aes(x = family, y = rank, fill = factor(cluster2), alpha = cos_dist)) +
geom_tile() +
geom_text(aes(label= cluster2), size = 1.3) +
theme_classic() +
ggtitle(toupper(CLUSTER)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
legend.position = "none")
large conflict more quantity less, forward progress less, cause and effect more
CLUSTER = "technology_media"
dists <- pairwise_dists %>%
filter(cluster1 == CLUSTER) %>%
arrange(family, -cos_dist) %>%
group_by(family) %>%
slice(1:11) %>%
mutate(rank = 1:11)
dists %>%
filter(cluster1 != cluster2) %>%
ggplot(aes(x = family, y = rank, fill = factor(cluster2), alpha = cos_dist)) +
geom_tile() +
geom_text(aes(label= cluster2), size = 1.3) +
theme_classic() +
ggtitle(toupper(CLUSTER)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
legend.position = "none")
University more, travel less
CLUSTER = "time_money"
dists <- pairwise_dists %>%
filter(cluster1 == CLUSTER) %>%
arrange(family, -cos_dist) %>%
group_by(family) %>%
slice(1:11) %>%
mutate(rank = 1:11)
dists %>%
filter(cluster1 != cluster2) %>%
ggplot(aes(x = family, y = rank, fill = factor(cluster2), alpha = cos_dist)) +
geom_tile() +
geom_text(aes(label= cluster2), size = 1.3) +
theme_classic() +
ggtitle(toupper(CLUSTER)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
legend.position = "none")
forward progress less
CLUSTER = "enivornment"
dists <- pairwise_dists %>%
filter(cluster1 == CLUSTER) %>%
arrange(family, -cos_dist) %>%
group_by(family) %>%
slice(1:11) %>%
mutate(rank = 1:11)
dists %>%
filter(cluster1 != cluster2) %>%
ggplot(aes(x = family, y = rank, fill = factor(cluster2), alpha = cos_dist)) +
geom_tile() +
geom_text(aes(label= cluster2), size = 1.3) +
theme_classic() +
ggtitle(toupper(CLUSTER)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
legend.position = "none")
CLUSTER = "scientific_method"
dists <- pairwise_dists %>%
filter(cluster1 == CLUSTER) %>%
arrange(family, -cos_dist) %>%
group_by(family) %>%
slice(1:11) %>%
mutate(rank = 1:11)
dists %>%
filter(cluster1 != cluster2) %>%
ggplot(aes(x = family, y = rank, fill = factor(cluster2), alpha = cos_dist)) +
geom_tile() +
geom_text(aes(label= cluster2), size = 1.3) +
theme_classic() +
ggtitle(toupper(CLUSTER)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
legend.position = "none")
More passions, future aspirations
CLUSTER = "normative_life"
dists <- pairwise_dists %>%
filter(cluster1 == CLUSTER) %>%
arrange(family, -cos_dist) %>%
group_by(family) %>%
slice(1:11) %>%
mutate(rank = 1:11)
dists %>%
filter(cluster1 != cluster2) %>%
ggplot(aes(x = family, y = rank, fill = factor(cluster2), alpha = cos_dist)) +
geom_tile() +
geom_text(aes(label= cluster2), size = 1.3) +
theme_classic() +
ggtitle(toupper(CLUSTER)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
legend.position = "none")
More emotion, less community
CLUSTER = "bodily_functions"
dists <- pairwise_dists %>%
filter(cluster1 == CLUSTER) %>%
arrange(family, -cos_dist) %>%
group_by(family) %>%
slice(1:11) %>%
mutate(rank = 1:11)
dists %>%
filter(cluster1 != cluster2) %>%
ggplot(aes(x = family, y = rank, fill = factor(cluster2), alpha = cos_dist)) +
geom_tile() +
geom_text(aes(label= cluster2), size = 1.3) +
theme_classic() +
ggtitle(toupper(CLUSTER)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
legend.position = "none")
More emotion, less community
CLUSTER = "knowledge"
dists <- pairwise_dists %>%
filter(cluster1 == CLUSTER) %>%
arrange(family, -cos_dist) %>%
group_by(family) %>%
slice(1:11) %>%
mutate(rank = 1:11)
dists %>%
filter(cluster1 != cluster2) %>%
ggplot(aes(x = family, y = rank, fill = factor(cluster2), alpha = cos_dist)) +
geom_tile() +
geom_text(aes(label= cluster2), size = 1.3) +
theme_classic() +
ggtitle(toupper(CLUSTER)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
legend.position = "none")
less influence_change?
CLUSTER = "cause_effect"
dists <- pairwise_dists %>%
filter(cluster1 == CLUSTER) %>%
arrange(family, -cos_dist) %>%
group_by(family) %>%
slice(1:11) %>%
mutate(rank = 1:11)
dists %>%
filter(cluster1 != cluster2) %>%
ggplot(aes(x = family, y = rank, fill = factor(cluster2), alpha = cos_dist)) +
geom_tile() +
geom_text(aes(label= cluster2), size = 1.3) +
theme_classic() +
ggtitle(toupper(CLUSTER)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
legend.position = "none")
LANG_KEY <- "../../../data/processed/lang_names/ets_to_google_langcodes_complete.csv"
lang_key <- read_csv(LANG_KEY)
lang_key %>%
select(lang_name, aff) %>%
arrange(aff) %>%
DT::datatable()
CLUST_PATH <-"fam_sub_clusters_labeled.csv"
clusts <- read_csv(CLUST_PATH) %>%
select(word, cluster, label)
clusts_with_lab <- clusts %>%
select(-label) %>%
left_join(clusts %>% distinct(label, .keep_all = T) %>% select(-word) %>% filter(!is.na(label)), by = "cluster")
#write_csv(clust_labels_words, "clust_labels.csv")
DT::datatable(clusts_with_lab)