In the previous analysis, I looked at the correlation between rank values for each family and each cluster and identified anomolous family-cluster pairs (cases where the correlation was relatively more negative).

In this analysis, I go back to the original word embeddings and caluclate a centroid for the clusters determined by the tensor analysis, and look at the distribution of other centroids around the target.

Each plot below takes an anomalous cluster, and shows the 10 closest word to that cluster for each language family. The goal is to understand why an anomalous language is anamoulous.

The language-family mappings and the word-cluster mappings are at the bottom.

# Read in all centroids
CENTROID_PATH <- "scripts/5_clustering_embeddings/data/embedding_cluster_centroids_by_fam_"
FAMS <- c("Afro-Asiatic","Indo-European","Sino-Tibetan","Atlantic-Congo",
          "Austronesian", "Japonic","Dravidian","Korean","TaiKadai", "Turkic","Austroasiatic")

centroids <- map_df(FAMS, function(x,y){ 
  path <- paste0(y, x, ".csv")
  read_csv(path) %>%
    mutate(family = x) %>%
    select(family, cluster, everything())
}, CENTROID_PATH)
# Merge in cluster labels
CLUSTER_PATH <- "fam_sub_corrs_labeled.csv"
cluster_labs <- read_csv(CLUSTER_PATH) %>%
  select(cluster, label) %>%
  rename(cluster_label = label) %>%
  distinct(cluster, cluster_label)

centroids_with_labs <-  centroids %>%
  left_join(cluster_labs) 
# Do, tsne on embedding cluster centroids
# get tsne coordinates
tsne_outF = Rtsne::Rtsne(centroids_with_labs[,-1:-2],
                         check_duplicates = F)
tsne_dimsF <- tsne_outF$Y %>%
  as.data.frame() %>%
  rename(tsne_X = V1,
         tsne_Y = V2)  %>%
  bind_cols(centroids_with_labs %>% select(family, cluster_label)) 
PAIRWISE_PATH <- "scripts/5_clustering_embeddings/data/all_centroid_pairwise_dists.csv"
pairwise_dists <- read_csv(PAIRWISE_PATH)

Clusters with anomalous families

Time

  • anomolous: Austroasiatic - large conflict more closely associated?
CLUSTER = "time"
### scatterplot
dists <-  pairwise_dists %>%
  filter(cluster1 == CLUSTER) %>%
  arrange(family, -cos_dist) %>%
  group_by(family) %>%
  slice(1:11) %>% 
  mutate(rank = 1:11) 
dists %>%
  left_join(tsne_dimsF, by = c("family", "cluster2" = "cluster_label")) %>%
  mutate(word_type = ifelse(cluster2 == CLUSTER, "target_cluster", "similiar_cluster")) %>%
  ggplot(aes(x = tsne_X, y = tsne_Y)) +
  geom_text(aes(label = cluster2, color = word_type), size = 2) +
  facet_wrap(~ family, ncol = 5) +
  ggtitle(toupper(CLUSTER)) +
  theme_classic() +
  theme(legend.position = "none")
dists %>%
  filter(cluster1 != cluster2) %>%
  ggplot(aes(x = family, y = rank, fill = factor(cluster2), alpha = cos_dist)) + 
      geom_tile() +
      geom_text(aes(label = cluster2), size = 1.3) +
      theme_classic() +
    ggtitle(toupper(CLUSTER)) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1),
        legend.position = "none")

Alone vs. groups

  • anomolous: TaiKadai, Sino-Tibetan
CLUSTER = "alone_vs_groups"
dists <-  pairwise_dists %>%
  filter(cluster1 == CLUSTER) %>%
  arrange(family, -cos_dist) %>%
  group_by(family) %>%
  slice(1:11) %>% 
  mutate(rank = 1:11) 
dists %>%
  filter(cluster1 != cluster2) %>%
  ggplot(aes(x = family, y = rank, fill = factor(cluster2), alpha = cos_dist)) + 
      geom_tile() +
      geom_text(aes(label= cluster2), size = 1.3) +
      theme_classic() +
    ggtitle(toupper(CLUSTER)) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1),
        legend.position = "none")

Leisure activities

  • anomolous: Afro-Asiatic (Arabic)

knowledge, large conflict more, community more commerce less

CLUSTER = "leisure_activities"
dists <-  pairwise_dists %>%
  filter(cluster1 == CLUSTER) %>%
  arrange(family, -cos_dist) %>%
  group_by(family) %>%
  slice(1:11) %>% 
  mutate(rank = 1:11) 
dists %>%
  filter(cluster1 != cluster2) %>%
  ggplot(aes(x = family, y = rank, fill = factor(cluster2), alpha = cos_dist)) + 
      geom_tile() +
      geom_text(aes(label= cluster2), size = 1.3) +
      theme_classic() +
    ggtitle(toupper(CLUSTER)) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1),
        legend.position = "none")

Technology- media

  • anomolous: Turkic

large conflict more quantity less, forward progress less, cause and effect more

CLUSTER = "technology_media"
dists <-  pairwise_dists %>%
  filter(cluster1 == CLUSTER) %>%
  arrange(family, -cos_dist) %>%
  group_by(family) %>%
  slice(1:11) %>% 
  mutate(rank = 1:11) 
dists %>%
  filter(cluster1 != cluster2) %>%
  ggplot(aes(x = family, y = rank, fill = factor(cluster2), alpha = cos_dist)) + 
      geom_tile() +
      geom_text(aes(label= cluster2), size = 1.3) +
      theme_classic() +
    ggtitle(toupper(CLUSTER)) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1),
        legend.position = "none")

Time- money

  • anomolous: Turkic

University more, travel less

CLUSTER = "time_money"
dists <-  pairwise_dists %>%
  filter(cluster1 == CLUSTER) %>%
  arrange(family, -cos_dist) %>%
  group_by(family) %>%
  slice(1:11) %>% 
  mutate(rank = 1:11) 
dists %>%
  filter(cluster1 != cluster2) %>%
  ggplot(aes(x = family, y = rank, fill = factor(cluster2), alpha = cos_dist)) + 
      geom_tile() +
      geom_text(aes(label= cluster2), size = 1.3) +
      theme_classic() +
    ggtitle(toupper(CLUSTER)) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1),
        legend.position = "none")

Environment

  • anomolous: Sino-Tibetan

forward progress less

CLUSTER = "enivornment"
dists <-  pairwise_dists %>%
  filter(cluster1 == CLUSTER) %>%
  arrange(family, -cos_dist) %>%
  group_by(family) %>%
  slice(1:11) %>% 
  mutate(rank = 1:11) 
dists %>%
  filter(cluster1 != cluster2) %>%
  ggplot(aes(x = family, y = rank, fill = factor(cluster2), alpha = cos_dist)) + 
      geom_tile() +
      geom_text(aes(label= cluster2), size = 1.3) +
      theme_classic() +
    ggtitle(toupper(CLUSTER)) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1),
        legend.position = "none")

Scientific method

  • anomolous: Afro-Asiatic - less first person (bodily functions), more social political scool
  • anomolous: Sino-Tibetan - less first person (bodily functions), more forward progress?
CLUSTER = "scientific_method"
dists <-  pairwise_dists %>%
  filter(cluster1 == CLUSTER) %>%
  arrange(family, -cos_dist) %>%
  group_by(family) %>%
  slice(1:11) %>% 
  mutate(rank = 1:11) 
dists %>%
  filter(cluster1 != cluster2) %>%
  ggplot(aes(x = family, y = rank, fill = factor(cluster2), alpha = cos_dist)) + 
      geom_tile() +
      geom_text(aes(label= cluster2), size = 1.3) +
      theme_classic() +
    ggtitle(toupper(CLUSTER)) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1),
        legend.position = "none")

Normative Life

  • anomolous: Dravidian

More passions, future aspirations

CLUSTER = "normative_life"
dists <-  pairwise_dists %>%
  filter(cluster1 == CLUSTER) %>%
  arrange(family, -cos_dist) %>%
  group_by(family) %>%
  slice(1:11) %>% 
  mutate(rank = 1:11) 
dists %>%
  filter(cluster1 != cluster2) %>%
  ggplot(aes(x = family, y = rank, fill = factor(cluster2), alpha = cos_dist)) + 
      geom_tile() +
      geom_text(aes(label= cluster2), size = 1.3) +
      theme_classic() +
    ggtitle(toupper(CLUSTER)) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1),
        legend.position = "none")

Bodily functions

  • anomolous: Turkic

More emotion, less community

CLUSTER = "bodily_functions"
dists <-  pairwise_dists %>%
  filter(cluster1 == CLUSTER) %>%
  arrange(family, -cos_dist) %>%
  group_by(family) %>%
  slice(1:11) %>% 
  mutate(rank = 1:11) 
dists %>%
  filter(cluster1 != cluster2) %>%
  ggplot(aes(x = family, y = rank, fill = factor(cluster2), alpha = cos_dist)) + 
      geom_tile() +
      geom_text(aes(label= cluster2), size = 1.3) +
      theme_classic() +
    ggtitle(toupper(CLUSTER)) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1),
        legend.position = "none")

Knowledge

  • anomolous: Turkic

More emotion, less community

CLUSTER = "knowledge"
dists <-  pairwise_dists %>%
  filter(cluster1 == CLUSTER) %>%
  arrange(family, -cos_dist) %>%
  group_by(family) %>%
  slice(1:11) %>% 
  mutate(rank = 1:11) 
dists %>%
  filter(cluster1 != cluster2) %>%
  ggplot(aes(x = family, y = rank, fill = factor(cluster2), alpha = cos_dist)) + 
      geom_tile() +
      geom_text(aes(label= cluster2), size = 1.3) +
      theme_classic() +
    ggtitle(toupper(CLUSTER)) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1),
        legend.position = "none")

Cause-effect

  • anomolous: Sino-Tibetan

less influence_change?

CLUSTER = "cause_effect"
dists <-  pairwise_dists %>%
  filter(cluster1 == CLUSTER) %>%
  arrange(family, -cos_dist) %>%
  group_by(family) %>%
  slice(1:11) %>% 
  mutate(rank = 1:11) 
dists %>%
  filter(cluster1 != cluster2) %>%
  ggplot(aes(x = family, y = rank, fill = factor(cluster2), alpha = cos_dist)) + 
      geom_tile() +
      geom_text(aes(label= cluster2), size = 1.3) +
      theme_classic() +
    ggtitle(toupper(CLUSTER)) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1),
        legend.position = "none")

language - family mappings

LANG_KEY <- "../../../data/processed/lang_names/ets_to_google_langcodes_complete.csv"

lang_key <- read_csv(LANG_KEY) 

lang_key %>%
  select(lang_name, aff) %>%
  arrange(aff) %>%
  DT::datatable()

word - cluster mappings

CLUST_PATH <-"fam_sub_clusters_labeled.csv"
clusts <- read_csv(CLUST_PATH) %>%
  select(word, cluster, label) 

clusts_with_lab <- clusts %>%
  select(-label) %>%
  left_join(clusts %>% distinct(label, .keep_all = T) %>% select(-word) %>% filter(!is.na(label)), by = "cluster")
#write_csv(clust_labels_words, "clust_labels.csv")
DT::datatable(clusts_with_lab)