Embedding centroids by family

Clusters with anomalous families
language - family mappings
word - cluster mappings

In the previous analysis, I looked at the correlation between rank values for each family and each cluster and identified anomolous family-cluster pairs (cases where the correlation was relatively more negative).

In this analysis, I go back to the original word embeddings and caluclate a centroid for the clusters determined by the tensor analysis, and look at the distribution of other centroids around the target.

Each plot below takes an anomalous cluster, and shows the 10 closest word to that cluster for each language family. The goal is to understand why an anomalous language is anamoulous.

The language-family mappings and the word-cluster mappings are at the bottom.

# Read in all centroids
CENTROID_PATH <- "scripts/5_clustering_embeddings/data/embedding_cluster_centroids_by_fam_"
FAMS <- c("Afro-Asiatic","Indo-European","Sino-Tibetan","Atlantic-Congo",
          "Austronesian", "Japonic","Dravidian","Korean","TaiKadai", "Turkic","Austroasiatic")

centroids <- map_df(FAMS, function(x,y){ 
  path <- paste0(y, x, ".csv")
  read_csv(path) %>%
    mutate(family = x) %>%
    select(family, cluster, everything())
}, CENTROID_PATH)

# Merge in cluster labels
CLUSTER_PATH <- "fam_sub_corrs_labeled.csv"
cluster_labs <- read_csv(CLUSTER_PATH) %>%
  select(cluster, label) %>%
  rename(cluster_label = label) %>%
  distinct(cluster, cluster_label)

centroids_with_labs <-  centroids %>%
  left_join(cluster_labs)

# Do, tsne on embedding cluster centroids
# get tsne coordinates
tsne_outF = Rtsne::Rtsne(centroids_with_labs[,-1:-2],
                         check_duplicates = F)
tsne_dimsF <- tsne_outF$Y %>%
  as.data.frame() %>%
  rename(tsne_X = V1,
         tsne_Y = V2)  %>%
  bind_cols(centroids_with_labs %>% select(family, cluster_label))

PAIRWISE_PATH <- "scripts/5_clustering_embeddings/data/all_centroid_pairwise_dists.csv"
pairwise_dists <- read_csv(PAIRWISE_PATH)

Clusters with anomalous families

Time

anomolous: Austroasiatic - large conflict more closely associated?

CLUSTER = "time"

### scatterplot
dists <-  pairwise_dists %>%
  filter(cluster1 == CLUSTER) %>%
  arrange(family, -cos_dist) %>%
  group_by(family) %>%
  slice(1:11) %>% 
  mutate(rank = 1:11)

dists %>%
  left_join(tsne_dimsF, by = c("family", "cluster2" = "cluster_label")) %>%
  mutate(word_type = ifelse(cluster2 == CLUSTER, "target_cluster", "similiar_cluster")) %>%
  ggplot(aes(x = tsne_X, y = tsne_Y)) +
  geom_text(aes(label = cluster2, color = word_type), size = 2) +
  facet_wrap(~ family, ncol = 5) +
  ggtitle(toupper(CLUSTER)) +
  theme_classic() +
  theme(legend.position = "none")

dists %>%
  filter(cluster1 != cluster2) %>%
  ggplot(aes(x = family, y = rank, fill = factor(cluster2), alpha = cos_dist)) + 
      geom_tile() +
      geom_text(aes(label = cluster2), size = 1.3) +
      theme_classic() +
    ggtitle(toupper(CLUSTER)) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1),
        legend.position = "none")

Alone vs. groups

anomolous: TaiKadai, Sino-Tibetan

CLUSTER = "alone_vs_groups"

dists <-  pairwise_dists %>%
  filter(cluster1 == CLUSTER) %>%
  arrange(family, -cos_dist) %>%
  group_by(family) %>%
  slice(1:11) %>% 
  mutate(rank = 1:11)

dists %>%
  filter(cluster1 != cluster2) %>%
  ggplot(aes(x = family, y = rank, fill = factor(cluster2), alpha = cos_dist)) + 
      geom_tile() +
      geom_text(aes(label= cluster2), size = 1.3) +
      theme_classic() +
    ggtitle(toupper(CLUSTER)) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1),
        legend.position = "none")

Leisure activities

anomolous: Afro-Asiatic (Arabic)

knowledge, large conflict more, community more commerce less

CLUSTER = "leisure_activities"

dists <-  pairwise_dists %>%
  filter(cluster1 == CLUSTER) %>%
  arrange(family, -cos_dist) %>%
  group_by(family) %>%
  slice(1:11) %>% 
  mutate(rank = 1:11)

dists %>%
  filter(cluster1 != cluster2) %>%
  ggplot(aes(x = family, y = rank, fill = factor(cluster2), alpha = cos_dist)) + 
      geom_tile() +
      geom_text(aes(label= cluster2), size = 1.3) +
      theme_classic() +
    ggtitle(toupper(CLUSTER)) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1),
        legend.position = "none")

Technology- media

anomolous: Turkic

large conflict more quantity less, forward progress less, cause and effect more

CLUSTER = "technology_media"

dists <-  pairwise_dists %>%
  filter(cluster1 == CLUSTER) %>%
  arrange(family, -cos_dist) %>%
  group_by(family) %>%
  slice(1:11) %>% 
  mutate(rank = 1:11)

dists %>%
  filter(cluster1 != cluster2) %>%
  ggplot(aes(x = family, y = rank, fill = factor(cluster2), alpha = cos_dist)) + 
      geom_tile() +
      geom_text(aes(label= cluster2), size = 1.3) +
      theme_classic() +
    ggtitle(toupper(CLUSTER)) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1),
        legend.position = "none")

Time- money

anomolous: Turkic

University more, travel less

CLUSTER = "time_money"

dists <-  pairwise_dists %>%
  filter(cluster1 == CLUSTER) %>%
  arrange(family, -cos_dist) %>%
  group_by(family) %>%
  slice(1:11) %>% 
  mutate(rank = 1:11)

dists %>%
  filter(cluster1 != cluster2) %>%
  ggplot(aes(x = family, y = rank, fill = factor(cluster2), alpha = cos_dist)) + 
      geom_tile() +
      geom_text(aes(label= cluster2), size = 1.3) +
      theme_classic() +
    ggtitle(toupper(CLUSTER)) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1),
        legend.position = "none")

Environment

anomolous: Sino-Tibetan

forward progress less

CLUSTER = "enivornment"

dists <-  pairwise_dists %>%
  filter(cluster1 == CLUSTER) %>%
  arrange(family, -cos_dist) %>%
  group_by(family) %>%
  slice(1:11) %>% 
  mutate(rank = 1:11)

dists %>%
  filter(cluster1 != cluster2) %>%
  ggplot(aes(x = family, y = rank, fill = factor(cluster2), alpha = cos_dist)) + 
      geom_tile() +
      geom_text(aes(label= cluster2), size = 1.3) +
      theme_classic() +
    ggtitle(toupper(CLUSTER)) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1),
        legend.position = "none")

Scientific method

anomolous: Afro-Asiatic - less first person (bodily functions), more social political scool
anomolous: Sino-Tibetan - less first person (bodily functions), more forward progress?

CLUSTER = "scientific_method"

dists <-  pairwise_dists %>%
  filter(cluster1 == CLUSTER) %>%
  arrange(family, -cos_dist) %>%
  group_by(family) %>%
  slice(1:11) %>% 
  mutate(rank = 1:11)

dists %>%
  filter(cluster1 != cluster2) %>%
  ggplot(aes(x = family, y = rank, fill = factor(cluster2), alpha = cos_dist)) + 
      geom_tile() +
      geom_text(aes(label= cluster2), size = 1.3) +
      theme_classic() +
    ggtitle(toupper(CLUSTER)) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1),
        legend.position = "none")

Normative Life

anomolous: Dravidian

More passions, future aspirations

CLUSTER = "normative_life"

dists <-  pairwise_dists %>%
  filter(cluster1 == CLUSTER) %>%
  arrange(family, -cos_dist) %>%
  group_by(family) %>%
  slice(1:11) %>% 
  mutate(rank = 1:11)

dists %>%
  filter(cluster1 != cluster2) %>%
  ggplot(aes(x = family, y = rank, fill = factor(cluster2), alpha = cos_dist)) + 
      geom_tile() +
      geom_text(aes(label= cluster2), size = 1.3) +
      theme_classic() +
    ggtitle(toupper(CLUSTER)) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1),
        legend.position = "none")

Bodily functions

anomolous: Turkic

More emotion, less community

CLUSTER = "bodily_functions"

dists <-  pairwise_dists %>%
  filter(cluster1 == CLUSTER) %>%
  arrange(family, -cos_dist) %>%
  group_by(family) %>%
  slice(1:11) %>% 
  mutate(rank = 1:11)

dists %>%
  filter(cluster1 != cluster2) %>%
  ggplot(aes(x = family, y = rank, fill = factor(cluster2), alpha = cos_dist)) + 
      geom_tile() +
      geom_text(aes(label= cluster2), size = 1.3) +
      theme_classic() +
    ggtitle(toupper(CLUSTER)) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1),
        legend.position = "none")

Knowledge

anomolous: Turkic