get_unique_pairs <- function(dat) unique(t(apply(dat, 1, sort)))
TENSOR_OUTPUT_FILENAME <- "../B_tensor_factorization_matlab/tensor_data_as_mat/output/7_corr_shared_ktensor_output_WWL.mat"
tensor_output <- read.mat(TENSOR_OUTPUT_FILENAME)

words1 <- tensor_output$k[[1]]
words2 <- tensor_output$k[[2]]
langs <- tensor_output$k[[3]]

Read in language and word tensor decomposition solutions from Bayesian model. The tensorization solution has 615 ranks.

Language dendogram

LANGS <- c('ARA', 'BEN', 'BUL', 'CHI', 'DUT', 'ENG', 'FAS', 'FRE', 'GER', 'GRE', 'GUJ','HIN', 'IBO', 'IND', 'ITA', 'JPN','KAN', 'KOR', 'MAL', 'MAR', 'NEP', 'PAN', 'POL','POR', 'RUM', 'RUS', 'SPA', 'TAM', 'TEL', 'TGL', 'THA', 'TUR', 'URD', 'VIE', 'YOR')

all_corrs_mat_langs <- as.matrix(langs)
rownames(all_corrs_mat_langs) <- LANGS
dist_matrix <- dist(all_corrs_mat_langs)
lang_clusts <- hclust(dist_matrix)
ggdendro::ggdendrogram(lang_clusts)

lang_sub_trees <- lang_clusts %>%
  as.dendrogram %>%
  dendextend::partition_leaves()

lang_sub_trees_ids <- map(lang_sub_trees, 
                          ~ paste(unlist(.), collapse = '_')) %>%
                      unlist()

get_mean_ranks_of_subtrees <- function(leaf_list, ranks){
  node_ranks <- ranks[unlist(leaf_list),]

  if (is.null(dim(node_ranks))){ # case where subtree is single leaf
   node_means <- node_ranks
  } else {
   node_means <- colMeans(node_ranks)
  }
  
  data.frame(rank_id = 1:dim(ranks)[2],
             rank_means = node_means)
}

mean_rank_subtrees_langs <- map_df(lang_sub_trees, 
                                   get_mean_ranks_of_subtrees, 
                                   all_corrs_mat_langs,
                                   .id = "node_id") %>%
  mutate(node_id = as.factor(node_id)) %>%
  data.table() # this is necessary for printing

The language dendogram has 69 subtrees (nodes).

As a sanity check, here they are:

lang_sub_trees_ids
##  [1] "NEP_TEL_FAS_GRE_JPN_KOR_VIE_CHI_IND_THA_ARA_TUR_BUL_POL_ENG_MAR_URD_KAN_TAM_PAN_BEN_HIN_IBO_GUJ_DUT_RUM_GER_MAL_RUS_POR_SPA_TGL_YOR_FRE_ITA"
##  [2] "NEP_TEL_FAS_GRE_JPN_KOR_VIE_CHI_IND_THA_ARA_TUR_BUL_POL_ENG_MAR_URD_KAN_TAM_PAN_BEN_HIN_IBO_GUJ_DUT_RUM_GER_MAL_RUS_POR_SPA_TGL_YOR"        
##  [3] "NEP_TEL_FAS_GRE_JPN_KOR_VIE_CHI_IND_THA_ARA_TUR_BUL_POL_ENG_MAR_URD_KAN_TAM_PAN_BEN_HIN_IBO_GUJ_DUT_RUM_GER_MAL_RUS_POR_SPA"                
##  [4] "NEP_TEL"                                                                                                                                    
##  [5] "NEP"                                                                                                                                        
##  [6] "TEL"                                                                                                                                        
##  [7] "FAS_GRE_JPN_KOR_VIE_CHI_IND_THA_ARA_TUR_BUL_POL_ENG_MAR_URD_KAN_TAM_PAN_BEN_HIN_IBO_GUJ_DUT_RUM_GER_MAL_RUS_POR_SPA"                        
##  [8] "FAS"                                                                                                                                        
##  [9] "GRE_JPN_KOR_VIE_CHI_IND_THA_ARA_TUR_BUL_POL_ENG_MAR_URD_KAN_TAM_PAN_BEN_HIN_IBO_GUJ_DUT_RUM_GER_MAL_RUS_POR_SPA"                            
## [10] "GRE_JPN_KOR_VIE_CHI_IND_THA_ARA_TUR_BUL_POL_ENG_MAR_URD_KAN_TAM_PAN_BEN_HIN_IBO_GUJ_DUT_RUM_GER_MAL"                                        
## [11] "GRE_JPN_KOR_VIE_CHI_IND_THA_ARA_TUR_BUL_POL"                                                                                                
## [12] "GRE"                                                                                                                                        
## [13] "JPN_KOR_VIE_CHI_IND_THA_ARA_TUR_BUL_POL"                                                                                                    
## [14] "JPN_KOR_VIE_CHI_IND_THA_ARA_TUR"                                                                                                            
## [15] "JPN_KOR_VIE_CHI_IND_THA"                                                                                                                    
## [16] "JPN_KOR"                                                                                                                                    
## [17] "JPN"                                                                                                                                        
## [18] "KOR"                                                                                                                                        
## [19] "VIE_CHI_IND_THA"                                                                                                                            
## [20] "VIE"                                                                                                                                        
## [21] "CHI_IND_THA"                                                                                                                                
## [22] "CHI"                                                                                                                                        
## [23] "IND_THA"                                                                                                                                    
## [24] "IND"                                                                                                                                        
## [25] "THA"                                                                                                                                        
## [26] "ARA_TUR"                                                                                                                                    
## [27] "ARA"                                                                                                                                        
## [28] "TUR"                                                                                                                                        
## [29] "BUL_POL"                                                                                                                                    
## [30] "BUL"                                                                                                                                        
## [31] "POL"                                                                                                                                        
## [32] "ENG_MAR_URD_KAN_TAM_PAN_BEN_HIN_IBO_GUJ_DUT_RUM_GER_MAL"                                                                                    
## [33] "ENG"                                                                                                                                        
## [34] "MAR_URD_KAN_TAM_PAN_BEN_HIN_IBO_GUJ_DUT_RUM_GER_MAL"                                                                                        
## [35] "MAR"                                                                                                                                        
## [36] "URD_KAN_TAM_PAN_BEN_HIN_IBO_GUJ_DUT_RUM_GER_MAL"                                                                                            
## [37] "URD"                                                                                                                                        
## [38] "KAN_TAM_PAN_BEN_HIN_IBO_GUJ_DUT_RUM_GER_MAL"                                                                                                
## [39] "KAN_TAM"                                                                                                                                    
## [40] "KAN"                                                                                                                                        
## [41] "TAM"                                                                                                                                        
## [42] "PAN_BEN_HIN_IBO_GUJ_DUT_RUM_GER_MAL"                                                                                                        
## [43] "PAN"                                                                                                                                        
## [44] "BEN_HIN_IBO_GUJ_DUT_RUM_GER_MAL"                                                                                                            
## [45] "BEN"                                                                                                                                        
## [46] "HIN_IBO_GUJ_DUT_RUM_GER_MAL"                                                                                                                
## [47] "HIN"                                                                                                                                        
## [48] "IBO_GUJ_DUT_RUM_GER_MAL"                                                                                                                    
## [49] "IBO_GUJ_DUT_RUM"                                                                                                                            
## [50] "IBO"                                                                                                                                        
## [51] "GUJ_DUT_RUM"                                                                                                                                
## [52] "GUJ"                                                                                                                                        
## [53] "DUT_RUM"                                                                                                                                    
## [54] "DUT"                                                                                                                                        
## [55] "RUM"                                                                                                                                        
## [56] "GER_MAL"                                                                                                                                    
## [57] "GER"                                                                                                                                        
## [58] "MAL"                                                                                                                                        
## [59] "RUS_POR_SPA"                                                                                                                                
## [60] "RUS"                                                                                                                                        
## [61] "POR_SPA"                                                                                                                                    
## [62] "POR"                                                                                                                                        
## [63] "SPA"                                                                                                                                        
## [64] "TGL_YOR"                                                                                                                                    
## [65] "TGL"                                                                                                                                        
## [66] "YOR"                                                                                                                                        
## [67] "FRE_ITA"                                                                                                                                    
## [68] "FRE"                                                                                                                                        
## [69] "ITA"

Word dendogram

WORDS <- read_feather("../data/4_matlab_data/ets_models/word_to_is_shared.feather")

all_corrs_mat_words <- as.matrix(words1)
rownames(all_corrs_mat_words) <- WORDS$w1 
dist_matrix <- dist(all_corrs_mat_words)
word_clusts <- hclust(dist_matrix)

ggdendro::ggdendrogram(word_clusts) +
 theme(axis.text.x = element_text(hjust = 1, size = 5)) 

word_sub_trees <- word_clusts %>%
  as.dendrogram %>%
  dendextend::partition_leaves()

word_sub_trees_ids <- map(word_sub_trees, 
                          ~ paste(unlist(.), collapse = '_')) %>%
                        unlist()

mean_rank_subtrees_words <- map_df(word_sub_trees, 
                                   get_mean_ranks_of_subtrees,
                                   all_corrs_mat_words, 
                                   .id = "node_id") %>%
                            mutate(node_id = as.factor(node_id)) %>%
                            data.table()

The word dendogram has 1419 subtrees (nodes).

Rank subtree correlations

Get correlations between ranks for all subtrees of the word dendogram and all subtrees of the tree dendogram.

LANGUAGE-LANGUAGE

Get the pairwise correlations between language nodes.

get_pairwise_corr <- function(i1, i2, means1, means2){
      ranks_1 <- means1[node_id == i1]
      ranks_2 <- means2[node_id == i2]
     
      r_coefficient  <- cor(ranks_1$rank_means, 
                            ranks_2$rank_means)
      
      data.frame(node1 = i1,
                 node2 = i2,
                 r_coefficient = r_coefficient)
}
## lang-lang
lang_node_pairs <- expand.grid(unique(mean_rank_subtrees_langs$node_id), 
                                    unique(mean_rank_subtrees_langs$node_id)) %>%
  get_unique_pairs() %>%
  data.frame() %>%
  mutate_all(as.numeric) 

lang_node_corrs  <- map2_df(lang_node_pairs$X1, 
                  lang_node_pairs$X2,
                  get_pairwise_corr,
                  mean_rank_subtrees_langs,
                  mean_rank_subtrees_langs)
write_csv(lang_node_corrs, "lang_node_corrs.csv")
lang_node_corrs <- read_csv( "lang_node_corrs.csv")
lang_ids_df <- data.frame(node_id = 1:length(lang_sub_trees_ids), 
                          node = lang_sub_trees_ids,
                          n = str_count(lang_sub_trees_ids, "_") + 1) 

lang_node_corrs_named <- lang_node_corrs %>%
  left_join(lang_ids_df, by = c("node1" = "node_id")) %>%
  rename(node1_name = node) %>%
  left_join(lang_ids_df, by = c("node2" = "node_id")) %>%
  rename(node2_name = node)
ggplot(lang_node_corrs_named, aes(x = r_coefficient)) +
  geom_histogram() +
  ggtitle("Distribution of lang-lang correlation coefficients") +
  theme_classic()

Table

MIN_LANGS <- 2
MAX_LANGS <- 8

I’m filtering here by nodes that have between 2 and 8 languages.

lang_node_corrs_named %>%
  arrange(-r_coefficient) %>%
  filter(n.x >= MIN_LANGS  & n.x <= MAX_LANGS & n.y >= MIN_LANGS  & n.y <= MAX_LANGS) %>%
  filter(node1 != node2) %>%
  select(node1_name, node2_name, r_coefficient) %>%
  DT::datatable()

LANGUAGE-WORD

Get the pairwise correlations between language and word nodes.

lang_word_node_pairs <- expand.grid(langs_ids = unique(mean_rank_subtrees_langs$node_id), 
                                    word_ids = unique(mean_rank_subtrees_words$node_id)) %>%
  get_unique_pairs() %>%
  data.frame() %>%
  mutate_all(as.numeric) 

lang_word_node_corrs  <- map2_df(lang_word_node_pairs$X1, 
                              lang_word_node_pairs$X2,
                              get_pairwise_corr,
                              mean_rank_subtrees_langs,
                              mean_rank_subtrees_words)
write_csv(lang_word_node_corrs, "lang_word_node_corrs.csv")
lang_word_node_corrs <- read_csv( "lang_word_node_corrs.csv")

word_ids_df <- data.frame(node_id = 1:length(word_sub_trees_ids), 
                              node = word_sub_trees_ids,
                          n = str_count(word_sub_trees_ids, "_") + 1) 

lang_word_node_corrs_named <- lang_word_node_corrs %>%
  left_join(lang_ids_df, by = c("node1" = "node_id")) %>%
  rename(node1_name = node) %>%
  left_join(word_ids_df, by = c("node2" = "node_id")) %>%
  rename(node2_name = node) 
ggplot(lang_word_node_corrs_named, aes(x = r_coefficient)) +
  geom_histogram() +
  ggtitle("Distribution of lang-word correlation coefficients") +
  theme_classic()

It looks like the mean correlation is zero, but it’s definitely positive (just small):

t.test(lang_word_node_corrs_named$r_coefficient)
## 
##  One Sample t-test
## 
## data:  lang_word_node_corrs_named$r_coefficient
## t = 21.052, df = 97910, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  0.002427969 0.002926476
## sample estimates:
##   mean of x 
## 0.002677222

Table

MIN_WORDS <- 2
MAX_WORDS <- 8

I’m filtering here by nodes that have between 2 and 8 languages and between 2 and 8 words.

lang_word_node_corrs_named %>%
  arrange(-r_coefficient) %>%
  filter(n.x >= MIN_LANGS  & n.x <= MAX_LANGS  & n.y >= MIN_WORDS &  n.y <= MAX_WORDS) %>%
  filter(node1 != node2) %>%
  select(node1_name, node2_name, r_coefficient) %>%
  DT::datatable()

WORD-WORD

N_WORDS_SAMPLE <- 20

Get the pairwise correlations between word nodes. This is huge and computationally slow. Here’s just look at a small subsample (N = 20).

target_words <- sample(unique(mean_rank_subtrees_words$node_id), N_WORDS_SAMPLE, replace = F)  %>%
  as.numeric()

word_word_node_pairs <- expand.grid(target_words, target_words) %>%
  get_unique_pairs() %>%
  data.frame()

word_word_node_corrs  <- map2_df(word_word_node_pairs$X1, 
                              word_word_node_pairs$X2,
                              get_pairwise_corr,
                              mean_rank_subtrees_words,
                              mean_rank_subtrees_words)
word_word_node_corrs_named <- word_word_node_corrs %>%
  left_join(word_ids_df, by = c("node1" = "node_id")) %>%
  rename(node1_name = node) %>%
  left_join(word_ids_df, by = c("node2" = "node_id")) %>%
  rename(node2_name = node)  
ggplot(word_word_node_corrs_named, aes(x = r_coefficient)) +
  geom_histogram() +
  ggtitle("Distribution of lang-word correlation coefficients") +
  theme_classic()

Table

I’m filtering here by nodes that have between 2 and 8 words.

word_word_node_corrs_named %>%
  arrange(-r_coefficient) %>%
  filter(n.x >= MIN_WORDS  & n.x <= MAX_WORDS  & n.y >= MIN_WORDS &  n.y <= MAX_WORDS) %>%
  filter(node1 != node2) %>%
  select(node1_name, node2_name, r_coefficient) %>%
  DT::datatable()