get_unique_pairs <- function(dat) unique(t(apply(dat, 1, sort)))
TENSOR_OUTPUT_FILENAME <- "../B_tensor_factorization_matlab/tensor_data_as_mat/output/7_corr_shared_ktensor_output_WWL.mat"
tensor_output <- read.mat(TENSOR_OUTPUT_FILENAME)
words1 <- tensor_output$k[[1]]
words2 <- tensor_output$k[[2]]
langs <- tensor_output$k[[3]]
Read in language and word tensor decomposition solutions from Bayesian model. The tensorization solution has 615 ranks.
LANGS <- c('ARA', 'BEN', 'BUL', 'CHI', 'DUT', 'ENG', 'FAS', 'FRE', 'GER', 'GRE', 'GUJ','HIN', 'IBO', 'IND', 'ITA', 'JPN','KAN', 'KOR', 'MAL', 'MAR', 'NEP', 'PAN', 'POL','POR', 'RUM', 'RUS', 'SPA', 'TAM', 'TEL', 'TGL', 'THA', 'TUR', 'URD', 'VIE', 'YOR')
all_corrs_mat_langs <- as.matrix(langs)
rownames(all_corrs_mat_langs) <- LANGS
dist_matrix <- dist(all_corrs_mat_langs)
lang_clusts <- hclust(dist_matrix)
ggdendro::ggdendrogram(lang_clusts)
lang_sub_trees <- lang_clusts %>%
as.dendrogram %>%
dendextend::partition_leaves()
lang_sub_trees_ids <- map(lang_sub_trees,
~ paste(unlist(.), collapse = '_')) %>%
unlist()
get_mean_ranks_of_subtrees <- function(leaf_list, ranks){
node_ranks <- ranks[unlist(leaf_list),]
if (is.null(dim(node_ranks))){ # case where subtree is single leaf
node_means <- node_ranks
} else {
node_means <- colMeans(node_ranks)
}
data.frame(rank_id = 1:dim(ranks)[2],
rank_means = node_means)
}
mean_rank_subtrees_langs <- map_df(lang_sub_trees,
get_mean_ranks_of_subtrees,
all_corrs_mat_langs,
.id = "node_id") %>%
mutate(node_id = as.factor(node_id)) %>%
data.table() # this is necessary for printing
The language dendogram has 69 subtrees (nodes).
As a sanity check, here they are:
lang_sub_trees_ids
## [1] "NEP_TEL_FAS_GRE_JPN_KOR_VIE_CHI_IND_THA_ARA_TUR_BUL_POL_ENG_MAR_URD_KAN_TAM_PAN_BEN_HIN_IBO_GUJ_DUT_RUM_GER_MAL_RUS_POR_SPA_TGL_YOR_FRE_ITA"
## [2] "NEP_TEL_FAS_GRE_JPN_KOR_VIE_CHI_IND_THA_ARA_TUR_BUL_POL_ENG_MAR_URD_KAN_TAM_PAN_BEN_HIN_IBO_GUJ_DUT_RUM_GER_MAL_RUS_POR_SPA_TGL_YOR"
## [3] "NEP_TEL_FAS_GRE_JPN_KOR_VIE_CHI_IND_THA_ARA_TUR_BUL_POL_ENG_MAR_URD_KAN_TAM_PAN_BEN_HIN_IBO_GUJ_DUT_RUM_GER_MAL_RUS_POR_SPA"
## [4] "NEP_TEL"
## [5] "NEP"
## [6] "TEL"
## [7] "FAS_GRE_JPN_KOR_VIE_CHI_IND_THA_ARA_TUR_BUL_POL_ENG_MAR_URD_KAN_TAM_PAN_BEN_HIN_IBO_GUJ_DUT_RUM_GER_MAL_RUS_POR_SPA"
## [8] "FAS"
## [9] "GRE_JPN_KOR_VIE_CHI_IND_THA_ARA_TUR_BUL_POL_ENG_MAR_URD_KAN_TAM_PAN_BEN_HIN_IBO_GUJ_DUT_RUM_GER_MAL_RUS_POR_SPA"
## [10] "GRE_JPN_KOR_VIE_CHI_IND_THA_ARA_TUR_BUL_POL_ENG_MAR_URD_KAN_TAM_PAN_BEN_HIN_IBO_GUJ_DUT_RUM_GER_MAL"
## [11] "GRE_JPN_KOR_VIE_CHI_IND_THA_ARA_TUR_BUL_POL"
## [12] "GRE"
## [13] "JPN_KOR_VIE_CHI_IND_THA_ARA_TUR_BUL_POL"
## [14] "JPN_KOR_VIE_CHI_IND_THA_ARA_TUR"
## [15] "JPN_KOR_VIE_CHI_IND_THA"
## [16] "JPN_KOR"
## [17] "JPN"
## [18] "KOR"
## [19] "VIE_CHI_IND_THA"
## [20] "VIE"
## [21] "CHI_IND_THA"
## [22] "CHI"
## [23] "IND_THA"
## [24] "IND"
## [25] "THA"
## [26] "ARA_TUR"
## [27] "ARA"
## [28] "TUR"
## [29] "BUL_POL"
## [30] "BUL"
## [31] "POL"
## [32] "ENG_MAR_URD_KAN_TAM_PAN_BEN_HIN_IBO_GUJ_DUT_RUM_GER_MAL"
## [33] "ENG"
## [34] "MAR_URD_KAN_TAM_PAN_BEN_HIN_IBO_GUJ_DUT_RUM_GER_MAL"
## [35] "MAR"
## [36] "URD_KAN_TAM_PAN_BEN_HIN_IBO_GUJ_DUT_RUM_GER_MAL"
## [37] "URD"
## [38] "KAN_TAM_PAN_BEN_HIN_IBO_GUJ_DUT_RUM_GER_MAL"
## [39] "KAN_TAM"
## [40] "KAN"
## [41] "TAM"
## [42] "PAN_BEN_HIN_IBO_GUJ_DUT_RUM_GER_MAL"
## [43] "PAN"
## [44] "BEN_HIN_IBO_GUJ_DUT_RUM_GER_MAL"
## [45] "BEN"
## [46] "HIN_IBO_GUJ_DUT_RUM_GER_MAL"
## [47] "HIN"
## [48] "IBO_GUJ_DUT_RUM_GER_MAL"
## [49] "IBO_GUJ_DUT_RUM"
## [50] "IBO"
## [51] "GUJ_DUT_RUM"
## [52] "GUJ"
## [53] "DUT_RUM"
## [54] "DUT"
## [55] "RUM"
## [56] "GER_MAL"
## [57] "GER"
## [58] "MAL"
## [59] "RUS_POR_SPA"
## [60] "RUS"
## [61] "POR_SPA"
## [62] "POR"
## [63] "SPA"
## [64] "TGL_YOR"
## [65] "TGL"
## [66] "YOR"
## [67] "FRE_ITA"
## [68] "FRE"
## [69] "ITA"
WORDS <- read_feather("../data/4_matlab_data/ets_models/word_to_is_shared.feather")
all_corrs_mat_words <- as.matrix(words1)
rownames(all_corrs_mat_words) <- WORDS$w1
dist_matrix <- dist(all_corrs_mat_words)
word_clusts <- hclust(dist_matrix)
ggdendro::ggdendrogram(word_clusts) +
theme(axis.text.x = element_text(hjust = 1, size = 5))
word_sub_trees <- word_clusts %>%
as.dendrogram %>%
dendextend::partition_leaves()
word_sub_trees_ids <- map(word_sub_trees,
~ paste(unlist(.), collapse = '_')) %>%
unlist()
mean_rank_subtrees_words <- map_df(word_sub_trees,
get_mean_ranks_of_subtrees,
all_corrs_mat_words,
.id = "node_id") %>%
mutate(node_id = as.factor(node_id)) %>%
data.table()
The word dendogram has 1419 subtrees (nodes).
Get correlations between ranks for all subtrees of the word dendogram and all subtrees of the tree dendogram.
Get the pairwise correlations between language nodes.
get_pairwise_corr <- function(i1, i2, means1, means2){
ranks_1 <- means1[node_id == i1]
ranks_2 <- means2[node_id == i2]
r_coefficient <- cor(ranks_1$rank_means,
ranks_2$rank_means)
data.frame(node1 = i1,
node2 = i2,
r_coefficient = r_coefficient)
}
## lang-lang
lang_node_pairs <- expand.grid(unique(mean_rank_subtrees_langs$node_id),
unique(mean_rank_subtrees_langs$node_id)) %>%
get_unique_pairs() %>%
data.frame() %>%
mutate_all(as.numeric)
lang_node_corrs <- map2_df(lang_node_pairs$X1,
lang_node_pairs$X2,
get_pairwise_corr,
mean_rank_subtrees_langs,
mean_rank_subtrees_langs)
write_csv(lang_node_corrs, "lang_node_corrs.csv")
lang_node_corrs <- read_csv( "lang_node_corrs.csv")
lang_ids_df <- data.frame(node_id = 1:length(lang_sub_trees_ids),
node = lang_sub_trees_ids,
n = str_count(lang_sub_trees_ids, "_") + 1)
lang_node_corrs_named <- lang_node_corrs %>%
left_join(lang_ids_df, by = c("node1" = "node_id")) %>%
rename(node1_name = node) %>%
left_join(lang_ids_df, by = c("node2" = "node_id")) %>%
rename(node2_name = node)
ggplot(lang_node_corrs_named, aes(x = r_coefficient)) +
geom_histogram() +
ggtitle("Distribution of lang-lang correlation coefficients") +
theme_classic()
MIN_LANGS <- 2
MAX_LANGS <- 8
I’m filtering here by nodes that have between 2 and 8 languages.
lang_node_corrs_named %>%
arrange(-r_coefficient) %>%
filter(n.x >= MIN_LANGS & n.x <= MAX_LANGS & n.y >= MIN_LANGS & n.y <= MAX_LANGS) %>%
filter(node1 != node2) %>%
select(node1_name, node2_name, r_coefficient) %>%
DT::datatable()
Get the pairwise correlations between language and word nodes.
lang_word_node_pairs <- expand.grid(langs_ids = unique(mean_rank_subtrees_langs$node_id),
word_ids = unique(mean_rank_subtrees_words$node_id)) %>%
get_unique_pairs() %>%
data.frame() %>%
mutate_all(as.numeric)
lang_word_node_corrs <- map2_df(lang_word_node_pairs$X1,
lang_word_node_pairs$X2,
get_pairwise_corr,
mean_rank_subtrees_langs,
mean_rank_subtrees_words)
write_csv(lang_word_node_corrs, "lang_word_node_corrs.csv")
lang_word_node_corrs <- read_csv( "lang_word_node_corrs.csv")
word_ids_df <- data.frame(node_id = 1:length(word_sub_trees_ids),
node = word_sub_trees_ids,
n = str_count(word_sub_trees_ids, "_") + 1)
lang_word_node_corrs_named <- lang_word_node_corrs %>%
left_join(lang_ids_df, by = c("node1" = "node_id")) %>%
rename(node1_name = node) %>%
left_join(word_ids_df, by = c("node2" = "node_id")) %>%
rename(node2_name = node)
ggplot(lang_word_node_corrs_named, aes(x = r_coefficient)) +
geom_histogram() +
ggtitle("Distribution of lang-word correlation coefficients") +
theme_classic()
It looks like the mean correlation is zero, but it’s definitely positive (just small):
t.test(lang_word_node_corrs_named$r_coefficient)
##
## One Sample t-test
##
## data: lang_word_node_corrs_named$r_coefficient
## t = 21.052, df = 97910, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 0.002427969 0.002926476
## sample estimates:
## mean of x
## 0.002677222
MIN_WORDS <- 2
MAX_WORDS <- 8
I’m filtering here by nodes that have between 2 and 8 languages and between 2 and 8 words.
lang_word_node_corrs_named %>%
arrange(-r_coefficient) %>%
filter(n.x >= MIN_LANGS & n.x <= MAX_LANGS & n.y >= MIN_WORDS & n.y <= MAX_WORDS) %>%
filter(node1 != node2) %>%
select(node1_name, node2_name, r_coefficient) %>%
DT::datatable()
N_WORDS_SAMPLE <- 20
Get the pairwise correlations between word nodes. This is huge and computationally slow. Here’s just look at a small subsample (N = 20).
target_words <- sample(unique(mean_rank_subtrees_words$node_id), N_WORDS_SAMPLE, replace = F) %>%
as.numeric()
word_word_node_pairs <- expand.grid(target_words, target_words) %>%
get_unique_pairs() %>%
data.frame()
word_word_node_corrs <- map2_df(word_word_node_pairs$X1,
word_word_node_pairs$X2,
get_pairwise_corr,
mean_rank_subtrees_words,
mean_rank_subtrees_words)
word_word_node_corrs_named <- word_word_node_corrs %>%
left_join(word_ids_df, by = c("node1" = "node_id")) %>%
rename(node1_name = node) %>%
left_join(word_ids_df, by = c("node2" = "node_id")) %>%
rename(node2_name = node)
ggplot(word_word_node_corrs_named, aes(x = r_coefficient)) +
geom_histogram() +
ggtitle("Distribution of lang-word correlation coefficients") +
theme_classic()
I’m filtering here by nodes that have between 2 and 8 words.
word_word_node_corrs_named %>%
arrange(-r_coefficient) %>%
filter(n.x >= MIN_WORDS & n.x <= MAX_WORDS & n.y >= MIN_WORDS & n.y <= MAX_WORDS) %>%
filter(node1 != node2) %>%
select(node1_name, node2_name, r_coefficient) %>%
DT::datatable()