# Return the distinct rows of `dat` after sorting the values within each row,
# so that rows holding the same values in a different order (e.g. (a, b) and
# (b, a)) collapse into a single row.
#
# `dat` is coerced to a matrix by `apply()`, so mixed column types are
# converted to a common type before sorting. The result is a data frame with
# default column names.
get_unique_pairs <- function(dat) {
  # apply() over rows returns one sorted row per *column*, hence the transpose.
  sorted_by_row <- apply(dat, 1, sort)
  row_pairs <- t(sorted_by_row)
  data.frame(unique(row_pairs))
}
# Three-letter language codes. Used below as row names for the language
# factor matrices, so the order here presumably has to match the row order of
# those matrices -- TODO confirm against the MATLAB export.
LANGS <- c(
  "ARA", "BEN", "BUL", "CHI", "DUT", "ENG", "FAS", "FRE", "GER", "GRE",
  "GUJ", "HIN", "IBO", "IND", "ITA", "JPN", "KAN", "KOR", "MAL", "MAR",
  "NEP", "PAN", "POL", "POR", "RUM", "RUS", "SPA", "TAM", "TEL", "TGL",
  "THA", "TUR", "URD", "VIE", "YOR"
)

Language dendrograms

Low

# Location of the MATLAB tensor-factorization output for the "low" condition.
TENSOR_OUTPUT_FILENAME_LOW <- "../B_tensor_factorization_matlab/tensor_data_as_mat/output/4_corr_shared_ktensor_output_WWL_low.mat"

# Load the .mat file as an R list.
tensor_outputL <- read.mat(TENSOR_OUTPUT_FILENAME_LOW)

# Pull the first three entries of `k` into separate objects. Going by the
# variable names these are the language factor and the two word factors --
# TODO confirm the entry order against the MATLAB export.
langsL <- tensor_outputL$k[[3]]
words1L <- tensor_outputL$k[[1]]
words2L <- tensor_outputL$k[[2]]

Read in the language and word tensor decomposition solutions from the Bayesian model. The tensor factorization solution has rank 620.

# Build a matrix with one row per language, labelled with the language codes.
all_corrs_mat_langsL <- as.matrix(langsL)
rownames(all_corrs_mat_langsL) <- LANGS

# Pairwise Euclidean distances between language rows (dist()'s default
# metric, spelled out here for clarity).
dist_matrixL <- dist(all_corrs_mat_langsL, method = "euclidean")

# Agglomerative clustering of the languages.
# NOTE(review): per ?hclust, "ward.D" on unsquared distances does not
# implement Ward's (1963) criterion; "ward.D2" does. Confirm "ward.D" is the
# intended method before changing it, since it alters the dendrogram.
lang_clustsL <- hclust(d = dist_matrixL, method = "ward.D")

# Draw the dendrogram (auto-printed at top level).
ggdendro::ggdendrogram(lang_clustsL)

High

# Location of the MATLAB tensor-factorization output for the "high" condition.
TENSOR_OUTPUT_FILENAME_HIGH <- "../B_tensor_factorization_matlab/tensor_data_as_mat/output/4_corr_shared_ktensor_output_WWL_high.mat"

# Load the .mat file as an R list.
tensor_outputH <- read.mat(TENSOR_OUTPUT_FILENAME_HIGH)

# Pull the first three entries of `k` into separate objects. Going by the
# variable names these are the language factor and the two word factors --
# TODO confirm the entry order against the MATLAB export.
langsH <- tensor_outputH$k[[3]]
words1H <- tensor_outputH$k[[1]]
words2H <- tensor_outputH$k[[2]]

Read in the language and word tensor decomposition solutions from the Bayesian model. The tensor factorization solution has rank 647.

# Build a matrix with one row per language, labelled with the language codes.
all_corrs_mat_langsH <- as.matrix(langsH)
rownames(all_corrs_mat_langsH) <- LANGS

# Pairwise Euclidean distances between language rows (dist()'s default
# metric, spelled out here for clarity).
dist_matrixH <- dist(all_corrs_mat_langsH, method = "euclidean")

# Agglomerative clustering of the languages.
# NOTE(review): per ?hclust, "ward.D" on unsquared distances does not
# implement Ward's (1963) criterion; "ward.D2" does. Confirm "ward.D" is the
# intended method before changing it, since it alters the dendrogram.
lang_clustsH <- hclust(d = dist_matrixH, method = "ward.D")

# Draw the dendrogram (auto-printed at top level).
ggdendro::ggdendrogram(lang_clustsH)

Low vs. High

# Reshape the "high" distance object into long format: one row per ordered
# (lang1, lang2) combination, with the distance stored in `dist_H`.
# as.matrix() expands the dist object to the full square symmetric matrix.
Hdists <- dist_matrixH %>%
  as.matrix() %>%
  as.data.frame() %>%
  rownames_to_column("lang1") %>%
  gather("lang2", "dist_H", -lang1)

# Same reshaping for the "low" distances, stored in `dist_L`.
Ldists <- dist_matrixL %>%
  as.matrix() %>%
  as.data.frame() %>%
  rownames_to_column("lang1") %>%
  gather("lang2", "dist_L", -lang1)

# Join on the language pair explicitly rather than relying on the implicit
# natural join (which emits a message and would silently include any other
# shared column), then drop the diagonal (a language paired with itself).
all_dists <- full_join(Hdists, Ldists, by = c("lang1", "lang2")) %>%
  filter(lang1 != lang2)

# Scatter plot of low vs. high distances with a linear fit.
ggplot(all_dists, aes(x = dist_L, y = dist_H)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

# Pearson correlation between low and high pairwise distances.
# NOTE(review): because the full symmetric matrix is used, every unordered
# language pair appears twice here (A-B and B-A), which doubles the sample
# size seen by cor.test() (df = n_langs * (n_langs - 1) - 2) and makes the
# p-value anticonservative. Pairwise distances that share a language are also
# not independent. Consider keeping one row per pair (e.g.
# `filter(lang1 < lang2)`) and/or a permutation-based Mantel test -- TODO
# confirm the intended analysis before changing the reported statistics.
cor.test(all_dists$dist_L, all_dists$dist_H)
## 
##  Pearson's product-moment correlation
## 
## data:  all_dists$dist_L and all_dists$dist_H
## t = 2.0131, df = 1188, p-value = 0.04433
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.00148425 0.11475307
## sample estimates:
##        cor 
## 0.05830631