# Collapse order-insensitive duplicates among the rows of `dat`.
#
# The values within each row are sorted, so rows holding the same values in a
# different order (e.g. (a, b) and (b, a)) become identical; duplicated rows
# are then dropped.
#
# dat: a data frame or matrix with one combination per row.
# Returns a data.frame with one row per distinct sorted combination.
get_unique_pairs <- function(dat) {
  # apply() hands back one sorted row per *column*, hence the transpose.
  sorted_by_row <- apply(dat, 1, sort)
  row_matrix <- t(sorted_by_row)
  distinct_rows <- unique(row_matrix)
  data.frame(distinct_rows)
}
# Three-letter language codes, used below as row labels for the language
# factor matrices (so the order here must match the row order of those
# matrices).
LANGS <- c(
  "ARA", "BEN", "BUL", "CHI", "DUT", "ENG", "FAS",
  "FRE", "GER", "GRE", "GUJ", "HIN", "IBO", "IND",
  "ITA", "JPN", "KAN", "KOR", "MAL", "MAR", "NEP",
  "PAN", "POL", "POR", "RUM", "RUS", "SPA", "TAM",
  "TEL", "TGL", "THA", "TUR", "URD", "VIE", "YOR"
)
# Location of the MATLAB output holding the "low" tensor factorization.
TENSOR_OUTPUT_FILENAME_LOW <-
  "../B_tensor_factorization_matlab/tensor_data_as_mat/output/4_corr_shared_ktensor_output_WWL_low.mat"

# Load the .mat file and pull out the three entries of its `k` element.
# Presumably these are the factor matrices for the two word modes and the
# language mode (going by the variable names) -- TODO confirm against the
# MATLAB script that wrote the file.
tensor_outputL <- read.mat(TENSOR_OUTPUT_FILENAME_LOW)
words1L <- tensor_outputL$k[[1L]]
words2L <- tensor_outputL$k[[2L]]
langsL  <- tensor_outputL$k[[3L]]
Read in the language and word tensor decomposition solutions from the Bayesian model for the "low" tensor. The tensor factorization solution has rank 620.
# Language factor matrix for the "low" solution, one row per language,
# labelled with the codes in LANGS.
all_corrs_mat_langsL <- as.matrix(langsL)
rownames(all_corrs_mat_langsL) <- LANGS

# Pairwise distances between language rows (dist() defaults), then
# agglomerative clustering on those distances.
# NOTE(review): the stats::hclust documentation says "ward.D" only implements
# Ward's criterion when given squared distances, while dist() returns
# unsquared ones -- confirm that "ward.D2" was not the intended method.
dist_matrixL <- dist(x = all_corrs_mat_langsL)
lang_clustsL <- hclust(d = dist_matrixL, method = "ward.D")

# Draw the dendrogram of languages.
ggdendro::ggdendrogram(lang_clustsL)
# Location of the MATLAB output holding the "high" tensor factorization.
TENSOR_OUTPUT_FILENAME_HIGH <-
  "../B_tensor_factorization_matlab/tensor_data_as_mat/output/4_corr_shared_ktensor_output_WWL_high.mat"

# Load the .mat file and pull out the three entries of its `k` element.
# Presumably these are the factor matrices for the two word modes and the
# language mode (going by the variable names) -- TODO confirm against the
# MATLAB script that wrote the file.
tensor_outputH <- read.mat(TENSOR_OUTPUT_FILENAME_HIGH)
words1H <- tensor_outputH$k[[1L]]
words2H <- tensor_outputH$k[[2L]]
langsH  <- tensor_outputH$k[[3L]]
Read in the language and word tensor decomposition solutions from the Bayesian model for the "high" tensor. The tensor factorization solution has rank 647.
# Language factor matrix for the "high" solution, one row per language,
# labelled with the codes in LANGS.
all_corrs_mat_langsH <- as.matrix(langsH)
rownames(all_corrs_mat_langsH) <- LANGS

# Pairwise distances between language rows (dist() defaults), then
# agglomerative clustering on those distances.
# NOTE(review): the stats::hclust documentation says "ward.D" only implements
# Ward's criterion when given squared distances, while dist() returns
# unsquared ones -- confirm that "ward.D2" was not the intended method.
dist_matrixH <- dist(x = all_corrs_mat_langsH)
lang_clustsH <- hclust(d = dist_matrixH, method = "ward.D")

# Draw the dendrogram of languages.
ggdendro::ggdendrogram(lang_clustsH)
# Reshape each language-by-language distance object into long format: one row
# per ordered (lang1, lang2) combination with its distance in dist_H / dist_L.
Hdists <- dist_matrixH %>%
  as.matrix() %>%
  as.data.frame() %>%
  rownames_to_column("lang1") %>%
  gather("lang2", "dist_H", -lang1)

Ldists <- dist_matrixL %>%
  as.matrix() %>%
  as.data.frame() %>%
  rownames_to_column("lang1") %>%
  gather("lang2", "dist_L", -lang1)

# Line up the two sets of distances by language pair and drop the zero
# self-distances on the diagonal. The join keys are spelled out so the join
# does not depend on (or announce) whichever columns happen to be shared.
# NOTE(review): as.matrix() on a dist object gives the full symmetric matrix,
# so every unordered pair is kept twice here (A-B and B-A). Anything
# downstream that treats rows as independent observations sees twice the
# number of unique pairs -- consider keeping only `lang1 < lang2`.
all_dists <- full_join(Hdists, Ldists, by = c("lang1", "lang2")) %>%
  filter(lang1 != lang2)
# Scatter plot of pairwise language distances in the "low" solution (x)
# against the "high" solution (y), with a fitted linear trend line.
ggplot(data = all_dists, mapping = aes(x = dist_L, y = dist_H)) +
  theme_classic() +
  geom_point() +
  geom_smooth(method = "lm")

# Pearson correlation test between the two sets of pairwise distances.
cor.test(all_dists$dist_L, all_dists$dist_H)
##
## Pearson's product-moment correlation
##
## data: all_dists$dist_L and all_dists$dist_H
## t = 2.0131, df = 1188, p-value = 0.04433
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.00148425 0.11475307
## sample estimates:
## cor
## 0.05830631