Read in all pairwise correlations

# Pairwise language distances from the single "low_model" file, tagged with
# prop_words = "1" (presumably the run that used all words -- confirm) so it
# can be stacked with the per-proportion runs read below.
all_words <- mutate(
  read_feather("../../../data/processed/pairwise_country_distances/low_model_xling_distances_from_word_dists.feather"),
  prop_words = as.character(1.0)
)

# One feather file per parameter setting; only the bare file names are listed.
all_files <- list.files("../../../data/processed/pairwise_country_distances/params/")

# Read every parameter file, remember which file each row came from, then
# recover the proportion from the file name: it is the text between "PROP_"
# and "_low". The leftover name fragments (a, c) are dropped, prop_words is
# moved to the first column, and the all-words rows are appended.
all_corrs <- all_files %>%
  map_df(function(fname) {
    mutate(
      read_feather(paste0("../../../data/processed/pairwise_country_distances/params/", fname)),
      file = fname
    )
  }) %>%
  separate(file, into = c("a", "b"), sep = "PROP_") %>%
  separate(b, into = c("prop_words", "c"), sep = "_low") %>%
  select(-a, -c) %>%
  select(prop_words, everything()) %>%
  bind_rows(all_words)
# prop_words is deliberately left as character here; it is converted to
# numeric only where needed for plotting.
# mutate(prop_words = as.numeric(prop_words)) %>%
  

# Build the set of unordered language pairs. Columns 2 and 3 of all_corrs are
# selected by position (assumed to be the two language columns -- confirm).
# Sorting within each row puts both orderings of a pair into the same form, so
# unique() keeps one row per pair. apply() returns the sorted pairs as
# columns, hence the transpose.
actually_unique <- all_corrs[, 2:3] %>%
  apply(1, sort) %>%
  t() %>%
  as.data.frame() %>%
  unique() %>%
  rename(lang1 = V1, lang2 = V2)

# Attach the correlation rows to each unique pair. The join has no explicit
# `by`, so it uses every column name the two tables share. Character columns
# are then converted to factors.
all_corrs_unique <- mutate_if(
  left_join(actually_unique, all_corrs),
  is.character,
  as.factor
)

Mean pairwise correlations

# Distribution of pairwise estimates, one histogram panel per prop_words
# level. Fill colour repeats the facet variable, so the legend is hidden.
all_corrs_unique %>%
  ggplot(aes(x = estimate, fill = prop_words)) +
  facet_grid(. ~ prop_words) +
  geom_histogram() +
  theme_bw() +
  theme(legend.position = "none")

# Mean pairwise estimate per prop_words level. multi_boot_standard() is
# expected to return `mean`, `ci_lower` and `ci_upper` columns, which the plot
# below relies on (presumably bootstrapped CIs from langcog -- confirm).
mean_corrs <- all_corrs_unique %>%
  group_by(prop_words) %>%
  multi_boot_standard(col = "estimate")

# prop_words is a factor at this point, so go through as.character() to get
# the numeric proportion rather than the factor level index.
ggplot(mean_corrs, aes(x = as.numeric(as.character(prop_words)), y = mean)) +
  geom_line() +
  geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper), size = .2, color = "red") +
  ylab("mean correlation between languages \n(pearson's r)") +
  xlab("prop words included") +
  theme_bw() +
  # Fixed misspelling in the displayed title ("corrlelation").
  ggtitle("mean pairwise language correlation \nas a function of prop words included")

Dendrograms

Similarity matrix between languages (based on pairwise word distances)

# Build one dendrogram per prop_words level and arrange them in a grid.
all_props <- unique(all_corrs_unique$prop_words)

# Preallocate the list of plots instead of growing it inside the loop.
p <- vector("list", length(all_props))

# seq_along() is safe when all_props is empty (1:length() would yield c(1, 0)).
for (i in seq_along(all_props)) {
  # Wide layout: one row per lang1, one column per lang2, cells = estimate.
  # Columns are picked by position (1, 2, 4), assumed to be lang1, lang2 and
  # estimate -- confirm against the column order of all_corrs_unique.
  wide_corrs <- all_corrs_unique %>%
    filter(prop_words == all_props[i]) %>%
    select(c(1, 2, 4)) %>%
    spread(lang2, estimate)

  all_corrs_mat <- as.matrix(wide_corrs[, -1])
  rownames(all_corrs_mat) <- wide_corrs$lang1

  # Euclidean distance between the languages' rows of estimates.
  dist_matrix <- dist(all_corrs_mat)

  # Lower-triangle copy for tabular display.
  # NOTE(review): values are not auto-printed inside a `for` loop, so this
  # kable() result is discarded; wrap it in print() if the table should appear.
  dist_matrix_kable <- as.matrix(dist_matrix)
  dist_matrix_kable[upper.tri(dist_matrix_kable)] <- NA
  kable(as.data.frame(dist_matrix_kable), digits = 2)

  p[[i]] <- ggdendro::ggdendrogram(hclust(dist_matrix)) +
    ggtitle(paste0("prop =", all_props[i])) +
    theme(axis.text.x = element_text(size = 6))
}

library(gridExtra)
# Lay the dendrograms out three per row.
do.call("grid.arrange", c(p, ncol = 3))

Correlations with linguistic distances

Read in data

# WALS distances per language pair: keep the two language codes and the
# distance, upper-case the codes, and drop duplicate rows.
# NOTE(review): absolute, machine-specific path.
wals <-
  read_csv("/Users/mollylewis/Documents/research/Projects/conceptviz/analyses/R_scripts/wals_for_ETS.csv") %>%
  transmute(
    lang1 = toupper(lang1),
    lang2 = toupper(lang2),
    wals_euclidean_dist
  ) %>%
  distinct()

# ASJP distances per language pair, prepared the same way.
# NOTE(review): absolute, machine-specific path.
lang_dists <-
  read_csv("/Users/mollylewis/Documents/research/Projects/conceptviz/data/supplementary_data/cultural_sim_measures/lang/asjp_dists.csv") %>%
  transmute(
    lang1 = toupper(lang1),
    lang2 = toupper(lang2),
    asjp_dist
  ) %>%
  distinct()

# Attach both linguistic distance measures to every language pair. The join
# keys are spelled out: lang_dists and wals each hold only lang1, lang2 and
# their distance column, so these are the keys the implicit join was using.
all_value_pairs <-
  all_corrs_unique %>%
  left_join(lang_dists, by = c("lang1", "lang2")) %>%
  left_join(wals, by = c("lang1", "lang2"))

# Keep pairs of two different languages that have at least one distance
# measure. Two fixes relative to the previous version:
#   * `|` (element-wise) replaces `||`, which is scalar: it only looked at the
#     first element on older R and is an error on vectors since R 4.3.
#   * the second test now checks the column `wals_euclidean_dist`; the bare
#     name `wals` resolved to the whole WALS data frame, not a column.
all_value_pairs_f <- all_value_pairs %>%
  filter(!is.na(asjp_dist) | !is.na(wals_euclidean_dist),
         lang1 != lang2)

WALS

(grammatical distance)

# Scatter of semantic-space correlation against WALS distance, one panel per
# prop_words level, with a linear fit overlaid.
all_value_pairs_f %>%
  ggplot(aes(x = wals_euclidean_dist, y = estimate)) +
  geom_point() +
  geom_smooth(method = "lm") +
  facet_wrap(~prop_words) +
  labs(
    x = "grammatical distance",
    y = "semantic space correlations",
    title = "Language-pairwise distances"
  ) +
  theme_minimal() +
  theme(
    text = element_text(size = 13),
    legend.background = element_rect(fill = "gray90", size = .5)
  )

# Correlation test between WALS distance and the estimate within each
# prop_words level, tidied to one row per level; the constant descriptive
# columns are dropped before printing.
all_value_pairs_f %>%
  group_by(prop_words) %>%
  do(tidy(cor.test(.$wals_euclidean_dist, .$estimate))) %>%
  select(-method, -alternative) %>%
  kable()
prop_words estimate statistic p.value parameter conf.low conf.high
0.01 0.0106232 0.1607667 0.8724189 229 -0.1186172 0.1395095
0.02 -0.0800410 -1.2151382 0.2255647 229 -0.2069802 0.0495486
0.03 -0.1451142 -2.2194701 0.0274354 229 -0.2691505 -0.0163427
0.04 -0.1733081 -2.6629243 0.0082962 229 -0.2957694 -0.0452425
0.05 -0.1857603 -2.8608557 0.0046158 229 -0.3074657 -0.0580751
0.1 -0.1905870 -2.9379568 0.0036415 229 -0.3119896 -0.0630607
0.2 -0.1890258 -2.9129938 0.0039341 229 -0.3105269 -0.0614473
0.3 -0.1910124 -2.9447619 0.0035652 229 -0.3123880 -0.0635003
0.4 -0.1918603 -2.9583330 0.0034175 229 -0.3131821 -0.0643770
1 -0.1911828 -2.9474892 0.0035351 229 -0.3125476 -0.0636765

ASJP

(lexical distance)

# Scatter of semantic-space correlation against ASJP distance, one panel per
# prop_words level, with a linear fit overlaid.
ggplot(all_value_pairs_f, aes(x = asjp_dist, y = estimate)) +
  facet_wrap(~prop_words) +
  geom_point() +
  geom_smooth(method = "lm") +
  ylab("semantic space correlations") +
  # Fixed label: the x axis is asjp_dist (lexical distance); it was labelled
  # "grammatical distance", copied over from the WALS plot.
  xlab("lexical distance") +
  ggtitle("Language-pairwise distances") +
  theme_minimal() +
  theme(
    text = element_text(size = 13),
    legend.background = element_rect(fill = "gray90", size = .5)
  )

# Correlation test between ASJP distance and the estimate within each
# prop_words level, tidied to one row per level.
all_value_pairs_f %>%
  group_by(prop_words) %>%
  do(tidy(cor.test(.$asjp_dist, .$estimate))) %>%
  select(-method, -alternative) %>%
  kable()
prop_words estimate statistic p.value parameter conf.low conf.high
0.01 -0.2339244 -2.441822 0.0163172 103 -0.4073270 -0.0442428
0.02 -0.4040874 -4.483381 0.0000191 103 -0.5529286 -0.2302554
0.03 -0.4777920 -5.519873 0.0000003 103 -0.6132942 -0.3149706
0.04 -0.5160870 -6.114991 0.0000000 103 -0.6440459 -0.3600350
0.05 -0.5216627 -6.205575 0.0000000 103 -0.6484890 -0.3666575
0.1 -0.5099233 -6.016085 0.0000000 103 -0.6391240 -0.3527322
0.2 -0.5085667 -5.994487 0.0000000 103 -0.6380394 -0.3511275
0.3 -0.5089598 -6.000738 0.0000000 103 -0.6383537 -0.3515923
0.4 -0.5092203 -6.004886 0.0000000 103 -0.6385620 -0.3519005
1 -0.5089941 -6.001285 0.0000000 103 -0.6383812 -0.3516330

Excluding outliers

# Same ASJP scatter as above, restricted to pairs with asjp_dist above .7
# (the pairs at or below that cut-off are the ones treated as outliers).
all_value_pairs_f %>%
  filter(asjp_dist > .7) %>%
  ggplot(aes(x = asjp_dist, y = estimate)) +
  geom_point() +
  geom_smooth(method = "lm") +
  facet_wrap(~prop_words) +
  labs(
    x = "lexical distance",
    y = "semantic space correlations",
    title = "Language-pairwise distances"
  ) +
  theme_minimal() +
  theme(
    text = element_text(size = 13),
    legend.background = element_rect(fill = "gray90", size = .5)
  )

# Per-level correlation tests on the same restricted subset.
all_value_pairs_f %>%
  filter(asjp_dist > .7) %>%
  group_by(prop_words) %>%
  do(tidy(cor.test(.$asjp_dist, .$estimate))) %>%
  select(-method, -alternative) %>%
  kable()
prop_words estimate statistic p.value parameter conf.low conf.high
0.01 0.0600458 0.589390 0.5569842 96 -0.1400435 0.2554234
0.02 -0.1204153 -1.188472 0.2375791 96 -0.3113960 0.0799148
0.03 -0.2516765 -2.547931 0.0124235 96 -0.4286895 -0.0560550
0.04 -0.3235129 -3.349912 0.0011566 96 -0.4904509 -0.1336729
0.05 -0.3370854 -3.508061 0.0006886 96 -0.5019344 -0.1486040
0.1 -0.3538034 -3.706273 0.0003518 96 -0.5159999 -0.1671144
0.2 -0.3516228 -3.680196 0.0003848 96 -0.5141703 -0.1646925
0.3 -0.3548796 -3.719169 0.0003364 96 -0.5169024 -0.1683105
0.4 -0.3548765 -3.719132 0.0003365 96 -0.5168999 -0.1683071
1 -0.3537389 -3.705501 0.0003527 96 -0.5159459 -0.1670427