Read in all pairwise correlations
# Pairwise distances from the un-subsampled run (presumably the full word
# set -- confirm); tagged with prop_words "1" so they stack with the
# per-proportion files read below.
all_words <- read_feather("../../../data/processed/pairwise_country_distances/low_model_xling_distances_from_word_dists.feather") %>%
  mutate(prop_words = as.character(1.0))

# One feather file per parameter setting. The proportion is recovered from
# the file name: the text between "PROP_" and "_low".
all_files <- list.files("../../../data/processed/pairwise_country_distances/params/")
all_corrs <- map_df(
  all_files,
  function(fname) {
    fpath <- paste0("../../../data/processed/pairwise_country_distances/params/", fname)
    mutate(read_feather(fpath), file = fname)
  }
) %>%
  separate(file, sep = "PROP_", into = c("a", "b")) %>%
  separate(b, sep = "_low", into = c("prop_words", "c")) %>%
  select(-a, -c) %>%
  select(prop_words, everything()) %>%
  bind_rows(all_words)
# mutate(prop_words = as.numeric(prop_words)) %>%   (left disabled)

# Sort the two values in columns 2:3 (the language pair) within each row and
# drop duplicate rows, so every unordered pair is listed once.
actually_unique <- all_corrs[, 2:3] %>%
  apply(1, sort) %>%
  t() %>%
  as.data.frame() %>%
  unique() %>%
  rename(lang1 = V1, lang2 = V2)

# Attach the correlation rows to the de-duplicated pairs; character columns
# become factors for plotting/faceting.
all_corrs_unique <- left_join(actually_unique, all_corrs) %>%
  mutate_if(is.character, as.factor)
# Distribution of the pairwise correlation estimates, one panel per
# proportion of words; the fill legend is redundant with the facets.
all_corrs_unique %>%
  ggplot(aes(x = estimate, fill = prop_words)) +
  geom_histogram() +
  facet_grid(. ~ prop_words) +
  theme_bw() +
  theme(legend.position = "none")
# Per-proportion summary of the pairwise estimates. multi_boot_standard()
# supplies the `mean`, `ci_lower` and `ci_upper` columns used in the plot.
mean_corrs <- all_corrs_unique %>%
  group_by(prop_words) %>%
  multi_boot_standard(col = "estimate")

# prop_words is a factor at this point, so it must go through as.character()
# before as.numeric() to get the proportion rather than the level index.
ggplot(mean_corrs, aes(x = as.numeric(as.character(prop_words)), y = mean)) +
  geom_line() +
  geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper), size = .2, color = "red") +
  ylab("mean correlation between languages \n(pearson's r)") +
  xlab("prop words included") +
  theme_bw() +
  # Title spelling fixed ("corrlelation" -> "correlation").
  ggtitle("mean pairwise language correlation \nas a function of prop words included")
Similarity matrix between languages (based on pairwise word distances)
# Build one hierarchical-clustering dendrogram per proportion of words and
# arrange them in a three-column grid.
all_props <- unique(all_corrs_unique$prop_words)
p <- vector("list", length(all_props))  # preallocated, one slot per proportion

# seq_along() rather than 1:length(): the latter iterates over c(1, 0) when
# all_props is empty.
for (i in seq_along(all_props)) {
  # lang1 x lang2 table of correlation estimates for this proportion
  # (columns selected by name instead of by position 1, 2, 4).
  wide_corrs <- all_corrs_unique %>%
    filter(prop_words == all_props[i]) %>%
    select(lang1, lang2, estimate) %>%
    spread(lang2, estimate)

  all_corrs_mat <- as.matrix(wide_corrs[, -1])
  rownames(all_corrs_mat) <- wide_corrs$lang1

  # Distances between languages' rows of correlation estimates.
  dist_matrix <- dist(all_corrs_mat)

  # Lower-triangle copy of the distance matrix for tabular display.
  dist_matrix_kable <- as.matrix(dist_matrix)
  dist_matrix_kable[upper.tri(dist_matrix_kable)] <- NA
  # NOTE(review): values are not auto-printed inside a for loop, so this
  # table is built and discarded. Wrap in print() (and set the chunk to
  # results = "asis") if the table is meant to appear in the output.
  kable(as.data.frame(dist_matrix_kable), digits = 2)

  p[[i]] <- ggdendro::ggdendrogram(hclust(dist_matrix)) +
    ggtitle(paste0("prop =", all_props[i])) +
    theme(axis.text.x = element_text(size = 6))
}

library(gridExtra)
do.call("grid.arrange", c(p, ncol = 3))
Read in data
# NOTE(review): both CSV paths below are absolute paths on one machine; they
# will need to be made relative/configurable to run elsewhere.

# Pairwise WALS distance between languages; language codes are
# upper-cased before joining.
wals <- read_csv("/Users/mollylewis/Documents/research/Projects/conceptviz/analyses/R_scripts/wals_for_ETS.csv") %>%
  select(lang1, lang2, wals_euclidean_dist) %>%
  mutate(
    lang1 = toupper(lang1),
    lang2 = toupper(lang2)
  ) %>%
  distinct()

# Pairwise ASJP distance between languages, same treatment.
lang_dists <- read_csv("/Users/mollylewis/Documents/research/Projects/conceptviz/data/supplementary_data/cultural_sim_measures/lang/asjp_dists.csv") %>%
  select(lang1, lang2, asjp_dist) %>%
  mutate(
    lang1 = toupper(lang1),
    lang2 = toupper(lang2)
  ) %>%
  distinct()

# Attach both distance measures to every (pair, prop_words) row. The join
# keys are spelled out so an accidental shared column cannot change the join.
all_value_pairs <-
  all_corrs_unique %>%
  left_join(lang_dists, by = c("lang1", "lang2")) %>%
  left_join(wals, by = c("lang1", "lang2"))

# Keep rows that have at least one of the two distance measures and drop
# self-pairs. The original condition used the scalar `||` (an error on
# vectors in R >= 4.3, first-element-only before that) and referred to the
# `wals` data frame rather than the `wals_euclidean_dist` column; the
# element-wise `|` on the two columns is what a row filter needs.
all_value_pairs_f <- all_value_pairs %>%
  filter(
    !is.na(asjp_dist) | !is.na(wals_euclidean_dist),
    lang1 != lang2
  )
(grammatical distance)
# Semantic-space correlation against WALS distance, one panel per
# proportion of words, with a linear fit in each panel.
all_value_pairs_f %>%
  ggplot(aes(x = wals_euclidean_dist, y = estimate)) +
  facet_wrap(~prop_words) +
  geom_point() +
  geom_smooth(method = "lm") +
  ylab("semantic space correlations") +
  xlab("grammatical distance") +
  ggtitle("Language-pairwise distances") +
  theme_minimal() +
  theme(
    text = element_text(size = 13),
    legend.background = element_rect(fill = "gray90", size = .5)
  )

# Correlation test between WALS distance and the estimate, run separately
# within each prop_words group and tabulated.
all_value_pairs_f %>%
  group_by(prop_words) %>%
  do(tidy(cor.test(x = .$wals_euclidean_dist, y = .$estimate))) %>%
  select(-method, -alternative) %>%
  kable()
| prop_words | estimate | statistic | p.value | parameter | conf.low | conf.high |
|---|---|---|---|---|---|---|
| 0.01 | 0.0106232 | 0.1607667 | 0.8724189 | 229 | -0.1186172 | 0.1395095 |
| 0.02 | -0.0800410 | -1.2151382 | 0.2255647 | 229 | -0.2069802 | 0.0495486 |
| 0.03 | -0.1451142 | -2.2194701 | 0.0274354 | 229 | -0.2691505 | -0.0163427 |
| 0.04 | -0.1733081 | -2.6629243 | 0.0082962 | 229 | -0.2957694 | -0.0452425 |
| 0.05 | -0.1857603 | -2.8608557 | 0.0046158 | 229 | -0.3074657 | -0.0580751 |
| 0.1 | -0.1905870 | -2.9379568 | 0.0036415 | 229 | -0.3119896 | -0.0630607 |
| 0.2 | -0.1890258 | -2.9129938 | 0.0039341 | 229 | -0.3105269 | -0.0614473 |
| 0.3 | -0.1910124 | -2.9447619 | 0.0035652 | 229 | -0.3123880 | -0.0635003 |
| 0.4 | -0.1918603 | -2.9583330 | 0.0034175 | 229 | -0.3131821 | -0.0643770 |
| 1 | -0.1911828 | -2.9474892 | 0.0035351 | 229 | -0.3125476 | -0.0636765 |
(lexical distance)
# Semantic-space correlation against ASJP distance, one panel per
# proportion of words, with a linear fit in each panel.
ggplot(all_value_pairs_f, aes(x = asjp_dist, y = estimate)) +
  facet_wrap(~prop_words) +
  geom_point() +
  geom_smooth(method = "lm") +
  ylab("semantic space correlations") +
  # x is asjp_dist, i.e. the lexical measure; the label previously read
  # "grammatical distance" (copied from the WALS plot).
  xlab("lexical distance") +
  ggtitle("Language-pairwise distances") +
  theme_minimal() +
  theme(
    text = element_text(size = 13),
    legend.background = element_rect(fill = "gray90", size = .5)
  )

# Correlation test between ASJP distance and the estimate, run separately
# within each prop_words group and tabulated.
all_value_pairs_f %>%
  group_by(prop_words) %>%
  do(tidy(cor.test(.$asjp_dist, .$estimate))) %>%
  select(-method, -alternative) %>%
  kable()
| prop_words | estimate | statistic | p.value | parameter | conf.low | conf.high |
|---|---|---|---|---|---|---|
| 0.01 | -0.2339244 | -2.441822 | 0.0163172 | 103 | -0.4073270 | -0.0442428 |
| 0.02 | -0.4040874 | -4.483381 | 0.0000191 | 103 | -0.5529286 | -0.2302554 |
| 0.03 | -0.4777920 | -5.519873 | 0.0000003 | 103 | -0.6132942 | -0.3149706 |
| 0.04 | -0.5160870 | -6.114991 | 0.0000000 | 103 | -0.6440459 | -0.3600350 |
| 0.05 | -0.5216627 | -6.205575 | 0.0000000 | 103 | -0.6484890 | -0.3666575 |
| 0.1 | -0.5099233 | -6.016085 | 0.0000000 | 103 | -0.6391240 | -0.3527322 |
| 0.2 | -0.5085667 | -5.994487 | 0.0000000 | 103 | -0.6380394 | -0.3511275 |
| 0.3 | -0.5089598 | -6.000738 | 0.0000000 | 103 | -0.6383537 | -0.3515923 |
| 0.4 | -0.5092203 | -6.004886 | 0.0000000 | 103 | -0.6385620 | -0.3519005 |
| 1 | -0.5089941 | -6.001285 | 0.0000000 | 103 | -0.6383812 | -0.3516330 |
Excluding outliers
# Same lexical-distance plot, restricted to pairs with asjp_dist above .7.
all_value_pairs_f %>%
  filter(asjp_dist > .7) %>%
  ggplot(aes(x = asjp_dist, y = estimate)) +
  facet_wrap(~prop_words) +
  geom_point() +
  geom_smooth(method = "lm") +
  ylab("semantic space correlations") +
  xlab("lexical distance") +
  ggtitle("Language-pairwise distances") +
  theme_minimal() +
  theme(
    text = element_text(size = 13),
    legend.background = element_rect(fill = "gray90", size = .5)
  )

# Per-proportion correlation tests on the same restricted set of pairs.
all_value_pairs_f %>%
  filter(asjp_dist > .7) %>%
  group_by(prop_words) %>%
  do(tidy(cor.test(x = .$asjp_dist, y = .$estimate))) %>%
  select(-method, -alternative) %>%
  kable()
| prop_words | estimate | statistic | p.value | parameter | conf.low | conf.high |
|---|---|---|---|---|---|---|
| 0.01 | 0.0600458 | 0.589390 | 0.5569842 | 96 | -0.1400435 | 0.2554234 |
| 0.02 | -0.1204153 | -1.188472 | 0.2375791 | 96 | -0.3113960 | 0.0799148 |
| 0.03 | -0.2516765 | -2.547931 | 0.0124235 | 96 | -0.4286895 | -0.0560550 |
| 0.04 | -0.3235129 | -3.349912 | 0.0011566 | 96 | -0.4904509 | -0.1336729 |
| 0.05 | -0.3370854 | -3.508061 | 0.0006886 | 96 | -0.5019344 | -0.1486040 |
| 0.1 | -0.3538034 | -3.706273 | 0.0003518 | 96 | -0.5159999 | -0.1671144 |
| 0.2 | -0.3516228 | -3.680196 | 0.0003848 | 96 | -0.5141703 | -0.1646925 |
| 0.3 | -0.3548796 | -3.719169 | 0.0003364 | 96 | -0.5169024 | -0.1683105 |
| 0.4 | -0.3548765 | -3.719132 | 0.0003365 | 96 | -0.5168999 | -0.1683071 |
| 1 | -0.3537389 | -3.705501 | 0.0003527 | 96 | -0.5159459 | -0.1670427 |