QUESTION: Do centroid distances correlate with linguistic measures of distance?
# Read the matrix of pairwise centroid distances. The column names are used
# below as the row labels, so the file is assumed to be a square matrix whose
# rows follow the same order as its columns (TODO confirm against the CSV).
hd_centroid_dists_raw <- read_csv("../../all_data/pairwise_country_distances/HD_centroid_distances.csv")

# Reshape to long form: one row per (lang1, lang2) pair that has a distance.
# The former `id = 1:n()` helper column was never used downstream and has been
# dropped (it also broke on an empty input, where `1:0` has length two).
hd_centroid_dists <- hd_centroid_dists_raw %>%
  mutate(lang2 = colnames(hd_centroid_dists_raw)) %>% # label each row with its code
  gather("lang1", "hd_dist", -lang2) %>%
  select(lang1, lang2, hd_dist) %>%
  filter(!is.na(hd_dist))
Read in data
# Reduce a table of pairwise distances to one row per language pair, keyed by
# `lang1` / `lang2`, so it can be joined onto the centroid distances.
#
# `pair_dists` must contain `lang1`, `lang2` and a single distance column.
# Returns the distance column followed by the rebuilt `lang1` and `lang2`.
#
# NOTE(review): get_unique_relation_id() is defined elsewhere; it presumably
# returns an order-independent "A_B" key for the pair -- confirm.
canonicalize_lang_pairs <- function(pair_dists) {
  pair_dists %>%
    mutate(lang1 = toupper(lang1),
           lang2 = toupper(lang2)) %>%
    distinct() %>%
    rowwise() %>% # build the key one pair at a time
    mutate(all_codes = get_unique_relation_id(as.character(lang1), as.character(lang2))) %>%
    ungroup() %>% # row-wise grouping is only needed for the key
    distinct(all_codes, .keep_all = TRUE) %>%
    select(-lang1, -lang2) %>%
    separate(all_codes, c("lang1", "lang2"), "_") # gets unique pairs for joining
}

# NOTE(review): both inputs are absolute paths on one machine; consider making
# them relative to the project like the centroid-distance file.

# WALS-based pairwise distances (`wals_euclidean_dist`).
wals <- read_csv("/Users/mollylewis/Documents/research/Projects/conceptviz/analyses/R_scripts/wals_for_ETS.csv") %>%
  select(lang1, lang2, wals_euclidean_dist) %>%
  canonicalize_lang_pairs()

# ASJP-based pairwise distances (`asjp_dist`).
lang_dists <- read_csv("/Users/mollylewis/Documents/research/Projects/conceptviz/data/supplementary_data/cultural_sim_measures/lang/asjp_dists.csv") %>%
  select(lang1, lang2, asjp_dist) %>%
  canonicalize_lang_pairs()
# Attach both linguistic distance measures to every centroid-distance pair.
# Pairs without a match keep their row with NA in the corresponding column.
# The join keys are spelled out so an accidental extra shared column can never
# silently become part of the key.
all_value_pairs <- hd_centroid_dists %>%
  left_join(lang_dists, by = c("lang1", "lang2")) %>%
  left_join(wals, by = c("lang1", "lang2"))
We’re missing 21 out of 66 pairwise WALS Euclidean distances and 45 out of 66 pairwise ASJP distances.
# Keep pairs of two different languages that have at least one linguistic
# distance available.
# The test must be element-wise (`|`, not the scalar `||`) and must look at the
# joined column `wals_euclidean_dist`; the bare name `wals` resolves to the
# whole WALS data frame rather than a column.
all_value_pairs_f <- all_value_pairs %>%
  filter(!is.na(asjp_dist) | !is.na(wals_euclidean_dist),
         lang1 != lang2)
# Scatter plot of WALS distance (x) against centroid distance (y) for each
# language pair, with a fitted linear trend line.
all_value_pairs_f %>%
  ggplot(aes(wals_euclidean_dist, hd_dist)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(
    title = "Language-pairwise distances",
    x = "grammatical distance",
    y = "semantic distance \n(in model space)"
  ) +
  theme_minimal() +
  theme(text = element_text(size = 13),
        legend.background = element_rect(fill="gray90", size=.5))

# Pearson correlation between the two distances (incomplete pairs are dropped
# by cor.test itself).
cor.test(all_value_pairs_f$wals_euclidean_dist, all_value_pairs_f$hd_dist)
##
## Pearson's product-moment correlation
##
## data: all_value_pairs_f$wals_euclidean_dist and all_value_pairs_f$hd_dist
## t = 3.0024, df = 34, p-value = 0.004995
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1521244 0.6835175
## sample estimates:
## cor
## 0.457781
# Scatter plot of ASJP distance (x) against centroid distance (y) for each
# language pair, with a fitted linear trend line.
# The x-axis label previously read "grammatical distance", copied from the WALS
# plot; ASJP distances are computed from word lists, so they are labelled as
# lexical here.
ggplot(all_value_pairs_f, aes(x = asjp_dist, y = hd_dist)) +
  geom_point() +
  geom_smooth(method = "lm") +
  ylab("semantic distance \n(in model space)") +
  xlab("lexical distance") +
  ggtitle("Language-pairwise distances") +
  theme_minimal() +
  theme(
    text = element_text(size = 13),
    legend.background = element_rect(fill="gray90", size=.5))

# Pearson correlation between the two distances (incomplete pairs are dropped
# by cor.test itself).
cor.test(all_value_pairs_f$asjp_dist, all_value_pairs_f$hd_dist)
##
## Pearson's product-moment correlation
##
## data: all_value_pairs_f$asjp_dist and all_value_pairs_f$hd_dist
## t = 0.79508, df = 13, p-value = 0.4408
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.3337351 0.6553163
## sample estimates:
## cor
## 0.2153427