QUESTION: Do centroid distances correlate with linguistic measures of distance?
# Load the pairwise centroid-distance matrix and reshape it to long format:
# one row per (lang1, lang2) pair with its distance in `hd_dist`.
hd_centroid_dists_raw <- read_csv(
  "../../../data/processed/pairwise_country_distances/HD_centroid_distances_all.csv"
)

hd_centroid_dists <- hd_centroid_dists_raw %>%
  # Label every row with the column name at the same position.
  # NOTE(review): assumes the CSV is a square matrix whose rows are in the
  # same order as its columns -- confirm against the script that writes it.
  mutate(lang2 = colnames(hd_centroid_dists_raw)) %>%
  # Every column except `lang2` is a language; stack them into
  # (lang1, hd_dist) pairs. gather() is kept (instead of pivot_longer()) so
  # the row order of the long table stays column-major as before.
  gather("lang1", "hd_dist", -lang2) %>%
  select(lang1, lang2, hd_dist) %>%
  # Drop the cells of the matrix that carry no distance.
  filter(!is.na(hd_dist))
Read in data
# Pairwise WALS distances (`wals_euclidean_dist`), reduced to one row per
# language pair so they can be joined onto the centroid distances.
# NOTE(review): absolute, machine-specific path -- this only runs on the
# original author's machine; consider a project-relative path.
wals <- read_csv("/Users/mollylewis/Documents/research/Projects/conceptviz/analyses/R_scripts/wals_for_ETS.csv") %>%
  select(lang1, lang2, wals_euclidean_dist) %>%
  # Upper-case the language codes before building the pair key.
  mutate(
    lang1 = toupper(lang1),
    lang2 = toupper(lang2)
  ) %>%
  distinct() %>%
  # get_unique_relation_id() is defined elsewhere; it is called one row at a
  # time here. Presumably it returns an order-independent "A_B" key for the
  # pair -- verify against its definition.
  rowwise() %>%
  mutate(all_codes = get_unique_relation_id(as.character(lang1), as.character(lang2))) %>%
  # Leave row-wise mode as soon as the key is built so the remaining steps
  # (and the resulting `wals` object) operate on a plain tibble.
  ungroup() %>%
  # Keep the first row for each pair key.
  distinct(all_codes, .keep_all = TRUE) %>%
  select(-lang1, -lang2) %>%
  # Split the key back into lang1 / lang2: gets unique pairs for joining.
  separate(all_codes, c("lang1", "lang2"), "_")
# Pairwise ASJP distances (`asjp_dist`), reduced to one row per language pair
# so they can be joined onto the centroid distances.
# NOTE(review): absolute, machine-specific path -- this only runs on the
# original author's machine; consider a project-relative path.
lang_dists <- read_csv("/Users/mollylewis/Documents/research/Projects/conceptviz/data/supplementary_data/cultural_sim_measures/lang/asjp_dists.csv") %>%
  select(lang1, lang2, asjp_dist) %>%
  # Upper-case the language codes before building the pair key.
  mutate(
    lang1 = toupper(lang1),
    lang2 = toupper(lang2)
  ) %>%
  distinct() %>%
  # get_unique_relation_id() is defined elsewhere; it is called one row at a
  # time here. Presumably it returns an order-independent "A_B" key for the
  # pair -- verify against its definition.
  rowwise() %>%
  mutate(all_codes = get_unique_relation_id(as.character(lang1), as.character(lang2))) %>%
  # Leave row-wise mode as soon as the key is built so the remaining steps
  # (and the resulting `lang_dists` object) operate on a plain tibble.
  ungroup() %>%
  # Keep the first row for each pair key.
  distinct(all_codes, .keep_all = TRUE) %>%
  select(-lang1, -lang2) %>%
  # Split the key back into lang1 / lang2 for joining.
  separate(all_codes, c("lang1", "lang2"), "_")
# Attach the ASJP and WALS distances to every centroid-distance pair.
# Left joins keep all centroid pairs; pairs without a match in a distance
# table get NA in that table's distance column.
# The join keys are spelled out so the joins do not depend on (or print a
# message about) whichever column names the tables happen to share.
# NOTE(review): a pair only matches if lang1/lang2 appear in the same order
# in `hd_centroid_dists` as in the keys produced by get_unique_relation_id()
# -- worth confirming, given how many pairs end up without a match.
all_value_pairs <- hd_centroid_dists %>%
  left_join(lang_dists, by = c("lang1", "lang2")) %>%
  left_join(wals, by = c("lang1", "lang2"))
We’re missing 377 out of 630 pairwise Euclidean (WALS) distances and 510 out of 630 pairwise ASJP distances.
# Keep pairs of two different languages that have at least one linguistic
# distance (ASJP or WALS) available.
# The row mask must use the element-wise `|`: the scalar `||` only looks at a
# single value (and is an error for longer vectors from R 4.3 on). The WALS
# test must also refer to the column `wals_euclidean_dist`, not to the `wals`
# data frame.
# Rows dropped here have NA in both distance columns, so they could not
# contribute to the correlations or plots below anyway.
all_value_pairs_f <- all_value_pairs %>%
  filter(
    !is.na(asjp_dist) | !is.na(wals_euclidean_dist),
    lang1 != lang2
  )
# Scatter plot of WALS distance against centroid distance, one point per
# language pair, with a fitted linear trend line.
ggplot(all_value_pairs_f, aes(x = wals_euclidean_dist, y = hd_dist)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(
    title = "Language-pairwise distances",
    x = "grammatical distance",
    y = "semantic distance \n(in model space)"
  ) +
  theme_minimal() +
  theme(
    legend.background = element_rect(fill = "gray90", size = .5),
    text = element_text(size = 13)
  )

# Pearson correlation between the two distances (cor.test's default method).
cor.test(all_value_pairs_f$wals_euclidean_dist, all_value_pairs_f$hd_dist)
##
## Pearson's product-moment correlation
##
## data: all_value_pairs_f$wals_euclidean_dist and all_value_pairs_f$hd_dist
## t = 2.3524, df = 229, p-value = 0.0195
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.02502648 0.27718969
## sample estimates:
## cor
## 0.153608
# Scatter plot of ASJP distance against centroid distance, one point per
# language pair, with a fitted linear trend line.
ggplot(all_value_pairs_f, aes(x = asjp_dist, y = hd_dist)) +
  geom_point() +
  geom_smooth(method = "lm") +
  ylab("semantic distance \n(in model space)") +
  # The x axis shows `asjp_dist`, not the WALS measure, so it must not reuse
  # the "grammatical distance" label of the WALS plot.
  xlab("lexical distance (ASJP)") +
  ggtitle("Language-pairwise distances") +
  theme_minimal() +
  theme(
    text = element_text(size = 13),
    legend.background = element_rect(fill = "gray90", size = .5)
  )

# Pearson correlation between the two distances (cor.test's default method).
cor.test(all_value_pairs_f$asjp_dist, all_value_pairs_f$hd_dist)
##
## Pearson's product-moment correlation
##
## data: all_value_pairs_f$asjp_dist and all_value_pairs_f$hd_dist
## t = 3.8191, df = 103, p-value = 0.0002292
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1721492 0.5094683
## sample estimates:
## cor
## 0.3521938
These are the outliers with asjp_dist < .7 (very similar languages):
# Tabulate the language pairs whose ASJP distance is below 0.7.
all_value_pairs_f %>%
  filter(asjp_dist < 0.7) %>%
  kable()
| lang1 | lang2 | hd_dist | asjp_dist | wals_euclidean_dist |
|---|---|---|---|---|
| BUL | POL | 0.9164063 | 0.5353481 | 100.81622 |
| BUL | RUS | 0.8779301 | 0.6247660 | 123.95096 |
| HIN | URD | 0.6351134 | 0.4254257 | NA |
| ITA | POR | 1.1273105 | 0.5832219 | 86.92013 |
| ITA | SPA | 1.2005809 | 0.5058394 | 145.24048 |
| POL | RUS | 0.9268018 | 0.6076491 | 136.06592 |
| POR | SPA | 0.7701023 | 0.6058661 | 148.58768 |
Excluding these outliers
# Repeat the ASJP analysis on pairs with an ASJP distance above 0.7.
# filter() also drops the rows where asjp_dist is NA.
all_value_pairs_f_asjpsub <- filter(all_value_pairs_f, asjp_dist > .7)

# Scatter plot of ASJP distance against centroid distance for the subset,
# with a fitted linear trend line.
ggplot(all_value_pairs_f_asjpsub, aes(x = asjp_dist, y = hd_dist)) +
  geom_point() +
  geom_smooth(method = "lm") +
  ylab("semantic distance \n(in model space)") +
  # The x axis shows `asjp_dist`, not the WALS measure, so it must not reuse
  # the "grammatical distance" label of the WALS plot.
  xlab("lexical distance (ASJP)") +
  ggtitle("Language-pairwise distances") +
  theme_minimal() +
  theme(
    text = element_text(size = 13),
    legend.background = element_rect(fill = "gray90", size = .5)
  )

# Pearson correlation on the subset (cor.test's default method).
cor.test(all_value_pairs_f_asjpsub$asjp_dist, all_value_pairs_f_asjpsub$hd_dist)
##
## Pearson's product-moment correlation
##
## data: all_value_pairs_f_asjpsub$asjp_dist and all_value_pairs_f_asjpsub$hd_dist
## t = -0.30951, df = 96, p-value = 0.7576
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.2285628 0.1678987
## sample estimates:
## cor
## -0.03157396