This analysis excludes Yoruba, Thai, and Igbo, which have a fair number of embeddings missing. The same pattern of findings holds, though, when they’re included.
# Read the per-item pairwise Swadesh correlations and drop every row in which
# either language is "ig", "yo" or "th" (these languages have lots of missing
# items).
INFILE <- "data/language_pairwise_swadesh_correlations_by_item.csv"
corr_df <- filter(
  read_csv(INFILE),
  !(lang1 %in% c("ig", "yo", "th") | lang2 %in% c("ig", "yo", "th"))
)
Physical distance
by language pair and item
# Pairwise geographic distance between every two languages, computed by
# geosphere::distm() from the lon/lat columns of lang_names (distm expects
# longitude first, then latitude). Rows and columns follow the row order of
# lang_names, so the column names also serve as row labels below.
pairwise_physical_distances <- geosphere::distm(
  as.matrix(lang_names[, c("lon", "lat")])
)
colnames(pairwise_physical_distances) <- lang_names$google_lang_name

# Reshape the square matrix to long form: one row per (lang1, lang2) pair.
# check.names = FALSE keeps the language names exactly as given; the default
# would rewrite non-syntactic names (e.g. one containing "-") via make.names(),
# so lang2 would no longer match lang1 or the keys in corr_df.
pairwise_physical_dists <- pairwise_physical_distances %>%
  data.frame(check.names = FALSE) %>%
  mutate(lang1 = colnames(pairwise_physical_distances)) %>%
  select(lang1, everything()) %>%
  gather("lang2", "dist", -lang1)

# Attach the distance to each correlation row. The keys are spelled out so the
# join cannot silently pick up some other column the two tables share.
corr_geo <- left_join(
  corr_df,
  pairwise_physical_dists,
  by = c("lang1", "lang2")
)
# One fitted line per item relating the pairwise correlation (x) to the
# distance between the two languages (y). Faceting and raw points are left
# switched off.
ggplot(data = corr_geo, mapping = aes(x = r, y = dist)) +
  # facet_wrap(~item) +
  # geom_point(size = .2) +
  geom_smooth(aes(color = item), method = "lm", se = FALSE) +
  labs(
    x = "swadesh pairwise distance correlation",
    y = "distance (meters)"
  ) +
  theme_classic()

# For each item, correlate r with dist across language pairs and print the
# estimates with their p-values, lowest estimate first.
corr_geo %>%
  filter(!is.na(item)) %>%
  group_by(item) %>%
  do(tidy(cor.test(~ r + dist, data = .))) %>%
  select(item, estimate, p.value) %>%
  arrange(estimate) %>%
  kable()
| item | estimate | p.value |
|:---|---:|---:|
| sand | -0.2637058 | 0.0000000 |
| moon | -0.2448806 | 0.0000001 |
| smoke | -0.2318134 | 0.0000004 |
| wind | -0.2239801 | 0.0000011 |
| day | -0.2160789 | 0.0000026 |
| sun | -0.2128056 | 0.0000037 |
| ash | -0.2106228 | 0.0000046 |
| water | -0.2077791 | 0.0000125 |
| mountain | -0.2076504 | 0.0000126 |
| year | -0.2059432 | 0.0000076 |
| night | -0.2009535 | 0.0000242 |
| sea | -0.1949803 | 0.0000230 |
| cloud | -0.1841137 | 0.0000650 |
| fire | -0.1828787 | 0.0001252 |
| earth | -0.1781696 | 0.0001122 |
| sky | -0.1736057 | 0.0001685 |
| stone | -0.1683951 | 0.0002647 |
| lake | -0.1657765 | 0.0003306 |
| dust | -0.1615782 | 0.0007182 |
| salt | -0.1597295 | 0.0008278 |
| river | -0.1585378 | 0.0009064 |
| star | -0.1491928 | 0.0012528 |
by item
# Collapse the long distance table to one value per lang1: the mean of dist
# over every lang2 row for that language (NAs ignored).
geo_dists_by_lang <- summarise(
  group_by(pairwise_physical_dists, lang1),
  dist = mean(dist, na.rm = TRUE)
)

# Combine with by_item on their shared columns, then discard rows that have no
# item (e.g. languages present only in the distance table).
# NOTE(review): by_item is defined outside this section; presumably it holds
# one r per language and item -- confirm.
corr_geo_lang <- by_item %>%
  full_join(geo_dists_by_lang) %>%
  filter(!is.na(item))
# One panel per item: each language is drawn as its lang1 label at (r, mean
# distance), with a linear fit on top. The colour legend is hidden because the
# facet titles already name the item.
ggplot(
  data = corr_geo_lang,
  mapping = aes(x = r, y = dist, label = lang1, group = item)
) +
  geom_text() +
  geom_smooth(method = "lm", aes(color = item)) +
  # geom_point(size = .2) +
  facet_wrap(~item) +
  labs(
    x = "swadesh pairwise distance correlation",
    y = "distance (meters)"
  ) +
  theme_classic() +
  theme(legend.position = "none")

# For each item, correlate r with the per-language mean distance and print the
# estimates with their p-values, lowest estimate first.
corr_geo_lang %>%
  filter(!is.na(item)) %>%
  group_by(item) %>%
  do(tidy(cor.test(~ r + dist, data = .))) %>%
  select(item, estimate, p.value) %>%
  arrange(estimate) %>%
  kable()
| item | estimate | p.value |
|:---|---:|---:|
| sun | -0.5279753 | 0.0022678 |
| moon | -0.5025569 | 0.0039616 |
| sand | -0.4925387 | 0.0056913 |
| day | -0.4809710 | 0.0061620 |
| year | -0.4470096 | 0.0116976 |
| mountain | -0.4468227 | 0.0133094 |
| ash | -0.4256814 | 0.0169621 |
| water | -0.4212378 | 0.0204367 |
| wind | -0.4189321 | 0.0189902 |
| smoke | -0.4148587 | 0.0203086 |
| night | -0.3907465 | 0.0327634 |
| earth | -0.3725725 | 0.0390053 |
| sea | -0.3635830 | 0.0443722 |
| lake | -0.3625740 | 0.0450096 |
| stone | -0.3557204 | 0.0495357 |
| river | -0.3540983 | 0.0548846 |
| dust | -0.3523741 | 0.0561604 |
| star | -0.3494708 | 0.0539712 |
| cloud | -0.3476228 | 0.0553412 |
| salt | -0.3396611 | 0.0663019 |
| sky | -0.3369686 | 0.0637829 |
| fire | -0.2862736 | 0.1251132 |