This analysis excludes Yoruba, Thai, and Igbo, which have a fair number of embeddings missing. The same pattern of findings holds, though, when they’re included.

# Path to the by-item pairwise Swadesh correlation file.
INFILE <- "data/language_pairwise_swadesh_correlations_by_item.csv"

# Languages dropped from the analysis because many of their items are missing.
EXCLUDED_LANGS <- c("ig", "yo", "th")

# Keep only the rows in which neither member of the pair is an excluded
# language.
corr_df <- read_csv(INFILE) %>%
  filter(!(lang1 %in% EXCLUDED_LANGS | lang2 %in% EXCLUDED_LANGS))

Physical distance

by language pair and item

# Geographic distance between every pair of languages, computed from the
# lon/lat columns of lang_names. Given a single set of points, distm()
# returns a square matrix whose rows and columns follow the row order of
# lang_names.
pairwise_physical_distances <- geosphere::distm(
  as.matrix(lang_names[, c("lon", "lat")])
)

colnames(pairwise_physical_distances) <-
  lang_names$google_lang_name

# Reshape the matrix to long format: one row per (lang1, lang2) pair.
# check.names = FALSE keeps the language codes exactly as given. The default
# rewrites any code that is not a syntactic R name (for example one containing
# "-"), which would make lang2 disagree with lang1 and with the join keys of
# corr_df, silently producing NA distances.
pairwise_physical_dists <- pairwise_physical_distances %>%
  data.frame(check.names = FALSE) %>%
  mutate(lang1 = colnames(pairwise_physical_distances)) %>%
  select(lang1, everything()) %>%
  gather(key = "lang2", value = "dist", -lang1)

# Attach the geographic distance to every language-pair row of corr_df; pairs
# without a match in the distance table get dist = NA.
corr_geo <- left_join(
  corr_df,
  pairwise_physical_dists,
  by = c("lang1", "lang2")
)

# One fitted linear trend per item, relating the pairwise correlation (x) to
# the geographic distance between the two languages (y). Confidence bands are
# switched off so the per-item lines stay readable.
ggplot(corr_geo, aes(x = r, y = dist)) +
  # facet_wrap(~item) +
  # geom_point(size = .2) +
  geom_smooth(method = "lm", aes(color = item), se = FALSE) +
  xlab("swadesh pairwise distance correlation") +
  ylab("distance (meters)") +
  theme_classic()

# For each item, run cor.test() on r against geographic distance across
# language pairs, then print the estimates sorted in ascending order.
corr_geo %>%
  filter(!is.na(item)) %>%
  group_by(item) %>%
  do(tidy(cor.test(x = .[["r"]], y = .[["dist"]]))) %>%
  select(item, estimate, p.value) %>%
  arrange(estimate) %>%
  kable()
item estimate p.value
sand -0.2637058 0.0000000
moon -0.2448806 0.0000001
smoke -0.2318134 0.0000004
wind -0.2239801 0.0000011
day -0.2160789 0.0000026
sun -0.2128056 0.0000037
ash -0.2106228 0.0000046
water -0.2077791 0.0000125
mountain -0.2076504 0.0000126
year -0.2059432 0.0000076
night -0.2009535 0.0000242
sea -0.1949803 0.0000230
cloud -0.1841137 0.0000650
fire -0.1828787 0.0001252
earth -0.1781696 0.0001122
sky -0.1736057 0.0001685
stone -0.1683951 0.0002647
lake -0.1657765 0.0003306
dust -0.1615782 0.0007182
salt -0.1597295 0.0008278
river -0.1585378 0.0009064
star -0.1491928 0.0012528

by item

# Mean geographic distance from each language to every language in the
# distance table (the language's own zero self-distance is part of the mean).
# NOTE(review): the mean also covers any languages that were filtered out of
# corr_df, if lang_names lists them -- confirm this is intended.
geo_dists_by_lang <- pairwise_physical_dists %>%
  group_by(lang1) %>%
  summarize(dist = mean(dist, na.rm = TRUE))

# Add each language's mean distance to by_item (defined elsewhere in the
# file), matching on the language code. Rows with no item, such as languages
# that appear only in the distance table, are dropped.
corr_geo_lang <- full_join(by_item, geo_dists_by_lang, by = "lang1") %>%
  filter(!is.na(item))

# One panel per item: every language is drawn as its text label, with a linear
# fit relating r to the language's mean geographic distance. The legend is
# hidden because the facet strips already name the item.
ggplot(
  corr_geo_lang,
  aes(x = r, y = dist, label = lang1, group = item)
) +
  geom_text() +
  geom_smooth(mapping = aes(color = item), method = "lm") +
  # geom_point(size = .2) +
  facet_wrap(~item) +
  labs(
    x = "swadesh pairwise distance correlation",
    y = "distance (meters)"
  ) +
  theme_classic() +
  theme(legend.position = "none")

# For each item, run cor.test() on r against mean geographic distance across
# languages, then print the estimates sorted in ascending order.
corr_geo_lang %>%
  filter(!is.na(item)) %>%
  group_by(item) %>%
  do(tidy(cor.test(x = .[["r"]], y = .[["dist"]]))) %>%
  select(item, estimate, p.value) %>%
  arrange(estimate) %>%
  kable()
item estimate p.value
sun -0.5279753 0.0022678
moon -0.5025569 0.0039616
sand -0.4925387 0.0056913
day -0.4809710 0.0061620
year -0.4470096 0.0116976
mountain -0.4468227 0.0133094
ash -0.4256814 0.0169621
water -0.4212378 0.0204367
wind -0.4189321 0.0189902
smoke -0.4148587 0.0203086
night -0.3907465 0.0327634
earth -0.3725725 0.0390053
sea -0.3635830 0.0443722
lake -0.3625740 0.0450096
stone -0.3557204 0.0495357
river -0.3540983 0.0548846
dust -0.3523741 0.0561604
star -0.3494708 0.0539712
cloud -0.3476228 0.0553412
salt -0.3396611 0.0663019
sky -0.3369686 0.0637829
fire -0.2862736 0.1251132