TO DO:

# Pairwise correlations between languages, by Swadesh item. The columns
# lang1, lang2, item and r are the ones used further down.
INFILE <- "data/language_pairwise_swadesh_correlations_by_item.csv"
corr_df <- read_csv(INFILE)

# Language lookup table, reduced to the two name columns and the coordinates.
lang_names <- select(
  read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/L2ETS/studies/study2/data/processed/lang_names/ets_to_google_langcodes_complete.csv"),
  ETS_lang_name, google_lang_name, lon, lat
)

# Square matrix of distances between every pair of rows of `lang_names`;
# the coordinate columns are passed as (lon, lat), in that order.
pairwise_physical_distances <- lang_names %>%
  select(lon, lat) %>%
  as.matrix() %>%
  geosphere::distm()

# Label the columns of the distance matrix with the Google language codes
# (the matrix was built from the rows of `lang_names`, so the order matches).
colnames(pairwise_physical_distances) <-
  lang_names$google_lang_name

# Long format: one row per ordered pair of distinct languages, with columns
# lang1 (matrix row), lang2 (matrix column) and dist.
#
# The table is built straight from the matrix rather than through
# data.frame(): with its default check.names = TRUE, data.frame() rewrites
# column names that are not syntactic R names (e.g. a code containing "-")
# or that are duplicated, so lang2 could stop matching lang1 and the later
# joins on (lang1, lang2) would silently produce NA distances.
pairwise_physical_dists <- data.frame(
  # as.vector() flattens column by column, so the row index (lang1) varies
  # fastest and the column index (lang2) changes once per full column.
  lang1 = rep(lang_names$google_lang_name, times = nrow(lang_names)),
  lang2 = rep(lang_names$google_lang_name, each = nrow(lang_names)),
  dist = as.vector(pairwise_physical_distances),
  stringsAsFactors = FALSE
) %>%
  filter(lang1 != lang2)  # drop the diagonal (a language paired with itself)

# Quick look at the distribution of pairwise distances.
hist(pairwise_physical_dists$dist)

# Attach the physical distance of each language pair to the item-level
# correlations. The join keys are spelled out so the join does not depend on
# (or print a message about) whichever column names the two tables share.
corr_geo <- left_join(corr_df, pairwise_physical_dists,
                      by = c("lang1", "lang2"))

# One panel per Swadesh item: correlation (x) against distance (y) with a
# linear fit. `FALSE` is used instead of `F`, which is an ordinary variable
# and can be reassigned.
corr_geo %>%
  ggplot(aes(x = r, y = dist)) +
  facet_wrap(~item) +
  geom_point(size = .1) +
  geom_smooth(aes(color = item), se = FALSE, method = "lm") +
  xlab("swadesh pairwise distance correlation") +
  ylab("distance (meters)") +
  theme_classic()

# All items in a single panel: raw points plus one linear fit per item.
# NOTE(review): xlim() sets scale limits, so rows with r outside [0, 1] are
# removed before the lm fits are computed (not just hidden). If only zooming
# is intended, coord_cartesian(xlim = c(0, 1)) keeps all rows in the fit.
corr_geo %>%
  ggplot(aes(x = r, y = dist)) +
  geom_point(size = .1) +
  xlab("Word Pairwise Correlation (Pearson's r)") +
  ylab("Distance (meters)") +
  theme_classic(base_size = 15) +
  # `FALSE` rather than `F`: `F` is a reassignable variable, not a constant.
  geom_smooth(aes(color = item), se = FALSE, method = "lm", size = .5) +
  xlim(0, 1) +
  scale_color_discrete(name = "Swadesh word") +
  theme(legend.position = c(0.8, 0.7))

# Same single-panel figure without the raw points: only the per-item linear
# fits are drawn.
# NOTE(review): xlim() sets scale limits, so rows with r outside [0, 1] are
# removed before the lm fits are computed (not just hidden). If only zooming
# is intended, coord_cartesian(xlim = c(0, 1)) keeps all rows in the fit.
corr_geo %>%
  ggplot(aes(x = r, y = dist)) +
  xlab("Word Pairwise Correlation (Pearson's r)") +
  ylab("Distance (meters)") +
  theme_classic(base_size = 15) +
  # `FALSE` rather than `F`: `F` is a reassignable variable, not a constant.
  geom_smooth(aes(color = item), se = FALSE, method = "lm", size = .5) +
  xlim(0, 1) +
  scale_color_discrete(name = "Swadesh word") +
  theme(legend.position = c(0.8, 0.7))

# Per-item correlation test between the word correlation (r) and physical
# distance (dist): one row per item with the estimate and its p-value, sorted
# from the most negative estimate upwards and rendered as a table.
corr_geo %>%
  group_by(item) %>%
  # cor.test() runs once per item (its default method is Pearson); tidy()
  # converts each test result into a data-frame row.
  do(tidy(cor.test(.$r, .$dist))) %>%
  select(item, estimate, p.value) %>%
  arrange(estimate) %>%
  kable()
item estimate p.value
sun -0.3083445 0.0000000
mountain -0.2966179 0.0000000
day -0.2912059 0.0000000
year -0.2869088 0.0000000
moon -0.2864168 0.0000000
water -0.2820849 0.0000000
river -0.2795073 0.0000000
sea -0.2753706 0.0000000
earth -0.2727574 0.0000000
salt -0.2706258 0.0000000
fire -0.2384460 0.0000000
dust -0.2376438 0.0000000
sand -0.2364880 0.0000001
smoke -0.2264250 0.0000001
sky -0.2252467 0.0000001
stone -0.2157788 0.0000002
cloud -0.2099182 0.0000005
night -0.2023687 0.0000109
wind -0.1966189 0.0000053
ash -0.1916804 0.0000092
lake -0.1765190 0.0000453
star -0.1327586 0.0016249

Excluding outliers

# Read every feather file in the pairwise-word directory and stack them into
# a single data frame.
all_dists <- fs::dir_ls("/Volumes/wilbur_the_great/pairwise_swadesh_words/") %>%
  map_df(read_feather)

# Number of distinct `word1` values per language, listing only languages with
# fewer than 22 (the item tables in this document contain 22 items), fewest
# first.
all_dists %>%
  distinct(lang, word1) %>%
  count(lang) %>%
  filter(n < 22) %>%
  arrange(n) %>%
  kable()
lang n
ig 13
yo 14
th 18
de 20
nl 20
kn 21
ml 21
ne 21
te 21

Four languages are outliers in their pairwise correlations (“ja”, “ig”, “zh”, “yo”). Let’s exclude them.

# Bootstrapped mean correlation for a single language.
#
# Args:
#   current_lang: language code; every row in which it appears as either
#     lang1 or lang2 contributes to the estimate.
#   df: data frame of pairwise correlations with columns lang1, lang2 and r.
#     Defaults to the global `corr_df`. The function previously ignored this
#     argument and always read `corr_df`, so calls that omit `df` behave
#     exactly as before, while calls that pass it now get what they asked for.
#
# Returns:
#   The result of multi_boot_standard(col = "r") for that language with a
#   `lang` column added (the plot below reads its mean, ci_lower and ci_upper
#   columns).
get_mean_corr <- function(current_lang, df = corr_df) {
  df %>%
    filter(lang1 == current_lang | lang2 == current_lang) %>%
    # Constant dummy group so that all of the language's rows are pooled into
    # one estimate (presumably multi_boot_standard summarises per group);
    # the helper column is removed again at the end.
    mutate(temp = "temp") %>%
    group_by(temp) %>%
    multi_boot_standard(col = "r") %>%
    ungroup() %>%
    mutate(lang = current_lang) %>%
    select(-temp)
}

# One row per distinct language code.
mean_corr <- map_df(unique(lang_names$google_lang_name), get_mean_corr,
                    df = corr_df)

# Mean correlation per language with its bootstrapped interval, languages
# ordered along the x axis by their mean.
mean_corr %>%
  ggplot(aes(
    x = fct_reorder(lang, mean),
    y = mean,
    ymin = ci_lower,
    ymax = ci_upper
  )) +
  geom_pointrange() +
  theme_classic()

# Languages treated as outliers; listed once so both sides of each pair are
# filtered against the same set.
outlier_langs <- c("ja", "ig", "zh", "yo")

# Drop every pair in which either language is an outlier.
corr_df_filtered <- corr_df %>%
  filter(
    !(lang1 %in% outlier_langs),
    !(lang2 %in% outlier_langs)
  )

# Attach physical distances; join keys are explicit so the join does not
# depend on (or print a message about) whichever columns the tables share.
corr_geo_filtered <- left_join(corr_df_filtered, pairwise_physical_dists,
                               by = c("lang1", "lang2"))

# One panel per Swadesh item, outlier languages excluded. `FALSE` is used
# instead of `F`, which is an ordinary variable and can be reassigned.
corr_geo_filtered %>%
  ggplot(aes(x = r, y = dist)) +
  facet_wrap(~item) +
  geom_point(size = .1) +
  geom_smooth(aes(color = item), se = FALSE, method = "lm") +
  xlab("swadesh pairwise distance correlation") +
  ylab("distance (meters)") +
  theme_classic()

# Per-item linear fits in a single panel (no raw points), outlier languages
# excluded. `FALSE` is used instead of `F`, which is an ordinary variable and
# can be reassigned.
corr_geo_filtered %>%
  ggplot(aes(x = r, y = dist)) +
  geom_smooth(aes(color = item), se = FALSE, method = "lm") +
  xlab("swadesh pairwise distance correlation") +
  ylab("distance (meters)") +
  theme_classic()

# Per-item correlation test between the word correlation (r) and physical
# distance (dist) on the outlier-filtered data: one row per item with the
# estimate and its p-value, sorted from the most negative estimate upwards.
corr_geo_filtered %>%
  group_by(item) %>%
  # cor.test() runs once per item (its default method is Pearson); tidy()
  # converts each test result into a data-frame row.
  do(tidy(cor.test(.$r, .$dist))) %>%
  select(item, estimate, p.value) %>%
  arrange(estimate) %>%
  kable()
item estimate p.value
sand -0.2040444 0.0000180
night -0.1790781 0.0002872
day -0.1706691 0.0002177
smoke -0.1704664 0.0002216
mountain -0.1651925 0.0005417
sun -0.1638499 0.0006019
moon -0.1534174 0.0013293
dust -0.1335875 0.0052596
year -0.1298341 0.0050462
fire -0.1201527 0.0121463
earth -0.0996430 0.0316937
sea -0.0994672 0.0319973
wind -0.0930592 0.0448917
ash -0.0785465 0.0906802
water -0.0737212 0.1247240
cloud -0.0564941 0.2240141
lake -0.0417242 0.3853391
sky -0.0170535 0.7137856
stone -0.0000216 0.9996299
river 0.0073467 0.8785656
salt 0.0693106 0.1489742
star 0.0818810 0.0777520