QUESTION: Do centroid distances correlate with linguistic measures of distance?

# Load the wide table of pairwise centroid distances; its column names are
# used as the language labels below.
hd_centroid_dists_raw <- read_csv(
  "../../all_data/pairwise_country_distances/HD_centroid_distances.csv"
)

# Reshape to long format: one row per (lang1, lang2) pair with a non-missing
# distance. Rows are labelled with the column names, so this presumably expects
# a square matrix whose rows are in the same order as its columns -- TODO
# confirm against the CSV.
hd_centroid_dists <- hd_centroid_dists_raw %>%
  mutate(
    id = 1:n(),
    lang2 = names(hd_centroid_dists_raw)
  ) %>%
  gather(key = "lang1", value = "hd_dist", -c(id, lang2)) %>%
  filter(!is.na(hd_dist)) %>%
  select(lang1, lang2, hd_dist)

Read in data

# WALS distances, reduced to one row per pair id so they can be joined onto the
# centroid distances. get_unique_relation_id() is defined elsewhere; it
# presumably builds an order-independent "A_B" key for a pair -- verify.
wals <- read_csv(
  "/Users/mollylewis/Documents/research/Projects/conceptviz/analyses/R_scripts/wals_for_ETS.csv"
) %>%
  mutate(
    lang1 = toupper(lang1),
    lang2 = toupper(lang2)
  ) %>%
  select(lang1, lang2, wals_euclidean_dist) %>%
  distinct() %>%
  rowwise() %>% # evaluate the helper one row at a time
  mutate(
    all_codes = get_unique_relation_id(as.character(lang1), as.character(lang2))
  ) %>%
  distinct(all_codes, .keep_all = TRUE) %>%
  select(-lang1, -lang2) %>%
  # split the pair id back into the two join columns
  separate(all_codes, into = c("lang1", "lang2"), sep = "_")

# ASJP distances, reduced to one row per pair id so they can be joined onto the
# centroid distances. get_unique_relation_id() is defined elsewhere; it
# presumably builds an order-independent "A_B" key for a pair -- verify.
lang_dists <- read_csv(
  "/Users/mollylewis/Documents/research/Projects/conceptviz/data/supplementary_data/cultural_sim_measures/lang/asjp_dists.csv"
) %>%
  mutate(
    lang1 = toupper(lang1),
    lang2 = toupper(lang2)
  ) %>%
  select(lang1, lang2, asjp_dist) %>%
  distinct() %>%
  rowwise() %>% # evaluate the helper one row at a time
  mutate(
    all_codes = get_unique_relation_id(as.character(lang1), as.character(lang2))
  ) %>%
  distinct(all_codes, .keep_all = TRUE) %>%
  select(-lang1, -lang2) %>%
  # split the pair id back into the two join columns
  separate(all_codes, into = c("lang1", "lang2"), sep = "_")


# Attach both linguistic distance measures to every centroid-distance pair.
# The join keys are spelled out so the result cannot silently change if one of
# the tables later gains another shared column.
# NOTE(review): lang_dists and wals hold each pair in the order produced by
# get_unique_relation_id(), while hd_centroid_dists keeps the (column, row)
# order of the matrix. Pairs stored in the opposite order will not match and
# end up NA here -- confirm this is not the source of the missing distances.
all_value_pairs <-
  hd_centroid_dists %>%
  left_join(lang_dists, by = c("lang1", "lang2")) %>%
  left_join(wals, by = c("lang1", "lang2"))

We’re missing 21 out of 66 Euclidean pairwise dists and 45 out of 66 pairwise ASJP dists.

# Keep only pairs of two different languages that have at least one of the
# linguistic distances.
# The NA test must be element-wise (`|`, not the scalar `||`) and must look at
# the `wals_euclidean_dist` column; a bare `wals` resolves to the WALS data
# frame in the calling environment, not to a column of this table.
all_value_pairs_f <- all_value_pairs %>%
  filter(
    !is.na(asjp_dist) | !is.na(wals_euclidean_dist),
    lang1 != lang2
  )

0.0.1 WALS

# Scatterplot of centroid distance against WALS distance, one point per
# language pair, with a fitted linear trend line.
ggplot(all_value_pairs_f, aes(x = wals_euclidean_dist, y = hd_dist)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(
    title = "Language-pairwise distances",
    x = "grammatical distance",
    y = "semantic distance \n(in model space)"
  ) +
  theme_minimal() +
  theme(
    text = element_text(size = 13),
    legend.background = element_rect(fill = "gray90", size = .5)
  )

# Pearson correlation (the cor.test default) between WALS distance and centroid
# distance. cor.test() uses only the pairs where both values are present, so
# rows with a missing WALS distance do not contribute.
cor.test(all_value_pairs_f$wals_euclidean_dist, all_value_pairs_f$hd_dist)
## 
##  Pearson's product-moment correlation
## 
## data:  all_value_pairs_f$wals_euclidean_dist and all_value_pairs_f$hd_dist
## t = 3.0024, df = 34, p-value = 0.004995
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1521244 0.6835175
## sample estimates:
##      cor 
## 0.457781

0.0.2 ASJP

# Scatterplot of centroid distance against ASJP distance, one point per
# language pair, with a fitted linear trend line.
# The x-axis label used to read "grammatical distance", copied from the WALS
# plot; ASJP distances are computed from word lists, so the axis is labelled
# as lexical distance.
ggplot(all_value_pairs_f, aes(x = asjp_dist, y = hd_dist)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(
    title = "Language-pairwise distances",
    x = "lexical distance",
    y = "semantic distance \n(in model space)"
  ) +
  theme_minimal() +
  theme(
    text = element_text(size = 13),
    legend.background = element_rect(fill = "gray90", size = .5)
  )

# Pearson correlation (the cor.test default) between ASJP distance and centroid
# distance. cor.test() uses only the pairs where both values are present, so
# rows with a missing ASJP distance do not contribute.
cor.test(all_value_pairs_f$asjp_dist, all_value_pairs_f$hd_dist)
## 
##  Pearson's product-moment correlation
## 
## data:  all_value_pairs_f$asjp_dist and all_value_pairs_f$hd_dist
## t = 0.79508, df = 13, p-value = 0.4408
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.3337351  0.6553163
## sample estimates:
##       cor 
## 0.2153427