QUESTION: Do centroid distances correlate with linguistic measures of distance?

# Load the wide table of pairwise centroid distances; its column names are
# used below as the language identifiers.
hd_centroid_dists_raw <- read_csv(
  "../../../data/processed/pairwise_country_distances/HD_centroid_distances_all.csv"
)

# Reshape to long format with one row per (lang1, lang2) pair.
# Each row is labelled with the column name at the same position, so this
# assumes the rows are stored in the same order as the columns -- TODO confirm.
hd_centroid_dists <- hd_centroid_dists_raw %>%
  mutate(lang2 = colnames(hd_centroid_dists_raw)) %>%
  gather(key = "lang1", value = "hd_dist", -lang2) %>%
  select(lang1, lang2, hd_dist) %>%
  filter(!is.na(hd_dist)) # drop cells that hold no distance

Read in data

# Pairwise WALS distances, reduced to one row per language pair.
# NOTE(review): absolute, machine-specific path -- this only resolves on the
# original author's machine; consider a project-relative path.
wals <- read_csv("/Users/mollylewis/Documents/research/Projects/conceptviz/analyses/R_scripts/wals_for_ETS.csv") %>%
  select(lang1, lang2, wals_euclidean_dist) %>%
  mutate(
    lang1 = toupper(lang1),
    lang2 = toupper(lang2)
  ) %>%
  distinct() %>%
  # get_unique_relation_id() is defined outside this section; presumably it
  # returns an order-independent "A_B" key for the pair -- verify.
  # It is applied row by row; toupper() already yields character vectors.
  rowwise() %>%
  mutate(all_codes = get_unique_relation_id(lang1, lang2)) %>%
  ungroup() %>% # do not let the row-wise grouping leak into `wals`
  distinct(all_codes, .keep_all = TRUE) %>%
  select(-lang1, -lang2) %>%
  # Split the key back into the two codes so the table can be joined on them.
  separate(all_codes, into = c("lang1", "lang2"), sep = "_")

# Pairwise ASJP distances, reduced to one row per language pair.
# NOTE(review): absolute, machine-specific path -- this only resolves on the
# original author's machine; consider a project-relative path.
lang_dists <- read_csv("/Users/mollylewis/Documents/research/Projects/conceptviz/data/supplementary_data/cultural_sim_measures/lang/asjp_dists.csv") %>%
  select(lang1, lang2, asjp_dist) %>%
  mutate(
    lang1 = toupper(lang1),
    lang2 = toupper(lang2)
  ) %>%
  distinct() %>%
  # get_unique_relation_id() is defined outside this section; presumably it
  # returns an order-independent "A_B" key for the pair -- verify.
  # It is applied row by row; toupper() already yields character vectors.
  rowwise() %>%
  mutate(all_codes = get_unique_relation_id(lang1, lang2)) %>%
  ungroup() %>% # do not let the row-wise grouping leak into `lang_dists`
  distinct(all_codes, .keep_all = TRUE) %>%
  select(-lang1, -lang2) %>%
  # Split the key back into the two codes so the table can be joined on them.
  separate(all_codes, into = c("lang1", "lang2"), sep = "_")


# Attach both linguistic distance measures to the centroid distances.
# Left joins keep every centroid pair; a pair absent from a measure gets NA.
# The keys are spelled out so that no other shared column can silently become
# part of the join.
# NOTE(review): a match requires lang1/lang2 to be in the same order in both
# tables -- confirm the centroid pairs follow the ordering produced by
# get_unique_relation_id().
all_value_pairs <- hd_centroid_dists %>%
  left_join(lang_dists, by = c("lang1", "lang2")) %>%
  left_join(wals, by = c("lang1", "lang2"))

We’re missing 377 out of 630 pairwise Euclidean (WALS) distances and 510 out of 630 pairwise ASJP distances.

# Keep pairs of two different languages that have at least one linguistic
# distance available.
# filter() needs the element-wise `|` here: `||` is a scalar operator (it errors
# on vectors in R >= 4.3 and only looked at the first element before that).
# The WALS column is `wals_euclidean_dist`; `wals` is the whole data frame.
all_value_pairs_f <- all_value_pairs %>%
  filter(
    !is.na(asjp_dist) | !is.na(wals_euclidean_dist),
    lang1 != lang2
  )

0.0.1 WALS

# Scatter plot of centroid (semantic) distance against WALS distance, with a
# linear fit overlaid.
ggplot(
  data = all_value_pairs_f,
  mapping = aes(x = wals_euclidean_dist, y = hd_dist)
) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(
    title = "Language-pairwise distances",
    x = "grammatical distance",
    y = "semantic distance \n(in model space)"
  ) +
  theme_minimal() +
  theme(
    text = element_text(size = 13),
    legend.background = element_rect(fill = "gray90", size = .5)
  )

# Correlation test between the WALS and centroid distances.
cor.test(all_value_pairs_f$wals_euclidean_dist, all_value_pairs_f$hd_dist)
## 
##  Pearson's product-moment correlation
## 
## data:  all_value_pairs_f$wals_euclidean_dist and all_value_pairs_f$hd_dist
## t = 2.3524, df = 229, p-value = 0.0195
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.02502648 0.27718969
## sample estimates:
##      cor 
## 0.153608

0.0.2 ASJP

# Scatter plot of centroid (semantic) distance against ASJP distance, with a
# linear fit overlaid.
ggplot(
  data = all_value_pairs_f,
  mapping = aes(x = asjp_dist, y = hd_dist)
) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(
    title = "Language-pairwise distances",
    # ASJP distances are computed from word lists, so the axis is lexical
    # distance; "grammatical distance" was carried over from the WALS plot.
    x = "lexical distance",
    y = "semantic distance \n(in model space)"
  ) +
  theme_minimal() +
  theme(
    text = element_text(size = 13),
    legend.background = element_rect(fill = "gray90", size = .5)
  )

# Correlation test between the ASJP and centroid distances.
cor.test(all_value_pairs_f$asjp_dist, all_value_pairs_f$hd_dist)
## 
##  Pearson's product-moment correlation
## 
## data:  all_value_pairs_f$asjp_dist and all_value_pairs_f$hd_dist
## t = 3.8191, df = 103, p-value = 0.0002292
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1721492 0.5094683
## sample estimates:
##       cor 
## 0.3521938

These are the outliers with asjp_dist < .7 (very similar languages):

# Tabulate the language pairs whose ASJP distance is below 0.7.
all_value_pairs_f %>%
  filter(asjp_dist < 0.7) %>%
  kable()
lang1 lang2 hd_dist asjp_dist wals_euclidean_dist
BUL POL 0.9164063 0.5353481 100.81622
BUL RUS 0.8779301 0.6247660 123.95096
HIN URD 0.6351134 0.4254257 NA
ITA POR 1.1273105 0.5832219 86.92013
ITA SPA 1.2005809 0.5058394 145.24048
POL RUS 0.9268018 0.6076491 136.06592
POR SPA 0.7701023 0.6058661 148.58768

Excluding these outliers:

# Drop the outlier pairs (asjp_dist < .7) and repeat the ASJP analysis.
# `>=` makes this the exact complement of the outlier definition: with `>`, a
# pair sitting exactly at .7 was neither listed as an outlier nor kept.
# Rows with a missing asjp_dist are dropped by filter() as before.
all_value_pairs_f_asjpsub <- all_value_pairs_f %>%
  filter(asjp_dist >= .7)

# Scatter plot of centroid (semantic) distance against ASJP distance for the
# remaining pairs, with a linear fit overlaid.
ggplot(
  data = all_value_pairs_f_asjpsub,
  mapping = aes(x = asjp_dist, y = hd_dist)
) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(
    title = "Language-pairwise distances",
    # ASJP distances are computed from word lists, so the axis is lexical
    # distance; "grammatical distance" was carried over from the WALS plot.
    x = "lexical distance",
    y = "semantic distance \n(in model space)"
  ) +
  theme_minimal() +
  theme(
    text = element_text(size = 13),
    legend.background = element_rect(fill = "gray90", size = .5)
  )

# Correlation test between the ASJP and centroid distances, outliers excluded.
cor.test(all_value_pairs_f_asjpsub$asjp_dist, all_value_pairs_f_asjpsub$hd_dist)
## 
##  Pearson's product-moment correlation
## 
## data:  all_value_pairs_f_asjpsub$asjp_dist and all_value_pairs_f_asjpsub$hd_dist
## t = -0.30951, df = 96, p-value = 0.7576
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.2285628  0.1678987
## sample estimates:
##         cor 
## -0.03157396