L2ETS Study 2 analyses (HD)

0.0.1 WALS
0.0.2 ASJP

QUESTION: Do centroid distances correlate with linguistic measures of distance?

# Load the wide HD centroid distance matrix (one column per language code).
hd_centroid_dists_raw <- read_csv("../../../data/processed/pairwise_country_distances/HD_centroid_distances_all.csv")

# Row labels are taken from the column names below, so the matrix must be
# square. NOTE(review): this also assumes rows are stored in the same order as
# the columns -- confirm against the script that writes the CSV.
stopifnot(nrow(hd_centroid_dists_raw) == ncol(hd_centroid_dists_raw))

# Reshape to long format: one row per (lang1, lang2) pair with its distance,
# dropping cells that hold no distance.
hd_centroid_dists <- hd_centroid_dists_raw %>%
  mutate(lang2 = colnames(hd_centroid_dists_raw)) %>%
  gather("lang1", "hd_dist", -lang2) %>%
  select(lang1, lang2, hd_dist) %>%
  filter(!is.na(hd_dist))

Read in data

# Pairwise WALS distances, reduced to one row per language pair for joining.
# NOTE(review): absolute, machine-specific path -- this only runs where that
# file exists.
# NOTE(review): get_unique_relation_id() is defined elsewhere; presumably it
# returns an order-independent "CODE1_CODE2" id -- confirm against its definition.
wals <- read_csv("/Users/mollylewis/Documents/research/Projects/conceptviz/analyses/R_scripts/wals_for_ETS.csv") %>%
  select(lang1, lang2, wals_euclidean_dist) %>%
  mutate(lang1 = toupper(lang1),
         lang2 = toupper(lang2)) %>%
  distinct() %>%
  rowwise() %>% # the id helper is applied one row at a time
  mutate(all_codes = get_unique_relation_id(as.character(lang1), as.character(lang2))) %>%
  ungroup() %>% # drop the row-wise grouping so it does not leak into later steps
  distinct(all_codes, .keep_all = TRUE) %>%
  select(-lang1, -lang2) %>%
  separate(all_codes, c("lang1", "lang2"), "_") # split the pair id back into the two join keys

# Pairwise ASJP distances, reduced to one row per language pair for joining.
# NOTE(review): absolute, machine-specific path -- this only runs where that
# file exists.
# NOTE(review): get_unique_relation_id() is defined elsewhere; presumably it
# returns an order-independent "CODE1_CODE2" id -- confirm against its definition.
lang_dists <- read_csv("/Users/mollylewis/Documents/research/Projects/conceptviz/data/supplementary_data/cultural_sim_measures/lang/asjp_dists.csv") %>%
  select(lang1, lang2, asjp_dist) %>%
  mutate(lang1 = toupper(lang1),
         lang2 = toupper(lang2)) %>%
  distinct() %>%
  rowwise() %>% # the id helper is applied one row at a time
  mutate(all_codes = get_unique_relation_id(as.character(lang1), as.character(lang2))) %>%
  ungroup() %>% # drop the row-wise grouping so it does not leak into later steps
  distinct(all_codes, .keep_all = TRUE) %>%
  select(-lang1, -lang2) %>%
  separate(all_codes, c("lang1", "lang2"), "_") # split the pair id back into the two join keys


# Attach the ASJP and WALS distances to every HD pair. Left joins keep all HD
# pairs; a pair with no match gets NA in the corresponding distance column.
# Join keys are spelled out so an extra shared column can never silently
# change the match.
# NOTE(review): a pair only matches if its (lang1, lang2) order here agrees
# with the order produced by get_unique_relation_id() -- confirm.
all_value_pairs <-
  hd_centroid_dists %>%
  left_join(lang_dists, by = c("lang1", "lang2")) %>%
  left_join(wals, by = c("lang1", "lang2"))

We’re missing 377 out of 630 pairwise WALS Euclidean distances and 510 out of 630 pairwise ASJP distances.

# Keep pairs of two different languages that have at least one linguistic
# distance (ASJP or WALS). The NA test must use the elementwise `|` on the
# `wals_euclidean_dist` column: the scalar `||` collapses the condition to a
# single value (and errors on length > 1 input since R 4.3), and a bare `wals`
# is not a column here, so it would refer to the global `wals` data frame.
all_value_pairs_f <- all_value_pairs %>%
  filter(!is.na(asjp_dist) | !is.na(wals_euclidean_dist),
         lang1 != lang2)

0.0.1 WALS

# Scatter plot of HD distance against WALS distance, one point per language
# pair, with a fitted linear trend.
all_value_pairs_f %>%
  ggplot(aes(x = wals_euclidean_dist, y = hd_dist)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(
    x = "grammatical distance",
    y = "semantic distance \n(in model space)",
    title = "Language-pairwise distances"
  ) +
  theme_minimal() +
  theme(
    text = element_text(size = 13),
    legend.background = element_rect(fill = "gray90", size = 0.5)
  )

# Two-sided Pearson correlation between WALS distance and HD distance
# (these are cor.test()'s defaults, spelled out).
cor.test(
  x = all_value_pairs_f$wals_euclidean_dist,
  y = all_value_pairs_f$hd_dist,
  alternative = "two.sided",
  method = "pearson"
)

## 
##  Pearson's product-moment correlation
## 
## data:  all_value_pairs_f$wals_euclidean_dist and all_value_pairs_f$hd_dist
## t = 2.3524, df = 229, p-value = 0.0195
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.02502648 0.27718969
## sample estimates:
##      cor 
## 0.153608

0.0.2 ASJP

# Scatter plot of HD distance against ASJP distance, one point per language
# pair, with a fitted linear trend. ASJP distances are computed from word
# lists, so the x axis is labelled as a lexical (not grammatical) distance.
ggplot(all_value_pairs_f, aes(x = asjp_dist, y = hd_dist)) +
  geom_point() +
  geom_smooth(method = "lm") +
  ylab("semantic distance \n(in model space)") +
  xlab("lexical distance") +
  ggtitle("Language-pairwise distances") +
  theme_minimal() +
  theme(
    text = element_text(size = 13),
    legend.background = element_rect(fill = "gray90", size = .5)
  )

# Two-sided Pearson correlation between ASJP distance and HD distance
# (these are cor.test()'s defaults, spelled out).
cor.test(
  x = all_value_pairs_f$asjp_dist,
  y = all_value_pairs_f$hd_dist,
  alternative = "two.sided",
  method = "pearson"
)

## 
##  Pearson's product-moment correlation
## 
## data:  all_value_pairs_f$asjp_dist and all_value_pairs_f$hd_dist
## t = 3.8191, df = 103, p-value = 0.0002292
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1721492 0.5094683
## sample estimates:
##       cor 
## 0.3521938

These are the outliers with asjp_dist < .7 (very similar languages):

# Tabulate the language pairs whose ASJP distance is below 0.7.
all_value_pairs_f %>%
  filter(asjp_dist < 0.7) %>%
  kable()

lang1	lang2	hd_dist	asjp_dist	wals_euclidean_dist
BUL	POL	0.9164063	0.5353481	100.81622
BUL	RUS	0.8779301	0.6247660	123.95096
HIN	URD	0.6351134	0.4254257	NA
ITA	POR	1.1273105	0.5832219	86.92013
ITA	SPA	1.2005809	0.5058394	145.24048
POL	RUS	0.9268018	0.6076491	136.06592
POR	SPA	0.7701023	0.6058661	148.58768

Excluding these outliers

# Subset without the low-ASJP-distance pairs: keep only asjp_dist > 0.7.
# Rows with a missing asjp_dist are dropped as well, because filter() discards NA.
all_value_pairs_f_asjpsub <- filter(all_value_pairs_f, asjp_dist > .7)

# Same scatter plot as for the full ASJP data, on the subset. ASJP distances
# are computed from word lists, so the x axis is labelled as a lexical (not
# grammatical) distance.
ggplot(all_value_pairs_f_asjpsub, aes(x = asjp_dist, y = hd_dist)) +
  geom_point() +
  geom_smooth(method = "lm") +
  ylab("semantic distance \n(in model space)") +
  xlab("lexical distance") +
  ggtitle("Language-pairwise distances") +
  theme_minimal() +
  theme(
    text = element_text(size = 13),
    legend.background = element_rect(fill = "gray90", size = .5)
  )

# Two-sided Pearson correlation between ASJP distance and HD distance on the
# subset (these are cor.test()'s defaults, spelled out).
cor.test(
  x = all_value_pairs_f_asjpsub$asjp_dist,
  y = all_value_pairs_f_asjpsub$hd_dist,
  alternative = "two.sided",
  method = "pearson"
)

## 
##  Pearson's product-moment correlation
## 
## data:  all_value_pairs_f_asjpsub$asjp_dist and all_value_pairs_f_asjpsub$hd_dist
## t = -0.30951, df = 96, p-value = 0.7576
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.2285628  0.1678987
## sample estimates:
##         cor 
## -0.03157396

L2ETS Study 2 analyses (HD) - Semantics

Molly Lewis

2017-12-03

0.0.1 WALS

0.0.2 ASJP