L2ETS Study 2 analyses - xling correlations of pairwise distances

Mean pairwise correlations
Dendrograms
Correlations with linguistic distances
- WALS
- ASJP

Read in all pairwise correlations

# Pairwise language distances read from the single "from_word_dists" file.
# These rows are tagged with prop_words "1" (as.character(1.0)) so they can be
# stacked with the parameterized runs below.
all_words <- mutate(
  read_feather("../../../data/processed/pairwise_country_distances/low_model_xling_distances_from_word_dists.feather"),
  prop_words = as.character(1.0)
)

# Names of the per-parameter files in the params/ directory.
all_files <- list.files("../../../data/processed/pairwise_country_distances/params/")

# Read every parameter file and remember which file each row came from.
# prop_words is the text between "PROP_" and "_low" in the file name; the
# leftover name pieces (a, c) are dropped. prop_words is moved to the first
# column and the all_words rows are appended.
all_corrs <- all_files %>%
  map_df(function(param_file) {
    param_path <- paste0("../../../data/processed/pairwise_country_distances/params/", param_file)
    mutate(read_feather(param_path), file = param_file)
  }) %>%
  separate(file, into = c("a", "b"), sep = "PROP_") %>%
  separate(b, into = c("prop_words", "c"), sep = "_low") %>%
  select(-a, -c) %>%
  select(prop_words, everything()) %>%
  bind_rows(all_words)
# Conversion of prop_words to numeric is left disabled, so it stays character:
#   mutate(prop_words = as.numeric(prop_words))
  

# Sort the two values within each row of columns 2:3 of all_corrs (taken
# positionally; presumably the two language columns), then keep each
# resulting pair once, so that (A, B) and (B, A) collapse to a single row.
actually_unique <- all_corrs[, 2:3] %>%
  apply(1, sort) %>%
  t() %>%
  as.data.frame() %>%
  unique() %>%
  rename(lang1 = V1, lang2 = V2)

# Attach the rows of all_corrs to the de-duplicated pairs (joining on the
# columns the two tables share), then turn every character column into a
# factor.
all_corrs_unique <- mutate_if(
  left_join(actually_unique, all_corrs),
  is.character,
  as.factor
)

Mean pairwise correlations

# Histogram of the pairwise estimates, one panel per prop_words level.
# The fill legend is hidden because the panels already identify the level.
ggplot(data = all_corrs_unique) +
  geom_histogram(mapping = aes(x = estimate, fill = prop_words)) +
  facet_grid(. ~ prop_words) +
  theme_bw() +
  theme(legend.position = "none")

# Summarise `estimate` within each prop_words level using
# multi_boot_standard(); the plot below reads its mean, ci_lower and ci_upper
# columns.
mean_corrs <- multi_boot_standard(
  group_by(all_corrs_unique, prop_words),
  col = "estimate"
)

# Mean estimate against proportion of words included. prop_words is a factor
# here, so it is converted via character to recover the numeric value.
ggplot(
  mean_corrs,
  aes(x = as.numeric(as.character(prop_words)), y = mean)
) +
  geom_line() +
  geom_pointrange(
    aes(ymin = ci_lower, ymax = ci_upper),
    size = .2,
    color = "red"
  ) +
  labs(
    x = "prop words included",
    y = "mean correlation between languages \n(pearson's r)",
    title = "mean pairwise language corrlelation \nas a function of prop words included"
  ) +
  theme_bw()

Dendrograms

Similarity matrix between languages (based on pairwise word distances)

# Build one dendrogram per prop_words level and collect them in `p`.
#
# For each level: reshape the language-pair estimates into a lang1 x lang2
# matrix, compute distances between its rows with dist(), and cluster with
# hclust().
all_props <- unique(all_corrs_unique$prop_words)

# Preallocate one slot per level instead of growing the list.
p <- vector("list", length(all_props))

# seq_along() yields no iterations when all_props is empty, whereas
# 1:length(all_props) would iterate over c(1, 0).
for (i in seq_along(all_props)) {
  # Select by name rather than by position so the reshape does not depend on
  # the column order produced by the upstream join.
  wide_corrs <- all_corrs_unique %>%
    filter(prop_words == all_props[i]) %>%
    select(lang1, lang2, estimate) %>%
    spread(lang2, estimate)

  # Drop the lang1 id column and keep it as row names instead.
  all_corrs_mat <- as.matrix(wide_corrs[, -1])
  rownames(all_corrs_mat) <- wide_corrs$lang1

  dist_matrix <- dist(all_corrs_mat)

  # Lower-triangle table of the distances.
  # NOTE(review): a value is not auto-printed inside a for loop, so this table
  # never appears in the output; wrap it in print() if it should be shown.
  dist_matrix_kable <- as.matrix(dist_matrix)
  dist_matrix_kable[upper.tri(dist_matrix_kable)] <- NA
  kable(as.data.frame(dist_matrix_kable), digits = 2)

  p[[i]] <- ggdendro::ggdendrogram(hclust(dist_matrix)) +
    ggtitle(paste0("prop =", all_props[i])) +
    theme(axis.text.x = element_text(size = 6))
}

library(gridExtra)
# Draw every plot stored in `p` on one page, arranged in three columns.
do.call(grid.arrange, c(p, list(ncol = 3)))

Correlations with linguistic distances

Read in data

# WALS distances per language pair: keep the two language codes
# (upper-cased) and the Euclidean distance, then drop duplicate rows.
# NOTE(review): the path is absolute and specific to one machine.
wals <- read_csv("/Users/mollylewis/Documents/research/Projects/conceptviz/analyses/R_scripts/wals_for_ETS.csv") %>%
  transmute(
    lang1 = toupper(lang1),
    lang2 = toupper(lang2),
    wals_euclidean_dist
  ) %>%
  distinct()

# ASJP distances per language pair: keep the two language codes
# (upper-cased) and the distance, then drop duplicate rows.
# NOTE(review): the path is absolute and specific to one machine.
lang_dists <- read_csv("/Users/mollylewis/Documents/research/Projects/conceptviz/data/supplementary_data/cultural_sim_measures/lang/asjp_dists.csv") %>%
  transmute(
    lang1 = toupper(lang1),
    lang2 = toupper(lang2),
    asjp_dist
  ) %>%
  distinct()

# Add the ASJP and then the WALS distance to every language-pair row.
# Both are left joins on the shared columns, so pairs without a distance
# are kept with NA.
all_value_pairs <- left_join(
  left_join(all_corrs_unique, lang_dists),
  wals
)

# Keep language pairs that have at least one linguistic distance (ASJP or
# WALS) and drop rows where a language is paired with itself.
#
# The row-wise test needs the elementwise `|`; scalar `||` is an error on
# vectors in R >= 4.3 and previously looked only at the first element.
# The WALS check must use the wals_euclidean_dist column: the bare name
# `wals` refers to the whole WALS data frame, not a column of this table.
all_value_pairs_f <- all_value_pairs %>%
  filter(
    !is.na(asjp_dist) | !is.na(wals_euclidean_dist),
    lang1 != lang2
  )

WALS

(grammatical distance)

# Pairwise estimates against WALS distance, with a linear fit, one panel per
# prop_words level.
ggplot(all_value_pairs_f, aes(x = wals_euclidean_dist, y = estimate)) +
  geom_point() +
  geom_smooth(method = "lm") +
  facet_wrap(~prop_words) +
  labs(
    x = "grammatical distance",
    y = "semantic space correlations",
    title = "Language-pairwise distances"
  ) +
  theme_minimal() +
  theme(
    text = element_text(size = 13),
    legend.background = element_rect(fill = "gray90", size = .5)
  )

# Correlation test between WALS distance and the pairwise estimate, run
# separately for each prop_words level and shown as a table.
all_value_pairs_f %>%
  group_by(prop_words) %>%
  do(tidy(cor.test(x = .$wals_euclidean_dist, y = .$estimate))) %>%
  select(-method, -alternative) %>%
  kable()

prop_words	estimate	statistic	p.value	parameter	conf.low	conf.high
0.01	0.0106232	0.1607667	0.8724189	229	-0.1186172	0.1395095
0.02	-0.0800410	-1.2151382	0.2255647	229	-0.2069802	0.0495486
0.03	-0.1451142	-2.2194701	0.0274354	229	-0.2691505	-0.0163427
0.04	-0.1733081	-2.6629243	0.0082962	229	-0.2957694	-0.0452425
0.05	-0.1857603	-2.8608557	0.0046158	229	-0.3074657	-0.0580751
0.1	-0.1905870	-2.9379568	0.0036415	229	-0.3119896	-0.0630607
0.2	-0.1890258	-2.9129938	0.0039341	229	-0.3105269	-0.0614473
0.3	-0.1910124	-2.9447619	0.0035652	229	-0.3123880	-0.0635003
0.4	-0.1918603	-2.9583330	0.0034175	229	-0.3131821	-0.0643770
1	-0.1911828	-2.9474892	0.0035351	229	-0.3125476	-0.0636765

ASJP

(lexical distance)

# Pairwise estimates against ASJP distance, with a linear fit, one panel per
# prop_words level.
# The x axis is labelled "lexical distance": asjp_dist is the lexical
# measure, and "grammatical distance" belongs to the WALS plot.
ggplot(all_value_pairs_f, aes(x = asjp_dist, y = estimate)) +
  facet_wrap(~prop_words) +
  geom_point() +
  geom_smooth(method = "lm") +
  ylab("semantic space correlations") +
  xlab("lexical distance") +
  ggtitle("Language-pairwise distances") +
  theme_minimal() +
  theme(
    text = element_text(size = 13),
    legend.background = element_rect(fill = "gray90", size = .5)
  )

# Correlation test between ASJP distance and the pairwise estimate, run
# separately for each prop_words level and shown as a table.
all_value_pairs_f %>%
  group_by(prop_words) %>%
  do(tidy(cor.test(x = .$asjp_dist, y = .$estimate))) %>%
  select(-method, -alternative) %>%
  kable()

prop_words	estimate	statistic	p.value	parameter	conf.low	conf.high
0.01	-0.2339244	-2.441822	0.0163172	103	-0.4073270	-0.0442428
0.02	-0.4040874	-4.483381	0.0000191	103	-0.5529286	-0.2302554
0.03	-0.4777920	-5.519873	0.0000003	103	-0.6132942	-0.3149706
0.04	-0.5160870	-6.114991	0.0000000	103	-0.6440459	-0.3600350
0.05	-0.5216627	-6.205575	0.0000000	103	-0.6484890	-0.3666575
0.1	-0.5099233	-6.016085	0.0000000	103	-0.6391240	-0.3527322
0.2	-0.5085667	-5.994487	0.0000000	103	-0.6380394	-0.3511275
0.3	-0.5089598	-6.000738	0.0000000	103	-0.6383537	-0.3515923
0.4	-0.5092203	-6.004886	0.0000000	103	-0.6385620	-0.3519005
1	-0.5089941	-6.001285	0.0000000	103	-0.6383812	-0.3516330

Excluding outliers

# Same ASJP plot, restricted to language pairs with asjp_dist above .7.
ggplot(
  filter(all_value_pairs_f, asjp_dist > .7),
  aes(x = asjp_dist, y = estimate)
) +
  geom_point() +
  geom_smooth(method = "lm") +
  facet_wrap(~prop_words) +
  labs(
    x = "lexical distance",
    y = "semantic space correlations",
    title = "Language-pairwise distances"
  ) +
  theme_minimal() +
  theme(
    text = element_text(size = 13),
    legend.background = element_rect(fill = "gray90", size = .5)
  )

# Per-level correlation test between ASJP distance and the pairwise
# estimate, using only language pairs with asjp_dist above .7.
all_value_pairs_f %>%
  filter(asjp_dist > .7) %>%
  group_by(prop_words) %>%
  do(tidy(cor.test(x = .$asjp_dist, y = .$estimate))) %>%
  select(-method, -alternative) %>%
  kable()

prop_words	estimate	statistic	p.value	parameter	conf.low	conf.high
0.01	0.0600458	0.589390	0.5569842	96	-0.1400435	0.2554234
0.02	-0.1204153	-1.188472	0.2375791	96	-0.3113960	0.0799148
0.03	-0.2516765	-2.547931	0.0124235	96	-0.4286895	-0.0560550
0.04	-0.3235129	-3.349912	0.0011566	96	-0.4904509	-0.1336729
0.05	-0.3370854	-3.508061	0.0006886	96	-0.5019344	-0.1486040
0.1	-0.3538034	-3.706273	0.0003518	96	-0.5159999	-0.1671144
0.2	-0.3516228	-3.680196	0.0003848	96	-0.5141703	-0.1646925
0.3	-0.3548796	-3.719169	0.0003364	96	-0.5169024	-0.1683105
0.4	-0.3548765	-3.719132	0.0003365	96	-0.5168999	-0.1683071
1	-0.3537389	-3.705501	0.0003527	96	-0.5159459	-0.1670427

L2ETS Study 2 analyses - xling correlations of pairwise distances

low scoring essay models, parameterized

Molly Lewis

2017-12-05

Mean pairwise correlations

Dendrograms

Correlations with linguistic distances

WALS

ASJP