# Locations of the per-cluster pairwise correlation tables, one per corpus.
ETS_CLUSTER_CORR_PATH <- here(
  "analyses/02_concreteness_semantics/data/ets/lang_pairwise_semantics_correlations_ets_by_cluster.csv"
)
WIKI_CLUSTER_CORR_PATH <- here(
  "analyses/02_concreteness_semantics/data/wiki/lang_pairwise_semantics_correlations_wiki_by_cluster.csv"
)

# Both tables are read with the same explicit column names. Because the names
# are supplied, readr treats the first line of each file as data, not a header.
# Each table is tagged with the corpus it came from.
cluster_ets <- mutate(
  read_csv(
    ETS_CLUSTER_CORR_PATH,
    col_names = c("cluster1", "cluster2", "cor", "lang1", "lang2")
  ),
  corpus = "TOEFL"
)

# Only the Wikipedia table is de-duplicated before tagging.
cluster_wiki <- mutate(
  distinct(
    read_csv(
      WIKI_CLUSTER_CORR_PATH,
      col_names = c("cluster1", "cluster2", "cor", "lang1", "lang2")
    )
  ),
  corpus = "Wikipedia"
)
# Mean correlation for every (corpus, cluster1, cluster2) cell, averaged over
# all language pairs in both corpora. NA correlations are ignored.
cluster_pair_means <- cluster_ets %>%
  bind_rows(cluster_wiki) %>%
  group_by(corpus, cluster1, cluster2) %>% # aggregate across languages
  summarize(cor = mean(cor, na.rm = TRUE)) %>%
  # summarize() only peels off the last grouping variable; drop the rest so the
  # (corpus, cluster1) grouping does not leak into the tables built from this.
  ungroup()

# Mirror the table: every pair is added a second time with cluster1 and
# cluster2 swapped, so each cluster appears in the `cluster1` column for all of
# its pairings. `same` marks within-cluster ("Local") versus between-cluster
# ("Global") cells. distinct() removes the within-cluster rows that the swap
# duplicated verbatim.
full_cluster_pair_means <- cluster_pair_means %>%
  bind_rows(data.frame(
    corpus = cluster_pair_means$corpus,
    cluster2 = cluster_pair_means$cluster1,
    cluster1 = cluster_pair_means$cluster2,
    cor = cluster_pair_means$cor
  )) %>%
  mutate(same = case_when(cluster1 == cluster2 ~ "Local", TRUE ~ "Global")) %>%
  distinct()
# Bootstrapped summary of `cor` for each corpus x cluster x comparison type.
df <- full_cluster_pair_means %>%
  group_by(corpus, cluster1, same) %>%
  multi_boot_standard(col = "cor") %>%
  ungroup()
# Reverse the level order of the comparison factor.
df$same <- fct_rev(df$same)

# Bootstrapped summary of the per-cluster means, per corpus x comparison type.
overall_means <- multi_boot_standard(group_by(df, corpus, same), col = "mean")

# Positions for two text labels ("Global" / "Local") in the Wikipedia facet.
label_df <- data.frame(
  corpus = rep("Wikipedia", 2),
  same = c("Global", "Local"),
  ypos = c(.2, .4),
  x = rep(10, 2)
)

# One row per corpus x cluster with the mean spread into one column per level
# of `same`.
df_segment <- spread(
  select(df, corpus, cluster1, same, mean),
  key = same,
  value = mean
)
# Per-cluster plot, one facet per corpus: a segment joins each cluster's Global
# and Local mean, Global means carry a CI bar, and both means are drawn as
# points distinguished by colour and shape.
#pdf("figs/local_global_plot.pdf", width = 10, height = 4.4)
scale_label <- "Comparison"
ggplot() +
  geom_segment(
    aes(x = cluster1, xend = cluster1, y = Global, yend = Local),
    data = df_segment,
    linetype = 1,
    size = .6
  ) +
  geom_linerange(
    aes(x = cluster1, ymin = ci_lower, ymax = ci_upper),
    data = filter(df, same == "Global"),
    color = "#377EB8",
    size = 1.5
  ) +
  geom_point(
    aes(x = cluster1, y = mean, color = same, shape = same),
    data = df,
    size = 5
  ) +
  facet_wrap(~corpus) +
  scale_x_continuous(name = "Cluster", breaks = 1:10) +
  scale_shape_manual(scale_label, values = c(19, 15)) +
  scale_fill_manual(scale_label, values = c("#E41A1C", "#377EB8")) +
  scale_color_manual(scale_label, values = c("#E41A1C", "#377EB8")) +
  ylab("Cross-linguistic\nWord Distance Correlation") +
  theme_classic(base_size = 20) +
  theme(
    axis.line = element_line(size = 1.2),
    axis.ticks = element_line(size = 1),
    legend.text = element_text(size = 8),
    legend.title = element_text(size = 10),
    legend.background = element_rect(linetype = 1, size = 0.5, colour = 1)
  )
#dev.off()
# Mean correlation per corpus x language pair, computed separately for
# within-cluster ("local") and between-cluster ("global") cells.
# NA correlations are ignored. The result is a plain data.frame.
cluster_corr <- cluster_ets %>%
  bind_rows(cluster_wiki) %>%
  mutate(
    local_global = case_when(cluster1 == cluster2 ~ "local", TRUE ~ "global"),
    lang_pair = glue("{lang1}_{lang2}")
  ) %>%
  group_by(corpus, lang_pair, local_global) %>%
  # aggregate across cluster pairs; spell out TRUE because `T` is an ordinary
  # variable that can be reassigned
  summarize(mean_corr = mean(cor, na.rm = TRUE)) %>%
  ungroup() %>%
  as.data.frame()
# Reshape one corpus of `corr` to one row per language pair, with the local and
# global means side by side and `dif` = local minus global.
local_global_dif <- function(corr, corpus_name) {
  corr %>%
    filter(corpus == corpus_name) %>%
    spread(local_global, mean_corr) %>%
    mutate(dif = local - global)
}

cluster_dif_wide_wiki <- local_global_dif(cluster_corr, "Wikipedia")
cluster_dif_wide_ets <- local_global_dif(cluster_corr, "TOEFL")
# Mean and SD of the local-minus-global difference per corpus, with every
# numeric column rounded to three decimals.
summary_stats <- bind_rows(cluster_dif_wide_wiki, cluster_dif_wide_ets) %>%
  group_by(corpus) %>%
  summarize(
    mean_dif = mean(dif),
    sd_dif = sd(dif)
  ) %>%
  mutate_if(is.numeric, round, 3)

# Report both corpora in LaTeX-style markup.
glue("TOEFL: $M$ = {summary_stats %>% filter(corpus == 'TOEFL') %>% pull(mean_dif)}, $SD$ = {summary_stats %>% filter(corpus == 'TOEFL') %>% pull(sd_dif)}; Wikipedia: $M$ = {summary_stats %>% filter(corpus == 'Wikipedia') %>% pull(mean_dif)}, $SD$ = {summary_stats %>% filter(corpus == 'Wikipedia') %>% pull(sd_dif)}")
## TOEFL: $M$ = 0.058, $SD$ = 0.008; Wikipedia: $M$ = 0.038, $SD$ = 0.024
Paired t-test
# Paired t-test of local versus global mean correlation across Wikipedia
# language pairs. The result is tidied to one row, with the estimate and the
# t statistic rounded to two decimals.
paired_t_wiki <- t.test(
  cluster_dif_wide_wiki$local,
  cluster_dif_wide_wiki$global,
  paired = TRUE # spelled out: `T` is a reassignable variable, not a constant
) %>%
  tidy() %>%
  mutate_at(vars(estimate, statistic), round, 2)
kable(paired_t_wiki)
| estimate | statistic | p.value | parameter | conf.low | conf.high | method | alternative |
|---|---|---|---|---|---|---|---|
| 0.04 | 38.27 | 0 | 594 | 0.0359844 | 0.039878 | Paired t-test | two.sided |
# Report the Wikipedia paired t-test in LaTeX-style markup. Only the degrees of
# freedom and the t statistic are interpolated; the "$p$ $<$ .0001" part is
# fixed text and is not derived from paired_t_wiki$p.value.
glue("Wiki: $t$({paired_t_wiki$parameter}) = {paired_t_wiki$statistic}; $p$ $<$ .0001")
## Wiki: $t$(594) = 38.27; $p$ $<$ .0001
Effect size
# Per-condition mean, SD and n of the Wikipedia language-pair correlations:
# one row for "global" and one for "local".
es_data_wiki <- cluster_dif_wide_wiki %>%
  mutate(id = seq_len(n())) %>%
  select(id, local, global) %>%
  # select the value columns by name rather than by position
  pivot_longer(cols = c(local, global)) %>%
  group_by(name) %>%
  summarize(
    m = mean(value),
    sd = sd(value),
    n = n()
  )

# Pull each condition's summary row once instead of re-filtering per argument.
es_local_wiki <- filter(es_data_wiki, name == "local")
es_global_wiki <- filter(es_data_wiki, name == "global")

# Effect size from the two sets of means / SDs / ns (local first, global
# second).
# NOTE(review): mes() receives the two conditions as separate groups, while the
# same columns are also compared with a paired t-test in this file -- confirm
# this is the intended effect-size definition.
wiki_es <- mes(
  es_local_wiki$m, es_global_wiki$m,
  es_local_wiki$sd, es_global_wiki$sd,
  es_local_wiki$n, es_global_wiki$n,
  verbose = FALSE
)
glue("Wiki: $d$ = {wiki_es$d} [{wiki_es$l.d}, {wiki_es$u.d}]")
## Wiki: $d$ = 0.31 [0.19, 0.42]
# Bootstrapped summary of the language-pair mean correlations for each
# corpus x comparison type. `local_global` becomes a numeric x position with
# the level order stated explicitly: global = 1, local = 2.
pair_means <- cluster_corr %>%
  group_by(corpus, local_global) %>%
  multi_boot_standard(col = "mean_corr") %>%
  ungroup() %>%
  mutate(
    local_global = as.numeric(factor(local_global, levels = c("global", "local")))
  )

# One thin line per language pair connecting its global and local mean, with
# the bootstrapped corpus-level mean and CI on top; written to a PDF file.
pdf("figs/local_global_plot2.pdf", width = 5, height = 3)
cluster_corr %>%
  mutate(
    local_global = as.numeric(factor(local_global, levels = c("global", "local")))
  ) %>%
  ggplot(aes(
    x = local_global, y = mean_corr,
    fill = as.factor(local_global)
  )) +
  geom_line(aes(group = lang_pair), alpha = .15, size = .3) +
  #geom_point(size = .2) +
  geom_pointrange(
    data = pair_means, size = .6,
    aes(
      y = mean, ymin = ci_lower, ymax = ci_upper,
      color = as.factor(local_global)
    )
  ) +
  facet_wrap(~corpus) +
  # Break 1 is the "global" level and break 2 the "local" level (see the factor
  # levels above), so the labels must be given in that order. They were
  # previously c("Local", "Global"), which mislabelled both positions.
  scale_x_continuous(
    labels = c("Global", "Local"),
    breaks = c(1, 2), limits = c(.6, 2.4), expand = c(0, 0)
  ) +
  scale_color_manual(values = c("#377EB8", "#E41A1C")) +
  ylab("Cross-linguistic\nWord Distance Correlation") +
  xlab("") +
  # A single complete theme followed by the legend override. The earlier
  # theme_classic() + theme() pair was replaced wholesale by this one anyway.
  theme_classic(base_size = 14) +
  theme(#axis.line = element_line(size = 1.2),
    # axis.ticks.x = element_blank(), ## <- this line
    legend.position = "none")
dev.off()
## quartz_off_screen
## 2
Controlling for language distance
# Two language-name tables, joined below on the language name.
LANG_NAMES1 <- here("analyses/04_predicting_semantic_sim/data/lang_distance_metrics/linguistic/data/iso_to_wals_for_ling_dists.csv")
LANG_NAMES2 <- here("analyses/02_concreteness_semantics/data/lang_name_to_wiki_iso.csv")
lang_names1 <- read_csv(LANG_NAMES1)
lang_names2 <- read_csv(LANG_NAMES2)

# ETS language codes whose Wikipedia code is assigned by hand, overriding
# whatever the join produced for them.
wiki_code_overrides <- c(
  ara = "ar", chi = "zh", guj = "gu", kan = "kn", mal = "ml", mar = "mr",
  pan = "pa", tel = "te", yor = "yo", fas = "fa", ibo = "ig"
)

# Lookup from ETS language code to Wikipedia language code.
all_names <- lang_names1 %>%
  left_join(lang_names2, by = c("lang_name2" = "language_name")) %>%
  mutate(
    # position of each row's ETS code in the override table (NA = no override)
    override_idx = match(ETS_lang_name, names(wiki_code_overrides)),
    wiki_language_code = ifelse(
      is.na(override_idx),
      wiki_language_code,
      unname(wiki_code_overrides)[override_idx]
    )
  ) %>%
  select(ETS_lang_name, wiki_language_code)
# Pairwise language distances, keyed by ETS language codes.
LANGUAGE_DISTANCES <- here("analyses/04_predicting_semantic_sim/data/lang_distance_metrics/linguistic/data/wals_language_distance")
lang_dists <- read_csv(LANGUAGE_DISTANCES)

# Attach the Wikipedia code of each language in the pair, then keep only a
# "<code1>_<code2>" key and the distance.
lang_dists_tidy <- lang_dists %>%
  left_join(
    rename(all_names, wiki_language_code1 = wiki_language_code),
    by = c("lang1_ETS" = "ETS_lang_name")
  ) %>%
  left_join(
    rename(all_names, wiki_language_code2 = wiki_language_code),
    by = c("lang2_ETS" = "ETS_lang_name")
  ) %>%
  transmute(
    lang_pair = paste0(wiki_language_code1, "_", wiki_language_code2),
    wals_lang_dist
  )
# Attach the language distance to each Wikipedia language pair. The join key is
# named explicitly, so the join no longer depends on which column names the two
# tables happen to share.
diff_with_language_distance <- cluster_dif_wide_wiki %>%
  left_join(lang_dists_tidy, by = "lang_pair")

# Correlation between the local-minus-global difference and language distance.
cor.test(diff_with_language_distance$dif,
         diff_with_language_distance$wals_lang_dist)
##
## Pearson's product-moment correlation
##
## data: diff_with_language_distance$dif and diff_with_language_distance$wals_lang_dist
## t = -0.62684, df = 593, p-value = 0.531
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.10589403 0.05476068
## sample estimates:
## cor
## -0.02573282
# Linear regression of the local-minus-global difference on language distance
# for the Wikipedia language pairs; prints the model summary.
summary(
  lm(dif ~ 1 + wals_lang_dist, data = diff_with_language_distance)
)
##
## Call:
## lm(formula = dif ~ 1 + wals_lang_dist, data = diff_with_language_distance)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.122187 -0.014415 0.002292 0.015858 0.154766
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.024e-02 3.817e-03 10.542 <2e-16 ***
## wals_lang_dist -1.770e-05 2.823e-05 -0.627 0.531
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.02419 on 593 degrees of freedom
## Multiple R-squared: 0.0006622, Adjusted R-squared: -0.001023
## F-statistic: 0.3929 on 1 and 593 DF, p-value: 0.531
Paired t-test
# Paired t-test of local versus global mean correlation across TOEFL language
# pairs. The result is tidied to one row, with the estimate and the t statistic
# rounded to two decimals.
paired_t_toefl <- t.test(
  cluster_dif_wide_ets$local,
  cluster_dif_wide_ets$global,
  paired = TRUE # spelled out: `T` is a reassignable variable, not a constant
) %>%
  tidy() %>%
  mutate_at(vars(estimate, statistic), round, 2)
kable(paired_t_toefl)
| estimate | statistic | p.value | parameter | conf.low | conf.high | method | alternative |
|---|---|---|---|---|---|---|---|
| 0.06 | 185.97 | 0 | 594 | 0.0577477 | 0.0589804 | Paired t-test | two.sided |
# Report the TOEFL paired t-test in LaTeX-style markup. Only the degrees of
# freedom and the t statistic are interpolated; the "$p$ $<$ .0001" part is
# fixed text and is not derived from paired_t_toefl$p.value.
glue("TOEFL: $t$({paired_t_toefl$parameter}) = {paired_t_toefl$statistic}; $p$ $<$ .0001")
## TOEFL: $t$(594) = 185.97; $p$ $<$ .0001
Effect size
# Per-condition mean, SD and n of the TOEFL language-pair correlations:
# one row for "global" and one for "local".
es_data_ets <- cluster_dif_wide_ets %>%
  mutate(id = seq_len(n())) %>%
  select(id, local, global) %>%
  # select the value columns by name rather than by position
  pivot_longer(cols = c(local, global)) %>%
  group_by(name) %>%
  summarize(
    m = mean(value),
    sd = sd(value),
    n = n()
  )

# Pull each condition's summary row once instead of re-filtering per argument.
es_local_ets <- filter(es_data_ets, name == "local")
es_global_ets <- filter(es_data_ets, name == "global")

# Effect size from the two sets of means / SDs / ns (local first, global
# second).
# NOTE(review): mes() receives the two conditions as separate groups, while the
# same columns are also compared with a paired t-test in this file -- confirm
# this is the intended effect-size definition.
toefl_es <- mes(
  es_local_ets$m, es_global_ets$m,
  es_local_ets$sd, es_global_ets$sd,
  es_local_ets$n, es_global_ets$n,
  verbose = FALSE
)
# The corpus label previously read "TEOFL".
glue("TOEFL: $d$ = {toefl_es$d} [{toefl_es$l.d}, {toefl_es$u.d}]")
## TOEFL: $d$ = 2.84 [2.68, 3]
Controlling for language distance
# Language distances keyed by an upper-cased "<LANG1>_<LANG2>" pair of ETS
# codes.
lang_dists_tidy_ets <- lang_dists %>%
  mutate(lang_pair = paste0(toupper(lang1_ETS), "_", toupper(lang2_ETS))) %>%
  select(lang_pair, wals_lang_dist)

# Attach the language distance to each TOEFL language pair. The join key is
# named explicitly, so the join no longer depends on which column names the two
# tables happen to share.
diff_with_language_distance_ets <- cluster_dif_wide_ets %>%
  left_join(lang_dists_tidy_ets, by = "lang_pair")

# Correlation between the local-minus-global difference and language distance.
cor.test(diff_with_language_distance_ets$dif,
         diff_with_language_distance_ets$wals_lang_dist)
##
## Pearson's product-moment correlation
##
## data: diff_with_language_distance_ets$dif and diff_with_language_distance_ets$wals_lang_dist
## t = 0.5831, df = 593, p-value = 0.56
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.05655076 0.10411819
## sample estimates:
## cor
## 0.0239383
# Linear regression of the local-minus-global difference on language distance
# for the TOEFL language pairs; prints the model summary.
summary(
  lm(dif ~ 1 + wals_lang_dist, data = diff_with_language_distance_ets)
)
##
## Call:
## lm(formula = dif ~ 1 + wals_lang_dist, data = diff_with_language_distance_ets)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0214522 -0.0051728 -0.0000825 0.0054592 0.0199183
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.768e-02 1.209e-03 47.725 <2e-16 ***
## wals_lang_dist 5.213e-06 8.939e-06 0.583 0.56
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.00766 on 593 degrees of freedom
## Multiple R-squared: 0.000573, Adjusted R-squared: -0.001112
## F-statistic: 0.34 on 1 and 593 DF, p-value: 0.56