Plot

# Mean correlation for every corpus x cluster1 x cluster2 cell, pooling the
# rows of `cluster_ets` and `cluster_wiki`.
# summarize() drops only the innermost grouping level, so the result is still
# grouped by corpus and cluster1.
cluster_pair_means <- cluster_ets %>%
  bind_rows(cluster_wiki) %>%
  group_by(corpus, cluster1, cluster2) %>% # aggregate across languages
  # TRUE is spelled out: `T` is an ordinary variable that can be reassigned.
  summarize(cor = mean(cor, na.rm = TRUE))

# Symmetrize the cluster-pair table: append a copy in which cluster1 and
# cluster2 are swapped, label rows with cluster1 == cluster2 as "Local" and
# all other rows as "Global", then drop duplicated rows (swapping a
# within-cluster pair reproduces the original row).
full_cluster_pair_means <- bind_rows(
  cluster_pair_means,
  with(
    cluster_pair_means,
    data.frame(corpus = corpus,
               cluster2 = cluster1,
               cluster1 = cluster2,
               cor = cor)
  )
) %>%
  mutate(same = case_when(cluster1 == cluster2 ~ "Local", TRUE ~ "Global")) %>%
  distinct()

# Per-cluster summary: for each corpus x cluster1 x Local/Global cell, the
# `cor` column is passed to multi_boot_standard(). The plotting code further
# down reads the `mean`, `ci_lower` and `ci_upper` columns of the result.
df <- full_cluster_pair_means %>%
  group_by(corpus, cluster1, same) %>%
  multi_boot_standard(col = "cor") %>%
  ungroup()  %>%
  # Reverse the level order of `same`.
  mutate(same = fct_rev(same))

# Second-level summary: the per-cluster means in `df` are summarized again,
# this time per corpus x Local/Global.
# NOTE(review): `overall_means` is not referenced again in the visible part of
# this file -- confirm whether it is still needed.
overall_means <- df %>%
  group_by(corpus, same) %>%
  multi_boot_standard(col = "mean") 

# Two-row table for the "Wikipedia" corpus, one row per comparison type, with
# an x and a ypos value each.
# NOTE(review): presumably annotation positions for the figure; it is not
# referenced by the visible plotting code.
label_df <- data.frame(
  corpus = rep("Wikipedia", 2),
  same = c("Global", "Local"),
  ypos = c(0.2, 0.4),
  x = rep(10, 2)
)

# One row per corpus x cluster1 with the mean of each comparison type in its
# own column (named after the values of `same`); geom_segment() in the figure
# reads the `Global` and `Local` columns.
df_segment <- spread(
  select(df, corpus, cluster1, same, mean),
  key = same,
  value = mean
)

# Figure: Local vs. Global mean correlation per cluster, one facet per corpus.
# The pdf()/dev.off() calls are commented out, so the plot is drawn on the
# currently active graphics device rather than written to a file.
#pdf("figs/local_global_plot.pdf", width = 10, height  = 4.4)
# Legend title shared by the shape, fill and colour scales.
scale_label <- "Comparison"
ggplot() +
  facet_wrap(~corpus)+
  # Vertical connector between the Global and the Local mean of each cluster.
  geom_segment(data = df_segment, aes(y = Global, yend = Local, x = cluster1, xend = cluster1), 
              linetype = 1, size = .6) +
  # Confidence interval (ci_lower..ci_upper) drawn for the Global rows only.
  geom_linerange(data = filter(df, same == "Global"),
                 aes(x = cluster1, ymin = ci_lower, ymax = ci_upper), color = "#377EB8", size = 1.5) +
  # One point per cluster and comparison type; added after the segment and
  # line-range layers, so it is drawn on top of them.
  geom_point(data = df, size = 5, aes(x = cluster1, y = mean, color = same, shape = same)) +
  ylab("Cross-linguistic\nWord Distance Correlation") +
  # Breaks 1:10 on a continuous scale require `cluster1` to be numeric.
  scale_x_continuous(breaks = 1:10, name = "Cluster") +
  scale_shape_manual(scale_label, values = c(19, 15)) +
  # NOTE(review): no layer above maps `fill`, so this scale appears to have no
  # visible effect.
  scale_fill_manual(scale_label, values = c( "#E41A1C", "#377EB8" )) +
  # Manual colours are assigned in the level order of `same`.
  scale_color_manual(scale_label, values = c("#E41A1C", "#377EB8" )) +
  theme_classic(base_size = 20) +
  theme(axis.line = element_line(size = 1.2),
        axis.ticks = element_line(size = 1),
        legend.text = element_text(size = 8),
        legend.title = element_text(size = 10),
        legend.background = element_rect(linetype = 1, size = 0.5, colour = 1))

#dev.off()

Stats

Summary Stats

# Mean correlation per corpus, language pair and comparison type.
# Every cluster-pair row is tagged "local" when cluster1 == cluster2 and
# "global" otherwise; correlations are then averaged within each
# corpus x lang_pair x local_global cell. The result is a plain data.frame.
cluster_corr <- cluster_ets %>%
  bind_rows(cluster_wiki) %>%
  mutate(
    local_global = case_when(cluster1 == cluster2 ~ "local", TRUE ~ "global"),
    lang_pair = glue("{lang1}_{lang2}")
  ) %>%
  group_by(corpus, lang_pair, local_global) %>%
  # Aggregate across cluster pairs. TRUE is spelled out because `T` is an
  # ordinary variable that can be reassigned.
  summarize(mean_corr = mean(cor, na.rm = TRUE)) %>%
  ungroup() %>%
  as.data.frame()

# Wide per-language-pair tables, one per corpus: the `global` and `local` mean
# correlations sit side by side and `dif` holds local minus global.
cluster_dif_wide_wiki <- mutate(
  spread(
    filter(cluster_corr, corpus == "Wikipedia"),
    key = local_global,
    value = mean_corr
  ),
  dif = local - global
)

cluster_dif_wide_ets <- mutate(
  spread(
    filter(cluster_corr, corpus == "TOEFL"),
    key = local_global,
    value = mean_corr
  ),
  dif = local - global
)

# Mean and SD of the local-minus-global difference per corpus, rounded to
# three decimals.
summary_stats <- bind_rows(cluster_dif_wide_wiki, cluster_dif_wide_ets) %>%
  group_by(corpus) %>%
  summarize(mean_dif = mean(dif), sd_dif = sd(dif)) %>%
  mutate_if(is.numeric, round, 3)

# Report string for the write-up. The four values are handed to glue() as
# named arguments, which glue exposes to the template as temporary variables.
glue(
  "TOEFL: $M$  = {toefl_m}, $SD$ =  {toefl_sd}; Wikipedia: $M$  = {wiki_m}, $SD$  =  {wiki_sd}",
  toefl_m = summary_stats %>% filter(corpus == "TOEFL") %>% pull(mean_dif),
  toefl_sd = summary_stats %>% filter(corpus == "TOEFL") %>% pull(sd_dif),
  wiki_m = summary_stats %>% filter(corpus == "Wikipedia") %>% pull(mean_dif),
  wiki_sd = summary_stats %>% filter(corpus == "Wikipedia") %>% pull(sd_dif)
)

## TOEFL: $M$  = 0.058, $SD$ =  0.008; Wikipedia: $M$  = 0.038, $SD$  =  0.024

Local vs. global stats

Wiki

paired t-test

# Paired t-test of local vs. global mean correlation across Wikipedia language
# pairs, tidied into a one-row data frame; the estimate and the t statistic are
# rounded to two decimals for reporting.
paired_t_wiki <- t.test(
  cluster_dif_wide_wiki$local,
  cluster_dif_wide_wiki$global,
  paired = TRUE # spelled out: `T` is a reassignable variable
) %>%
  tidy() %>%
  mutate_at(vars(estimate, statistic), round, 2)

kable(paired_t_wiki)

estimate	statistic	p.value	parameter	conf.low	conf.high	method	alternative
0.04	38.27	0	594	0.0359844	0.039878	Paired t-test	two.sided

# Report string for the Wikipedia t-test. The p-value text is hard-coded in
# the template rather than taken from `paired_t_wiki$p.value`.
glue(
  "Wiki: $t$({t_df}) = {t_stat}; $p$ $<$ .0001",
  t_df = paired_t_wiki$parameter,
  t_stat = paired_t_wiki$statistic
)

## Wiki: $t$(594) = 38.27; $p$ $<$ .0001

Effect size

# Mean, SD and n of the local and of the global correlations across Wikipedia
# language pairs (one row per comparison type in `name`).
es_data_wiki <- cluster_dif_wide_wiki %>%
  # row_number() is safe on empty input, unlike 1:n() (which yields c(1, 0)).
  mutate(id = row_number()) %>%
  select(id, local, global) %>%
  # Columns are selected by name rather than by position.
  pivot_longer(cols = c(local, global)) %>%
  group_by(name) %>%
  summarize(m = mean(value),
            sd = sd(value),
            n = n())

# Effect size from the two means, SDs and sample sizes (local first, global
# second).
# NOTE(review): local and global values come from the same language pairs;
# confirm that the effect size computed by mes() from two separate group
# summaries is the one intended for this paired design.
wiki_es <- mes(es_data_wiki %>% filter(name == "local") %>% pull(m),
               es_data_wiki %>% filter(name == "global") %>% pull(m),
               es_data_wiki %>% filter(name == "local") %>% pull(sd),
               es_data_wiki %>% filter(name == "global") %>% pull(sd),
               es_data_wiki %>% filter(name == "local") %>% pull(n),
               es_data_wiki %>% filter(name == "global") %>% pull(n),
               verbose = FALSE) # spelled out: `F` is a reassignable variable
glue("Wiki: $d$ = {wiki_es$d} [{wiki_es$l.d}, {wiki_es$u.d}]")

## Wiki: $d$ = 0.31 [0.19, 0.42]

# multi_boot_standard() summary (mean, ci_lower, ci_upper) of the
# per-language-pair correlations for each corpus x comparison type.
# as.factor() orders its levels alphabetically, so after the numeric
# conversion "global" is coded 1 and "local" is coded 2.
pair_means <- cluster_corr %>%
  group_by(corpus, local_global) %>%
  multi_boot_standard(col = "mean_corr") %>%
  ungroup() %>%
  mutate(local_global = as.numeric(as.factor(local_global)))

# Figure: one faint line per language pair connecting its global and local
# mean correlation, with the per-corpus summary overlaid as a point range.
pdf("figs/local_global_plot2.pdf", width = 5, height = 3)
cluster_corr %>%
  # Same coding as in `pair_means`: global = 1, local = 2.
  mutate(local_global = as.numeric(as.factor(local_global))) %>%
  ggplot(aes(x = local_global, y = mean_corr,
             fill = as.factor(local_global))) +
  geom_line(aes(group = lang_pair), alpha = .15, size = .3) +
  #geom_point(size = .2) +
  geom_pointrange(data = pair_means, size = .6,
                  aes(y = mean, ymin = ci_lower, ymax = ci_upper,
                      color = as.factor(local_global))) +
  facet_wrap(~corpus) +
  # Labels follow the numeric coding: break 1 is "global", break 2 is "local".
  # They were previously given as c("Local", "Global"), which labelled each
  # position with the opposite comparison type.
  scale_x_continuous(labels = c("Global", "Local"),
                     breaks = c(1, 2), limits = c(.6, 2.4), expand = c(0, 0)) +
  # Colours in code order: 1 (global) = "#377EB8", 2 (local) = "#E41A1C".
  scale_color_manual(values = c("#377EB8", "#E41A1C")) +
  ylab("Cross-linguistic\nWord Distance Correlation") +
  xlab("") +
  # A single complete theme followed by the legend override; an earlier
  # theme_classic()/theme() pair was fully replaced by this one and is dropped.
  theme_classic(base_size = 14) +
  theme(legend.position = "none")
dev.off()

## quartz_off_screen 
##                 2

Controlling for language distance

# Language-name lookup tables: each path is resolved with here() and read
# right away.
LANG_NAMES1 <- here("analyses/04_predicting_semantic_sim/data/lang_distance_metrics/linguistic/data/iso_to_wals_for_ling_dists.csv")
lang_names1 <- read_csv(file = LANG_NAMES1)

LANG_NAMES2 <- here("analyses/02_concreteness_semantics/data/lang_name_to_wiki_iso.csv")
lang_names2 <- read_csv(file = LANG_NAMES2)

# Map each ETS language code to its Wikipedia language code.
# The join supplies `wiki_language_code` where the language names match; the
# eleven ETS codes in the lookup below get their Wikipedia code set by hand,
# overwriting whatever the join produced. Rows with any other (or a missing)
# ETS code keep the joined value, because the lookup yields NA for them.
all_names <- lang_names1 %>%
  left_join(lang_names2, by = c("lang_name2" = "language_name")) %>%
  mutate(
    wiki_language_code = coalesce(
      unname(c(
        ara = "ar", chi = "zh", guj = "gu", kan = "kn", mal = "ml",
        mar = "mr", pan = "pa", tel = "te", yor = "yo", fas = "fa",
        ibo = "ig"
      )[as.character(ETS_lang_name)]),
      wiki_language_code
    )
  ) %>%
  select(ETS_lang_name, wiki_language_code)

# Pairwise language distances, read from the path resolved by here().
LANGUAGE_DISTANCES <- here("analyses/04_predicting_semantic_sim/data/lang_distance_metrics/linguistic/data/wals_language_distance")
lang_dists <- read_csv(file = LANGUAGE_DISTANCES)

# Re-key the distances by Wikipedia language codes: look up the Wikipedia code
# of each side of the pair (the lookup column is renamed before each join so
# the two sides do not collide), then build `lang_pair` as "<code1>_<code2>".
lang_dists_tidy <- lang_dists %>%
  left_join(
    rename(all_names, wiki_language_code1 = wiki_language_code),
    by = c("lang1_ETS" = "ETS_lang_name")
  ) %>%
  left_join(
    rename(all_names, wiki_language_code2 = wiki_language_code),
    by = c("lang2_ETS" = "ETS_lang_name")
  ) %>%
  mutate(
    lang_pair = paste(wiki_language_code1, wiki_language_code2, sep = "_")
  ) %>%
  select(lang_pair, wals_lang_dist)

# Attach the language distance to each Wikipedia language pair.
# The join key is named explicitly: `lang_pair` is the only column the two
# tables share, and stating it keeps the join from silently changing if either
# table gains another common column.
diff_with_language_distance <- cluster_dif_wide_wiki %>%
  left_join(lang_dists_tidy, by = "lang_pair")

# Pearson correlation between the local-minus-global difference and the
# language distance.
cor.test(diff_with_language_distance$dif,
         diff_with_language_distance$wals_lang_dist)

## 
##  Pearson's product-moment correlation
## 
## data:  diff_with_language_distance$dif and diff_with_language_distance$wals_lang_dist
## t = -0.62684, df = 593, p-value = 0.531
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.10589403  0.05476068
## sample estimates:
##         cor 
## -0.02573282

# Linear model of the local-minus-global difference on language distance
# (Wikipedia); the coefficient table is printed via summary().
summary(
  lm(dif ~ 1 + wals_lang_dist, data = diff_with_language_distance)
)

## 
## Call:
## lm(formula = dif ~ 1 + wals_lang_dist, data = diff_with_language_distance)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.122187 -0.014415  0.002292  0.015858  0.154766 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     4.024e-02  3.817e-03  10.542   <2e-16 ***
## wals_lang_dist -1.770e-05  2.823e-05  -0.627    0.531    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.02419 on 593 degrees of freedom
## Multiple R-squared:  0.0006622,  Adjusted R-squared:  -0.001023 
## F-statistic: 0.3929 on 1 and 593 DF,  p-value: 0.531

TOEFL

paired t-test

# Paired t-test of local vs. global mean correlation across TOEFL language
# pairs, tidied into a one-row data frame; the estimate and the t statistic are
# rounded to two decimals for reporting.
paired_t_toefl <- t.test(
  cluster_dif_wide_ets$local,
  cluster_dif_wide_ets$global,
  paired = TRUE # spelled out: `T` is a reassignable variable
) %>%
  tidy() %>%
  mutate_at(vars(estimate, statistic), round, 2)

kable(paired_t_toefl)

estimate	statistic	p.value	parameter	conf.low	conf.high	method	alternative
0.06	185.97	0	594	0.0577477	0.0589804	Paired t-test	two.sided

# Report string for the TOEFL t-test. The p-value text is hard-coded in the
# template rather than taken from `paired_t_toefl$p.value`.
glue(
  "TOEFL: $t$({t_df}) = {t_stat}; $p$ $<$ .0001",
  t_df = paired_t_toefl$parameter,
  t_stat = paired_t_toefl$statistic
)

## TOEFL: $t$(594) = 185.97; $p$ $<$ .0001

Effect size

# Mean, SD and n of the local and of the global correlations across TOEFL
# language pairs (one row per comparison type in `name`).
es_data_ets <- cluster_dif_wide_ets %>%
  # row_number() is safe on empty input, unlike 1:n() (which yields c(1, 0)).
  mutate(id = row_number()) %>%
  select(id, local, global) %>%
  # Columns are selected by name rather than by position.
  pivot_longer(cols = c(local, global)) %>%
  group_by(name) %>%
  summarize(m = mean(value),
            sd = sd(value),
            n = n())

# Effect size from the two means, SDs and sample sizes (local first, global
# second).
# NOTE(review): local and global values come from the same language pairs;
# confirm that the effect size computed by mes() from two separate group
# summaries is the one intended for this paired design.
toefl_es <- mes(es_data_ets %>% filter(name == "local") %>% pull(m),
                es_data_ets %>% filter(name == "global") %>% pull(m),
                es_data_ets %>% filter(name == "local") %>% pull(sd),
                es_data_ets %>% filter(name == "global") %>% pull(sd),
                es_data_ets %>% filter(name == "local") %>% pull(n),
                es_data_ets %>% filter(name == "global") %>% pull(n),
                verbose = FALSE) # spelled out: `F` is a reassignable variable

# The corpus label was misspelled "TEOFL" in the emitted report line.
glue("TOEFL: $d$ = {toefl_es$d} [{toefl_es$l.d}, {toefl_es$u.d}]")

## TOEFL: $d$ = 2.84 [2.68, 3]

Controlling for language distance

# Key the language distances by upper-cased ETS codes, "<LANG1>_<LANG2>".
lang_dists_tidy_ets <- lang_dists %>%
  mutate(lang_pair = paste0(toupper(lang1_ETS), "_", toupper(lang2_ETS))) %>%
  select(lang_pair, wals_lang_dist)

# Attach the language distance to each TOEFL language pair.
# The join key is named explicitly: `lang_pair` is the only column the two
# tables share, and stating it keeps the join from silently changing if either
# table gains another common column.
diff_with_language_distance_ets <- cluster_dif_wide_ets %>%
  left_join(lang_dists_tidy_ets, by = "lang_pair")

# Pearson correlation between the local-minus-global difference and the
# language distance.
cor.test(diff_with_language_distance_ets$dif,
         diff_with_language_distance_ets$wals_lang_dist)

## 
##  Pearson's product-moment correlation
## 
## data:  diff_with_language_distance_ets$dif and diff_with_language_distance_ets$wals_lang_dist
## t = 0.5831, df = 593, p-value = 0.56
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.05655076  0.10411819
## sample estimates:
##       cor 
## 0.0239383

# Linear model of the local-minus-global difference on language distance
# (TOEFL); the coefficient table is printed via summary().
summary(
  lm(dif ~ 1 + wals_lang_dist, data = diff_with_language_distance_ets)
)

## 
## Call:
## lm(formula = dif ~ 1 + wals_lang_dist, data = diff_with_language_distance_ets)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0214522 -0.0051728 -0.0000825  0.0054592  0.0199183 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    5.768e-02  1.209e-03  47.725   <2e-16 ***
## wals_lang_dist 5.213e-06  8.939e-06   0.583     0.56    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.00766 on 593 degrees of freedom
## Multiple R-squared:  0.000573,   Adjusted R-squared:  -0.001112 
## F-statistic:  0.34 on 1 and 593 DF,  p-value: 0.56

Semantics by clusters correlations

Molly Lewis

2020-05-04

Plot

Stats

Summary Stats

Local vs. global stats

Wiki

TOEFL