CORRS_BY_WORD <- here("analyses/thompson_analyses/data/corrs_by_word.csv")
THOMPSON_ALIGN_BY_WORD <- here("analyses/thompson_analyses/data/thompson_words_with_alignments.csv")
BRYSBAERT_PATH <- here("analyses/02_concreteness_semantics/data/brysbaert_corpus.csv")
corrs_by_word <- read_csv(CORRS_BY_WORD)
thompson_alignment_by_word_raw <- read_csv(THOMPSON_ALIGN_BY_WORD)
thompson_alignment_by_word <- thompson_alignment_by_word_raw %>%
group_by(english_word_form) %>%
summarize(alignment = mean(alignment)) %>%
left_join(thompson_alignment_by_word_raw %>% group_by(english_word_form) %>% slice(1) %>% select(english_word_form, semantic_domain))
brysbaert_norms <- read_csv(BRYSBAERT_PATH) %>%
clean_names() %>%
select(word, conc_m)
all_alignments <- corrs_by_word %>%
left_join(thompson_alignment_by_word,
by = c("word" = "english_word_form")) %>%
left_join(brysbaert_norms)
ggplot(all_alignments, aes(x = corr, y = alignment)) +
geom_point() +
geom_smooth(method = "lm") +
theme_classic()
cor.test(all_alignments$corr, all_alignments$alignment)
##
## Pearson's product-moment correlation
##
## data: all_alignments$corr and all_alignments$alignment
## t = 7.8951, df = 595, p-value = 1.403e-14
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2334645 0.3788251
## sample estimates:
## cor
## 0.3079408
#resid(lm(alignment ~ corr, all_alignments)) %>%
# as.data.frame() %>%
# rename("residual" = ".") %>%
# bind_cols(all_alignments$word) %>%
# rename(word = "...2") %>%
# arrange(-residual)
ggplot(all_alignments, aes(x = conc_m, y = alignment)) +
geom_point() +
geom_smooth(method = "lm") +
theme_classic()
cor.test(all_alignments$conc_m, all_alignments$alignment)
##
## Pearson's product-moment correlation
##
## data: all_alignments$conc_m and all_alignments$alignment
## t = 1.2817, df = 592, p-value = 0.2005
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.02796174 0.13249174
## sample estimates:
## cor
## 0.05260452
ggplot(all_alignments, aes(x = conc_m, y = corr)) +
geom_point() +
geom_smooth(method = "lm") +
theme_classic()
ggplot(all_alignments, aes(x = conc_m, y = corr)) +
geom_point() +
facet_wrap(~semantic_domain, scale = "free_y")+
geom_smooth(method = "lm") +
theme_classic()
Thoughts:
CORRS_BY_DOMAIN <- here("analyses/thompson_analyses/data/corrs_local_global.csv")
corrs_by_domain <- read_csv(CORRS_BY_DOMAIN) %>%
rename(lang1 = l1,
lang2 = l2)
overall_corrs <- corrs_by_domain %>%
group_by(local_global) %>%
multi_boot_standard(col = "corr")
ggplot(corrs_by_domain, aes(fill = local_global, x = corr)) +
geom_density() +
theme_classic()
ggplot(overall_corrs, aes(x = local_global,
fill = local_global, y = mean)) +
geom_bar(stat = "identity") +
geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
theme_classic()
conc_in_out_wide <- corrs_by_domain %>%
select(-ci_lower, -ci_upper) %>%
spread(local_global, corr) %>%
mutate(dif = local- global) %>%
select(lang1, lang2, local, global, dif)
lm(dif ~ 1, conc_in_out_wide) %>%
summary()
##
## Call:
## lm(formula = dif ~ 1, data = conc_in_out_wide)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.13321 -0.02189 0.00273 0.02364 0.08618
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.135584 0.001425 95.18 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03475 on 594 degrees of freedom
#tidy() %>%
#kable()
paired_t_ets <- t.test(conc_in_out_wide$local,
conc_in_out_wide$global,
paired = T) %>%
tidy() %>%
mutate_at(vars(estimate, statistic), round, 2)
kable(paired_t_ets)
| estimate | statistic | p.value | parameter | conf.low | conf.high | method | alternative |
|---|---|---|---|---|---|---|---|
| 0.14 | 95.18 | 0 | 594 | 0.1327859 | 0.1383814 | Paired t-test | two.sided |
glue("TOEFL: t({paired_t_ets$parameter}) = {paired_t_ets$statistic}; p < .0001")
## TOEFL: t(594) = 95.18; p < .0001
wilcox_paired_ets <- wilcox.test(conc_in_out_wide$local, conc_in_out_wide$global, paired = TRUE)
wilcox_paired_ets
##
## Wilcoxon signed rank test with continuity correction
##
## data: conc_in_out_wide$local and conc_in_out_wide$global
## V = 177310, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
glue("$W$ = {wilcox_paired_ets$statistic}, $p$ $<$ .0001")
## $W$ = 177310, $p$ $<$ .0001
es_data_ets <- conc_in_out_wide %>%
mutate(id = 1:n()) %>%
select(id, local, global) %>%
pivot_longer(cols = 2:3) %>%
group_by(name) %>%
summarize(m = mean(value),
sd = sd(value),
n= n())
ets_es <- mes(es_data_ets %>% filter(name == "local") %>% pull(m),
es_data_ets %>% filter(name == "global") %>% pull(m),
es_data_ets %>% filter(name == "local") %>% pull(sd),
es_data_ets %>% filter(name == "global") %>% pull(sd),
es_data_ets %>% filter(name == "local") %>% pull(n),
es_data_ets %>% filter(name == "global") %>% pull(n),
verbose = F)
glue("TOEFL: $d$ = {ets_es$d} [{ets_es$l.d}, {ets_es$u.d}]")
## TOEFL: $d$ = 3.44 [3.26, 3.62]
This effect size is bigger than TOEFL with our words (cf. \(d\) = 2.84),
conc_in_out_wide_full <- conc_in_out_wide %>%
select(-local, -global) %>%
bind_rows(conc_in_out_wide %>%
select(-local, -global) %>% rename(lang1 = lang2,
lang2 = lang1))
ggplot(conc_in_out_wide_full, aes(x = dif, fill = lang1)) +
geom_histogram() +
geom_vline(aes(xintercept = 0), linetype = 2) +
facet_wrap(~lang1)