# Directory whose files are read as the per-language norm tables.
TIDY_PATH <- here("analyses/11_xling_concreteness_norms/data/tidy/")
# CSV with the English concreteness norms (Brysbaert database, per the path).
BRYSBAERT_ENG_CONC <- "/Users/mollylewis/Documents/research/Projects/2_published/ref_complex/corpus/brysbaert_database/brysbaert_corpus.csv"

# English norms: keep only the word and its concreteness mean / SD, renaming
# the three columns in the same step so they cannot clash with the
# per-language columns later on.
eng_norms <- BRYSBAERT_ENG_CONC %>%
  read.csv() %>%
  as_tibble() %>%
  clean_names() %>%
  select(english_word = word, eng_conc_mean = conc_m, eng_conc_sd = conc_sd)
# Read every file found in TIDY_PATH and stack the rows into one data frame.
all_norms <- map_df(list.files(TIDY_PATH, full.names = TRUE), read_csv)

# Attach the English norms to each row.
# `word()` is presumably stringr's, which by default keeps only the first
# word of the string -- TODO confirm that is the intended normalisation.
all_norms_tidy <- all_norms %>%
  mutate(english_word = word(english_word)) %>%
  # Name the join key explicitly so the join cannot silently pick up any
  # other column the two tables happen to share.
  left_join(eng_norms, by = "english_word") %>%
  filter(!is.na(eng_conc_mean)) # Remove words we don't have english norms for.
# Density of the concreteness means, one panel per language. Fill is mapped
# to language, and the legend is hidden because the facet strips already
# label each panel.
ggplot(all_norms_tidy, aes(x = conc_mean, fill = lang)) +
  geom_density() +
  facet_wrap(~lang) +
  theme_classic() +
  theme(legend.position = "none")
Number of words per language for which we also have English concreteness norms.
# Row count per language, rendered as a table.
kable(count(all_norms_tidy, lang))
| lang | n |
|---|---|
| croatian | 2930 |
| french | 1557 |
| indonesian | 1054 |
| polish | 4583 |
| portuguese_br | 79 |
| portuguese_euro | 3620 |
| spanish | 1355 |
# Per-language correlation between the language's concreteness means and the
# English means (cor.test defaults to Pearson), tidied into one row per
# language, plus the number of word pairs the test was run on.
corr_values <- all_norms_tidy %>%
  group_by(lang) %>%
  nest() %>%
  mutate(
    corr = map(data, ~ tidy(cor.test(.x$conc_mean, .x$eng_conc_mean))),
    # map_int() yields a plain integer column, so `n` needs no unnesting.
    n = map_int(data, nrow)
  ) %>%
  select(-data) %>%
  # Name the list-column explicitly; a bare unnest() is deprecated.
  unnest(cols = c(corr))
# Unweighted mean of the per-language correlation estimates. Grouping is
# dropped first so the result is a single row.
xling_mean <- summarize(ungroup(corr_values), mean = mean(estimate))
# Correlation estimate with its confidence interval for each language.
# The dashed red line marks the cross-language mean held in `xling_mean`.
corr_values %>%
  ggplot(aes(y = estimate, x = lang)) +
  # The mean is a single constant, not a per-row variable, so it is passed as
  # a fixed parameter rather than mapped through aes(); one line is drawn
  # instead of one identical line per data row.
  geom_hline(yintercept = xling_mean$mean, color = "red", linetype = 2) +
  geom_pointrange(aes(ymin = conf.low, ymax = conf.high)) +
  ylim(0, 1) +
  coord_flip() +
  theme_classic() +
  xlab("Language") +
  ylab("Pearson's r") +
  ggtitle("Native language vs. English concreteness norms")