# Directory holding the tidied per-language concreteness norm files
# (path built with here()).
TIDY_PATH <- here("analyses/11_xling_concreteness_norms/data/tidy/")
# CSV with the Brysbaert English concreteness norms.
# NOTE(review): absolute, machine-specific path -- this will not resolve on
# another machine; consider moving the file under the project directory.
BRYSBAERT_ENG_CONC <- "/Users/mollylewis/Documents/research/Projects/2_published/ref_complex/corpus/brysbaert_database/brysbaert_corpus.csv"

# English concreteness norms: keep only the word, mean rating and SD,
# renamed so they cannot be confused with the native-language columns.
eng_norms <- BRYSBAERT_ENG_CONC %>%
  read.csv() %>%
  as_tibble() %>%
  clean_names() %>%
  select(
    english_word = word,
    eng_conc_mean = conc_m,
    eng_conc_sd = conc_sd
  )

# Read every file in TIDY_PATH and row-bind them into a single data frame.
# `TRUE` is spelled out because `T` is an ordinary, reassignable variable.
all_norms <- map_df(list.files(TIDY_PATH, full.names = TRUE), read_csv)

# Attach the English norms to each native-language item and keep only the
# items that have an English rating.
all_norms_tidy <- all_norms %>%
  # word() with its defaults keeps the first whitespace-delimited token
  # (assuming this is stringr::word -- TODO confirm), so multi-word
  # translations are matched on their first word only.
  mutate(english_word = word(english_word)) %>%
  # Explicit key: without `by`, the join silently uses every column the two
  # tables happen to share and prints a message on each run.
  left_join(eng_norms, by = "english_word") %>%
  filter(!is.na(eng_conc_mean)) # Remove words we don't have English norms for.
# Distribution of native-language concreteness ratings, one panel per
# language; the fill legend is hidden because the facets already label it.
ggplot(all_norms_tidy, aes(x = conc_mean, fill = lang)) +
  geom_density() +
  facet_wrap(~lang) +
  theme_classic() +
  theme(legend.position = "none")

Number of words per language for which we also have English concreteness norms.

# Row count per language, rendered as a table.
kable(count(all_norms_tidy, lang))
lang n
croatian 2930
french 1557
indonesian 1054
polish 4583
portuguese_br 79
portuguese_euro 3620
spanish 1355
# Per-language Pearson correlation between native-language and English
# concreteness means. The result has one row per language: the tidied
# cor.test() output (estimate, confidence interval, ...) plus `n`, the number
# of items that went into that test.
corr_values <- all_norms_tidy %>%
  group_by(lang) %>%
  nest() %>%
  # Drop the grouping once the data are nested so it does not leak into
  # later steps.
  ungroup() %>%
  mutate(
    corr = map(data, ~ tidy(cor.test(.x$conc_mean, .x$eng_conc_mean))),
    # Plain integer column instead of a list column, so only `corr` needs
    # unnesting.
    n = map_int(data, nrow)
  ) %>%
  select(-data) %>%
  # Name the column to unnest; calling unnest() with no columns is deprecated.
  unnest(corr)
# Unweighted mean of the per-language correlation estimates (one-row tibble
# with a single column, `mean`).
xling_mean <- summarize(ungroup(corr_values), mean = mean(estimate))

# Correlation estimate with its confidence interval for each language; the
# dashed red line marks the mean correlation across languages.
corr_values %>%
  ggplot(aes(x = lang, y = estimate)) +
  # The cross-language mean is a single constant, so it is set as a fixed
  # parameter. Mapping it inside aes() would draw one identical line per row.
  geom_hline(yintercept = xling_mean$mean, color = "red", linetype = 2) +
  geom_pointrange(aes(ymin = conf.low, ymax = conf.high)) +
  ylim(0, 1) +
  coord_flip() +
  theme_classic() +
  xlab("Language") +
  ylab("Pearson's r") +
  ggtitle("Native language vs. English concreteness norms")