Highly arousing words tend to be less frequent in children’s text (TASA); the opposite is true for the adult corpus (SUBTLEXus). Note that these corpora also differ on the written vs. spoken dimension.

# gender ratings: one mean rating per word
RATING_RATINGS_PATH <- here("data/processed/words/gender_ratings_mean.csv")

# Keep only the text before the first space of each `word` entry, then average
# the `mean` column over all rows that collapse onto the same word.
# NOTE(review): the text after the space is presumably a word-sense
# annotation (hence "no_sense") -- confirm against the ratings file.
mean_rating_no_sense <- read_csv(RATING_RATINGS_PATH) %>%
  mutate(word = map_chr(str_split(word, " "), 1)) %>%
  group_by(word) %>%
  summarize(gender = mean(mean))

# TASA frequency norms: read the tab-separated file and standardise the
# column names.
TASA_PATH <- here("data/raw/other_norms/TASA formatted.txt")
tasa_norms <- clean_names(read_tsv(TASA_PATH))

# Keep the word plus the natural log of its `all` column.
# NOTE(review): any zero in `all` would become -Inf here, which drop_na()
# does not remove -- confirm the file contains no zero counts.
tasa_norms_tidy <- tasa_norms %>%
  transmute(word, log_tasa_freq = log(all))

# SUBTLEXus frequency norms: keep the word and its `lg10wf` column, renamed
# to `log_subt_freq`.
# NOTE(review): `lg10wf` is presumably a base-10 log frequency, whereas the
# TASA measure uses the natural log -- confirm before comparing magnitudes.
SUBTLEXUS_PATH <- here("data/raw/other_norms/SUBTLEXus_corpus.txt")
subtlexus_norms <- clean_names(read_tsv(SUBTLEXUS_PATH)) %>%
  select(word, log_subt_freq = lg10wf)

# Valence and arousal norms: read the CSV and standardise the column names.
EMOT_PATH <- here("data/raw/other_norms/BRM-emot-submit.csv")
emot_norms <- clean_names(read_csv(EMOT_PATH))

# Keep the word with its two summary columns under shorter names.
emot_norms_tidy <- emot_norms %>%
  select(word, valence = v_mean_sum, arousal = a_mean_sum)

# Concreteness norms: read the CSV and standardise the column names.
CONC_PATH <- here("data/raw/other_norms/brysbaert_corpus.csv")
conc_norms <- clean_names(read_csv(CONC_PATH))

# Keep the word and its `conc_m` column, renamed to `concreteness`.
conc_norms_tidy <- conc_norms %>%
  select(word, concreteness = conc_m)

# Age-of-acquisition norms: read the CSV and standardise the column names.
AOA_PATH <- here("data/raw/other_norms/AoA_ratings_Kuperman_et_al_BRM.csv")
aoa_norms <- clean_names(read_csv(AOA_PATH))

# Keep the word and its `rating_mean` column, renamed to `adult_aoa`.
aoa_norms_tidy <- aoa_norms %>%
  select(word, adult_aoa = rating_mean)

# Attach every norm table to the gender ratings, matching on `word`. The
# gender ratings come first, so successive left joins keep all of their rows
# and fill norms that have no match with NA.
all_words_with_norms <- reduce(
  list(
    select(mean_rating_no_sense, word, gender),
    tasa_norms_tidy,
    subtlexus_norms,
    emot_norms_tidy,
    conc_norms_tidy,
    aoa_norms_tidy
  ),
  left_join,
  by = "word"
)

# Complete cases only: drop any row with an NA in any column.
all_words_with_norms_no_na <- drop_na(all_words_with_norms)

TASA cumulative frequency

Points

# Arousal against log TASA frequency: one point per word plus a linear fit.
all_words_with_norms_no_na %>%
  ggplot(mapping = aes(x = log_tasa_freq, y = arousal)) +
  geom_point(size = 0.4) +
  geom_smooth(method = "lm") +
  # optional default-smoother overlay: geom_smooth(color = "red")
  theme_classic()

Words

# Same axes as the point version, but each observation is drawn as its word.
all_words_with_norms_no_na %>%
  ggplot(mapping = aes(x = log_tasa_freq, y = arousal)) +
  geom_text(mapping = aes(label = word), size = 1.2) +
  geom_smooth(method = "lm") +
  theme_classic()

SUBTLEXus frequency

Points

# Arousal against log SUBTLEXus frequency: one point per word plus a linear
# fit.
all_words_with_norms_no_na %>%
  ggplot(mapping = aes(x = log_subt_freq, y = arousal)) +
  geom_point(size = 0.2) +
  geom_smooth(method = "lm") +
  # optional default-smoother overlay: geom_smooth(color = "red")
  theme_classic()

Words

# Same axes as the point version, but each observation is drawn as its word.
all_words_with_norms_no_na %>%
  ggplot(mapping = aes(x = log_subt_freq, y = arousal)) +
  geom_text(mapping = aes(label = word), size = 1) +
  geom_smooth(method = "lm") +
  theme_classic()