Read in complexity norms.

complexity = read.csv("data/cdi_complexity_norms.csv") %>%
  select(mean, wordLength, word) %>%
  rename(complexity = mean) 

## Complexity bias (CB) for all CDI words

ggplot(complexity, aes(y = wordLength, x = complexity)) +
  geom_label(aes(label = word), position = "jitter") +
  #geom_point() +
  geom_smooth(method = "lm") +
  xlim(1,7) +
  ylab("word length (char)") +
  xlab("Mean complexity rating") +
  theme_bw(base_size = 18)

tidy(cor.test(complexity$wordLength, complexity$complexity)) %>%
 select(estimate, statistic, p.value) %>% 
  kable()
| estimate  | statistic | p.value |
|----------:|----------:|--------:|
| 0.3482136 |  9.279105 |       0 |

## CB for Mika's AoA words
cdi.means = d %>%
  group_by(senseless_cdi_words) %>%
  multi_boot_standard(column = "complexity") %>%
  mutate(two_words = grepl(" ", senseless_cdi_words),
         wordLength = nchar(senseless_cdi_words)) %>%
  left_join(d %>%
              group_by(senseless_cdi_words) %>%
              summarise(num_phonemes = mean(num_phonemes))) %>%
  filter(!two_words) # drop two-word items (e.g., "washing machine")

ggplot(cdi.means, aes(y = num_phonemes, x = mean)) +
  geom_label(aes(label = senseless_cdi_words), position = "jitter") +
  #geom_point() +
  geom_smooth(method = "lm") +
  xlim(1,7) +
  ylab("word length (char)") +
  xlab("Mean complexity rating") +
  theme_bw(base_size = 18)

## Complexity and other measures

Read in AoA estimates and other predictors.

words = read.csv("RC43_words.csv") %>%
  select(cdi_words, senseless_cdi_words)

aoa_preds = read.csv("data/all_aoa_pred.csv") %>%
  left_join(words, by = c("uni_lemma" = "cdi_words")) %>%
  filter(language == "English")

# "comb" and "dress" did not merge; not sure why. Is "in" missing?
# filter(aoa_preds, is.na(senseless_cdi_words) & language == "English")

d = aoa_preds %>%
  left_join(complexity, by = c("senseless_cdi_words" = "word")) %>%
  filter(language == "English") %>%
  filter(!is.na(complexity)) %>%
  select(-word) %>%
  mutate(two_words = grepl(" ", senseless_cdi_words),
         wordLength = nchar(senseless_cdi_words)) %>%
  filter(!two_words)

# Note: Mika's data only includes AoAs for 391 words.
complexity.correlates = d %>%
  gather("measure_pred", "value", c(9:18, 21)) # predictor columns, selected by position

ggplot(complexity.correlates, aes(x = complexity, y=value)) +
  geom_point() +
  geom_smooth(method = "lm") +
  facet_wrap(~measure_pred, scales = "free_y")

ggplot(complexity.correlates, aes(x = complexity, y=value)) +
  geom_point() +
  geom_smooth(method = "lm", se = F, aes(color = lexical_category)) +
  facet_wrap(~measure_pred, scales = "free_y")

Complexity is reliably correlated with arousal and AoA, as well as with log frequency and the length measures (number of phonemes, number of syllables, and character length).

complexity.correlates %>%
  group_by(measure_pred) %>%
  summarise(r = tidy(cor.test(value, complexity))$estimate,
            p = tidy(cor.test(value, complexity))$p.value,
            sig = ifelse(p<.05, "*", "")) %>%
  kable()
| measure_pred  |          r |         p | sig |
|:--------------|-----------:|----------:|:----|
| aoa           |  0.1622411 | 0.0019854 | *   |
| arousal       |  0.2108207 | 0.0001226 | *   |
| babiness      |  0.0119442 | 0.8362321 |     |
| concreteness  | -0.0254391 | 0.6314232 |     |
| dominance     | -0.0143007 | 0.7966966 |     |
| iconicity     | -0.0775841 | 0.1787209 |     |
| log.frequency | -0.1308043 | 0.0140517 | *   |
| num_phonemes  |  0.3725241 | 0.0000000 | *   |
| num_syllables |  0.3260802 | 0.0000000 | *   |
| valence       |  0.0511560 | 0.3564645 |     |
| wordLength    |  0.3887051 | 0.0000000 | *   |

Does complexity predict AoA, controlling for the other predictors? Yes.

tidy(lm(aoa ~ complexity + wordLength + babiness + concreteness + log.frequency + arousal, d)) %>%
  kable()
| term          |   estimate | std.error | statistic |   p.value |
|:--------------|-----------:|----------:|----------:|----------:|
| (Intercept)   |  9.8687783 | 1.9000432 |  5.193976 | 0.0000004 |
| complexity    |  0.5154499 | 0.2558496 |  2.014660 | 0.0449699 |
| wordLength    | -0.1477522 | 0.1449219 | -1.019530 | 0.3088993 |
| babiness      | -0.4059421 | 0.0850850 | -4.771018 | 0.0000031 |
| concreteness  | -1.0064352 | 0.2441284 | -4.122565 | 0.0000504 |
| log.frequency | -1.1883704 | 0.1781925 | -6.669025 | 0.0000000 |
| arousal       |  0.4260905 | 0.2037121 |  2.091630 | 0.0374420 |

## CB across development

Does the complexity bias in the “lexicon” of the child change across development? Yes.

# Mika's AoA data
d.devo = d %>%
  filter(aoa > 2 & aoa < 26) %>%
  mutate(aoa.cut = cut_width(aoa, width = 4)) %>%
  gather("length_metric", "length", c(17, 18, 21)) # the length columns, selected by position

ggplot(d.devo, aes(y = length, x = complexity)) +
  geom_point() +
  facet_grid(~aoa.cut) +
  geom_smooth(method = "lm", aes(color = length_metric)) +
  ylab("Length") +
  xlab("Mean complexity rating") +
  ggtitle("Age") +
  theme_bw(base_size = 18)

ggplot(d.devo, aes(x = length, color = aoa.cut)) +
  geom_density() +
  #facet_grid(~aoa.cut) +
  ylab("Length") +
  ggtitle("Age") +
  theme_bw(base_size = 18)

d.devo %>%
  group_by(aoa.cut) %>%
  summarise(mean = mean(length),
            sd = sd(length)) %>%
  gather(measure,value,2:3) %>%
  ggplot(aes(x = aoa.cut, y = value, group = measure, color  = measure)) +
  geom_point() + 
  geom_line() +
  theme_bw()

Complexity bias decreases across development. Is this because variability in word length decreases?
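
One way to probe this is to compute the complexity-length correlation and the SD of length within each AoA bin. A minimal sketch (not part of the original analysis), using the d.devo data frame built above:

d.devo %>%
  group_by(aoa.cut, length_metric) %>%
  summarise(r = tidy(cor.test(length, complexity))$estimate, # CB within the bin
            sd_length = sd(length)) %>%                      # variability within the bin
  kable()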

The above isn’t really the vocabulary; it’s the set of new words learned at each stage. Now let’s look at the cumulative vocabulary.

c1 =  filter(d.devo, aoa.cut == "[6,10]") 
c2 = filter(d.devo, aoa.cut == "(10,14]") 
c3 = filter(d.devo, aoa.cut == "(14,18]") 
c4 = filter(d.devo, aoa.cut == "(18,22]") 
c5 = filter(d.devo, aoa.cut == "(22,26]") 

cc1 = mutate(c1, age = 1)
cc2 = rbind(c1, c2) %>% mutate(age = 2)
cc3 = rbind(c1, c2, c3) %>% mutate(age = 3)
cc4 = rbind(c1, c2, c3, c4) %>% mutate(age = 4)
cc5 = rbind(c1, c2, c3, c4, c5) %>% mutate(age = 5)

d.devo.cumulative = rbind(cc1, cc2, cc3, cc4, cc5)
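
The repeated filter/rbind above works; an equivalent, more compact construction is sketched below (not part of the original analysis; d.devo.cumulative.alt is a hypothetical name, and purrr is assumed to be available):

bins = c("[6,10]", "(10,14]", "(14,18]", "(18,22]", "(22,26]")

# For each age index i, keep all words from bins 1..i, i.e., the cumulative vocabulary.
d.devo.cumulative.alt = purrr::map_dfr(seq_along(bins), function(i) {
  d.devo %>%
    filter(aoa.cut %in% bins[1:i]) %>%
    mutate(age = i)
})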

ggplot(d.devo.cumulative, aes(y = length, x = complexity)) +
  geom_point() +
  facet_grid(~age) +
  geom_smooth(method = "lm", aes(color = length_metric)) +
  ylab("Length") +
  xlab("Mean complexity rating") +
  ggtitle("Age") +
  theme_bw(base_size = 18)

d.devo.cumulative %>%
  group_by(age) %>%
  summarise(mean = mean(length),
            sd = sd(length)) %>%
  gather(measure,value,2:3) %>%
  ggplot(aes(x = age, y = value, group = measure, color  = measure)) +
  geom_point() + 
  geom_line() +
  theme_bw()

The decrease in SD is smaller in the cumulative vocabulary, but still present.
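
To put numbers on the cumulative plots, a minimal sketch (not part of the original analysis) computes the complexity-length correlation within each cumulative age bin, by length metric:

d.devo.cumulative %>%
  group_by(age, length_metric) %>%
  summarise(r = tidy(cor.test(length, complexity))$estimate,
            p = tidy(cor.test(length, complexity))$p.value) %>%
  kable()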

## Compare to Google correlations for each language

aoa_preds.all = read.csv("data/all_aoa_pred.csv") %>%
  left_join(words, by = c("uni_lemma" = "cdi_words")) %>%
  left_join(complexity, by = c("senseless_cdi_words" = "word")) %>%
  filter(!is.na(complexity)) %>%
  select(-word) %>%
  mutate(two_words = grepl(" ", words) | grepl(" ", senseless_cdi_words), # vectorized OR, not ||
         wordLength = nchar(as.character(words))) %>%
  filter(!two_words)

d.devo.all = aoa_preds.all %>%
  filter(wordLength < 12)  

google.cb = read.csv('/Documents/GRADUATE_SCHOOL/Projects/langLearnVar/data/lewis_2015.csv') %>%
  rename(corr.g = corr,
         lci.g = lower.ci,
         hci.g = upper.ci)

d.model.fits.lang = d.devo.all %>%
  mutate(language = tolower(language)) %>%
  group_by(language) %>%
  do(tidy(cor.test(.$wordLength,.$complexity))) %>%
  rename(corr.aoa = estimate,
         lci.aoa = conf.low,
         hci.aoa = conf.high)

all_xling_corrs = left_join(d.model.fits.lang, google.cb, by = "language")

ggplot(all_xling_corrs,aes(x = corr.g,y = corr.aoa)) + 
    geom_point() + 
    geom_smooth(method = "lm") +
    geom_errorbar(aes(ymin = lci.aoa,ymax = hci.aoa)) + 
    geom_errorbarh(aes(xmin = lci.g,xmax = hci.g)) +
    geom_label(aes(label = language), size = 3) +
    ylim(-.1, .75) +
    xlim(0, .75) +
    theme_bw(base_size = 15) +
    ggtitle("Cross-sample correlations of CB") +
    xlab("Google Translate CB Correlation") +
    ylab("CDI WG Comprehension CB Correlation")

#write.csv( d.model.fits.lang, "aoa_corrs.csv")

tidy(cor.test(all_xling_corrs$corr.g, all_xling_corrs$corr.aoa)) %>%
 select(estimate, statistic, p.value) %>% 
  kable()
| estimate  | statistic |   p.value |
|----------:|----------:|----------:|
| 0.8407045 |   3.47164 | 0.0178179 |

The CB estimates from the two samples are reliably positively correlated.
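
With only a handful of languages, a single point can drive a Pearson correlation; as a hedged robustness sketch (not part of the original analysis), a rank-based version of the same test:

tidy(cor.test(all_xling_corrs$corr.g, all_xling_corrs$corr.aoa,
              method = "spearman")) %>%
  kable()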