Read in complexity norms.

# Load the complexity norms, keeping the mean rating (exposed as
# `complexity`), the word length, and the word itself, in that column order.
complexity <- read.csv("data/cdi_complexity_norms.csv") %>%
  select(complexity = mean, wordLength, word)

Complexity bias (CB) for all CDI words

# Word length (characters) against mean complexity rating for every word in
# the norms; each word is drawn as a jittered label with a linear fit on top.
ggplot(complexity, aes(x = complexity, y = wordLength)) +
  geom_label(aes(label = word), position = "jitter") +
  # geom_point() +
  geom_smooth(method = "lm") +
  xlim(1, 7) +
  labs(x = "Mean complexity rating", y = "word length (char)") +
  theme_bw(base_size = 18)

# Correlation test between word length and mean complexity rating, tidied
# into a one-row data frame.
cor.test(complexity$wordLength, complexity$complexity) %>%
  tidy()
##    estimate statistic p.value parameter  conf.low conf.high
## 1 0.3482136  9.279105       0       624 0.2774203  0.415246

Read in aoa and other predictors.

# Word list: keep only the two columns used as join keys below
# (`cdi_words` matches `uni_lemma`; `senseless_cdi_words` matches the norms).
words <- select(
  read.csv("../list_maker/RC43_words.csv"),
  cdi_words, senseless_cdi_words
)

# AoA predictors with the `senseless_cdi_words` key attached, restricted to
# the English rows.
aoa_preds <- read.csv("data/all_aoa_pred.csv") %>%
  left_join(words, by = c("uni_lemma" = "cdi_words")) %>%
  filter(language == "English")

# "comb" and "dress" do not merge (not sure why). Is "in" missing?
# To inspect the unmatched rows:
# filter(aoa_preds, is.na(senseless_cdi_words) & language == "English")

# English AoA predictors joined with the complexity norms. Rows without a
# complexity rating and items containing a space are dropped, and
# `wordLength` is recomputed as the character count of `senseless_cdi_words`.
# NOTE: later chunks pick columns of `d` by position, so the column order
# produced here must stay as it is.
d <- aoa_preds %>%
  left_join(complexity, by = c("senseless_cdi_words" = "word")) %>%
  filter(language == "English", !is.na(complexity)) %>%
  select(-word) %>%
  mutate(
    two_words = grepl(" ", senseless_cdi_words),
    wordLength = nchar(senseless_cdi_words)
  ) %>%
  filter(!two_words)

# For some reason the data from Mika only includes AoAs for 391 words.

CB for Mika’s AOA words

# Per-word summary of `complexity` (via multi_boot_standard, which supplies
# the `mean` column plotted below), with the per-word mean phoneme count
# joined on. The join key is left implicit: the only shared column is
# `senseless_cdi_words`.
cdi.means <- d %>%
  group_by(senseless_cdi_words) %>%
  multi_boot_standard(column = "complexity") %>%
  mutate(
    two_words = grepl(" ", senseless_cdi_words),
    wordLength = nchar(senseless_cdi_words)
  ) %>%
  left_join(
    d %>%
      group_by(senseless_cdi_words) %>%
      summarise(num_phonemes = mean(num_phonemes))
  ) %>%
  # drop two-word items (e.g. "washing machine")
  filter(!two_words)

# Number of phonemes against mean complexity rating for the words that have
# AoA data; each word is drawn as a jittered label with a linear fit on top.
# The y axis shows `num_phonemes`, so it is labelled in phonemes (the label
# previously read "word length (char)", carried over from the character plot).
ggplot(cdi.means, aes(x = mean, y = num_phonemes)) +
  geom_label(aes(label = senseless_cdi_words), position = "jitter") +
  # geom_point() +
  geom_smooth(method = "lm") +
  xlim(1, 7) +
  ylab("word length (phonemes)") +
  xlab("Mean complexity rating") +
  theme_bw(base_size = 18)

Complexity and other measures

# Reshape `d` to long form: one row per word and predictor, with the
# predictor name in `measure_pred` and its value in `value`.
# The predictors are chosen by column position (9-18 and 21), so this
# depends on the exact column order of `d`.
# NOTE(review): presumably these are the 11 measures listed in the
# correlation table below — confirm against names(d).
complexity.correlates <- gather(d, "measure_pred", "value", c(9:18, 21))

# Each predictor against complexity, one panel per predictor with its own
# y scale, and a single linear fit per panel.
ggplot(data = complexity.correlates, mapping = aes(y = value, x = complexity)) +
  geom_point() +
  geom_smooth(method = "lm") +
  facet_wrap(~ measure_pred, scales = "free_y")

# Same panels as above, but with a separate linear fit (no confidence band)
# for each lexical category.
# `se = FALSE` is spelled out: `F` is an ordinary variable that can be
# reassigned, whereas `FALSE` is a reserved constant.
ggplot(complexity.correlates, aes(x = complexity, y = value)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, aes(color = lexical_category)) +
  facet_wrap(~measure_pred, scales = "free_y")

# As above, with the fits split by lexical class instead.
ggplot(complexity.correlates, aes(x = complexity, y = value)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, aes(color = lexical_class)) +
  facet_wrap(~measure_pred, scales = "free_y")

Complexity is reliably correlated with arousal, AoA, and log frequency (as well as with all three length measures).

# For each predictor: its correlation with complexity (`r`), the p-value of
# that correlation (`p`), and a star when p < .05 (`sig`).
complexity.correlates %>%
  group_by(measure_pred) %>%
  summarise(
    r = tidy(cor.test(value, complexity))[["estimate"]],
    p = tidy(cor.test(value, complexity))[["p.value"]],
    sig = ifelse(p < .05, "*", "")
  )
## Source: local data frame [11 x 4]
## 
##     measure_pred           r            p   sig
##            (chr)       (dbl)        (dbl) (chr)
## 1            aoa  0.16224113 1.985427e-03     *
## 2        arousal  0.21082074 1.225547e-04     *
## 3       babiness  0.01194425 8.362321e-01      
## 4   concreteness -0.02543910 6.314232e-01      
## 5      dominance -0.01430071 7.966966e-01      
## 6      iconicity -0.07758407 1.787209e-01      
## 7  log.frequency -0.13080431 1.405175e-02     *
## 8   num_phonemes  0.37252412 2.509104e-13     *
## 9  num_syllables  0.32608022 2.175871e-10     *
## 10       valence  0.05115601 3.564645e-01      
## 11    wordLength  0.38870510 1.820766e-14     *

Does complexity predict AoA, controlling for the other predictors?

Yes.

# Linear regression of AoA on complexity plus the control predictors.
lm(
  aoa ~ complexity + wordLength + babiness + concreteness + log.frequency + arousal,
  data = d
) %>%
  summary()
## 
## Call:
## lm(formula = aoa ~ complexity + wordLength + babiness + concreteness + 
##     log.frequency + arousal, data = d)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.8862 -1.7444 -0.0944  1.2752 10.3457 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    9.86878    1.90004   5.194 4.16e-07 ***
## complexity     0.51545    0.25585   2.015   0.0450 *  
## wordLength    -0.14775    0.14492  -1.020   0.3089    
## babiness      -0.40594    0.08509  -4.771 3.06e-06 ***
## concreteness  -1.00644    0.24413  -4.123 5.04e-05 ***
## log.frequency -1.18837    0.17819  -6.669 1.54e-10 ***
## arousal        0.42609    0.20371   2.092   0.0374 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.767 on 260 degrees of freedom
##   (94 observations deleted due to missingness)
## Multiple R-squared:  0.263,  Adjusted R-squared:  0.246 
## F-statistic: 15.47 on 6 and 260 DF,  p-value: 3.658e-15

Does the CB of a word predict AOA?

No evidence for this. Perhaps words with smaller residuals have an earlier AoA, and words with larger residuals a later AoA?

# Model summaries whose residuals are attached to `d` below:
#   cb.mod  - number of phonemes predicted from complexity
#   aoa.mod - AoA predicted from word length and log frequency
cb.mod <- summary(lm(num_phonemes ~ complexity, data = d))
aoa.mod <- summary(lm(aoa ~ wordLength + log.frequency, data = d))

# Residuals as one-column data frames, with the row names copied into a
# `rownames` column to serve as the join key. The `as.data.frame(resid(...))`
# form is kept on purpose: it names the column `resid(aoa.mod)` /
# `resid(cb.mod)`, which the rename() below relies on.
aoa.resids <- as.data.frame(resid(aoa.mod)) %>%
  mutate(rownames = rownames(.))

cb.resids <- as.data.frame(resid(cb.mod)) %>%
  mutate(rownames = rownames(.))

# Attach both sets of residuals to `d`; the joins match on the shared
# `rownames` column.
d.resid <- d %>%
  mutate(rownames = rownames(.)) %>%
  left_join(aoa.resids) %>%
  left_join(cb.resids) %>%
  rename(
    aoa.resid = `resid(aoa.mod)`,
    cb.resid = `resid(cb.mod)`
  )

# AoA against the absolute complexity-bias residual, keeping rows whose
# (signed) residual is below 4.
d.resid %>%
  filter(cb.resid < 4) %>%
  ggplot(aes(x = abs(cb.resid), y = aoa)) +
  geom_label(aes(label = senseless_cdi_words), position = "jitter") +
  # geom_point() +
  geom_smooth(method = "lm") +
  theme_bw(base_size = 18)

# AoA against the signed complexity-bias residual, keeping rows whose
# residual is below 6.
d.resid %>%
  filter(cb.resid < 6) %>%
  ggplot(aes(x = cb.resid, y = aoa)) +
  geom_label(aes(label = senseless_cdi_words), position = "jitter") +
  # geom_point() +
  geom_smooth(method = "lm") +
  theme_bw(base_size = 18)

CB across development

Does the complexity bias in the “lexicon” of the child change across development? Yes!

AOA as WG comprehension [Mika data]

# Words with 2 < AoA < 26, binned into AoA slices of width 4, then reshaped
# so that the columns at positions 17, 18 and 21 of the binned frame become
# rows (`length_metric` names the column, `length` holds its value).
# NOTE(review): presumably these three columns are the length measures
# (syllables, phonemes, characters) — confirm against names(d).
d.devo <- d %>%
  filter(aoa > 2, aoa < 26) %>%
  mutate(aoa.cut = cut_width(aoa, width = 4)) %>%
  gather("length_metric", "length", c(17, 18, 21))

# Length against complexity, one panel per AoA slice, with a separate linear
# fit for each length metric.
ggplot(d.devo, aes(x = complexity, y = length)) +
  geom_point() +
  facet_grid(~ aoa.cut) +
  geom_smooth(aes(color = length_metric), method = "lm") +
  labs(x = "Mean complexity rating", y = "Length", title = "Age") +
  theme_bw(base_size = 18)

(Note: I also have AoA as WG comprehension from Mike, but it differs from Mika’s data – check this.)

AOA as WG production [from Mike]

Read in aoa and other predictors.

# WG production AoAs with the `senseless_cdi_words` key attached.
aoa_preds.wg <- read.csv("data/eng_wg_production_aoas.csv") %>%
  left_join(words, by = c("uni_lemma" = "cdi_words"))

# Join the complexity norms, drop unrated and multi-word items, and recompute
# `wordLength` as the character count of `senseless_cdi_words`.
d.wg <- aoa_preds.wg %>%
  left_join(complexity, by = c("senseless_cdi_words" = "word")) %>%
  filter(!is.na(complexity)) %>%
  mutate(
    two_words = grepl(" ", senseless_cdi_words),
    wordLength = nchar(senseless_cdi_words)
  ) %>%
  filter(!two_words)

# Words with 10 < AoA < 26, binned into AoA slices of width 4.
d.devo.wg <- d.wg %>%
  filter(aoa > 10, aoa < 26) %>%
  mutate(aoa.cut = cut_width(aoa, width = 4))

# Word length against complexity for the WG production data, one panel per
# AoA slice, with a linear fit in each.
ggplot(d.devo.wg, aes(x = complexity, y = wordLength)) +
  geom_point() +
  facet_grid(~ aoa.cut) +
  geom_smooth(method = "lm") +
  labs(
    x = "Mean complexity rating",
    y = "word length (number of characters)",
    title = "Age"
  ) +
  theme_bw(base_size = 18)

AOA as WS production [from Mike]

Read in aoa and other predictors.

# WS production AoAs with the `senseless_cdi_words` key attached.
aoa_preds.ws <- read.csv("data/eng_ws_production_aoas.csv") %>%
  left_join(words, by = c("uni_lemma" = "cdi_words"))

# Join the complexity norms, drop unrated and multi-word items, and recompute
# `wordLength` as the character count of `senseless_cdi_words`.
d.ws <- aoa_preds.ws %>%
  left_join(complexity, by = c("senseless_cdi_words" = "word")) %>%
  filter(!is.na(complexity)) %>%
  mutate(
    two_words = grepl(" ", senseless_cdi_words),
    wordLength = nchar(senseless_cdi_words)
  ) %>%
  filter(!two_words)

# Words with AoA above 17, binned into AoA slices of width 3.
d.devo.ws <- d.ws %>%
  filter(aoa > 17) %>%
  mutate(aoa.cut = cut_width(aoa, width = 3))

# Word length against complexity for the WS production data, one panel per
# AoA slice, with a linear fit in each.
ggplot(d.devo.ws, aes(x = complexity, y = wordLength)) +
  geom_point() +
  facet_grid(~ aoa.cut) +
  geom_smooth(method = "lm") +
  labs(
    x = "Mean complexity rating",
    y = "word length (number of characters)",
    title = "Age"
  ) +
  theme_bw(base_size = 18)

Other languages

The word count here is pretty rough: there are issues with multi-word items and with Unicode translations.

CB

# AoA predictors for all languages joined with the complexity norms. Rows
# without a complexity rating are dropped, `wordLength` is the character
# count of `words`, and rows flagged as multi-word are removed.
#
# `two_words` must be computed with the element-wise `|`. The scalar `||`
# only looks at the first element of each side (and is an error for
# length > 1 inputs since R 4.3), so it either flagged every row or none
# instead of flagging rows individually.
#
# NOTE(review): inside mutate(), `words` presumably resolves to a `words`
# column of the CSV rather than to the `words` data frame defined earlier —
# confirm.
aoa_preds.all <- read.csv("data/all_aoa_pred.csv") %>%
  left_join(words, by = c("uni_lemma" = "cdi_words")) %>%
  left_join(complexity, by = c("senseless_cdi_words" = "word")) %>%
  filter(!is.na(complexity)) %>%
  select(-word) %>%
  mutate(
    two_words = grepl(" ", words) | grepl(" ", senseless_cdi_words),
    wordLength = nchar(as.character(words))
  ) %>%
  filter(!two_words)
# Keep only items shorter than 12 characters.
d.devo.all <- filter(aoa_preds.all, wordLength < 12)

# Word length against complexity, one panel (and one colour) per language,
# with a linear fit in each; the legend is hidden because the panels already
# name the language.
ggplot(d.devo.all, aes(x = complexity, y = wordLength)) +
  geom_point(size = .2) +
  facet_grid(~ language) +
  geom_smooth(aes(color = language), method = "lm") +
  labs(
    x = "Mean complexity rating",
    y = "Word length (characters)",
    title = "Complexity Bias"
  ) +
  theme_bw(base_size = 15) +
  theme(legend.position = "none")

Compare to google correlations for each language

# Per-language correlations from the Lewis (2015) data file, with the
# estimate and interval columns given a `.g` suffix so they can sit next to
# the CDI-based estimates after joining.
# NOTE: the file is read from an absolute path on the author's machine.
google.cb <- rename(
  read.csv('/Documents/GRADUATE_SCHOOL/Projects/langLearnVar/data/lewis_2015.csv'),
  corr.g = corr,
  lci.g = lower.ci,
  hci.g = upper.ci
)

# Correlation between word length and complexity within each language
# (language names lower-cased to match the key in `google.cb`), with the
# estimate and interval columns given an `.aoa` suffix.
d.model.fits.lang <- d.devo.all %>%
  mutate(language = tolower(language)) %>%
  group_by(language) %>%
  do(tidy(cor.test(.$wordLength, .$complexity))) %>%
  rename(
    corr.aoa = estimate,
    lci.aoa = conf.low,
    hci.aoa = conf.high
  )

# One row per language with both sets of correlations side by side.
all_xling_corrs <- left_join(d.model.fits.lang, google.cb, by = "language")

# CDI-based correlation against Google-Translate-based correlation, one
# labelled point per language, with error bars from each sample's interval
# and a linear fit across languages.
ggplot(all_xling_corrs, aes(x = corr.g, y = corr.aoa)) +
  geom_point() +
  geom_smooth(method = "lm") +
  geom_errorbar(aes(ymin = lci.aoa, ymax = hci.aoa)) +
  geom_errorbarh(aes(xmin = lci.g, xmax = hci.g)) +
  geom_label(aes(label = language), size = 3) +
  ylim(-.1, .75) +
  xlim(0, .75) +
  theme_bw(base_size = 15) +
  labs(
    title = "Cross-sample correlations of CB",
    x = "Google Translate CB Correlation",
    y = "CDI WG Comprehension CB Correlation"
  )

The bias is overall smaller in the CDI data, but the two samples are reasonably correlated.

CB across development.

# Redefine `d.devo.all` for the developmental analysis: items with
# 10 < AoA < 26 and fewer than 12 characters, binned into AoA slices of
# width 4.
d.devo.all <- aoa_preds.all %>%
  filter(aoa > 10, aoa < 26, wordLength < 12) %>%
  mutate(aoa.cut = cut_width(aoa, width = 4))

# Word length against complexity in a language-by-AoA-slice grid (free
# scales), with a linear fit per panel coloured by language; the legend is
# hidden.
ggplot(d.devo.all, aes(x = complexity, y = wordLength)) +
  geom_point(size = .2) +
  facet_grid(language ~ aoa.cut, scales = "free") +
  geom_smooth(aes(color = language), method = "lm") +
  labs(x = "Mean complexity rating", y = "Length", title = "Age") +
  theme_bw(base_size = 15) +
  theme(legend.position = "none")

Correlation across development.

# Correlation between word length and complexity within each AoA slice and
# language, plus a `sig` label marking whether p < .05.
# `ifelse()` is used because it is vectorised over the `p.value` column; a
# scalar `if` only works while every group happens to contain a single row.
d.model.fits <- d.devo.all %>%
  group_by(aoa.cut, language) %>%
  do(tidy(cor.test(.$wordLength, .$complexity))) %>%
  mutate(sig = ifelse(p.value < .05, "significant", "not significant"))

# Correlation estimate (with its confidence interval) per AoA slice, one
# panel per language; point transparency encodes significance and a
# horizontal line marks r = 0.
ggplot(d.model.fits, aes(x = aoa.cut, y = estimate, color = language)) +
  geom_pointrange(aes(ymin = conf.low, ymax = conf.high, alpha = sig)) +
  geom_line(aes(group = language)) +
  labs(x = "age slice", y = "complexity bias (pearson's r)") +
  facet_grid(~ language) +
  geom_hline(yintercept = 0) +
  theme_bw() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 90, hjust = 1)
  )

Significance is indicated by the alpha of the point.