Complexity and Age of Acquisition

There’s some evidence that AOA is related to complexity from wordbirth data.

Relatedly, there’s evidence that less arbitrary meanings are learned earlier: Monaghan, P. et al. (2014) How arbitrary is language? Philos. Trans. R. Soc. Lond. B: Biol. Sci. 369, 20130299

So, here we ask: Are more complex meanings learned later?

aoa.wb from: Wordbank repository.

aoa.k from: Kuperman, V., Stadthagen-Gonzalez, H., & Brysbaert, M. (2012). Age-of-acquisition ratings for 30,000 English words. Behavior Research Methods, 1-13. Retrieved from http://dx.doi.org/10.3758/s13428-012-0210-4

Read in data, etc.

# Load the complexity norms, both AOA sources, and the predictor corpora.
complexity <- read.csv("complexityNormsEnglishexp9.csv")
aoa.k <- read.csv("AoA_ratings_Kuperman_et_al_BRM.csv")
# The Wordbank AOA table is fetched from GitHub as text, then parsed as CSV.
x <- getURL("https://raw.githubusercontent.com/mikabr/aoa-prediction/master/aoa_data.csv")
aoa.wb <- read.csv(text = x)
# Keep one row per word. `.keep_all = TRUE` retains the other MRC columns:
# since dplyr 0.5.0 a bare distinct(word) returns only the `word` column.
mrc <- read.csv("MRC_corpus.csv") %>% distinct(word, .keep_all = TRUE)
morph <- read.csv("CELEX2_numMorph.csv")
# Spell out TRUE: `T` is an ordinary variable and can be reassigned.
freq <- read.table("SUBTLEXus_corpus.txt", header = TRUE)

# Wordbank AOAs restricted to the English "understands" measure with aoa > 5;
# the estimate column is renamed to aoa.wb.
aoa.wb.en <- aoa.wb %>%
  filter(
    measure == "understands",
    language == "English",
    aoa > 5
  ) %>%
  rename(aoa.wb = aoa)


# Kuperman et al. ratings with the Rating.Mean column renamed to aoa.k.
aoa.k.en <- rename(aoa.k, aoa.k = Rating.Mean)

# Assemble the word-level analysis table: start from the complexity norms and
# attach the Kuperman, MRC, frequency, and Wordbank columns in turn.
d <- complexity %>%
  # Natural joins: matched on whatever column names the two tables share.
  left_join(aoa.k.en) %>%
  left_join(mrc) %>%
  left_join(freq, by = c("word" = "Word")) %>%
  # Drops columns 1 and 13 by position, so this depends on the join order above.
  select(-1, -13) %>%
  mutate(word = as.factor(word)) %>%
  left_join(aoa.wb.en, by = c("word" = "words"))

Compare two measures of AOA with each other – Wordbank and Kuperman

# Pair each Kuperman rating with the Wordbank AOA for the same word.
aoa_only <- left_join(aoa.k.en, aoa.wb.en, by = c("word" = "words"))

# Scatter of the two AOA measures with a linear fit; the red label is the
# correlation computed over rows where both measures are present.
ggplot(aoa_only, aes(x = aoa.k, y = aoa.wb)) +
  geom_point() +
  geom_smooth(method = "lm") +
  annotate(
    "text",
    x = 2, y = 13, color = "red", size = 10,
    label = paste(
      "r=",
      round(with(aoa_only, cor(aoa.wb, aoa.k, use = "complete.obs")), 2)
    )
  ) +
  labs(y = "wordbank aoa (months)", x = "kuperman aoa") +
  xlim(1, 8) +
  themeML

Compare each measure of AOA with complexity.

Wordbank AOAs

# Complexity against Wordbank AOA with a linear fit; the red label is the
# correlation over rows of d where both values are present.
ggplot(d, aes(x = complexity, y = aoa.wb)) +
  geom_point() +
  geom_smooth(method = "lm") +
  annotate(
    "text",
    x = 2, y = 13, color = "red", size = 10,
    label = paste(
      "r=",
      round(cor(d$complexity, d$aoa.wb, use = "complete.obs"), 2)
    )
  ) +
  labs(y = "wordbank aoa (months)") +
  themeML

Kuperman AOAs

# Complexity against Kuperman AOA with a linear fit; the red label is the
# correlation over rows of d where both values are present.
ggplot(d, aes(x = complexity, y = aoa.k)) +
  geom_point() +
  geom_smooth(method = "lm") +
  annotate(
    "text",
    x = 2, y = 13, color = "red", size = 10,
    label = paste(
      "r=",
      round(cor(d$complexity, d$aoa.k, use = "complete.obs"), 2)
    )
  ) +
  themeML

For only those words in both AOA datasets

There are 52 words in all three data sets.

 # Keep only the rows of d where complexity and both AOA measures are present.
 d2 <- d %>%
   filter(!is.na(aoa.wb), !is.na(complexity), !is.na(aoa.k))
  
  # Wordbank AOA vs. complexity for the words present in all three data sets.
  # The annotated r is computed on d2, the same rows that are plotted. It was
  # previously computed on the full table d, so the label did not describe the
  # points shown.
  ggplot(d2, aes(x = complexity, y = aoa.wb)) +
    geom_point() +
    geom_smooth(method = "lm") +
    annotate(
      "text",
      x = 2, y = 13, color = "red", size = 10,
      label = paste(
        "r=",
        round(cor(d2$complexity, d2$aoa.wb, use = "complete.obs"), 2)
      )
    ) +
    ylab("wordbank aoa (months)") +
    ggtitle("Wordbank") +
    themeML

  # Kuperman AOA vs. complexity for the words present in all three data sets.
  # The annotated r is computed on d2, the same rows that are plotted. It was
  # previously computed on the full table d, so the label did not describe the
  # points shown.
  ggplot(d2, aes(x = complexity, y = aoa.k)) +
    geom_point() +
    geom_smooth(method = "lm") +
    annotate(
      "text",
      x = 2, y = 6, color = "red", size = 10,
      label = paste(
        "r=",
        round(cor(d2$complexity, d2$aoa.k, use = "complete.obs"), 2)
      )
    ) +
    ggtitle("Kuperman") +
    themeML

# Pearson correlation test between Wordbank AOA and complexity within d2.
cor.test(x = d2$aoa.wb, y = d2$complexity, method = "pearson")

## 
##  Pearson's product-moment correlation
## 
## data:  d2$aoa.wb and d2$complexity
## t = 2.0637, df = 50, p-value = 0.04425
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.007869092 0.513785157
## sample estimates:
##       cor 
## 0.2801678

# Pearson correlation test between Kuperman AOA and complexity within d2.
cor.test(x = d2$aoa.k, y = d2$complexity, method = "pearson")

## 
##  Pearson's product-moment correlation
## 
## data:  d2$aoa.k and d2$complexity
## t = 2.1092, df = 50, p-value = 0.03996
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.01403148 0.51830701
## sample estimates:
##       cor 
## 0.2858373

AOA vs. complexity cross-linguistically, and across measures in wordbank

# Wordbank AOAs for every language and measure, grouped by language and
# measure, restricted to 5 < aoa.wb < 35, and joined to the complexity norms
# by matching uni_lemma to word.
aoa.wb.all <- aoa.wb %>%
  group_by(language, measure) %>%
  rename(aoa.wb = aoa) %>%
  filter(aoa.wb > 5, aoa.wb < 35) %>%
  left_join(complexity, by = c("uni_lemma" = "word"))


# One row per language-by-measure group: the AOA-complexity correlation, its
# p-value, and a display string that appends "*" when p < .05.
corrs <- aoa.wb.all %>%
  summarize(
    cor = cor(aoa.wb, complexity, use = "complete.obs"),
    sig = cor.test(aoa.wb, complexity)$p.value
  ) %>%
  mutate(
    sig_char = ifelse(sig < .05, "*", ""),
    sig_string = paste0(round(cor, 2), sig_char)
  )

# AOA vs. complexity, one facet per language and one color per measure. Each
# facet is labelled with the correlation string for "produces" (at y = 30)
# and for "understands" (at y = 10).
ggplot(aoa.wb.all, aes(x = complexity, y = aoa.wb, color = measure)) +
  geom_point() +
  geom_smooth(method = "lm") +
  geom_text(
    data = corrs[corrs$measure == "produces", ],
    aes(label = sig_string, x = 3, y = 30)
  ) +
  geom_text(
    data = corrs[corrs$measure == "understands", ],
    aes(label = sig_string, x = 3, y = 10)
  ) +
  labs(y = "wordbank aoa (months)") +
  facet_grid(. ~ language) +
  themeML

Models predicting AOA with all predictors

Wordbank

# Linear model of Wordbank AOA on complexity, the mrc.* columns, and Lg10WF.
m1 <- lm(
  aoa.wb ~ complexity + mrc.phon + mrc.fam + mrc.conc + mrc.imag + Lg10WF,
  data = d
)
summary(m1)

## 
## Call:
## lm(formula = aoa.wb ~ complexity + mrc.phon + mrc.fam + mrc.conc + 
##     mrc.imag + Lg10WF, data = d)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.9936 -1.8904 -0.2874  1.0553  7.7177 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)   
## (Intercept) 37.314780  11.546274   3.232  0.00314 **
## complexity   0.691157   0.791433   0.873  0.38993   
## mrc.phon     0.359360   0.703751   0.511  0.61361   
## mrc.fam     -0.010570   0.017842  -0.592  0.55834   
## mrc.conc     0.008906   0.011345   0.785  0.43904   
## mrc.imag    -0.033781   0.012945  -2.610  0.01439 * 
## Lg10WF      -1.162721   1.176335  -0.988  0.33141   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.027 on 28 degrees of freedom
##   (464 observations deleted due to missingness)
## Multiple R-squared:  0.4516, Adjusted R-squared:  0.3341 
## F-statistic: 3.843 on 6 and 28 DF,  p-value: 0.006395

Kuperman

# Linear model of Kuperman AOA on complexity, the mrc.* columns, and Lg10WF.
m2 <- lm(
  aoa.k ~ complexity + mrc.phon + mrc.fam + mrc.conc + mrc.imag + Lg10WF,
  data = d
)
summary(m2)

## 
## Call:
## lm(formula = aoa.k ~ complexity + mrc.phon + mrc.fam + mrc.conc + 
##     mrc.imag + Lg10WF, data = d)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.9864 -0.8624 -0.0831  0.8966  6.0482 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 14.6964700  0.9210308  15.957  < 2e-16 ***
## complexity   0.7157948  0.0984249   7.272 2.63e-12 ***
## mrc.phon     0.0196226  0.0514660   0.381  0.70325    
## mrc.fam     -0.0097059  0.0017286  -5.615 4.20e-08 ***
## mrc.conc    -0.0006938  0.0013124  -0.529  0.59744    
## mrc.imag    -0.0044211  0.0015266  -2.896  0.00403 ** 
## Lg10WF      -0.8927329  0.1347983  -6.623 1.45e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.39 on 327 degrees of freedom
##   (165 observations deleted due to missingness)
## Multiple R-squared:  0.7238, Adjusted R-squared:  0.7188 
## F-statistic: 142.8 on 6 and 327 DF,  p-value: < 2.2e-16