There’s some evidence from wordbirth data that AOA is related to complexity.
Relatedly, there’s evidence that less arbitrary meanings are learned earlier: Monaghan, P. et al. (2014) How arbitrary is language? Philos. Trans. R. Soc. Lond. B: Biol. Sci. 369, 20130299
So, here we ask: Are more complex meanings learned later?
aoa.wb from: Wordbank repository.
aoa.k from: Kuperman, V., Stadthagen-Gonzalez, H., & Brysbaert, M. (2012). Age-of-acquisition ratings for 30,000 English words. Behavior Research Methods, 1-13. Retrieved from http://dx.doi.org/10.3758/s13428-012-0210-4
Read in data, etc.
# Load the raw data sets used throughout this analysis.
# Local file paths are resolved relative to the working directory.
complexity <- read.csv("complexityNormsEnglishexp9.csv")
aoa.k <- read.csv("AoA_ratings_Kuperman_et_al_BRM.csv")

# The Wordbank AoA estimates are fetched from GitHub: getURL() returns the
# file contents as one string, which read.csv() then parses via `text =`.
x <- getURL("https://raw.githubusercontent.com/mikabr/aoa-prediction/master/aoa_data.csv")
aoa.wb <- read.csv(text = x)

# Keep one row per distinct `word` in the MRC table.
# NOTE(review): on dplyr >= 0.5, distinct(word) keeps ONLY the `word` column.
# If the other MRC columns are meant to survive into later joins, this needs
# `.keep_all = TRUE` -- confirm against the dplyr version in use.
mrc <- read.csv("MRC_corpus.csv") %>% distinct(word)
morph <- read.csv("CELEX2_numMorph.csv")

# `header = TRUE`, not `T`: `T` is an ordinary variable that can be reassigned.
freq <- read.table("SUBTLEXus_corpus.txt", header = TRUE)
# English "understands" rows from Wordbank with an AoA above 5.
# The estimate column is renamed to `aoa.wb` so it stays distinguishable from
# the Kuperman rating (`aoa.k`) once the tables are joined.
aoa.wb.en <- aoa.wb %>%
  filter(
    measure == "understands",
    language == "English",
    aoa > 5
  ) %>%
  rename(aoa.wb = aoa)
# Kuperman et al. ratings with the mean-rating column exposed as `aoa.k`.
aoa.k.en <- rename(aoa.k, aoa.k = Rating.Mean)
# Assemble the main analysis table: one row per word in the complexity norms,
# with the Kuperman ratings, MRC table, SUBTLEX-US table and Wordbank AoA
# attached by successive left joins (unmatched words get NA).
d <- complexity %>%
  # No `by` is given for the next two joins, so they match on whatever
  # column names the tables have in common.
  left_join(aoa.k.en) %>%
  left_join(mrc) %>%
  left_join(freq, by = c(word = "Word")) %>%
  # Positional drop of columns 1 and 13; this depends on the column order
  # produced by the joins above.
  # NOTE(review): which columns these are is not visible here -- confirm.
  select(-c(1, 13)) %>%
  mutate(word = as.factor(word)) %>%
  left_join(aoa.wb.en, by = c(word = "words"))
# Compare the two AoA measures directly: every Kuperman row, with its
# Wordbank estimate where one exists.
aoa_only <- left_join(aoa.k.en, aoa.wb.en, by = c(word = "words"))

# Scatter plot with a linear fit; the red label is the correlation computed
# over the rows where both measures are present.
ggplot(aoa_only, aes(aoa.k, aoa.wb)) +
  geom_point() +
  geom_smooth(method = "lm") +
  annotate(
    "text",
    x = 2, y = 13, color = "red", size = 10,
    label = paste(
      "r=",
      round(cor(aoa_only$aoa.wb, aoa_only$aoa.k, use = "complete"), 2)
    )
  ) +
  labs(x = "kuperman aoa", y = "wordbank aoa (months)") +
  xlim(1, 8) +
  themeML
Wordbank AOAs
# Complexity against Wordbank AoA for all words in `d`, with a linear fit.
# The red label is the correlation over rows where both values are present.
ggplot(d, aes(complexity, aoa.wb)) +
  geom_point() +
  geom_smooth(method = "lm") +
  annotate(
    "text",
    x = 2, y = 13, color = "red", size = 10,
    label = paste(
      "r=",
      round(cor(d$complexity, d$aoa.wb, use = "complete"), 2)
    )
  ) +
  labs(y = "wordbank aoa (months)") +
  themeML
Kuperman AOAs
# Complexity against the Kuperman AoA rating for all words in `d`, with a
# linear fit. The red label is the correlation over rows where both values
# are present.
ggplot(d, aes(complexity, aoa.k)) +
  geom_point() +
  geom_smooth(method = "lm") +
  annotate(
    "text",
    x = 2, y = 13, color = "red", size = 10,
    label = paste(
      "r=",
      round(cor(d$complexity, d$aoa.k, use = "complete"), 2)
    )
  ) +
  themeML
For only those words in both AOA datasets
There are 52 words in all three data sets.
# Restrict `d` to words that have a complexity norm and both AoA measures.
d2 <- filter(d, !is.na(aoa.wb), !is.na(complexity), !is.na(aoa.k))
# Complexity against Wordbank AoA, restricted to the words in `d2` (those
# present in all three data sets), with a linear fit.
ggplot(d2, aes(x = complexity, y = aoa.wb)) +
  geom_point() +
  geom_smooth(method = "lm") +
  # The label must be computed from `d2`, the data actually plotted. Using
  # the full table `d` here would report r for a different set of words
  # than the points and the cor.test() that follows.
  annotate(
    "text",
    x = 2, y = 13, color = "red", size = 10,
    label = paste(
      "r=",
      round(cor(d2$complexity, d2$aoa.wb, use = "complete.obs"), 2)
    )
  ) +
  ylab("wordbank aoa (months)") +
  ggtitle("Wordbank") +
  themeML
# Complexity against the Kuperman AoA rating, restricted to the words in
# `d2` (those present in all three data sets), with a linear fit.
ggplot(d2, aes(x = complexity, y = aoa.k)) +
  geom_point() +
  geom_smooth(method = "lm") +
  # The label must be computed from `d2`, the data actually plotted. Using
  # the full table `d` here would report r for a different set of words
  # than the points and the cor.test() that follows.
  annotate(
    "text",
    x = 2, y = 6, color = "red", size = 10,
    label = paste(
      "r=",
      round(cor(d2$complexity, d2$aoa.k, use = "complete.obs"), 2)
    )
  ) +
  ggtitle("Kuperman") +
  themeML
# Test the correlation between Wordbank AoA and complexity on the
# complete-case subset `d2` (cor.test defaults: Pearson, two-sided).
cor.test(d2$aoa.wb, d2$complexity)
##
## Pearson's product-moment correlation
##
## data: d2$aoa.wb and d2$complexity
## t = 2.0637, df = 50, p-value = 0.04425
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.007869092 0.513785157
## sample estimates:
## cor
## 0.2801678
# Test the correlation between the Kuperman AoA rating and complexity on the
# complete-case subset `d2` (cor.test defaults: Pearson, two-sided).
cor.test(d2$aoa.k, d2$complexity)
##
## Pearson's product-moment correlation
##
## data: d2$aoa.k and d2$complexity
## t = 2.1092, df = 50, p-value = 0.03996
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.01403148 0.51830701
## sample estimates:
## cor
## 0.2858373
# Wordbank AoA for every language and measure, limited to estimates strictly
# between 5 and 35, with the complexity norm attached by matching `uni_lemma`
# to the norm's `word`. The result stays grouped by language and measure.
aoa.wb.all <- aoa.wb %>%
  group_by(language, measure) %>%
  rename(aoa.wb = aoa) %>%
  filter(aoa.wb > 5, aoa.wb < 35) %>%
  left_join(complexity, by = c(uni_lemma = "word"))
# Per-group (language x measure) correlation between Wordbank AoA and
# complexity, plus a display string: the rounded r, followed by "*" when the
# cor.test() p-value is below .05.
corrs <- aoa.wb.all %>%
  summarise(
    cor = cor(aoa.wb, complexity, use = "complete"),
    sig = cor.test(aoa.wb, complexity)$p.value
  ) %>%
  mutate(
    sig_char = ifelse(sig < .05, "*", ""),
    sig_string = paste0(round(cor, 2), sig_char)
  )
# Complexity against Wordbank AoA, one panel per language, coloured by
# measure, with a linear fit per measure. The correlation strings from
# `corrs` are drawn at fixed positions: "produces" at y = 30 and
# "understands" at y = 10.
ggplot(aoa.wb.all, aes(complexity, aoa.wb, color = measure)) +
  geom_point() +
  geom_smooth(method = "lm") +
  geom_text(
    aes(label = sig_string, x = 3, y = 30),
    data = corrs[corrs$measure == "produces", ]
  ) +
  geom_text(
    aes(label = sig_string, x = 3, y = 10),
    data = corrs[corrs$measure == "understands", ]
  ) +
  labs(y = "wordbank aoa (months)") +
  facet_grid(. ~ language) +
  themeML
Wordbank
# Linear model: Wordbank AoA predicted by complexity alongside the four
# mrc.* covariates and Lg10WF (presumably log10 word frequency -- confirm).
m1 <- lm(
  aoa.wb ~ complexity + mrc.phon + mrc.fam + mrc.conc + mrc.imag + Lg10WF,
  data = d
)
summary(m1)
##
## Call:
## lm(formula = aoa.wb ~ complexity + mrc.phon + mrc.fam + mrc.conc +
## mrc.imag + Lg10WF, data = d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.9936 -1.8904 -0.2874 1.0553 7.7177
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.314780 11.546274 3.232 0.00314 **
## complexity 0.691157 0.791433 0.873 0.38993
## mrc.phon 0.359360 0.703751 0.511 0.61361
## mrc.fam -0.010570 0.017842 -0.592 0.55834
## mrc.conc 0.008906 0.011345 0.785 0.43904
## mrc.imag -0.033781 0.012945 -2.610 0.01439 *
## Lg10WF -1.162721 1.176335 -0.988 0.33141
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.027 on 28 degrees of freedom
## (464 observations deleted due to missingness)
## Multiple R-squared: 0.4516, Adjusted R-squared: 0.3341
## F-statistic: 3.843 on 6 and 28 DF, p-value: 0.006395
Kuperman
# Linear model: Kuperman AoA rating predicted by complexity alongside the
# four mrc.* covariates and Lg10WF (presumably log10 word frequency -- confirm).
m2 <- lm(
  aoa.k ~ complexity + mrc.phon + mrc.fam + mrc.conc + mrc.imag + Lg10WF,
  data = d
)
summary(m2)
##
## Call:
## lm(formula = aoa.k ~ complexity + mrc.phon + mrc.fam + mrc.conc +
## mrc.imag + Lg10WF, data = d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.9864 -0.8624 -0.0831 0.8966 6.0482
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.6964700 0.9210308 15.957 < 2e-16 ***
## complexity 0.7157948 0.0984249 7.272 2.63e-12 ***
## mrc.phon 0.0196226 0.0514660 0.381 0.70325
## mrc.fam -0.0097059 0.0017286 -5.615 4.20e-08 ***
## mrc.conc -0.0006938 0.0013124 -0.529 0.59744
## mrc.imag -0.0044211 0.0015266 -2.896 0.00403 **
## Lg10WF -0.8927329 0.1347983 -6.623 1.45e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.39 on 327 degrees of freedom
## (165 observations deleted due to missingness)
## Multiple R-squared: 0.7238, Adjusted R-squared: 0.7188
## F-statistic: 142.8 on 6 and 327 DF, p-value: < 2.2e-16