Read in complexity norms.
# Load the complexity norms, keeping the mean rating (exposed as `complexity`),
# the word length, and the word itself.
complexity <- read.csv("data/cdi_complexity_norms.csv") %>%
  select(complexity = mean, wordLength, word)
# Word length against mean complexity rating: one jittered label per word,
# with a linear fit drawn on top.
ggplot(complexity, aes(x = complexity, y = wordLength)) +
  geom_label(aes(label = word), position = "jitter") +
  geom_smooth(method = "lm") +
  xlim(1, 7) +
  labs(x = "Mean complexity rating", y = "word length (char)") +
  theme_bw(base_size = 18)
# Correlation test between word length and complexity rating, reduced to the
# estimate, test statistic, and p-value and printed as a table.
cor.test(complexity$wordLength, complexity$complexity) %>%
  tidy() %>%
  select(estimate, statistic, p.value) %>%
  kable()
| estimate | statistic | p.value |
|---|---|---|
| 0.3482136 | 9.279105 | 0 |
## CB for Mika's AOA words
# Per-word bootstrapped summary of `complexity` for the words in `d`, with each
# word's character count and mean phoneme count attached.
# NOTE(review): `d` is only assigned further down in this document — confirm it
# exists when this chunk runs.
cdi.means <- d %>%
  group_by(senseless_cdi_words) %>%
  multi_boot_standard(column = "complexity") %>%
  mutate(two_words = grepl(" ", senseless_cdi_words),
         wordLength = nchar(senseless_cdi_words)) %>%
  # Join on the word explicitly rather than relying on the implicit
  # "all shared columns" default.
  left_join(d %>%
              group_by(senseless_cdi_words) %>%
              summarise(num_phonemes = mean(num_phonemes)),
            by = "senseless_cdi_words") %>%
  # Filter out two-word items (e.g. "washing machine").
  filter(!two_words)
# Number of phonemes against bootstrapped mean complexity, one jittered label
# per word, with a linear fit. The y-axis label names the plotted variable
# (num_phonemes), not character length.
ggplot(cdi.means, aes(x = mean, y = num_phonemes)) +
  geom_label(aes(label = senseless_cdi_words), position = "jitter") +
  geom_smooth(method = "lm") +
  xlim(1, 7) +
  ylab("Number of phonemes") +
  xlab("Mean complexity rating") +
  theme_bw(base_size = 18)
Read in aoa and other predictors.
# Word list: keep only the `cdi_words` and `senseless_cdi_words` columns.
words <- select(read.csv("RC43_words.csv"), cdi_words, senseless_cdi_words)

# English rows of the AoA predictor table, with `senseless_cdi_words` attached
# by matching `uni_lemma` to `cdi_words`.
aoa_preds <- read.csv("data/all_aoa_pred.csv") %>%
  filter(language == "English") %>%
  left_join(words, by = c("uni_lemma" = "cdi_words"))

# "comb" and "dress" do not merge (cause unknown); "in" may be missing.
# filter(aoa_preds, is.na(senseless_cdi_words) & language == "English")
# Attach the complexity norms to the English AoA predictors, keep only words
# that have a complexity rating, and drop items containing a space.
# NOTE(review): `word` is the right-hand join key; confirm the installed dplyr
# still exposes it after the join, otherwise `select(-word)` will fail.
d <- aoa_preds %>%
  left_join(complexity, by = c("senseless_cdi_words" = "word")) %>%
  filter(language == "English", !is.na(complexity)) %>%
  select(-word) %>%
  mutate(two_words = grepl(" ", senseless_cdi_words),
         wordLength = nchar(senseless_cdi_words)) %>%
  filter(!two_words)
# For some reason the data from Mika only includes AoAs for 391 words.
# Stack the predictor columns into long format (`measure_pred` / `value`).
# NOTE(review): columns are chosen by position (9-18 and 21); this silently
# breaks if the column order of `d` changes — confirm against names(d).
complexity.correlates <- gather(
  d,
  key = "measure_pred",
  value = "value",
  c(9:18, 21)
)
# One panel per predictor: predictor value against complexity with a linear
# fit; each panel gets its own y scale.
ggplot(data = complexity.correlates,
       mapping = aes(x = complexity, y = value)) +
  geom_point() +
  geom_smooth(method = "lm") +
  facet_wrap(~measure_pred, scales = "free_y")
# Same panels as the pooled plot, but with a separate linear fit per lexical
# category and no confidence bands.
ggplot(complexity.correlates, aes(x = complexity, y = value)) +
  geom_point() +
  # Use FALSE rather than the reassignable shorthand F.
  geom_smooth(method = "lm", se = FALSE, aes(color = lexical_category)) +
  facet_wrap(~measure_pred, scales = "free_y")
Complexity is reliably correlated with arousal, AoA, and log frequency (as well as the length measures).
# Correlation of each predictor with complexity, one row per predictor.
# cor.test() is run once per group (the same do(tidy(...)) pattern used for
# the per-language fits) instead of once for r and again for p.
complexity.correlates %>%
  group_by(measure_pred) %>%
  do(tidy(cor.test(.$value, .$complexity))) %>%
  ungroup() %>%
  transmute(measure_pred,
            r = estimate,
            p = p.value,
            # Flag correlations significant at the .05 level.
            sig = ifelse(p < .05, "*", "")) %>%
  kable()
| measure_pred | r | p | sig |
|---|---|---|---|
| aoa | 0.1622411 | 0.0019854 | * |
| arousal | 0.2108207 | 0.0001226 | * |
| babiness | 0.0119442 | 0.8362321 | |
| concreteness | -0.0254391 | 0.6314232 | |
| dominance | -0.0143007 | 0.7966966 | |
| iconicity | -0.0775841 | 0.1787209 | |
| log.frequency | -0.1308043 | 0.0140517 | * |
| num_phonemes | 0.3725241 | 0.0000000 | * |
| num_syllables | 0.3260802 | 0.0000000 | * |
| valence | 0.0511560 | 0.3564645 | |
| wordLength | 0.3887051 | 0.0000000 | * |
Does complexity predict AoA, controlling for the other predictors? Yes.
# Linear model of AoA on complexity plus five other predictors, printed as a
# coefficient table.
lm(aoa ~ complexity + wordLength + babiness + concreteness + log.frequency +
     arousal,
   data = d) %>%
  tidy() %>%
  kable()
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 9.8687783 | 1.9000432 | 5.193976 | 0.0000004 |
| complexity | 0.5154499 | 0.2558496 | 2.014660 | 0.0449699 |
| wordLength | -0.1477522 | 0.1449219 | -1.019530 | 0.3088993 |
| babiness | -0.4059421 | 0.0850850 | -4.771018 | 0.0000031 |
| concreteness | -1.0064352 | 0.2441284 | -4.122565 | 0.0000504 |
| log.frequency | -1.1883704 | 0.1781925 | -6.669025 | 0.0000000 |
| arousal | 0.4260905 | 0.2037121 | 2.091630 | 0.0374420 |
Does the complexity bias in the “lexicon” of the child change across development? Yes.
# [Mika data]
# Keep words with 2 < AoA < 26, bin AoA into intervals of width 4, and stack
# three columns into long format as `length_metric` / `length`.
# NOTE(review): the stacked columns are chosen by position (17, 18, 21) —
# confirm against names(d) that these are the intended length measures.
d.devo <- d %>%
  filter(aoa > 2, aoa < 26) %>%
  mutate(aoa.cut = cut_width(aoa, width = 4)) %>%
  gather(key = "length_metric", value = "length", c(17, 18, 21))
# Length against complexity, one panel per AoA bin, with a separate linear fit
# for each length metric.
ggplot(d.devo, aes(x = complexity, y = length)) +
  geom_point() +
  geom_smooth(method = "lm", aes(color = length_metric)) +
  facet_grid(~aoa.cut) +
  labs(title = "Age", x = "Mean complexity rating", y = "Length") +
  theme_bw(base_size = 18)
# Distribution of length within each AoA bin. Length is on the x axis, so it
# is labelled there; the y axis is the estimated density.
ggplot(d.devo, aes(x = length, color = aoa.cut)) +
  geom_density() +
  #facet_grid(~aoa.cut) +
  xlab("Length") +
  ylab("Density") +
  ggtitle("Age") +
  theme_bw(base_size = 18)
# Mean and SD of length per AoA bin, drawn as two lines.
# NOTE(review): `d.devo` is in long format, so these summaries pool every
# length metric together — confirm that is intended.
d.devo %>%
  group_by(aoa.cut) %>%
  summarise(mean = mean(length),
            sd = sd(length)) %>%
  gather(measure, value, mean, sd) %>%
  ggplot(aes(x = aoa.cut, y = value, color = measure, group = measure)) +
  geom_point() +
  geom_line() +
  theme_bw()
Complexity bias decreases across development – Is this because variability in words decreases?
The above isn’t really vocabulary - it’s the set of new words learned at each stage. Now, let’s look at cumulative vocabulary.
# Rows of `d.devo` falling in each AoA bin.
# NOTE(review): the bin labels are hard-coded — confirm they match
# levels(d.devo$aoa.cut).
c1 <- d.devo %>% filter(aoa.cut == "[6,10]")
c2 <- d.devo %>% filter(aoa.cut == "(10,14]")
c3 <- d.devo %>% filter(aoa.cut == "(14,18]")
c4 <- d.devo %>% filter(aoa.cut == "(18,22]")
c5 <- d.devo %>% filter(aoa.cut == "(22,26]")

# Cumulative sets: stage k stacks bins 1..k and is tagged with age = k.
cc1 <- c1 %>% mutate(age = 1)
cc2 <- mutate(rbind(c1, c2), age = 2)
cc3 <- mutate(rbind(c1, c2, c3), age = 3)
cc4 <- mutate(rbind(c1, c2, c3, c4), age = 4)
cc5 <- mutate(rbind(c1, c2, c3, c4, c5), age = 5)

# All stages stacked into one long data frame.
d.devo.cumulative <- rbind(cc1, cc2, cc3, cc4, cc5)
# Length against complexity for the cumulative sets, one panel per stage, with
# a separate linear fit for each length metric.
ggplot(d.devo.cumulative, aes(x = complexity, y = length)) +
  geom_point() +
  geom_smooth(method = "lm", aes(color = length_metric)) +
  facet_grid(~age) +
  labs(title = "Age", x = "Mean complexity rating", y = "Length") +
  theme_bw(base_size = 18)
# Mean and SD of length at each cumulative stage, drawn as two lines.
d.devo.cumulative %>%
  group_by(age) %>%
  summarise(mean = mean(length),
            sd = sd(length)) %>%
  gather(measure, value, mean, sd) %>%
  ggplot(aes(x = age, y = value, color = measure, group = measure)) +
  geom_point() +
  geom_line() +
  theme_bw()
The decrease in SD is smaller in the cumulative vocabulary, but still present.
# AoA predictors for every language, joined to the word list and the
# complexity norms; rows without a complexity rating and multi-word items are
# dropped.
# NOTE(review): `words` inside mutate() is presumably a column of
# all_aoa_pred.csv rather than the `words` data frame — confirm.
aoa_preds.all <- read.csv("data/all_aoa_pred.csv") %>%
  left_join(words, by = c("uni_lemma" = "cdi_words")) %>%
  left_join(complexity, by = c("senseless_cdi_words" = "word")) %>%
  filter(!is.na(complexity)) %>%
  select(-word) %>%
  # Elementwise `|` so every row is tested; scalar `||` would only look at the
  # first row (and is an error on vectors in R >= 4.3).
  mutate(two_words = grepl(" ", words) | grepl(" ", senseless_cdi_words),
         wordLength = nchar(as.character(words))) %>%
  filter(!two_words)
# Keep only words shorter than 12 characters.
d.devo.all <- filter(aoa_preds.all, wordLength < 12)
# Per-language correlations from lewis_2015.csv, with the correlation and CI
# columns given a `.g` suffix.
# NOTE(review): this is an absolute, machine-specific path — confirm it
# resolves where this document is knitted.
google.cb <- rename(
  read.csv("/Documents/GRADUATE_SCHOOL/Projects/langLearnVar/data/lewis_2015.csv"),
  corr.g = corr,
  lci.g = lower.ci,
  hci.g = upper.ci
)
# Per-language correlation test between word length and complexity; language
# names are lower-cased so they can be matched against `google.cb`.
d.model.fits.lang <- d.devo.all %>%
  group_by(language = tolower(language)) %>%
  do(tidy(cor.test(.$wordLength, .$complexity))) %>%
  rename(corr.aoa = estimate, lci.aoa = conf.low, hci.aoa = conf.high)
# Put the two sets of per-language correlations side by side, matched on
# language.
all_xling_corrs <- d.model.fits.lang %>%
  left_join(google.cb, by = "language")
# One point per language: the `.g` correlation on x against the `.aoa`
# correlation on y, with confidence intervals in both directions, a linear
# fit, and language labels.
ggplot(all_xling_corrs, aes(x = corr.g, y = corr.aoa)) +
  geom_point() +
  geom_smooth(method = "lm") +
  geom_errorbar(aes(ymin = lci.aoa, ymax = hci.aoa)) +
  geom_errorbarh(aes(xmin = lci.g, xmax = hci.g)) +
  geom_label(aes(label = language), size = 3) +
  xlim(0, .75) +
  ylim(-.1, .75) +
  theme_bw(base_size = 15) +
  labs(title = "Cross-sample correlations of CB",
       x = "Google Translate CB Correlation",
       y = "CDI WG Comprehension CB Correlation")
# write.csv(d.model.fits.lang, "aoa_corrs.csv")
# Correlation test between the two per-language correlation estimates,
# reduced to the estimate, test statistic, and p-value.
cor.test(all_xling_corrs$corr.g, all_xling_corrs$corr.aoa) %>%
  tidy() %>%
  select(estimate, statistic, p.value) %>%
  kable()
| estimate | statistic | p.value |
|---|---|---|
| 0.8407045 | 3.47164 | 0.0178179 |
The two samples are positively correlated.