Read in complexity norms.
# Load the complexity norms, keeping only the mean rating, the word length and
# the word itself; the mean rating column is renamed to `complexity`.
complexity <- read.csv("data/cdi_complexity_norms.csv") %>%
  select(mean, wordLength, word) %>%
  rename(complexity = mean)

# Word length against mean complexity rating: one jittered label per word,
# with a linear fit overlaid.
ggplot(complexity, aes(x = complexity, y = wordLength)) +
  geom_label(aes(label = word), position = "jitter") +
  # geom_point() +
  geom_smooth(method = "lm") +
  xlim(1, 7) +
  xlab("Mean complexity rating") +
  ylab("word length (char)") +
  theme_bw(base_size = 18)

# Correlation test between word length and complexity, tidied into a
# one-row data frame.
tidy(cor.test(complexity$wordLength, complexity$complexity))
## estimate statistic p.value parameter conf.low conf.high
## 1 0.3482136 9.279105 0 624 0.2774203 0.415246
Read in aoa and other predictors.
# Two-column lookup: `cdi_words` and `senseless_cdi_words`.
words <- read.csv("../list_maker/RC43_words.csv") %>%
  select(cdi_words, senseless_cdi_words)

# AoA predictors with `senseless_cdi_words` attached by matching `uni_lemma`
# against `cdi_words`, restricted to English rows.
aoa_preds <- read.csv("data/all_aoa_pred.csv") %>%
  left_join(words, by = c("uni_lemma" = "cdi_words")) %>%
  filter(language == "English")

# "comb", "dress" not merge, not sure why. "in" missing?
# filter(aoa_preds, is.na(senseless_cdi_words) & language == "English")

# Analysis frame: English items that have a complexity rating and are a
# single word. `wordLength` is recomputed from `senseless_cdi_words`,
# overwriting the value brought in by the join.
# NOTE(review): later chunks select columns of `d` by position, so the column
# order produced here must not change.
d <- aoa_preds %>%
  left_join(complexity, by = c("senseless_cdi_words" = "word")) %>%
  filter(language == "English", !is.na(complexity)) %>%
  select(-word) %>%
  mutate(
    two_words = grepl(" ", senseless_cdi_words),
    wordLength = nchar(senseless_cdi_words)
  ) %>%
  filter(!two_words)
# for some reason the data from mika only includes AOAs for 391 words

# Per-word summary of the complexity rating with the per-word mean phoneme
# count attached; two-word items are dropped.
# NOTE(review): `multi_boot_standard` is not defined in this file; the plot
# below relies on it returning a `mean` column -- confirm against its package.
cdi.means <- d %>%
  group_by(senseless_cdi_words) %>%
  multi_boot_standard(column = "complexity") %>%
  mutate(
    two_words = grepl(" ", senseless_cdi_words),
    wordLength = nchar(senseless_cdi_words)
  ) %>%
  left_join(
    d %>%
      group_by(senseless_cdi_words) %>%
      summarise(num_phonemes = mean(num_phonemes)),
    # Explicit key: avoids relying on whichever columns happen to share a name.
    by = "senseless_cdi_words"
  ) %>%
  filter(!two_words) # filter out 2 word items (e.g. "washing machine")

# Phoneme count against mean complexity rating, one jittered label per word.
# The y axis shows `num_phonemes`, so it is labelled as phonemes (the previous
# label said characters).
ggplot(cdi.means, aes(y = num_phonemes, x = mean)) +
  geom_label(aes(label = senseless_cdi_words), position = "jitter") +
  # geom_point() +
  geom_smooth(method = "lm") +
  xlim(1, 7) +
  ylab("word length (phonemes)") +
  xlab("Mean complexity rating") +
  theme_bw(base_size = 18)
# Long format: one row per (item, predictor) pair, with the predictor name in
# `measure_pred` and its value in `value`.
# NOTE(review): the predictors are picked by column position (9:18 and 21), so
# this silently selects different columns if the layout of `d` changes.
complexity.correlates <- d %>%
  gather("measure_pred", "value", c(9:18, 21))

# Each predictor against complexity, one facet per predictor, pooled fit.
ggplot(complexity.correlates, aes(x = complexity, y = value)) +
  geom_point() +
  geom_smooth(method = "lm") +
  facet_wrap(~measure_pred, scales = "free_y")

# Same facets, with a separate fit line per `lexical_category`.
# `FALSE` is spelled out because `F` is an ordinary, reassignable variable.
ggplot(complexity.correlates, aes(x = complexity, y = value)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, aes(color = lexical_category)) +
  facet_wrap(~measure_pred, scales = "free_y")

# Same facets, with a separate fit line per `lexical_class`.
ggplot(complexity.correlates, aes(x = complexity, y = value)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, aes(color = lexical_class)) +
  facet_wrap(~measure_pred, scales = "free_y")
Complexity is reliably correlated with arousal and aoa (as well as length)
# Correlation of each predictor with complexity: estimate `r`, p-value `p`,
# and a "*" flag in `sig` when p < .05.
# The test is run once per predictor (previously it ran twice, once for the
# estimate and once for the p-value), using the same do(tidy(cor.test()))
# pattern as the cross-linguistic chunks in this file.
complexity.correlates %>%
  group_by(measure_pred) %>%
  do(tidy(cor.test(.$value, .$complexity))) %>%
  ungroup() %>%
  transmute(
    measure_pred,
    r = estimate,
    p = p.value,
    sig = ifelse(p < .05, "*", "")
  )
## Source: local data frame [11 x 4]
##
## measure_pred r p sig
## (chr) (dbl) (dbl) (chr)
## 1 aoa 0.16224113 1.985427e-03 *
## 2 arousal 0.21082074 1.225547e-04 *
## 3 babiness 0.01194425 8.362321e-01
## 4 concreteness -0.02543910 6.314232e-01
## 5 dominance -0.01430071 7.966966e-01
## 6 iconicity -0.07758407 1.787209e-01
## 7 log.frequency -0.13080431 1.405175e-02 *
## 8 num_phonemes 0.37252412 2.509104e-13 *
## 9 num_syllables 0.32608022 2.175871e-10 *
## 10 valence 0.05115601 3.564645e-01
## 11 wordLength 0.38870510 1.820766e-14 *
Yes.
# Linear model of AoA on complexity plus five covariates, fitted on `d`;
# the fit summary is printed.
summary(
  lm(
    formula = aoa ~ complexity + wordLength + babiness + concreteness +
      log.frequency + arousal,
    data = d
  )
)
##
## Call:
## lm(formula = aoa ~ complexity + wordLength + babiness + concreteness +
## log.frequency + arousal, data = d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.8862 -1.7444 -0.0944 1.2752 10.3457
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.86878 1.90004 5.194 4.16e-07 ***
## complexity 0.51545 0.25585 2.015 0.0450 *
## wordLength -0.14775 0.14492 -1.020 0.3089
## babiness -0.40594 0.08509 -4.771 3.06e-06 ***
## concreteness -1.00644 0.24413 -4.123 5.04e-05 ***
## log.frequency -1.18837 0.17819 -6.669 1.54e-10 ***
## arousal 0.42609 0.20371 2.092 0.0374 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.767 on 260 degrees of freedom
## (94 observations deleted due to missingness)
## Multiple R-squared: 0.263, Adjusted R-squared: 0.246
## F-statistic: 15.47 on 6 and 260 DF, p-value: 3.658e-15
No evidence for this. Maybe residuals are smaller for words with earlier AoA and bigger for words with later AoA?
# Summaries of two linear fits on `d`: phoneme count on complexity, and AoA on
# word length plus log frequency. Residuals are read from these summaries.
cb.mod <- summary(lm(num_phonemes ~ complexity, data = d))
aoa.mod <- summary(lm(aoa ~ wordLength + log.frequency, data = d))

# One-column data frames of residuals; the row names are copied into a
# `rownames` column so they can be used as a join key.
aoa.resids <- as.data.frame(resid(aoa.mod)) %>%
  mutate(rownames = rownames(.))
cb.resids <- as.data.frame(resid(cb.mod)) %>%
  mutate(rownames = rownames(.))

# Attach both residual columns to `d`. The join key is stated explicitly so
# the match cannot change if another column name ever coincides. Rows without
# a residual in a given model get NA from the left join.
d.resid <- d %>%
  mutate(rownames = rownames(.)) %>%
  left_join(aoa.resids, by = "rownames") %>%
  left_join(cb.resids, by = "rownames") %>%
  rename(
    aoa.resid = `resid(aoa.mod)`,
    cb.resid = `resid(cb.mod)`
  )

# AoA against the absolute complexity-model residual.
# NOTE(review): the filter is on the signed residual, so large negative
# residuals are kept while the x axis shows abs(); confirm this is intended.
d.resid %>%
  filter(cb.resid < 4) %>%
  ggplot(aes(y = aoa, x = abs(cb.resid))) +
  geom_label(aes(label = senseless_cdi_words), position = "jitter") +
  # geom_point() +
  geom_smooth(method = "lm") +
  theme_bw(base_size = 18)

# AoA against the signed complexity-model residual.
d.resid %>%
  filter(cb.resid < 6) %>%
  ggplot(aes(y = aoa, x = cb.resid)) +
  geom_label(aes(label = senseless_cdi_words), position = "jitter") +
  # geom_point() +
  geom_smooth(method = "lm") +
  theme_bw(base_size = 18)
Does the complexity bias in the “lexicon” of the child change across development? Yes!
# Items with 2 < aoa < 26, binned into width-4 AoA slices, then reshaped so
# that three columns become rows (`length_metric` names the column, `length`
# holds its value).
# NOTE(review): the three columns are chosen by position (17, 18, 21);
# presumably the length measures -- confirm against names(d).
d.devo <- d %>%
  filter(aoa > 2, aoa < 26) %>%
  mutate(aoa.cut = cut_width(aoa, width = 4)) %>%
  gather("length_metric", "length", c(17, 18, 21))

# Length against complexity, one panel per AoA slice and one fit line per
# length metric.
ggplot(d.devo, aes(x = complexity, y = length)) +
  geom_point() +
  geom_smooth(method = "lm", aes(color = length_metric)) +
  facet_grid(~aoa.cut) +
  xlab("Mean complexity rating") +
  ylab("Length") +
  ggtitle("Age") +
  theme_bw(base_size = 18)
(note I also have AOA as WG comprehension from Mike, but it’s different from Mika’s data – check this.)
Read in aoa and other predictors.
# AoA data from the WG production file, with `senseless_cdi_words` attached
# by matching `uni_lemma` against `cdi_words`.
aoa_preds.wg <- read.csv("data/eng_wg_production_aoas.csv") %>%
  left_join(words, by = c("uni_lemma" = "cdi_words"))

# Keep single-word items that have a complexity rating; word length is the
# character count of `senseless_cdi_words`.
d.wg <- aoa_preds.wg %>%
  left_join(complexity, by = c("senseless_cdi_words" = "word")) %>%
  filter(!is.na(complexity)) %>%
  mutate(
    two_words = grepl(" ", senseless_cdi_words),
    wordLength = nchar(senseless_cdi_words)
  ) %>%
  filter(!two_words)

# Items with 10 < aoa < 26, binned into width-4 AoA slices.
d.devo.wg <- d.wg %>%
  filter(aoa > 10, aoa < 26) %>%
  mutate(aoa.cut = cut_width(aoa, width = 4))

# Word length against complexity, one panel per AoA slice.
ggplot(d.devo.wg, aes(x = complexity, y = wordLength)) +
  geom_point() +
  geom_smooth(method = "lm") +
  facet_grid(~aoa.cut) +
  xlab("Mean complexity rating") +
  ylab("word length (number of characters)") +
  ggtitle("Age") +
  theme_bw(base_size = 18)
Read in aoa and other predictors.
# AoA data from the WS production file, with `senseless_cdi_words` attached
# by matching `uni_lemma` against `cdi_words`.
aoa_preds.ws <- read.csv("data/eng_ws_production_aoas.csv") %>%
  left_join(words, by = c("uni_lemma" = "cdi_words"))

# Keep single-word items that have a complexity rating; word length is the
# character count of `senseless_cdi_words`.
d.ws <- aoa_preds.ws %>%
  left_join(complexity, by = c("senseless_cdi_words" = "word")) %>%
  filter(!is.na(complexity)) %>%
  mutate(
    two_words = grepl(" ", senseless_cdi_words),
    wordLength = nchar(senseless_cdi_words)
  ) %>%
  filter(!two_words)

# Items with aoa > 17, binned into width-3 AoA slices.
d.devo.ws <- d.ws %>%
  filter(aoa > 17) %>%
  mutate(aoa.cut = cut_width(aoa, width = 3))

# Word length against complexity, one panel per AoA slice.
ggplot(d.devo.ws, aes(x = complexity, y = wordLength)) +
  geom_point() +
  geom_smooth(method = "lm") +
  facet_grid(~aoa.cut) +
  xlab("Mean complexity rating") +
  ylab("word length (number of characters)") +
  ggtitle("Age") +
  theme_bw(base_size = 18)
The word count here is pretty rough, because of issues with multi-word items and unicode translations.
# All-language AoA predictors joined to the complexity norms. Items are
# dropped when either the `words` value or the `senseless_cdi_words` value
# contains a space; word length is the character count of `words`.
# NOTE(review): inside mutate(), `words` resolves to a column of the joined
# data if one exists; otherwise it would pick up the global `words` lookup
# table -- confirm all_aoa_pred.csv has a `words` column.
aoa_preds.all <- read.csv("data/all_aoa_pred.csv") %>%
  left_join(words, by = c("uni_lemma" = "cdi_words")) %>%
  left_join(complexity, by = c("senseless_cdi_words" = "word")) %>%
  filter(!is.na(complexity)) %>%
  select(-word) %>%
  mutate(
    # Elementwise `|`: the scalar `||` only looked at the first row (and is an
    # error for length > 1 inputs on R >= 4.3), so the flag was the same for
    # every row.
    two_words = grepl(" ", words) | grepl(" ", senseless_cdi_words),
    wordLength = nchar(as.character(words))
  ) %>%
  filter(!two_words)

# Drop items of 12 or more characters before plotting.
d.devo.all <- aoa_preds.all %>%
  filter(wordLength < 12)

# Word length against complexity, one panel and one fit line per language.
ggplot(d.devo.all, aes(y = wordLength, x = complexity)) +
  geom_point(size = .2) +
  facet_grid(~language) +
  geom_smooth(method = "lm", aes(color = language)) +
  ylab("Word length (characters)") +
  xlab("Mean complexity rating") +
  ggtitle("Complexity Bias") +
  theme_bw(base_size = 15) +
  theme(legend.position = "none")
# Per-language correlations from an external CSV; the columns are renamed with
# a `.g` suffix so they stay distinct from the estimates computed below.
# NOTE(review): this is an absolute path, so the chunk only runs where that
# directory exists.
google.cb <- read.csv('/Documents/GRADUATE_SCHOOL/Projects/langLearnVar/data/lewis_2015.csv') %>%
  rename(
    corr.g = corr,
    lci.g = lower.ci,
    hci.g = upper.ci
  )

# One length-by-complexity correlation test per language (names lower-cased),
# with the estimate and confidence bounds renamed with an `.aoa` suffix.
d.model.fits.lang <- d.devo.all %>%
  mutate(language = tolower(language)) %>%
  group_by(language) %>%
  do(tidy(cor.test(.$wordLength, .$complexity))) %>%
  rename(
    corr.aoa = estimate,
    lci.aoa = conf.low,
    hci.aoa = conf.high
  )

# Both sets of correlations side by side, matched on language.
all_xling_corrs <- left_join(d.model.fits.lang, google.cb, by = "language")

# One labelled point per language, with confidence intervals drawn as error
# bars in both directions.
ggplot(all_xling_corrs, aes(x = corr.g, y = corr.aoa)) +
  geom_point() +
  geom_smooth(method = "lm") +
  geom_errorbar(aes(ymin = lci.aoa, ymax = hci.aoa)) +
  geom_errorbarh(aes(xmin = lci.g, xmax = hci.g)) +
  geom_label(aes(label = language), size = 3) +
  xlim(0, .75) +
  ylim(-.1, .75) +
  theme_bw(base_size = 15) +
  ggtitle("Cross-sample correlations of CB") +
  xlab("Google Translate CB Correlation") +
  ylab("CDI WG Comprehension CB Correlation")
The bias is overall smaller in the CDI data, but the two samples are reasonably correlated.
# Redefines `d.devo.all`: items with 10 < aoa < 26 and fewer than 12
# characters, binned into width-4 AoA slices.
d.devo.all <- aoa_preds.all %>%
  filter(aoa > 10, aoa < 26) %>%
  filter(wordLength < 12) %>%
  mutate(aoa.cut = cut_width(aoa, width = 4))

# Word length against complexity in a language-by-AoA-slice grid, with free
# axis scales and one fit line per language.
ggplot(d.devo.all, aes(x = complexity, y = wordLength)) +
  geom_point(size = .2) +
  geom_smooth(method = "lm", aes(color = language)) +
  facet_grid(language ~ aoa.cut, scales = "free") +
  xlab("Mean complexity rating") +
  ylab("Length") +
  ggtitle("Age") +
  theme_bw(base_size = 15) +
  theme(legend.position = "none")
# One length-by-complexity correlation test per (AoA slice, language) cell,
# with a significance label at the .05 level.
d.model.fits <- d.devo.all %>%
  group_by(aoa.cut, language) %>%
  do(tidy(cor.test(.$wordLength, .$complexity))) %>%
  # Vectorised ifelse(): the scalar `if` only worked while every group held
  # exactly one row.
  mutate(sig = ifelse(p.value < .05, "significant", "not significant"))

# Correlation estimate with its confidence interval per AoA slice, one panel
# per language; point transparency encodes significance.
ggplot(d.model.fits, aes(x = aoa.cut, y = estimate, color = language)) +
  geom_pointrange(aes(
    ymin = conf.low,
    ymax = conf.high,
    alpha = sig
  )) +
  geom_line(aes(group = language)) +
  ylab("complexity bias (pearson's r)") +
  xlab("age slice") +
  facet_grid(~language) +
  geom_hline(yintercept = 0) +
  theme_bw() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 90, hjust = 1)
  )
Significance is indicated by the alpha of the point.