First we load in the frequency data from several corpora preprocessed in CDIorigins.Rmd. The frequencies are already normalized to counts per million tokens.
load("data/merged_word_freqs.Rdata")
aoas <- readRDS("data/english_(american)_aoa_bydefinition.rds") %>%
filter(measure=="produces")
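For reference, counts-per-million normalization amounts to dividing each word's raw count by the corpus total and multiplying by one million. A minimal sketch (the `raw_count` vector is a hypothetical input; the loaded frequencies were already normalized this way in CDIorigins.Rmd):
# Sketch of counts-per-million normalization (hypothetical input).
normalize_cpm <- function(raw_count) {
  raw_count / sum(raw_count) * 1e6
}
normalize_cpm(c(the = 50, dog = 3, zygote = 0))  # returns counts per million tokens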
We will use these data to pick compelling examples.
# Montag corpus
dd <- ch_freq_smooth %>% filter(on_cdi==1) %>%
select(-ch_book_vs_speech)
# Charlesworth (FB children's books)
dd_novels <- ch_freq_smooth_charles %>% filter(on_cdi==1) %>%
select(-ch_book_vs_speech)
ddd <- dd %>% select(-prop_booky) %>%
left_join(dd_novels %>% select(-prop_booky)) %>%
mutate(Nletters = ifelse(is.na(Nletters), nchar(word), Nletters)) %>%
left_join(aoas %>% select(-measure))
## Joining, by = c("word", "definition", "on_cdi", "Nletters", "lexical_class", "adult_books")
## Joining, by = "definition"
dddc <- ddd %>% mutate(adult_books = ifelse(is.na(adult_books), 0.07322, adult_books),
speech = ifelse(is.na(speech), 10, speech),
books = ifelse(is.na(books), 10, books),
tv = ifelse(is.na(tv), 10, tv))
summary(ddd)
## word definition on_cdi CHILDES
## Length:758 Length:758 Min. :1 Min. : 10.00
## Class :character Class :character 1st Qu.:1 1st Qu.: 73.37
## Mode :character Mode :character Median :1 Median : 163.35
## Mean :1 Mean : 948.43
## 3rd Qu.:1 3rd Qu.: 485.15
## Max. :1 Max. :44512.28
##
## Montag Nletters lexical_class adult_books
## Min. : 10.00 Min. : 1.000 Length:758 Min. : 0.07
## 1st Qu.: 52.95 1st Qu.: 4.000 Class :character 1st Qu.: 7.12
## Median : 138.86 Median : 5.000 Mode :character Median : 31.31
## Mean : 821.61 Mean : 4.892 Mean : 663.56
## 3rd Qu.: 482.50 3rd Qu.: 6.000 3rd Qu.: 138.27
## Max. :51941.50 Max. :14.000 Max. :71382.55
## NA's :14
## books tv speech language
## Min. : 10.00 Min. : 10.00 Min. : 10.18 Length:758
## 1st Qu.: 20.31 1st Qu.: 46.24 1st Qu.: 76.23 Class :character
## Median : 79.25 Median : 127.46 Median : 173.94 Mode :character
## Mean : 788.44 Mean : 748.65 Mean : 956.18
## 3rd Qu.: 290.57 3rd Qu.: 432.05 3rd Qu.: 486.08
## Max. :51162.11 Max. :45860.64 Max. :47750.03
## NA's :1 NA's :1 NA's :1
## intercept slope aoa
## Min. :-10.786 Min. :0.1349 Min. : 9.924
## 1st Qu.: -8.472 1st Qu.:0.3000 1st Qu.:22.454
## Median : -7.798 Median :0.3002 Median :24.947
## Mean : -7.637 Mean :0.3080 Mean :24.869
## 3rd Qu.: -6.939 3rd Qu.:0.3004 3rd Qu.:27.345
## Max. : -2.257 Max. :0.4561 Max. :34.830
##
dddc %>% mutate(CHILDES = log(CHILDES),
Montag = log(Montag),
books = log(books),
tv = log(tv),
adult_books = log(adult_books)) %>%
ggpairs(columns = c("CHILDES","Montag","books","tv","adult_books"),
ggplot2::aes(colour=lexical_class)) +
theme_classic()
#ggsave("CDIword_frequency_distro_comparisons.pdf", width=9, height=9)
There are significant contributions of children’s book frequency, children’s speech frequency, adult book frequency, and number of letters (children’s TV frequency is not significant).
lm1 <- lm(aoa ~ log(books) + log(speech) + log(tv) + log(adult_books) + Nletters, data=dddc)
summary(lm1)
##
## Call:
## lm(formula = aoa ~ log(books) + log(speech) + log(tv) + log(adult_books) +
## Nletters, data = dddc)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.4176 -2.2670 0.0587 2.1087 9.2265
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 25.75277 0.82483 31.222 < 2e-16 ***
## log(books) 1.43010 0.16661 8.584 < 2e-16 ***
## log(speech) -1.90259 0.16249 -11.709 < 2e-16 ***
## log(tv) -0.28121 0.18213 -1.544 0.123
## log(adult_books) 0.64032 0.11109 5.764 1.20e-08 ***
## Nletters 0.41300 0.08365 4.937 9.77e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.337 on 752 degrees of freedom
## Multiple R-squared: 0.3063, Adjusted R-squared: 0.3017
## F-statistic: 66.42 on 5 and 752 DF, p-value: < 2.2e-16
# R^2 = .31
car::vif(lm1) %>%
knitr::kable(digits = 3, format = "html", table.attr = "style='width:20%;'")
|                  |     x |
|:-----------------|------:|
| log(books)       | 6.474 |
| log(speech)      | 4.165 |
| log(tv)          | 5.967 |
| log(adult_books) | 5.474 |
| Nletters         | 1.412 |
But the VIFs are well above 1: the word frequency distributions are highly correlated, so we will use PCA to disentangle them.
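As a reminder of why PCA helps here (a toy sketch on simulated data, not part of the analysis): principal component scores are orthogonal by construction, so regressing on them removes the collinearity among the original predictors.
# Toy sketch: two highly correlated predictors become uncorrelated PC scores.
set.seed(1)
x1 <- rnorm(100)
x2 <- x1 + rnorm(100, sd = 0.1)          # nearly collinear with x1
round(cor(cbind(x1, x2)), 2)             # off-diagonal correlation ~0.99
round(cor(prcomp(cbind(x1, x2))$x), 2)   # off-diagonal correlation ~0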
load("data/Charlesworth-unlemmatized-counts.Rdata")
adult_dat <- adult_dat %>% filter(source != "books") %>%
  select(source, word, word_count_norm) %>%
  pivot_wider(id_cols = word, names_from = source, values_from = word_count_norm) %>%
  rename(`Adult Speech` = speech,
         `Adult TV` = TV)
dddc <- dddc %>%
  rename(`Adult Books` = adult_books, # Google Books
         `Child Books` = books,       # Facebook children's books (novels)
         `Child TV` = tv,
         `Child Speech` = speech) %>%
select(-CHILDES) %>% # less well-cleaned version
left_join(adult_dat)
## Joining, by = "word"
dddc <- dddc %>% replace_na(list(`Adult Speech` = 0.32,
`Adult TV` = 0.32))
We run PCA on all of the relevant (log) frequency distributions and try to understand the composition of the principal components.
pc_bsa <- prcomp(log(dddc[,c("Adult Books","Child Books",
"Adult Speech","Child Speech",
"Adult TV", "Child TV")])) # add Montag separately?
summary(pc_bsa)$importance %>%
knitr::kable(digits = 3, format = "html", table.attr = "style='width:50%;'")
|                        |   PC1 |   PC2 |   PC3 |   PC4 |   PC5 |   PC6 |
|:-----------------------|------:|------:|------:|------:|------:|------:|
| Standard deviation     | 5.008 | 1.030 | 0.934 | 0.799 | 0.565 | 0.431 |
| Proportion of Variance | 0.891 | 0.038 | 0.031 | 0.023 | 0.011 | 0.007 |
| Cumulative Proportion  | 0.891 | 0.928 | 0.959 | 0.982 | 0.993 | 1.000 |
The first component explains the bulk of the variance (89%), and the first four components capture 98% of the variance. What do these components look like? PC1 captures overall frequency, loading somewhat more heavily on the adult distributions. PC2 (3.8% of variance) differentiates adult books and adult TV from the other distributions, especially children’s speech. PC3 (3.1% of variance) primarily differentiates adult speech from everything else. PC4 (2.3% of variance) groups adult and child books together, contrasting them with the TV distributions. PC5 may capture a similarity between adult books and children’s speech, contrasted with children’s books.
#pc_bsa$rotation * -1
ddbsa <- cbind(dddc, data.frame(pc_bsa$x))
pc_bsa$rotation %>%
knitr::kable(digits = 2, format = "html", table.attr = "style='width:50%;'")
|              |   PC1 |   PC2 |   PC3 |   PC4 |   PC5 |   PC6 |
|:-------------|------:|------:|------:|------:|------:|------:|
| Adult Books  | -0.49 | -0.50 |  0.01 | -0.41 |  0.47 |  0.35 |
| Child Books  | -0.34 |  0.18 | -0.36 | -0.62 | -0.54 | -0.21 |
| Adult Speech | -0.51 |  0.35 |  0.77 |  0.06 | -0.13 | -0.02 |
| Child Speech | -0.26 |  0.61 | -0.36 |  0.08 |  0.61 | -0.22 |
| Adult TV     | -0.47 | -0.44 | -0.18 |  0.51 | -0.11 | -0.53 |
| Child TV     | -0.30 |  0.17 | -0.34 |  0.42 | -0.29 |  0.71 |
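A loading plot can make the interpretation above easier to see. A sketch (not among the original figures), assuming the tidyverse packages already loaded here:
# Sketch: visualize the loading of each corpus on each component.
pc_bsa$rotation %>%
  as.data.frame() %>%
  tibble::rownames_to_column("corpus") %>%
  pivot_longer(-corpus, names_to = "component", values_to = "loading") %>%
  ggplot(aes(x = corpus, y = loading)) +
  geom_col() + coord_flip() +
  facet_wrap(~component) + theme_classic()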
outlier_thresh = 5
p1 <- ggplot(ddbsa, aes(x = PC1, y = aoa, col = lexical_class)) +
geom_point() + geom_smooth(method = "lm") +
geom_text_repel(data = filter(ddbsa, PC1 < -outlier_thresh | PC1 > outlier_thresh), aes(label = word)) +
facet_wrap(~lexical_class) + theme_classic()
p2 <- ggplot(ddbsa, aes(x = PC2, y = aoa, col = lexical_class)) +
geom_point() + geom_smooth(method = "lm") +
geom_text_repel(data = filter(ddbsa, PC2 < -outlier_thresh | PC2 > outlier_thresh), aes(label = word)) +
facet_wrap(~lexical_class) + theme_classic()
p3 <- ggplot(ddbsa, aes(x = PC3, y = aoa, col = lexical_class)) +
geom_point() + geom_smooth(method = "lm") +
geom_text_repel(data = filter(ddbsa, PC3 < -outlier_thresh | PC3 > outlier_thresh), aes(label = word)) +
facet_wrap(~lexical_class) + theme_classic()
p4 <- ggplot(ddbsa, aes(x = PC4, y = aoa, col = lexical_class)) +
geom_point() + geom_smooth(method = "lm") +
geom_text_repel(data = filter(ddbsa, PC4 < -outlier_thresh | PC4 > outlier_thresh), aes(label = word)) +
facet_wrap(~lexical_class) + theme_classic()
ggarrange(p1, p2, p3, p4, nrow=2, ncol=2, common.legend = T)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
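The four panels above repeat the same plotting code; an equivalent, more compact version (a sketch, assuming ggpubr's `plotlist` argument and the rlang `.data` pronoun) would be:
# Sketch: one helper function instead of four near-identical blocks.
plot_pc <- function(pc) {
  ggplot(ddbsa, aes(x = .data[[pc]], y = aoa, col = lexical_class)) +
    geom_point() + geom_smooth(method = "lm") +
    geom_text_repel(data = filter(ddbsa, abs(.data[[pc]]) > outlier_thresh),
                    aes(label = word)) +
    facet_wrap(~lexical_class) + theme_classic()
}
ggarrange(plotlist = lapply(paste0("PC", 1:4), plot_pc),
          nrow = 2, ncol = 2, common.legend = TRUE)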
ggplot(ddbsa, aes(x = PC1, y = PC2, col = lexical_class)) +
geom_point() + theme_classic() +
facet_wrap(~lexical_class) +
geom_text_repel(data = filter(ddbsa, abs(PC1) > outlier_thresh | abs(PC2) > outlier_thresh),
                aes(label = word), max.overlaps = 10, size = 3)
ddbsa$lexical_class <- fct_relevel(ddbsa$lexical_class, "nouns")
summary(lm(aoa ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6, data=ddbsa))
##
## Call:
## lm(formula = aoa ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6, data = ddbsa)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.5754 -2.1133 0.0118 2.1840 11.4051
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 24.86911 0.12105 205.440 < 2e-16 ***
## PC1 -0.16798 0.02419 -6.945 8.23e-12 ***
## PC2 -1.09625 0.11759 -9.323 < 2e-16 ***
## PC3 0.90949 0.12964 7.016 5.12e-12 ***
## PC4 -1.31503 0.15170 -8.669 < 2e-16 ***
## PC5 -1.85318 0.21427 -8.649 < 2e-16 ***
## PC6 -0.32835 0.28073 -1.170 0.243
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.333 on 751 degrees of freedom
## Multiple R-squared: 0.3089, Adjusted R-squared: 0.3034
## F-statistic: 55.95 on 6 and 751 DF, p-value: < 2.2e-16
Now we add lexical class and its interactions with the first five principal components.
lexclass_mod <- lm(aoa ~ (PC1 + PC2 + PC3 + PC4 + PC5) * lexical_class, data=ddbsa)
summary(lexclass_mod)
##
## Call:
## lm(formula = aoa ~ (PC1 + PC2 + PC3 + PC4 + PC5) * lexical_class,
## data = ddbsa)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.3638 -1.5602 0.0462 1.5482 10.0344
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.694540 0.190735 118.984 < 2e-16 ***
## PC1 0.349831 0.043038 8.128 1.87e-15 ***
## PC2 -1.576619 0.173537 -9.085 < 2e-16 ***
## PC3 1.022259 0.157129 6.506 1.44e-10 ***
## PC4 -0.900851 0.187710 -4.799 1.93e-06 ***
## PC5 -1.339357 0.256766 -5.216 2.38e-07 ***
## lexical_classadjectives 2.681119 0.452705 5.922 4.89e-09 ***
## lexical_classfunction_words 6.908462 0.671819 10.283 < 2e-16 ***
## lexical_classother 1.415323 0.354467 3.993 7.19e-05 ***
## lexical_classverbs 3.037481 0.374853 8.103 2.26e-15 ***
## PC1:lexical_classadjectives -0.328140 0.113091 -2.902 0.00383 **
## PC1:lexical_classfunction_words -0.240471 0.083411 -2.883 0.00406 **
## PC1:lexical_classother -0.310925 0.076903 -4.043 5.84e-05 ***
## PC1:lexical_classverbs -0.213629 0.097817 -2.184 0.02928 *
## PC2:lexical_classadjectives 0.465898 0.462751 1.007 0.31436
## PC2:lexical_classfunction_words 0.501814 0.258804 1.939 0.05289 .
## PC2:lexical_classother -1.132418 0.347333 -3.260 0.00116 **
## PC2:lexical_classverbs 1.266526 0.435137 2.911 0.00372 **
## PC3:lexical_classadjectives -0.777828 0.546617 -1.423 0.15517
## PC3:lexical_classfunction_words -0.589557 0.325427 -1.812 0.07045 .
## PC3:lexical_classother 0.304570 0.329083 0.926 0.35501
## PC3:lexical_classverbs 0.126851 0.367843 0.345 0.73031
## PC4:lexical_classadjectives 0.004856 0.677785 0.007 0.99429
## PC4:lexical_classfunction_words -0.220476 0.388603 -0.567 0.57065
## PC4:lexical_classother 0.483371 0.389369 1.241 0.21485
## PC4:lexical_classverbs 0.428928 0.471078 0.911 0.36285
## PC5:lexical_classadjectives 0.332782 0.678095 0.491 0.62374
## PC5:lexical_classfunction_words 0.768651 0.601547 1.278 0.20173
## PC5:lexical_classother -0.685181 0.519351 -1.319 0.18748
## PC5:lexical_classverbs -0.184790 0.628403 -0.294 0.76879
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.731 on 728 degrees of freedom
## Multiple R-squared: 0.5502, Adjusted R-squared: 0.5322
## F-statistic: 30.7 on 29 and 728 DF, p-value: < 2.2e-16
ddbsa$hatvals <- hatvalues(lexclass_mod)
arrange(ddbsa,desc(hatvals)) %>%
ungroup() %>%
select(word, lexical_class, aoa, starts_with("PC"), hatvals) %>%
slice(1:20) %>%
knitr::kable(digits = 2, format = "html", table.attr = "style='width:80%;'")
| word         | lexical_class  |   aoa |   PC1 |   PC2 |   PC3 |   PC4 |   PC5 |   PC6 | hatvals |
|:-------------|:---------------|------:|------:|------:|------:|------:|------:|------:|--------:|
| don’t        | function_words | 25.25 |  1.71 |  9.29 |  3.61 | -2.40 | -1.40 | -2.71 |    0.60 |
| yucky        | adjectives     | 21.88 |  8.55 |  2.92 |  0.80 |  0.00 | -0.61 | -0.34 |    0.39 |
| was          | function_words | 31.80 | -7.95 |  1.70 |  1.52 | -4.81 |  1.05 | -0.19 |    0.37 |
| lemme/let me | function_words | 26.49 |  8.02 |  4.19 | -2.51 | -1.88 | -0.38 | -1.65 |    0.32 |
| gotta        | function_words | 30.95 |  2.90 |  0.31 | -4.40 |  1.15 | -0.49 | -0.35 |    0.30 |
| hafta        | function_words | 29.79 |  8.22 |  3.36 | -2.17 | -2.26 | -1.60 | -1.35 |    0.29 |
| cute         | adjectives     | 26.88 |  2.21 |  1.21 |  1.38 |  1.79 |  0.10 | -0.33 |    0.22 |
| does         | function_words | 31.32 | -3.79 |  1.36 |  1.23 | -2.23 |  1.65 |  0.08 |    0.21 |
| tickle       | verbs          | 22.83 |  7.25 |  1.37 | -1.63 | -0.28 |  0.82 |  0.25 |    0.21 |
| smile        | verbs          | 27.26 |  0.29 | -2.50 | -2.39 |  0.41 | -1.18 | -0.02 |    0.20 |
| naughty      | adjectives     | 30.39 |  6.29 | -0.63 | -1.56 | -1.06 | -0.66 | -0.39 |    0.19 |
| windy        | adjectives     | 27.43 |  5.08 | -0.09 |  1.65 | -0.46 | -0.50 | -0.42 |    0.18 |
| camping      | other          | 31.58 |  3.02 |  0.68 |  3.94 | -0.08 | -0.51 |  0.20 |    0.18 |
| little       | adjectives     | 25.17 | -7.46 |  1.48 | -0.45 | -0.63 | -0.19 | -0.41 |    0.18 |
| lot          | function_words | 29.35 | -4.44 |  0.52 |  2.52 |  1.28 | -0.12 | -0.67 |    0.16 |
| mommy        | other          |  9.92 |  3.79 |  2.25 | -1.14 |  1.26 |  2.13 | -0.58 |    0.16 |
| thirsty      | adjectives     | 26.19 |  6.25 | -0.09 | -1.63 | -1.18 |  0.40 |  0.42 |    0.16 |
| noisy        | adjectives     | 28.24 |  4.32 | -0.28 |  1.29 | -0.85 |  0.28 | -0.08 |    0.15 |
| poor         | adjectives     | 32.67 | -1.60 | -0.47 |  0.02 | -2.07 | -0.43 |  0.15 |    0.15 |
| bye          | other          | 13.03 | -0.23 |  2.27 |  2.37 |  0.40 | -0.28 | -0.09 |    0.15 |
Next, we use mother’s education to predict children’s vocabulary.
load("data/en_ws_production.Rdata")
samp <- d_demo %>% filter(!is.na(mom_ed)) # 2773
Planned models:
produces ~ book_freq * mom_ed + speech_freq * mom_ed + (1|item) + (1|child)
produces ~ lexical_class * book_freq * mom_ed + lexical_class * speech_freq * mom_ed + (1|item) + (1|child)
d_samp <- d_en_ws %>% filter(is.element(data_id, samp$data_id)) %>%
left_join(samp %>% select(data_id, age, production, mom_ed, ses_group)) %>% # vocab? booky_production
left_join(ddd %>% select(definition, lexical_class, books, tv, speech, #prop_booky,
adult_books))
## Joining, by = "data_id"
## Adding missing grouping variables: `word`, `on_cdi`
## Joining, by = c("definition", "lexical_class")
d_samp <- d_samp %>% mutate(age_sc = scale(age, center=T, scale=F),
books_sc = scale(log(books)),
tv_sc = scale(log(tv)),
speech_sc = scale(log(speech)),
#prop_booky_sc = scale(log(prop_booky)),
mom_ed_sc = scale(as.numeric(mom_ed), center=T, scale=F))
TODO: the models below need to be re-fit (see the convergence warning noted in the comments).
m1 <- glmer(produces ~ age_sc + books_sc * mom_ed_sc + speech_sc * mom_ed_sc + (1|item_id) + (1|data_id), family=binomial, data=d_samp)
m2 <- glmer(produces ~ age_sc + lexical_class * books_sc * mom_ed_sc + lexical_class * speech_sc * mom_ed_sc + (1|item_id) + (1|data_id), family=binomial, data=d_samp)
#Model failed to converge with max|grad| = 0.0255532 (tol = 0.002, component 1)Model is nearly unidentifiable: large eigenvalue ratio
# - Rescale variables?
save(m1, m2, file="data/model_fits_scaled.Rdata")
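One route for the re-fit flagged above (a sketch, not yet run here): switch lme4's optimizer and raise its evaluation limit, which often resolves this kind of convergence warning; centering and rescaling the predictors further, as the warning itself suggests, is another option.
# Sketch: refit m2 with the bobyqa optimizer and a higher iteration cap.
m2_refit <- update(m2, control = glmerControl(optimizer = "bobyqa",
                                              optCtrl = list(maxfun = 2e5)))
summary(m2_refit)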