Load Data

First we load in the frequency data from several corpora preprocessed in CDIorigins.Rmd. The frequencies are already normalized to counts per million tokens.

load("data/merged_word_freqs.Rdata")

aoas <- readRDS("data/english_(american)_aoa_bydefinition.rds") %>%
  filter(measure=="produces")
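For reference, the counts-per-million normalization happens upstream in CDIorigins.Rmd; a minimal sketch of the idea (`count` and `total_tokens` here are illustrative, not columns in the loaded data):

# Sketch only: a per-million frequency is the raw count divided by the
# corpus's total token count, times one million.
per_million <- function(count, total_tokens) count / total_tokens * 1e6
per_million(50, 2e6)  # a word seen 50 times in a 2M-token corpus -> 25 per million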

Word frequency table

Used to pick compelling examples.

Prepare data

# Montag corpus
dd <- ch_freq_smooth %>% filter(on_cdi==1) %>% 
  select(-ch_book_vs_speech) 

# Charlesworth (FB children's books)
dd_novels <- ch_freq_smooth_charles %>% filter(on_cdi==1) %>% 
  select(-ch_book_vs_speech) 

ddd <- dd %>% select(-prop_booky) %>%
  left_join(dd_novels %>% select(-prop_booky)) %>%
  mutate(Nletters = ifelse(is.na(Nletters), nchar(word), Nletters)) %>%
  left_join(aoas %>% select(-measure))
## Joining, by = c("word", "definition", "on_cdi", "Nletters", "lexical_class", "adult_books")
## Joining, by = "definition"
# Fill missing frequencies with floor values (roughly the minimum/smoothed frequency in each distribution)
dddc <- ddd %>% mutate(adult_books = ifelse(is.na(adult_books), 0.07322, adult_books),
                       speech = ifelse(is.na(speech), 10, speech),
                       books = ifelse(is.na(books), 10, books),
                       tv = ifelse(is.na(tv), 10, tv))

summary(ddd)
##      word            definition            on_cdi     CHILDES        
##  Length:758         Length:758         Min.   :1   Min.   :   10.00  
##  Class :character   Class :character   1st Qu.:1   1st Qu.:   73.37  
##  Mode  :character   Mode  :character   Median :1   Median :  163.35  
##                                        Mean   :1   Mean   :  948.43  
##                                        3rd Qu.:1   3rd Qu.:  485.15  
##                                        Max.   :1   Max.   :44512.28  
##                                                                      
##      Montag            Nletters      lexical_class       adult_books      
##  Min.   :   10.00   Min.   : 1.000   Length:758         Min.   :    0.07  
##  1st Qu.:   52.95   1st Qu.: 4.000   Class :character   1st Qu.:    7.12  
##  Median :  138.86   Median : 5.000   Mode  :character   Median :   31.31  
##  Mean   :  821.61   Mean   : 4.892                      Mean   :  663.56  
##  3rd Qu.:  482.50   3rd Qu.: 6.000                      3rd Qu.:  138.27  
##  Max.   :51941.50   Max.   :14.000                      Max.   :71382.55  
##                                                         NA's   :14        
##      books                tv               speech           language        
##  Min.   :   10.00   Min.   :   10.00   Min.   :   10.18   Length:758        
##  1st Qu.:   20.31   1st Qu.:   46.24   1st Qu.:   76.23   Class :character  
##  Median :   79.25   Median :  127.46   Median :  173.94   Mode  :character  
##  Mean   :  788.44   Mean   :  748.65   Mean   :  956.18                     
##  3rd Qu.:  290.57   3rd Qu.:  432.05   3rd Qu.:  486.08                     
##  Max.   :51162.11   Max.   :45860.64   Max.   :47750.03                     
##  NA's   :1          NA's   :1          NA's   :1                            
##    intercept           slope             aoa        
##  Min.   :-10.786   Min.   :0.1349   Min.   : 9.924  
##  1st Qu.: -8.472   1st Qu.:0.3000   1st Qu.:22.454  
##  Median : -7.798   Median :0.3002   Median :24.947  
##  Mean   : -7.637   Mean   :0.3080   Mean   :24.869  
##  3rd Qu.: -6.939   3rd Qu.:0.3004   3rd Qu.:27.345  
##  Max.   : -2.257   Max.   :0.4561   Max.   :34.830  
## 

Correlations between word frequency distributions

dddc %>% mutate(CHILDES = log(CHILDES),
               Montag = log(Montag),
               books = log(books),
               tv = log(tv),
               adult_books = log(adult_books)) %>%
  ggpairs(columns = c("CHILDES","Montag","books","tv","adult_books"),
        ggplot2::aes(colour=lexical_class)) + 
  theme_classic()

#ggsave("CDIword_frequency_distro_comparisons.pdf", width=9, height=9)

Basic AoA regression

Significant contributions of children’s book frequency, children’s speech frequency, adult book frequency, and number of letters; children’s TV frequency is not a significant predictor.

lm1 <- lm(aoa ~ log(books) + log(speech) + log(tv) + log(adult_books) + Nletters, data=dddc)
summary(lm1) 
## 
## Call:
## lm(formula = aoa ~ log(books) + log(speech) + log(tv) + log(adult_books) + 
##     Nletters, data = dddc)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.4176 -2.2670  0.0587  2.1087  9.2265 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      25.75277    0.82483  31.222  < 2e-16 ***
## log(books)        1.43010    0.16661   8.584  < 2e-16 ***
## log(speech)      -1.90259    0.16249 -11.709  < 2e-16 ***
## log(tv)          -0.28121    0.18213  -1.544    0.123    
## log(adult_books)  0.64032    0.11109   5.764 1.20e-08 ***
## Nletters          0.41300    0.08365   4.937 9.77e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.337 on 752 degrees of freedom
## Multiple R-squared:  0.3063, Adjusted R-squared:  0.3017 
## F-statistic: 66.42 on 5 and 752 DF,  p-value: < 2.2e-16
# R^2 = .31
car::vif(lm1) %>%
  knitr::kable(digits = 3, format = "html", table.attr = "style='width:20%;'")
VIF
log(books) 6.474
log(speech) 4.165
log(tv) 5.967
log(adult_books) 5.474
Nletters 1.412

But the VIFs are >> 1: the word frequency distributions are highly correlated, so we will use PCA to disentangle them.
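As a quick check on what these numbers mean (a side check, not part of the main analysis): each VIF is 1/(1 − R²) from regressing that predictor on the other predictors, so the VIF of ~6.5 for log(books) corresponds to an R² of about .85.

# VIF for log(books) by hand: regress it on the other predictors in lm1;
# 1 / (1 - R^2) should reproduce car::vif(lm1)["log(books)"].
aux <- lm(log(books) ~ log(speech) + log(tv) + log(adult_books) + Nletters, data = dddc)
1 / (1 - summary(aux)$r.squared)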

PCA

load("data/Charlesworth-unlemmatized-counts.Rdata") 

adult_dat <- adult_dat %>% filter(source!="books") %>%
  select(source, word, word_count_norm) %>%
  pivot_wider(id_cols = word, names_from = source, values_from = word_count_norm) %>%
  rename(`Adult Speech` = speech,
         `Adult TV` = TV)

dddc <- dddc %>% 
  rename(`Adult Books` = adult_books, # Google Books
         `Child Books` = books, # Facebook children's books (novels)
         `Child TV` = tv,
         `Child Speech` = speech) %>% 
  select(-CHILDES) %>% # less well-cleaned version
  left_join(adult_dat)
## Joining, by = "word"
# Fill missing adult speech/TV frequencies with a small floor value
dddc <- dddc %>% replace_na(list(`Adult Speech` = 0.32,
                                 `Adult TV` = 0.32))

We will use all relevant frequency distributions and try to understand the composition of the principal components.

# PCA on the log frequencies (prcomp centers, but does not scale, by default)
pc_bsa <- prcomp(log(dddc[,c("Adult Books","Child Books",
                             "Adult Speech","Child Speech",
                             "Adult TV", "Child TV")])) # add Montag separately?
summary(pc_bsa)$importance %>%
  knitr::kable(digits = 3, format = "html", table.attr = "style='width:50%;'")
PC1 PC2 PC3 PC4 PC5 PC6
Standard deviation 5.008 1.030 0.934 0.799 0.565 0.431
Proportion of Variance 0.891 0.038 0.031 0.023 0.011 0.007
Cumulative Proportion 0.891 0.928 0.959 0.982 0.993 1.000

The first component explains the bulk of the variance (89%), and the first four components capture 98% of it. What do these components look like? PC1 captures overall frequency, but loads somewhat more heavily on the adult distributions. PC2 (3.8% of variance) differentiates adult books and TV from the rest, especially from children’s speech. PC3 (3.1% of variance) differentiates adult speech from everything else. PC4 (2.3% of variance) groups child and adult books together, contrasting them with TV. PC5 (1.1% of variance) perhaps reflects adult book / child speech similarity.

#pc_bsa$rotation  * -1
ddbsa <- cbind(dddc, data.frame(pc_bsa$x))
pc_bsa$rotation %>% 
  knitr::kable(digits = 2, format = "html", table.attr = "style='width:50%;'")
PC1 PC2 PC3 PC4 PC5 PC6
Adult Books -0.49 -0.50 0.01 -0.41 0.47 0.35
Child Books -0.34 0.18 -0.36 -0.62 -0.54 -0.21
Adult Speech -0.51 0.35 0.77 0.06 -0.13 -0.02
Child Speech -0.26 0.61 -0.36 0.08 0.61 -0.22
Adult TV -0.47 -0.44 -0.18 0.51 -0.11 -0.53
Child TV -0.30 0.17 -0.34 0.42 -0.29 0.71
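As a sanity check on how to read these loadings (a side check, not part of the pipeline): each word's component scores are just its centered log frequencies projected onto the loading vectors.

# Check: scores = centered log frequencies %*% rotation
# (prcomp centers, but does not scale, by default)
freq_cols <- c("Adult Books","Child Books","Adult Speech","Child Speech","Adult TV","Child TV")
x_centered <- sweep(log(as.matrix(dddc[, freq_cols])), 2, pc_bsa$center)
max(abs(x_centered %*% pc_bsa$rotation - pc_bsa$x))  # ~0, up to floating point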
outlier_thresh = 5

p1 <- ggplot(ddbsa, aes(x = PC1, y = aoa, col = lexical_class)) + 
  geom_point() + geom_smooth(method = "lm") + 
  geom_text_repel(data = filter(ddbsa, PC1 < -outlier_thresh | PC1 > outlier_thresh), aes(label = word)) + 
  facet_wrap(~lexical_class) + theme_classic()

p2 <- ggplot(ddbsa, aes(x = PC2, y = aoa, col = lexical_class)) + 
  geom_point() + geom_smooth(method = "lm") + 
  geom_text_repel(data = filter(ddbsa, PC2 < -outlier_thresh | PC2 > outlier_thresh), aes(label = word)) + 
  facet_wrap(~lexical_class) + theme_classic()

p3 <- ggplot(ddbsa, aes(x = PC3, y = aoa, col = lexical_class)) + 
  geom_point() + geom_smooth(method = "lm") + 
  geom_text_repel(data = filter(ddbsa, PC3 < -outlier_thresh | PC3 > outlier_thresh), aes(label = word)) + 
  facet_wrap(~lexical_class) + theme_classic()

p4 <- ggplot(ddbsa, aes(x = PC4, y = aoa, col = lexical_class)) + 
  geom_point() + geom_smooth(method = "lm") + 
  geom_text_repel(data = filter(ddbsa, PC4 < -outlier_thresh | PC4 > outlier_thresh), aes(label = word)) + 
  facet_wrap(~lexical_class) + theme_classic()

ggarrange(p1, p2, p3, p4, nrow=2, ncol=2, common.legend = T)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

ggplot(ddbsa, aes(x = PC1, y = PC2, col = lexical_class)) + 
  geom_point() + theme_classic() +
  facet_wrap(~lexical_class) + 
  geom_text_repel(data = filter(ddbsa, abs(PC1) > outlier_thresh | abs(PC2) > outlier_thresh), 
                  aes(label = word), max.overlaps = 10, size = 3)

Regression on PC scores

ddbsa$lexical_class <- fct_relevel(ddbsa$lexical_class, "nouns")
summary(lm(aoa ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6, data=ddbsa)) 
## 
## Call:
## lm(formula = aoa ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6, data = ddbsa)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.5754  -2.1133   0.0118   2.1840  11.4051 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 24.86911    0.12105 205.440  < 2e-16 ***
## PC1         -0.16798    0.02419  -6.945 8.23e-12 ***
## PC2         -1.09625    0.11759  -9.323  < 2e-16 ***
## PC3          0.90949    0.12964   7.016 5.12e-12 ***
## PC4         -1.31503    0.15170  -8.669  < 2e-16 ***
## PC5         -1.85318    0.21427  -8.649  < 2e-16 ***
## PC6         -0.32835    0.28073  -1.170    0.243    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.333 on 751 degrees of freedom
## Multiple R-squared:  0.3089, Adjusted R-squared:  0.3034 
## F-statistic: 55.95 on 6 and 751 DF,  p-value: < 2.2e-16

Now with lexical class.

lexclass_mod <- lm(aoa ~ (PC1 + PC2 + PC3 + PC4 + PC5) * lexical_class, data=ddbsa)
summary(lexclass_mod)
## 
## Call:
## lm(formula = aoa ~ (PC1 + PC2 + PC3 + PC4 + PC5) * lexical_class, 
##     data = ddbsa)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.3638  -1.5602   0.0462   1.5482  10.0344 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     22.694540   0.190735 118.984  < 2e-16 ***
## PC1                              0.349831   0.043038   8.128 1.87e-15 ***
## PC2                             -1.576619   0.173537  -9.085  < 2e-16 ***
## PC3                              1.022259   0.157129   6.506 1.44e-10 ***
## PC4                             -0.900851   0.187710  -4.799 1.93e-06 ***
## PC5                             -1.339357   0.256766  -5.216 2.38e-07 ***
## lexical_classadjectives          2.681119   0.452705   5.922 4.89e-09 ***
## lexical_classfunction_words      6.908462   0.671819  10.283  < 2e-16 ***
## lexical_classother               1.415323   0.354467   3.993 7.19e-05 ***
## lexical_classverbs               3.037481   0.374853   8.103 2.26e-15 ***
## PC1:lexical_classadjectives     -0.328140   0.113091  -2.902  0.00383 ** 
## PC1:lexical_classfunction_words -0.240471   0.083411  -2.883  0.00406 ** 
## PC1:lexical_classother          -0.310925   0.076903  -4.043 5.84e-05 ***
## PC1:lexical_classverbs          -0.213629   0.097817  -2.184  0.02928 *  
## PC2:lexical_classadjectives      0.465898   0.462751   1.007  0.31436    
## PC2:lexical_classfunction_words  0.501814   0.258804   1.939  0.05289 .  
## PC2:lexical_classother          -1.132418   0.347333  -3.260  0.00116 ** 
## PC2:lexical_classverbs           1.266526   0.435137   2.911  0.00372 ** 
## PC3:lexical_classadjectives     -0.777828   0.546617  -1.423  0.15517    
## PC3:lexical_classfunction_words -0.589557   0.325427  -1.812  0.07045 .  
## PC3:lexical_classother           0.304570   0.329083   0.926  0.35501    
## PC3:lexical_classverbs           0.126851   0.367843   0.345  0.73031    
## PC4:lexical_classadjectives      0.004856   0.677785   0.007  0.99429    
## PC4:lexical_classfunction_words -0.220476   0.388603  -0.567  0.57065    
## PC4:lexical_classother           0.483371   0.389369   1.241  0.21485    
## PC4:lexical_classverbs           0.428928   0.471078   0.911  0.36285    
## PC5:lexical_classadjectives      0.332782   0.678095   0.491  0.62374    
## PC5:lexical_classfunction_words  0.768651   0.601547   1.278  0.20173    
## PC5:lexical_classother          -0.685181   0.519351  -1.319  0.18748    
## PC5:lexical_classverbs          -0.184790   0.628403  -0.294  0.76879    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.731 on 728 degrees of freedom
## Multiple R-squared:  0.5502, Adjusted R-squared:  0.5322 
## F-statistic:  30.7 on 29 and 728 DF,  p-value: < 2.2e-16
ddbsa$hatvals <- hatvalues(lexclass_mod) # leverage of each word in the lexical-class model

arrange(ddbsa,desc(hatvals)) %>%
  ungroup() %>%
  select(word, lexical_class, aoa, starts_with("PC"), hatvals) %>%
  slice(1:20) %>%
  knitr::kable(digits = 2, format = "html", table.attr = "style='width:80%;'")
word lexical_class aoa PC1 PC2 PC3 PC4 PC5 PC6 hatvals
don’t function_words 25.25 1.71 9.29 3.61 -2.40 -1.40 -2.71 0.60
yucky adjectives 21.88 8.55 2.92 0.80 0.00 -0.61 -0.34 0.39
was function_words 31.80 -7.95 1.70 1.52 -4.81 1.05 -0.19 0.37
lemme/let me function_words 26.49 8.02 4.19 -2.51 -1.88 -0.38 -1.65 0.32
gotta function_words 30.95 2.90 0.31 -4.40 1.15 -0.49 -0.35 0.30
hafta function_words 29.79 8.22 3.36 -2.17 -2.26 -1.60 -1.35 0.29
cute adjectives 26.88 2.21 1.21 1.38 1.79 0.10 -0.33 0.22
does function_words 31.32 -3.79 1.36 1.23 -2.23 1.65 0.08 0.21
tickle verbs 22.83 7.25 1.37 -1.63 -0.28 0.82 0.25 0.21
smile verbs 27.26 0.29 -2.50 -2.39 0.41 -1.18 -0.02 0.20
naughty adjectives 30.39 6.29 -0.63 -1.56 -1.06 -0.66 -0.39 0.19
windy adjectives 27.43 5.08 -0.09 1.65 -0.46 -0.50 -0.42 0.18
camping other 31.58 3.02 0.68 3.94 -0.08 -0.51 0.20 0.18
little adjectives 25.17 -7.46 1.48 -0.45 -0.63 -0.19 -0.41 0.18
lot function_words 29.35 -4.44 0.52 2.52 1.28 -0.12 -0.67 0.16
mommy other 9.92 3.79 2.25 -1.14 1.26 2.13 -0.58 0.16
thirsty adjectives 26.19 6.25 -0.09 -1.63 -1.18 0.40 0.42 0.16
noisy adjectives 28.24 4.32 -0.28 1.29 -0.85 0.28 -0.08 0.15
poor adjectives 32.67 -1.60 -0.47 0.02 -2.07 -0.43 0.15 0.15
bye other 13.03 -0.23 2.27 2.37 0.40 -0.28 -0.09 0.15

Item- and child-level data

We use mother’s education to predict children’s production of individual words.

load("data/en_ws_production.Rdata")
samp <- d_demo %>% filter(!is.na(mom_ed)) # 2773

produces ~ book_freq * mom_ed + speech_freq * mom_ed + (1|item) + (1|child)
produces ~ lexical_class * book_freq * mom_ed + lexical_class * speech_freq * mom_ed + (1|item) + (1|child)

d_samp <- d_en_ws %>% filter(is.element(data_id, samp$data_id)) %>%
  left_join(samp %>% select(data_id, age, production, mom_ed, ses_group)) %>% # vocab? booky_production
  left_join(ddd %>% select(definition, lexical_class, books, tv, speech, #prop_booky, 
                           adult_books))
## Joining, by = "data_id"
## Adding missing grouping variables: `word`, `on_cdi`
## Joining, by = c("definition", "lexical_class")
d_samp <- d_samp %>% mutate(age_sc = scale(age, center=T, scale=F), 
                            books_sc = scale(log(books)),
                            tv_sc = scale(log(tv)),
                            speech_sc = scale(log(speech)),
                            #prop_booky_sc = scale(log(prop_booky)),
                            mom_ed_sc = scale(as.numeric(mom_ed), center=T, scale=F))

TODO: need to re-fit these models; one possible approach is sketched at the end of this section.

m1 <- glmer(produces ~ age_sc + books_sc * mom_ed_sc + speech_sc * mom_ed_sc + (1|item_id) + (1|data_id), family=binomial, data=d_samp)

m2 <- glmer(produces ~ age_sc + lexical_class * books_sc * mom_ed_sc + lexical_class * speech_sc * mom_ed_sc + (1|item_id) + (1|data_id), family=binomial, data=d_samp)
# m2 warning: "Model failed to converge with max|grad| = 0.0255532 (tol = 0.002, component 1)"
# and "Model is nearly unidentifiable: large eigenvalue ratio - Rescale variables?"

save(m1, m2, file="data/model_fits_scaled.Rdata")
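One possible way to address the m2 convergence warning noted above (a sketch, not yet run or checked here): refit with the bobyqa optimizer and a larger iteration budget, which often clears this kind of lme4 warning.

# Possible re-fit for m2 (same formula and data as above); not run.
m2_refit <- update(m2, control = glmerControl(optimizer = "bobyqa",
                                              optCtrl = list(maxfun = 2e5)))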