Load Data

First we load in the frequency data from several corpora preprocessed in CDIorigins.Rmd. The frequencies are already normalized to counts per million tokens.

load("data/merged_word_freqs.Rdata")

aoas <- readRDS("data/english_(american)_aoa_bydefinition.rds") %>%
  filter(measure=="produces")
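For reference, the counts-per-million normalization happens upstream in CDIorigins.Rmd; a minimal sketch of the idea (`count` and `total_tokens` here are illustrative, not columns in the loaded data):

# Sketch only: a per-million frequency is the raw count divided by the
# corpus's total token count, times one million.
per_million <- function(count, total_tokens) count / total_tokens * 1e6
per_million(50, 2e6)  # a word seen 50 times in a 2M-token corpus -> 25 per million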

Word frequency table

Used to pick compelling examples.

Prepare data

# Montag corpus
dd <- ch_freq_smooth %>% filter(on_cdi==1) %>% 
  select(-ch_book_vs_speech) 

# Charlesworth (FB children's books)
dd_novels <- ch_freq_smooth_charles %>% filter(on_cdi==1) %>% 
  select(-ch_book_vs_speech) 

ddd <- dd %>% select(-prop_booky) %>%
  left_join(dd_novels %>% select(-prop_booky)) %>%
  mutate(Nletters = ifelse(is.na(Nletters), nchar(word), Nletters)) %>%
  left_join(aoas %>% select(-measure))
## Joining, by = c("word", "definition", "on_cdi", "Nletters", "lexical_class", "adult_books")
## Joining, by = "definition"
# Fill missing frequencies with floor values (roughly the minimum/smoothed frequency in each distribution)
dddc <- ddd %>% mutate(adult_books = ifelse(is.na(adult_books), 0.07322, adult_books),
                       speech = ifelse(is.na(speech), 10, speech),
                       books = ifelse(is.na(books), 10, books),
                       tv = ifelse(is.na(tv), 10, tv))

summary(ddd)
##      word            definition            on_cdi     CHILDES        
##  Length:758         Length:758         Min.   :1   Min.   :   10.00  
##  Class :character   Class :character   1st Qu.:1   1st Qu.:   73.37  
##  Mode  :character   Mode  :character   Median :1   Median :  163.35  
##                                        Mean   :1   Mean   :  948.43  
##                                        3rd Qu.:1   3rd Qu.:  485.15  
##                                        Max.   :1   Max.   :44512.28  
##                                                                      
##      Montag            Nletters      lexical_class       adult_books      
##  Min.   :   10.00   Min.   : 1.000   Length:758         Min.   :    0.07  
##  1st Qu.:   52.95   1st Qu.: 4.000   Class :character   1st Qu.:    7.12  
##  Median :  138.86   Median : 5.000   Mode  :character   Median :   31.31  
##  Mean   :  821.61   Mean   : 4.892                      Mean   :  663.56  
##  3rd Qu.:  482.50   3rd Qu.: 6.000                      3rd Qu.:  138.27  
##  Max.   :51941.50   Max.   :14.000                      Max.   :71382.55  
##                                                         NA's   :14        
##      books                tv               speech           language        
##  Min.   :   10.00   Min.   :   10.00   Min.   :   10.18   Length:758        
##  1st Qu.:   20.31   1st Qu.:   46.24   1st Qu.:   76.23   Class :character  
##  Median :   79.25   Median :  127.46   Median :  173.94   Mode  :character  
##  Mean   :  788.44   Mean   :  748.65   Mean   :  956.18                     
##  3rd Qu.:  290.57   3rd Qu.:  432.05   3rd Qu.:  486.08                     
##  Max.   :51162.11   Max.   :45860.64   Max.   :47750.03                     
##  NA's   :1          NA's   :1          NA's   :1                            
##    intercept           slope             aoa        
##  Min.   :-10.786   Min.   :0.1349   Min.   : 9.924  
##  1st Qu.: -8.472   1st Qu.:0.3000   1st Qu.:22.454  
##  Median : -7.798   Median :0.3002   Median :24.947  
##  Mean   : -7.637   Mean   :0.3080   Mean   :24.869  
##  3rd Qu.: -6.939   3rd Qu.:0.3004   3rd Qu.:27.345  
##  Max.   : -2.257   Max.   :0.4561   Max.   :34.830  
## 

Correlations between word frequency distributions

dddc %>% mutate(CHILDES = log(CHILDES),
               Montag = log(Montag),
               books = log(books),
               tv = log(tv),
               adult_books = log(adult_books)) %>%
  ggpairs(columns = c("CHILDES","Montag","books","tv","adult_books"),
        ggplot2::aes(colour=lexical_class)) + 
  theme_classic()

#ggsave("CDIword_frequency_distro_comparisons.pdf", width=9, height=9)

Basic AoA regression

Significant contributions of children’s book frequency, children’s speech frequency, adult book frequency, and number of letters; children’s TV frequency is not a significant predictor.

lm1 <- lm(aoa ~ log(books) + log(speech) + log(tv) + log(adult_books) + Nletters, data=dddc)
summary(lm1) 
## 
## Call:
## lm(formula = aoa ~ log(books) + log(speech) + log(tv) + log(adult_books) + 
##     Nletters, data = dddc)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.4176 -2.2670  0.0587  2.1087  9.2265 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      25.75277    0.82483  31.222  < 2e-16 ***
## log(books)        1.43010    0.16661   8.584  < 2e-16 ***
## log(speech)      -1.90259    0.16249 -11.709  < 2e-16 ***
## log(tv)          -0.28121    0.18213  -1.544    0.123    
## log(adult_books)  0.64032    0.11109   5.764 1.20e-08 ***
## Nletters          0.41300    0.08365   4.937 9.77e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.337 on 752 degrees of freedom
## Multiple R-squared:  0.3063, Adjusted R-squared:  0.3017 
## F-statistic: 66.42 on 5 and 752 DF,  p-value: < 2.2e-16
# R^2 = .31
car::vif(lm1) %>%
  knitr::kable(digits = 3, format = "html", table.attr = "style='width:20%;'")
VIF
log(books) 6.474
log(speech) 4.165
log(tv) 5.967
log(adult_books) 5.474
Nletters 1.412

But the VIFs are >> 1: the word frequency distributions are highly correlated, so we will use PCA to disentangle them.
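As a quick check on what these numbers mean (a side check, not part of the main analysis): each VIF is 1/(1 − R²) from regressing that predictor on the other predictors, so the VIF of ~6.5 for log(books) corresponds to an R² of about .85.

# VIF for log(books) by hand: regress it on the other predictors in lm1;
# 1 / (1 - R^2) should reproduce car::vif(lm1)["log(books)"].
aux <- lm(log(books) ~ log(speech) + log(tv) + log(adult_books) + Nletters, data = dddc)
1 / (1 - summary(aux)$r.squared)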

PCA

load("data/Charlesworth-unlemmatized-counts.Rdata") 

adult_dat <- adult_dat %>% filter(source!="books") %>%
  select(source, word, word_count_norm) %>%
  pivot_wider(id_cols = word, names_from = source, values_from = word_count_norm) %>%
  rename(`Adult Speech` = speech,
         `Adult TV` = TV)

dddc <- dddc %>% 
  rename(`Adult Books` = adult_books, # Google Books
         `Child Books` = books, # Facebook children's books (novels)
         `Child TV` = tv,
         `Child Speech` = speech) %>% 
  select(-CHILDES) %>% # less well-cleaned version
  left_join(adult_dat)
## Joining, by = "word"
# Fill missing adult speech/TV frequencies with a small floor value
dddc <- dddc %>% replace_na(list(`Adult Speech` = 0.32,
                                 `Adult TV` = 0.32))

We will use all relevant frequency distributions and try to understand the composition of the principal components.

# PCA on the log frequencies (prcomp centers, but does not scale, by default)
pc_bsa <- prcomp(log(dddc[,c("Adult Books","Child Books",
                             "Adult Speech","Child Speech",
                             "Adult TV", "Child TV")])) # add Montag separately?
summary(pc_bsa)$importance %>%
  knitr::kable(digits = 3, format = "html", table.attr = "style='width:50%;'")
PC1 PC2 PC3 PC4 PC5 PC6
Standard deviation 5.008 1.030 0.934 0.799 0.565 0.431
Proportion of Variance 0.891 0.038 0.031 0.023 0.011 0.007
Cumulative Proportion 0.891 0.928 0.959 0.982 0.993 1.000

The first component explains the bulk of the variance (89%), and the first four components capture 98% of it. What do these components look like? PC1 captures overall frequency, but loads somewhat more heavily on the adult distributions. PC2 (3.8% of variance) differentiates adult books and TV from the rest, especially from children’s speech. PC3 (3.1% of variance) differentiates adult speech from everything else. PC4 (2.3% of variance) groups child and adult books together, contrasting them with TV. PC5 (1.1% of variance) perhaps reflects adult book / child speech similarity.

#pc_bsa$rotation  * -1
ddbsa <- cbind(dddc, data.frame(pc_bsa$x))
pc_bsa$rotation %>% 
  knitr::kable(digits = 2, format = "html", table.attr = "style='width:50%;'")
PC1 PC2 PC3 PC4 PC5 PC6
Adult Books -0.49 -0.50 0.01 -0.41 0.47 0.35
Child Books -0.34 0.18 -0.36 -0.62 -0.54 -0.21
Adult Speech -0.51 0.35 0.77 0.06 -0.13 -0.02
Child Speech -0.26 0.61 -0.36 0.08 0.61 -0.22
Adult TV -0.47 -0.44 -0.18 0.51 -0.11 -0.53
Child TV -0.30 0.17 -0.34 0.42 -0.29 0.71
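As a sanity check on how to read these loadings (a side check, not part of the pipeline): each word's component scores are just its centered log frequencies projected onto the loading vectors.

# Check: scores = centered log frequencies %*% rotation
# (prcomp centers, but does not scale, by default)
freq_cols <- c("Adult Books","Child Books","Adult Speech","Child Speech","Adult TV","Child TV")
x_centered <- sweep(log(as.matrix(dddc[, freq_cols])), 2, pc_bsa$center)
max(abs(x_centered %*% pc_bsa$rotation - pc_bsa$x))  # ~0, up to floating point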
outlier_thresh = 5

p1 <- ggplot(ddbsa, aes(x = PC1, y = aoa, col = lexical_class)) + 
  geom_point() + geom_smooth(method = "lm") + 
  geom_text_repel(data = filter(ddbsa, PC1 < -outlier_thresh | PC1 > outlier_thresh), aes(label = word)) + 
  facet_wrap(~lexical_class) + theme_classic()

p2 <- ggplot(ddbsa, aes(x = PC2, y = aoa, col = lexical_class)) + 
  geom_point() + geom_smooth(method = "lm") + 
  geom_text_repel(data = filter(ddbsa, PC2 < -outlier_thresh | PC2 > outlier_thresh), aes(label = word)) + 
  facet_wrap(~lexical_class) + theme_classic()

p3 <- ggplot(ddbsa, aes(x = PC3, y = aoa, col = lexical_class)) + 
  geom_point() + geom_smooth(method = "lm") + 
  geom_text_repel(data = filter(ddbsa, PC3 < -outlier_thresh | PC3 > outlier_thresh), aes(label = word)) + 
  facet_wrap(~lexical_class) + theme_classic()

p4 <- ggplot(ddbsa, aes(x = PC4, y = aoa, col = lexical_class)) + 
  geom_point() + geom_smooth(method = "lm") + 
  geom_text_repel(data = filter(ddbsa, PC4 < -outlier_thresh | PC4 > outlier_thresh), aes(label = word)) + 
  facet_wrap(~lexical_class) + theme_classic()

ggarrange(p1, p2, p3, p4, nrow=2, ncol=2, common.legend = T)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

ggplot(ddbsa, aes(x = PC1, y = PC2, col = lexical_class)) + 
  geom_point() + theme_classic() +
  facet_wrap(~lexical_class) + 
  geom_text_repel(data = filter(ddbsa, abs(PC1) > outlier_thresh | abs(PC2) > outlier_thresh), 
                  aes(label = word), max.overlaps = 10, size = 3)

Regression on PC scores

ddbsa$lexical_class <- fct_relevel(ddbsa$lexical_class, "nouns")
summary(lm(aoa ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6, data=ddbsa)) 
## 
## Call:
## lm(formula = aoa ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6, data = ddbsa)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.5754  -2.1133   0.0118   2.1840  11.4051 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 24.86911    0.12105 205.440  < 2e-16 ***
## PC1         -0.16798    0.02419  -6.945 8.23e-12 ***
## PC2         -1.09625    0.11759  -9.323  < 2e-16 ***
## PC3          0.90949    0.12964   7.016 5.12e-12 ***
## PC4         -1.31503    0.15170  -8.669  < 2e-16 ***
## PC5         -1.85318    0.21427  -8.649  < 2e-16 ***
## PC6         -0.32835    0.28073  -1.170    0.243    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.333 on 751 degrees of freedom
## Multiple R-squared:  0.3089, Adjusted R-squared:  0.3034 
## F-statistic: 55.95 on 6 and 751 DF,  p-value: < 2.2e-16

Now with lexical class.

lexclass_mod <- lm(aoa ~ (PC1 + PC2 + PC3 + PC4 + PC5) * lexical_class, data=ddbsa)
summary(lexclass_mod)
## 
## Call:
## lm(formula = aoa ~ (PC1 + PC2 + PC3 + PC4 + PC5) * lexical_class, 
##     data = ddbsa)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.3638  -1.5602   0.0462   1.5482  10.0344 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     22.694540   0.190735 118.984  < 2e-16 ***
## PC1                              0.349831   0.043038   8.128 1.87e-15 ***
## PC2                             -1.576619   0.173537  -9.085  < 2e-16 ***
## PC3                              1.022259   0.157129   6.506 1.44e-10 ***
## PC4                             -0.900851   0.187710  -4.799 1.93e-06 ***
## PC5                             -1.339357   0.256766  -5.216 2.38e-07 ***
## lexical_classadjectives          2.681119   0.452705   5.922 4.89e-09 ***
## lexical_classfunction_words      6.908462   0.671819  10.283  < 2e-16 ***
## lexical_classother               1.415323   0.354467   3.993 7.19e-05 ***
## lexical_classverbs               3.037481   0.374853   8.103 2.26e-15 ***
## PC1:lexical_classadjectives     -0.328140   0.113091  -2.902  0.00383 ** 
## PC1:lexical_classfunction_words -0.240471   0.083411  -2.883  0.00406 ** 
## PC1:lexical_classother          -0.310925   0.076903  -4.043 5.84e-05 ***
## PC1:lexical_classverbs          -0.213629   0.097817  -2.184  0.02928 *  
## PC2:lexical_classadjectives      0.465898   0.462751   1.007  0.31436    
## PC2:lexical_classfunction_words  0.501814   0.258804   1.939  0.05289 .  
## PC2:lexical_classother          -1.132418   0.347333  -3.260  0.00116 ** 
## PC2:lexical_classverbs           1.266526   0.435137   2.911  0.00372 ** 
## PC3:lexical_classadjectives     -0.777828   0.546617  -1.423  0.15517    
## PC3:lexical_classfunction_words -0.589557   0.325427  -1.812  0.07045 .  
## PC3:lexical_classother           0.304570   0.329083   0.926  0.35501    
## PC3:lexical_classverbs           0.126851   0.367843   0.345  0.73031    
## PC4:lexical_classadjectives      0.004856   0.677785   0.007  0.99429    
## PC4:lexical_classfunction_words -0.220476   0.388603  -0.567  0.57065    
## PC4:lexical_classother           0.483371   0.389369   1.241  0.21485    
## PC4:lexical_classverbs           0.428928   0.471078   0.911  0.36285    
## PC5:lexical_classadjectives      0.332782   0.678095   0.491  0.62374    
## PC5:lexical_classfunction_words  0.768651   0.601547   1.278  0.20173    
## PC5:lexical_classother          -0.685181   0.519351  -1.319  0.18748    
## PC5:lexical_classverbs          -0.184790   0.628403  -0.294  0.76879    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.731 on 728 degrees of freedom
## Multiple R-squared:  0.5502, Adjusted R-squared:  0.5322 
## F-statistic:  30.7 on 29 and 728 DF,  p-value: < 2.2e-16
ddbsa$hatvals <- hatvalues(lexclass_mod) # leverage of each word in the lexical-class model

arrange(ddbsa,desc(hatvals)) %>%
  ungroup() %>%
  select(word, lexical_class, aoa, starts_with("PC"), hatvals) %>%
  slice(1:20) %>%
  knitr::kable(digits = 2, format = "html", table.attr = "style='width:80%;'")
word lexical_class aoa PC1 PC2 PC3 PC4 PC5 PC6 hatvals
don’t function_words 25.25 1.71 9.29 3.61 -2.40 -1.40 -2.71 0.60
yucky adjectives 21.88 8.55 2.92 0.80 0.00 -0.61 -0.34 0.39
was function_words 31.80 -7.95 1.70 1.52 -4.81 1.05 -0.19 0.37
lemme/let me function_words 26.49 8.02 4.19 -2.51 -1.88 -0.38 -1.65 0.32
gotta function_words 30.95 2.90 0.31 -4.40 1.15 -0.49 -0.35 0.30
hafta function_words 29.79 8.22 3.36 -2.17 -2.26 -1.60 -1.35 0.29
cute adjectives 26.88 2.21 1.21 1.38 1.79 0.10 -0.33 0.22
does function_words 31.32 -3.79 1.36 1.23 -2.23 1.65 0.08 0.21
tickle verbs 22.83 7.25 1.37 -1.63 -0.28 0.82 0.25 0.21
smile verbs 27.26 0.29 -2.50 -2.39 0.41 -1.18 -0.02 0.20
naughty adjectives 30.39 6.29 -0.63 -1.56 -1.06 -0.66 -0.39 0.19
windy adjectives 27.43 5.08 -0.09 1.65 -0.46 -0.50 -0.42 0.18
camping other 31.58 3.02 0.68 3.94 -0.08 -0.51 0.20 0.18
little adjectives 25.17 -7.46 1.48 -0.45 -0.63 -0.19 -0.41 0.18
lot function_words 29.35 -4.44 0.52 2.52 1.28 -0.12 -0.67 0.16
mommy other 9.92 3.79 2.25 -1.14 1.26 2.13 -0.58 0.16
thirsty adjectives 26.19 6.25 -0.09 -1.63 -1.18 0.40 0.42 0.16
noisy adjectives 28.24 4.32 -0.28 1.29 -0.85 0.28 -0.08 0.15
poor adjectives 32.67 -1.60 -0.47 0.02 -2.07 -0.43 0.15 0.15
bye other 13.03 -0.23 2.27 2.37 0.40 -0.28 -0.09 0.15

Item- and child-level data

We use mother’s education to predict children’s production of individual words.

load("data/en_ws_production.Rdata")
samp <- d_demo %>% filter(!is.na(mom_ed)) # 2773

produces ~ book_freq * mom_ed + speech_freq * mom_ed + (1|item) + (1|child)
produces ~ lexical_class * book_freq * mom_ed + lexical_class * speech_freq * mom_ed + (1|item) + (1|child)

d_samp <- d_en_ws %>% filter(is.element(data_id, samp$data_id)) %>%
  left_join(samp %>% select(data_id, age, production, mom_ed, ses_group)) %>% # vocab? booky_production
  left_join(ddd %>% select(definition, lexical_class, books, tv, speech, #prop_booky, 
                           adult_books))
## Joining, by = "data_id"
## Adding missing grouping variables: `word`, `on_cdi`
## Joining, by = c("definition", "lexical_class")
d_samp <- d_samp %>% mutate(age_sc = scale(age, center=T, scale=F), 
                            books_sc = scale(log(books)),
                            tv_sc = scale(log(tv)),
                            speech_sc = scale(log(speech)),
                            #prop_booky_sc = scale(log(prop_booky)),
                            mom_ed_sc = scale(as.numeric(mom_ed), center=T, scale=F))

TODO: need to re-fit these models; one possible approach is sketched at the end of this section.

m1 <- glmer(produces ~ age_sc + books_sc * mom_ed_sc + speech_sc * mom_ed_sc + (1|item_id) + (1|data_id), family=binomial, data=d_samp)

m2 <- glmer(produces ~ age_sc + lexical_class * books_sc * mom_ed_sc + lexical_class * speech_sc * mom_ed_sc + (1|item_id) + (1|data_id), family=binomial, data=d_samp)
# m2 warning: "Model failed to converge with max|grad| = 0.0255532 (tol = 0.002, component 1)"
# and "Model is nearly unidentifiable: large eigenvalue ratio - Rescale variables?"

save(m1, m2, file="data/model_fits_scaled.Rdata")
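One possible way to address the m2 convergence warning noted above (a sketch, not yet run or checked here): refit with the bobyqa optimizer and a larger iteration budget, which often clears this kind of lme4 warning.

# Possible re-fit for m2 (same formula and data as above); not run.
m2_refit <- update(m2, control = glmerControl(optimizer = "bobyqa",
                                              optCtrl = list(maxfun = 2e5)))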