Load data

## Parsed with column specification:
## cols(
##   .default = col_double(),
##   ParticipantId = col_character(),
##   Gender = col_character(),
##   Ethnic = col_character(),
##   FatherEd = col_character(),
##   MedBirth = col_character(),
##   Language = col_character(),
##   CDIForm = col_character()
## )
## See spec(...) for full column specifications.
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   ParticipantId = col_character(),
##   Gender = col_character(),
##   Ethnic = col_character(),
##   MotherEd = col_character(),
##   FatherEd = col_character(),
##   MedBirth = col_character(),
##   Language = col_character(),
##   CDIForm = col_character()
## )
## See spec(...) for full column specifications.
## Warning: Missing column names filled in: 'X3' [3]
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   ParticipantId = col_character(),
##   id = col_character(),
##   X3 = col_logical(),
##   whosp = col_character(),
##   whoeng = col_character(),
##   spcdiby = col_character(),
##   engcdiby = col_character(),
##   `mgcorig Mother/Guardian origin` = col_character(),
##   `fgcorig Father's Country of Origin` = col_character(),
##   `chicorig Child's Country of Origin` = col_character(),
##   `mgnatlng Mother/Guardian native language` = col_character(),
##   `fgnatlng Father/Guardian Native Language` = col_character(),
##   `homelng primary language spoken at home` = col_character()
## )
## See spec(...) for full column specifications.

Overview

Total Vocabulary and Combine Data

# English CDI: separate the item-level columns (positions 15:811) from the
# participant-level columns and add a per-administration vocabulary total.
#
# Positions 15:811 hold 797 columns rather than the 680 vocabulary items,
# because the usage (USE*) and complexity (COMPLX*) items are mixed in with
# the words. Those are pulled out by name below.
ul_col_names <- c("USECMPL", "USEFUT", "USEMISS", "USEPAST", "USEPOSS")
# COMPLX01 ... COMPLX37. NOTE(review): the original note placed these at
# cols 176:211, which spans only 36 positions for 37 names -- TODO confirm.
complx_col_names <- sprintf("COMPLX%02d", 1:37)

# 37 + 5 = 42 non-vocabulary items. NOTE(review): the original author
# wondered whether some are missing here (e.g. COMBINE) -- still unresolved.
en_complx_ul <- en_ws[, c(complx_col_names, ul_col_names)]

# Vocabulary items only: the 15:811 slice minus the complexity/usage columns.
en_voc <- en_ws[, 15:811] %>%
  select(-all_of(names(en_complx_ul)))

# Participant-level columns plus Total (row sum over the vocabulary items);
# AgeCDI is dropped.
en_ws_new <- en_ws %>%
  select(-(15:811)) %>%
  mutate(Total = rowSums(en_voc)) %>%
  select(-AgeCDI)

# Spanish CDI: same split as for English, but here the non-vocabulary block
# (positions 12:48) and the vocabulary items (positions 49:752) are contiguous.
sp_cmplx_ul <- sp_ws[, 12:48]
sp_voc <- sp_ws[, 49:752]

# Participant-level columns plus Total (row sum over the vocabulary items).
# EngProp / SpanProp are created as NA placeholders; the usage/combine summary
# columns that sit outside 12:752 are dropped by name.
sp_ws_new <- sp_ws %>%
  select(-(12:752)) %>%
  mutate(
    Total = rowSums(sp_voc),
    EngProp = NA,
    SpanProp = NA
  ) %>%
  select(-c(USEPOSS, USEFUT, USEMISS, USEPAST, SCOMBINE, USECMPL))
  
# need to recast some vars (e.g., sp_ws$MotherEd is character)
#cdat <- left_join(en_ws %>% mutate(MotherEd = as.numeric(MotherEd),
#                                   FatherEd = as.numeric(FatherEd)), 
#                  sp_ws %>% mutate(MotherEd = as.numeric(MotherEd),
#                                   FatherEd = as.numeric(FatherEd) #%>%
                                     #dplyr::select(-CDIForm, -MedBirth)), 
#                  by=c("ParticipantId", "CDIAge"))
# , "Gender", "Language", "MotherEd", "FatherEd", "BOrder")) # 
# intersect(names(en_ws), names(sp_ws))

# Long-format data: one row per CDI administration, English rows stacked on
# top of Spanish rows. In the English frame, FatherEd uses the codes "NR" and
# "Null" for missing values; these are set to NA before the column is
# converted to numeric.
admins <- en_ws_new %>%
  mutate(
    FatherEd = replace(FatherEd, FatherEd %in% c("NR", "Null"), NA),
    FatherEd = as.numeric(FatherEd)
  ) %>%
  bind_rows(sp_ws_new)

# Copy language-exposure proportions from each English CDI administration onto
# the same participant's Spanish administrations.
#
# A Spanish row matches an English row when it has the same ParticipantId and
# its CDIAge is within 1 of the English CDIAge (fuzzy age match). Matched
# Spanish rows receive EngProp from the English row and SpanProp = 100 - EngProp.
# Spanish rows with no matching English row keep their NA placeholders.
#
# English rows are visited in row order, so if a Spanish row is within 1 of
# two English administrations, the later English row wins.
# Rows with a missing ParticipantId or CDIAge never match, because which()
# drops NA comparisons.
en_rows <- which(admins$Language == "English")
for (en_row in en_rows) {
  sp_rows <- which(
    admins$ParticipantId == admins$ParticipantId[en_row] &
      abs(admins$CDIAge - admins$CDIAge[en_row]) <= 1 &
      admins$Language == "Spanish"
  )
  # Only assign when there is something to assign to; this avoids
  # zero-length replacement on the data frame.
  if (length(sp_rows) > 0) {
    en_prop <- admins$EngProp[en_row]
    admins$EngProp[sp_rows] <- en_prop
    admins$SpanProp[sp_rows] <- 100 - en_prop
  }
}
# 4 missing Spanish CDI admin EngProps (no close match for English CDIAge)

#View(admins %>% arrange(ParticipantId, CDIAge))
# Spanish doesn't have SpProp/EngProp: were those never collected during SpCDI admins?
# (for English, SpProp changes across administrations, and SpCDI admins are not always at the same time as EnCDIs)

There are 163 unique participant IDs in the Spanish dataframe (225 observations), and 161 unique participant IDs in the English dataframe (222 observations).

Note: 4 Spanish CDI administrations are missing language-exposure proportions. Exposure proportions appear not to have been collected at Spanish CDI administrations, so they are copied from the same child's English CDI administration when one exists within one month of the Spanish administration; these 4 administrations had no such English counterpart.

Center and scale data

We center CDIAge and MotherEd, and center and scale (z-score) the exposure proportions (EngProp and SpanProp).

# Add centered / standardized predictors for the regressions. All statistics
# are computed over the pooled (English + Spanish) administrations.
admins <- admins %>%
  mutate(
    # mean() is called without na.rm here, so a single missing CDIAge would
    # make every CDIAgeCtr value NA.
    CDIAgeCtr = CDIAge - mean(CDIAge),
    # MotherEd is converted to numeric before centering; values that cannot
    # be parsed as numbers become NA.
    MotherEd = as.numeric(MotherEd),
    MotherEdCtr = MotherEd - mean(MotherEd, na.rm = TRUE),
    # scale() centers and divides by the standard deviation and returns a
    # one-column matrix; as.numeric() turns it back into a plain vector.
    EngPropCS = as.numeric(scale(EngProp)),
    SpanPropCS = as.numeric(scale(SpanProp))
  )

Regressions

# center Age, MotherEd, center/scale EngProp
# Pooled mixed model over both languages: total vocabulary as a function of
# CDI language, English exposure (standardized), and their interaction, with
# age, maternal education, gender and birth order as covariates and a random
# intercept per participant (children contribute several administrations).
# With the Language * EngPropCS interaction, the EngPropCS coefficient is the
# exposure slope for the reference language and the interaction term is the
# change in that slope for the other language.
m1 <- lmer(Total ~ Language * EngPropCS + CDIAgeCtr + MotherEdCtr + 
                Gender + BOrder + (1 | ParticipantId), data=admins)

# Print the fit. The summary echoes the formula and data expression exactly as
# written in the call above.
summary(m1)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: Total ~ Language * EngPropCS + CDIAgeCtr + MotherEdCtr + Gender +  
##     BOrder + (1 | ParticipantId)
##    Data: admins
## 
## REML criterion at convergence: 5451.6
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.4773 -0.5641 -0.0992  0.4364  3.3829 
## 
## Random effects:
##  Groups        Name        Variance Std.Dev.
##  ParticipantId (Intercept)  6148     78.41  
##  Residual                  11547    107.46  
## Number of obs: 440, groups:  ParticipantId, 161
## 
## Fixed effects:
##                           Estimate Std. Error       df t value Pr(>|t|)    
## (Intercept)                186.203     19.254  181.662   9.671  < 2e-16 ***
## LanguageSpanish              4.097     10.259  283.881   0.399    0.690    
## EngPropCS                   67.332      9.931  317.090   6.780 5.88e-11 ***
## CDIAgeCtr                   15.852      1.477  426.098  10.733  < 2e-16 ***
## MotherEdCtr                  2.414      2.786  163.464   0.867    0.387    
## GenderM                    -19.526     16.485  156.859  -1.184    0.238    
## BOrder                      -6.767      7.953  160.231  -0.851    0.396    
## LanguageSpanish:EngPropCS -125.281     10.327  285.570 -12.131  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) LnggSp EngPCS CDIAgC MthrEC GendrM BOrder
## LangugSpnsh -0.262                                          
## EngPropCS    0.012  0.004                                   
## CDIAgeCtr   -0.005  0.010  0.085                            
## MotherEdCtr -0.170 -0.005 -0.276  0.011                     
## GenderM     -0.428 -0.001 -0.071  0.078  0.030              
## BOrder      -0.744 -0.002  0.038 -0.036  0.221 -0.019       
## LnggSp:EPCS -0.001  0.004 -0.506  0.014 -0.002  0.007 -0.001

Separate Regressions

English

# English-only model: English vocabulary total as a function of age, maternal
# education, English exposure, gender and birth order, with a random intercept
# per participant.
# The *Ctr / *CS predictors were centered/scaled on the pooled `admins` data,
# so they are not exactly mean-zero within this English subset.
en_m1 <- lmer(Total ~ CDIAgeCtr + MotherEdCtr + EngPropCS + 
                Gender + BOrder + (1 | ParticipantId), data=admins %>% filter(Language=="English"))

# Print the fit. The "Data:" line of the summary echoes the piped filter
# expression used above.
summary(en_m1)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: Total ~ CDIAgeCtr + MotherEdCtr + EngPropCS + Gender + BOrder +  
##     (1 | ParticipantId)
##    Data: admins %>% filter(Language == "English")
## 
## REML criterion at convergence: 2722.8
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.0501 -0.4847 -0.1516  0.3177  3.5880 
## 
## Random effects:
##  Groups        Name        Variance Std.Dev.
##  ParticipantId (Intercept) 7077     84.13   
##  Residual                  8766     93.63   
## Number of obs: 222, groups:  ParticipantId, 161
## 
## Fixed effects:
##             Estimate Std. Error      df t value Pr(>|t|)    
## (Intercept)  193.773     21.181 159.369   9.148 2.63e-16 ***
## CDIAgeCtr     17.808      1.748 166.260  10.189  < 2e-16 ***
## MotherEdCtr    5.185      3.172 161.795   1.635   0.1041    
## EngPropCS     61.987      9.807 186.666   6.321 1.86e-09 ***
## GenderM       -3.489     18.783 157.234  -0.186   0.8529    
## BOrder       -15.377      9.083 161.781  -1.693   0.0924 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) CDIAgC MthrEC EngPCS GendrM
## CDIAgeCtr   -0.006                            
## MotherEdCtr -0.176  0.014                     
## EngPropCS    0.018  0.099 -0.324              
## GenderM     -0.443  0.080  0.028 -0.085       
## BOrder      -0.772 -0.036  0.220  0.041 -0.021

Spanish

# Spanish-only model: Spanish vocabulary total as a function of age, maternal
# education, Spanish exposure, gender and birth order, with a random intercept
# per participant.
# The exposure predictor here is SpanPropCS (not EngPropCS), so a positive
# coefficient means more Spanish exposure goes with a larger Spanish total.
# The *Ctr / *CS predictors were centered/scaled on the pooled `admins` data,
# so they are not exactly mean-zero within this Spanish subset.
sp_m1 <- lmer(Total ~ CDIAgeCtr + MotherEdCtr + SpanPropCS + 
                Gender + BOrder + (1 | ParticipantId), data=admins %>% filter(Language=="Spanish"))

# Print the fit. The "Data:" line of the summary echoes the piped filter
# expression used above.
summary(sp_m1)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: Total ~ CDIAgeCtr + MotherEdCtr + SpanPropCS + Gender + BOrder +  
##     (1 | ParticipantId)
##    Data: admins %>% filter(Language == "Spanish")
## 
## REML criterion at convergence: 2704.8
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.1774 -0.4678 -0.0545  0.3832  3.6588 
## 
## Random effects:
##  Groups        Name        Variance Std.Dev.
##  ParticipantId (Intercept) 10213    101.06  
##  Residual                   8809     93.86  
## Number of obs: 218, groups:  ParticipantId, 159
## 
## Fixed effects:
##             Estimate Std. Error       df t value Pr(>|t|)    
## (Intercept) 183.4145    23.6329 157.4090   7.761 1.00e-12 ***
## CDIAgeCtr    13.7569     1.8643 148.4882   7.379 1.05e-11 ***
## MotherEdCtr  -0.5712     3.5640 159.8183  -0.160   0.8729    
## SpanPropCS   52.2496    10.9483 192.2682   4.772 3.60e-06 ***
## GenderM     -35.9405    20.9654 155.6624  -1.714   0.0885 .  
## BOrder        1.7858    10.0817 159.0231   0.177   0.8596    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) CDIAgC MthrEC SpnPCS GendrM
## CDIAgeCtr    0.003                            
## MotherEdCtr -0.185  0.001                     
## SpanPropCS  -0.016 -0.122  0.322              
## GenderM     -0.446  0.080  0.034  0.069       
## BOrder      -0.772 -0.038  0.223 -0.043 -0.016