Load data

## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_double(),
##   ParticipantId = col_character(),
##   Gender = col_character(),
##   Ethnic = col_character(),
##   FatherEd = col_character(),
##   MedBirth = col_character(),
##   Language = col_character(),
##   CDIForm = col_character()
## )
## ℹ Use `spec()` for the full column specifications.
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_double(),
##   ParticipantId = col_character(),
##   Gender = col_character(),
##   Ethnic = col_character(),
##   MotherEd = col_character(),
##   FatherEd = col_character(),
##   MedBirth = col_character(),
##   Language = col_character(),
##   CDIForm = col_character()
## )
## ℹ Use `spec()` for the full column specifications.
## Warning: Missing column names filled in: 'X3' [3]
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_double(),
##   ParticipantId = col_character(),
##   id = col_character(),
##   X3 = col_logical(),
##   whosp = col_character(),
##   whoeng = col_character(),
##   spcdiby = col_character(),
##   engcdiby = col_character(),
##   `mgcorig Mother/Guardian origin` = col_character(),
##   `fgcorig Father's Country of Origin` = col_character(),
##   `chicorig Child's Country of Origin` = col_character(),
##   `mgnatlng Mother/Guardian native language` = col_character(),
##   `fgnatlng Father/Guardian Native Language` = col_character(),
##   `homelng primary language spoken at home` = col_character()
## )
## ℹ Use `spec()` for the full column specifications.

Overview

Total Vocabulary and Combine Data

# English CDI: the item-level block sits at column positions 15-811. That is
# 797 columns (not 680) because the UL / COMPLX items are mixed in with the
# vocabulary items, so they are pulled out by name below.
ul_col_names <- c("USECMPL", "USEFUT", "USEMISS", "USEPAST", "USEPOSS")
# COMPLX01 ... COMPLX37 (zero-padded). The original note places them at
# cols 176:211 -- NOTE(review): that range holds 36 columns, but 37 names are
# generated here; confirm.
complx_col_names <- sprintf("COMPLX%02d", 1:37)

# The 42 complexity + usage columns, kept aside
# (NOTE(review): missing some? e.g. COMBINE?).
en_complx_ul <- en_ws %>%
  select(all_of(c(complx_col_names, ul_col_names)))

# Vocabulary items = the positional item block minus the COMPLX / UL columns.
en_voc <- en_ws %>%
  select(15:811) %>%
  select(-all_of(c(complx_col_names, ul_col_names)))

# Participant-level columns only, plus the per-administration vocabulary total.
en_ws <- en_ws %>%
  select(-(15:811), -AgeCDI) %>%
  mutate(Total = rowSums(en_voc))

# Spanish CDI: split the item columns off by position. Going by the variable
# names, 49-752 are the vocabulary items and 12-48 the complexity / usage
# items -- NOTE(review): confirm against the raw file layout.
sp_voc <- sp_ws %>% select(49:752)
sp_cmplx_ul <- sp_ws %>% select(12:48)

# Keep only participant-level columns (also dropping the usage/combine
# summary columns by name), then add the vocabulary total. EngProp / SpanProp
# are created as all-NA placeholders so the columns exist on the Spanish rows.
sp_ws <- sp_ws %>%
  select(
    -(12:752),
    -USEPOSS, -USEFUT, -USEMISS, -USEPAST, -SCOMBINE, -USECMPL
  ) %>%
  mutate(
    Total = rowSums(sp_voc),
    EngProp = NA,
    SpanProp = NA
  )
  
# need to recast some vars (e.g., sp_ws$MotherEd is character)
#cdat <- left_join(en_ws %>% mutate(MotherEd = as.numeric(MotherEd),
#                                   FatherEd = as.numeric(FatherEd)), 
#                  sp_ws %>% mutate(MotherEd = as.numeric(MotherEd),
#                                   FatherEd = as.numeric(FatherEd) #%>%
                                     #dplyr::select(-CDIForm, -MedBirth)), 
#                  by=c("ParticipantId", "CDIAge"))
# , "Gender", "Language", "MotherEd", "FatherEd", "BOrder")) # 
# intersect(names(en_ws), names(sp_ws))

# Long-format data: one row per CDI administration, English rows stacked on
# top of the Spanish rows. In the English table FatherEd carries the codes
# "NR" / "Null"; they are set to NA so the column can be converted to numeric
# before the two tables are bound together.
admins <- en_ws %>%
  mutate(FatherEd = replace(FatherEd, FatherEd %in% c("NR", "Null"), NA)) %>%
  mutate(FatherEd = as.numeric(FatherEd)) %>%
  bind_rows(sp_ws)


# Copy EngProp from each English administration onto the Spanish
# administration of the same participant at the same CDIAge, and derive
# SpanProp = 100 - EngProp. Spanish rows without an exact
# (ParticipantId, CDIAge) English match get NA in both columns.
#
# This is a vectorised keyed lookup: one match() call replaces rescanning the
# whole table for every participant/age pair. If several English rows share a
# key, the first one is used (the loop version stopped with a length-mismatch
# error in that case). Rows whose ParticipantId or CDIAge is NA are never
# matched and are left untouched, as before.
admin_key <- paste(admins$ParticipantId, admins$CDIAge, sep = "\u001F")
admin_key[is.na(admins$ParticipantId) | is.na(admins$CDIAge)] <- NA

# which() drops NA comparisons, so rows with a missing Language are ignored.
en_rows <- which(admins$Language == "English" & !is.na(admin_key))
sp_rows <- which(admins$Language == "Spanish" & !is.na(admin_key))

# For each Spanish row: index of its English counterpart in `admins`, or NA.
matched_en <- en_rows[match(admin_key[sp_rows], admin_key[en_rows])]

# Indexing with NA yields NA, which covers the "no English match" case.
admins$EngProp[sp_rows] <- admins$EngProp[matched_en]
admins$SpanProp[sp_rows] <- 100 - admins$EngProp[matched_en]
# 7 missing Spanish CDI admin EngProps (set fuzzy age match? e.g. +/-1 months copy from Eng CDI?)



#View(admins %>% arrange(ParticipantId, CDIAge))
# Spanish doesn't have SpProp/EngProp: were those never collected during SpCDI admins?
# (for English, SpProp changes across administrations, and SpCDI admins are not always at the same time as EnCDIs)

There are 163 unique participant IDs in the Spanish dataframe (225 observations), and 161 unique participant IDs in the English dataframe (222 observations).

Regressions

# Pooled model over both languages: vocabulary Total predicted by the
# Language x EngProp interaction plus CDIAge, MotherEd, Gender and BOrder,
# with a random intercept per participant.
# TODO (from the original note): center Age and MotherEd, center/scale
# EngProp -- the predictors are passed to lmer() as they are in this call.
m1 <- lmer(
  Total ~ Language * EngProp + CDIAge + MotherEd + Gender + BOrder +
    (1 | ParticipantId),
  data = admins
)

summary(m1)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: Total ~ Language * EngProp + CDIAge + MotherEd + Gender + BOrder +  
##     (1 | ParticipantId)
##    Data: admins
## 
## REML criterion at convergence: 5440.8
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.4743 -0.5655 -0.0988  0.4359  3.3735 
## 
## Random effects:
##  Groups        Name        Variance Std.Dev.
##  ParticipantId (Intercept)  6172     78.56  
##  Residual                  11573    107.58  
## Number of obs: 438, groups:  ParticipantId, 161
## 
## Fixed effects:
##                          Estimate Std. Error        df t value Pr(>|t|)    
## (Intercept)             -369.1782    59.7165  273.9213  -6.182 2.29e-09 ***
## LanguageSpanish          220.2716    20.6653  283.1639  10.659  < 2e-16 ***
## EngProp                    2.7528     0.4092  315.6427   6.728 8.11e-11 ***
## CDIAge                    15.8040     1.4815  423.9529  10.668  < 2e-16 ***
## MotherEd                   2.4273     2.7908  163.8798   0.870    0.386    
## GenderM                  -19.8199    16.5181  157.3719  -1.200    0.232    
## BOrder                    -6.8678     7.9669  160.5467  -0.862    0.390    
## LanguageSpanish:EngProp   -5.1143     0.4284  284.6165 -11.939  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) LnggSp EngPrp CDIAge MthrEd GendrM BOrder
## LangugSpnsh -0.164                                          
## EngProp     -0.166  0.443                                   
## CDIAge      -0.671 -0.008  0.086                            
## MotherEd    -0.611  0.000 -0.276  0.009                     
## GenderM     -0.187 -0.007 -0.071  0.080  0.029              
## BOrder      -0.367  0.001  0.038 -0.036  0.221 -0.019       
## LnggSpns:EP  0.138 -0.867 -0.506  0.015 -0.004  0.008 -0.003

Separate Regressions

English

# English-only model: English administrations' Total as a function of
# CDIAge, MotherEd, EngProp, Gender and BOrder, with a random intercept per
# participant.
en_m1 <- lmer(
  Total ~ CDIAge + MotherEd + EngProp + Gender + BOrder +
    (1 | ParticipantId),
  data = admins %>% filter(Language == "English")
)

summary(en_m1)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: Total ~ CDIAge + MotherEd + EngProp + Gender + BOrder + (1 |  
##     ParticipantId)
##    Data: admins %>% filter(Language == "English")
## 
## REML criterion at convergence: 2729.1
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.0501 -0.4847 -0.1516  0.3177  3.5880 
## 
## Random effects:
##  Groups        Name        Variance Std.Dev.
##  ParticipantId (Intercept) 7077     84.13   
##  Residual                  8766     93.63   
## Number of obs: 222, groups:  ParticipantId, 161
## 
## Fixed effects:
##              Estimate Std. Error        df t value Pr(>|t|)    
## (Intercept) -441.9463    68.1234  214.1261  -6.487 5.96e-10 ***
## CDIAge        17.8081     1.7478  166.2603  10.189  < 2e-16 ***
## MotherEd       5.1853     3.1723  161.7954   1.635   0.1041    
## EngProp        2.5482     0.4031  186.6661   6.321 1.86e-09 ***
## GenderM       -3.4893    18.7825  157.2338  -0.186   0.8529    
## BOrder       -15.3765     9.0834  161.7809  -1.693   0.0924 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##          (Intr) CDIAge MthrEd EngPrp GendrM
## CDIAge   -0.696                            
## MotherEd -0.611  0.014                     
## EngProp  -0.106  0.099 -0.324              
## GenderM  -0.187  0.080  0.028 -0.085       
## BOrder   -0.364 -0.036  0.220  0.041 -0.021

Spanish

# Spanish-only model: same structure as the English-only model, but using
# SpanProp as the exposure predictor, with a random intercept per
# participant.
sp_m1 <- lmer(
  Total ~ CDIAge + MotherEd + SpanProp + Gender + BOrder +
    (1 | ParticipantId),
  data = admins %>% filter(Language == "Spanish")
)

summary(sp_m1)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: Total ~ CDIAge + MotherEd + SpanProp + Gender + BOrder + (1 |  
##     ParticipantId)
##    Data: admins %>% filter(Language == "Spanish")
## 
## REML criterion at convergence: 2688.2
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.1520 -0.4617 -0.0552  0.3858  3.6377 
## 
## Random effects:
##  Groups        Name        Variance Std.Dev.
##  ParticipantId (Intercept) 10231    101.15  
##  Residual                   8907     94.38  
## Number of obs: 216, groups:  ParticipantId, 159
## 
## Fixed effects:
##              Estimate Std. Error        df t value Pr(>|t|)    
## (Intercept) -288.6229    82.7217  206.8204  -3.489 0.000592 ***
## CDIAge        13.9061     1.8843  148.7724   7.380 1.04e-11 ***
## MotherEd      -0.6675     3.5769  160.4570  -0.187 0.852190    
## SpanProp       2.1273     0.4526  192.6404   4.701 4.93e-06 ***
## GenderM      -35.2976    21.0460  156.4961  -1.677 0.095507 .  
## BOrder         1.6593    10.1145  159.4750   0.164 0.869901    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##          (Intr) CDIAge MthrEd SpnPrp GendrM
## CDIAge   -0.544                            
## MotherEd -0.736 -0.003                     
## SpanProp -0.436 -0.127  0.324              
## GenderM  -0.217  0.084  0.033  0.066       
## BOrder   -0.315 -0.039  0.223 -0.041 -0.016