library(knitr)

opts_chunk$set(echo = T, message = F, warning = F, 
               error = F, cache = F, tidy = F)

library(tidyverse)
library(langcog)
library(feather)
library(broom)
library(lme4)

theme_set(theme_classic(base_size = 10))

Merge and tidy the data

Get age bins. 1 month bins 16 - 32 months.

#Get time point data
# timepoint data
demographic_data <- cdi_data %>%
  select(-item, -value) %>%
  distinct(child_id, session_num, .keep_all = T)  %>%
  select(-session_num) %>%
  mutate(age_bin = cut(age, breaks = seq(16, 32, 1))) %>%
  filter(!is.na(age_bin)) %>%
  group_by(child_id, age_bin)  %>%
  summarize_all(mean)

lag_session_words_spoken <- demographic_data %>%
  mutate(one_subsequent_words_spoken = lead(words_spoken, 1), by = "age_bin",
         two_subsequent_words_spoken = lead(words_spoken, 2), by = "age_bin",
         three_subsequent_words_spoken = lead(words_spoken, 3), by = "age_bin",
         four_subsequent_words_spoken = lead(words_spoken, 4), by = "age_bin",
         five_subsequent_words_spoken = lead(words_spoken, 5), by = "age_bin",
         one_delta = one_subsequent_words_spoken - words_spoken,
         two_delta = two_subsequent_words_spoken - words_spoken,
         three_delta = three_subsequent_words_spoken - words_spoken,
         four_delta = four_subsequent_words_spoken - words_spoken,
         five_delta = five_subsequent_words_spoken - words_spoken) %>%
  select(child_id, age_bin, words_spoken, contains("delta"))  %>%
  gather("lag", "delta_words", -child_id, -age_bin, -words_spoken)

lag_session_age <- demographic_data %>%
  mutate(one_subsequent_age = lead(age, 1), by = "age_bin",
         two_subsequent_age = lead(age, 2), by = "age_bin",
         three_subsequent_age = lead(age, 3), by = "age_bin",
         four_subsequent_age = lead(age, 4), by = "age_bin",
         five_subsequent_age = lead(age, 5), by = "age_bin",
         one_delta = one_subsequent_age - age,
         two_delta = two_subsequent_age - age,
         three_delta = three_subsequent_age - age,
         four_delta = four_subsequent_age - age,
         five_delta = five_subsequent_age - age) %>%
  select(child_id, age_bin, contains("delta")) %>%
  gather("lag", "delta_age", -child_id, -age_bin)

outcome_age_bin_readable <- demographic_data %>%
 ungroup()%>%
  distinct(age_bin) %>%
  rename(outcome_age_bin = age_bin) %>%
  mutate(outcome_age_bin_numeric = as.numeric(outcome_age_bin))

# join together       
full_df <- hypernyms_score_by_kid %>%
   left_join(lag_session_words_spoken) %>%
   left_join(lag_session_age) %>%
   mutate(lag_num = case_when(lag == "one_delta" ~ 1, 
                             lag == "two_delta" ~ 2,
                             lag == "three_delta" ~ 3,
                             lag == "four_delta" ~ 4,
                             lag == "five_delta" ~ 5),
          predictor_age_bin_numeric = as.numeric(age_bin)) %>%
  filter(!is.na(delta_words)) %>% # get rid of cases where delta session doesn't exisit
  rename(predictor_age_bin = age_bin) %>%
  mutate(outcome_age_bin_numeric = predictor_age_bin_numeric + lag_num)  %>%
  left_join(outcome_age_bin_readable, by = "outcome_age_bin_numeric")  %>%
  select(child_id, predictor_age_bin, predictor_age_bin_numeric, lag, lag_num, outcome_age_bin, outcome_age_bin_numeric, mean_hypernyms, mean_freq, words_spoken, delta_words, delta_age)

The data:

## Observations: 3,410
## Variables: 12
## Groups: child_id [194]
## $ child_id                  <chr> "4139_LTP102", "4139_LTP102", "4139_LT…
## $ predictor_age_bin         <fct> "(16,17]", "(16,17]", "(16,17]", "(16,…
## $ predictor_age_bin_numeric <dbl> 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 4, 4, 4,…
## $ lag                       <chr> "one_delta", "two_delta", "three_delta…
## $ lag_num                   <dbl> 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3,…
## $ outcome_age_bin           <fct> "(17,18]", "(18,19]", "(19,20]", "(20,…
## $ outcome_age_bin_numeric   <dbl> 2, 3, 4, 5, 6, 4, 5, 6, 7, 8, 5, 6, 7,…
## $ mean_hypernyms            <dbl> 9.397260, 9.397260, 9.397260, 9.397260…
## $ mean_freq                 <dbl> 7.814661, 7.814661, 7.814661, 7.814661…
## $ words_spoken              <dbl> 94, 94, 94, 94, 94, 153, 153, 153, 153…
## $ delta_words               <dbl> 59, 73, 98, 143, 194, 14, 39, 84, 135,…
## $ delta_age                 <dbl> 1.1176857, 2.2024984, 3.1886917, 4.142…

Fixed effect model

Lag = 1

## 
## Call:
## lm(formula = delta_words ~ mean_hypernyms + delta_age + mean_freq + 
##     words_spoken + predictor_age_bin_numeric, data = full_df %>% 
##     filter(lag_num == 1))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -310.78  -25.20   -5.98   16.58  399.21 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               501.58041   42.33756  11.847  < 2e-16 ***
## mean_hypernyms            -14.94045    2.45741  -6.080  1.7e-09 ***
## delta_age                  41.04791    2.39883  17.112  < 2e-16 ***
## mean_freq                 -43.68708    5.11556  -8.540  < 2e-16 ***
## words_spoken               -0.12135    0.01381  -8.786  < 2e-16 ***
## predictor_age_bin_numeric   0.26137    0.64673   0.404    0.686    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 47.15 on 1022 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.3016, Adjusted R-squared:  0.2981 
## F-statistic: 88.25 on 5 and 1022 DF,  p-value: < 2.2e-16

All lags

## 
## Call:
## lm(formula = delta_words ~ mean_hypernyms + lag_num + delta_age + 
##     mean_freq + words_spoken + predictor_age_bin_numeric, data = full_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -340.04  -45.26   -5.56   36.50  465.12 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               889.60733   37.75619  23.562  < 2e-16 ***
## mean_hypernyms            -28.29170    2.20311 -12.842  < 2e-16 ***
## lag_num                     8.08063    2.28280   3.540 0.000406 ***
## delta_age                  38.62866    1.59895  24.159  < 2e-16 ***
## mean_freq                 -74.91100    4.52319 -16.562  < 2e-16 ***
## words_spoken               -0.30367    0.01342 -22.626  < 2e-16 ***
## predictor_age_bin_numeric   2.16025    0.65342   3.306 0.000956 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 80.65 on 3397 degrees of freedom
##   (6 observations deleted due to missingness)
## Multiple R-squared:  0.5691, Adjusted R-squared:  0.5684 
## F-statistic: 747.8 on 6 and 3397 DF,  p-value: < 2.2e-16

With lag interaction term

## 
## Call:
## lm(formula = delta_words ~ mean_hypernyms * lag_num + delta_age + 
##     mean_freq + words_spoken + predictor_age_bin_numeric, data = full_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -351.44  -45.34   -5.08   36.32  468.28 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               1025.36066   46.66692  21.972  < 2e-16 ***
## mean_hypernyms             -42.60266    3.64531 -11.687  < 2e-16 ***
## lag_num                    -38.58993    9.75858  -3.954 7.83e-05 ***
## delta_age                   38.33434    1.59464  24.039  < 2e-16 ***
## mean_freq                  -76.04209    4.51369 -16.847  < 2e-16 ***
## words_spoken                -0.31041    0.01345 -23.086  < 2e-16 ***
## predictor_age_bin_numeric    2.03122    0.65173   3.117  0.00184 ** 
## mean_hypernyms:lag_num       5.37255    1.09242   4.918 9.16e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 80.38 on 3396 degrees of freedom
##   (6 observations deleted due to missingness)
## Multiple R-squared:  0.5722, Adjusted R-squared:  0.5713 
## F-statistic: 648.8 on 7 and 3396 DF,  p-value: < 2.2e-16
  • missing data are from cases where we don’t have hypernyms for vocabulary (e.g. kid only knows two onomatopoeia words)

Mixed-effect model

Lag = 1

## Linear mixed model fit by REML ['lmerMod']
## Formula: 
## delta_words ~ mean_hypernyms + delta_age + mean_freq + words_spoken +  
##     predictor_age_bin_numeric + (1 | child_id)
##    Data: full_df %>% filter(lag_num == 1)
## 
## REML criterion at convergence: 10826
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -6.5876 -0.5270 -0.1242  0.3521  8.4408 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  child_id (Intercept)   10.22   3.196  
##  Residual             2213.34  47.046  
## Number of obs: 1028, groups:  child_id, 194
## 
## Fixed effects:
##                           Estimate Std. Error t value
## (Intercept)               505.2929    42.5572  11.873
## mean_hypernyms            -15.0373     2.4698  -6.088
## delta_age                  41.0234     2.3988  17.101
## mean_freq                 -44.0367     5.1527  -8.546
## words_spoken               -0.1228     0.0139  -8.836
## predictor_age_bin_numeric   0.3018     0.6505   0.464
## 
## Correlation of Fixed Effects:
##             (Intr) mn_hyp delt_g mn_frq wrds_s
## mn_hyprnyms -0.311                            
## delta_age   -0.078 -0.008                     
## mean_freq   -0.837 -0.251  0.016              
## words_spokn -0.590  0.512 -0.013  0.319       
## prdctr_g_b_  0.000  0.043 -0.042 -0.080 -0.503

All lags

## Linear mixed model fit by REML ['lmerMod']
## Formula: delta_words ~ mean_hypernyms + lag_num + delta_age + mean_freq +  
##     words_spoken + predictor_age_bin_numeric + (1 | child_id)
##    Data: full_df
## 
## REML criterion at convergence: 37927.2
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.4979 -0.5075  0.0018  0.5173  5.1389 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  child_id (Intercept) 9875     99.38   
##  Residual             3323     57.65   
## Number of obs: 3404, groups:  child_id, 194
## 
## Fixed effects:
##                            Estimate Std. Error t value
## (Intercept)               794.12344   44.64368  17.788
## mean_hypernyms            -29.29887    2.39281 -12.245
## lag_num                    14.00618    2.03003   6.899
## delta_age                  34.34593    1.45091  23.672
## mean_freq                 -65.73870    5.61926 -11.699
## words_spoken               -0.80073    0.01761 -45.463
## predictor_age_bin_numeric  26.33288    0.91816  28.680
## 
## Correlation of Fixed Effects:
##             (Intr) mn_hyp lag_nm delt_g mn_frq wrds_s
## mn_hyprnyms -0.194                                   
## lag_num     -0.015  0.001                            
## delta_age   -0.013  0.012 -0.918                     
## mean_freq   -0.851 -0.321 -0.012  0.012              
## words_spokn -0.128  0.210  0.026 -0.019  0.034       
## prdctr_g_b_ -0.261  0.175  0.073 -0.019  0.124 -0.771

with lag interaction term:

## Linear mixed model fit by REML ['lmerMod']
## Formula: delta_words ~ mean_hypernyms * lag_num + delta_age + mean_freq +  
##     words_spoken + predictor_age_bin_numeric + (1 | child_id)
##    Data: full_df
## 
## REML criterion at convergence: 37879.6
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.4711 -0.5086  0.0020  0.5125  4.2589 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  child_id (Intercept) 9677     98.37   
##  Residual             3281     57.28   
## Number of obs: 3404, groups:  child_id, 194
## 
## Fixed effects:
##                            Estimate Std. Error t value
## (Intercept)               961.49558   50.65418  18.982
## mean_hypernyms            -45.54624    3.36481 -13.536
## lag_num                   -34.10388    7.33943  -4.647
## delta_age                  33.91757    1.44289  23.507
## mean_freq                 -68.52758    5.59674 -12.244
## words_spoken               -0.81646    0.01765 -46.247
## predictor_age_bin_numeric  26.31629    0.91193  28.858
## mean_hypernyms:lag_num      5.56527    0.81634   6.817
## 
## Correlation of Fixed Effects:
##             (Intr) mn_hyp lag_nm delt_g mn_frq wrds_s prd___
## mn_hyprnyms -0.462                                          
## lag_num     -0.468  0.681                                   
## delta_age   -0.033  0.040 -0.210                            
## mean_freq   -0.778 -0.175  0.066  0.016                     
## words_spokn -0.176  0.242  0.136 -0.013  0.043              
## prdctr_g_b_ -0.227  0.123  0.019 -0.019  0.124 -0.764       
## mn_hyprny:_  0.483 -0.708 -0.962 -0.044 -0.072 -0.134  0.001