Predicting delta words at various lags based on age bins

Merge and tidy the data
Fixed effect model
Mixed-effect model
- Lag = 1
- All lags

library(knitr)

opts_chunk$set(echo = T, message = F, warning = F, 
               error = F, cache = F, tidy = F)

library(tidyverse)
library(langcog)
library(feather)
library(broom)
library(lme4)

theme_set(theme_classic(base_size = 10))

Merge and tidy the data

item_key <- read_csv("data/item_key.csv")  %>%
  mutate(num_item_id = as.character(num_item_id))

item_data <- read_csv("data/item_data.csv") %>%
  select(1,4) %>%
  select(num_item_id, category) %>%
  mutate(num_item_id = as.character(num_item_id))

word_bank_hyper <- read_csv("data/wordbank_hypernyms.csv") %>%
  filter(uni_lemma != "feet") %>%
  select(uni_lemma, hypernyms)  %>%
  left_join(item_key %>% select(uni_lemma, num_item_id), by = "uni_lemma") %>%
  left_join(item_data, by = "num_item_id")   %>%
  select(num_item_id, uni_lemma, category, hypernyms) %>%
  mutate_if(is.character, as.factor) %>%
  ungroup()

mcdi_path <- "../7_mcdi/data/train_sample_longitud_mcdi.csv"
mcdi_path2 <- "../7_mcdi/data/test_sample_longitud_mcdi.csv"

cdi_data <- read_csv(mcdi_path) %>%
  bind_rows(read_csv(mcdi_path2)) %>%
  select(-study_id, -study, -birthday, -session_date, -total_num_sessions,
         -num_langs, -hard_of_hearing, -mcdi_type, -languages, -extra_categories, -gender) %>%
   arrange(child_id, session_num)

# get produced words by kid
produced_words <- read_csv("data/produced_words_tidy.csv") %>%
    mutate_if(is.character, as.factor)  %>%
    mutate(num_item_id = as.factor(num_item_id))

freqs <- read_csv("../3_kid_vocabs/data/childes_adult_word_freq.csv") %>%
  select(word, log_freq)

Get age bins. 1 month bins 16 - 32 months.

cdi_data %>%
  distinct(child_id, age) %>%
  ggplot(aes(x = age)) +
  geom_histogram()

cdi_data %>%
  distinct(child_id, age) %>%
  mutate(age_bin = cut(age, breaks = seq(16, 32, 1))) %>%
  ggplot(aes(x = age_bin)) +
  geom_histogram(stat = "count")

# get mean hypernyms score by kid
hypernyms_score_by_kid <- produced_words %>%
  mutate(age_bin = cut(age, breaks = seq(16, 32, 1))) %>%
  filter(!is.na(age_bin)) %>%
  left_join(word_bank_hyper, by = "num_item_id") %>%
  left_join(freqs, by = c("item_clean" = "word")) %>%
  group_by(child_id, session_num, age_bin) %>%
  summarize(mean_hypernyms = mean(hypernyms, na.rm = TRUE),
            mean_freq = mean(log_freq, na.rm = T)) %>%
  group_by(child_id, age_bin) %>% # deal with any cases where a kid has two sessions per age bin
  summarize(mean_hypernyms = mean(mean_hypernyms, na.rm = TRUE),
            mean_freq = mean(mean_freq, na.rm = T))

#Get time point data
# timepoint data
demographic_data <- cdi_data %>%
  select(-item, -value) %>%
  distinct(child_id, session_num, .keep_all = T)  %>%
  select(-session_num) %>%
  mutate(age_bin = cut(age, breaks = seq(16, 32, 1))) %>%
  filter(!is.na(age_bin)) %>%
  group_by(child_id, age_bin)  %>%
  summarize_all(mean)

lag_session_words_spoken <- demographic_data %>%
  mutate(one_subsequent_words_spoken = lead(words_spoken, 1), by = "age_bin",
         two_subsequent_words_spoken = lead(words_spoken, 2), by = "age_bin",
         three_subsequent_words_spoken = lead(words_spoken, 3), by = "age_bin",
         four_subsequent_words_spoken = lead(words_spoken, 4), by = "age_bin",
         five_subsequent_words_spoken = lead(words_spoken, 5), by = "age_bin",
         one_delta = one_subsequent_words_spoken - words_spoken,
         two_delta = two_subsequent_words_spoken - words_spoken,
         three_delta = three_subsequent_words_spoken - words_spoken,
         four_delta = four_subsequent_words_spoken - words_spoken,
         five_delta = five_subsequent_words_spoken - words_spoken) %>%
  select(child_id, age_bin, words_spoken, contains("delta"))  %>%
  gather("lag", "delta_words", -child_id, -age_bin, -words_spoken)

lag_session_age <- demographic_data %>%
  mutate(one_subsequent_age = lead(age, 1), by = "age_bin",
         two_subsequent_age = lead(age, 2), by = "age_bin",
         three_subsequent_age = lead(age, 3), by = "age_bin",
         four_subsequent_age = lead(age, 4), by = "age_bin",
         five_subsequent_age = lead(age, 5), by = "age_bin",
         one_delta = one_subsequent_age - age,
         two_delta = two_subsequent_age - age,
         three_delta = three_subsequent_age - age,
         four_delta = four_subsequent_age - age,
         five_delta = five_subsequent_age - age) %>%
  select(child_id, age_bin, contains("delta")) %>%
  gather("lag", "delta_age", -child_id, -age_bin)

outcome_age_bin_readable <- demographic_data %>%
 ungroup()%>%
  distinct(age_bin) %>%
  rename(outcome_age_bin = age_bin) %>%
  mutate(outcome_age_bin_numeric = as.numeric(outcome_age_bin))

# join together       
full_df <- hypernyms_score_by_kid %>%
   left_join(lag_session_words_spoken) %>%
   left_join(lag_session_age) %>%
   mutate(lag_num = case_when(lag == "one_delta" ~ 1, 
                             lag == "two_delta" ~ 2,
                             lag == "three_delta" ~ 3,
                             lag == "four_delta" ~ 4,
                             lag == "five_delta" ~ 5),
          predictor_age_bin_numeric = as.numeric(age_bin)) %>%
  filter(!is.na(delta_words)) %>% # get rid of cases where delta session doesn't exisit
  rename(predictor_age_bin = age_bin) %>%
  mutate(outcome_age_bin_numeric = predictor_age_bin_numeric + lag_num)  %>%
  left_join(outcome_age_bin_readable, by = "outcome_age_bin_numeric")  %>%
  select(child_id, predictor_age_bin, predictor_age_bin_numeric, lag, lag_num, outcome_age_bin, outcome_age_bin_numeric, mean_hypernyms, mean_freq, words_spoken, delta_words, delta_age)

The data:

glimpse(full_df)

## Observations: 3,410
## Variables: 12
## Groups: child_id [194]
## $ child_id                  <chr> "4139_LTP102", "4139_LTP102", "4139_LT…
## $ predictor_age_bin         <fct> "(16,17]", "(16,17]", "(16,17]", "(16,…
## $ predictor_age_bin_numeric <dbl> 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 4, 4, 4,…
## $ lag                       <chr> "one_delta", "two_delta", "three_delta…
## $ lag_num                   <dbl> 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3,…
## $ outcome_age_bin           <fct> "(17,18]", "(18,19]", "(19,20]", "(20,…
## $ outcome_age_bin_numeric   <dbl> 2, 3, 4, 5, 6, 4, 5, 6, 7, 8, 5, 6, 7,…
## $ mean_hypernyms            <dbl> 9.397260, 9.397260, 9.397260, 9.397260…
## $ mean_freq                 <dbl> 7.814661, 7.814661, 7.814661, 7.814661…
## $ words_spoken              <dbl> 94, 94, 94, 94, 94, 153, 153, 153, 153…
## $ delta_words               <dbl> 59, 73, 98, 143, 194, 14, 39, 84, 135,…
## $ delta_age                 <dbl> 1.1176857, 2.2024984, 3.1886917, 4.142…

Fixed effect model

Lag = 1

lm(delta_words ~ mean_hypernyms + delta_age +  mean_freq + words_spoken + predictor_age_bin_numeric, 
   data = full_df %>% filter(lag_num == 1)) %>%
  summary()

## 
## Call:
## lm(formula = delta_words ~ mean_hypernyms + delta_age + mean_freq + 
##     words_spoken + predictor_age_bin_numeric, data = full_df %>% 
##     filter(lag_num == 1))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -310.78  -25.20   -5.98   16.58  399.21 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               501.58041   42.33756  11.847  < 2e-16 ***
## mean_hypernyms            -14.94045    2.45741  -6.080  1.7e-09 ***
## delta_age                  41.04791    2.39883  17.112  < 2e-16 ***
## mean_freq                 -43.68708    5.11556  -8.540  < 2e-16 ***
## words_spoken               -0.12135    0.01381  -8.786  < 2e-16 ***
## predictor_age_bin_numeric   0.26137    0.64673   0.404    0.686    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 47.15 on 1022 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.3016, Adjusted R-squared:  0.2981 
## F-statistic: 88.25 on 5 and 1022 DF,  p-value: < 2.2e-16

All lags

lm(delta_words ~ mean_hypernyms + lag_num + delta_age +  mean_freq + words_spoken + predictor_age_bin_numeric, 
   data = full_df) %>%
  summary()

## 
## Call:
## lm(formula = delta_words ~ mean_hypernyms + lag_num + delta_age + 
##     mean_freq + words_spoken + predictor_age_bin_numeric, data = full_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -340.04  -45.26   -5.56   36.50  465.12 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               889.60733   37.75619  23.562  < 2e-16 ***
## mean_hypernyms            -28.29170    2.20311 -12.842  < 2e-16 ***
## lag_num                     8.08063    2.28280   3.540 0.000406 ***
## delta_age                  38.62866    1.59895  24.159  < 2e-16 ***
## mean_freq                 -74.91100    4.52319 -16.562  < 2e-16 ***
## words_spoken               -0.30367    0.01342 -22.626  < 2e-16 ***
## predictor_age_bin_numeric   2.16025    0.65342   3.306 0.000956 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 80.65 on 3397 degrees of freedom
##   (6 observations deleted due to missingness)
## Multiple R-squared:  0.5691, Adjusted R-squared:  0.5684 
## F-statistic: 747.8 on 6 and 3397 DF,  p-value: < 2.2e-16

With lag interaction term

lm(delta_words ~ mean_hypernyms*lag_num + delta_age +  mean_freq + words_spoken + predictor_age_bin_numeric, 
   data = full_df) %>%
  summary()

## 
## Call:
## lm(formula = delta_words ~ mean_hypernyms * lag_num + delta_age + 
##     mean_freq + words_spoken + predictor_age_bin_numeric, data = full_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -351.44  -45.34   -5.08   36.32  468.28 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               1025.36066   46.66692  21.972  < 2e-16 ***
## mean_hypernyms             -42.60266    3.64531 -11.687  < 2e-16 ***
## lag_num                    -38.58993    9.75858  -3.954 7.83e-05 ***
## delta_age                   38.33434    1.59464  24.039  < 2e-16 ***
## mean_freq                  -76.04209    4.51369 -16.847  < 2e-16 ***
## words_spoken                -0.31041    0.01345 -23.086  < 2e-16 ***
## predictor_age_bin_numeric    2.03122    0.65173   3.117  0.00184 ** 
## mean_hypernyms:lag_num       5.37255    1.09242   4.918 9.16e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 80.38 on 3396 degrees of freedom
##   (6 observations deleted due to missingness)
## Multiple R-squared:  0.5722, Adjusted R-squared:  0.5713 
## F-statistic: 648.8 on 7 and 3396 DF,  p-value: < 2.2e-16

missing data are from cases where we don’t have hypernyms for vocabulary (e.g. kid only knows two onomatopoeia words)

Coefficents by predictor age x outcome age bin

These are from fitting a separate model for each predictor age x outocme age combo.

by_session_lag_hypernym_beta <- full_df %>%
 # mutate_at(vars("mean_hypernyms", "mean_freq", "words_spoken", "delta_age", "delta_words"), scale) %>%
  group_by(predictor_age_bin, outcome_age_bin, lag_num) %>%
  nest() %>%
  mutate(temp = map(data, ~ tidy(lm(delta_words ~ mean_hypernyms + delta_age + 
                                      mean_freq + words_spoken, data = .x))),
         num_obs = map_dbl(data, nrow)) %>%
  select(-data) %>%
  unnest()  %>%
  drop_na() %>%
  filter(term == "mean_hypernyms") %>%
  mutate(is_significant = p.value < .05)

ggplot(by_session_lag_hypernym_beta, 
       aes(x = predictor_age_bin, 
           y = estimate, 
           group = outcome_age_bin, 
           shape = is_significant)) +  
  geom_hline(aes(yintercept = 0), lty = 2, color = "grey") +
  geom_line() +
  ylab("estimate of effect of mean hyperyms on delta words") +
  geom_point(size = 2) +
  facet_wrap(~outcome_age_bin) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1),
        legend.position = "bottom")

ggplot(by_session_lag_hypernym_beta, 
       aes(x = outcome_age_bin, 
           y = estimate, 
           group = lag_num, 
           shape = is_significant)) +  
  geom_hline(aes(yintercept = 0), lty = 2, color = "grey") +
  facet_wrap(~lag_num) +
  geom_line() +
  ylab("estimate of effect of mean hyperyms on delta words") +
  geom_point(aes(color = as.factor(lag_num)), size = 2) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1),
        legend.position = "bottom")

Mixed-effect model

Lag = 1

lmer_model <- 
lmer(delta_words ~ mean_hypernyms + delta_age +  mean_freq + words_spoken + predictor_age_bin_numeric+ (1|child_id), full_df %>% filter(lag_num == 1))

 summary(lmer_model)

## Linear mixed model fit by REML ['lmerMod']
## Formula: 
## delta_words ~ mean_hypernyms + delta_age + mean_freq + words_spoken +  
##     predictor_age_bin_numeric + (1 | child_id)
##    Data: full_df %>% filter(lag_num == 1)
## 
## REML criterion at convergence: 10826
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -6.5876 -0.5270 -0.1242  0.3521  8.4408 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  child_id (Intercept)   10.22   3.196  
##  Residual             2213.34  47.046  
## Number of obs: 1028, groups:  child_id, 194
## 
## Fixed effects:
##                           Estimate Std. Error t value
## (Intercept)               505.2929    42.5572  11.873
## mean_hypernyms            -15.0373     2.4698  -6.088
## delta_age                  41.0234     2.3988  17.101
## mean_freq                 -44.0367     5.1527  -8.546
## words_spoken               -0.1228     0.0139  -8.836
## predictor_age_bin_numeric   0.3018     0.6505   0.464
## 
## Correlation of Fixed Effects:
##             (Intr) mn_hyp delt_g mn_frq wrds_s
## mn_hyprnyms -0.311                            
## delta_age   -0.078 -0.008                     
## mean_freq   -0.837 -0.251  0.016              
## words_spokn -0.590  0.512 -0.013  0.319       
## prdctr_g_b_  0.000  0.043 -0.042 -0.080 -0.503

All lags

lmer_model <- 
lmer(delta_words ~ mean_hypernyms+lag_num + delta_age +  mean_freq + words_spoken + predictor_age_bin_numeric + (1|child_id), full_df) 

summary(lmer_model)

## Linear mixed model fit by REML ['lmerMod']
## Formula: delta_words ~ mean_hypernyms + lag_num + delta_age + mean_freq +  
##     words_spoken + predictor_age_bin_numeric + (1 | child_id)
##    Data: full_df
## 
## REML criterion at convergence: 37927.2
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.4979 -0.5075  0.0018  0.5173  5.1389 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  child_id (Intercept) 9875     99.38   
##  Residual             3323     57.65   
## Number of obs: 3404, groups:  child_id, 194
## 
## Fixed effects:
##                            Estimate Std. Error t value
## (Intercept)               794.12344   44.64368  17.788
## mean_hypernyms            -29.29887    2.39281 -12.245
## lag_num                    14.00618    2.03003   6.899
## delta_age                  34.34593    1.45091  23.672
## mean_freq                 -65.73870    5.61926 -11.699
## words_spoken               -0.80073    0.01761 -45.463
## predictor_age_bin_numeric  26.33288    0.91816  28.680
## 
## Correlation of Fixed Effects:
##             (Intr) mn_hyp lag_nm delt_g mn_frq wrds_s
## mn_hyprnyms -0.194                                   
## lag_num     -0.015  0.001                            
## delta_age   -0.013  0.012 -0.918                     
## mean_freq   -0.851 -0.321 -0.012  0.012              
## words_spokn -0.128  0.210  0.026 -0.019  0.034       
## prdctr_g_b_ -0.261  0.175  0.073 -0.019  0.124 -0.771

with lag interaction term:

lmer_model <- 
lmer(delta_words ~ mean_hypernyms*lag_num + delta_age +  mean_freq + words_spoken + predictor_age_bin_numeric + (1|child_id), full_df) 

summary(lmer_model)

## Linear mixed model fit by REML ['lmerMod']
## Formula: delta_words ~ mean_hypernyms * lag_num + delta_age + mean_freq +  
##     words_spoken + predictor_age_bin_numeric + (1 | child_id)
##    Data: full_df
## 
## REML criterion at convergence: 37879.6
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.4711 -0.5086  0.0020  0.5125  4.2589 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  child_id (Intercept) 9677     98.37   
##  Residual             3281     57.28   
## Number of obs: 3404, groups:  child_id, 194
## 
## Fixed effects:
##                            Estimate Std. Error t value
## (Intercept)               961.49558   50.65418  18.982
## mean_hypernyms            -45.54624    3.36481 -13.536
## lag_num                   -34.10388    7.33943  -4.647
## delta_age                  33.91757    1.44289  23.507
## mean_freq                 -68.52758    5.59674 -12.244
## words_spoken               -0.81646    0.01765 -46.247
## predictor_age_bin_numeric  26.31629    0.91193  28.858
## mean_hypernyms:lag_num      5.56527    0.81634   6.817
## 
## Correlation of Fixed Effects:
##             (Intr) mn_hyp lag_nm delt_g mn_frq wrds_s prd___
## mn_hyprnyms -0.462                                          
## lag_num     -0.468  0.681                                   
## delta_age   -0.033  0.040 -0.210                            
## mean_freq   -0.778 -0.175  0.066  0.016                     
## words_spokn -0.176  0.242  0.136 -0.013  0.043              
## prdctr_g_b_ -0.227  0.123  0.019 -0.019  0.124 -0.764       
## mn_hyprny:_  0.483 -0.708 -0.962 -0.044 -0.072 -0.134  0.001

Predicting delta words at various lags based on age bins

CDI

2019-05-22

Merge and tidy the data

Fixed effect model

Lag = 1

All lags

Coefficents by predictor age x outcome age bin

Mixed-effect model

Lag = 1

All lags