Childes data

This analysis includes all children in the CHILDES NA and UK corpora who have more than one timepoint between 12 and 36 months of age. Two things differ from the previous analyses: (1) I’m using a different age range of kids, and (2) I’m using the MTLD measure in the childesr package. This measure is new to the package and not well documented.

# Input CSV locations, resolved with here().
# IND_VARS_*     : per-transcript child statistics (one file per collection).
# PREDICT_VARS_* : mean hypernym measures (one file per collection).
IND_VARS_NA <- here(
  "exploratory_analyses/17_taxonomic_childes/data/child_stats_eng_na.csv"
)
PREDICT_VARS_NA <- here(
  "exploratory_analyses/17_taxonomic_childes/data/mean_hypernym_eng_na_complete.csv"
)
IND_VARS_UK <- here(
  "exploratory_analyses/17_taxonomic_childes/data/child_stats_eng_uk.csv"
)
PREDICT_VARS_UK <- here(
  "exploratory_analyses/17_taxonomic_childes/data/mean_hypernym_eng_uk_complete.csv"
)


# Column names applied to both child-stats files. Because `col_names` is
# supplied explicitly, read_csv() treats the first line of each file as data.
child_stats_cols <- c(
  "corpus", "target_child_name", "target_child_age",
  "num_utterances", "mlu_w", "num_types", "num_tokens",
  "hdd", "mtld", "mlu_m", "num_morphemes"
)

# North American collection. Note that "NA" here is a literal string label,
# not a missing value.
independent_vars_na <- IND_VARS_NA %>%
  read_csv(col_names = child_stats_cols) %>%
  mutate(collection = "NA")
predictor_vars_na <- read_csv(PREDICT_VARS_NA)

# UK collection, read with the same column layout.
independent_vars_uk <- IND_VARS_UK %>%
  read_csv(col_names = child_stats_cols) %>%
  mutate(collection = "UK")
predictor_vars_uk <- read_csv(PREDICT_VARS_UK)


# Stack the child statistics from both collections, then average every
# numeric (non-grouping) column within collection x child x age x corpus so
# that multiple transcripts at the same age point collapse into one row.
tidy_ind_vars <- independent_vars_uk %>%
  bind_rows(independent_vars_na) %>%
  group_by(collection, target_child_name, target_child_age, corpus) %>%
  summarise_if(is.numeric, mean)

# Stack the predictor (hypernym) tables from both collections.
tidy_pred_vars <- bind_rows(predictor_vars_na, predictor_vars_uk)

# Keep only rows present in both tables. No `by` is given, so the join keys
# are all column names the two tables have in common.
full_df <- tidy_ind_vars %>%
  inner_join(tidy_pred_vars)

There are 232 children who satisfy these criteria.

# Children (identified by corpus + name) that have more than one row in
# full_df, i.e. more than one age point once transcripts at the same
# timepoint have been collapsed.
multiple_transcript_children <- full_df %>%
  ungroup() %>%
  count(corpus, target_child_name) %>%
  filter(n >= 2) %>%
  select(corpus, target_child_name)

# Restrict full_df to the children with several timepoints, then add:
#   session_num - running index of the child's rows in age order
#   child_id    - "<corpus>_<target_child_name>"
# The result stays grouped by corpus and target_child_name.
full_df_by_session <- full_df %>%
  right_join(multiple_transcript_children) %>%
  arrange(corpus, target_child_name, target_child_age) %>%
  group_by(corpus, target_child_name) %>%
  mutate(
    session_num = row_number(),
    child_id = paste(corpus, target_child_name, sep = "_")
  ) %>%
  select(child_id, target_child_age, session_num, everything())

# Pair every session with the same child's next session and compute the change
# between them. full_df_by_session is grouped by corpus and target_child_name,
# so lead() never reaches across children; each child's last session gets NA
# for the subsequent_* and delta_* columns.
#
# The ordering is passed to lead() via `order_by`. Writing `by = session_num`
# as a separate mutate() argument does not order anything: mutate() treats it
# as a new column definition and adds a stray column called `by`.
full_df_with_delta <- full_df_by_session %>%
  mutate(
    subsequent_age = lead(target_child_age, order_by = session_num),
    subsequent_mtld = lead(mtld, order_by = session_num),
    subsequent_tokens = lead(num_tokens, order_by = session_num),
    delta_age = subsequent_age - target_child_age,
    delta_mtld = subsequent_mtld - mtld
  )

Parent vs child hypernyms

# Plot the predictor table with the age column dropped.
# make_corr_plot() is defined elsewhere; presumably it draws pairwise
# correlations between the remaining columns.
tidy_pred_vars %>%
  select(-target_child_age) %>%
  make_corr_plot()

Models predicting vocab size with previous mean vocab hypernym

These models predict the change in mtld from the current timepoint to the next timepoint as a function of: (1) the child’s/parent’s mean hypernym level of vocab, (2) change in age from timepoint 1 to timepoint 2, (3) mtld at timepoint 1, (4) log number of child tokens at timepoint 1, and (5) log number of child tokens at timepoint 2.

If we think the number-of-tokens measures are important - which I’m not sure they are (token count is uncorrelated with mtld) - then we should probably also control for the parent tokens when using parent predictors.

MTLD

hypernyms all

CHILD

# Child model: change in mtld predicted by the child's hypernym score,
# controlling for age gap, current mtld and log token counts, with a
# by-child random intercept and random slope for session number.
lmer(
  delta_mtld ~ child_hypernyms_all + delta_age + mtld +
    log(num_tokens) + log(subsequent_tokens) +
    (session_num | child_id),
  data = full_df_with_delta,
  control = lmerControl(optimizer = "bobyqa")
) %>%
  tidy() %>%
  kable()
term estimate std.error statistic group
(Intercept) 3.3323433 0.3904752 8.5340706 fixed
child_hypernyms_all -0.5183051 0.1092993 -4.7420719 fixed
delta_age 0.1943016 0.0250413 7.7592485 fixed
mtld -0.7833235 0.0203674 -38.4596022 fixed
log(num_tokens) 0.2125462 0.0492386 4.3166567 fixed
log(subsequent_tokens) -0.0117948 0.0497353 -0.2371512 fixed
sd_(Intercept).child_id 1.1175005 NA NA child_id
sd_session_num.child_id 0.0687533 NA NA child_id
cor_(Intercept).session_num.child_id -0.2560401 NA NA child_id
sd_Observation.Residual 1.0433448 NA NA Residual
# Child model: same controls and random-effects structure, with the child's
# hyponym score as the predictor of interest.
lmer(
  delta_mtld ~ child_hyponyms_all + delta_age + mtld +
    log(num_tokens) + log(subsequent_tokens) +
    (session_num | child_id),
  data = full_df_with_delta,
  control = lmerControl(optimizer = "bobyqa")
) %>%
  tidy() %>%
  kable()
term estimate std.error statistic group
(Intercept) 2.8183276 0.3778911 7.4580410 fixed
child_hyponyms_all 0.2106510 0.1147148 1.8363010 fixed
delta_age 0.1831099 0.0253831 7.2138487 fixed
mtld -0.7764377 0.0202537 -38.3356489 fixed
log(num_tokens) 0.2491810 0.0485964 5.1275635 fixed
log(subsequent_tokens) 0.0006225 0.0498039 0.0124981 fixed
sd_(Intercept).child_id 1.1688126 NA NA child_id
sd_session_num.child_id 0.0727278 NA NA child_id
cor_(Intercept).session_num.child_id -0.2686045 NA NA child_id
sd_Observation.Residual 1.0443963 NA NA Residual
# Child model: hypernym and hyponym scores entered together, same controls
# and random-effects structure.
lmer(
  delta_mtld ~ child_hypernyms_all + child_hyponyms_all +
    delta_age + mtld +
    log(num_tokens) + log(subsequent_tokens) +
    (session_num | child_id),
  data = full_df_with_delta,
  control = lmerControl(optimizer = "bobyqa")
) %>%
  tidy() %>%
  kable()
term estimate std.error statistic group
(Intercept) 3.3047540 0.3951407 8.3634875 fixed
child_hypernyms_all -0.5025941 0.1145531 -4.3874343 fixed
child_hyponyms_all 0.0545827 0.1196573 0.4561584 fixed
delta_age 0.1937328 0.0250973 7.7192698 fixed
mtld -0.7833587 0.0203738 -38.4492643 fixed
log(num_tokens) 0.2125195 0.0492468 4.3153930 fixed
log(subsequent_tokens) -0.0123721 0.0497565 -0.2486535 fixed
sd_(Intercept).child_id 1.1179539 NA NA child_id
sd_session_num.child_id 0.0688456 NA NA child_id
cor_(Intercept).session_num.child_id -0.2524352 NA NA child_id
sd_Observation.Residual 1.0434429 NA NA Residual

PARENT

# Parent model: change in the child's mtld predicted by the parent's hypernym
# score, with the same child-level controls and random-effects structure.
lmer(
  delta_mtld ~ parent_hypernyms_all + delta_age + mtld +
    log(num_tokens) + log(subsequent_tokens) +
    (session_num | child_id),
  data = full_df_with_delta,
  control = lmerControl(optimizer = "bobyqa")
) %>%
  tidy() %>%
  kable()
term estimate std.error statistic group
(Intercept) 2.4674642 0.4024927 6.1304578 fixed
parent_hypernyms_all -0.0650035 0.1245202 -0.5220318 fixed
delta_age 0.1888674 0.0244832 7.7141621 fixed
mtld -0.7459717 0.0207794 -35.8994946 fixed
log(num_tokens) 0.2825264 0.0526897 5.3620841 fixed
log(subsequent_tokens) 0.0224394 0.0527036 0.4257658 fixed
sd_(Intercept).child_id 1.0751953 NA NA child_id
sd_session_num.child_id 0.0559395 NA NA child_id
cor_(Intercept).session_num.child_id -0.5396342 NA NA child_id
sd_Observation.Residual 1.0432358 NA NA Residual
# Parent model: the parent's hyponym score as the predictor of interest,
# same controls and random-effects structure.
lmer(
  delta_mtld ~ parent_hyponyms_all + delta_age + mtld +
    log(num_tokens) + log(subsequent_tokens) +
    (session_num | child_id),
  data = full_df_with_delta,
  control = lmerControl(optimizer = "bobyqa")
) %>%
  tidy() %>%
  kable()
term estimate std.error statistic group
(Intercept) 2.5826824 0.4115205 6.2759501 fixed
parent_hyponyms_all -0.1483186 0.1329605 -1.1155089 fixed
delta_age 0.1884776 0.0245008 7.6927000 fixed
mtld -0.7474853 0.0207598 -36.0064700 fixed
log(num_tokens) 0.2797349 0.0526643 5.3116591 fixed
log(subsequent_tokens) 0.0226011 0.0526284 0.4294458 fixed
sd_(Intercept).child_id 1.0802898 NA NA child_id
sd_session_num.child_id 0.0564595 NA NA child_id
cor_(Intercept).session_num.child_id -0.5403671 NA NA child_id
sd_Observation.Residual 1.0425709 NA NA Residual
# Parent model: parent hypernym and hyponym scores entered together, same
# controls and random-effects structure.
lmer(
  delta_mtld ~ parent_hypernyms_all + parent_hyponyms_all +
    delta_age + mtld +
    log(num_tokens) + log(subsequent_tokens) +
    (session_num | child_id),
  data = full_df_with_delta,
  control = lmerControl(optimizer = "bobyqa")
) %>%
  tidy() %>%
  kable()
term estimate std.error statistic group
(Intercept) 2.5784859 0.4113696 6.2680515 fixed
parent_hypernyms_all -0.1000220 0.1275977 -0.7838852 fixed
parent_hyponyms_all -0.1715770 0.1363207 -1.2586281 fixed
delta_age 0.1890361 0.0244940 7.7176449 fixed
mtld -0.7462129 0.0207790 -35.9119046 fixed
log(num_tokens) 0.2813218 0.0526892 5.3392684 fixed
log(subsequent_tokens) 0.0206155 0.0527131 0.3910892 fixed
sd_(Intercept).child_id 1.0774944 NA NA child_id
sd_session_num.child_id 0.0561263 NA NA child_id
cor_(Intercept).session_num.child_id -0.5404173 NA NA child_id
sd_Observation.Residual 1.0429218 NA NA Residual

BOTH

# Combined model: child and parent hypernym and hyponym scores entered
# together, with the same controls and random-effects structure.
lmer(
  delta_mtld ~ child_hypernyms_all + parent_hypernyms_all +
    child_hyponyms_all + parent_hyponyms_all +
    delta_age + mtld +
    log(num_tokens) + log(subsequent_tokens) +
    (session_num | child_id),
  data = full_df_with_delta,
  control = lmerControl(optimizer = "bobyqa")
) %>%
  tidy() %>%
  kable()
term estimate std.error statistic group
(Intercept) 3.1224218 0.4225784 7.3889761 fixed
child_hypernyms_all -0.6404351 0.1221951 -5.2410858 fixed
parent_hypernyms_all 0.0597762 0.1318017 0.4535316 fixed
child_hyponyms_all 0.1108226 0.1261206 0.8787035 fixed
parent_hyponyms_all -0.1663104 0.1390458 -1.1960842 fixed
delta_age 0.2018054 0.0240914 8.3766579 fixed
mtld -0.7613316 0.0209726 -36.3012357 fixed
log(num_tokens) 0.2267443 0.0533142 4.2529815 fixed
log(subsequent_tokens) 0.0030475 0.0525001 0.0580475 fixed
sd_(Intercept).child_id 1.0015465 NA NA child_id
sd_session_num.child_id 0.0544134 NA NA child_id
cor_(Intercept).session_num.child_id -0.5012903 NA NA child_id
sd_Observation.Residual 1.0395933 NA NA Residual