This analysis covers all children in the NA (North American) and UK CHILDES corpora who have more than one timepoint between 12 and 36 months of age. Two things differ from the previous analyses: (1) I’m using a different age range of children, and (2) I’m using the MTLD measure from the childesr package. This measure is new to the package and not well documented.
# Locations of the child-statistics and mean-hypernym CSV files, one pair per
# collection (NA and UK).
IND_VARS_NA <- here("exploratory_analyses/17_taxonomic_childes/data/child_stats_eng_na.csv")
PREDICT_VARS_NA <- here("exploratory_analyses/17_taxonomic_childes/data/mean_hypernym_eng_na_complete.csv")
IND_VARS_UK <- here("exploratory_analyses/17_taxonomic_childes/data/child_stats_eng_uk.csv")
PREDICT_VARS_UK <- here("exploratory_analyses/17_taxonomic_childes/data/mean_hypernym_eng_uk_complete.csv")

# Column names handed to read_csv() for both child-statistics files, so the
# two collections are read with an identical layout.
child_stats_cols <- c(
  "corpus", "target_child_name", "target_child_age",
  "num_utterances", "mlu_w", "num_types", "num_tokens", "hdd", "mtld", "mlu_m",
  "num_morphemes"
)

# Each child-statistics table is tagged with the collection it came from.
# Note that "NA" here is the literal collection label, not a missing value.
independent_vars_na <- mutate(
  read_csv(IND_VARS_NA, col_names = child_stats_cols),
  collection = "NA"
)
predictor_vars_na <- read_csv(PREDICT_VARS_NA)

independent_vars_uk <- mutate(
  read_csv(IND_VARS_UK, col_names = child_stats_cols),
  collection = "UK"
)
predictor_vars_uk <- read_csv(PREDICT_VARS_UK)
# Stack the predictor tables of the two collections.
tidy_pred_vars <- bind_rows(predictor_vars_na, predictor_vars_uk)

# Stack the child statistics (UK rows first, then NA) and average every
# numeric column within collection / child / age / corpus, which collapses
# multiple transcripts recorded at the same age point into one row.
tidy_ind_vars <- summarize_if(
  group_by(
    bind_rows(independent_vars_uk, independent_vars_na),
    collection, target_child_name, target_child_age, corpus
  ),
  is.numeric,
  mean
)

# No `by` is given, so the join keys are whatever columns the two tables share.
full_df <- inner_join(tidy_ind_vars, tidy_pred_vars)

There are 232 children that satisfy these criteria.
# Children (corpus + name) that contribute at least two rows to full_df, i.e.
# at least two distinct age points once same-age transcripts are collapsed.
multiple_transcript_children <- full_df %>%
  ungroup() %>%
  count(corpus, target_child_name) %>%
  filter(n >= 2) %>%
  select(corpus, target_child_name)

# Keep only those children, order each child's rows by age, then number the
# sessions within child and build a child identifier from corpus and name.
# The result stays grouped by corpus and target_child_name.
full_df_by_session <- right_join(full_df, multiple_transcript_children) %>%
  arrange(corpus, target_child_name, target_child_age) %>%
  group_by(corpus, target_child_name) %>%
  mutate(
    session_num = row_number(),
    child_id = paste(corpus, target_child_name, sep = "_")
  ) %>%
  select(child_id, target_child_age, session_num, everything())
# For every session, attach the same child's values from the following
# session (age, mtld, token count) and the change in age and mtld between the
# two sessions.
#
# full_df_by_session is grouped by corpus and target_child_name, so lead()
# looks ahead within one child only; a child's last session has no successor
# and gets NA in the subsequent_* and delta_* columns.
#
# order_by is passed to lead() itself so that "next session" is defined by
# session_num rather than by the current row order. (Written as a separate
# `by = session_num` argument to mutate(), it would only create a column
# called `by` and have no effect on lead().)
full_df_with_delta <- full_df_by_session %>%
  mutate(
    subsequent_age = lead(target_child_age, order_by = session_num),
    subsequent_mtld = lead(mtld, order_by = session_num),
    subsequent_tokens = lead(num_tokens, order_by = session_num),
    delta_age = subsequent_age - target_child_age,
    delta_mtld = subsequent_mtld - mtld
  )

These models are predicting change in mtld from the current timepoint to the next timepoint as a function of: (1) the child’s/parent’s mean hypernym level of vocab, (2) change in age from timepoint 1 to 2, (3) mtld at timepoint 1, (4) log number of child tokens at timepoint 1, and (5) log number of child tokens at timepoint 2.
If we think the number-of-tokens measures are important — which I’m not sure they are (token count is uncorrelated with mtld) — then we should probably also control for parent tokens when using the parent predictors.
CHILD
# delta_mtld modelled on child_hypernyms_all plus the shared controls
# (delta_age, mtld, log token counts at both timepoints), with a random
# intercept and session_num slope per child_id; rendered as a table.
kable(
  tidy(
    lmer(
      delta_mtld ~ child_hypernyms_all + delta_age + mtld +
        log(num_tokens) + log(subsequent_tokens) +
        (session_num | child_id),
      data = full_df_with_delta,
      control = lmerControl(optimizer = "bobyqa")
    )
  )
)

| term | estimate | std.error | statistic | group |
|---|---|---|---|---|
| (Intercept) | 3.3323433 | 0.3904752 | 8.5340706 | fixed |
| child_hypernyms_all | -0.5183051 | 0.1092993 | -4.7420719 | fixed |
| delta_age | 0.1943016 | 0.0250413 | 7.7592485 | fixed |
| mtld | -0.7833235 | 0.0203674 | -38.4596022 | fixed |
| log(num_tokens) | 0.2125462 | 0.0492386 | 4.3166567 | fixed |
| log(subsequent_tokens) | -0.0117948 | 0.0497353 | -0.2371512 | fixed |
| sd_(Intercept).child_id | 1.1175005 | NA | NA | child_id |
| sd_session_num.child_id | 0.0687533 | NA | NA | child_id |
| cor_(Intercept).session_num.child_id | -0.2560401 | NA | NA | child_id |
| sd_Observation.Residual | 1.0433448 | NA | NA | Residual |
# delta_mtld modelled on child_hyponyms_all plus the shared controls
# (delta_age, mtld, log token counts at both timepoints), with a random
# intercept and session_num slope per child_id; rendered as a table.
kable(
  tidy(
    lmer(
      delta_mtld ~ child_hyponyms_all + delta_age + mtld +
        log(num_tokens) + log(subsequent_tokens) +
        (session_num | child_id),
      data = full_df_with_delta,
      control = lmerControl(optimizer = "bobyqa")
    )
  )
)

| term | estimate | std.error | statistic | group |
|---|---|---|---|---|
| (Intercept) | 2.8183276 | 0.3778911 | 7.4580410 | fixed |
| child_hyponyms_all | 0.2106510 | 0.1147148 | 1.8363010 | fixed |
| delta_age | 0.1831099 | 0.0253831 | 7.2138487 | fixed |
| mtld | -0.7764377 | 0.0202537 | -38.3356489 | fixed |
| log(num_tokens) | 0.2491810 | 0.0485964 | 5.1275635 | fixed |
| log(subsequent_tokens) | 0.0006225 | 0.0498039 | 0.0124981 | fixed |
| sd_(Intercept).child_id | 1.1688126 | NA | NA | child_id |
| sd_session_num.child_id | 0.0727278 | NA | NA | child_id |
| cor_(Intercept).session_num.child_id | -0.2686045 | NA | NA | child_id |
| sd_Observation.Residual | 1.0443963 | NA | NA | Residual |
# delta_mtld modelled on both child predictors (child_hypernyms_all and
# child_hyponyms_all) plus the shared controls, with a random intercept and
# session_num slope per child_id; rendered as a table.
kable(
  tidy(
    lmer(
      delta_mtld ~ child_hypernyms_all + child_hyponyms_all +
        delta_age + mtld +
        log(num_tokens) + log(subsequent_tokens) +
        (session_num | child_id),
      data = full_df_with_delta,
      control = lmerControl(optimizer = "bobyqa")
    )
  )
)

| term | estimate | std.error | statistic | group |
|---|---|---|---|---|
| (Intercept) | 3.3047540 | 0.3951407 | 8.3634875 | fixed |
| child_hypernyms_all | -0.5025941 | 0.1145531 | -4.3874343 | fixed |
| child_hyponyms_all | 0.0545827 | 0.1196573 | 0.4561584 | fixed |
| delta_age | 0.1937328 | 0.0250973 | 7.7192698 | fixed |
| mtld | -0.7833587 | 0.0203738 | -38.4492643 | fixed |
| log(num_tokens) | 0.2125195 | 0.0492468 | 4.3153930 | fixed |
| log(subsequent_tokens) | -0.0123721 | 0.0497565 | -0.2486535 | fixed |
| sd_(Intercept).child_id | 1.1179539 | NA | NA | child_id |
| sd_session_num.child_id | 0.0688456 | NA | NA | child_id |
| cor_(Intercept).session_num.child_id | -0.2524352 | NA | NA | child_id |
| sd_Observation.Residual | 1.0434429 | NA | NA | Residual |
PARENT
# delta_mtld modelled on parent_hypernyms_all plus the shared controls
# (delta_age, mtld, log token counts at both timepoints), with a random
# intercept and session_num slope per child_id; rendered as a table.
kable(
  tidy(
    lmer(
      delta_mtld ~ parent_hypernyms_all + delta_age + mtld +
        log(num_tokens) + log(subsequent_tokens) +
        (session_num | child_id),
      data = full_df_with_delta,
      control = lmerControl(optimizer = "bobyqa")
    )
  )
)

| term | estimate | std.error | statistic | group |
|---|---|---|---|---|
| (Intercept) | 2.4674642 | 0.4024927 | 6.1304578 | fixed |
| parent_hypernyms_all | -0.0650035 | 0.1245202 | -0.5220318 | fixed |
| delta_age | 0.1888674 | 0.0244832 | 7.7141621 | fixed |
| mtld | -0.7459717 | 0.0207794 | -35.8994946 | fixed |
| log(num_tokens) | 0.2825264 | 0.0526897 | 5.3620841 | fixed |
| log(subsequent_tokens) | 0.0224394 | 0.0527036 | 0.4257658 | fixed |
| sd_(Intercept).child_id | 1.0751953 | NA | NA | child_id |
| sd_session_num.child_id | 0.0559395 | NA | NA | child_id |
| cor_(Intercept).session_num.child_id | -0.5396342 | NA | NA | child_id |
| sd_Observation.Residual | 1.0432358 | NA | NA | Residual |
# delta_mtld modelled on parent_hyponyms_all plus the shared controls
# (delta_age, mtld, log token counts at both timepoints), with a random
# intercept and session_num slope per child_id; rendered as a table.
kable(
  tidy(
    lmer(
      delta_mtld ~ parent_hyponyms_all + delta_age + mtld +
        log(num_tokens) + log(subsequent_tokens) +
        (session_num | child_id),
      data = full_df_with_delta,
      control = lmerControl(optimizer = "bobyqa")
    )
  )
)

| term | estimate | std.error | statistic | group |
|---|---|---|---|---|
| (Intercept) | 2.5826824 | 0.4115205 | 6.2759501 | fixed |
| parent_hyponyms_all | -0.1483186 | 0.1329605 | -1.1155089 | fixed |
| delta_age | 0.1884776 | 0.0245008 | 7.6927000 | fixed |
| mtld | -0.7474853 | 0.0207598 | -36.0064700 | fixed |
| log(num_tokens) | 0.2797349 | 0.0526643 | 5.3116591 | fixed |
| log(subsequent_tokens) | 0.0226011 | 0.0526284 | 0.4294458 | fixed |
| sd_(Intercept).child_id | 1.0802898 | NA | NA | child_id |
| sd_session_num.child_id | 0.0564595 | NA | NA | child_id |
| cor_(Intercept).session_num.child_id | -0.5403671 | NA | NA | child_id |
| sd_Observation.Residual | 1.0425709 | NA | NA | Residual |
# delta_mtld modelled on both parent predictors (parent_hypernyms_all and
# parent_hyponyms_all) plus the shared controls, with a random intercept and
# session_num slope per child_id; rendered as a table.
kable(
  tidy(
    lmer(
      delta_mtld ~ parent_hypernyms_all + parent_hyponyms_all +
        delta_age + mtld +
        log(num_tokens) + log(subsequent_tokens) +
        (session_num | child_id),
      data = full_df_with_delta,
      control = lmerControl(optimizer = "bobyqa")
    )
  )
)

| term | estimate | std.error | statistic | group |
|---|---|---|---|---|
| (Intercept) | 2.5784859 | 0.4113696 | 6.2680515 | fixed |
| parent_hypernyms_all | -0.1000220 | 0.1275977 | -0.7838852 | fixed |
| parent_hyponyms_all | -0.1715770 | 0.1363207 | -1.2586281 | fixed |
| delta_age | 0.1890361 | 0.0244940 | 7.7176449 | fixed |
| mtld | -0.7462129 | 0.0207790 | -35.9119046 | fixed |
| log(num_tokens) | 0.2813218 | 0.0526892 | 5.3392684 | fixed |
| log(subsequent_tokens) | 0.0206155 | 0.0527131 | 0.3910892 | fixed |
| sd_(Intercept).child_id | 1.0774944 | NA | NA | child_id |
| sd_session_num.child_id | 0.0561263 | NA | NA | child_id |
| cor_(Intercept).session_num.child_id | -0.5404173 | NA | NA | child_id |
| sd_Observation.Residual | 1.0429218 | NA | NA | Residual |
BOTH
# delta_mtld modelled on all four child and parent predictors together plus
# the shared controls, with a random intercept and session_num slope per
# child_id; rendered as a table.
kable(
  tidy(
    lmer(
      delta_mtld ~ child_hypernyms_all + parent_hypernyms_all +
        child_hyponyms_all + parent_hyponyms_all +
        delta_age + mtld +
        log(num_tokens) + log(subsequent_tokens) +
        (session_num | child_id),
      data = full_df_with_delta,
      control = lmerControl(optimizer = "bobyqa")
    )
  )
)

| term | estimate | std.error | statistic | group |
|---|---|---|---|---|
| (Intercept) | 3.1224218 | 0.4225784 | 7.3889761 | fixed |
| child_hypernyms_all | -0.6404351 | 0.1221951 | -5.2410858 | fixed |
| parent_hypernyms_all | 0.0597762 | 0.1318017 | 0.4535316 | fixed |
| child_hyponyms_all | 0.1108226 | 0.1261206 | 0.8787035 | fixed |
| parent_hyponyms_all | -0.1663104 | 0.1390458 | -1.1960842 | fixed |
| delta_age | 0.2018054 | 0.0240914 | 8.3766579 | fixed |
| mtld | -0.7613316 | 0.0209726 | -36.3012357 | fixed |
| log(num_tokens) | 0.2267443 | 0.0533142 | 4.2529815 | fixed |
| log(subsequent_tokens) | 0.0030475 | 0.0525001 | 0.0580475 | fixed |
| sd_(Intercept).child_id | 1.0015465 | NA | NA | child_id |
| sd_session_num.child_id | 0.0544134 | NA | NA | child_id |
| cor_(Intercept).session_num.child_id | -0.5012903 | NA | NA | child_id |
| sd_Observation.Residual | 1.0395933 | NA | NA | Residual |