Does hypernym level of vocab at t1 predict vocab size at t2?

Childes data

This is for all kids in CHILDES in the NA and UK corpora who have more than one timepoint between 12 and 36 months of age. There are two things that are different from the previous analyses: (1) I’m using a different age range of kids, and (2) I’m using the MTLD measure in the childesr package. This measure is new to the package and not well documented.

# Input files, resolved relative to the project root via here().
# IND_VARS_*: per-transcript child statistics; PREDICT_VARS_*: mean hypernym
# measures. One pair of files per collection (NA and UK).
IND_VARS_NA <- here("exploratory_analyses/17_taxonomic_childes/data/child_stats_eng_na.csv")
PREDICT_VARS_NA <- here("exploratory_analyses/17_taxonomic_childes/data/mean_hypernym_eng_na_complete.csv")
IND_VARS_UK <- here("exploratory_analyses/17_taxonomic_childes/data/child_stats_eng_uk.csv")
PREDICT_VARS_UK <- here("exploratory_analyses/17_taxonomic_childes/data/mean_hypernym_eng_uk_complete.csv")


# Column names supplied explicitly to read_csv() for the child-stats files;
# both collections use the same layout.
child_stats_col_names <- c(
  "corpus", "target_child_name", "target_child_age",
  "num_utterances", "mlu_w", "num_types", "num_tokens",
  "hdd", "mtld", "mlu_m", "num_morphemes"
)

# NA collection. Note that `collection` holds the literal string "NA", not a
# missing value.
independent_vars_na <- IND_VARS_NA %>%
  read_csv(col_names = child_stats_col_names) %>%
  mutate(collection = "NA")
predictor_vars_na <- read_csv(PREDICT_VARS_NA)

# UK collection.
independent_vars_uk <- IND_VARS_UK %>%
  read_csv(col_names = child_stats_col_names) %>%
  mutate(collection = "UK")
predictor_vars_uk <- read_csv(PREDICT_VARS_UK)


# Stack the two collections of hypernym predictors.
tidy_pred_vars <- bind_rows(predictor_vars_na, predictor_vars_uk)

# Stack the child statistics, then average every numeric measure over
# transcripts that share the same child and age, so each child has a single
# row per age point.
stacked_ind_vars <- bind_rows(independent_vars_uk, independent_vars_na)
tidy_ind_vars <- summarize_if(
  group_by(stacked_ind_vars, collection, target_child_name, target_child_age, corpus),
  is.numeric,
  mean
)

# Natural join: keys are whatever columns the two tables have in common.
full_df <- inner_join(tidy_ind_vars, tidy_pred_vars)

There are 232 children who satisfy this criterion.

# Children (identified by corpus + name) that have more than one row in
# full_df, i.e. more than one timepoint once same-age transcripts have been
# collapsed.
rows_per_child <- count(ungroup(full_df), corpus, target_child_name)
multiple_transcript_children <- rows_per_child %>%
  filter(n > 1) %>%
  select(-n)

# Restrict to children with repeated timepoints, then add a per-child session
# counter (in age order) and a child_id built from corpus and child name.
# The result stays grouped by corpus and target_child_name.
full_df_by_session <- full_df %>%
  right_join(multiple_transcript_children) %>%
  arrange(corpus, target_child_name, target_child_age) %>%
  group_by(corpus, target_child_name) %>%
  mutate(
    session_num = row_number(),
    child_id = paste0(corpus, "_", target_child_name)
  ) %>%
  select(child_id, target_child_age, session_num, everything())

# Pair each session with the child's next session and compute the change in
# age and MTLD between them. Grouping by corpus and target_child_name is
# inherited from full_df_by_session, so lead() never crosses children; the
# last session of each child gets NA for the subsequent_* and delta_* columns.
#
# The ordering variable is passed to lead() via `order_by`. Written as a bare
# `by = session_num` argument to mutate(), it only created a stray column
# named `by` and had no effect on lead().
full_df_with_delta <- full_df_by_session %>%
  mutate(
    subsequent_age = lead(target_child_age, order_by = session_num),
    subsequent_mtld = lead(mtld, order_by = session_num),
    subsequent_tokens = lead(num_tokens, order_by = session_num),
    delta_age = subsequent_age - target_child_age,
    delta_mtld = subsequent_mtld - mtld
  )

Parent vs child hypernyms

# Correlation plot of the hypernym predictors, with the age column dropped.
tidy_pred_vars %>%
  select(-target_child_age) %>%
  make_corr_plot()

Models predicting vocab size with previous mean vocab hypernym

These models predict the change in MTLD from the current timepoint to the next timepoint as a function of: (1) the child’s/parent’s mean hypernym level of vocab, (2) change in age from timepoint 1 to 2, (3) MTLD at timepoint 1, (4) log number of child tokens at timepoint 1, and (5) log number of child tokens at timepoint 2.

If we think the number-of-tokens measures are important - which I’m not sure they are (they’re uncorrelated with mtld) - then we should probably also control for the parent tokens when using parent predictors.

MTLD

hypernyms all

CHILD

# Change in MTLD predicted by the child's hypernym measure plus the shared
# covariates; by-child random intercept and session_num slope.
lmer(
  delta_mtld ~ child_hypernyms_all + delta_age + mtld +
    log(num_tokens) + log(subsequent_tokens) +
    (session_num | child_id),
  data = full_df_with_delta,
  control = lmerControl(optimizer = "bobyqa")
) %>%
  tidy() %>%
  kable()

term	estimate	std.error	statistic	group
(Intercept)	3.3323433	0.3904752	8.5340706	fixed
child_hypernyms_all	-0.5183051	0.1092993	-4.7420719	fixed
delta_age	0.1943016	0.0250413	7.7592485	fixed
mtld	-0.7833235	0.0203674	-38.4596022	fixed
log(num_tokens)	0.2125462	0.0492386	4.3166567	fixed
log(subsequent_tokens)	-0.0117948	0.0497353	-0.2371512	fixed
sd_(Intercept).child_id	1.1175005	NA	NA	child_id
sd_session_num.child_id	0.0687533	NA	NA	child_id
cor_(Intercept).session_num.child_id	-0.2560401	NA	NA	child_id
sd_Observation.Residual	1.0433448	NA	NA	Residual

# Same model with the child's hyponym measure as the predictor of interest.
lmer(
  delta_mtld ~ child_hyponyms_all + delta_age + mtld +
    log(num_tokens) + log(subsequent_tokens) +
    (session_num | child_id),
  data = full_df_with_delta,
  control = lmerControl(optimizer = "bobyqa")
) %>%
  tidy() %>%
  kable()

term	estimate	std.error	statistic	group
(Intercept)	2.8183276	0.3778911	7.4580410	fixed
child_hyponyms_all	0.2106510	0.1147148	1.8363010	fixed
delta_age	0.1831099	0.0253831	7.2138487	fixed
mtld	-0.7764377	0.0202537	-38.3356489	fixed
log(num_tokens)	0.2491810	0.0485964	5.1275635	fixed
log(subsequent_tokens)	0.0006225	0.0498039	0.0124981	fixed
sd_(Intercept).child_id	1.1688126	NA	NA	child_id
sd_session_num.child_id	0.0727278	NA	NA	child_id
cor_(Intercept).session_num.child_id	-0.2686045	NA	NA	child_id
sd_Observation.Residual	1.0443963	NA	NA	Residual

# Child hypernym and hyponym measures entered together.
lmer(
  delta_mtld ~ child_hypernyms_all + child_hyponyms_all +
    delta_age + mtld +
    log(num_tokens) + log(subsequent_tokens) +
    (session_num | child_id),
  data = full_df_with_delta,
  control = lmerControl(optimizer = "bobyqa")
) %>%
  tidy() %>%
  kable()

term	estimate	std.error	statistic	group
(Intercept)	3.3047540	0.3951407	8.3634875	fixed
child_hypernyms_all	-0.5025941	0.1145531	-4.3874343	fixed
child_hyponyms_all	0.0545827	0.1196573	0.4561584	fixed
delta_age	0.1937328	0.0250973	7.7192698	fixed
mtld	-0.7833587	0.0203738	-38.4492643	fixed
log(num_tokens)	0.2125195	0.0492468	4.3153930	fixed
log(subsequent_tokens)	-0.0123721	0.0497565	-0.2486535	fixed
sd_(Intercept).child_id	1.1179539	NA	NA	child_id
sd_session_num.child_id	0.0688456	NA	NA	child_id
cor_(Intercept).session_num.child_id	-0.2524352	NA	NA	child_id
sd_Observation.Residual	1.0434429	NA	NA	Residual

PARENT

# Change in MTLD predicted by the parent's hypernym measure plus the shared
# covariates; by-child random intercept and session_num slope.
lmer(
  delta_mtld ~ parent_hypernyms_all + delta_age + mtld +
    log(num_tokens) + log(subsequent_tokens) +
    (session_num | child_id),
  data = full_df_with_delta,
  control = lmerControl(optimizer = "bobyqa")
) %>%
  tidy() %>%
  kable()

term	estimate	std.error	statistic	group
(Intercept)	2.4674642	0.4024927	6.1304578	fixed
parent_hypernyms_all	-0.0650035	0.1245202	-0.5220318	fixed
delta_age	0.1888674	0.0244832	7.7141621	fixed
mtld	-0.7459717	0.0207794	-35.8994946	fixed
log(num_tokens)	0.2825264	0.0526897	5.3620841	fixed
log(subsequent_tokens)	0.0224394	0.0527036	0.4257658	fixed
sd_(Intercept).child_id	1.0751953	NA	NA	child_id
sd_session_num.child_id	0.0559395	NA	NA	child_id
cor_(Intercept).session_num.child_id	-0.5396342	NA	NA	child_id
sd_Observation.Residual	1.0432358	NA	NA	Residual

# Same model with the parent's hyponym measure as the predictor of interest.
lmer(
  delta_mtld ~ parent_hyponyms_all + delta_age + mtld +
    log(num_tokens) + log(subsequent_tokens) +
    (session_num | child_id),
  data = full_df_with_delta,
  control = lmerControl(optimizer = "bobyqa")
) %>%
  tidy() %>%
  kable()

term	estimate	std.error	statistic	group
(Intercept)	2.5826824	0.4115205	6.2759501	fixed
parent_hyponyms_all	-0.1483186	0.1329605	-1.1155089	fixed
delta_age	0.1884776	0.0245008	7.6927000	fixed
mtld	-0.7474853	0.0207598	-36.0064700	fixed
log(num_tokens)	0.2797349	0.0526643	5.3116591	fixed
log(subsequent_tokens)	0.0226011	0.0526284	0.4294458	fixed
sd_(Intercept).child_id	1.0802898	NA	NA	child_id
sd_session_num.child_id	0.0564595	NA	NA	child_id
cor_(Intercept).session_num.child_id	-0.5403671	NA	NA	child_id
sd_Observation.Residual	1.0425709	NA	NA	Residual

# Parent hypernym and hyponym measures entered together.
lmer(
  delta_mtld ~ parent_hypernyms_all + parent_hyponyms_all +
    delta_age + mtld +
    log(num_tokens) + log(subsequent_tokens) +
    (session_num | child_id),
  data = full_df_with_delta,
  control = lmerControl(optimizer = "bobyqa")
) %>%
  tidy() %>%
  kable()

term	estimate	std.error	statistic	group
(Intercept)	2.5784859	0.4113696	6.2680515	fixed
parent_hypernyms_all	-0.1000220	0.1275977	-0.7838852	fixed
parent_hyponyms_all	-0.1715770	0.1363207	-1.2586281	fixed
delta_age	0.1890361	0.0244940	7.7176449	fixed
mtld	-0.7462129	0.0207790	-35.9119046	fixed
log(num_tokens)	0.2813218	0.0526892	5.3392684	fixed
log(subsequent_tokens)	0.0206155	0.0527131	0.3910892	fixed
sd_(Intercept).child_id	1.0774944	NA	NA	child_id
sd_session_num.child_id	0.0561263	NA	NA	child_id
cor_(Intercept).session_num.child_id	-0.5404173	NA	NA	child_id
sd_Observation.Residual	1.0429218	NA	NA	Residual

BOTH

# Child and parent hypernym and hyponym measures all entered together.
lmer(
  delta_mtld ~ child_hypernyms_all + parent_hypernyms_all +
    child_hyponyms_all + parent_hyponyms_all +
    delta_age + mtld +
    log(num_tokens) + log(subsequent_tokens) +
    (session_num | child_id),
  data = full_df_with_delta,
  control = lmerControl(optimizer = "bobyqa")
) %>%
  tidy() %>%
  kable()

term	estimate	std.error	statistic	group
(Intercept)	3.1224218	0.4225784	7.3889761	fixed
child_hypernyms_all	-0.6404351	0.1221951	-5.2410858	fixed
parent_hypernyms_all	0.0597762	0.1318017	0.4535316	fixed
child_hyponyms_all	0.1108226	0.1261206	0.8787035	fixed
parent_hyponyms_all	-0.1663104	0.1390458	-1.1960842	fixed
delta_age	0.2018054	0.0240914	8.3766579	fixed
mtld	-0.7613316	0.0209726	-36.3012357	fixed
log(num_tokens)	0.2267443	0.0533142	4.2529815	fixed
log(subsequent_tokens)	0.0030475	0.0525001	0.0580475	fixed
sd_(Intercept).child_id	1.0015465	NA	NA	child_id
sd_session_num.child_id	0.0544134	NA	NA	child_id
cor_(Intercept).session_num.child_id	-0.5012903	NA	NA	child_id
sd_Observation.Residual	1.0395933	NA	NA	Residual