Does hypernym level of vocab at t1 predict vocab size at t2?

library(knitr)

# Global knitr chunk options: echo the code, suppress messages and warnings,
# and turn off error capture, caching, and automatic code tidying.
# Spelled with TRUE/FALSE because T and F are ordinary variables that can be
# reassigned.
opts_chunk$set(
  echo = TRUE, message = FALSE, warning = FALSE,
  error = FALSE, cache = FALSE, tidy = FALSE
)

library(tidyverse)
library(tidyr)
library(here)
library(lme4)
library(broom)
# Make the classic ggplot2 theme (base font size 12) the default for all plots.
theme_set(theme_classic(base_size = 12))

Childes data

This analysis includes all children in the NA and UK corpora of CHILDES who have more than one timepoint between 12 and 36 months of age. Two things differ from the previous analyses: (1) I’m using a different age range of children, and (2) I’m using the MTLD measure from the childesr package. This measure is new to the package and not well documented.

# Input CSV locations, resolved with here().
# IND_VARS_*     -> child statistics files (one per collection: NA, UK)
# PREDICT_VARS_* -> mean hypernym files (one per collection: NA, UK)
IND_VARS_NA <- here(
  "exploratory_analyses/17_taxonomic_childes/data/child_stats_eng_na.csv"
)
PREDICT_VARS_NA <- here(
  "exploratory_analyses/17_taxonomic_childes/data/mean_hypernym_eng_na.csv"
)
IND_VARS_UK <- here(
  "exploratory_analyses/17_taxonomic_childes/data/child_stats_eng_uk.csv"
)
PREDICT_VARS_UK <- here(
  "exploratory_analyses/17_taxonomic_childes/data/mean_hypernym_eng_uk.csv"
)


# Column names applied to both child-stats files. Because `col_names` is
# supplied, read_csv() treats the first line of each file as data rather than
# as a header row.
child_stats_cols <- c(
  "corpus", "target_child_name", "target_child_age",
  "num_utterances", "mlu_w", "num_types", "num_tokens",
  "hdd", "mtld", "mlu_m", "num_morphemes"
)

# Read one child-stats CSV and tag every row with the collection it came from.
read_child_stats <- function(path, collection_label) {
  read_csv(path, col_names = child_stats_cols) %>%
    mutate(collection = collection_label)
}

# North American collection. Note that "NA" here is a character label, not a
# missing value.
independent_vars_na <- read_child_stats(IND_VARS_NA, "NA")
predictor_vars_na <- read_csv(PREDICT_VARS_NA)

# UK collection.
independent_vars_uk <- read_child_stats(IND_VARS_UK, "UK")
predictor_vars_uk <- read_csv(PREDICT_VARS_UK)


# Stack the hypernym predictors from both collections.
tidy_pred_vars <- bind_rows(predictor_vars_na, predictor_vars_uk)

# Stack the child statistics and collapse multiple transcripts recorded at the
# same age point by averaging every numeric measure, giving one row per
# collection / corpus / child / age. `.groups = "drop"` returns an ungrouped
# table so no leftover grouping leaks into full_df and later steps.
tidy_ind_vars <- bind_rows(independent_vars_uk, independent_vars_na) %>%
  group_by(collection, target_child_name, target_child_age, corpus) %>%
  summarize(across(where(is.numeric), mean), .groups = "drop")

# Keep only rows present in both tables. No `by` is given, so the join uses
# every column name the two tables share.
# NOTE(review): the predictor files' columns are not visible here - confirm the
# shared keys are the intended ones (e.g. corpus, child name, age).
full_df <- inner_join(tidy_ind_vars, tidy_pred_vars)

There are 232 children that satisfy these criteria.

# Children (identified by corpus + name) who still have more than one row in
# full_df, i.e. more than one age point once transcripts at the same age have
# been collapsed.
multiple_transcript_children <- full_df %>%
  ungroup() %>%
  count(corpus, target_child_name, name = "n_timepoints") %>%
  filter(n_timepoints > 1) %>%
  select(corpus, target_child_name)

# Restrict to children with several age points, then number each child's
# sessions in order of age and build a single child identifier.
# The result stays grouped by corpus and child name.
full_df_by_session <- full_df %>%
  right_join(
    multiple_transcript_children,
    by = c("corpus", "target_child_name")
  ) %>%
  arrange(corpus, target_child_name, target_child_age) %>%
  group_by(corpus, target_child_name) %>%
  mutate(
    session_num = row_number(),  # 1, 2, ... within each child, youngest first
    child_id = paste(corpus, target_child_name, sep = "_")
  ) %>%
  select(child_id, target_child_age, session_num, everything())

# For every session, attach the same child's next-session values and the
# change from the current session to the next one.
#
# The original passed `by = session_num` as an argument to mutate(), which
# does not order anything: it just creates a stray column called `by`.
# The ordering belongs inside lead(), via `order_by`.
# Grouping is restated here so that lead() never reaches across children;
# the last session of each child gets NA for all subsequent_* / delta_* columns.
full_df_with_delta <- full_df_by_session %>%
  group_by(corpus, target_child_name) %>%
  mutate(
    subsequent_age = lead(target_child_age, order_by = session_num),
    subsequent_mtld = lead(mtld, order_by = session_num),
    subsequent_tokens = lead(num_tokens, order_by = session_num),
    delta_age = subsequent_age - target_child_age,
    delta_mtld = subsequent_mtld - mtld
  )

Parent vs child hypernyms

Parent and child mean vocabulary hypernym levels are correlated at approximately r = .3.

# Scatterplot of child vs. parent mean hypernym values, one point per row of
# tidy_pred_vars, with a linear-model trend line.
ggplot(data = tidy_pred_vars) +
  aes(x = mean_hypernymparent, y = mean_hypernymchild) +
  geom_point() +
  geom_smooth(method = "lm")

Models predicting vocab size with previous mean vocab hypernym

These models predict the change in MTLD from the current timepoint to the next timepoint as a function of: (1) the mean hypernym level of the child’s and/or the parent’s vocabulary, (2) the change in age from timepoint 1 to timepoint 2, (3) MTLD at timepoint 1, (4) the log number of child tokens at timepoint 1, and (5) the log number of child tokens at timepoint 2.

If we think the number-of-tokens measures are important - which I’m not sure they are (token count is uncorrelated with MTLD) - then we should probably also control for parent tokens when using parent predictors.

MTLD

# Model 1: change in MTLD predicted by the child's mean hypernym level,
# controlling for change in age, current MTLD, and log token counts at the
# current and next session. Random intercept and session_num slope by child.
child_hypernym_fit <- lmer(
  formula = delta_mtld ~ mean_hypernymchild + delta_age + mtld +
    log(num_tokens) + log(subsequent_tokens) +
    (session_num | child_id),
  data = full_df_with_delta,
  control = lmerControl(optimizer = "bobyqa")
)

# Render the coefficient table.
kable(tidy(child_hypernym_fit))

term	estimate	std.error	statistic	group
(Intercept)	4.2220503	0.5523145	7.6442858	fixed
mean_hypernymchild	-0.1177589	0.0337476	-3.4893981	fixed
delta_age	0.1892664	0.0252528	7.4948568	fixed
mtld	-0.7809828	0.0204165	-38.2526123	fixed
log(num_tokens)	0.2226614	0.0493964	4.5076406	fixed
log(subsequent_tokens)	-0.0015686	0.0498169	-0.0314879	fixed
sd_(Intercept).child_id	1.1448106	NA	NA	child_id
sd_session_num.child_id	0.0704914	NA	NA	child_id
cor_(Intercept).session_num.child_id	-0.2749746	NA	NA	child_id
sd_Observation.Residual	1.0445846	NA	NA	Residual

# Model 2: same structure as the child-hypernym model, but with the parent's
# mean hypernym level as the predictor of interest.
parent_hypernym_fit <- lmer(
  formula = delta_mtld ~ mean_hypernymparent + delta_age + mtld +
    log(num_tokens) + log(subsequent_tokens) +
    (session_num | child_id),
  data = full_df_with_delta,
  control = lmerControl(optimizer = "bobyqa")
)

# Render the coefficient table.
kable(tidy(parent_hypernym_fit))

term	estimate	std.error	statistic	group
(Intercept)	3.6160304	0.5093903	7.0987420	fixed
mean_hypernymparent	-0.1310942	0.0367181	-3.5702910	fixed
delta_age	0.1924444	0.0244150	7.8822257	fixed
mtld	-0.7500478	0.0207426	-36.1598078	fixed
log(num_tokens)	0.2847607	0.0524526	5.4289155	fixed
log(subsequent_tokens)	0.0170947	0.0525046	0.3255853	fixed
sd_(Intercept).child_id	1.0778515	NA	NA	child_id
sd_session_num.child_id	0.0559699	NA	NA	child_id
cor_(Intercept).session_num.child_id	-0.5413113	NA	NA	child_id
sd_Observation.Residual	1.0360714	NA	NA	Residual

# Model 3: parent and child mean hypernym levels entered together, with the
# same covariates and by-child random intercept and session_num slope.
joint_hypernym_fit <- lmer(
  formula = delta_mtld ~ mean_hypernymparent + mean_hypernymchild +
    delta_age + mtld + log(num_tokens) + log(subsequent_tokens) +
    (session_num | child_id),
  data = full_df_with_delta,
  control = lmerControl(optimizer = "bobyqa")
)

# Render the coefficient table.
kable(tidy(joint_hypernym_fit))

term	estimate	std.error	statistic	group
(Intercept)	4.9451821	0.6171692	8.0126849	fixed
mean_hypernymparent	-0.0980749	0.0376935	-2.6019031	fixed
mean_hypernymchild	-0.1384514	0.0355053	-3.8994624	fixed
delta_age	0.1982205	0.0242225	8.1833157	fixed
mtld	-0.7601677	0.0209654	-36.2581609	fixed
log(num_tokens)	0.2447944	0.0534748	4.5777534	fixed
log(subsequent_tokens)	0.0105729	0.0524647	0.2015236	fixed
sd_(Intercept).child_id	1.0401743	NA	NA	child_id
sd_session_num.child_id	0.0556894	NA	NA	child_id
cor_(Intercept).session_num.child_id	-0.5232671	NA	NA	child_id
sd_Observation.Residual	1.0353143	NA	NA	Residual

Does hypernym level of vocab at t1 predict vocab size at t2?

2020-04-08

Childes data

Parent vs child hypernyms

Models predicting vocab size with previous mean vocab hypernym

MTLD