library(knitr)

# Chunk defaults for the report: echo the code, keep messages and warnings out
# of the rendered output, and disable caching and automatic code tidying.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can be
# reassigned.
opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE,
               error = FALSE, cache = FALSE, tidy = FALSE)

library(tidyverse)
library(tidyr)
library(here)
library(lme4)
library(broom)

# Use the classic ggplot2 theme with a 12pt base font for every plot below.
theme_set(theme_classic(base_size = 12))

CHILDES data

This covers all children in the NA and UK CHILDES corpora who have more than one timepoint between 12 and 36 months of age. Two things differ from the previous analyses: (1) I’m using a different age range of children, and (2) I’m using the MTLD measure from the childesr package. This measure is new to the package and not well documented.

# Input CSVs, resolved relative to the project root with here().
# One pair of files (child stats + mean hypernym) per collection: NA and UK.
IND_VARS_NA <- here(
  "exploratory_analyses/17_taxonomic_childes/data/child_stats_eng_na.csv"
)
PREDICT_VARS_NA <- here(
  "exploratory_analyses/17_taxonomic_childes/data/mean_hypernym_eng_na.csv"
)
IND_VARS_UK <- here(
  "exploratory_analyses/17_taxonomic_childes/data/child_stats_eng_uk.csv"
)
PREDICT_VARS_UK <- here(
  "exploratory_analyses/17_taxonomic_childes/data/mean_hypernym_eng_uk.csv"
)


# Column names applied to both child-stats CSVs through read_csv(col_names = ).
child_stat_cols <- c(
  "corpus", "target_child_name", "target_child_age",
  "num_utterances", "mlu_w", "num_types", "num_tokens",
  "hdd", "mtld", "mlu_m", "num_morphemes"
)

# NA collection: child stats (tagged with their collection) and predictors.
independent_vars_na <- mutate(
  read_csv(IND_VARS_NA, col_names = child_stat_cols),
  collection = "NA"
)
predictor_vars_na <- read_csv(PREDICT_VARS_NA)

# UK collection: same two files, tagged "UK".
independent_vars_uk <- mutate(
  read_csv(IND_VARS_UK, col_names = child_stat_cols),
  collection = "UK"
)
predictor_vars_uk <- read_csv(PREDICT_VARS_UK)

# Stack the two collections.
tidy_pred_vars <- bind_rows(predictor_vars_na, predictor_vars_uk)

# Collapse across multiple transcripts at the same age point by averaging every
# numeric measure within collection x child x age x corpus.
stacked_ind_vars <- bind_rows(independent_vars_uk, independent_vars_na)
tidy_ind_vars <- stacked_ind_vars %>%
  group_by(collection, target_child_name, target_child_age, corpus) %>%
  summarize_if(is.numeric, mean)

# Keep only rows present in both tables; the join uses all shared columns.
full_df <- tidy_ind_vars %>%
  inner_join(tidy_pred_vars)

There are 232 that satisfy these criteria.

# Children with more than one row in full_df. Transcripts at the same age were
# already averaged together upstream, so n counts distinct age points.
multiple_transcript_children <- full_df %>%
  ungroup() %>%
  count(corpus, target_child_name) %>%
  filter(n > 1) %>%
  select(-n)

# Add session info and child_id: keep only the children selected above and
# number each child's sessions in order of increasing age.
full_df_by_session <- full_df %>%
  right_join(multiple_transcript_children,
             by = c("corpus", "target_child_name")) %>%
  arrange(corpus, target_child_name, target_child_age) %>%
  group_by(corpus, target_child_name) %>%
  mutate(session_num = row_number(),
         child_id = paste0(corpus, "_", target_child_name)) %>%
  select(child_id, target_child_age, session_num, everything())

# Pair every session with the same child's next session (the data are still
# grouped by child here) and compute the change between the two.
# The ordering is passed to lead() via order_by; written as a top-level
# `by = session_num` argument, mutate() would instead create a stray column
# named `by`. The last session of each child gets NA for the lead values.
full_df_with_delta <- full_df_by_session %>%
  mutate(subsequent_age = lead(target_child_age, order_by = session_num),
         subsequent_mtld = lead(mtld, order_by = session_num),
         subsequent_tokens = lead(num_tokens, order_by = session_num),
         delta_age = subsequent_age - target_child_age,
         delta_mtld = subsequent_mtld - mtld)

Parent vs child hypernyms

Parent and child vocab hypernyms are correlated at ~ .3.

# Scatterplot of child against parent mean hypernym, with a linear fit overlaid.
ggplot(
  data = tidy_pred_vars,
  mapping = aes(x = mean_hypernymparent, y = mean_hypernymchild)
) +
  geom_point() +
  geom_smooth(method = "lm")

Models predicting vocab size with previous mean vocab hypernym

These models predict the change in MTLD from the current timepoint to the next timepoint as a function of: (1) the child’s/parent’s mean hypernym level of vocab, (2) change in age from timepoint 1 to 2, (3) MTLD at timepoint 1, (4) log number of child tokens at timepoint 1, and (5) log number of child tokens at timepoint 2.

If we think the number-of-tokens measures are important - which I’m not sure they are (they’re uncorrelated with MTLD) - then we should probably also control for the parent tokens when using parent predictors.

MTLD

# Change in MTLD predicted from the child's mean hypernym, with age change,
# current MTLD and log token counts as covariates; random intercept and
# session_num slope per child.
lmer(
  delta_mtld ~ mean_hypernymchild + delta_age + mtld +
    log(num_tokens) + log(subsequent_tokens) +
    (session_num | child_id),
  full_df_with_delta,
  control = lmerControl(optimizer = "bobyqa")
) %>%
  tidy() %>%
  kable()
term estimate std.error statistic group
(Intercept) 4.2220503 0.5523145 7.6442858 fixed
mean_hypernymchild -0.1177589 0.0337476 -3.4893981 fixed
delta_age 0.1892664 0.0252528 7.4948568 fixed
mtld -0.7809828 0.0204165 -38.2526123 fixed
log(num_tokens) 0.2226614 0.0493964 4.5076406 fixed
log(subsequent_tokens) -0.0015686 0.0498169 -0.0314879 fixed
sd_(Intercept).child_id 1.1448106 NA NA child_id
sd_session_num.child_id 0.0704914 NA NA child_id
cor_(Intercept).session_num.child_id -0.2749746 NA NA child_id
sd_Observation.Residual 1.0445846 NA NA Residual
# Same specification, with the parent's mean hypernym as the predictor of
# interest instead of the child's.
lmer(
  delta_mtld ~ mean_hypernymparent + delta_age + mtld +
    log(num_tokens) + log(subsequent_tokens) +
    (session_num | child_id),
  full_df_with_delta,
  control = lmerControl(optimizer = "bobyqa")
) %>%
  tidy() %>%
  kable()
term estimate std.error statistic group
(Intercept) 3.6160304 0.5093903 7.0987420 fixed
mean_hypernymparent -0.1310942 0.0367181 -3.5702910 fixed
delta_age 0.1924444 0.0244150 7.8822257 fixed
mtld -0.7500478 0.0207426 -36.1598078 fixed
log(num_tokens) 0.2847607 0.0524526 5.4289155 fixed
log(subsequent_tokens) 0.0170947 0.0525046 0.3255853 fixed
sd_(Intercept).child_id 1.0778515 NA NA child_id
sd_session_num.child_id 0.0559699 NA NA child_id
cor_(Intercept).session_num.child_id -0.5413113 NA NA child_id
sd_Observation.Residual 1.0360714 NA NA Residual
# Combined model: parent and child mean hypernym entered together, with the
# same covariates and random-effects structure as the models above.
lmer(
  delta_mtld ~ mean_hypernymparent + mean_hypernymchild + delta_age + mtld +
    log(num_tokens) + log(subsequent_tokens) +
    (session_num | child_id),
  full_df_with_delta,
  control = lmerControl(optimizer = "bobyqa")
) %>%
  tidy() %>%
  kable()
term estimate std.error statistic group
(Intercept) 4.9451821 0.6171692 8.0126849 fixed
mean_hypernymparent -0.0980749 0.0376935 -2.6019031 fixed
mean_hypernymchild -0.1384514 0.0355053 -3.8994624 fixed
delta_age 0.1982205 0.0242225 8.1833157 fixed
mtld -0.7601677 0.0209654 -36.2581609 fixed
log(num_tokens) 0.2447944 0.0534748 4.5777534 fixed
log(subsequent_tokens) 0.0105729 0.0524647 0.2015236 fixed
sd_(Intercept).child_id 1.0401743 NA NA child_id
sd_session_num.child_id 0.0556894 NA NA child_id
cor_(Intercept).session_num.child_id -0.5232671 NA NA child_id
sd_Observation.Residual 1.0353143 NA NA Residual