Childes Exploration

library(knitr)

opts_chunk$set(echo = T, message = F, warning = F, 
               error = F, cache = F, tidy = F)

library(tidyverse)
library(childesr)
library(feather)
library(langcog)
library(modelr)
library(broom)
library(lme4)

theme_set(theme_classic(base_size = 10))

Get American English CHILDES data

d_eng_na <- get_transcripts(collection = "Eng-NA", 
                            corpus = NULL, 
                            child = NULL) %>%
  mutate_if(is.character, as.factor) %>%
  filter(language == "eng") %>%
  filter(!is.na(target_child_id))

d_eng_na %>%
  count(target_child_id) %>%
  ggplot(aes(x = n)) +
  geom_histogram() +
  ggtitle("Number of transcripts per child in American English Corpus")

more_than_one <- d_eng_na %>%
  count(target_child_id) %>%
  filter(n > 1)

There are 283 children with multiple transcripts.

What are the timepoints of these multiple transcripts?

long_ages <- d_eng_na %>%
  filter(target_child_id %in% more_than_one$target_child_id) %>%
  select(target_child_id, target_child_age)  

long_ages %>%
  ggplot(aes(x = target_child_age)) +
  geom_histogram() +
  ggtitle("Distribution of ages (days)")

Subset transcripts to target agerange

Let’s filter to 350 - 1500 days (~18 months - 42 months).

target_longitudinal_transcripts <- long_ages %>%
  filter(target_child_age >= 558, 
         target_child_age <= 1350)

transcript_counts <- target_longitudinal_transcripts %>%
  count(target_child_id) %>%
  arrange(n) %>%
  filter(n > 1)

ordered_transcripts <- target_longitudinal_transcripts %>%
 filter(target_child_id %in% transcript_counts$target_child_id) %>%
 mutate(target_child_id = as.factor(target_child_id),
        target_child_id = fct_relevel(target_child_id, transcript_counts$target_child_id))


 ggplot(ordered_transcripts, aes(y=target_child_id, x=target_child_age),
        fill = "black") + 
 geom_tile() +
 geom_vline(aes(xintercept = c(558)), color = "red") +
 geom_vline(aes(xintercept = c(744)), color = "red") +
 geom_vline(aes(xintercept = c(930)), color = "red") +
 geom_vline(aes(xintercept = c(1116)), color = "red") +
 geom_vline(aes(xintercept = c(1302)), color = "red") +
  ylab("Child ID") +
   xlab("Child age (days)") +
 theme(axis.text.y = element_blank(),
        axis.ticks.y = element_blank()) +
ggtitle("Longitudinal time points for each child with multiple transcripts")

There are 156 children with greater than one transcript in the 18-42 age bin.

Collapsed into time bins

target_longitudinal_transcripts_bin <- long_ages %>%
  filter(target_child_age >= 558, 
         target_child_age <= 1302) %>%
  mutate(age_bin = cut(target_child_age, 
                       breaks = c(558, 744, 930, 1116, 1302), 
                       labels = c("18m", "24m", "30m", "36m"), include.lowest = T),
         target_child_id = as.factor(target_child_id)) 

transcript_counts_bin <- target_longitudinal_transcripts_bin %>%
  group_by(target_child_id, age_bin) %>%
  slice(1) %>%
  group_by(target_child_id) %>%
  summarize(n = n()) %>%
  filter(n > 1)

target_longitudinal_transcripts_bin_counts <- target_longitudinal_transcripts_bin %>%
   filter(target_child_id %in% transcript_counts_bin$target_child_id) %>%
  count(target_child_id, age_bin) 

ggplot(target_longitudinal_transcripts_bin_counts, 
       aes(y=target_child_id, x= age_bin, fill = n)) + 
  scale_fill_gradient(low = "grey", high = "red") +
 geom_tile() +
 theme(axis.text.y = element_blank(),
        axis.ticks.y = element_blank()) +
 ggtitle("Longitudinal time points (binned) for each child with multiple transcripts")

min <- target_longitudinal_transcripts_bin_counts %>% filter(!(target_child_id %in% c("5204", "5255")))
ggplot(target_longitudinal_transcripts_bin_counts %>% filter(!(target_child_id %in% c("5204", "5255", "5193"))), 
       aes(y=target_child_id, x= age_bin, fill = n)) + 
  scale_fill_gradient(low = "grey", high = "red") +
 geom_tile() +
 theme(axis.text.y = element_blank(),
        axis.ticks.y = element_blank()) +
 ggtitle("Longitudinal time points (binned) for each child with multiple transcripts")

Here’s what the distribution fo time points looks like binned (“18m”- “24m”-“30m”- “36m”):

bin_dist <- target_longitudinal_transcripts_bin_counts %>%
  count(target_child_id, age_bin) %>%
  mutate(nn = as.character(ifelse(nn > 0, 1,NA))) %>%  spread(age_bin, nn) %>% 
  rowwise() %>%
  mutate(timepoints = paste0(`18m`, `24m`, `30m`, `36m`), 
         timepoints = str_replace_all(timepoints, "NA", "0")) %>%
  count(timepoints) %>% 
  arrange(-n) 

kable(bin_dist)

timepoints	n
1100	46
0101	17
1010	16
1111	10
0111	7
1110	6
0011	3
0110	2

There are 107 children with multiple binned timepoints between 18-42 months.

targ_paricipants <- unique(target_longitudinal_transcripts_bin_counts$target_child_id)

Get vocab data

Index into transcript with child’s name and corpus name, and save type counts to feather

target_participants_full <- d_eng_na %>%
  group_by(target_child_id) %>%
  slice(1) %>%
  filter(target_child_id %in% targ_paricipants) %>%
  select(target_child_id, corpus_name, target_child_name) %>%
  ungroup() %>%
  mutate_all(as.character) 

write_tokens_by_child <- function(target_child_id, corpus_name, target_child_name){
  print(corpus_name)
  print(target_child_name)
  types <- get_types(corpus = corpus_name, child = target_child_name) 
  types_clean <- types %>%
      mutate_if(is.character, as.factor) %>%
      filter(speaker_role == "Target_Child") %>%
      mutate(corpus = corpus_name,
             age_bin = cut(target_child_age, 
                       breaks = c(558, 744, 930, 1116, 1302), 
                       labels = c("18m", "24m", "30m", "36m"),
                       include.lowest = T)) %>%
      group_by(age_bin, gloss) %>%
      summarize(count = sum(count))
  
  types_clean_with_demo <- types_clean %>%
        ungroup() %>%
        mutate(corpus_id = types$corpus_id[1],
               target_child_id = types$target_child_id[1], 
               target_child_name = types$target_child_name[1],
               target_child_sex = types$target_child_sex[1]) %>%
        select(corpus_id, target_child_name, target_child_id, 
                  age_bin, target_child_sex, gloss, count) 
  
  file_name <- paste0("childes_type_data/child_id_", 
                      target_child_id, "_types.feather") 
  write_feather(types_clean_with_demo, file_name)
}
target_participants_full %>%
  as.list() %>%
  pwalk(write_tokens_by_child)
# note that some (~ 5) do not have corpus name

Read in counts by types

all_types <- map_df(targ_paricipants, function(x){  
  file_name <- paste0("childes_type_data/child_id_", x, "_types.feather") 
  read_feather(file_name)
}) %>%
  select(target_child_id, age_bin, gloss, count) %>% 
  filter(!is.na(age_bin))

Set cutoff for knowing word

WORD_CUTOFF <- 5
filtered_types <- all_types %>%
  filter(count >= WORD_CUTOFF) %>%
  mutate(numeric_age_bin = as.numeric(age_bin))

Get “vocab” size of child at each age point

# total tokens 
relative_vocab_size <- filtered_types %>%
  group_by(target_child_id, age_bin) %>%
  summarize(n_types = n(),
            total_tokens = sum(count))

Types vs. Tokens

relative_vocab_size %>%
  ungroup()%>%
  ggplot(aes(y = n_types, x = total_tokens)) +
  geom_smooth() +
  ylab("N words (types)") +
  xlab("Number of child-produced words in transcript (tokens)") +
  facet_wrap(~age_bin)

Tokens vs. age bin

relative_vocab_size %>%
  group_by(age_bin) %>%
  multi_boot_standard(col = "total_tokens") %>%
  ggplot(aes(x = age_bin, y = mean, ymin = ci_lower, ymax = ci_upper)) +
  ylab("Number of child-produced words in transcript (tokens)") +
  geom_pointrange()

Get token-residualized vocab estimates

relative_vocab_size_with_resids <- relative_vocab_size %>%
  add_residuals(lm(n_types~total_tokens, d = relative_vocab_size))

relative_vocab_size_with_resids %>%
  ggplot(aes(x = resid, fill = age_bin)) + 
  geom_histogram() +
  facet_grid(~age_bin) +
  theme(legend.position = "none")

relative_vocab_size_with_resids %>%
  group_by(age_bin) %>%
  multi_boot_standard(col = "resid", na.rm = T) %>%
  ggplot(aes(x = age_bin, y = mean)) +
  ggtitle("Population-level vocabulary size") +
  geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper, color = age_bin)) +
  ylab("number of words (token-residualized)") +
  theme(legend.position = "none", text = element_text(size = 18))

Let’s look within-child

vocab_deltas <- relative_vocab_size_with_resids %>%
  mutate(age_bin = as.numeric(age_bin)) %>%
  group_by(target_child_id) %>%
  mutate(min_bin = min(age_bin),
         min_bin = ifelse(min_bin == age_bin, "min", "not_min")) %>%
  mutate(vocab_delta = resid - lag(resid,
                                   default=first(resid)),
         log_vocab_delta = log(vocab_delta))

ggplot(filter(vocab_deltas, min_bin == "not_min") %>% ungroup(),
       aes(x = vocab_delta)) +
  geom_histogram(binwidth = 3, fill = "grey") +
  #geom_density() +
  ggtitle("Distribution of change in vocab by child across timepoints") +
  geom_vline(aes(xintercept = 0), color = "red", size = 1) +
  xlab("number of words (token-residualized)")  +
    theme(legend.position = "none", text = element_text(size = 18))

ggplot(filter(vocab_deltas, min_bin == "not_min"), 
       aes(x = log_vocab_delta)) +
  geom_histogram() +
  ggtitle("Distribution of change in vocab by child across timepoints") +
  geom_vline(aes(xintercept = 1), color = "red") +
  xlab("log number of words (token-residualized)")

# note negative deltas are removed when logged

vocab_deltas %>%
  filter(min_bin == "not_min") %>%
  mutate(age_bin = as.factor(age_bin)) %>%
  group_by(age_bin) %>%
  multi_boot_standard(col = "vocab_delta", na.rm = T) %>%
  ggplot(aes(x = age_bin, y = mean)) +
  geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper, color = age_bin)) +
  ylab("change in number of words (token-residualized)") +
  geom_hline(aes(yintercept = 0), color = "red") +
  theme(legend.position = "none", text = element_text(size = 18))

vocab_deltas %>%
  filter(min_bin == "not_min") %>%
  mutate(age_bin = as.factor(age_bin)) %>%
  group_by(age_bin) %>%
  multi_boot_standard(col = "log_vocab_delta", na.rm = T) %>%
  ggplot(aes(x = age_bin, y = mean)) +
  geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper, color = age_bin)) +
  ylab("change in number of words (token-residualized)") +
  geom_hline(aes(yintercept = 1), color = "red") +
  theme(legend.position = "none", text = element_text(size = 18))

mostly greater than zero

Get core words

core_words <- read_feather("kernel_core_words.feather")  %>%
  unlist(use.names = F)

At each time point, how many core words does the child know?

get_num_core_words <- function(this_target_child_id, this_age_bin, df, core_words) {
  
  cumulative_df <- df %>%
    filter(numeric_age_bin <= this_age_bin, 
           target_child_id == this_target_child_id) 

  n_core_types_cumulative <- cumulative_df %>%
    filter(gloss %in% core_words) %>%
    summarize(n_core_types_cumulative = n()) %>%
    select(n_core_types_cumulative) %>%
    pull()
  
  n_Ncore_types_cumulative <- cumulative_df %>%
    filter(!(gloss %in% core_words)) %>%
    summarize(n_Ncore_types_cumulative = n()) %>%
    select(n_Ncore_types_cumulative) %>%
    pull()

  
  data.frame(target_child_id = this_target_child_id,
              numeric_age_bin = this_age_bin, 
              n_types_cumulative = nrow(cumulative_df),
              n_tokens_cumulative = sum(cumulative_df$count),
              n_core_types_cumulative = ifelse(length(n_core_types_cumulative) > 0 ,n_core_types_cumulative, NA),
              n_Ncore_types_cumulative = ifelse(length(n_Ncore_types_cumulative) > 0 ,n_Ncore_types_cumulative, NA))
  
}

child_age_bin_pairs <- distinct(filtered_types, target_child_id, numeric_age_bin)

all_counts <- map2_df(child_age_bin_pairs$target_child_id, 
          child_age_bin_pairs$numeric_age_bin,
          get_num_core_words, filtered_types, core_words)

all_counts %>%
  gather(word_type, value, c(-1:-2, -4)) %>%
  group_by(numeric_age_bin, word_type) %>%
  multi_boot_standard(col = "value", na.rm = T)  %>%
  ungroup() %>%
  mutate(fct_age_bin = as.factor(numeric_age_bin),
         age_bin = fct_recode(fct_age_bin, "18m" = "1", "24m" = "2", "32m" = "3", "48m" = "4"),
         word_type = as.factor(word_type),
         word_type = fct_recode(word_type, 
                                                           "core" = "n_core_types_cumulative", "non-core" = "n_Ncore_types_cumulative", "total" = "n_types_cumulative")) %>%
  ggplot(aes(x = age_bin, y = mean, color = word_type, group = word_type)) +
  ggtitle("Cumulative words") +
  geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
  ylab("N types (cumulative)") +
    theme(text = element_text(size = 18)) +
    geom_line()

all_counts_with_resids <-  all_counts %>%
  add_residuals(lm(n_types_cumulative ~ n_tokens_cumulative, d = all_counts), 
                var = "cumulative_vocab_resid") %>%
  add_residuals(lm(n_core_types_cumulative ~ n_tokens_cumulative, d = all_counts), 
                var = "cumulative_core_resid") %>%
  add_residuals(lm(n_Ncore_types_cumulative ~ n_tokens_cumulative, d = all_counts), 
                var = "cumulative_Ncore_resid")  %>%
  group_by(target_child_id) %>%
  arrange(target_child_id, numeric_age_bin) %>% 
  mutate(previous_cumulative_core_resid = lag(cumulative_core_resid),
         previous_cumulative_vocab_resid = lag(cumulative_vocab_resid))

Predicting vocab size

Does Kernal vocab size at t1 predict vocab size at t2?

total (t2) ~ core (t1)

m <- lm(cumulative_vocab_resid ~ previous_cumulative_core_resid + 
            previous_cumulative_vocab_resid, all_counts_with_resids)

summary(m) %>%
  tidy() %>%
  kable()

term	estimate	std.error	statistic	p.value
(Intercept)	28.2011881	5.0806928	5.550658	0.0000002
previous_cumulative_core_resid	-0.3875922	0.3222494	-1.202771	0.2313751
previous_cumulative_vocab_resid	1.3517189	0.1333684	10.135225	0.0000000

non-core (t2) ~ core (t1)

m <- lm(cumulative_Ncore_resid ~ previous_cumulative_core_resid + 
            previous_cumulative_vocab_resid, all_counts_with_resids)

summary(m) %>%
  tidy() %>%
  kable()

term	estimate	std.error	statistic	p.value
(Intercept)	17.060724	3.2004525	5.330722	5e-07
previous_cumulative_core_resid	-1.429663	0.2029928	-7.042924	0e+00
previous_cumulative_vocab_resid	1.294281	0.0840120	15.405907	0e+00

core (t2) ~ core (t1)

m <- lm(cumulative_core_resid ~ previous_cumulative_core_resid + 
            previous_cumulative_vocab_resid, all_counts_with_resids)

summary(m) %>%
  tidy() %>%
  kable()

term	estimate	std.error	statistic	p.value
(Intercept)	11.1404644	2.0532952	5.425652	0.0000003
previous_cumulative_core_resid	1.0420705	0.1302329	8.001594	0.0000000
previous_cumulative_vocab_resid	0.0574375	0.0538991	1.065648	0.2886701

m <- lmer(cumulative_vocab_resid ~ previous_cumulative_core_resid + 
            previous_cumulative_vocab_resid+
        (1|target_child_id) , all_counts_with_resids)

summary(m) 

all_counts_with_resids %>%
  ungroup() %>%
  ggplot(aes(y = log(cumulative_vocab_resid), 
             x = log(previous_prop_core_cumulative))) +
  geom_point() +
  geom_smooth(method = "lm")

all_counts_with_resids %>%
  ungroup() %>%
  ggplot(aes(y = log(non_cumulative_vocab_resid),
             x = log(previous_prop_core_non_cumulative))) +
  geom_point() +
  geom_smooth(method = "lm")

cor.test(log(all_counts_with_resids$cumulative_vocab_resid), log(all_counts_with_resids$previous_prop_core_cumulative))