library(knitr)
opts_chunk$set(echo = T, message = F, warning = F,
error = F, cache = F, tidy = F)
library(tidyverse)
library(childesr)
library(feather)
library(langcog)
library(modelr)
library(broom)
library(lme4)
theme_set(theme_classic(base_size = 10))d_eng_na <- get_transcripts(collection = "Eng-NA",
corpus = NULL,
child = NULL) %>%
mutate_if(is.character, as.factor) %>%
filter(language == "eng") %>%
filter(!is.na(target_child_id))
d_eng_na %>%
count(target_child_id) %>%
ggplot(aes(x = n)) +
geom_histogram() +
ggtitle("Number of transcripts per child in American English Corpus")more_than_one <- d_eng_na %>%
count(target_child_id) %>%
filter(n > 1) There are 283 children with multiple transcripts.
long_ages <- d_eng_na %>%
filter(target_child_id %in% more_than_one$target_child_id) %>%
select(target_child_id, target_child_age)
long_ages %>%
ggplot(aes(x = target_child_age)) +
geom_histogram() +
ggtitle("Distribution of ages (days)")Let’s filter to 350 - 1500 days (~18 months - 42 months).
target_longitudinal_transcripts <- long_ages %>%
filter(target_child_age >= 558,
target_child_age <= 1350)
transcript_counts <- target_longitudinal_transcripts %>%
count(target_child_id) %>%
arrange(n) %>%
filter(n > 1)
ordered_transcripts <- target_longitudinal_transcripts %>%
filter(target_child_id %in% transcript_counts$target_child_id) %>%
mutate(target_child_id = as.factor(target_child_id),
target_child_id = fct_relevel(target_child_id, transcript_counts$target_child_id))
ggplot(ordered_transcripts, aes(y=target_child_id, x=target_child_age),
fill = "black") +
geom_tile() +
geom_vline(aes(xintercept = c(558)), color = "red") +
geom_vline(aes(xintercept = c(744)), color = "red") +
geom_vline(aes(xintercept = c(930)), color = "red") +
geom_vline(aes(xintercept = c(1116)), color = "red") +
geom_vline(aes(xintercept = c(1302)), color = "red") +
ylab("Child ID") +
xlab("Child age (days)") +
theme(axis.text.y = element_blank(),
axis.ticks.y = element_blank()) +
ggtitle("Longitudinal time points for each child with multiple transcripts")There are 156 children with greater than one transcript in the 18-42 age bin.
target_longitudinal_transcripts_bin <- long_ages %>%
filter(target_child_age >= 558,
target_child_age <= 1302) %>%
mutate(age_bin = cut(target_child_age,
breaks = c(558, 744, 930, 1116, 1302),
labels = c("18m", "24m", "30m", "36m"), include.lowest = T),
target_child_id = as.factor(target_child_id))
transcript_counts_bin <- target_longitudinal_transcripts_bin %>%
group_by(target_child_id, age_bin) %>%
slice(1) %>%
group_by(target_child_id) %>%
summarize(n = n()) %>%
filter(n > 1)
target_longitudinal_transcripts_bin_counts <- target_longitudinal_transcripts_bin %>%
filter(target_child_id %in% transcript_counts_bin$target_child_id) %>%
count(target_child_id, age_bin)
ggplot(target_longitudinal_transcripts_bin_counts,
aes(y=target_child_id, x= age_bin, fill = n)) +
scale_fill_gradient(low = "grey", high = "red") +
geom_tile() +
theme(axis.text.y = element_blank(),
axis.ticks.y = element_blank()) +
ggtitle("Longitudinal time points (binned) for each child with multiple transcripts")min <- target_longitudinal_transcripts_bin_counts %>% filter(!(target_child_id %in% c("5204", "5255")))
ggplot(target_longitudinal_transcripts_bin_counts %>% filter(!(target_child_id %in% c("5204", "5255", "5193"))),
aes(y=target_child_id, x= age_bin, fill = n)) +
scale_fill_gradient(low = "grey", high = "red") +
geom_tile() +
theme(axis.text.y = element_blank(),
axis.ticks.y = element_blank()) +
ggtitle("Longitudinal time points (binned) for each child with multiple transcripts")Here’s what the distribution fo time points looks like binned (“18m”- “24m”-“30m”- “36m”):
bin_dist <- target_longitudinal_transcripts_bin_counts %>%
count(target_child_id, age_bin) %>%
mutate(nn = as.character(ifelse(nn > 0, 1,NA))) %>% spread(age_bin, nn) %>%
rowwise() %>%
mutate(timepoints = paste0(`18m`, `24m`, `30m`, `36m`),
timepoints = str_replace_all(timepoints, "NA", "0")) %>%
count(timepoints) %>%
arrange(-n)
kable(bin_dist)| timepoints | n |
|---|---|
| 1100 | 46 |
| 0101 | 17 |
| 1010 | 16 |
| 1111 | 10 |
| 0111 | 7 |
| 1110 | 6 |
| 0011 | 3 |
| 0110 | 2 |
There are 107 children with multiple binned timepoints between 18-42 months.
targ_paricipants <- unique(target_longitudinal_transcripts_bin_counts$target_child_id)Index into transcript with child’s name and corpus name, and save type counts to feather
target_participants_full <- d_eng_na %>%
group_by(target_child_id) %>%
slice(1) %>%
filter(target_child_id %in% targ_paricipants) %>%
select(target_child_id, corpus_name, target_child_name) %>%
ungroup() %>%
mutate_all(as.character)
write_tokens_by_child <- function(target_child_id, corpus_name, target_child_name){
print(corpus_name)
print(target_child_name)
types <- get_types(corpus = corpus_name, child = target_child_name)
types_clean <- types %>%
mutate_if(is.character, as.factor) %>%
filter(speaker_role == "Target_Child") %>%
mutate(corpus = corpus_name,
age_bin = cut(target_child_age,
breaks = c(558, 744, 930, 1116, 1302),
labels = c("18m", "24m", "30m", "36m"),
include.lowest = T)) %>%
group_by(age_bin, gloss) %>%
summarize(count = sum(count))
types_clean_with_demo <- types_clean %>%
ungroup() %>%
mutate(corpus_id = types$corpus_id[1],
target_child_id = types$target_child_id[1],
target_child_name = types$target_child_name[1],
target_child_sex = types$target_child_sex[1]) %>%
select(corpus_id, target_child_name, target_child_id,
age_bin, target_child_sex, gloss, count)
file_name <- paste0("childes_type_data/child_id_",
target_child_id, "_types.feather")
write_feather(types_clean_with_demo, file_name)
}
target_participants_full %>%
as.list() %>%
pwalk(write_tokens_by_child)
# note that some (~ 5) do not have corpus nameRead in counts by types
all_types <- map_df(targ_paricipants, function(x){
file_name <- paste0("childes_type_data/child_id_", x, "_types.feather")
read_feather(file_name)
}) %>%
select(target_child_id, age_bin, gloss, count) %>%
filter(!is.na(age_bin))Set cutoff for knowing word
WORD_CUTOFF <- 5
filtered_types <- all_types %>%
filter(count >= WORD_CUTOFF) %>%
mutate(numeric_age_bin = as.numeric(age_bin))Get “vocab” size of child at each age point
# total tokens
relative_vocab_size <- filtered_types %>%
group_by(target_child_id, age_bin) %>%
summarize(n_types = n(),
total_tokens = sum(count))Types vs. Tokens
relative_vocab_size %>%
ungroup()%>%
ggplot(aes(y = n_types, x = total_tokens)) +
geom_smooth() +
ylab("N words (types)") +
xlab("Number of child-produced words in transcript (tokens)") +
facet_wrap(~age_bin)Tokens vs. age bin
relative_vocab_size %>%
group_by(age_bin) %>%
multi_boot_standard(col = "total_tokens") %>%
ggplot(aes(x = age_bin, y = mean, ymin = ci_lower, ymax = ci_upper)) +
ylab("Number of child-produced words in transcript (tokens)") +
geom_pointrange() Get token-residualized vocab estimates
relative_vocab_size_with_resids <- relative_vocab_size %>%
add_residuals(lm(n_types~total_tokens, d = relative_vocab_size))
relative_vocab_size_with_resids %>%
ggplot(aes(x = resid, fill = age_bin)) +
geom_histogram() +
facet_grid(~age_bin) +
theme(legend.position = "none")relative_vocab_size_with_resids %>%
group_by(age_bin) %>%
multi_boot_standard(col = "resid", na.rm = T) %>%
ggplot(aes(x = age_bin, y = mean)) +
ggtitle("Population-level vocabulary size") +
geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper, color = age_bin)) +
ylab("number of words (token-residualized)") +
theme(legend.position = "none", text = element_text(size = 18))Let’s look within-child
vocab_deltas <- relative_vocab_size_with_resids %>%
mutate(age_bin = as.numeric(age_bin)) %>%
group_by(target_child_id) %>%
mutate(min_bin = min(age_bin),
min_bin = ifelse(min_bin == age_bin, "min", "not_min")) %>%
mutate(vocab_delta = resid - lag(resid,
default=first(resid)),
log_vocab_delta = log(vocab_delta))
ggplot(filter(vocab_deltas, min_bin == "not_min") %>% ungroup(),
aes(x = vocab_delta)) +
geom_histogram(binwidth = 3, fill = "grey") +
#geom_density() +
ggtitle("Distribution of change in vocab by child across timepoints") +
geom_vline(aes(xintercept = 0), color = "red", size = 1) +
xlab("number of words (token-residualized)") +
theme(legend.position = "none", text = element_text(size = 18))ggplot(filter(vocab_deltas, min_bin == "not_min"),
aes(x = log_vocab_delta)) +
geom_histogram() +
ggtitle("Distribution of change in vocab by child across timepoints") +
geom_vline(aes(xintercept = 1), color = "red") +
xlab("log number of words (token-residualized)") # note negative deltas are removed when logged
vocab_deltas %>%
filter(min_bin == "not_min") %>%
mutate(age_bin = as.factor(age_bin)) %>%
group_by(age_bin) %>%
multi_boot_standard(col = "vocab_delta", na.rm = T) %>%
ggplot(aes(x = age_bin, y = mean)) +
geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper, color = age_bin)) +
ylab("change in number of words (token-residualized)") +
geom_hline(aes(yintercept = 0), color = "red") +
theme(legend.position = "none", text = element_text(size = 18))vocab_deltas %>%
filter(min_bin == "not_min") %>%
mutate(age_bin = as.factor(age_bin)) %>%
group_by(age_bin) %>%
multi_boot_standard(col = "log_vocab_delta", na.rm = T) %>%
ggplot(aes(x = age_bin, y = mean)) +
geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper, color = age_bin)) +
ylab("change in number of words (token-residualized)") +
geom_hline(aes(yintercept = 1), color = "red") +
theme(legend.position = "none", text = element_text(size = 18))mostly greater than zero
core_words <- read_feather("kernel_core_words.feather") %>%
unlist(use.names = F)At each time point, how many core words does the child know?
get_num_core_words <- function(this_target_child_id, this_age_bin, df, core_words) {
cumulative_df <- df %>%
filter(numeric_age_bin <= this_age_bin,
target_child_id == this_target_child_id)
n_core_types_cumulative <- cumulative_df %>%
filter(gloss %in% core_words) %>%
summarize(n_core_types_cumulative = n()) %>%
select(n_core_types_cumulative) %>%
pull()
n_Ncore_types_cumulative <- cumulative_df %>%
filter(!(gloss %in% core_words)) %>%
summarize(n_Ncore_types_cumulative = n()) %>%
select(n_Ncore_types_cumulative) %>%
pull()
data.frame(target_child_id = this_target_child_id,
numeric_age_bin = this_age_bin,
n_types_cumulative = nrow(cumulative_df),
n_tokens_cumulative = sum(cumulative_df$count),
n_core_types_cumulative = ifelse(length(n_core_types_cumulative) > 0 ,n_core_types_cumulative, NA),
n_Ncore_types_cumulative = ifelse(length(n_Ncore_types_cumulative) > 0 ,n_Ncore_types_cumulative, NA))
}
child_age_bin_pairs <- distinct(filtered_types, target_child_id, numeric_age_bin)
all_counts <- map2_df(child_age_bin_pairs$target_child_id,
child_age_bin_pairs$numeric_age_bin,
get_num_core_words, filtered_types, core_words)
all_counts %>%
gather(word_type, value, c(-1:-2, -4)) %>%
group_by(numeric_age_bin, word_type) %>%
multi_boot_standard(col = "value", na.rm = T) %>%
ungroup() %>%
mutate(fct_age_bin = as.factor(numeric_age_bin),
age_bin = fct_recode(fct_age_bin, "18m" = "1", "24m" = "2", "32m" = "3", "48m" = "4"),
word_type = as.factor(word_type),
word_type = fct_recode(word_type,
"core" = "n_core_types_cumulative", "non-core" = "n_Ncore_types_cumulative", "total" = "n_types_cumulative")) %>%
ggplot(aes(x = age_bin, y = mean, color = word_type, group = word_type)) +
ggtitle("Cumulative words") +
geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
ylab("N types (cumulative)") +
theme(text = element_text(size = 18)) +
geom_line() all_counts_with_resids <- all_counts %>%
add_residuals(lm(n_types_cumulative ~ n_tokens_cumulative, d = all_counts),
var = "cumulative_vocab_resid") %>%
add_residuals(lm(n_core_types_cumulative ~ n_tokens_cumulative, d = all_counts),
var = "cumulative_core_resid") %>%
add_residuals(lm(n_Ncore_types_cumulative ~ n_tokens_cumulative, d = all_counts),
var = "cumulative_Ncore_resid") %>%
group_by(target_child_id) %>%
arrange(target_child_id, numeric_age_bin) %>%
mutate(previous_cumulative_core_resid = lag(cumulative_core_resid),
previous_cumulative_vocab_resid = lag(cumulative_vocab_resid))Does Kernal vocab size at t1 predict vocab size at t2?
m <- lm(cumulative_vocab_resid ~ previous_cumulative_core_resid +
previous_cumulative_vocab_resid, all_counts_with_resids)
summary(m) %>%
tidy() %>%
kable()| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 28.2011881 | 5.0806928 | 5.550658 | 0.0000002 |
| previous_cumulative_core_resid | -0.3875922 | 0.3222494 | -1.202771 | 0.2313751 |
| previous_cumulative_vocab_resid | 1.3517189 | 0.1333684 | 10.135225 | 0.0000000 |
m <- lm(cumulative_Ncore_resid ~ previous_cumulative_core_resid +
previous_cumulative_vocab_resid, all_counts_with_resids)
summary(m) %>%
tidy() %>%
kable()| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 17.060724 | 3.2004525 | 5.330722 | 5e-07 |
| previous_cumulative_core_resid | -1.429663 | 0.2029928 | -7.042924 | 0e+00 |
| previous_cumulative_vocab_resid | 1.294281 | 0.0840120 | 15.405907 | 0e+00 |
m <- lm(cumulative_core_resid ~ previous_cumulative_core_resid +
previous_cumulative_vocab_resid, all_counts_with_resids)
summary(m) %>%
tidy() %>%
kable()| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 11.1404644 | 2.0532952 | 5.425652 | 0.0000003 |
| previous_cumulative_core_resid | 1.0420705 | 0.1302329 | 8.001594 | 0.0000000 |
| previous_cumulative_vocab_resid | 0.0574375 | 0.0538991 | 1.065648 | 0.2886701 |
m <- lmer(cumulative_vocab_resid ~ previous_cumulative_core_resid +
previous_cumulative_vocab_resid+
(1|target_child_id) , all_counts_with_resids)
summary(m)
all_counts_with_resids %>%
ungroup() %>%
ggplot(aes(y = log(cumulative_vocab_resid),
x = log(previous_prop_core_cumulative))) +
geom_point() +
geom_smooth(method = "lm")
all_counts_with_resids %>%
ungroup() %>%
ggplot(aes(y = log(non_cumulative_vocab_resid),
x = log(previous_prop_core_non_cumulative))) +
geom_point() +
geom_smooth(method = "lm")
cor.test(log(all_counts_with_resids$cumulative_vocab_resid), log(all_counts_with_resids$previous_prop_core_cumulative))