library(knitr)
opts_chunk$set(echo = T, message = F, warning = F,
error = F, cache = F, tidy = F)
library(tidyverse)
library(langcog)
theme_set(theme_classic(base_size = 10))To do this analysis, we need a good measure of vocab size from corpus data. Turns out TTR is terrible, it asymptotes early in development, while more objective measures of vocab size continue to grow. Here we use a differnt measure that is more highly correlated with CDI scores, MTLD.
in N. America corpus (kids with 2 timepoints only)
ld_df <- read_csv("diversity_measures_by_age.csv",
col_names = c("target_child_name",
"corpus_name",
"target_child_id",
"target_child_age",
"mtld")) %>%
filter(is.finite(mtld)) %>%
mutate(collection = "NAM")
ld_df %>%
filter(target_child_age < 2000) %>%
mutate(age_bin = cut(target_child_age,
breaks = 30)) %>%
group_by(age_bin) %>%
multi_boot_standard(col = "mtld") %>%
ggplot(aes(x = age_bin, y = mean, group = 1)) +
geom_smooth() +
geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper), size = .2) +
ggtitle("CHILDES MTLD across development") +
ylab("MTLD") +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))longitudinal_kids <- ld_df %>%
count(target_child_name, corpus_name) %>%
filter(n > 1)
longitudinal_df <- ld_df %>%
filter(target_child_age < 2000) %>%
right_join(longitudinal_kids)
ggplot(longitudinal_df,
aes(x = target_child_age, y = mtld, group = 1)) +
facet_wrap(~target_child_id, ncol = 8) +
geom_line(color = "blue") +
geom_point(size = .4) +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))We don’t have enough data really to ask longitidinal questions. Let’s also look at the UK corpus.
ld_df_UK <- read_csv("diversity_measures_by_age_UK.csv",
col_names = c("target_child_name",
"corpus_name",
"target_child_id",
"target_child_age",
"mtld")) %>%
filter(is.finite(mtld)) %>%
mutate(collection = "UK")
all_ld_df <- ld_df %>%
bind_rows(ld_df_UK)
all_ld_df %>%
filter(target_child_age < 2000) %>%
mutate(age_bin = cut(target_child_age,
breaks = 30)) %>%
group_by(age_bin, collection) %>%
multi_boot_standard(col = "mtld") %>%
ggplot(aes(x = age_bin, y = mean, group = collection,
color = collection)) +
geom_smooth() +
geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper), size = .2) +
ggtitle("CHILDES MTLD across development") +
ylab("MTLD") +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) It’s noisier, but they look pretty comparable - let’s combine them
longitudinal_kids_all <- all_ld_df %>%
count(collection, target_child_name, corpus_name) %>%
filter(n > 1)
longitudinal_df_all <- all_ld_df %>%
filter(target_child_age < 2000) %>%
right_join(longitudinal_kids_all)This is the optimal cutoff that maximizes kids in the two groups.
ggplot(longitudinal_df_all, aes(x = target_child_age, y = mtld,
group = target_child_id, color = collection)) +
geom_point(size = .4) +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))MIN_AGE <- 450
MAX_AGE <- 1150
t_midpoint <- MIN_AGE + (MAX_AGE-MIN_AGE)/2
t2_kids <- longitudinal_df_all %>%
filter(target_child_age < MAX_AGE & target_child_age > MIN_AGE) %>%
mutate(tbin = ifelse(target_child_age> t_midpoint, "t2", "t1"))
ggplot(t2_kids, aes(x = target_child_age, y = mtld,
group = target_child_id, color = collection)) +
geom_point(size = .4) +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))# average across kids who have multiple measures in this
t2_kids_ms <- t2_kids %>%
group_by(target_child_id, tbin) %>%
summarize(mean_mtld = mean(mtld),
mean_target_child_age = mean(target_child_age)) %>%
arrange(target_child_id, tbin)
two_timepoint_kids <- t2_kids_ms %>%
count(target_child_id) %>%
filter(n>1)
ggplot(t2_kids_ms %>% right_join(two_timepoint_kids),
aes(x = mean_target_child_age, y = mean_mtld,
group = target_child_id, color = as.factor(target_child_id))) +
geom_vline(aes(xintercept = t_midpoint), linetype = 2) +
geom_line() +
xlim(MIN_AGE, MAX_AGE) +
geom_point(size = .4) +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
legend.position= "none")Using this time point, here’s the distribution of MTLD at the two time points.
good_kids <- t2_kids_ms %>%
right_join(two_timepoint_kids) %>%
select(-n)
good_kids %>%
ggplot(aes(x = mean_mtld, group = tbin, fill = tbin)) +
geom_density(alpha = .5) +
theme_classic() +
ggtitle("Distributions of MTLD at two time points")I’m operationalizing this as the kids with the greatest change in MTLD, controling for t1 MTLD and age difference.
Get slopes:
delta_mtld <- good_kids %>%
select(-mean_target_child_age) %>%
spread(tbin,mean_mtld) %>%
rename(mtld_t1 = t1, mtld_t2 = t2 ) %>%
left_join(good_kids %>%
select(-mean_mtld) %>%
spread(tbin,mean_target_child_age) %>%
rename(age_t1 = t1, age_t2 = t2)) %>%
mutate(mtld_diff = mtld_t2 - mtld_t1,
age_diff = age_t2 - age_t1,
slope = mtld_diff/age_diff)
delta_mtld %>%
ggplot(aes(x = slope, group = 1)) +
geom_density(fill = "grey") +
theme_classic() +
ggtitle("Change in MTLD across timepoints")Get slopes, controling for other stuff:
mod <- lm(slope ~ age_diff + mtld_t1, d = delta_mtld)
delta_mtld_with_resids <- delta_mtld %>%
modelr::add_residuals(mod, var = "delta_resid")
median_delta_resid <- median(delta_mtld_with_resids$delta_resid)
kid_groups <- delta_mtld_with_resids %>%
mutate(delta_resid_group = ifelse(delta_resid > median_delta_resid,
"high", "low"))Here’s what measures look like for the two groups:
kid_groups %>%
group_by(delta_resid_group) %>%
multi_boot_standard(col = "delta_resid") %>%
ggplot(aes(x = delta_resid_group, y = mean,
ymin = ci_lower, ymax = ci_upper)) +
geom_pointrange() +
ggtitle("delta_resid between two groups") +
theme_classic()kid_groups %>%
group_by(delta_resid_group) %>%
multi_boot_standard(col = "mtld_t1") %>%
ggplot(aes(x = delta_resid_group, y = mean,
ymin = ci_lower, ymax = ci_upper)) +
geom_pointrange() +
ggtitle("mtld_t1 between two groups") +
theme_classic()groups_info <- kid_groups %>%
select(target_child_id, delta_resid_group) %>%
left_join(all_ld_df %>% distinct(target_child_id,
target_child_name,
collection, corpus_name)) %>%
ungroup() %>%
mutate(collection = fct_recode(collection,
"Eng-NA" = "NAM",
"Eng-UK" = "UK")) %>%
mutate_all(as.character)
#write_csv(groups_info, "groups_info.csv")Get types for all kids.
MIN_COUNT <- 5
MURMURS = c("xxx", "yyy", "yyy_yyy","---",
"s","f","t","n","d","b","e","c","o",
"v","h","r","k","j","g",
"p","x","z","u","v","m","l")
get_types_by_bin <- function(corpus_name,
target_child_name,
collection,
min_age,
max_age,
min_count,
murmurs){
print(target_child_name)
all_types <- childesr::get_types(
corpus = corpus_name,
child = target_child_name,
collection = collection,
role = "target_child")
midpoint <- min_age + (max_age - min_age)/2
all_types %>%
filter(count >= min_count,
target_child_age >= min_age,
target_child_age <= max_age) %>%
mutate(tbin = case_when(target_child_age > midpoint ~ "t2",
target_child_age < midpoint ~ "t1")) %>%
filter(!(gloss %in% murmurs)) %>%
select(collection_name,
corpus_id,
target_child_name,
target_child_id,
tbin,
gloss) %>%
distinct() # get rid of multiple entries in same time bin
}
target_types <- groups_info %>%
select(corpus_name, target_child_name, collection) %>%
as.list() %>%
pmap_df(get_types_by_bin, MIN_AGE, MAX_AGE, MIN_COUNT, MURMURS)
write_csv(target_types, "target_types_delta_450_1150.csv")Average number of word types per kid: 125
stem_hunspell <- function(term) {
# look up the term in the dictionary
stems <- hunspell::hunspell_stem(term)[[1]]
if (length(stems) == 0) { # if there are no stems, use the original term
stem <- term
} else { # if there are multiple stems, use the last one
stem <- stems[[length(stems)]]
}
stem
}
target_types <- read_csv("target_types_delta_450_1150.csv") %>%
mutate(tbin = fct_recode(tbin,
t1 = "low",
t2 = "high")) %>%
mutate(gloss = tolower(gloss)) %>%
rowwise() %>%
mutate(stem_gloss = corpus::text_tokens(gloss, stemmer = stem_hunspell)[[1]][1])
#target_types %>%
# count(target_child_name, collection_name, tbin) %>%
# summarize(mean = mean(n))Let’s filter what words we look at to only be those words that were produced min number of times by each kid (5). Also, not all kids actually have data at both timepoints once we exclude murmurs, etc. Finally, exlcude words that fewer than 3 kids said.
MIN_KIDS_PRODUCED_WORD <- 3
good_words <- target_types %>%
count(stem_gloss, target_child_id) %>%
count(stem_gloss) %>%
filter(nn >= MIN_KIDS_PRODUCED_WORD)
good_kids <- target_types %>%
filter(stem_gloss %in% good_words$stem_gloss) %>%
count(target_child_id, tbin) %>%
count(target_child_id) %>%
filter(nn > 1)
good_types <- target_types %>%
filter(stem_gloss %in% good_words$stem_gloss,
target_child_id %in% good_kids$target_child_id)t1_words <- good_types %>%
left_join(groups_info %>%
mutate(target_child_id = as.numeric(target_child_id))) %>%
filter(tbin == "t1") %>%
distinct(tbin, delta_resid_group, stem_gloss)
# get all t1 low/high words
t1_words_low <- filter(t1_words, delta_resid_group == "low")
t1_words_high <- filter(t1_words, delta_resid_group == "high")
high_crit_words <- setdiff(t1_words_high$stem_gloss,
t1_words_low$stem_gloss) %>%
sort() %>%# words that high kids know that low do not %>%
as.data.frame() %>%
mutate(type = "high_crit_words")
low_crit_words <- setdiff(t1_words_low$stem_gloss,
t1_words_high$stem_gloss) %>%
sort() %>% # words that low kids know that high do not
as.data.frame() %>%
mutate(type = "low_crit_words")
both_crit_words <- bind_rows(high_crit_words, low_crit_words) %>%
select(type, everything()) %>%
rename(stem_gloss = ".")
#write_csv(both_crit_words, "target_crit_words2.csv")prop_from_each_group_said_word_t1 <- good_types %>%
left_join(groups_info %>%
mutate(target_child_id = as.numeric(target_child_id))) %>%
count(stem_gloss, tbin, delta_resid_group) %>%
filter(tbin == "t1") %>%
group_by(stem_gloss) %>%
mutate(prop = n / 51) %>%
select(-tbin, -n) %>%
group_by(delta_resid_group, stem_gloss) %>%
arrange(delta_resid_group, -prop)
diff_in_props_between_groups <- prop_from_each_group_said_word_t1 %>%
spread(delta_resid_group, prop) %>%
mutate(high = ifelse(is.na(high), 0, high),
low = ifelse(is.na(low), 0, low),
diff = high - low) %>%
arrange(-diff) %>%
left_join(distinct(target_types, stem_gloss, gloss)) %>%
select(stem_gloss, gloss, diff, high, low) %>%
ungroup()
DT::datatable(diff_in_props_between_groups, caption = "Proportion of kids in each group that said each word, and the difference in proportions.")coding_pos <- prop_from_each_group_said_word_t1 %>%
ungroup() %>%
distinct(stem_gloss) %>%
left_join(distinct(target_types, gloss, stem_gloss)) %>%
left_join(lexicon::hash_grady_pos, by = c("stem_gloss" = "word")) %>%
group_by(stem_gloss) %>%
slice(1) %>%
select(gloss, stem_gloss, pos)
#write_csv(coding_pos, "t1_words_pos.csv")pos_tagged <- read_csv("t1_words_pos_molly_tagged.csv") %>%
select(-pos)
pos_distribution_of_all_words <- prop_from_each_group_said_word_t1 %>%
left_join(pos_tagged) %>%
group_by(delta_resid_group, molly_pos) %>%
summarise (n = n(),
sum_prop = sum(prop)) %>%
mutate(freq = n / sum(n),
freq_weighted = sum_prop / sum(sum_prop)) %>%
arrange(molly_pos) %>%
data.frame()
ggplot(pos_distribution_of_all_words, aes(group = delta_resid_group, x = molly_pos,
fill = delta_resid_group, y = freq)) +
geom_bar(stat = "identity", position = "dodge", color = "black") +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))+
ggtitle("Distribution of POS of all words")prop_pos_by_crit <- both_crit_words %>%
left_join(pos_tagged) %>%
group_by(type, molly_pos) %>%
summarise (n = n()) %>%
mutate(freq = n / sum(n)) %>%
arrange(molly_pos)
ggplot(prop_pos_by_crit, aes(group = type, x = molly_pos,
fill = type, y = freq)) +
geom_bar(stat = "identity", position = "dodge",
color = "black") +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))+
ggtitle("Distribution of POS of words said only by each group")both_crit_words %>%
left_join(pos_tagged) %>%
filter(type == "low_crit_words" & molly_pos %in% c("Adjective", "Noun", "Verb")) %>%
arrange(molly_pos) %>%
select(-type) %>%
DT::datatable(caption = "Content words produced only by low kids")And, only by words produced only by high kids:
both_crit_words %>%
left_join(pos_tagged) %>%
filter(type == "high_crit_words" & molly_pos %in% c("Adjective", "Noun", "Verb")) %>%
arrange(molly_pos) %>%
select(-type) %>%
DT::datatable(caption = "Content words produced only by high kids")The low words definitely seem “odder”,