MTLD exploration

library(knitr)

opts_chunk$set(echo = T, message = F, warning = F, 
               error = F, cache = F, tidy = F)

library(tidyverse)
library(langcog)

theme_set(theme_classic(base_size = 10))

To do this analysis, we need a good measure of vocab size from corpus data. Turns out TTR is terrible, it asymptotes early in development, while more objective measures of vocab size continue to grow. Here we use a differnt measure that is more highly correlated with CDI scores, MTLD.

Measure of vocab - MTLD

Group average across development

in N. America corpus (kids with 2 timepoints only)

ld_df <- read_csv("diversity_measures_by_age.csv", 
                  col_names = c("target_child_name",
                                "corpus_name",
                                "target_child_id",
                                "target_child_age",
                                "mtld")) %>%
  filter(is.finite(mtld))  %>%
  mutate(collection = "NAM")

ld_df %>%
  filter(target_child_age < 2000) %>%
  mutate(age_bin = cut(target_child_age, 
                       breaks = 30)) %>%
  group_by(age_bin) %>%
  multi_boot_standard(col = "mtld") %>%
  ggplot(aes(x = age_bin, y = mean, group = 1)) +
  geom_smooth() +
  geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper), size = .2) +
  ggtitle("CHILDES MTLD across development") +
  ylab("MTLD") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

by kid

longitudinal_kids <- ld_df %>%
  count(target_child_name, corpus_name) %>%
  filter(n > 1)

longitudinal_df <- ld_df %>%
filter(target_child_age < 2000) %>%
 right_join(longitudinal_kids)

ggplot(longitudinal_df, 
       aes(x = target_child_age, y = mtld, group = 1)) +
  facet_wrap(~target_child_id, ncol = 8) +
  geom_line(color = "blue") +
  geom_point(size = .4) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

We don’t have enough data really to ask longitidinal questions. Let’s also look at the UK corpus.

UK corpus

ld_df_UK <- read_csv("diversity_measures_by_age_UK.csv", 
                  col_names = c("target_child_name",
                                "corpus_name",
                                "target_child_id",
                                "target_child_age",
                                "mtld")) %>%
  filter(is.finite(mtld))  %>%
  mutate(collection = "UK")

all_ld_df <- ld_df %>%
  bind_rows(ld_df_UK)

all_ld_df %>%
  filter(target_child_age < 2000) %>%
  mutate(age_bin = cut(target_child_age, 
                       breaks = 30)) %>%
  group_by(age_bin, collection) %>%
  multi_boot_standard(col = "mtld") %>%
  ggplot(aes(x = age_bin, y = mean, group = collection, 
             color = collection)) +
  geom_smooth() +
  geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper), size = .2) +
  ggtitle("CHILDES MTLD across development") +
  ylab("MTLD") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

It’s noisier, but they look pretty comparable - let’s combine them

longitudinal_kids_all <- all_ld_df %>%
  count(collection, target_child_name, corpus_name) %>%
  filter(n > 1)

longitudinal_df_all <- all_ld_df %>%
    filter(target_child_age < 2000) %>%
    right_join(longitudinal_kids_all)

Split kids two time points

This is the optimal cutoff that maximizes kids in the two groups.

ggplot(longitudinal_df_all, aes(x = target_child_age, y = mtld, 
                      group = target_child_id, color = collection)) +
  geom_point(size = .4) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

MIN_AGE <- 450
MAX_AGE <- 1150

t_midpoint <- MIN_AGE + (MAX_AGE-MIN_AGE)/2

t2_kids <- longitudinal_df_all %>%
  filter(target_child_age < MAX_AGE & target_child_age > MIN_AGE) %>%
  mutate(tbin = ifelse(target_child_age> t_midpoint, "t2", "t1"))

ggplot(t2_kids, aes(x = target_child_age, y = mtld, 
                      group = target_child_id, color = collection)) +
  geom_point(size = .4) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# average across kids who have multiple measures in this
t2_kids_ms <- t2_kids %>%
  group_by(target_child_id, tbin) %>%
  summarize(mean_mtld = mean(mtld),
            mean_target_child_age = mean(target_child_age)) %>%
  arrange(target_child_id, tbin)

two_timepoint_kids <- t2_kids_ms %>%
  count(target_child_id) %>%
  filter(n>1) 

ggplot(t2_kids_ms %>% right_join(two_timepoint_kids), 
       aes(x = mean_target_child_age, y = mean_mtld, 
                      group = target_child_id, color = as.factor(target_child_id))) +
  geom_vline(aes(xintercept = t_midpoint), linetype = 2) +
  geom_line() +
  xlim(MIN_AGE, MAX_AGE) +
  geom_point(size = .4) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1),
        legend.position= "none")

Using this time point, here’s the distribution of MTLD at the two time points.

good_kids <- t2_kids_ms %>% 
  right_join(two_timepoint_kids) %>%
  select(-n)
good_kids %>%
  ggplot(aes(x = mean_mtld, group = tbin, fill = tbin)) +
  geom_density(alpha = .5) +
  theme_classic() +
  ggtitle("Distributions of MTLD at two time points")

Get critical DV - early and late talkers

I’m operationalizing this as the kids with the greatest change in MTLD, controling for t1 MTLD and age difference.

Get slopes:

delta_mtld <- good_kids %>%
  select(-mean_target_child_age) %>%
  spread(tbin,mean_mtld) %>%
  rename(mtld_t1 = t1, mtld_t2 = t2 ) %>%
      left_join(good_kids %>%
                select(-mean_mtld) %>%
                spread(tbin,mean_target_child_age) %>%
                rename(age_t1 = t1, age_t2 = t2)) %>%
    mutate(mtld_diff = mtld_t2 - mtld_t1, 
           age_diff = age_t2 - age_t1,
           slope = mtld_diff/age_diff)

delta_mtld %>%
  ggplot(aes(x = slope, group = 1)) +
  geom_density(fill = "grey") +
  theme_classic() +
  ggtitle("Change in MTLD across timepoints")

Get slopes, controling for other stuff:

mod <- lm(slope ~ age_diff + mtld_t1, d = delta_mtld) 

delta_mtld_with_resids <- delta_mtld %>%
  modelr::add_residuals(mod, var = "delta_resid")

median_delta_resid <- median(delta_mtld_with_resids$delta_resid)

kid_groups <- delta_mtld_with_resids %>%
  mutate(delta_resid_group = ifelse(delta_resid > median_delta_resid,
                                    "high", "low"))

Here’s what measures look like for the two groups:

kid_groups %>%
  group_by(delta_resid_group) %>%
  multi_boot_standard(col = "delta_resid") %>%
  ggplot(aes(x = delta_resid_group,  y = mean, 
             ymin = ci_lower, ymax = ci_upper)) +
           geom_pointrange() +
  ggtitle("delta_resid between two groups") +
  theme_classic()

kid_groups %>%
  group_by(delta_resid_group) %>%
  multi_boot_standard(col = "mtld_t1") %>%
  ggplot(aes(x = delta_resid_group,  y = mean, 
             ymin = ci_lower, ymax = ci_upper)) +
           geom_pointrange() +
  ggtitle("mtld_t1 between two groups") +
  theme_classic()

groups_info <- kid_groups %>%
  select(target_child_id, delta_resid_group) %>%
  left_join(all_ld_df %>% distinct(target_child_id,
                                   target_child_name, 
                                   collection, corpus_name))  %>%
  ungroup()  %>%
  mutate(collection = fct_recode(collection, 
                                 "Eng-NA" = "NAM",
                                 "Eng-UK" = "UK"))  %>%
  mutate_all(as.character)

#write_csv(groups_info, "groups_info.csv")

T1 words by talk group

Get types for all kids.

MIN_COUNT <- 5
MURMURS = c("xxx", "yyy", "yyy_yyy","---",
               "s","f","t","n","d","b","e","c","o",
               "v","h","r","k","j","g",
               "p","x","z","u","v","m","l")
  
get_types_by_bin <- function(corpus_name, 
                      target_child_name, 
                      collection, 
                      min_age, 
                      max_age, 
                      min_count,
                      murmurs){
  print(target_child_name)
  
  all_types <- childesr::get_types(
                          corpus = corpus_name, 
                          child = target_child_name,
                          collection = collection,
                          role = "target_child") 
  
  midpoint <- min_age + (max_age - min_age)/2

  all_types %>%
    filter(count >= min_count,
           target_child_age >=  min_age,
           target_child_age <= max_age) %>%
    mutate(tbin = case_when(target_child_age > midpoint ~ "t2",
                            target_child_age < midpoint ~ "t1")) %>%
    filter(!(gloss %in% murmurs)) %>%
    select(collection_name, 
           corpus_id,
           target_child_name,
           target_child_id,  
           tbin, 
           gloss) %>%
    distinct() # get rid of multiple entries in same time bin
}     

target_types <- groups_info %>%
  select(corpus_name, target_child_name, collection) %>%
  as.list() %>%
  pmap_df(get_types_by_bin, MIN_AGE, MAX_AGE, MIN_COUNT, MURMURS)

write_csv(target_types, "target_types_delta_450_1150.csv")

Average number of word types per kid: 125

stem_hunspell <- function(term) {
    # look up the term in the dictionary
    stems <- hunspell::hunspell_stem(term)[[1]]

    if (length(stems) == 0) { # if there are no stems, use the original term
        stem <- term
    } else { # if there are multiple stems, use the last one
        stem <- stems[[length(stems)]]
    }

    stem
}

target_types <- read_csv("target_types_delta_450_1150.csv") %>%
  mutate(tbin = fct_recode(tbin, 
                           t1 = "low",
                           t2 = "high")) %>%
  mutate(gloss = tolower(gloss)) %>%
  rowwise() %>%
  mutate(stem_gloss = corpus::text_tokens(gloss, stemmer = stem_hunspell)[[1]][1]) 
                                          
#target_types %>%
#    count(target_child_name, collection_name, tbin) %>% 
#    summarize(mean = mean(n))

Let’s filter what words we look at to only be those words that were produced min number of times by each kid (5). Also, not all kids actually have data at both timepoints once we exclude murmurs, etc. Finally, exlcude words that fewer than 3 kids said.

MIN_KIDS_PRODUCED_WORD <- 3

good_words <- target_types %>%
  count(stem_gloss, target_child_id) %>%
  count(stem_gloss) %>%
  filter(nn >= MIN_KIDS_PRODUCED_WORD)

good_kids <- target_types %>%
  filter(stem_gloss %in% good_words$stem_gloss) %>%
  count(target_child_id, tbin) %>%
  count(target_child_id) %>%
  filter(nn > 1)

good_types <- target_types %>%
    filter(stem_gloss %in% good_words$stem_gloss,
           target_child_id %in% good_kids$target_child_id)

t1_words <- good_types %>%
  left_join(groups_info %>% 
              mutate(target_child_id = as.numeric(target_child_id))) %>%
  filter(tbin == "t1") %>%
  distinct(tbin, delta_resid_group, stem_gloss)

# get all t1 low/high words
t1_words_low <- filter(t1_words, delta_resid_group == "low")
t1_words_high <- filter(t1_words, delta_resid_group == "high")

high_crit_words <- setdiff(t1_words_high$stem_gloss,
                           t1_words_low$stem_gloss)  %>%
  sort() %>%# words that high kids know that low do not %>%
  as.data.frame() %>%
  mutate(type = "high_crit_words")

low_crit_words <- setdiff(t1_words_low$stem_gloss,
                          t1_words_high$stem_gloss)  %>% 
  sort() %>% # words that low kids know that high do not
  as.data.frame() %>%
  mutate(type = "low_crit_words")

both_crit_words <- bind_rows(high_crit_words, low_crit_words) %>% 
            select(type, everything()) %>%
  rename(stem_gloss = ".")

#write_csv(both_crit_words, "target_crit_words2.csv")

prop_from_each_group_said_word_t1 <- good_types %>%
  left_join(groups_info %>% 
              mutate(target_child_id = as.numeric(target_child_id)))  %>%
  count(stem_gloss, tbin, delta_resid_group) %>%
  filter(tbin == "t1") %>%
  group_by(stem_gloss) %>%
  mutate(prop = n / 51)  %>%
  select(-tbin, -n) %>%
  group_by(delta_resid_group, stem_gloss) %>%
  arrange(delta_resid_group, -prop) 

diff_in_props_between_groups <- prop_from_each_group_said_word_t1 %>%
  spread(delta_resid_group, prop) %>%
  mutate(high = ifelse(is.na(high), 0, high),
         low = ifelse(is.na(low), 0, low),
         diff = high - low) %>%
  arrange(-diff) %>%
  left_join(distinct(target_types, stem_gloss, gloss)) %>%
  select(stem_gloss, gloss,  diff, high, low) %>%
  ungroup() 

DT::datatable(diff_in_props_between_groups, caption = "Proportion of kids in each group that said each word, and the difference in proportions.")

Part of speech

coding_pos <- prop_from_each_group_said_word_t1 %>%
  ungroup() %>%
  distinct(stem_gloss) %>%
  left_join(distinct(target_types, gloss, stem_gloss)) %>%
  left_join(lexicon::hash_grady_pos, by = c("stem_gloss" = "word")) %>%
  group_by(stem_gloss) %>%
  slice(1) %>%
  select(gloss, stem_gloss, pos)
#write_csv(coding_pos, "t1_words_pos.csv")

pos_tagged <- read_csv("t1_words_pos_molly_tagged.csv") %>%
  select(-pos)

pos_distribution_of_all_words <- prop_from_each_group_said_word_t1 %>%
  left_join(pos_tagged) %>%
  group_by(delta_resid_group, molly_pos) %>%
  summarise (n = n(),
             sum_prop = sum(prop)) %>%
  mutate(freq = n / sum(n),
         freq_weighted = sum_prop / sum(sum_prop)) %>%
  arrange(molly_pos) %>%
  data.frame()

ggplot(pos_distribution_of_all_words, aes(group = delta_resid_group, x = molly_pos, 
                             fill = delta_resid_group, y = freq)) +
  geom_bar(stat = "identity", position = "dodge",  color = "black") +
  theme_classic() + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1))+
  ggtitle("Distribution of POS of all words")

prop_pos_by_crit <- both_crit_words %>%
  left_join(pos_tagged) %>%
  group_by(type, molly_pos) %>%
  summarise (n = n()) %>%
  mutate(freq = n / sum(n)) %>%
  arrange(molly_pos)

ggplot(prop_pos_by_crit, aes(group = type, x = molly_pos, 
                             fill = type, y = freq)) +
  geom_bar(stat = "identity", position = "dodge", 
           color = "black") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))+
  ggtitle("Distribution of POS of words said only by each group")

both_crit_words %>%
  left_join(pos_tagged) %>%
  filter(type == "low_crit_words" & molly_pos %in% c("Adjective", "Noun", "Verb")) %>%
  arrange(molly_pos) %>%
  select(-type) %>%
  DT::datatable(caption = "Content words produced only by low kids")

And, only by words produced only by high kids:

both_crit_words %>%
  left_join(pos_tagged) %>%
  filter(type == "high_crit_words" & molly_pos %in% c("Adjective", "Noun", "Verb")) %>%
  arrange(molly_pos) %>%
  select(-type) %>%
  DT::datatable(caption = "Content words produced only by high kids")

The low words definitely seem “odder”,