MTLD words by usefulness

Group level
- number of trigrams
- frequency of trigrams
Individual kids
- number of trigrams
- frequency of trigrams

library(knitr)

# Default chunk options for the rendered report. Spelled with TRUE/FALSE
# rather than T/F, which are ordinary variables and can be reassigned.
opts_chunk$set(
  echo = TRUE, message = FALSE, warning = FALSE,
  error = FALSE, cache = FALSE, tidy = FALSE
)

library(tidyverse)
library(langcog)
library(data.table)
library(feather)
library(broom)

# Default ggplot2 theme for the figures in this document.
theme_set(theme_classic(base_size = 10))

Do kids that know more words at T2 (using mtld measure) know words that are more “useful” at T1? In particular, we define useful words to be words that allow you to make more trigrams in adult speech from childes (child-directed). Because high kids have bigger vocabs, we sample n words per group (here, 500).

It turns out that the pattern of effects depends a lot on the threshold you set for knowing a word. The results shown here are for a threshold of 5 occurrences (but they look different if it’s set to 1).

FINDINGS:

Min occurrences of word in vocab = 1:

At the group level, the high group produces more frequent trigrams, controlling for the frequency of the individual words.
At the kid level, there is no effect when controlling for word frequency.

Min occurrences of word in vocab = 5:

At the group level, the high group produces more frequent trigrams, controlling for the frequency of the individual words.
At the kid level, high kids produce more trigrams, controlling for the frequency of words in kids’ vocab (there are a number of kids missing here though, either because they don’t produce words at least 5 times or because their words are not in the sampled words).
At the kid level, low kids produce more frequent trigrams, controlling for vocab word frequency.

Group level

# Minimum number of occurrences for a word to count as part of a child's vocab.
MINCOUNT <- 5
# Number of words retained per vocabulary group.
TOPNWORDS_PER_GROUP <- 500

# Get all unique words for two groups at t1.
groups_info <- read_csv("../1_mtld_measure/data/groups_info_600_900.csv")

# Word counts per child, time bin and gloss. Glosses are lower-cased BEFORE
# aggregating so that case variants of a word are pooled into a single row and
# the MINCOUNT threshold is applied to the pooled count. Lower-casing after
# the summarize (as before) could leave duplicate child/tbin/gloss rows and
# apply the threshold to each spelling separately.
target_types <- read_csv(
  "../1_mtld_measure/data/target_types_for_MTLD_kids_600_900.csv"
) %>%
  mutate(gloss = tolower(gloss)) %>%
  group_by(target_child_id, tbin, gloss) %>%
  summarize(count = sum(count)) %>%
  ungroup() %>%
  filter(count >= MINCOUNT)


# Words at t1 for each vocabulary group, keeping the TOPNWORDS_PER_GROUP glosses
# with the highest row counts (n) in that group.
t1_words <- target_types %>%
  filter(tbin == "t1") %>% # t1 only
  left_join(groups_info %>%
              mutate(target_child_id = as.numeric(target_child_id))) %>% # merge in group info
  group_by(delta_resid_group, gloss) %>% # get glosses by group
  summarize(n = n()) %>% # get gloss counts by group
  ungroup() %>%
  # Rank BEFORE truncating. The previous top_n() + slice(1:N) kept every tie at
  # the cutoff and then sliced in alphabetical gloss order, so high-count words
  # late in the alphabet could be dropped in favour of tied low-count words.
  # Ties in n are broken alphabetically so the selection is deterministic.
  arrange(delta_resid_group, -n, gloss) %>%
  group_by(delta_resid_group) %>%
  slice(seq_len(TOPNWORDS_PER_GROUP))

# Adult trigram table, converted to a data.table so the vocabulary filters
# below can refer to the w1/w2/w3 columns directly inside `[`.
childes_trigrams <- data.table(
  read_csv("data/trigrams/adult_childes_trigrams_turns.csv")
)

# Glosses retained for each vocabulary group, as plain vectors.
# `%in%` (rather than `==`) treats a missing group label as "no match".
t1_words_high <- t1_words$gloss[t1_words$delta_resid_group %in% "high"]
t1_words_low <- t1_words$gloss[t1_words$delta_resid_group %in% "low"]

# Trigrams whose three words all belong to the group's word list.
high_trigrams <- childes_trigrams[
  w1 %in% t1_words_high & w2 %in% t1_words_high & w3 %in% t1_words_high
] %>%
  mutate(vocab_group = "high")

low_trigrams <- childes_trigrams[
  w1 %in% t1_words_low & w2 %in% t1_words_low & w3 %in% t1_words_low
] %>%
  mutate(vocab_group = "low")

# Stack both groups; vocab_group records which word list produced each row.
all_trigrams <- list(high_trigrams, low_trigrams) %>%
  bind_rows()

number of trigrams

# Number of trigram rows available to each vocabulary group.
all_trigrams %>%
  count(vocab_group) %>%
  kable()

vocab_group	n
high	100231
low	25762

frequency of trigrams

# Per-group summary (mean, ci_lower, ci_upper from multi_boot_standard) of the
# log trigram frequency, drawn as bars with CI line ranges.
all_trigrams %>%
  mutate(log_freq = log(freq)) %>%
  group_by(vocab_group) %>%
  multi_boot_standard(col = "log_freq", na.rm = TRUE) %>%
  ggplot(aes(vocab_group, mean,
             fill = vocab_group,
             group = vocab_group)) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper),
                 position = position_dodge(width = 0.9)) +
  theme_classic()

# Unused alternative summary, kept for reference:
  #summarize(mean_freq= mean(log_freq),
  #          ci_lower_freq = mean(log_freq) - (1.96 * sd(log_freq)),
  #          ci_upper_freq = mean(log_freq) + (1.96 * sd(log_freq)))

# Same per-group summary on the raw (unlogged) trigram frequency, as a table.
all_trigrams %>%
  group_by(vocab_group) %>%
  multi_boot_standard(col = "freq", na.rm = TRUE) %>%
  kable()

vocab_group	ci_lower	ci_upper	mean
high	4.023067	4.362018	4.16881
low	3.726431	4.612536	4.09417

# Word-frequency lookup: word form and its Lg10WF value from the SUBTLEXus file.
freq <- read_tsv("../1_mtld_measure/data/control_variables/SUBTLEXus_corpus.txt") %>%
  select(Word, Lg10WF)

# The same lookup replicated under w1/w2/w3 keys so that it can be joined onto
# each trigram position in turn.
freq_tidy <- freq %>%
  mutate(Word = tolower(Word)) %>%
  rename(w1 = Word,
         log_freq1 = Lg10WF) %>%
  mutate(w2 = w1,
         log_freq2 = log_freq1,
         w3 = w1,
         log_freq3 = log_freq1)

# One row per trigram with its log frequency and the mean / max frequency of
# its three words. The word-level summaries are computed with vectorized
# rowMeans()/pmax() instead of rowwise(), which is much faster on a table this
# size and returns an ordinary (non-rowwise) data frame.
# A trigram with no word found in the lookup gets NaN for the mean (as before)
# and NA for the max (previously -Inf from max() on an empty vector); both are
# still excluded by is.finite() filters and by lm()'s NA handling.
all_trigrams_with_freq <- all_trigrams %>%
  left_join(freq_tidy %>% select(w1, log_freq1), by = "w1") %>%
  left_join(freq_tidy %>% select(w2, log_freq2), by = "w2") %>%
  left_join(freq_tidy %>% select(w3, log_freq3), by = "w3") %>%
  mutate(
    mean_word_freq = rowMeans(cbind(log_freq1, log_freq2, log_freq3),
                              na.rm = TRUE),
    max_word_freq = pmax(log_freq1, log_freq2, log_freq3, na.rm = TRUE),
    log_trigram_freq = log(freq)
  ) %>%
  select(vocab_group, log_trigram_freq, max_word_freq, mean_word_freq)

# Group difference in log trigram frequency, controlling for the mean frequency
# of the trigram's words. `data` is spelled out rather than relying on partial
# matching of `d`.
lm(log_trigram_freq ~ vocab_group + mean_word_freq,
   data = all_trigrams_with_freq) %>%
  summary()

## 
## Call:
## lm(formula = log_trigram_freq ~ vocab_group + mean_word_freq, 
##     data = all_trigrams_with_freq)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.5250 -0.5693 -0.2550  0.3161  7.3738 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -3.941802   0.035122 -112.23  < 2e-16 ***
## vocab_grouplow -0.039697   0.005960   -6.66 2.74e-11 ***
## mean_word_freq  0.866465   0.006744  128.48  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.853 on 125950 degrees of freedom
##   (40 observations deleted due to missingness)
## Multiple R-squared:  0.116,  Adjusted R-squared:  0.116 
## F-statistic:  8266 on 2 and 125950 DF,  p-value: < 2.2e-16

# Same model with the maximum word frequency as the control; rows whose
# max_word_freq is not finite are dropped first. `data` is spelled out rather
# than relying on partial matching of `d`.
lm(log_trigram_freq ~ vocab_group + max_word_freq,
   data = all_trigrams_with_freq %>% filter(is.finite(max_word_freq))) %>%
  summary()

## 
## Call:
## lm(formula = log_trigram_freq ~ vocab_group + max_word_freq, 
##     data = all_trigrams_with_freq %>% filter(is.finite(max_word_freq)))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.8122 -0.5972 -0.3792  0.3139  7.7071 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -2.043505   0.037827 -54.023  < 2e-16 ***
## vocab_grouplow -0.026020   0.006222  -4.182 2.89e-05 ***
## max_word_freq   0.452620   0.006565  68.949  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8905 on 125950 degrees of freedom
## Multiple R-squared:  0.03654,    Adjusted R-squared:  0.03652 
## F-statistic:  2388 on 2 and 125950 DF,  p-value: < 2.2e-16

Kids in the high group produce higher-frequency trigrams, controlling for the mean frequency of the words/max frequency of the words in the trigrams.

Individual kids

# t1 word rows restricted to glosses that appear in t1_words, with the group
# information attached to every row.
good_types_t1 <- target_types %>%
  filter(tbin == "t1",
         gloss %in% t1_words$gloss) %>%
  left_join(mutate(groups_info,
                   target_child_id = as.numeric(target_child_id)))

# Summarise one vocabulary against a trigram table.
#
# Arguments:
#   df           data frame with a `gloss` column (the words in the vocabulary).
#   all_trigrams table with columns w1, w2, w3 and freq; a data.table or a
#                plain data frame.
#   measure      "num"  -> log of the number of trigrams whose three words are
#                          all in the vocabulary;
#                "freq" -> mean of log(freq) over those trigrams.
#   freq         word-frequency lookup with columns Word and Lg10WF.
#
# Returns a list of four values, in this order: the requested trigram measure
# (named log_trigram_num or mean_trigram_freq), mean_word_freq,
# median_word_freq and max_word_freq of the vocabulary's Lg10WF values
# (words missing from the lookup are ignored).
get_trigrams_by_kid <- function(df, all_trigrams, measure, freq) {
  vocab <- df$gloss

  # Keep trigrams made up entirely of vocabulary words. The mask is built
  # outside `[` so the same code works for data.tables and data frames.
  in_vocab <- all_trigrams$w1 %in% vocab &
    all_trigrams$w2 %in% vocab &
    all_trigrams$w3 %in% vocab
  current_trigrams <- all_trigrams[in_vocab, ]

  # Left join of the vocabulary onto the frequency lookup (unmatched words
  # get NA and are dropped from the summaries below).
  vocab_freqs <- merge(df, freq, by.x = "gloss", by.y = "Word", all.x = TRUE)
  word_freqs <- vocab_freqs$Lg10WF
  known_freqs <- word_freqs[!is.na(word_freqs)]

  mean_freq <- mean(word_freqs, na.rm = TRUE)
  median_freq <- median(word_freqs, na.rm = TRUE)
  # Previously computed with median() by mistake. Guarded so that a vocabulary
  # with no known frequencies yields NA instead of -Inf plus a warning.
  max_freq <- if (length(known_freqs) > 0) max(known_freqs) else NA_real_

  if (measure == "num") {
    list(log_trigram_num = log(nrow(current_trigrams)),
         mean_word_freq = mean_freq,
         median_word_freq = median_freq,
         max_word_freq = max_freq)
  } else if (measure == "freq") {
    list(mean_trigram_freq = mean(log(current_trigrams$freq)),
         mean_word_freq = mean_freq,
         median_word_freq = median_freq,
         max_word_freq = max_freq)
  } else {
    # Fail loudly instead of silently returning NULL.
    stop("`measure` must be \"num\" or \"freq\", not \"", measure, "\"",
         call. = FALSE)
  }
}

number of trigrams

# Per-child trigram counts. Each child's (and time bin's) rows are nested into
# `data`, get_trigrams_by_kid() is run on them with measure "num", and the four
# values it returns are paired with the labels in kid_measure_names, then
# spread into one column per label.
# NOTE(review): the labels are paired with the returned values by position, not
# by name (the first label here, "log_num_trigrams", differs from the name used
# inside get_trigrams_by_kid()), so this vector must stay in the same order as
# the list that function builds.
# NOTE(review): groups_info is joined here without the as.numeric() conversion
# of target_child_id used in the earlier joins -- confirm the key types agree.
# NOTE(review): nest(-cols), unnest() with no columns and spread() are legacy
# tidyr interfaces; behaviour should be re-checked if tidyr is upgraded.
trigram_num_by_kid <- good_types_t1 %>%
              nest(-target_child_id, -tbin) %>%
              mutate(kid_measures = 
                       map(data, get_trigrams_by_kid,
                           childes_trigrams, "num", freq),
                     kid_measure_names = list(c("log_num_trigrams",  # this is bad
                                                "mean_word_freq",
                                                "median_word_freq",
                                                "max_word_freq"))) %>%
            # the nested word rows are no longer needed once summarised
            select(-data) %>%
            unnest() %>%
            mutate(kid_measures = unlist(kid_measures)) %>%
            left_join(groups_info %>% select(target_child_id, delta_resid_group)) %>%
            # long (label, value) pairs -> one column per measure
            spread(kid_measure_names, kid_measures)



# Distribution of per-child log trigram counts by group: overlaid densities.
trigram_num_by_kid %>%
  ggplot(aes(log_num_trigrams,
             group = delta_resid_group,
             fill = delta_resid_group)) +
  geom_density(alpha = 0.4) +
  theme_classic()

# The same variable as a histogram filled by group.
trigram_num_by_kid %>%
  ggplot(aes(log_num_trigrams, fill = delta_resid_group)) +
  geom_histogram() +
  theme_classic()

# Per-group summary (mean, ci_lower, ci_upper from multi_boot_standard) of the
# per-child log trigram counts, drawn as bars with CI line ranges.
trigram_num_by_kid %>%
  group_by(delta_resid_group) %>%
  multi_boot_standard(col = "log_num_trigrams", na.rm = TRUE) %>%
  ggplot(aes(delta_resid_group, mean,
             fill = delta_resid_group,
             group = delta_resid_group)) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper),
                 position = position_dodge(width = 0.9)) +
  theme_classic()

# Group difference in per-child log trigram count, controlling for the mean
# word frequency of the child's vocab. Children with a non-finite log count
# are dropped. `data` is spelled out rather than relying on partial matching
# of `d`.
lm(log_num_trigrams ~ delta_resid_group + mean_word_freq,
   data = trigram_num_by_kid %>% filter(is.finite(log_num_trigrams))) %>%
  summary()

## 
## Call:
## lm(formula = log_num_trigrams ~ delta_resid_group + mean_word_freq, 
##     data = trigram_num_by_kid %>% filter(is.finite(log_num_trigrams)))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.5340 -1.7205  0.6778  2.1102  5.7103 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           16.1982     2.6749   6.056 5.34e-08 ***
## delta_resid_grouplow  -2.6669     0.6923  -3.852 0.000247 ***
## mean_word_freq        -2.0165     0.5970  -3.378 0.001168 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.979 on 74 degrees of freedom
##   (3 observations deleted due to missingness)
## Multiple R-squared:  0.3053, Adjusted R-squared:  0.2865 
## F-statistic: 16.26 on 2 and 74 DF,  p-value: 1.403e-06

# Same model with the median word frequency of the child's vocab as the
# control. `data` is spelled out rather than relying on partial matching of `d`.
lm(log_num_trigrams ~ delta_resid_group + median_word_freq,
   data = trigram_num_by_kid %>% filter(is.finite(log_num_trigrams))) %>%
  summary()

## 
## Call:
## lm(formula = log_num_trigrams ~ delta_resid_group + median_word_freq, 
##     data = trigram_num_by_kid %>% filter(is.finite(log_num_trigrams)))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.5658 -2.0368  0.4806  2.0022  5.5137 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           14.3034     2.2911   6.243 2.44e-08 ***
## delta_resid_grouplow  -2.6411     0.7028  -3.758 0.000339 ***
## median_word_freq      -1.5834     0.5070  -3.123 0.002552 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.009 on 74 degrees of freedom
##   (3 observations deleted due to missingness)
## Multiple R-squared:  0.2916, Adjusted R-squared:  0.2724 
## F-statistic: 15.23 on 2 and 74 DF,  p-value: 2.892e-06

frequency of trigrams

# Per-child trigram frequency. Each child's (and time bin's) rows are nested
# into `data`, get_trigrams_by_kid() is run on them with measure "freq", and the
# four values it returns are paired with the labels in kid_measure_names, then
# spread into one column per label.
# NOTE(review): the labels are paired with the returned values by position, not
# by name, so this vector must stay in the same order as the list built inside
# get_trigrams_by_kid().
# NOTE(review): groups_info is joined here without the as.numeric() conversion
# of target_child_id used in the earlier joins -- confirm the key types agree.
# NOTE(review): nest(-cols), unnest() with no columns and spread() are legacy
# tidyr interfaces; behaviour should be re-checked if tidyr is upgraded.
trigram_freqs_by_kid <- good_types_t1 %>%
              nest(-target_child_id, -tbin) %>%
              mutate(kid_measures = 
                       map(data, get_trigrams_by_kid,
                           childes_trigrams, "freq", freq),
                     kid_measure_names = list(c("log_mean_trigram_frequency",  # this is bad
                                                "mean_word_freq",
                                                "median_word_freq",
                                                "max_word_freq"))) %>%
            # the nested word rows are no longer needed once summarised
            select(-data) %>%
            unnest() %>%
            mutate(kid_measures = unlist(kid_measures)) %>%
            left_join(groups_info %>% select(target_child_id, delta_resid_group)) %>%
            # long (label, value) pairs -> one column per measure
            spread(kid_measure_names, kid_measures)

# Distribution of the per-child trigram frequency measure by group.
trigram_freqs_by_kid %>%
  ggplot(aes(log_mean_trigram_frequency,
             group = delta_resid_group,
             fill = delta_resid_group)) +
  geom_density(alpha = 0.4) +
  theme_classic()

# Per-group summary (mean, ci_lower, ci_upper from multi_boot_standard) of the
# per-child trigram frequency measure, drawn as bars with CI line ranges.
trigram_freqs_by_kid %>%
  group_by(delta_resid_group) %>%
  multi_boot_standard(col = "log_mean_trigram_frequency", na.rm = TRUE) %>%
  ggplot(aes(delta_resid_group, mean,
             fill = delta_resid_group,
             group = delta_resid_group)) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper),
                 position = position_dodge(width = 0.9)) +
  theme_classic()

# Group difference in the per-child trigram frequency measure, controlling for
# the mean word frequency of the child's vocab. `data` is spelled out rather
# than relying on partial matching of `d`.
lm(log_mean_trigram_frequency ~ delta_resid_group + mean_word_freq,
   data = trigram_freqs_by_kid) %>%
  summary()

## 
## Call:
## lm(formula = log_mean_trigram_frequency ~ delta_resid_group + 
##     mean_word_freq, data = trigram_freqs_by_kid)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.3618 -0.4136 -0.1613  0.1153  3.1254 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           -2.4181     0.7778  -3.109  0.00267 ** 
## delta_resid_grouplow   0.5240     0.2013   2.603  0.01116 *  
## mean_word_freq         0.7703     0.1736   4.437 3.12e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8664 on 74 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.3029, Adjusted R-squared:  0.284 
## F-statistic: 16.08 on 2 and 74 DF,  p-value: 1.593e-06

# Same model with the median word frequency of the child's vocab as the
# control. `data` is spelled out rather than relying on partial matching of `d`.
lm(log_mean_trigram_frequency ~ delta_resid_group + median_word_freq,
   data = trigram_freqs_by_kid) %>%
  summary()

## 
## Call:
## lm(formula = log_mean_trigram_frequency ~ delta_resid_group + 
##     median_word_freq, data = trigram_freqs_by_kid)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.3375 -0.4262 -0.1738  0.1008  3.1990 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           -1.4487     0.6840  -2.118  0.03752 *  
## delta_resid_grouplow   0.5309     0.2098   2.531  0.01351 *  
## median_word_freq       0.5493     0.1513   3.629  0.00052 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8982 on 74 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.2508, Adjusted R-squared:  0.2305 
## F-statistic: 12.38 on 2 and 74 DF,  p-value: 2.296e-05

MTLD words by usefulness

defined as adult trigrams from childes

2018-05-08

Group level

number of trigrams

frequency of trigrams

Individual kids

number of trigrams

frequency of trigrams