library(knitr)

# Global knitr chunk options for this document: echo the code, suppress
# messages and warnings, and switch off error capture, caching and tidying.
opts_chunk$set(
  echo = TRUE,
  message = FALSE,
  warning = FALSE,
  error = FALSE,
  cache = FALSE,
  tidy = FALSE
)

library(tidyverse)
library(langcog)
library(data.table)
library(feather)
library(broom)

# Default ggplot2 theme for the document: classic theme with base font size 10.
# Plots that add theme_classic() explicitly override this default.
theme_set(theme_classic(base_size = 10))

Do kids who know more words at T2 (using the MTLD measure) know words that are more “useful” at T1? In particular, we define useful words to be words that allow you to make more trigrams in adult (child-directed) speech from CHILDES. Because kids in the high group have bigger vocabularies, we select the same number of words per group (here, the top 500).

It turns out that the pattern of effects depends a lot on what you set the threshold value for knowing a word to be. The results shown here are where the threshold is set at 5 occurrences (but it looks different if it’s set to 1).

FINDINGS:

Min occurrences of word in vocab = 1:

Min occurrences of word in vocab = 5:

Group level

# Minimum number of times a child must produce a word (summed count per
# child and time bin) for the word to be kept as part of that child's vocabulary.
MINCOUNT <- 5
# Number of words retained per group when building the group vocabularies.
TOPNWORDS_PER_GROUP <- 500

# Get all unique words for two groups at t1.
# Per-child group information (joined onto the word counts further down).
groups_info <- read_csv("../1_mtld_measure/data/groups_info_600_900.csv")

# Word counts per child, time bin and word, keeping only words produced at
# least MINCOUNT times.
#
# Glosses are lowercased BEFORE aggregating. Lowercasing afterwards (as was
# done previously) counts case variants such as "The"/"the" separately, applies
# the MINCOUNT threshold to each variant on its own, and can leave several rows
# for the same lowercased word within one child.
target_types <- read_csv("../1_mtld_measure/data/target_types_for_MTLD_kids_600_900.csv") %>%
  mutate(gloss = tolower(gloss)) %>%
  group_by(target_child_id, tbin, gloss) %>%
  summarize(count = sum(count)) %>%
  ungroup() %>%
  filter(count >= MINCOUNT)


# Top TOPNWORDS_PER_GROUP words per group at t1, ranked by the number of
# (child, word) rows in which the word appears.
#
# Rows are sorted by descending count BEFORE the per-group cut. The previous
# top_n() + slice(1:N) combination kept all ties from top_n() and then cut the
# result in its existing (alphabetical) row order, so alphabetically late
# words could be dropped regardless of how high their count was. Ties at the
# cut-off are still broken alphabetically, which keeps the result deterministic.
# The result stays grouped by delta_resid_group, as before.
t1_words <- target_types %>%
  filter(tbin == "t1") %>% # t1 only
  left_join(groups_info %>%
              mutate(target_child_id = as.numeric(target_child_id))) %>% # merge in group info
  group_by(delta_resid_group, gloss) %>% # get glosses by group
  summarize(n = n()) %>% # get gloss counts by group
  arrange(delta_resid_group, desc(n)) %>%
  group_by(delta_resid_group) %>%
  slice(seq_len(TOPNWORDS_PER_GROUP))

# Adult CHILDES trigrams, held as a data.table so they can be filtered with
# data.table's column-scoped `[` syntax.
childes_trigrams <- read_csv("data/trigrams/adult_childes_trigrams_turns.csv") %>%
  data.table()

# Vocabulary of each group as a plain vector of glosses.
t1_words_high <- t1_words %>%
  ungroup() %>%
  filter(delta_resid_group == "high") %>%
  pull(gloss)

t1_words_low <- t1_words %>%
  ungroup() %>%
  filter(delta_resid_group == "low") %>%
  pull(gloss)

# Trigrams whose three words all belong to a group's vocabulary, labelled
# with that group.
high_trigrams <- mutate(
  childes_trigrams[w1 %in% t1_words_high &
                     w2 %in% t1_words_high &
                     w3 %in% t1_words_high],
  vocab_group = "high"
)

low_trigrams <- mutate(
  childes_trigrams[w1 %in% t1_words_low &
                     w2 %in% t1_words_low &
                     w3 %in% t1_words_low],
  vocab_group = "low"
)

# Both groups' trigrams stacked into one table.
all_trigrams <- bind_rows(high_trigrams, low_trigrams)

number of trigrams

# Table of the number of trigrams available to each vocabulary group.
all_trigrams %>%
  count(vocab_group) %>%
  kable()
vocab_group n
high 100231
low 25762

frequency of trigrams

# Bar plot of the bootstrapped mean log trigram frequency per vocabulary
# group, with ci_lower/ci_upper drawn as line ranges.
ggplot(
  data = all_trigrams %>%
    mutate(log_freq = log(freq)) %>%
    group_by(vocab_group) %>%
    multi_boot_standard(col = "log_freq", na.rm = TRUE),
  mapping = aes(
    x = vocab_group,
    y = mean,
    group = vocab_group,
    fill = vocab_group
  )
) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_linerange(
    aes(ymin = ci_lower, ymax = ci_upper),
    position = position_dodge(width = 0.9)
  ) +
  theme_classic()

  #summarize(mean_freq= mean(log_freq),
  #          ci_lower_freq = mean(log_freq) - (1.96 * sd(log_freq)),
  #          ci_upper_freq = mean(log_freq) + (1.96 * sd(log_freq)))

# Table of the bootstrapped mean of the raw (unlogged) trigram frequency.
kable(
  multi_boot_standard(
    group_by(all_trigrams, vocab_group),
    col = "freq",
    na.rm = TRUE
  )
)
vocab_group ci_lower ci_upper mean
high 4.023067 4.362018 4.16881
low 3.726431 4.612536 4.09417
# Word-frequency table from the SUBTLEXus corpus file: the word and its
# Lg10WF value only.
freq <- read_tsv("../1_mtld_measure/data/control_variables/SUBTLEXus_corpus.txt") %>%
  select(Word, Lg10WF)

# Lowercased copy of the frequency table with the word/frequency pair repeated
# under the names w1/w2/w3, so it can be joined onto each trigram position.
freq_tidy <- freq %>%
  mutate(Word = tolower(Word)) %>%
  rename(w1 = Word,
         log_freq1 = Lg10WF) %>%
  mutate(w2 = w1,
         log_freq2 = log_freq1,
         w3 = w1,
         log_freq3 = log_freq1)

# Attach the frequency of each word in the trigram, then summarise per trigram.
# The summaries are computed column-wise (rowMeans / pmax) rather than through
# rowwise(), which evaluated mean() and max() once per row. Differences from
# the rowwise version: the result is an ordinary (non-rowwise) table, and a
# trigram with no known word frequency gets max_word_freq = NA instead of -Inf
# (with a warning). mean_word_freq is NaN in that case, as before.
all_trigrams_with_freq <- all_trigrams %>%
  left_join(freq_tidy %>% select(w1, log_freq1), by = "w1") %>%
  left_join(freq_tidy %>% select(w2, log_freq2), by = "w2") %>%
  left_join(freq_tidy %>% select(w3, log_freq3), by = "w3") %>%
  mutate(mean_word_freq = rowMeans(cbind(log_freq1, log_freq2, log_freq3),
                                   na.rm = TRUE),
         max_word_freq = pmax(log_freq1, log_freq2, log_freq3,
                              na.rm = TRUE),
         log_trigram_freq = log(freq)) %>%
  select(vocab_group, log_trigram_freq, max_word_freq, mean_word_freq)

# Does vocabulary group predict log trigram frequency once the mean frequency
# of the trigram's words is controlled for?
lm(log_trigram_freq ~ vocab_group + mean_word_freq, data = all_trigrams_with_freq) %>%
  summary()
## 
## Call:
## lm(formula = log_trigram_freq ~ vocab_group + mean_word_freq, 
##     data = all_trigrams_with_freq)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.5250 -0.5693 -0.2550  0.3161  7.3738 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -3.941802   0.035122 -112.23  < 2e-16 ***
## vocab_grouplow -0.039697   0.005960   -6.66 2.74e-11 ***
## mean_word_freq  0.866465   0.006744  128.48  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.853 on 125950 degrees of freedom
##   (40 observations deleted due to missingness)
## Multiple R-squared:  0.116,  Adjusted R-squared:  0.116 
## F-statistic:  8266 on 2 and 125950 DF,  p-value: < 2.2e-16
# Regress log trigram frequency on vocabulary group, controlling for the
# maximum word frequency; rows with a non-finite max_word_freq are dropped.
summary(
  lm(
    log_trigram_freq ~ vocab_group + max_word_freq,
    data = all_trigrams_with_freq %>% filter(is.finite(max_word_freq))
  )
)
## 
## Call:
## lm(formula = log_trigram_freq ~ vocab_group + max_word_freq, 
##     data = all_trigrams_with_freq %>% filter(is.finite(max_word_freq)))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.8122 -0.5972 -0.3792  0.3139  7.7071 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -2.043505   0.037827 -54.023  < 2e-16 ***
## vocab_grouplow -0.026020   0.006222  -4.182 2.89e-05 ***
## max_word_freq   0.452620   0.006565  68.949  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8905 on 125950 degrees of freedom
## Multiple R-squared:  0.03654,    Adjusted R-squared:  0.03652 
## F-statistic:  2388 on 2 and 125950 DF,  p-value: < 2.2e-16

Kids in the high group produce higher-frequency trigrams, controlling for the mean frequency of the words/max frequency of the words in the trigrams.

Individual kids

# t1 word types per child, restricted to words that appear in either group's
# top-word list (t1_words), with the group information joined on.
good_types_t1 <- target_types %>%
  filter(tbin == "t1", gloss %in% t1_words$gloss) %>%
  left_join(mutate(groups_info,
                   target_child_id = as.numeric(target_child_id)))

# Summarise one child's vocabulary against a trigram table.
#
# Args:
#   df:           table of the child's words; must have a `gloss` column.
#   all_trigrams: trigram table with columns `w1`, `w2`, `w3` and `freq`
#                 (data.frame, tibble or data.table).
#   measure:      "num"  -> log of the number of trigrams whose three words
#                           are all in the child's vocabulary;
#                 "freq" -> mean log `freq` of those trigrams.
#   freq:         word-frequency table with columns `Word` and `Lg10WF`.
#                 NOTE(review): `Word` is matched against `gloss` as-is, so
#                 the caller is responsible for consistent casing.
#
# Returns:
#   A list of four elements, in this order: the trigram measure
#   (`log_trigram_num` or `mean_trigram_freq`), `mean_word_freq`,
#   `median_word_freq` and `max_word_freq` of the child's words.
#
# Fixes relative to the previous version:
#   * `max_word_freq` is now the maximum; it used to be computed with
#     median() and therefore duplicated `median_word_freq`.
#   * an unknown `measure` raises an error instead of silently returning NULL.
#   * when no word has a known frequency, `max_word_freq` is NA rather than
#     the -Inf (plus warning) that max(..., na.rm = TRUE) would give.
get_trigrams_by_kid <- function(df, all_trigrams, measure, freq){
  vocab <- df$gloss

  # TRUE for trigrams that can be built entirely from the child's words.
  # %in% never returns NA, so sum() below equals the number of such rows.
  formable <- all_trigrams$w1 %in% vocab &
    all_trigrams$w2 %in% vocab &
    all_trigrams$w3 %in% vocab

  # Left join of the child's words onto the frequency table (words without a
  # frequency entry are kept with NA).
  vocab_freqs <- merge(df, freq, by.x = "gloss", by.y = "Word", all.x = TRUE)
  word_freqs <- vocab_freqs$Lg10WF

  max_freq <- if (all(is.na(word_freqs))) {
    NA_real_
  } else {
    max(word_freqs, na.rm = TRUE)
  }

  trigram_measure <- switch(
    measure,
    num = list(log_trigram_num = log(sum(formable))),
    freq = list(mean_trigram_freq = mean(log(all_trigrams$freq[formable]))),
    stop("`measure` must be \"num\" or \"freq\", not \"", measure, "\"",
         call. = FALSE)
  )

  # Element order matters: callers unlist() this result positionally.
  c(trigram_measure,
    list(mean_word_freq = mean(word_freqs, na.rm = TRUE),
         median_word_freq = median(word_freqs, na.rm = TRUE),
         max_word_freq = max_freq))
}

number of trigrams

# One row per child: log number of formable trigrams plus the word-frequency
# summaries of the child's vocabulary, with the child's group joined on.
#
# The measures are unnested BY NAME from the list that get_trigrams_by_kid()
# returns. The earlier version unlisted the values and paired them
# positionally with a hand-written vector of names, which would silently
# mislabel columns if the order of the returned elements ever changed.
# The resulting columns are the same as before; only their order differs
# (delta_resid_group is now last).
trigram_num_by_kid <- good_types_t1 %>%
  nest(-target_child_id, -tbin) %>%
  mutate(kid_measures = map(data, function(kid_words) {
    as_tibble(get_trigrams_by_kid(kid_words, childes_trigrams, "num", freq))
  })) %>%
  select(-data) %>%
  unnest(kid_measures) %>%
  rename(log_num_trigrams = log_trigram_num) %>% # name used by the plots/models
  left_join(groups_info %>% select(target_child_id, delta_resid_group),
            by = "target_child_id")



# Density of the per-child log trigram counts, one curve per group.
ggplot(
  data = trigram_num_by_kid,
  mapping = aes(
    x = log_num_trigrams,
    fill = delta_resid_group,
    group = delta_resid_group
  )
) +
  geom_density(alpha = 0.4) +
  theme_classic()

# The same variable as a histogram, filled by group.
ggplot(
  data = trigram_num_by_kid,
  mapping = aes(x = log_num_trigrams, fill = delta_resid_group)
) +
  geom_histogram() +
  theme_classic()

# Bootstrapped group means of the log trigram count, with ci_lower/ci_upper
# drawn as line ranges.
ggplot(
  data = trigram_num_by_kid %>%
    group_by(delta_resid_group) %>%
    multi_boot_standard(col = "log_num_trigrams", na.rm = TRUE),
  mapping = aes(
    x = delta_resid_group,
    y = mean,
    group = delta_resid_group,
    fill = delta_resid_group
  )
) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_linerange(
    aes(ymin = ci_lower, ymax = ci_upper),
    position = position_dodge(width = 0.9)
  ) +
  theme_classic()

# Group effect on the log trigram count, controlling for mean word frequency;
# children with a non-finite log count are excluded.
summary(
  lm(
    log_num_trigrams ~ delta_resid_group + mean_word_freq,
    data = trigram_num_by_kid %>% filter(is.finite(log_num_trigrams))
  )
)
## 
## Call:
## lm(formula = log_num_trigrams ~ delta_resid_group + mean_word_freq, 
##     data = trigram_num_by_kid %>% filter(is.finite(log_num_trigrams)))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.5340 -1.7205  0.6778  2.1102  5.7103 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           16.1982     2.6749   6.056 5.34e-08 ***
## delta_resid_grouplow  -2.6669     0.6923  -3.852 0.000247 ***
## mean_word_freq        -2.0165     0.5970  -3.378 0.001168 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.979 on 74 degrees of freedom
##   (3 observations deleted due to missingness)
## Multiple R-squared:  0.3053, Adjusted R-squared:  0.2865 
## F-statistic: 16.26 on 2 and 74 DF,  p-value: 1.403e-06
# Group effect on the log trigram count, controlling for median word
# frequency; children with a non-finite log count are excluded.
summary(
  lm(
    log_num_trigrams ~ delta_resid_group + median_word_freq,
    data = trigram_num_by_kid %>% filter(is.finite(log_num_trigrams))
  )
)
## 
## Call:
## lm(formula = log_num_trigrams ~ delta_resid_group + median_word_freq, 
##     data = trigram_num_by_kid %>% filter(is.finite(log_num_trigrams)))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.5658 -2.0368  0.4806  2.0022  5.5137 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           14.3034     2.2911   6.243 2.44e-08 ***
## delta_resid_grouplow  -2.6411     0.7028  -3.758 0.000339 ***
## median_word_freq      -1.5834     0.5070  -3.123 0.002552 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.009 on 74 degrees of freedom
##   (3 observations deleted due to missingness)
## Multiple R-squared:  0.2916, Adjusted R-squared:  0.2724 
## F-statistic: 15.23 on 2 and 74 DF,  p-value: 2.892e-06

frequency of trigrams

# One row per child: mean log frequency of the formable trigrams plus the
# word-frequency summaries of the child's vocabulary, with the child's group
# joined on.
#
# The measures are unnested BY NAME from the list that get_trigrams_by_kid()
# returns. The earlier version unlisted the values and paired them
# positionally with a hand-written vector of names, which would silently
# mislabel columns if the order of the returned elements ever changed.
# The resulting columns are the same as before; only their order differs
# (delta_resid_group is now last).
trigram_freqs_by_kid <- good_types_t1 %>%
  nest(-target_child_id, -tbin) %>%
  mutate(kid_measures = map(data, function(kid_words) {
    as_tibble(get_trigrams_by_kid(kid_words, childes_trigrams, "freq", freq))
  })) %>%
  select(-data) %>%
  unnest(kid_measures) %>%
  rename(log_mean_trigram_frequency = mean_trigram_freq) %>% # name used by the plots/models
  left_join(groups_info %>% select(target_child_id, delta_resid_group),
            by = "target_child_id")

# Density of the per-child mean log trigram frequency, one curve per group.
ggplot(
  data = trigram_freqs_by_kid,
  mapping = aes(
    x = log_mean_trigram_frequency,
    fill = delta_resid_group,
    group = delta_resid_group
  )
) +
  geom_density(alpha = 0.4) +
  theme_classic()

# Bootstrapped group means of the same measure, with ci_lower/ci_upper drawn
# as line ranges.
ggplot(
  data = trigram_freqs_by_kid %>%
    group_by(delta_resid_group) %>%
    multi_boot_standard(col = "log_mean_trigram_frequency", na.rm = TRUE),
  mapping = aes(
    x = delta_resid_group,
    y = mean,
    group = delta_resid_group,
    fill = delta_resid_group
  )
) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_linerange(
    aes(ymin = ci_lower, ymax = ci_upper),
    position = position_dodge(width = 0.9)
  ) +
  theme_classic()

# Group effect on mean log trigram frequency, controlling for mean word
# frequency.
summary(
  lm(
    log_mean_trigram_frequency ~ delta_resid_group + mean_word_freq,
    data = trigram_freqs_by_kid
  )
)
## 
## Call:
## lm(formula = log_mean_trigram_frequency ~ delta_resid_group + 
##     mean_word_freq, data = trigram_freqs_by_kid)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.3618 -0.4136 -0.1613  0.1153  3.1254 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           -2.4181     0.7778  -3.109  0.00267 ** 
## delta_resid_grouplow   0.5240     0.2013   2.603  0.01116 *  
## mean_word_freq         0.7703     0.1736   4.437 3.12e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8664 on 74 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.3029, Adjusted R-squared:  0.284 
## F-statistic: 16.08 on 2 and 74 DF,  p-value: 1.593e-06
# Group effect on mean log trigram frequency, controlling for median word
# frequency.
summary(
  lm(
    log_mean_trigram_frequency ~ delta_resid_group + median_word_freq,
    data = trigram_freqs_by_kid
  )
)
## 
## Call:
## lm(formula = log_mean_trigram_frequency ~ delta_resid_group + 
##     median_word_freq, data = trigram_freqs_by_kid)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.3375 -0.4262 -0.1738  0.1008  3.1990 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           -1.4487     0.6840  -2.118  0.03752 *  
## delta_resid_grouplow   0.5309     0.2098   2.531  0.01351 *  
## median_word_freq       0.5493     0.1513   3.629  0.00052 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8982 on 74 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.2508, Adjusted R-squared:  0.2305 
## F-statistic: 12.38 on 2 and 74 DF,  p-value: 2.296e-05