library(knitr)

# Global knitr chunk options: echo the code, suppress messages/warnings/errors
# in the rendered output, and disable caching and code tidying.
# Spelled with TRUE/FALSE because T and F are ordinary variables that can be
# reassigned.
opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE,
               error = FALSE, cache = FALSE, tidy = FALSE)

library(tidyverse)
library(langcog)
library(data.table)
library(feather)
library(broom)

# Use the classic ggplot2 theme with a 10pt base font as the session default.
theme_set(theme_classic(base_size = 10))

Params

# Minimum summed count a word type needs in order to be kept in target_types
# (applied below via filter(count >= MINCOUNT)).
MINCOUNT <- 1

Merge data together

Vocab data

# Child-level table; the MTLD columns (mtld_t1, mtld_t2, delta_resid, ...) are
# taken from it further down.
groups_info <- read_csv("../1_mtld_measure/data/groups_info_600_900.csv")

# One row per child x time bin x word type, with the summed count of that type.
target_types <- read_csv("../1_mtld_measure/data/target_types_for_MTLD_kids_600_900.csv") %>%
  # Lowercase BEFORE aggregating so that case variants of the same word collapse
  # into a single type; otherwise they remain separate rows with identical
  # gloss values and are double counted in per-child vocabulary sizes.
  mutate(gloss = tolower(gloss)) %>%
  group_by(target_child_id, tbin, gloss) %>%
  summarize(count = sum(count)) %>%
  ungroup() %>%
  filter(count >= MINCOUNT)

Trigram freq data

# Trigram table (columns w1, w2, w3 are used below), held as a data.table so
# that it can be row-filtered with data.table syntax.
childes_trigrams <- data.table(
  read_csv("data/trigrams/adult_childes_trigrams_turns.csv")
)

# Every word type observed for any child in any time bin.
all_types <- unique(target_types$gloss)

# Trigrams whose three words all belong to that pooled vocabulary.
all_trigrams <- childes_trigrams[
  w1 %in% all_types & w2 %in% all_types & w3 %in% all_types
]

Word frequency data

# Word-frequency norms: keep a lowercased word key and the Lg10WF column only.
freq <- read_tsv("../1_mtld_measure/data/control_variables/SUBTLEXus_corpus.txt") %>%
  transmute(word = tolower(Word), Lg10WF)

Get trigrams at t1 and outcome variables

# By-kid vocabulary functions

# Summarise the trigrams that one child could build from their vocabulary.
#
# df:           data frame with a `gloss` column (the child's word types).
# all_trigrams: data.table with word columns w1, w2, w3 and a `freq` column.
# measure:      "num"  -> log of the number of trigrams whose three words are
#                         all in df$gloss (-Inf when there are none);
#               "freq" -> mean of log(freq) over those trigrams (NaN when
#                         there are none).
# Returns a single number. Any other `measure` is an error: previously it
# returned NULL silently, which downstream unnesting would drop without notice.
get_trigrams_by_kid <- function(df, all_trigrams, measure) {
  if (!is.character(measure) || length(measure) != 1L ||
      !measure %in% c("num", "freq")) {
    stop("`measure` must be either \"num\" or \"freq\"", call. = FALSE)
  }

  # Row filter in data.table syntax: w1/w2/w3 are columns of all_trigrams.
  current_trigrams <- all_trigrams[w1 %in% df$gloss &
                                     w2 %in% df$gloss &
                                     w3 %in% df$gloss]

  if (measure == "num") {
    log(nrow(current_trigrams))
  } else {
    mean(log(current_trigrams$freq))
  }
}

# Mean of the Lg10WF frequency column over one child's vocabulary.
#
# df:   data frame with a `gloss` column (the child's word types).
# freq: data frame with `word` and `Lg10WF` columns.
# Returns a single number; NaN when none of the child's words occur in `freq`.
get_word_freq_by_kid <- function(df, freq) {
  vocab_with_freqs <- left_join(df, freq, by = c("gloss" = "word"))

  # Words absent from `freq` carry NA after the left join. Without na.rm a
  # single unmatched word makes the whole child's mean NA.
  mean(vocab_with_freqs$Lg10WF, na.rm = TRUE)
}

# Per child: log number of trigrams whose three words are all in the child's
# t1 vocabulary.
# The lookup runs against `all_trigrams` (already restricted to the pooled
# vocabulary) rather than the full `childes_trigrams` table. Every child's t1
# vocabulary is a subset of the pooled vocabulary, so the result is the same
# but each per-child scan touches far fewer rows.
trigram_num_by_kid_t1 <- target_types %>%
  filter(tbin == "t1") %>%
  group_by(target_child_id) %>%
  nest(-target_child_id) %>%
  mutate(log_num_trigrams_t1 =
           map_dbl(data, get_trigrams_by_kid, all_trigrams, "num")) %>%
  select(-data) %>%
  # NOTE(review): log(0) = -Inf (no matching trigram) is recoded to 0, which is
  # also the value for exactly one trigram (log(1)); confirm this is intended.
  mutate(log_num_trigrams_t1 = ifelse(!is.finite(log_num_trigrams_t1),
                                      0, log_num_trigrams_t1))

# Per child: mean log frequency of the trigrams whose three words are all in
# the child's t1 vocabulary.
# The lookup runs against `all_trigrams` (already restricted to the pooled
# vocabulary) rather than the full `childes_trigrams` table. Every child's t1
# vocabulary is a subset of the pooled vocabulary, so the result is the same
# but each per-child scan touches far fewer rows.
trigram_freq_by_kid_t1 <- target_types %>%
  filter(tbin == "t1") %>%
  group_by(target_child_id) %>%
  nest(-target_child_id) %>%
  mutate(mean_log_freq_trigrams_t1 =
           map_dbl(data, get_trigrams_by_kid, all_trigrams, "freq")) %>%
  select(-data) %>%
  # A child with no matching trigram yields NaN; such values are recoded to 0.
  mutate(mean_log_freq_trigrams_t1 = ifelse(!is.finite(mean_log_freq_trigrams_t1),
                                            0, mean_log_freq_trigrams_t1))


# Per child: mean word frequency (Lg10WF) of the t1 vocabulary.
word_freq_by_kid_t1 <- target_types %>%
  filter(tbin == "t1") %>%
  group_by(target_child_id) %>%
  nest(-target_child_id) %>%
  mutate(mean_log_word_freq_t1 = map_dbl(data, get_word_freq_by_kid, freq)) %>%
  select(-data) %>%
  # Non-finite means (NA, NaN, Inf) are recoded to 0.
  mutate(mean_log_word_freq_t1 = replace(mean_log_word_freq_t1,
                                         !is.finite(mean_log_word_freq_t1),
                                         0))
# Per child: number of word types at t1 and at t2, and the change between them.
vocab_delta <- target_types %>%
  group_by(target_child_id, tbin) %>%
  summarize(vocab_size = n()) %>%
  spread(tbin, vocab_size) %>%
  # A child with no rows in a time bin gets NA from spread(); count that as 0.
  mutate(vocab_t1 = ifelse(is.na(t1), 0, t1),
         vocab_t2 = ifelse(is.na(t2), 0, t2),
         vocab_delta = vocab_t2 - vocab_t1) %>%
  select(-t1, -t2)

# Per child: MTLD at t1 and t2, their raw difference, and the precomputed
# residualised change (renamed from delta_resid), plus age_diff and slope.
MTLD_delta <- groups_info %>%
  select(target_child_id, mtld_delta_resid = delta_resid,
         age_diff, mtld_t1, mtld_t2, slope) %>%
  mutate(mtld_delta = mtld_t2 - mtld_t1)

Merge everything together.

# Join all per-child tables into one analysis data frame.
# reduce() folds the list with successive full joins and returns only the final
# result. The previous accumulate() %>% last() kept every intermediate join and
# depended on `last`, which data.table masks. The key is named explicitly
# because target_child_id is the only column the tables share.
full_df <- list(trigram_freq_by_kid_t1,
                trigram_num_by_kid_t1,
                MTLD_delta,
                vocab_delta,
                word_freq_by_kid_t1) %>%
  reduce(function(x, y) full_join(x, y, by = "target_child_id")) %>%
  select(target_child_id, vocab_delta, vocab_t1, vocab_t2, mtld_delta, mtld_t1,
         mtld_t2, age_diff, mean_log_word_freq_t1, log_num_trigrams_t1,
         mean_log_freq_trigrams_t1)

Here’s what the dataframe looks like:

# Show the first rows of the merged table.
kable(head(full_df))
target_child_id vocab_delta vocab_t1 vocab_t2 mtld_delta mtld_t1 mtld_t2 age_diff mean_log_word_freq_t1 log_num_trigrams_t1 mean_log_freq_trigrams_t1
2521 -7 72 65 4.8014951 7.180139 11.981635 272.7498 0.000000 7.368970 1.1354135
2567 181 39 220 7.9381308 5.625000 13.563131 260.9373 0.000000 6.343880 0.7074147
2576 39 8 47 4.8604651 3.800000 8.660465 238.4998 3.705312 2.197225 1.4973647
2585 57 29 86 12.4605342 5.868413 18.328947 254.9373 0.000000 2.197225 2.0054280
2588 122 44 166 19.6550160 6.315790 25.970805 265.4998 0.000000 6.519147 0.9654086
2591 45 9 54 -0.1880825 8.704663 8.516581 197.0623 4.580367 4.595120 1.0504714

Regressions

Controlling for the outcome's value at t1 (mtld_t1 or vocab_t1) and age_diff

# MTLD change on log trigram count at t1, controlling for MTLD at t1 and age
# difference. `data` is spelled out instead of relying on partial matching.
lm(mtld_delta ~ log_num_trigrams_t1 + mtld_t1 + age_diff, data = full_df) %>%
  summary()
## 
## Call:
## lm(formula = mtld_delta ~ log_num_trigrams_t1 + mtld_t1 + age_diff, 
##     data = full_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.8742 -3.4935 -0.9669  2.1395 20.9388 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.02194    3.87558   0.264  0.79258    
## log_num_trigrams_t1  0.15176    0.24810   0.612  0.54218    
## mtld_t1             -0.43434    0.05886  -7.379 5.46e-11 ***
## age_diff             0.04271    0.01286   3.322  0.00126 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.229 on 97 degrees of freedom
## Multiple R-squared:  0.4999, Adjusted R-squared:  0.4845 
## F-statistic: 32.32 on 3 and 97 DF,  p-value: 1.432e-14
# MTLD change on mean log trigram frequency at t1, controlling for MTLD at t1
# and age difference. `data` is spelled out instead of relying on partial
# matching.
lm(mtld_delta ~ mean_log_freq_trigrams_t1 + mtld_t1 + age_diff,
   data = full_df) %>%
  summary()
## 
## Call:
## lm(formula = mtld_delta ~ mean_log_freq_trigrams_t1 + mtld_t1 + 
##     age_diff, data = full_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.7058 -3.6371 -0.8528  2.4979 20.4168 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                3.88159    1.85233   2.096   0.0387 *  
## mean_log_freq_trigrams_t1 -2.17167    1.19478  -1.818   0.0722 .  
## mtld_t1                   -0.44145    0.05769  -7.652 1.46e-11 ***
## age_diff                   0.04535    0.01001   4.532 1.67e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.152 on 97 degrees of freedom
## Multiple R-squared:  0.5145, Adjusted R-squared:  0.4995 
## F-statistic: 34.27 on 3 and 97 DF,  p-value: 3.448e-15
# Vocabulary change on log trigram count at t1, controlling for vocabulary size
# at t1 and age difference. `data` is spelled out instead of relying on partial
# matching.
lm(vocab_delta ~ log_num_trigrams_t1 + vocab_t1 + age_diff, data = full_df) %>%
  summary()
## 
## Call:
## lm(formula = vocab_delta ~ log_num_trigrams_t1 + vocab_t1 + age_diff, 
##     data = full_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1038.83  -159.52    -1.74   104.31  1823.92 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)   
## (Intercept)         562.5847   241.8399   2.326  0.02209 * 
## log_num_trigrams_t1 -21.5974    18.5549  -1.164  0.24729   
## vocab_t1              0.3457     0.1110   3.115  0.00242 **
## age_diff             -1.8438     0.7669  -2.404  0.01811 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 310.3 on 97 degrees of freedom
## Multiple R-squared:  0.2169, Adjusted R-squared:  0.1927 
## F-statistic: 8.957 on 3 and 97 DF,  p-value: 2.698e-05
# Vocabulary change on mean log trigram frequency at t1, controlling for
# vocabulary size at t1 and age difference. `data` is spelled out instead of
# relying on partial matching.
lm(vocab_delta ~ mean_log_freq_trigrams_t1 + vocab_t1 + age_diff,
   data = full_df) %>%
  summary()
## 
## Call:
## lm(formula = vocab_delta ~ mean_log_freq_trigrams_t1 + vocab_t1 + 
##     age_diff, data = full_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1014.29  -172.12    -5.08    97.81  1830.42 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)   
## (Intercept)               299.1291   119.2540   2.508  0.01379 * 
## mean_log_freq_trigrams_t1  27.7715    74.7313   0.372  0.71099   
## vocab_t1                    0.2747     0.0910   3.019  0.00324 **
## age_diff                   -1.3435     0.6257  -2.147  0.03428 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 312.3 on 97 degrees of freedom
## Multiple R-squared:  0.2071, Adjusted R-squared:  0.1826 
## F-statistic: 8.447 on 3 and 97 DF,  p-value: 4.831e-05

Controlling for the outcome's value at t1 (mtld_t1 or vocab_t1), age_diff, and mean_log_word_freq_t1

# MTLD change on log trigram count at t1, controlling for MTLD at t1, age
# difference and mean word frequency at t1. `data` is spelled out instead of
# relying on partial matching.
lm(mtld_delta ~ log_num_trigrams_t1 + mtld_t1 + age_diff +
     mean_log_word_freq_t1,
   data = full_df) %>%
  summary()
## 
## Call:
## lm(formula = mtld_delta ~ log_num_trigrams_t1 + mtld_t1 + age_diff + 
##     mean_log_word_freq_t1, data = full_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.8705 -3.4760 -0.9571  2.1221 20.9767 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            0.92764    4.03275   0.230  0.81856    
## log_num_trigrams_t1    0.16018    0.26620   0.602  0.54878    
## mtld_t1               -0.43421    0.05918  -7.338 6.97e-11 ***
## age_diff               0.04278    0.01294   3.305  0.00134 ** 
## mean_log_word_freq_t1  0.05355    0.59230   0.090  0.92815    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.256 on 96 degrees of freedom
## Multiple R-squared:    0.5,  Adjusted R-squared:  0.4791 
## F-statistic:    24 on 4 and 96 DF,  p-value: 8.914e-14
# MTLD change on mean log trigram frequency at t1, controlling for MTLD at t1,
# age difference and mean word frequency at t1. `data` is spelled out instead
# of relying on partial matching.
lm(mtld_delta ~ mean_log_freq_trigrams_t1 + mtld_t1 + age_diff +
     mean_log_word_freq_t1,
   data = full_df) %>%
  summary()
## 
## Call:
## lm(formula = mtld_delta ~ mean_log_freq_trigrams_t1 + mtld_t1 + 
##     age_diff + mean_log_word_freq_t1, data = full_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.3902 -3.5134 -0.8168  2.4588 20.5847 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                4.13594    1.90421   2.172   0.0323 *  
## mean_log_freq_trigrams_t1 -2.47681    1.29815  -1.908   0.0594 .  
## mtld_t1                   -0.44029    0.05791  -7.603 1.95e-11 ***
## age_diff                   0.04488    0.01007   4.456 2.26e-05 ***
## mean_log_word_freq_t1      0.36178    0.59096   0.612   0.5419    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.169 on 96 degrees of freedom
## Multiple R-squared:  0.5164, Adjusted R-squared:  0.4963 
## F-statistic: 25.63 on 4 and 96 DF,  p-value: 1.846e-14
# Vocabulary change on log trigram count at t1, controlling for vocabulary size
# at t1, age difference and mean word frequency at t1. `data` is spelled out
# instead of relying on partial matching.
lm(vocab_delta ~ log_num_trigrams_t1 + vocab_t1 + age_diff +
     mean_log_word_freq_t1,
   data = full_df) %>%
  summary()
## 
## Call:
## lm(formula = vocab_delta ~ log_num_trigrams_t1 + vocab_t1 + age_diff + 
##     mean_log_word_freq_t1, data = full_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1039.11  -160.09    -2.16   102.88  1824.08 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)   
## (Intercept)           569.6033   257.0914   2.216  0.02908 * 
## log_num_trigrams_t1   -22.3279    20.5844  -1.085  0.28077   
## vocab_t1                0.3481     0.1151   3.023  0.00321 **
## age_diff               -1.8497     0.7741  -2.389  0.01883 * 
## mean_log_word_freq_t1  -3.0410    36.2620  -0.084  0.93334   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 311.9 on 96 degrees of freedom
## Multiple R-squared:  0.217,  Adjusted R-squared:  0.1844 
## F-statistic: 6.651 on 4 and 96 DF,  p-value: 9.083e-05
# Vocabulary change on mean log trigram frequency at t1, controlling for
# vocabulary size at t1, age difference and mean word frequency at t1. `data`
# is spelled out instead of relying on partial matching.
lm(vocab_delta ~ mean_log_freq_trigrams_t1 + vocab_t1 + age_diff +
     mean_log_word_freq_t1,
   data = full_df) %>%
  summary()
## 
## Call:
## lm(formula = vocab_delta ~ mean_log_freq_trigrams_t1 + vocab_t1 + 
##     age_diff + mean_log_word_freq_t1, data = full_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1015.31  -170.34   -11.96   101.15  1829.38 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)   
## (Intercept)               308.70609  124.30668   2.483  0.01475 * 
## mean_log_freq_trigrams_t1  18.46735   81.67945   0.226  0.82161   
## vocab_t1                    0.27274    0.09168   2.975  0.00371 **
## age_diff                   -1.36264    0.63217  -2.155  0.03362 * 
## mean_log_word_freq_t1      10.40412   35.94895   0.289  0.77289   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 313.7 on 96 degrees of freedom
## Multiple R-squared:  0.2078, Adjusted R-squared:  0.1748 
## F-statistic: 6.296 on 4 and 96 DF,  p-value: 0.0001528