# Setup: load packages, set knitr chunk options and the default ggplot theme.
library(knitr)
opts_chunk$set(
  echo = TRUE, message = FALSE, warning = FALSE,
  error = FALSE, cache = FALSE, tidy = FALSE
)
library(tidyverse)
library(langcog)
library(data.table)
library(feather)
library(broom)
theme_set(theme_classic(base_size = 10))

# Minimum summed `count` a word type needs to be kept (see the filter on
# target_types below).
MINCOUNT <- 1

# Vocab data
# Per-child information table; the MTLD outcome columns are selected from it
# further down.
groups_info <- read_csv("../1_mtld_measure/data/groups_info_600_900.csv")

# Word types per child and time bin: counts are summed within
# (target_child_id, tbin, gloss), types with a summed count below MINCOUNT are
# dropped, and glosses are lower-cased.
# NOTE(review): lower-casing happens after the aggregation, so glosses that
# differ only in case remain on separate rows -- confirm the input is already
# case-normalized.
target_types <- read_csv(
  "../1_mtld_measure/data/target_types_for_MTLD_kids_600_900.csv"
) %>%
  group_by(target_child_id, tbin, gloss) %>%
  summarize(count = sum(count)) %>%
  ungroup() %>%
  filter(count >= MINCOUNT) %>%
  mutate(gloss = tolower(gloss))

# Trigram freq data
# Trigram table read from CSV and converted to a data.table so rows can be
# filtered by column name inside `[`.
childes_trigrams <- read_csv(
  "data/trigrams/adult_childes_trigrams_turns.csv"
) %>%
  data.table()

# Every distinct gloss present in target_types.
all_types <- unique(target_types$gloss)

# Trigrams whose three words (w1, w2, w3) all appear in all_types.
all_trigrams <- childes_trigrams[
  w1 %in% all_types &
    w2 %in% all_types &
    w3 %in% all_types
]

# Word frequency data
# Word-frequency table: keeps the lower-cased word and its Lg10WF value.
freq <- read_tsv(
  "../1_mtld_measure/data/control_variables/SUBTLEXus_corpus.txt"
) %>%
  mutate(word = tolower(Word)) %>%
  select(word, Lg10WF)

# Get trigrams at t1 and outcome variables
# By-kid vocabulary functions
# Summarise the trigrams that can be formed from one child's vocabulary.
#
# Keeps the rows of `all_trigrams` whose three words (w1, w2, w3) all occur in
# `df$gloss` and returns a single number describing them.
#
# Args:
#   df:           data frame with a `gloss` column.
#   all_trigrams: table with columns w1, w2, w3 and freq (data.table, tibble
#                 or data.frame).
#   measure:      "num"  -> log of the number of matching trigrams;
#                 "freq" -> mean of the log of their `freq` values.
#
# Returns:
#   A single numeric value. With no matching trigrams this is -Inf for "num"
#   and NaN for "freq". Any other `measure` is an error.
get_trigrams_by_kid <- function(df, all_trigrams, measure) {
  vocab <- df$gloss

  # A precomputed logical row mask behaves the same for data.table, tibble and
  # data.frame inputs (a lone symbol in `i` is looked up in the calling scope
  # by data.table).
  in_vocab <- all_trigrams$w1 %in% vocab &
    all_trigrams$w2 %in% vocab &
    all_trigrams$w3 %in% vocab
  current_trigrams <- all_trigrams[in_vocab, ]

  switch(measure,
    num = log(nrow(current_trigrams)),
    freq = mean(log(current_trigrams$freq)),
    # Fail loudly instead of silently returning NULL for a mistyped measure.
    stop("`measure` must be \"num\" or \"freq\"", call. = FALSE)
  )
}
# Mean Lg10WF over one child's vocabulary.
#
# Args:
#   df:   data frame with a `gloss` column.
#   freq: data frame with columns `word` and `Lg10WF`.
#
# Returns:
#   The mean Lg10WF of the glosses that have a match in `freq`; NaN when none
#   of them match.
get_word_freq_by_kid <- function(df, freq) {
  vocab_with_freqs <- left_join(df, freq, by = c("gloss" = "word"))
  # Glosses absent from `freq` get NA after the join; without na.rm a single
  # unmatched word would turn the whole mean into NA.
  mean(vocab_with_freqs$Lg10WF, na.rm = TRUE)
}
# Log number of trigrams per child at t1: the t1 rows of target_types are
# nested per target_child_id and get_trigrams_by_kid(..., "num") is applied to
# each child's rows.
# NOTE(review): the full childes_trigrams table is passed here, not the
# pre-filtered all_trigrams defined above.
trigram_num_by_kid_t1 <- target_types %>%
filter(tbin == "t1") %>%
group_by(target_child_id) %>%
nest(-target_child_id) %>%
mutate(log_num_trigrams_t1 =
map(data, get_trigrams_by_kid,
childes_trigrams, "num")) %>%
select(-data) %>%
unnest() %>%
# Non-finite values (e.g. log(0) = -Inf when no trigram matches) are recoded to 0.
mutate(log_num_trigrams_t1 = ifelse(!is.finite(log_num_trigrams_t1),
0, log_num_trigrams_t1))
# Mean log trigram frequency per child at t1: the t1 rows of target_types are
# nested per target_child_id and get_trigrams_by_kid(..., "freq") is applied
# to each child's rows.
# NOTE(review): the full childes_trigrams table is passed here, not the
# pre-filtered all_trigrams defined above.
trigram_freq_by_kid_t1 <- target_types %>%
filter(tbin == "t1") %>%
group_by(target_child_id) %>%
nest(-target_child_id) %>%
mutate(mean_log_freq_trigrams_t1 =
map(data, get_trigrams_by_kid,
childes_trigrams, "freq")) %>%
select(-data) %>%
unnest() %>%
# Non-finite values (e.g. NaN from the mean of zero matching trigrams) are recoded to 0.
mutate(mean_log_freq_trigrams_t1 = ifelse(!is.finite(mean_log_freq_trigrams_t1),
0, mean_log_freq_trigrams_t1))
# Mean log word frequency per child at t1: the t1 rows of target_types are
# nested per target_child_id and get_word_freq_by_kid() is applied to each
# child's rows together with the `freq` table.
word_freq_by_kid_t1 <- target_types %>%
filter(tbin == "t1") %>%
group_by(target_child_id) %>%
nest(-target_child_id) %>%
mutate(mean_log_word_freq_t1 =
map(data, get_word_freq_by_kid, freq)) %>%
select(-data) %>%
unnest() %>%
# Non-finite results (NA, NaN, +/-Inf) are recoded to 0.
mutate(mean_log_word_freq_t1 = ifelse(!is.finite(mean_log_word_freq_t1),
0, mean_log_word_freq_t1))
# Vocabulary size per child in each time bin (number of rows in target_types),
# spread to one column per bin. A missing bin counts as 0, and vocab_delta is
# the t2 size minus the t1 size.
vocab_delta <- target_types %>%
  group_by(target_child_id, tbin) %>%
  summarize(vocab_size = n()) %>%
  spread("tbin", "vocab_size") %>%
  mutate(
    vocab_t1 = ifelse(is.na(t1), 0, t1),
    vocab_t2 = ifelse(is.na(t2), 0, t2),
    vocab_delta = vocab_t2 - vocab_t1
  ) %>%
  # Drop the raw bin columns; their NA-filled copies carry the final names.
  select(-t1, -t2)
# Per-child MTLD outcomes taken from groups_info; mtld_delta is mtld_t2 minus
# mtld_t1, and delta_resid is renamed to mtld_delta_resid.
MTLD_delta <- groups_info %>%
  select(target_child_id, delta_resid, age_diff, mtld_t1, mtld_t2, slope) %>%
  mutate(mtld_delta = mtld_t2 - mtld_t1) %>%
  rename(mtld_delta_resid = delta_resid)

# Merge everything together.
# One row per child: full-join all per-child tables on target_child_id, then
# keep the outcome and predictor columns used by the models below.
# reduce() folds the list directly, avoiding accumulate() %>% last(), where
# `last` is exported by both dplyr and data.table.
full_df <- list(
  trigram_freq_by_kid_t1,
  trigram_num_by_kid_t1,
  MTLD_delta,
  vocab_delta,
  word_freq_by_kid_t1
) %>%
  reduce(full_join, by = "target_child_id") %>%
  select(
    target_child_id, vocab_delta, vocab_t1, vocab_t2, mtld_delta, mtld_t1,
    mtld_t2, age_diff, mean_log_word_freq_t1, log_num_trigrams_t1,
    mean_log_freq_trigrams_t1
  )

# Here’s what the dataframe looks like:
# Show the first rows of full_df as a table.
full_df %>%
  head() %>%
  kable()

| target_child_id | vocab_delta | vocab_t1 | vocab_t2 | mtld_delta | mtld_t1 | mtld_t2 | age_diff | mean_log_word_freq_t1 | log_num_trigrams_t1 | mean_log_freq_trigrams_t1 |
|---|---|---|---|---|---|---|---|---|---|---|
| 2521 | -7 | 72 | 65 | 4.8014951 | 7.180139 | 11.981635 | 272.7498 | 0.000000 | 7.368970 | 1.1354135 |
| 2567 | 181 | 39 | 220 | 7.9381308 | 5.625000 | 13.563131 | 260.9373 | 0.000000 | 6.343880 | 0.7074147 |
| 2576 | 39 | 8 | 47 | 4.8604651 | 3.800000 | 8.660465 | 238.4998 | 3.705312 | 2.197225 | 1.4973647 |
| 2585 | 57 | 29 | 86 | 12.4605342 | 5.868413 | 18.328947 | 254.9373 | 0.000000 | 2.197225 | 2.0054280 |
| 2588 | 122 | 44 | 166 | 19.6550160 | 6.315790 | 25.970805 | 265.4998 | 0.000000 | 6.519147 | 0.9654086 |
| 2591 | 45 | 9 | 54 | -0.1880825 | 8.704663 | 8.516581 | 197.0623 | 4.580367 | 4.595120 | 1.0504714 |
# mtld_delta regressed on log_num_trigrams_t1, controlling for mtld_t1 and
# age_diff.
lm(mtld_delta ~ log_num_trigrams_t1 + mtld_t1 + age_diff, data = full_df) %>%
  summary()
##
## Call:
## lm(formula = mtld_delta ~ log_num_trigrams_t1 + mtld_t1 + age_diff,
## data = full_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.8742 -3.4935 -0.9669 2.1395 20.9388
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.02194 3.87558 0.264 0.79258
## log_num_trigrams_t1 0.15176 0.24810 0.612 0.54218
## mtld_t1 -0.43434 0.05886 -7.379 5.46e-11 ***
## age_diff 0.04271 0.01286 3.322 0.00126 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.229 on 97 degrees of freedom
## Multiple R-squared: 0.4999, Adjusted R-squared: 0.4845
## F-statistic: 32.32 on 3 and 97 DF, p-value: 1.432e-14
# mtld_delta regressed on mean_log_freq_trigrams_t1, controlling for mtld_t1
# and age_diff.
lm(
  mtld_delta ~ mean_log_freq_trigrams_t1 + mtld_t1 + age_diff,
  data = full_df
) %>%
  summary()
##
## Call:
## lm(formula = mtld_delta ~ mean_log_freq_trigrams_t1 + mtld_t1 +
## age_diff, data = full_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.7058 -3.6371 -0.8528 2.4979 20.4168
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.88159 1.85233 2.096 0.0387 *
## mean_log_freq_trigrams_t1 -2.17167 1.19478 -1.818 0.0722 .
## mtld_t1 -0.44145 0.05769 -7.652 1.46e-11 ***
## age_diff 0.04535 0.01001 4.532 1.67e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.152 on 97 degrees of freedom
## Multiple R-squared: 0.5145, Adjusted R-squared: 0.4995
## F-statistic: 34.27 on 3 and 97 DF, p-value: 3.448e-15
# vocab_delta regressed on log_num_trigrams_t1, controlling for vocab_t1 and
# age_diff.
lm(vocab_delta ~ log_num_trigrams_t1 + vocab_t1 + age_diff, data = full_df) %>%
  summary()
##
## Call:
## lm(formula = vocab_delta ~ log_num_trigrams_t1 + vocab_t1 + age_diff,
## data = full_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1038.83 -159.52 -1.74 104.31 1823.92
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 562.5847 241.8399 2.326 0.02209 *
## log_num_trigrams_t1 -21.5974 18.5549 -1.164 0.24729
## vocab_t1 0.3457 0.1110 3.115 0.00242 **
## age_diff -1.8438 0.7669 -2.404 0.01811 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 310.3 on 97 degrees of freedom
## Multiple R-squared: 0.2169, Adjusted R-squared: 0.1927
## F-statistic: 8.957 on 3 and 97 DF, p-value: 2.698e-05
# vocab_delta regressed on mean_log_freq_trigrams_t1, controlling for vocab_t1
# and age_diff.
lm(
  vocab_delta ~ mean_log_freq_trigrams_t1 + vocab_t1 + age_diff,
  data = full_df
) %>%
  summary()
##
## Call:
## lm(formula = vocab_delta ~ mean_log_freq_trigrams_t1 + vocab_t1 +
## age_diff, data = full_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1014.29 -172.12 -5.08 97.81 1830.42
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 299.1291 119.2540 2.508 0.01379 *
## mean_log_freq_trigrams_t1 27.7715 74.7313 0.372 0.71099
## vocab_t1 0.2747 0.0910 3.019 0.00324 **
## age_diff -1.3435 0.6257 -2.147 0.03428 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 312.3 on 97 degrees of freedom
## Multiple R-squared: 0.2071, Adjusted R-squared: 0.1826
## F-statistic: 8.447 on 3 and 97 DF, p-value: 4.831e-05
# mtld_delta regressed on log_num_trigrams_t1, controlling for mtld_t1,
# age_diff and mean_log_word_freq_t1.
lm(
  mtld_delta ~ log_num_trigrams_t1 + mtld_t1 + age_diff +
    mean_log_word_freq_t1,
  data = full_df
) %>%
  summary()
##
## Call:
## lm(formula = mtld_delta ~ log_num_trigrams_t1 + mtld_t1 + age_diff +
## mean_log_word_freq_t1, data = full_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.8705 -3.4760 -0.9571 2.1221 20.9767
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.92764 4.03275 0.230 0.81856
## log_num_trigrams_t1 0.16018 0.26620 0.602 0.54878
## mtld_t1 -0.43421 0.05918 -7.338 6.97e-11 ***
## age_diff 0.04278 0.01294 3.305 0.00134 **
## mean_log_word_freq_t1 0.05355 0.59230 0.090 0.92815
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.256 on 96 degrees of freedom
## Multiple R-squared: 0.5, Adjusted R-squared: 0.4791
## F-statistic: 24 on 4 and 96 DF, p-value: 8.914e-14
# mtld_delta regressed on mean_log_freq_trigrams_t1, controlling for mtld_t1,
# age_diff and mean_log_word_freq_t1.
lm(
  mtld_delta ~ mean_log_freq_trigrams_t1 + mtld_t1 + age_diff +
    mean_log_word_freq_t1,
  data = full_df
) %>%
  summary()
##
## Call:
## lm(formula = mtld_delta ~ mean_log_freq_trigrams_t1 + mtld_t1 +
## age_diff + mean_log_word_freq_t1, data = full_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.3902 -3.5134 -0.8168 2.4588 20.5847
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.13594 1.90421 2.172 0.0323 *
## mean_log_freq_trigrams_t1 -2.47681 1.29815 -1.908 0.0594 .
## mtld_t1 -0.44029 0.05791 -7.603 1.95e-11 ***
## age_diff 0.04488 0.01007 4.456 2.26e-05 ***
## mean_log_word_freq_t1 0.36178 0.59096 0.612 0.5419
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.169 on 96 degrees of freedom
## Multiple R-squared: 0.5164, Adjusted R-squared: 0.4963
## F-statistic: 25.63 on 4 and 96 DF, p-value: 1.846e-14
# vocab_delta regressed on log_num_trigrams_t1, controlling for vocab_t1,
# age_diff and mean_log_word_freq_t1.
lm(
  vocab_delta ~ log_num_trigrams_t1 + vocab_t1 + age_diff +
    mean_log_word_freq_t1,
  data = full_df
) %>%
  summary()
##
## Call:
## lm(formula = vocab_delta ~ log_num_trigrams_t1 + vocab_t1 + age_diff +
## mean_log_word_freq_t1, data = full_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1039.11 -160.09 -2.16 102.88 1824.08
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 569.6033 257.0914 2.216 0.02908 *
## log_num_trigrams_t1 -22.3279 20.5844 -1.085 0.28077
## vocab_t1 0.3481 0.1151 3.023 0.00321 **
## age_diff -1.8497 0.7741 -2.389 0.01883 *
## mean_log_word_freq_t1 -3.0410 36.2620 -0.084 0.93334
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 311.9 on 96 degrees of freedom
## Multiple R-squared: 0.217, Adjusted R-squared: 0.1844
## F-statistic: 6.651 on 4 and 96 DF, p-value: 9.083e-05
# vocab_delta regressed on mean_log_freq_trigrams_t1, controlling for vocab_t1,
# age_diff and mean_log_word_freq_t1.
lm(
  vocab_delta ~ mean_log_freq_trigrams_t1 + vocab_t1 + age_diff +
    mean_log_word_freq_t1,
  data = full_df
) %>%
  summary()
##
## Call:
## lm(formula = vocab_delta ~ mean_log_freq_trigrams_t1 + vocab_t1 +
## age_diff + mean_log_word_freq_t1, data = full_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1015.31 -170.34 -11.96 101.15 1829.38
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 308.70609 124.30668 2.483 0.01475 *
## mean_log_freq_trigrams_t1 18.46735 81.67945 0.226 0.82161
## vocab_t1 0.27274 0.09168 2.975 0.00371 **
## age_diff -1.36264 0.63217 -2.155 0.03362 *
## mean_log_word_freq_t1 10.40412 35.94895 0.289 0.77289
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 313.7 on 96 degrees of freedom
## Multiple R-squared: 0.2078, Adjusted R-squared: 0.1748
## F-statistic: 6.296 on 4 and 96 DF, p-value: 0.0001528