library(knitr)
# Global knitr chunk options: show the code, but suppress messages, warnings
# and errors in the rendered output; no caching and no automatic tidying.
# Spelled with TRUE/FALSE because T and F are ordinary, reassignable variables.
opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE,
               error = FALSE, cache = FALSE, tidy = FALSE)
library(tidyverse)
library(langcog)
library(data.table)
library(feather)
theme_set(theme_classic(base_size = 10))

# Threshold compared against each word's summed `count` when `types_clean`
# is built (rows with count >= MINWORDSFORVOCAB are kept).
# The min words for vocab here is 1.
MINWORDSFORVOCAB <- 1

# Read in data
# Project-relative inputs produced by earlier analysis steps.
all_types <- read_csv("../1_mtld_measure/data/target_types_for_MTLD_kids_600_900.csv")
groups_info <- read_csv("../1_mtld_measure/data/groups_info_600_900_corrected.csv")
trigrams <- read_csv("../2_trigrams/mtld_continuous_trigram_by_kid_MIN1.csv")

# Word-frequency norms; only the renamed `word` and `log_freq` columns are
# used downstream.
# NOTE(review): this is an absolute, machine-specific path -- the script will
# not run elsewhere until the file is moved under the project directory.
freq <- read_tsv("/Users/mollylewis/Documents/research/Projects/ref_complex/Papers/RC_old/analysis/data/corpus/SUBTLEXus_corpus.txt") %>%
  rename(word = Word,
         log_freq = Lg10WF)

# Semantic-density norms, fetched as text over HTTP and parsed as CSV.
# Columns are renamed to syntactic names and only `word` through
# `semantic_density` are kept.
# NOTE(review): the URL embeds a `token=` query parameter -- confirm it is not
# a live credential and that the link still resolves.
density_norms <- read_csv(RCurl::getURL("https://raw.githubusercontent.com/billdthompson/semantic-density-norms/master/results/en-semantic-densities-N100000.csv?token=AF32iZ4ROE3EvwU8sZ5PVztiNF7PyLaRks5bBF6awA%3D%3D")) %>%
  rename(semantic_density = `semantic-density`,
         neighb_count = `neighbour-count`,
         neighb_conc = `neighbour-concentration`,
         centrality = `global-centrality`) %>%
  select(word:semantic_density)

# Get filtered version of types for each kid
# Per-child word list at t1: keep the "t1" rows, lower-case the gloss, sum
# `count` within each child x word, and drop words whose summed count is
# below MINWORDSFORVOCAB.
types_clean <- all_types %>%
  filter(tbin == "t1") %>%
  mutate(gloss_clean = tolower(gloss)) %>%
  group_by(target_child_id, gloss_clean) %>%
  summarize(count = sum(count)) %>%
  filter(count >= MINWORDSFORVOCAB)

# Get mean density at t1
#' Summarise the density-norm and frequency profile of one child's words.
#'
#' @param id Identifier of the child; copied into the `target_child_id`
#'   column of the result.
#' @param data Data frame of this child's words; must have a `gloss_clean`
#'   column. Its row count is reported as `total_words_t1`.
#' @param density_norms Data frame with a `word` column; every other column
#'   is averaged over the rows whose `word` occurs in `data$gloss_clean`.
#' @param freq_norms Data frame with `word` and `log_freq` columns.
#' @return A data frame (one row for ungrouped `density_norms`) holding
#'   `target_child_id`, the column means of the matched norm rows,
#'   `words_in_norms_t1` (number of matched norm rows), `total_words_t1`, and
#'   `mean_log_word_freq_t1` (mean `log_freq` over the child's words, ignoring
#'   words that have no frequency entry).
get_density_by_kid <- function(id, data, density_norms, freq_norms) {
  n_rows <- nrow(data)

  # Words missing from the frequency norms join as NA and are skipped by
  # na.rm = TRUE.
  this_kids_freq <- data %>%
    left_join(freq_norms, by = c("gloss_clean" = "word")) %>%
    summarize(mean_log_freq = mean(log_freq, na.rm = TRUE))

  # Norm rows for the words this child produced; `word` is dropped so that
  # only the measure columns are averaged below.
  this_kids_model <- density_norms %>%
    filter(word %in% data$gloss_clean) %>%
    select(-word)

  this_kids_model %>%
    summarize_all(mean) %>%
    mutate(target_child_id = id,
           words_in_norms_t1 = nrow(this_kids_model),
           total_words_t1 = n_rows,
           mean_log_word_freq_t1 = this_kids_freq$mean_log_freq) %>%
    select(target_child_id, everything())
}
# One row per child, with that child's word rows packed into a list-column.
nested_data_by_kid <- nest(types_clean, -target_child_id)

# Apply get_density_by_kid() to each (child id, word table) pair and row-bind
# the per-child results; the two norm tables are passed through unchanged.
vocab_measures <- map2_df(nested_data_by_kid$target_child_id,
                          nested_data_by_kid$data,
                          get_density_by_kid,
                          density_norms,
                          freq)

# Merge in other variables
# Combine the per-child vocabulary measures with the group/MTLD/age variables
# and the trigram measures, add log-transformed predictors, then z-score every
# numeric column. Both joins name the child id as the key explicitly instead
# of relying on natural-join column matching.
# NOTE(review): mutate_if(is.numeric, scale) also rescales target_child_id if
# it is stored as a number -- confirm nothing downstream needs the raw id.
vocab_df <- vocab_measures %>%
  left_join(groups_info %>%
              select(delta_resid_group, target_child_id, mtld_t1,
                     mtld_t2, age_t1, age_t2, mtld_diff, age_diff),
            by = "target_child_id") %>%
  mutate(log_mtld_t2 = log(mtld_t2 + 1),
         log_mtld_t1 = log(mtld_t1 + 1),
         log_total_words_t1 = log(total_words_t1),
         log_word_in_norms_t1 = log(words_in_norms_t1),
         log_semantic_density = log(semantic_density),
         log_centrality = log(centrality)) %>%
  left_join(trigrams %>%
              select(target_child_id, log_num_trigrams_t1, log_num_trigrams_t2,
                     mean_log_freq_trigrams_t1, mean_log_freq_trigrams_t2),
            by = "target_child_id") %>%
  mutate_if(is.numeric, scale) # scale everything for regressions

# Regress log_mtld_t2 on log_semantic_density plus age_t1, age_t2,
# log_mtld_t1 and log_word_in_norms_t1.
lm(log_mtld_t2 ~ log_semantic_density + age_t1 + age_t2 + log_mtld_t1 + log_word_in_norms_t1,
   data = vocab_df) %>%
  summary()
##
## Call:
## lm(formula = log_mtld_t2 ~ log_semantic_density + age_t1 + age_t2 +
## log_mtld_t1 + log_word_in_norms_t1, data = vocab_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.42334 -0.47777 0.00001 0.37107 2.43015
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.539e-16 7.220e-02 0.000 1.000000
## log_semantic_density -1.396e-01 7.430e-02 -1.878 0.063415 .
## age_t1 -4.388e-01 1.157e-01 -3.794 0.000261 ***
## age_t2 1.048e-01 9.664e-02 1.085 0.280876
## log_mtld_t1 7.674e-01 8.729e-02 8.792 6.3e-14 ***
## log_word_in_norms_t1 8.785e-02 1.144e-01 0.768 0.444627
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7256 on 95 degrees of freedom
## Multiple R-squared: 0.4998, Adjusted R-squared: 0.4735
## F-statistic: 18.98 on 5 and 95 DF, p-value: 4.783e-13
# Regress log_mtld_t2 on log_centrality plus age_t1, age_t2, log_mtld_t1 and
# log_word_in_norms_t1, and print the fitted model's summary.
summary(
  lm(log_mtld_t2 ~ log_centrality + age_t1 + age_t2 + log_mtld_t1 +
       log_word_in_norms_t1,
     data = vocab_df)
)
##
## Call:
## lm(formula = log_mtld_t2 ~ log_centrality + age_t1 + age_t2 +
## log_mtld_t1 + log_word_in_norms_t1, data = vocab_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.50395 -0.49163 -0.00277 0.40475 2.43579
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.489e-16 7.238e-02 0.000 1.000000
## log_centrality -1.295e-01 7.423e-02 -1.745 0.084286 .
## age_t1 -4.531e-01 1.155e-01 -3.923 0.000165 ***
## age_t2 1.075e-01 9.694e-02 1.109 0.270233
## log_mtld_t1 7.771e-01 8.845e-02 8.786 6.5e-14 ***
## log_word_in_norms_t1 7.000e-02 1.153e-01 0.607 0.545052
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7274 on 95 degrees of freedom
## Multiple R-squared: 0.4973, Adjusted R-squared: 0.4709
## F-statistic: 18.8 on 5 and 95 DF, p-value: 6.001e-13
# Regress mtld_diff on log_semantic_density plus age_t1, age_t2, log_mtld_t1
# and log_word_in_norms_t1, and print the fitted model's summary.
summary(
  lm(mtld_diff ~ log_semantic_density + age_t1 + age_t2 + log_mtld_t1 +
       log_word_in_norms_t1,
     data = vocab_df)
)
##
## Call:
## lm(formula = mtld_diff ~ log_semantic_density + age_t1 + age_t2 +
## log_mtld_t1 + log_word_in_norms_t1, data = vocab_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.6087 -0.4461 -0.1579 0.3458 2.9015
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.056e-16 7.742e-02 0.000 1.00000
## log_semantic_density -1.072e-01 7.967e-02 -1.346 0.18151
## age_t1 -3.851e-01 1.240e-01 -3.105 0.00251 **
## age_t2 8.649e-03 1.036e-01 0.083 0.93366
## log_mtld_t1 -4.845e-01 9.359e-02 -5.177 1.26e-06 ***
## log_word_in_norms_t1 2.016e-01 1.227e-01 1.643 0.10375
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7781 on 95 degrees of freedom
## Multiple R-squared: 0.4249, Adjusted R-squared: 0.3946
## F-statistic: 14.04 on 5 and 95 DF, p-value: 2.862e-10
# Regress mtld_diff on log_centrality plus age_t1, age_t2, log_mtld_t1 and
# log_word_in_norms_t1, and print the fitted model's summary.
summary(
  lm(mtld_diff ~ log_centrality + age_t1 + age_t2 + log_mtld_t1 +
       log_word_in_norms_t1,
     data = vocab_df)
)
##
## Call:
## lm(formula = mtld_diff ~ log_centrality + age_t1 + age_t2 + log_mtld_t1 +
## log_word_in_norms_t1, data = vocab_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.5846 -0.4209 -0.1351 0.3698 2.9262
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.891e-16 7.794e-02 0.000 1.00000
## log_centrality -5.778e-02 7.993e-02 -0.723 0.47151
## age_t1 -3.980e-01 1.244e-01 -3.200 0.00187 **
## age_t2 8.091e-03 1.044e-01 0.078 0.93839
## log_mtld_t1 -4.863e-01 9.524e-02 -5.106 1.69e-06 ***
## log_word_in_norms_t1 1.940e-01 1.241e-01 1.564 0.12126
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7833 on 95 degrees of freedom
## Multiple R-squared: 0.4171, Adjusted R-squared: 0.3865
## F-statistic: 13.6 on 5 and 95 DF, p-value: 5.269e-10
# Regress log_num_trigrams_t2 on log_semantic_density plus age_t1, age_t2,
# log_word_in_norms_t1, log_num_trigrams_t1 and mean_log_word_freq_t1, and
# print the fitted model's summary.
summary(
  lm(log_num_trigrams_t2 ~ log_semantic_density + age_t1 + age_t2 +
       log_word_in_norms_t1 + log_num_trigrams_t1 + mean_log_word_freq_t1,
     data = vocab_df)
)
##
## Call:
## lm(formula = log_num_trigrams_t2 ~ log_semantic_density + age_t1 +
## age_t2 + log_word_in_norms_t1 + log_num_trigrams_t1 + mean_log_word_freq_t1,
## data = vocab_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.08120 -0.24573 -0.03159 0.25992 1.17499
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.403e-16 4.540e-02 0.000 1.00000
## log_semantic_density 1.211e-02 4.642e-02 0.261 0.79482
## age_t1 -2.124e-01 7.265e-02 -2.923 0.00434 **
## age_t2 1.232e-01 6.073e-02 2.028 0.04537 *
## log_word_in_norms_t1 -3.436e-01 3.305e-01 -1.040 0.30110
## log_num_trigrams_t1 1.129e+00 2.842e-01 3.971 0.00014 ***
## mean_log_word_freq_t1 -4.898e-01 9.428e-02 -5.195 1.18e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4563 on 94 degrees of freedom
## Multiple R-squared: 0.8043, Adjusted R-squared: 0.7918
## F-statistic: 64.38 on 6 and 94 DF, p-value: < 2.2e-16
# Regress log_num_trigrams_t2 on log_centrality plus age_t1, age_t2,
# log_word_in_norms_t1, log_num_trigrams_t1 and mean_log_word_freq_t1, and
# print the fitted model's summary.
summary(
  lm(log_num_trigrams_t2 ~ log_centrality + age_t1 + age_t2 +
       log_word_in_norms_t1 + log_num_trigrams_t1 + mean_log_word_freq_t1,
     data = vocab_df)
)
##
## Call:
## lm(formula = log_num_trigrams_t2 ~ log_centrality + age_t1 +
## age_t2 + log_word_in_norms_t1 + log_num_trigrams_t1 + mean_log_word_freq_t1,
## data = vocab_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.00954 -0.24090 0.00498 0.24206 1.17050
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.609e-16 4.387e-02 0.000 1.000000
## log_centrality 1.217e-01 4.675e-02 2.604 0.010704 *
## age_t1 -2.134e-01 6.979e-02 -3.058 0.002900 **
## age_t2 1.143e-01 5.875e-02 1.946 0.054601 .
## log_word_in_norms_t1 -2.667e-01 3.206e-01 -0.832 0.407483
## log_num_trigrams_t1 1.035e+00 2.770e-01 3.738 0.000319 ***
## mean_log_word_freq_t1 -5.133e-01 9.154e-02 -5.608 2.05e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4408 on 94 degrees of freedom
## Multiple R-squared: 0.8173, Adjusted R-squared: 0.8057
## F-statistic: 70.09 on 6 and 94 DF, p-value: < 2.2e-16
# Regress mean_log_freq_trigrams_t2 on log_semantic_density plus age_t1,
# age_t2, log_word_in_norms_t1, mean_log_freq_trigrams_t1 and
# mean_log_word_freq_t1, and print the fitted model's summary.
summary(
  lm(mean_log_freq_trigrams_t2 ~ log_semantic_density + age_t1 + age_t2 +
       log_word_in_norms_t1 + mean_log_freq_trigrams_t1 +
       mean_log_word_freq_t1,
     data = vocab_df)
)
##
## Call:
## lm(formula = mean_log_freq_trigrams_t2 ~ log_semantic_density +
## age_t1 + age_t2 + log_word_in_norms_t1 + mean_log_freq_trigrams_t1 +
## mean_log_word_freq_t1, data = vocab_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.93735 -0.27290 -0.03837 0.28297 1.10832
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.445e-16 4.310e-02 0.000 1.00000
## log_semantic_density 3.695e-02 4.410e-02 0.838 0.40417
## age_t1 9.694e-02 6.894e-02 1.406 0.16296
## age_t2 -4.996e-02 5.767e-02 -0.866 0.38860
## log_word_in_norms_t1 -5.996e-01 1.055e-01 -5.681 1.49e-07 ***
## mean_log_freq_trigrams_t1 1.911e-01 6.882e-02 2.776 0.00663 **
## mean_log_word_freq_t1 3.878e-01 6.931e-02 5.596 2.16e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4332 on 94 degrees of freedom
## Multiple R-squared: 0.8236, Adjusted R-squared: 0.8124
## F-statistic: 73.16 on 6 and 94 DF, p-value: < 2.2e-16
# Regress mean_log_freq_trigrams_t2 on log_centrality plus age_t1, age_t2,
# log_word_in_norms_t1, mean_log_freq_trigrams_t1 and mean_log_word_freq_t1,
# and print the fitted model's summary.
summary(
  lm(mean_log_freq_trigrams_t2 ~ log_centrality + age_t1 + age_t2 +
       log_word_in_norms_t1 + mean_log_freq_trigrams_t1 +
       mean_log_word_freq_t1,
     data = vocab_df)
)
##
## Call:
## lm(formula = mean_log_freq_trigrams_t2 ~ log_centrality + age_t1 +
## age_t2 + log_word_in_norms_t1 + mean_log_freq_trigrams_t1 +
## mean_log_word_freq_t1, data = vocab_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.97242 -0.27562 -0.05022 0.23494 1.06416
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.385e-16 4.307e-02 0.000 1.00000
## log_centrality -4.152e-02 4.560e-02 -0.911 0.36483
## age_t1 1.044e-01 6.851e-02 1.524 0.13093
## age_t2 -4.470e-02 5.772e-02 -0.774 0.44066
## log_word_in_norms_t1 -5.809e-01 1.064e-01 -5.458 3.90e-07 ***
## mean_log_freq_trigrams_t1 1.971e-01 6.885e-02 2.864 0.00517 **
## mean_log_word_freq_t1 4.084e-01 7.214e-02 5.660 1.63e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4329 on 94 degrees of freedom
## Multiple R-squared: 0.8239, Adjusted R-squared: 0.8126
## F-statistic: 73.28 on 6 and 94 DF, p-value: < 2.2e-16