library(knitr)
# Global knitr chunk options. Logical values are spelled out as TRUE/FALSE
# because `T` and `F` are ordinary variables that can be reassigned.
opts_chunk$set(
  echo = TRUE, message = FALSE, warning = FALSE,
  error = FALSE, cache = FALSE, tidy = FALSE
)
library(tidyverse)
library(langcog)
library(data.table)
library(feather)
# Default ggplot2 theme for every plot in this report.
theme_set(theme_classic(base_size = 10))

# Threshold compared against each child's summed per-word `count` in the
# `count >= MINWORDSFORVOCAB` filters used when building the vocabularies.
MINWORDSFORVOCAB <- 5
# The min words for vocab here is 5.
# Read in data
# Project-relative inputs produced by earlier analysis steps.
all_types <- read_csv("../1_mtld_measure/data/target_types_for_MTLD_kids_600_900.csv")
groups_info <- read_csv("../1_mtld_measure/data/groups_info_600_900_corrected.csv")
trigrams <- read_csv("../2_trigrams/mtld_continuous_trigram_by_kid_MIN1.csv")

# Word frequency norms; `Word` and `Lg10WF` are renamed to `word` and
# `log_freq`.
# NOTE(review): this is an absolute, machine-specific path, so the script only
# runs on the original author's machine. Consider a project-relative path.
freq <- read_tsv("/Users/mollylewis/Documents/research/Projects/ref_complex/Papers/RC_old/analysis/data/corpus/SUBTLEXus_corpus.txt") %>%
  rename(
    word = Word,
    log_freq = Lg10WF
  )

# Semantic density norms, with hyphenated column names renamed to
# syntactic ones, keeping the columns from `word` through `semantic_density`.
# NOTE(review): the URL embeds an access token in the query string. It should
# not be committed and has probably expired -- confirm, and load the norms
# from a local copy or an unauthenticated URL instead.
density_norms <- read_csv(RCurl::getURL("https://raw.githubusercontent.com/billdthompson/semantic-density-norms/master/results/en-semantic-densities-N100000.csv?token=AF32iZ4ROE3EvwU8sZ5PVztiNF7PyLaRks5bBF6awA%3D%3D")) %>%
  rename(
    semantic_density = `semantic-density`,
    neighb_count = `neighbour-count`,
    neighb_conc = `neighbour-concentration`,
    centrality = `global-centrality`
  ) %>%
  select(word:semantic_density)

# Get filtered version of types for each kid
# Per-child vocabulary at t1: lower-case each gloss, sum its counts within
# child, keep words whose summed count reaches MINWORDSFORVOCAB, then nest so
# there is one row per child with that child's words in the `data` column.
nested_data_by_kid_t1 <- all_types %>%
  filter(tbin == "t1") %>%
  mutate(gloss_clean = tolower(gloss)) %>%
  group_by(target_child_id, gloss_clean) %>%
  summarize(count = sum(count)) %>%
  filter(count >= MINWORDSFORVOCAB) %>%
  nest(-target_child_id)

# Per-child vocabulary labelled t2, built the same way.
# NOTE(review): unlike the t1 pipeline there is no `filter(tbin == "t2")`
# here, so counts are summed over every time bin in `all_types`. Confirm
# whether this is an intentional cumulative vocabulary or a missing filter.
nested_data_by_kid_t2 <- all_types %>%
  mutate(gloss_clean = tolower(gloss)) %>%
  group_by(target_child_id, gloss_clean) %>%
  summarize(count = sum(count)) %>%
  filter(count >= MINWORDSFORVOCAB) %>%
  nest(-target_child_id)

# Get mean density at t1
#' Summarise one child's vocabulary against density and frequency norms
#'
#' @param id Value stored in the `target_child_id` column of the result.
#' @param data Data frame with one row per word and a `gloss_clean` column.
#' @param density_norms Data frame with a `word` column. Every other column
#'   is averaged with `mean()` (no `na.rm`), so an `NA` in a matched row
#'   propagates to that column's mean.
#' @param freq_norms Data frame with `word` and `log_freq` columns.
#' @return A one-row data frame with `target_child_id` first, then the mean of
#'   each norm column over the words of `data` found in `density_norms`,
#'   `words_in_norms` (number of such words), `total_words` (rows in `data`)
#'   and `mean_log_word_freq` (mean `log_freq`, ignoring missing values).
get_density_by_kid <- function(id, data, density_norms, freq_norms) {
  n_words <- nrow(data)

  # Words absent from the frequency norms get NA and are dropped by na.rm.
  kid_freq <- data %>%
    left_join(freq_norms, by = c("gloss_clean" = "word")) %>%
    summarize(mean_log_freq = mean(log_freq, na.rm = TRUE))

  # Norm rows for the words this child produced; `word` is dropped so that
  # only the columns to be averaged remain.
  kid_norms <- density_norms %>%
    filter(word %in% data$gloss_clean) %>%
    select(-word)

  kid_norms %>%
    summarize_all(mean) %>%
    mutate(
      target_child_id = id,
      words_in_norms = nrow(kid_norms),
      total_words = n_words,
      mean_log_word_freq = kid_freq$mean_log_freq
    ) %>%
    select(target_child_id, everything())
}
# t1 vocab measures: one row per child from get_density_by_kid(), with the
# count/frequency columns suffixed `_t1`, log-transformed versions added, and
# the raw norm means dropped.
vocab_measures_t1 <- map2_df(
  nested_data_by_kid_t1$target_child_id,
  nested_data_by_kid_t1$data,
  get_density_by_kid,
  density_norms,
  freq
) %>%
  rename(
    words_in_norms_t1 = words_in_norms,
    total_words_t1 = total_words,
    mean_log_word_freq_t1 = mean_log_word_freq
  ) %>%
  mutate(
    log_density_t1 = log(semantic_density),
    log_centrality_t1 = log(centrality),
    log_total_words_t1 = log(total_words_t1),
    log_word_in_norms_t1 = log(words_in_norms_t1)
  ) %>%
  select(-centrality, -neighb_count, -neighb_conc, -semantic_density)

# t2 vocab measures: same steps with the `_t2` suffix.
vocab_measures_t2 <- map2_df(
  nested_data_by_kid_t2$target_child_id,
  nested_data_by_kid_t2$data,
  get_density_by_kid,
  density_norms,
  freq
) %>%
  rename(
    words_in_norms_t2 = words_in_norms,
    total_words_t2 = total_words,
    mean_log_word_freq_t2 = mean_log_word_freq
  ) %>%
  mutate(
    log_density_t2 = log(semantic_density),
    log_centrality_t2 = log(centrality),
    log_total_words_t2 = log(total_words_t2),
    log_word_in_norms_t2 = log(words_in_norms_t2)
  ) %>%
  select(-centrality, -neighb_count, -neighb_conc, -semantic_density)

# `target_child_id` is the only column the two tables share after the
# renames, so the key is stated explicitly instead of being inferred.
vocab_measures <- full_join(
  vocab_measures_t1,
  vocab_measures_t2,
  by = "target_child_id"
)

# Merge in other variables
# Attach the per-child group, MTLD and age variables and the trigram
# measures to the vocabulary measures. MTLD at each time point is kept only
# as log(x + 1); the raw mtld_t1/mtld_t2 columns are dropped.
vocab_df <- vocab_measures %>%
  left_join(
    groups_info %>%
      select(
        delta_resid_group, target_child_id, mtld_t1,
        mtld_t2, age_t1, age_t2, mtld_diff, age_diff
      ),
    by = "target_child_id"
  ) %>%
  mutate(
    log_mtld_t2 = log(mtld_t2 + 1),
    log_mtld_t1 = log(mtld_t1 + 1)
  ) %>%
  left_join(
    trigrams %>%
      select(
        target_child_id, log_num_trigrams_t1, log_num_trigrams_t2,
        mean_log_freq_trigrams_t1, mean_log_freq_trigrams_t2
      ),
    by = "target_child_id"
  ) %>%
  select(-mtld_t1, -mtld_t2)
# mutate_if(is.numeric, scale) # scale everything for regressions

write_csv(vocab_df, "semantic_density_df.csv")

# log_mtld_t2 regressed on log_density_t1, with log_mtld_t1 and the age
# variables as covariates.
lm(log_mtld_t2 ~ log_density_t1 + log_mtld_t1 + age_diff + age_t1,
  data = vocab_df
) %>%
  summary()
##
## Call:
## lm(formula = log_mtld_t2 ~ log_density_t1 + log_mtld_t1 + age_diff +
## age_t1, data = vocab_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.50028 -0.18077 -0.01099 0.18045 0.86402
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.328138 1.867447 1.247 0.216
## log_density_t1 0.077231 0.076130 1.014 0.313
## log_mtld_t1 0.616120 0.064274 9.586 3.52e-15 ***
## age_diff 0.001303 0.001714 0.760 0.449
## age_t1 -0.002648 0.002371 -1.117 0.267
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2763 on 85 degrees of freedom
## (11 observations deleted due to missingness)
## Multiple R-squared: 0.5407, Adjusted R-squared: 0.5191
## F-statistic: 25.02 on 4 and 85 DF, p-value: 1.042e-13
# log_mtld_t2 regressed on log_centrality_t1, with log_mtld_t1 and the age
# variables as covariates.
summary(
  lm(
    log_mtld_t2 ~ log_centrality_t1 + log_mtld_t1 + age_diff + age_t1,
    data = vocab_df
  )
)
##
## Call:
## lm(formula = log_mtld_t2 ~ log_centrality_t1 + log_mtld_t1 +
## age_diff + age_t1, data = vocab_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.49851 -0.17139 -0.01161 0.16973 0.88446
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.554766 2.167594 1.640 0.105
## log_centrality_t1 0.373229 0.490407 0.761 0.449
## log_mtld_t1 0.612818 0.066079 9.274 1.5e-14 ***
## age_diff 0.001207 0.001728 0.698 0.487
## age_t1 -0.002637 0.002383 -1.107 0.272
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.277 on 85 degrees of freedom
## (11 observations deleted due to missingness)
## Multiple R-squared: 0.5383, Adjusted R-squared: 0.5166
## F-statistic: 24.78 on 4 and 85 DF, p-value: 1.297e-13
# log_mtld_t2 regressed on both log_density_t1 and log_centrality_t1, with
# log_mtld_t1 and the age variables as covariates.
summary(
  lm(
    log_mtld_t2 ~ log_density_t1 + log_centrality_t1 + log_mtld_t1 +
      age_diff + age_t1,
    data = vocab_df
  )
)
##
## Call:
## lm(formula = log_mtld_t2 ~ log_density_t1 + log_centrality_t1 +
## log_mtld_t1 + age_diff + age_t1, data = vocab_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.50194 -0.18118 -0.01173 0.18022 0.86345
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.947093 3.210738 0.606 0.546
## log_density_t1 0.094071 0.138228 0.681 0.498
## log_centrality_t1 -0.129955 0.888090 -0.146 0.884
## log_mtld_t1 0.618599 0.066831 9.256 1.81e-14 ***
## age_diff 0.001342 0.001745 0.769 0.444
## age_t1 -0.002625 0.002390 -1.098 0.275
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2779 on 84 degrees of freedom
## (11 observations deleted due to missingness)
## Multiple R-squared: 0.5408, Adjusted R-squared: 0.5135
## F-statistic: 19.79 on 5 and 84 DF, p-value: 5.562e-13
# mtld_diff regressed on log_density_t1, with the age variables and
# log_mtld_t1 as covariates.
summary(
  lm(
    mtld_diff ~ log_density_t1 + age_diff + age_t1 + log_mtld_t1,
    data = vocab_df
  )
)
##
## Call:
## lm(formula = mtld_diff ~ log_density_t1 + age_diff + age_t1 +
## log_mtld_t1, data = vocab_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.9888 -2.9122 -0.8178 2.8028 17.0447
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 48.58482 36.67253 1.325 0.189
## log_density_t1 2.13300 1.49502 1.427 0.157
## age_diff -0.01362 0.03366 -0.405 0.687
## age_t1 -0.06136 0.04657 -1.318 0.191
## log_mtld_t1 -6.73635 1.26220 -5.337 7.73e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.425 on 85 degrees of freedom
## (11 observations deleted due to missingness)
## Multiple R-squared: 0.4186, Adjusted R-squared: 0.3912
## F-statistic: 15.3 on 4 and 85 DF, p-value: 1.842e-09
# mtld_diff regressed on log_centrality_t1, with the age variables and
# log_mtld_t1 as covariates.
summary(
  lm(
    mtld_diff ~ log_centrality_t1 + age_diff + age_t1 + log_mtld_t1,
    data = vocab_df
  )
)
##
## Call:
## lm(formula = mtld_diff ~ log_centrality_t1 + age_diff + age_t1 +
## log_mtld_t1, data = vocab_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.4459 -3.1278 -0.8113 2.6113 17.9299
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 73.10696 42.85411 1.706 0.0917 .
## log_centrality_t1 6.30725 9.69552 0.651 0.5171
## age_diff -0.01472 0.03417 -0.431 0.6678
## age_t1 -0.05891 0.04711 -1.251 0.2145
## log_mtld_t1 -6.68263 1.30640 -5.115 1.91e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.476 on 85 degrees of freedom
## (11 observations deleted due to missingness)
## Multiple R-squared: 0.4076, Adjusted R-squared: 0.3797
## F-statistic: 14.62 on 4 and 85 DF, p-value: 3.976e-09
# mtld_diff regressed on density and centrality at both time points, plus
# the age variables, log_mtld_t1 and log total word counts at t1 and t2.
summary(
  lm(
    mtld_diff ~ log_density_t1 + log_centrality_t1 + age_diff +
      age_t1 + log_mtld_t1 + log_density_t2 + log_centrality_t2 +
      log_total_words_t1 + log_total_words_t2,
    data = vocab_df
  )
)
##
## Call:
## lm(formula = mtld_diff ~ log_density_t1 + log_centrality_t1 +
## age_diff + age_t1 + log_mtld_t1 + log_density_t2 + log_centrality_t2 +
## log_total_words_t1 + log_total_words_t2, data = vocab_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.4474 -3.1576 -0.7623 1.8200 20.8240
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 114.535358 93.363938 1.227 0.2235
## log_density_t1 5.243340 2.752231 1.905 0.0604 .
## log_centrality_t1 -34.898206 18.706609 -1.866 0.0658 .
## age_diff -0.008057 0.034329 -0.235 0.8150
## age_t1 -0.052893 0.045525 -1.162 0.2487
## log_mtld_t1 -7.875184 1.365537 -5.767 1.46e-07 ***
## log_density_t2 -4.557817 5.365463 -0.849 0.3982
## log_centrality_t2 69.836508 32.616111 2.141 0.0353 *
## log_total_words_t1 0.462883 1.036437 0.447 0.6564
## log_total_words_t2 0.518431 1.039351 0.499 0.6193
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.256 on 80 degrees of freedom
## (11 observations deleted due to missingness)
## Multiple R-squared: 0.4863, Adjusted R-squared: 0.4286
## F-statistic: 8.416 on 9 and 80 DF, p-value: 9.898e-09
# log_num_trigrams_t2 regressed on log_density_t1, with t1 trigram count,
# age, words-in-norms, mean word frequency and log_density_t2 as covariates.
summary(
  lm(
    log_num_trigrams_t2 ~ log_density_t1 + log_num_trigrams_t1 +
      age_diff + age_t1 + log_word_in_norms_t1 + log_word_in_norms_t2 +
      mean_log_word_freq_t1 + mean_log_word_freq_t2 + log_density_t2,
    data = vocab_df
  )
)
##
## Call:
## lm(formula = log_num_trigrams_t2 ~ log_density_t1 + log_num_trigrams_t1 +
## age_diff + age_t1 + log_word_in_norms_t1 + log_word_in_norms_t2 +
## mean_log_word_freq_t1 + mean_log_word_freq_t2 + log_density_t2,
## data = vocab_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.06441 -0.27453 -0.02732 0.19873 1.59324
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.154147 3.917191 0.550 0.583965
## log_density_t1 0.281318 0.152799 1.841 0.069460 .
## log_num_trigrams_t1 0.246264 0.061293 4.018 0.000136 ***
## age_diff 0.004315 0.003216 1.342 0.183616
## age_t1 -0.002379 0.004289 -0.555 0.580778
## log_word_in_norms_t1 -0.361960 0.116247 -3.114 0.002594 **
## log_word_in_norms_t2 1.136285 0.106095 10.710 < 2e-16 ***
## mean_log_word_freq_t1 0.052644 0.142748 0.369 0.713294
## mean_log_word_freq_t2 0.170032 0.217109 0.783 0.435934
## log_density_t2 0.132887 0.346978 0.383 0.702786
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4625 on 77 degrees of freedom
## (14 observations deleted due to missingness)
## Multiple R-squared: 0.9233, Adjusted R-squared: 0.9143
## F-statistic: 103 on 9 and 77 DF, p-value: < 2.2e-16
# log_num_trigrams_t2 regressed on log_centrality_t1, with t1 trigram count,
# age, words-in-norms, mean word frequency and log_centrality_t2 as
# covariates.
summary(
  lm(
    log_num_trigrams_t2 ~ log_centrality_t1 + log_num_trigrams_t1 +
      age_diff + age_t1 + log_word_in_norms_t1 + log_word_in_norms_t2 +
      mean_log_word_freq_t1 + mean_log_word_freq_t2 + log_centrality_t2,
    data = vocab_df
  )
)
##
## Call:
## lm(formula = log_num_trigrams_t2 ~ log_centrality_t1 + log_num_trigrams_t1 +
## age_diff + age_t1 + log_word_in_norms_t1 + log_word_in_norms_t2 +
## mean_log_word_freq_t1 + mean_log_word_freq_t2 + log_centrality_t2,
## data = vocab_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.85743 -0.27263 0.00408 0.17887 1.79782
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.329689 5.670422 4.114 9.65e-05 ***
## log_centrality_t1 2.129950 1.050609 2.027 0.04609 *
## log_num_trigrams_t1 0.245509 0.056075 4.378 3.72e-05 ***
## age_diff 0.003976 0.002971 1.338 0.18480
## age_t1 -0.001419 0.003938 -0.360 0.71960
## log_word_in_norms_t1 -0.241495 0.110509 -2.185 0.03191 *
## log_word_in_norms_t2 0.893284 0.113694 7.857 1.92e-11 ***
## mean_log_word_freq_t1 0.009115 0.150576 0.061 0.95189
## mean_log_word_freq_t2 -0.250680 0.239485 -1.047 0.29849
## log_centrality_t2 6.751832 2.409092 2.803 0.00641 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4227 on 77 degrees of freedom
## (14 observations deleted due to missingness)
## Multiple R-squared: 0.936, Adjusted R-squared: 0.9285
## F-statistic: 125 on 9 and 77 DF, p-value: < 2.2e-16
# log_num_trigrams_t2 regressed on centrality and density at both time
# points, plus t1 trigram count, age, words-in-norms and mean word frequency.
# The original formula listed `log_centrality_t2` twice. R collapses repeated
# terms, so the fitted model is unchanged; only the echoed call differs.
lm(
  log_num_trigrams_t2 ~ log_centrality_t1 + log_centrality_t2 +
    log_density_t1 + log_density_t2 + log_num_trigrams_t1 + age_diff +
    age_t1 + log_word_in_norms_t1 + log_word_in_norms_t2 +
    mean_log_word_freq_t1 + mean_log_word_freq_t2,
  data = vocab_df
) %>%
  summary()
##
## Call:
## lm(formula = log_num_trigrams_t2 ~ log_centrality_t1 + log_centrality_t2 +
## log_density_t1 + log_density_t2 + log_num_trigrams_t1 + age_diff +
## age_t1 + log_word_in_norms_t1 + log_word_in_norms_t2 + mean_log_word_freq_t1 +
## mean_log_word_freq_t2 + log_centrality_t2, data = vocab_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.91379 -0.19204 0.00719 0.18141 1.41606
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 42.185327 9.243930 4.564 1.93e-05 ***
## log_centrality_t1 0.963013 1.980311 0.486 0.628177
## log_centrality_t2 13.541510 3.746847 3.614 0.000543 ***
## log_density_t1 0.097392 0.259600 0.375 0.708598
## log_density_t2 -1.192939 0.484249 -2.463 0.016052 *
## log_num_trigrams_t1 0.210312 0.056193 3.743 0.000354 ***
## age_diff 0.003224 0.002903 1.110 0.270345
## age_t1 -0.001968 0.003859 -0.510 0.611593
## log_word_in_norms_t1 -0.155477 0.112751 -1.379 0.172014
## log_word_in_norms_t2 0.849467 0.111794 7.598 6.91e-11 ***
## mean_log_word_freq_t1 0.134994 0.173001 0.780 0.437666
## mean_log_word_freq_t2 -0.447029 0.261319 -1.711 0.091277 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4106 on 75 degrees of freedom
## (14 observations deleted due to missingness)
## Multiple R-squared: 0.9411, Adjusted R-squared: 0.9325
## F-statistic: 109 on 11 and 75 DF, p-value: < 2.2e-16
# mean_log_freq_trigrams_t2 regressed on log_density_t1, with t1 trigram
# count, age, words-in-norms, mean word frequency and log_density_t2 as
# covariates.
summary(
  lm(
    mean_log_freq_trigrams_t2 ~ log_density_t1 + log_num_trigrams_t1 +
      age_diff + age_t1 + log_word_in_norms_t1 + log_word_in_norms_t2 +
      mean_log_word_freq_t1 + mean_log_word_freq_t2 + log_density_t2,
    data = vocab_df
  )
)
##
## Call:
## lm(formula = mean_log_freq_trigrams_t2 ~ log_density_t1 + log_num_trigrams_t1 +
## age_diff + age_t1 + log_word_in_norms_t1 + log_word_in_norms_t2 +
## mean_log_word_freq_t1 + mean_log_word_freq_t2 + log_density_t2,
## data = vocab_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.179659 -0.032371 -0.002094 0.035780 0.150014
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.4536048 0.4956081 0.915 0.3629
## log_density_t1 -0.0038107 0.0193324 -0.197 0.8443
## log_num_trigrams_t1 -0.0133411 0.0077549 -1.720 0.0894 .
## age_diff -0.0001089 0.0004069 -0.268 0.7896
## age_t1 0.0002166 0.0005426 0.399 0.6909
## log_word_in_norms_t1 0.0230250 0.0147078 1.565 0.1216
## log_word_in_norms_t2 -0.1103722 0.0134232 -8.222 3.79e-12 ***
## mean_log_word_freq_t1 0.0010097 0.0180606 0.056 0.9556
## mean_log_word_freq_t2 0.0098051 0.0274689 0.357 0.7221
## log_density_t2 0.0842354 0.0439001 1.919 0.0587 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.05852 on 77 degrees of freedom
## (14 observations deleted due to missingness)
## Multiple R-squared: 0.8948, Adjusted R-squared: 0.8825
## F-statistic: 72.74 on 9 and 77 DF, p-value: < 2.2e-16
# mean_log_freq_trigrams_t2 regressed on log_centrality_t1, with t1 trigram
# count, age, words-in-norms, mean word frequency and log_centrality_t2 as
# covariates.
summary(
  lm(
    mean_log_freq_trigrams_t2 ~ log_centrality_t1 + log_num_trigrams_t1 +
      age_diff + age_t1 + log_word_in_norms_t1 + log_word_in_norms_t2 +
      mean_log_word_freq_t1 + mean_log_word_freq_t2 + log_centrality_t2,
    data = vocab_df
  )
)
##
## Call:
## lm(formula = mean_log_freq_trigrams_t2 ~ log_centrality_t1 +
## log_num_trigrams_t1 + age_diff + age_t1 + log_word_in_norms_t1 +
## log_word_in_norms_t2 + mean_log_word_freq_t1 + mean_log_word_freq_t2 +
## log_centrality_t2, data = vocab_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.236366 -0.028185 -0.001167 0.027665 0.139214
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.3655768 0.7971563 0.459 0.6478
## log_centrality_t1 -0.1132254 0.1476962 -0.767 0.4457
## log_num_trigrams_t1 -0.0164908 0.0078831 -2.092 0.0397 *
## age_diff -0.0001359 0.0004177 -0.325 0.7458
## age_t1 0.0001285 0.0005536 0.232 0.8171
## log_word_in_norms_t1 0.0207918 0.0155355 1.338 0.1847
## log_word_in_norms_t2 -0.0915469 0.0159832 -5.728 1.88e-07 ***
## mean_log_word_freq_t1 0.0076215 0.0211682 0.360 0.7198
## mean_log_word_freq_t2 0.0428827 0.0336671 1.274 0.2066
## log_centrality_t2 -0.1532772 0.3386737 -0.453 0.6521
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.05943 on 77 degrees of freedom
## (14 observations deleted due to missingness)
## Multiple R-squared: 0.8915, Adjusted R-squared: 0.8788
## F-statistic: 70.29 on 9 and 77 DF, p-value: < 2.2e-16
# t1 word types per child: one row per (child, lower-cased gloss) with the
# summed count, keeping rows whose count reaches MINWORDSFORVOCAB.
types_clean <- all_types %>%
  filter(tbin == "t1") %>%
  group_by(target_child_id, gloss_clean = tolower(gloss)) %>%
  summarise(count = sum(count)) %>%
  filter(count >= MINWORDSFORVOCAB)
# For each word, count its rows in `types_clean` (one row per child and
# word), keep words with at least 5 rows, attach the density norms, drop
# words without a centrality value, and bin centrality and semantic density
# into deciles.
word_by_decile <- ungroup(types_clean) %>%
  count(gloss_clean) %>%
  filter(n >= 5) %>%
  left_join(density_norms, by = c("gloss_clean" = "word")) %>%
  filter(!is.na(centrality)) %>%
  mutate(
    decile_centrality = ntile(centrality, 10),
    decile_density = ntile(semantic_density, 10)
  )
# Signed and absolute gap between each word's centrality decile and density
# decile, sorted so the largest disagreements come first.
word_by_decile_diff <- word_by_decile %>%
  # filter(decile_centrality != decile_density) %>%
  mutate(
    decile_diff = decile_centrality - decile_density,
    abs_decile_diff = abs(decile_diff)
  ) %>%
  # filter(abs_decile_diff > 4) %>%
  # Drops columns 2 through 6 by position.
  # NOTE(review): presumably `n` plus the four norm columns -- confirm the
  # column order, or select by name to make this robust.
  select(-2:-6) %>%
  arrange(desc(abs_decile_diff))
# Interactive table of the per-word decile differences.
DT::datatable(word_by_decile_diff)

# Per-child log centrality at t1 against t2, with a linear fit.
ggplot(vocab_measures, aes(x = log_centrality_t1, y = log_centrality_t2)) +
  geom_point() +
  geom_smooth(method = "lm")

# Per-child log density at t1 against t2, with a linear fit.
ggplot(vocab_measures, aes(x = log_density_t1, y = log_density_t2)) +
  geom_point() +
  geom_smooth(method = "lm")