library(knitr)
library(tidyverse)
library(boot)
library(broom)
opts_chunk$set(echo = T, message = F, warning = F,
error = F, cache = F, tidy = F)
theme_set(theme_classic(base_size = 10))

Regression models suggest that vocabulary density at t1 is negatively associated with the number of trigrams at t2: kids who know words that are less dense in semantic space at t1 produce more trigrams at t2. In this analysis, I use a repeated cross-validation procedure, sampling 80% of the kids (the training set) and using their vocabularies at t1 to predict the t2 outcomes of the remaining 20% (the test set). To do this, I take all the words that kids knew at t1 and scale the centrality and density values to have mean zero: values above zero indicate that a word is relatively central/dense, and values below zero indicate that it is relatively less central/dense. For the weighted measure, I multiply the scaled centrality/density value by the log of the number of training kids who said that word at t1. I then subset each test kid's t1 vocabulary to the words that kids in the training group knew at t1, and compute that kid's mean vocabulary centrality/density at t1 along with their number of trigrams at t2. Next, I estimate the correlation between mean centrality/density at t1 and trigrams at t2 across kids in the test set. Finally, I repeat this procedure NSAMPLES times and bootstrap means and CIs of the estimate.
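As a minimal sketch of the scaling and weighting scheme (toy words, norm values, and counts, not the real data), a word's weighted measure is its mean-centered norm value multiplied by the log of the number of training kids who produced it:

# toy illustration of the scaled and weighted density measure (hypothetical values)
toy_norms <- tibble(word = c("dog", "ball", "juice", "tiger"),
                    semantic_density = c(.62, .58, .41, .20),
                    n_kids = c(50, 40, 10, 2)) # training kids who said each word at t1
toy_norms %>%
  mutate(scaled_density = as.numeric(scale(semantic_density)), # mean zero across words
         weighted_density = scaled_density * log(n_kids))      # log-count weighting

Under this scheme, words produced by many training kids contribute more to a test kid's vocabulary mean, and words produced by only a few kids are shrunk toward zero.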
all_types <- read_csv("../1_mtld_measure/data/target_types_for_MTLD_kids_600_900.csv")
trigram_outcomes <- read_csv("semantic_density_df.csv")
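# word-level semantic density and global centrality norms from the semantic-density-norms repo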
density_norms <-read_csv(RCurl::getURL("https://raw.githubusercontent.com/billdthompson/semantic-density-norms/master/results/en-semantic-densities-N100000.csv?token=AF32iXP7uN49YWb0EglCMjVLCP56VLAfks5bGBfqwA%3D%3D")) %>%
rename(semantic_density = `semantic-density`,
centrality = `global-centrality`) %>%
select(word:semantic_density)

nested_data_by_kid_t1 <- all_types %>%
filter(tbin == "t1") %>%
mutate(gloss_clean = tolower(gloss)) %>%
group_by(target_child_id, gloss_clean) %>%
summarize(count = sum(count)) %>%
filter(count >= params$MINWORDSFORVOCAB) %>%
nest(-target_child_id)
nested_data_by_kid_t2 <- all_types %>%
filter(tbin == "t2") %>%
mutate(gloss_clean = tolower(gloss)) %>%
group_by(target_child_id, gloss_clean) %>%
summarize(count = sum(count)) %>%
filter(count >= params$MINWORDSFORVOCAB) %>%
nest(-target_child_id)

# function for cross-validation
get_corr_for_sample <- function(i,
kid_t1_data,
outcome_data,
norms,
fraction_sampled,
this_measure){
# training kids
training_kids_words <- kid_t1_data %>%
sample_frac(fraction_sampled) %>%
unnest() %>%
select(-count) %>%
mutate(this_measure = this_measure)
training_kids_words_with_measure <- training_kids_words %>%
count(gloss_clean) %>%
left_join(norms, by = c("gloss_clean" = "word")) %>%
filter(!is.na(semantic_density)) %>%
mutate(norm_measure = case_when(this_measure == "centrality" ~ centrality,
this_measure == "semantic_density" ~ semantic_density),
weighted_measure = norm_measure*log(n)) %>%
select(gloss_clean, norm_measure, weighted_measure)
# testing kids
testing_kids_with_measure <- kid_t1_data %>%
filter(!(target_child_id %in% training_kids_words$target_child_id)) %>%
unnest() %>%
left_join(training_kids_words_with_measure) %>% # merge in measures computed from the training kids' words
group_by(target_child_id) %>%
summarize(norm_measure_mean = mean(norm_measure, na.rm = T),
weighted_measure_mean = mean(weighted_measure, na.rm = T))
testing_kids_with_measure_mtld <- testing_kids_with_measure %>%
left_join(outcome_data %>% select(target_child_id, log_num_trigrams_t2))
data.frame(sample = i,
norm_measure_mean_corr = cor(testing_kids_with_measure_mtld$log_num_trigrams_t2,
testing_kids_with_measure_mtld$norm_measure_mean)[[1]],
weighted_measure_mean_corr = cor(testing_kids_with_measure_mtld$log_num_trigrams_t2,
testing_kids_with_measure_mtld$weighted_measure_mean)[[1]])
}

all_t1_words <- nested_data_by_kid_t1 %>%
unnest() %>%
select(gloss_clean) %>%
unlist(use.names = F)
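# scale the norms over just the words produced at t1, so each measure has mean zero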
density_norms_scaled <- density_norms %>%
filter(word %in% all_t1_words) %>%
mutate(centrality = scale(centrality),
semantic_density = scale(semantic_density))
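# repeat the train/test split NSAMPLES times for the density measure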
sampled_corrs <- map_df(c(1:params$NSAMPLES),
get_corr_for_sample,
nested_data_by_kid_t1,
trigram_outcomes,
density_norms_scaled,
params$TRAIN_FRACTION,
"semantic_density")Distribution of correlations between mtld_t1
sampled_corrs %>%
gather(corr_type, value, -1) %>%
ggplot(aes(x = value, fill = corr_type)) +
geom_density(alpha = .8) +
geom_vline(aes(xintercept = 0), linetype = 2) +
facet_wrap(~corr_type) +
theme(legend.position = "none")Bootsrap correlation coefficient across samples
boot_unweighted_corr <- boot(sampled_corrs$norm_measure_mean_corr,
function(d, i) {mean(d[i])}, R = 1000)
boot_weighted_corr <- boot(sampled_corrs$weighted_measure_mean_corr,
function(d, i) {mean(d[i])}, R = 1000)
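# boot.ci()$normal is (conf. level, lower, upper), so elements 2 and 3 are the CI bounds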
estimate_df <- data.frame(corr_type = c("unweighted", "weighted"),
corr = c(boot_unweighted_corr$t0, boot_weighted_corr$t0),
ci_low = c(boot.ci(boot_unweighted_corr)$normal[2], boot.ci(boot_weighted_corr)$normal[2]),
ci_high = c(boot.ci(boot_unweighted_corr)$normal[3], boot.ci(boot_weighted_corr)$normal[3]))
kable(estimate_df)

| corr_type | corr | ci_low | ci_high |
|---|---|---|---|
| unweighted | 0.0687711 | 0.0150879 | 0.1217618 |
| weighted | -0.2094690 | -0.2528513 | -0.1653177 |
ggplot(estimate_df, aes(x = corr_type, y = corr, color = corr_type)) +
geom_pointrange(aes(ymin = ci_low, ymax = ci_high), size = .3) +
ylab("estimated correlation coefficient of\nvocabulary density and mtld at time 2") +
geom_hline(aes(yintercept = 0), linetype = 2)+
theme(legend.position = "none")sampled_corrs <- map_df(c(1:params$NSAMPLES),
get_corr_for_sample,
nested_data_by_kid_t1,
trigram_outcomes,
density_norms_scaled,
params$TRAIN_FRACTION,
"centrality")Distribution of correlations between mtld_t1
sampled_corrs %>%
gather(corr_type, value, -1) %>%
ggplot(aes(x = value, fill = corr_type)) +
geom_density(alpha = .8) +
geom_vline(aes(xintercept = 0), linetype = 2) +
facet_wrap(~corr_type) +
theme(legend.position = "none")Bootsrap correlation coefficient across samples
boot_unweighted_corr <- boot(sampled_corrs$norm_measure_mean_corr,
function(d, i) {mean(d[i])}, R = 1000)
boot_weighted_corr <- boot(sampled_corrs$weighted_measure_mean_corr,
function(d, i) {mean(d[i])}, R = 1000)
estimate_df <- data.frame(corr_type = c("unweighted", "weighted"),
corr = c(boot_unweighted_corr$t0, boot_weighted_corr$t0),
ci_low = c(boot.ci(boot_unweighted_corr)$normal[2], boot.ci(boot_weighted_corr)$normal[2]),
ci_high = c(boot.ci(boot_unweighted_corr)$normal[3], boot.ci(boot_weighted_corr)$normal[3]))
kable(estimate_df)

| corr_type | corr | ci_low | ci_high |
|---|---|---|---|
| unweighted | 0.2300603 | 0.1770034 | 0.2836633 |
| weighted | -0.5880123 | -0.6163258 | -0.5600064 |
ggplot(estimate_df, aes(x = corr_type, y = corr, color = corr_type)) +
geom_pointrange(aes(ymin = ci_low, ymax = ci_high), size = .3) +
ylab("estimated correlation coefficient of \nvocabulary centrality and mtld at time 2") +
geom_hline(aes(yintercept = 0), linetype = 2)+
theme(legend.position = "none")Number of words at t1 and t2, age diff, age t1, and log_num_trigrams_t1
# function for cross-validation
get_corr_for_sample2 <- function(i,
kid_t1_data,
outcome_data,
norms,
fraction_sampled,
this_measure){
# training kids
training_kids_words <- kid_t1_data %>%
sample_frac(fraction_sampled) %>%
unnest() %>%
select(-count) %>%
mutate(this_measure = this_measure)
training_kids_words_with_measure <- training_kids_words %>%
count(gloss_clean) %>%
left_join(norms, by = c("gloss_clean" = "word")) %>%
filter(!is.na(semantic_density)) %>%
mutate(norm_measure = case_when(this_measure == "centrality" ~ centrality,
this_measure == "semantic_density" ~ semantic_density),
weighted_measure = norm_measure*log(n)) %>%
select(gloss_clean, norm_measure, weighted_measure)
# testing kids
testing_kids_with_measure <- kid_t1_data %>%
filter(!(target_child_id %in% training_kids_words$target_child_id)) %>%
unnest() %>%
left_join(training_kids_words_with_measure) %>% # merge in measures computed from the training kids' words
group_by(target_child_id) %>%
summarize(norm_measure_mean = mean(norm_measure, na.rm = T),
weighted_measure_mean = mean(weighted_measure, na.rm = T))
testing_kids_with_measure_mtld <- testing_kids_with_measure %>%
left_join(outcome_data)
# t statistic for the (unweighted) measure coefficient, controlling for covariates
mean_stat <- lm(log_num_trigrams_t2 ~ log_total_words_t1 + log_total_words_t2 +
norm_measure_mean + age_diff + age_t1, testing_kids_with_measure_mtld) %>%
summary() %>%
tidy() %>%
filter(term == "norm_measure_mean") %>%
pull(statistic)
# t statistic for the weighted measure coefficient, additionally controlling for log_num_trigrams_t1
weighted_mean_stat <- lm(log_num_trigrams_t2 ~ log_total_words_t1 + log_total_words_t2 +
weighted_measure_mean + age_diff + age_t1 + log_num_trigrams_t1, testing_kids_with_measure_mtld) %>%
summary() %>%
tidy() %>%
filter(term == "weighted_measure_mean") %>%
pull(statistic)
data.frame(sample = i,
norm_measure_mean_stat = mean_stat,
weighted_measure_mean_stat = weighted_mean_stat)
}

# cross-validated regressions for the density measure
sampled_stats <- map_df(c(1:params$NSAMPLES),
get_corr_for_sample2,
nested_data_by_kid_t1,
trigram_outcomes,
density_norms_scaled,
params$TRAIN_FRACTION,
"semantic_density")Distribution of test statistcs of density as a predictor of number of trigrams at t2
sampled_stats %>%
gather(corr_type, value, -1) %>%
ggplot(aes(x = value, fill = corr_type)) +
geom_density(alpha = .8) +
geom_vline(aes(xintercept = 2), linetype = 2) +
facet_wrap(~corr_type) +
theme(legend.position = "none")Bootsrap correlation statistic across samples
boot_unweighted_stat <- boot(sampled_stats$norm_measure_mean_stat,
function(d, i) {mean(d[i])}, R = 1000)
boot_weighted_stat <- boot(sampled_stats$weighted_measure_mean_stat,
function(d, i) {mean(d[i])}, R = 1000)
estimate_df <- data.frame(stat_type = c("unweighted", "weighted"),
stat = c(boot_unweighted_stat$t0, boot_weighted_stat$t0),
ci_low = c(boot.ci(boot_unweighted_stat)$normal[2], boot.ci(boot_weighted_stat)$normal[2]),
ci_high = c(boot.ci(boot_unweighted_stat)$normal[3], boot.ci(boot_weighted_stat)$normal[3]))
kable(estimate_df)

| stat_type | stat | ci_low | ci_high |
|---|---|---|---|
| unweighted | 0.7947348 | 0.5744865 | 1.016373 |
| weighted | 0.9656777 | 0.7258297 | 1.197559 |
ggplot(estimate_df, aes(x = stat_type, y = stat, color = stat_type)) +
geom_pointrange(aes(ymin = ci_low, ymax = ci_high), size = .3) +
ylab("estimated statistic of density coefficient \npredicting num trigrams at t2") +
geom_hline(aes(yintercept = 2), linetype = 2)+
theme(legend.position = "none")sampled_stats <- map_df(c(1:params$NSAMPLES),
get_corr_for_sample2,
nested_data_by_kid_t1,
trigram_outcomes,
density_norms_scaled,
params$TRAIN_FRACTION,
"centrality")Distribution of test statistcs of centrality as a predictor of number of trigrams at t2
sampled_stats %>%
gather(corr_type, value, -1) %>%
ggplot(aes(x = value, fill = corr_type)) +
geom_density(alpha = .8) +
geom_vline(aes(xintercept = 2), linetype = 2) +
facet_wrap(~corr_type) +
theme(legend.position = "none")Bootsrap correlation statistic across samples
boot_unweighted_stat <- boot(sampled_stats$norm_measure_mean_stat,
function(d, i) {mean(d[i])}, R = 1000)
boot_weighted_stat <- boot(sampled_stats$weighted_measure_mean_stat,
function(d, i) {mean(d[i])}, R = 1000)
estimate_df <- data.frame(stat_type = c("unweighted", "weighted"),
stat = c(boot_unweighted_stat$t0, boot_weighted_stat$t0),
ci_low = c(boot.ci(boot_unweighted_stat)$normal[2], boot.ci(boot_weighted_stat)$normal[2]),
ci_high = c(boot.ci(boot_unweighted_stat)$normal[3], boot.ci(boot_weighted_stat)$normal[3]))
kable(estimate_df)

| stat_type | stat | ci_low | ci_high |
|---|---|---|---|
| unweighted | 1.9208173 | 1.6230378 | 2.2178735 |
| weighted | 0.4980724 | 0.2448795 | 0.7544975 |
ggplot(estimate_df, aes(x = stat_type, y = stat, color = stat_type)) +
geom_pointrange(aes(ymin = ci_low, ymax = ci_high), size = .3) +
ylab("estimated statistic of centrality coefficient \npredicting num trigrams at t2") +
geom_hline(aes(yintercept = 2), linetype = 2) +
theme(legend.position = "none")