Semantic density analyses - cross validation, predicting trigrams at t2

Correlation
- Semantic Density
- Centrality
Controlling for stuff
- Semantic Density
- Centrality

library(knitr)
library(tidyverse)
library(boot)
library(broom)

opts_chunk$set(echo = T, message = F, warning = F, 
               error = F, cache = F, tidy = F)

theme_set(theme_classic(base_size = 10))

Regression models suggest that vocabulary density at t1 is negatively associated with number of trigrams at time 2: Kids who know words that are less dense in semantic space at t1 are able to produce more trigrams at t2. In this analysis, I do a repeated cross-validation procedure by sampling 80% of the kids and use their vocabulary at t1 to predict the outcomes of the other 20% kids at t2. To do this, I took all the words that kids knew at t1 and scaled the centrality and density values such that the mean was zero; values above zero suggest that the words are relatively central/dense, and words below zero indicate that the word is relatively less central/dense. For the weighted measure, I multiplied the scaled centrality/density value by the log of the number of kids at t1 that said that word. Then for these testing kids, I subsetted their t1 vocabulary to only the words that kids in the training group knew at t1. For each kid in the test group, I then took their mean vocabulary centrality/density t1 and their number of trigrams at t2. Then I estimated the correlation between the mean centrality/density at t1 and trigrams at time 2 across kids for kids in the testing set. I repeated this NSAMPLES times and then bootstrapped means and CIs of this estimate.

all_types <- read_csv("../1_mtld_measure/data/target_types_for_MTLD_kids_600_900.csv")
trigram_outcomes <- read_csv("semantic_density_df.csv")
density_norms <-read_csv(RCurl::getURL("https://raw.githubusercontent.com/billdthompson/semantic-density-norms/master/results/en-semantic-densities-N100000.csv?token=AF32iXP7uN49YWb0EglCMjVLCP56VLAfks5bGBfqwA%3D%3D")) %>%
  rename(semantic_density = `semantic-density`, 
         centrality = `global-centrality`) %>%
  select(word:semantic_density)

nested_data_by_kid_t1 <- all_types %>%
  filter(tbin == "t1") %>%
  mutate(gloss_clean = tolower(gloss))   %>%
  group_by(target_child_id, gloss_clean) %>%
  summarize(count = sum(count)) %>%
  filter(count >= params$MINWORDSFORVOCAB)  %>%
  nest(-target_child_id)

nested_data_by_kid_t2 <- all_types %>%
  filter(tbin == "t2") %>%
  mutate(gloss_clean = tolower(gloss))   %>%
  group_by(target_child_id, gloss_clean) %>%
  summarize(count = sum(count)) %>%
  filter(count >= params$MINWORDSFORVOCAB) %>%
  nest(-target_child_id)

# function for cross-validation
get_corr_for_sample <- function(i, 
                                kid_t1_data,
                                outcome_data,
                                norms,
                                fraction_sampled,
                                this_measure){
  
   # training kids
   training_kids_words <- kid_t1_data  %>%
      sample_frac(fraction_sampled) %>%
      unnest() %>%
      select(-count) %>%
      mutate(this_measure = this_measure)
   
   training_kids_words_with_measure <- training_kids_words %>%
    count(gloss_clean) %>%
    left_join(norms, by = c("gloss_clean" = "word")) %>%
    filter(!is.na(semantic_density)) %>%
    mutate(norm_measure = case_when(this_measure == "centrality" ~ centrality,
                                    this_measure == "semantic_density" ~ semantic_density),
           weighted_measure = norm_measure*log(n)) %>%
    select(gloss_clean, norm_measure, weighted_measure)
  
     # testing kids
    testing_kids_with_measure <- kid_t1_data %>%
      filter(!(target_child_id %in% training_kids_words$target_child_id)) %>%
      unnest() %>%
      left_join(training_kids_words_with_measure) %>% # merge in density of words from kid 2
      group_by(target_child_id) %>%
      summarize(norm_measure_mean = mean(norm_measure, na.rm = T),
                weighted_measure_mean = mean(weighted_measure, na.rm = T))
  
    testing_kids_with_measure_mtld <- testing_kids_with_measure %>%
      left_join(outcome_data %>% select(target_child_id, log_num_trigrams_t2)) 
    
    data.frame(sample = i,
               norm_measure_mean_corr = cor(testing_kids_with_measure_mtld$log_num_trigrams_t2,
                                                    testing_kids_with_measure_mtld$norm_measure_mean)[[1]],
               weighted_measure_mean_corr = cor(testing_kids_with_measure_mtld$log_num_trigrams_t2,
                                                        testing_kids_with_measure_mtld$weighted_measure_mean)[[1]])
}

Correlation

Semantic Density

all_t1_words <- nested_data_by_kid_t1  %>%
              unnest() %>% 
              select(gloss_clean) %>%
              unlist(use.names = F)

density_norms_scaled <- density_norms %>%
  filter(word %in% all_t1_words) %>%
  mutate(centrality = scale(centrality),
         semantic_density = scale(semantic_density))

sampled_corrs <- map_df(c(1:params$NSAMPLES), 
                        get_corr_for_sample, 
                        nested_data_by_kid_t1, 
                        trigram_outcomes, 
                        density_norms,
                        params$TRAIN_FRACTION,
                        "semantic_density")

Distribution of correlations between mtld_t1

sampled_corrs %>%
  gather(corr_type, value, -1) %>%
  ggplot(aes(x = value, fill = corr_type)) +
  geom_density(alpha = .8) +
  geom_vline(aes(xintercept = 0), linetype = 2) +
  facet_wrap(~corr_type) +
  theme(legend.position = "none")

Bootsrap correlation coefficient across samples

boot_unweighted_corr  <- boot(sampled_corrs$norm_measure_mean_corr, 
          function(d, i) {mean(d[i])}, R = 1000)  

boot_weighted_corr  <- boot(sampled_corrs$weighted_measure_mean_corr, 
          function(d, i) {mean(d[i])}, R = 1000) 

estimate_df <- data.frame(corr_type = c("unweighted", "weighted"),
                          corr =  c(boot_unweighted_corr$t0, boot_weighted_corr$t0), 
                          ci_low = c(boot.ci(boot_unweighted_corr)$normal[2], boot.ci(boot_weighted_corr)$normal[2]), 
                          ci_high = c(boot.ci(boot_unweighted_corr)$normal[3], boot.ci(boot_weighted_corr)$normal[3]))

kable(estimate_df)

corr_type	corr	ci_low	ci_high
unweighted	0.0687711	0.0150879	0.1217618
weighted	-0.2094690	-0.2528513	-0.1653177

ggplot(estimate_df, aes(x = corr_type, y = corr, color = corr_type)) +
  geom_pointrange(aes(ymin = ci_low, ymax = ci_high), size = .3) +
  ylab("estimated correlation coefficient of\nvocabulary density and mtld at time 2") +
  geom_hline(aes(yintercept = 0), linetype = 2)+
  theme(legend.position = "none")

Centrality

sampled_corrs <- map_df(c(1:params$NSAMPLES), 
                        get_corr_for_sample, 
                        nested_data_by_kid_t1, 
                        trigram_outcomes, 
                        density_norms,
                        params$TRAIN_FRACTION,
                        "centrality")

Distribution of correlations between mtld_t1

sampled_corrs %>%
  gather(corr_type, value, -1) %>%
  ggplot(aes(x = value, fill = corr_type)) +
  geom_density(alpha = .8) +
  geom_vline(aes(xintercept = 0), linetype = 2) +
  facet_wrap(~corr_type) +
  theme(legend.position = "none")

Bootsrap correlation coefficient across samples

boot_unweighted_corr  <- boot(sampled_corrs$norm_measure_mean_corr, 
          function(d, i) {mean(d[i])}, R = 1000)  

boot_weighted_corr  <- boot(sampled_corrs$weighted_measure_mean_corr, 
          function(d, i) {mean(d[i])}, R = 1000) 

estimate_df <- data.frame(corr_type = c("unweighted", "weighted"),
                          corr =  c(boot_unweighted_corr$t0, boot_weighted_corr$t0), 
                          ci_low = c(boot.ci(boot_unweighted_corr)$normal[2], boot.ci(boot_weighted_corr)$normal[2]), 
                          ci_high = c(boot.ci(boot_unweighted_corr)$normal[3], boot.ci(boot_weighted_corr)$normal[3]))

kable(estimate_df)

corr_type	corr	ci_low	ci_high
unweighted	0.2300603	0.1770034	0.2836633
weighted	-0.5880123	-0.6163258	-0.5600064

ggplot(estimate_df, aes(x = corr_type, y = corr, color = corr_type)) +
  geom_pointrange(aes(ymin = ci_low, ymax = ci_high), size = .3) +
  ylab("estimated correlation coefficient of \nvocabulary centrality and mtld at time 2") +
  geom_hline(aes(yintercept = 0), linetype = 2)+
  theme(legend.position = "none")

Controlling for stuff

Number of words at t1 and t2, age diff, age t1, and log_num_trigrams_t1

# function for cross-validation
get_corr_for_sample2 <- function(i, 
                                kid_t1_data,
                                outcome_data,
                                norms,
                                fraction_sampled,
                                this_measure){
  
   # training kids
   training_kids_words <- kid_t1_data  %>%
      sample_frac(fraction_sampled) %>%
      unnest() %>%
      select(-count) %>%
      mutate(this_measure = this_measure)
   
   training_kids_words_with_measure <- training_kids_words %>%
    count(gloss_clean) %>%
    left_join(norms, by = c("gloss_clean" = "word")) %>%
    filter(!is.na(semantic_density)) %>%
    mutate(norm_measure = case_when(this_measure == "centrality" ~ centrality,
                                    this_measure == "semantic_density" ~ semantic_density),
           weighted_measure = norm_measure*log(n)) %>%
    select(gloss_clean, norm_measure, weighted_measure)
  
     # testing kids
    testing_kids_with_measure <- kid_t1_data %>%
      filter(!(target_child_id %in% training_kids_words$target_child_id)) %>%
      unnest() %>%
      left_join(training_kids_words_with_measure) %>% # merge in density of words from kid 2
      group_by(target_child_id) %>%
      summarize(norm_measure_mean = mean(norm_measure, na.rm = T),
                weighted_measure_mean = mean(weighted_measure, na.rm = T))
  
    testing_kids_with_measure_mtld <- testing_kids_with_measure %>%
      left_join(outcome_data) 
    
    mean_stat <- lm(log_num_trigrams_t2~ log_total_words_t1 + log_total_words_t2 + 
                      norm_measure_mean + age_diff + age_t1, testing_kids_with_measure_mtld) %>%
      summary() %>%
      tidy() %>%
      filter(term == "norm_measure_mean") %>%
      pull(statistic)
    
      weighted_mean_stat <- lm(log_num_trigrams_t2~ log_total_words_t1 + log_total_words_t2 + 
                                 weighted_measure_mean + age_diff + age_t1 + log_num_trigrams_t1 , testing_kids_with_measure_mtld) %>%
      summary() %>%
      tidy() %>%
      filter(term == "weighted_measure_mean") %>%
      pull(statistic)
    
    data.frame(sample = i,
               norm_measure_mean_stat = mean_stat,
               weighted_measure_mean_stat = weighted_mean_stat)
}

Semantic Density

sampled_stats <- map_df(c(1:params$NSAMPLES), 
                        get_corr_for_sample2, 
                        nested_data_by_kid_t1, 
                        trigram_outcomes, 
                        density_norms,
                        params$TRAIN_FRACTION,
                        "semantic_density")

Distribution of test statistcs of density as a predictor of number of trigrams at t2

sampled_stats %>%
  gather(corr_type, value, -1) %>%
  ggplot(aes(x = value, fill = corr_type)) +
  geom_density(alpha = .8) +
  geom_vline(aes(xintercept = 2), linetype = 2) +
  facet_wrap(~corr_type) +
  theme(legend.position = "none")

Bootsrap correlation statistic across samples

boot_unweighted_stat  <- boot(sampled_stats$norm_measure_mean_stat, 
          function(d, i) {mean(d[i])}, R = 1000)  

boot_weighted_stat  <- boot(sampled_stats$weighted_measure_mean_stat, 
          function(d, i) {mean(d[i])}, R = 1000) 

estimate_df <- data.frame(stat_type = c("unweighted", "weighted"),
                          stat =  c(boot_unweighted_stat$t0, boot_weighted_stat$t0), 
                          ci_low = c(boot.ci(boot_unweighted_stat)$normal[2], boot.ci(boot_weighted_stat)$normal[2]), 
                          ci_high = c(boot.ci(boot_unweighted_stat)$normal[3], boot.ci(boot_weighted_stat)$normal[3]))

kable(estimate_df)

stat_type	stat	ci_low	ci_high
unweighted	0.7947348	0.5744865	1.016373
weighted	0.9656777	0.7258297	1.197559

ggplot(estimate_df, aes(x = stat_type, y = stat, color = stat_type)) +
  geom_pointrange(aes(ymin = ci_low, ymax = ci_high), size = .3) +
  ylab("estimated statistic of density coefficient \npredicting num trigrams at t2") +
  geom_hline(aes(yintercept = 2), linetype = 2)+
  theme(legend.position = "none")

Centrality

sampled_stats <- map_df(c(1:params$NSAMPLES), 
                        get_corr_for_sample2, 
                        nested_data_by_kid_t1, 
                        trigram_outcomes, 
                        density_norms,
                        params$TRAIN_FRACTION,
                        "centrality")

Distribution of test statistcs of centrality as a predictor of number of trigrams at t2

sampled_stats %>%
  gather(corr_type, value, -1) %>%
  ggplot(aes(x = value, fill = corr_type)) +
  geom_density(alpha = .8) +
  geom_vline(aes(xintercept = 2), linetype = 2) +
  facet_wrap(~corr_type) +
  theme(legend.position = "none")

Bootsrap correlation statistic across samples

boot_unweighted_stat  <- boot(sampled_stats$norm_measure_mean_stat, 
          function(d, i) {mean(d[i])}, R = 1000)  

boot_weighted_stat  <- boot(sampled_stats$weighted_measure_mean_stat, 
          function(d, i) {mean(d[i])}, R = 1000) 

estimate_df <- data.frame(stat_type = c("unweighted", "weighted"),
                          stat =  c(boot_unweighted_stat$t0, boot_weighted_stat$t0), 
                          ci_low = c(boot.ci(boot_unweighted_stat)$normal[2], boot.ci(boot_weighted_stat)$normal[2]), 
                          ci_high = c(boot.ci(boot_unweighted_stat)$normal[3], boot.ci(boot_weighted_stat)$normal[3]))

kable(estimate_df)

stat_type	stat	ci_low	ci_high
unweighted	1.9208173	1.6230378	2.2178735
weighted	0.4980724	0.2448795	0.7544975

ggplot(estimate_df, aes(x = stat_type, y = stat, color = stat_type)) +
  geom_pointrange(aes(ymin = ci_low, ymax = ci_high), size = .3) +
  ylab("estimated statistic of centrality coefficient \npredicting num trigrams at t2") +
  geom_hline(aes(yintercept = 2), linetype = 2) +
  theme(legend.position = "none")

Semantic density analyses - cross validation, predicting trigrams at t2

min_vocab = 5

2018-05-30

Correlation

Semantic Density

Centrality

Controlling for stuff

Semantic Density

Centrality