Semantic density analyses - cross validation

Correlation
- MTLD

library(knitr)
library(tidyverse)
library(boot)
library(broom)

opts_chunk$set(echo = T, message = F, warning = F, 
               error = F, cache = T, tidy = F)

theme_set(theme_classic(base_size = 10))

#params = list(MINWORDSFORVOCAB = 1, NSAMPLES = 5, TRAIN_FRACTION = .8)

all_types <- read_csv("../1_mtld_measure/data/target_types_for_MTLD_kids_600_900.csv")
kid_data <- read_csv("semantic_density_df.csv")

nested_data_by_kid_t1 <- all_types %>%
  filter(tbin == "t1") %>%
  mutate(gloss_clean = tolower(gloss))   %>%
  group_by(target_child_id, gloss_clean) %>%
  summarize(count = sum(count)) %>%
  filter(count >= params$MINWORDSFORVOCAB)  %>%
  nest(-target_child_id)

nested_data_by_kid_t2 <- all_types %>%
  filter(tbin == "t2") %>%
  mutate(gloss_clean = tolower(gloss))   %>%
  group_by(target_child_id, gloss_clean) %>%
  summarize(count = sum(count)) %>%
  filter(count >= params$MINWORDSFORVOCAB) %>%
  nest(-target_child_id)

# gets beta for single word
get_beta <- function(target_word, df){
  current_df <- df %>%
    mutate(target_word_present = unlist(map(data,function(x){target_word %in% x$gloss_clean})))
  
    lm(log_mtld_t2 ~ target_word_present + age_t1 + age_diff + log_mtld_t1, data = current_df) %>%
      tidy() %>%
      filter(term == "target_word_presentTRUE") %>%
      select(-term) %>%
      mutate(outcome = "log_mtld_t2",
             gloss_clean = target_word)
}

# for all words get beta
get_by_word_measure <- function(df){
  targ_words <- df %>%
    unnest() %>%
    count(gloss_clean) %>%
    filter(n > 5)
  
  word_beta <- map_df(targ_words$gloss_clean, get_beta, df) 

}

# wrapper function for cross-validation
get_corr_for_sample <- function(i, 
                                kid_t1_data,
                                kid_meta_data,
                                fraction_sampled){
    print(i)
  
   # training kids
   training_kids <- kid_t1_data  %>%
      sample_frac(fraction_sampled) %>%
      left_join(kid_meta_data)
   
   by_words_measure <- get_by_word_measure(training_kids) %>%
     as_data_frame %>%
     mutate(scaled_betas = scale(estimate) %>% as.vector) # scale betas

     # testing kids
    testing_kids_with_measure <- kid_t1_data %>%
      filter(!(target_child_id %in% training_kids$target_child_id)) %>%
      unnest() %>%
      left_join(by_words_measure) %>% # merge in density of words from kid 2
      group_by(target_child_id) %>%
      summarize(mean_beta = mean(scaled_betas, na.rm = T))
  
    testing_kids_with_measure_mtld <- testing_kids_with_measure %>%
      left_join(kid_meta_data %>% select(target_child_id, log_mtld_t2)) 
    
    list(sample = i, 
        measure_mean_corr = cor(testing_kids_with_measure_mtld$log_mtld_t2,
                                       testing_kids_with_measure_mtld$mean_beta)[[1]],
          beta_df = by_words_measure)
}

Correlation

MTLD

kid_data_tidy <- kid_data %>%
  select(target_child_id, log_total_words_t1, log_total_words_t2, 
         age_t1, age_t2, log_mtld_t1, log_mtld_t2, age_diff, 
         log_num_trigrams_t1, log_num_trigrams_t2)

sampled_corrs <- map(c(1:params$NSAMPLES), 
                        get_corr_for_sample, 
                        nested_data_by_kid_t1, 
                        kid_data_tidy, 
                        params$TRAIN_FRACTION)

## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
## [1] 11
## [1] 12
## [1] 13
## [1] 14
## [1] 15
## [1] 16
## [1] 17
## [1] 18
## [1] 19
## [1] 20
## [1] 21
## [1] 22
## [1] 23
## [1] 24
## [1] 25
## [1] 26
## [1] 27
## [1] 28
## [1] 29
## [1] 30
## [1] 31
## [1] 32
## [1] 33
## [1] 34
## [1] 35
## [1] 36
## [1] 37
## [1] 38
## [1] 39
## [1] 40
## [1] 41
## [1] 42
## [1] 43
## [1] 44
## [1] 45
## [1] 46
## [1] 47
## [1] 48
## [1] 49
## [1] 50
## [1] 51
## [1] 52
## [1] 53
## [1] 54
## [1] 55
## [1] 56
## [1] 57
## [1] 58
## [1] 59
## [1] 60
## [1] 61
## [1] 62
## [1] 63
## [1] 64
## [1] 65
## [1] 66
## [1] 67
## [1] 68
## [1] 69
## [1] 70
## [1] 71
## [1] 72
## [1] 73
## [1] 74
## [1] 75
## [1] 76
## [1] 77
## [1] 78
## [1] 79
## [1] 80
## [1] 81
## [1] 82
## [1] 83
## [1] 84
## [1] 85
## [1] 86
## [1] 87
## [1] 88
## [1] 89
## [1] 90
## [1] 91
## [1] 92
## [1] 93
## [1] 94
## [1] 95
## [1] 96
## [1] 97
## [1] 98
## [1] 99
## [1] 100

m = map(sampled_corrs, ~ .x$beta_df) %>%
  bind_rows() %>%
  group_by(gloss_clean) %>%
  summarize(scaled_betas = mean(scaled_betas)) %>%
  arrange(-scaled_betas)

corrs2 <- map(sampled_corrs, ~ .x$measure_mean_corr) %>%
  unlist()

ggplot(data.frame(corrs = corrs2), aes(x = corrs2)) +
  geom_density() +
  geom_vline(aes(xintercept = 0), linetype = 2) +
  theme(legend.position = "none")

Bootsrap correlation coefficient across samples

boot_corr  <- boot(corrs2, 
          function(d, i) {mean(d[i])}, R = 1000) 

estimate_df <- data.frame(
                          corr =   boot_corr$t0, 
                          ci_low = boot.ci(boot_corr)$normal[2],
                          ci_high = boot.ci(boot_corr)$normal[3])

kable(estimate_df)

corr	ci_low	ci_high
0.3096289	0.2726005	0.3461848

ggplot(estimate_df, aes(x = 1, y = corr)) +
  geom_pointrange(aes(ymin = ci_low, ymax = ci_high),size = .8) +
  ylab("estimated correlation coefficient of\nvocabulary density and mtld at time 2") +
  geom_hline(aes(yintercept = 0), linetype = 2)+
  theme(legend.position = "none")

Semantic density analyses - cross validation

2018-06-06

Correlation

MTLD