library(knitr)

opts_chunk$set(echo = T, message = F, warning = F, 
               error = F, cache = F, tidy = F)

library(tidyverse)
library(langcog)
library(data.table)
library(feather)

theme_set(theme_classic(base_size = 10))
MINWORDSFORVOCAB <- 5

The min words for vocab here is 5.

Get df with all measures

Read in data

all_types <- read_csv("../1_mtld_measure/data/target_types_for_MTLD_kids_600_900.csv") 
word2vec_model <- read_csv("3_train_childes_model/childes_kid_adult_w2v.txt") %>%
  rename(target_word = word)
mlu_info <- read_csv("mlu_by_kid.csv")
trans_info <- read_csv("t1_transitional_probs_in_vocab_missing0.csv")
mcrae_info <- read_csv("mcrae_vocab_by_kid_t1.txt")
pos_info <- read_csv("prop_pos_by_kid_t1.csv")
kid_info <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/4_semantic_density/semantic_density_df.csv") %>%
  left_join(mlu_info) %>%
  left_join(trans_info) %>%
  left_join(mcrae_info)%>%
  left_join(pos_info) %>%
  mutate(corpus_id = as.factor(corpus_id))

freq <- read_tsv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/1_mtld_measure/data/control_variables/SUBTLEXus_corpus.txt") %>%
  rename(word = Word,
         log_freq = Lg10WF) %>%
  select(word, log_freq)

Get filtered version of types for each kid

types_clean <- all_types %>%
  filter(tbin == "t1") %>%
  mutate(gloss_clean = tolower(gloss))   %>%
  group_by(target_child_id, gloss_clean) %>%
  summarize(count = sum(count)) %>%
  filter(count >= MINWORDSFORVOCAB) %>%
  mutate(log_count = log(count)) %>%
  select(-count) %>%
  left_join(freq, by= c("gloss_clean" = "word")) %>%
  mutate(log_count_w1 = log_count,
         log_count_w2 = log_count,
         log_freq_w1 = log_freq, 
         log_freq_w2 = log_freq) %>%
  select(-log_count, -log_freq)

Get vocab measures by kids

get_vocab_measure_by_kid3 <- function(id, data, model){
  this_kids_model <- model %>%
    filter(target_word %in% data$gloss_clean)
  
  words_in_model <- data %>%
    filter(gloss_clean %in% this_kids_model$target_word)

   # get pairwise distances
   word_word_dists <- coop::cosine(t(this_kids_model[,-1])) 
   
     data.frame(target_child_id = id,
                mean_dist_t1 = mean(word_word_dists),
                median_dist_t1 = median(word_word_dists),
                var_dist_t1 =  ifelse(mean(var(word_word_dists)) == 0, NA,
                                       mean(var(word_word_dists))),
                n_t1 = nrow(word_word_dists),
                median_freq_t1 =  median(words_in_model$log_freq_w1, na.rm = T)) 
    
}

nested_data_by_kid <- nest(types_clean, -target_child_id)

vocab_measures <- map2_df(nested_data_by_kid$target_child_id, 
                          nested_data_by_kid$data, 
                          get_vocab_measure_by_kid3, 
                          word2vec_model)

Merge in other variables

vocab_df <- vocab_measures %>%
    filter(n_t1 > 2) %>%
    mutate_at(vars(mean_dist_t1, median_dist_t1, n_t1), log) %>%
    left_join(kid_info %>% select(target_child_id, 
                                  log_mtld_t1, 
                                  log_mtld_t2, 
                                  age_t1, age_diff , 
                                  log_transcript_length_t1, log_transcript_length_t2,  
                                  log_num_trigrams_t1, log_num_trigrams_t2,
                                  mlu_m_t1, mlu_m_t2, 
                                  target_child_sex, 
                                  mean_trans_prob_t1 , 
                                  prop_na_trans_t1,
                                  Num_Corred_Pairs_No_Tax, 
                                  Density_No_Tax,
                                  prop_noun_t1, 
                                  prop_verb_t1,
                                  corpus_id, collection_name))  %>%
  mutate(Density_No_Tax = ifelse(is.na(Density_No_Tax), 0, Density_No_Tax),
         Num_Corred_Pairs_No_Tax = ifelse(is.na(Num_Corred_Pairs_No_Tax), 0,
                                          Num_Corred_Pairs_No_Tax))
df_no_corrs <- vocab_df %>%
  select(-target_child_id, -target_child_sex, -corpus_id, -collection_name) 
  #filter_all(all_vars(!is.na(.)))

corr_mat <- cor(df_no_corrs, 
                use = "pairwise.complete.obs")

p.mat <- corrplot::cor.mtest(df_no_corrs, 
                  conf.level = .95,  
                  use = "pairwise.complete.obs")$p

cols <- rev(colorRampPalette(c("red", "white", "blue"))(100))

corrplot::corrplot(corr_mat, method = "color",  col = cols,
         type = "full", order = "hclust", number.cex = .7,
         addCoef.col = "black", insig = "blank",
         p.mat = p.mat, sig.level = .05, 
         tl.col = "black", tl.srt = 90,
         diag = FALSE)

Regressions

Predicting MTLD at t2

lme4::lmer(scale(log_mtld_t2) ~ 
                  scale(median_dist_t1)  + 
                  #scale(var_dist_t1) +
                  scale(Num_Corred_Pairs_No_Tax) +
                  scale(age_t1) + 
                  scale(age_diff) + 
                  scale(log_mtld_t1) +
                  scale(prop_noun_t1) +
                  #target_child_sex +                  
                  scale(mlu_m_t1)+ 
                  scale(median_freq_t1) +
                  scale(log_transcript_length_t1) + 
                scale(log_transcript_length_t2) + (1|corpus_id),
   data = vocab_df) %>%
  summary()
## Linear mixed model fit by REML ['lmerMod']
## Formula: 
## scale(log_mtld_t2) ~ scale(median_dist_t1) + scale(Num_Corred_Pairs_No_Tax) +  
##     scale(age_t1) + scale(age_diff) + scale(log_mtld_t1) + scale(prop_noun_t1) +  
##     scale(mlu_m_t1) + scale(median_freq_t1) + scale(log_transcript_length_t1) +  
##     scale(log_transcript_length_t2) + (1 | corpus_id)
##    Data: vocab_df
## 
## REML criterion at convergence: 124.2
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -2.23905 -0.65873  0.02381  0.52434  2.78393 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  corpus_id (Intercept) 0.03748  0.1936  
##  Residual              0.22049  0.4696  
## Number of obs: 72, groups:  corpus_id, 20
## 
## Fixed effects:
##                                  Estimate Std. Error t value
## (Intercept)                     -0.005949   0.093468  -0.064
## scale(median_dist_t1)            0.181153   0.070643   2.564
## scale(Num_Corred_Pairs_No_Tax)   0.178496   0.073247   2.437
## scale(age_t1)                   -0.576090   0.207228  -2.780
## scale(age_diff)                  0.037736   0.206346   0.183
## scale(log_mtld_t1)               0.921051   0.087638  10.510
## scale(prop_noun_t1)              0.033994   0.083522   0.407
## scale(mlu_m_t1)                 -0.148997   0.101315  -1.471
## scale(median_freq_t1)           -0.070956   0.104437  -0.679
## scale(log_transcript_length_t1)  0.021193   0.196265   0.108
## scale(log_transcript_length_t2)  0.262450   0.197004   1.332
## 
## Correlation of Fixed Effects:
##              (Intr) scl(mdn_d_1) s(N_C_ sc(_1) scl(_) scl(l__1) scl(p__1)
## scl(mdn_d_1) -0.046                                                      
## s(N_C_P_N_T  -0.009  0.188                                               
## scale(g_t1)   0.205 -0.093       -0.016                                  
## scal(g_dff)   0.180 -0.055        0.010  0.876                           
## scl(lg_m_1)  -0.027  0.087       -0.080 -0.065  0.101                    
## scl(prp__1)   0.091 -0.067       -0.363  0.163  0.149  0.109             
## scl(ml_m_1)  -0.065 -0.204       -0.038  0.059 -0.112 -0.529    -0.136   
## scl(mdn_f_1)  0.047 -0.347       -0.073 -0.110 -0.122 -0.192     0.464   
## scl(lg___1)   0.173  0.189       -0.027  0.229  0.348  0.122     0.187   
## scl(lg___2)  -0.328 -0.174       -0.082 -0.416 -0.355 -0.015    -0.158   
##              scl(ml__1) scl(mdn_f_1) s(___1
## scl(mdn_d_1)                               
## s(N_C_P_N_T                                
## scale(g_t1)                                
## scal(g_dff)                                
## scl(lg_m_1)                                
## scl(prp__1)                                
## scl(ml_m_1)                                
## scl(mdn_f_1) -0.140                        
## scl(lg___1)  -0.533      0.109             
## scl(lg___2)   0.159      0.160       -0.750

Predicting number of trigrams at t2

lme4::lmer(scale(log_num_trigrams_t2) ~    scale(median_dist_t1)  + 
                # scale(var_dist_t1)+
                  scale(Num_Corred_Pairs_No_Tax) +
                  #scale(Density_No_Tax) +
                  #scale(mean_trans_prob_t1) +
                  #scale(-prop_na_trans_t1) +
                  scale(age_t1) +
                  scale(age_diff) + 
                  scale(log_num_trigrams_t1) +
                  scale(prop_noun_t1) +
                  #target_child_sex +                  
                  scale(mlu_m_t1)+ 
                  scale(median_freq_t1) +
                  scale(log_transcript_length_t1) + 
     scale(log_transcript_length_t2)+ (1|corpus_id),
   data = vocab_df) %>%
  summary()
## Linear mixed model fit by REML ['lmerMod']
## Formula: 
## scale(log_num_trigrams_t2) ~ scale(median_dist_t1) + scale(Num_Corred_Pairs_No_Tax) +  
##     scale(age_t1) + scale(age_diff) + scale(log_num_trigrams_t1) +  
##     scale(prop_noun_t1) + scale(mlu_m_t1) + scale(median_freq_t1) +  
##     scale(log_transcript_length_t1) + scale(log_transcript_length_t2) +  
##     (1 | corpus_id)
##    Data: vocab_df
## 
## REML criterion at convergence: 31.5
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.97344 -0.66564 -0.02972  0.54788  1.78412 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  corpus_id (Intercept) 0.04133  0.2033  
##  Residual              0.04098  0.2024  
## Number of obs: 72, groups:  corpus_id, 20
## 
## Fixed effects:
##                                 Estimate Std. Error t value
## (Intercept)                      0.06333    0.06913   0.916
## scale(median_dist_t1)            0.03943    0.03178   1.241
## scale(Num_Corred_Pairs_No_Tax)   0.07135    0.03306   2.158
## scale(age_t1)                    0.02426    0.10931   0.222
## scale(age_diff)                  0.23735    0.10486   2.264
## scale(log_num_trigrams_t1)       0.93501    0.09099  10.276
## scale(prop_noun_t1)              0.16975    0.03713   4.572
## scale(mlu_m_t1)                  0.01527    0.04164   0.367
## scale(median_freq_t1)            0.07087    0.04717   1.503
## scale(log_transcript_length_t1) -0.59953    0.13118  -4.570
## scale(log_transcript_length_t2)  0.76921    0.11156   6.895
## 
## Correlation of Fixed Effects:
##              (Intr) scl(mdn_d_1) s(N_C_ sc(_1) scl(_) scl(lg_n__1)
## scl(mdn_d_1) -0.099                                               
## s(N_C_P_N_T  -0.011  0.213                                        
## scale(g_t1)   0.220 -0.100       -0.015                           
## scal(g_dff)   0.242 -0.097       -0.017  0.863                    
## scl(lg_n__1)  0.092 -0.114       -0.169  0.072  0.136             
## scl(prp__1)   0.135 -0.107       -0.381  0.187  0.169  0.105      
## scl(ml_m_1)  -0.085 -0.149       -0.016 -0.018 -0.083 -0.320      
## scl(mdn_f_1)  0.062 -0.324       -0.058 -0.062 -0.088 -0.205      
## scl(lg_t__1)  0.064  0.138        0.087  0.179  0.146 -0.636      
## scl(lg___2)  -0.378 -0.040       -0.092 -0.432 -0.321  0.055      
##              scl(p__1) scl(ml__1) scl(mdn_f_1) scl(lg_t__1)
## scl(mdn_d_1)                                               
## s(N_C_P_N_T                                                
## scale(g_t1)                                                
## scal(g_dff)                                                
## scl(lg_n__1)                                               
## scl(prp__1)                                                
## scl(ml_m_1)  -0.143                                        
## scl(mdn_f_1)  0.467    -0.193                              
## scl(lg_t__1)  0.093    -0.185      0.272                   
## scl(lg___2)  -0.192     0.145      0.004       -0.615
get_vocab_measure_by_kid2 <- function(id, data, model){
  this_kids_model <- model %>%
    filter(target_word %in% data$gloss_clean)
  
  words_in_model <- data %>%
    filter(gloss_clean %in% this_kids_model$target_word)

   # get pairwise distances
   word_word_dists <- coop::cosine(t(this_kids_model[,-1])) 
   
   wide_word_word_dists <- word_word_dists %>%
        as.data.frame()  %>%
        mutate(word1 =  words_in_model$gloss_clean) %>%
        select(word1, everything())

    names(wide_word_word_dists)  = c("word1", words_in_model$gloss_clean)

    long_word_word_dists_wiki <- wide_word_word_dists %>%
      gather("word2", "cos_dist_wiki", -word1) %>%
      select(word1, word2, everything())  
    
    # merge in word counts and frequency infor for each word
    word_dists_with_meta <- long_word_word_dists_wiki %>%
      left_join(data %>% select(-log_count_w2, -log_freq_w2), 
                by = c("word1" = "gloss_clean" )) %>%
      left_join(data %>% select(-log_count_w1, -log_freq_w1), 
                by = c("word2" = "gloss_clean")) %>%
      rowwise() %>%
      mutate(mean_log_count = mean(c(log_count_w1, log_count_w2), na.rm = T),
             mean_log_freq = mean(c(log_freq_w1, log_freq_w2), na.rm = T)) %>%
      select(word1, word2, cos_dist_wiki, mean_log_count, mean_log_freq)
    
    try({
      model <- lm(cos_dist_wiki~ mean_log_count + mean_log_freq,
                  data = word_dists_with_meta)
      
      dists_with_resids <- word_dists_with_meta %>%
        add_residuals(model, var = "resid_cos_dist_wiki") %>%
        select(-mean_log_count, -mean_log_freq) %>%
        select(-word2) %>%
        group_by(word1) %>%
        summarize_all(mean, na.rm = T)
      
        df_return <- data.frame(target_child_id = id,
                mean_dist_t1 = mean(dists_with_resids$cos_dist_wiki),
                median_dist_t1 = median(dists_with_resids$cos_dist_wiki),
                var_dist_t1 =  ifelse(mean(var(dists_with_resids$cos_dist_wiki)) == 0, NA,
                                       mean(var(dists_with_resids$cos_dist_wiki))),
                mean_r_dist_t1 = mean(dists_with_resids$resid_cos_dist_wiki),
                median_r_dist_t1 = median(dists_with_resids$resid_cos_dist_wiki),
                var_r_dist_t1 =  ifelse(mean(var(dists_with_resids$resid_cos_dist_wiki)) == 0, NA,
                                       mean(var(dists_with_resids$resid_cos_dist_wiki))),
                n_t1 = nrow(dists_with_resids)) 
     
    }, {

      dists_with_resids <- word_dists_with_meta %>%
        select(-mean_log_count, -mean_log_freq) %>%
        select(-word2) %>%
        group_by(word1) %>%
        summarize_all(mean, na.rm = T)
    
         df_return <- data.frame(target_child_id = id,
              mean_dist_t1 = mean(dists_with_resids$cos_dist_wiki),
              median_dist_t1 = median(dists_with_resids$cos_dist_wiki),
              var_dist_t1 =  ifelse(mean(var(dists_with_resids$cos_dist_wiki)) == 0, NA,
                                    mean(var(dists_with_resids$cos_dist_wiki))),
              mean_r_dist_t1 = NA,
              median_r_dist_t1 = NA,
              var_r_dist_t1 =  NA,
              n_t1 = nrow(dists_with_resids)) 
    })
    df_return
}

get_vocab_measure_by_kid1 <- function(id, data, model){
  this_kids_model <- model %>%
    filter(target_word %in% data$gloss_clean)
  
  words_in_model <- data %>%
    filter(gloss_clean %in% this_kids_model$target_word)

   # get pairwise distances
   word_word_dists <- coop::cosine(t(this_kids_model[,-1]))  %>%
     rowMeans()
   
     data.frame(target_child_id = id,
                mean_dist_t1 = mean(word_word_dists),
                median_dist_t1 = median(word_word_dists),
                var_dist_t1 =  ifelse(mean(var(word_word_dists)) == 0, NA,
                                       mean(var(word_word_dists))),
                n_t1 = length(word_word_dists),
                median_freq =  median(words_in_model$log_freq_w1, na.rm = T)) 
    
}


vocab_df <- vocab_measures %>%
    mutate_at(vars(mean_dist_t1, median_dist_t1, var_dist_t1,
                   mean_r_dist_t1, median_r_dist_t1, var_r_dist_t1, n_t1),
              function(x) {log(x + 10)}) %>%
    left_join(kid_info %>% select(target_child_id, log_mtld_t1, 
                                          log_mtld_t2, age_t1, age_t2, mtld_diff, age_diff,
                                  mean_log_word_freq_t1, log_transcript_length_t1, log_transcript_length_t2, log_num_trigrams_t2,
                                  log_num_trigrams_t1)) %>%
  mutate_if(is.numeric, scale) # scale everything for regressions 

vocab_df %>%
  select(-target_child_id) %>%
  gather("var", "value") %>%
  ggplot(aes( x= value)) +
  geom_histogram(scales = "free") +
  facet_wrap(~var)