library(knitr)

# Global knitr chunk options: show the code of every chunk, hide messages and
# warnings, and turn off caching and automatic code tidying.
opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE,
               error = FALSE, cache = FALSE, tidy = FALSE)

library(tidyverse)
library(langcog)
library(data.table)
library(feather)

# Use the classic ggplot2 theme with a base font size of 10 for all plots.
theme_set(theme_classic(base_size = 10))
# Minimum summed count a word needs at t1 to be kept in a child's vocabulary
# (applied in the `count >= MINWORDSFORVOCAB` filter when building types_clean).
MINWORDSFORVOCAB <- 5

A word must occur at least 5 times in a child's speech at t1 to count as part of that child's vocabulary.

# ---- Input tables ----
# Word types per child and time bin (the code below keeps only tbin == "t1").
all_types <- read_csv(
  "../1_mtld_measure/data/target_types_for_MTLD_kids_600_900.csv"
)
mlu_info <- read_csv("mlu_by_kid.csv")
trans_info <- read_csv("t1_transitional_probs_in_vocab_missing0.csv")
# `%_Corred_Pairs_No_Tax` is not a syntactic R name, so give it a plain one.
mcrae_info <- rename(
  read_csv("mcrae_vocab_by_kid_t1.txt"),
  Per_Corred_Pairs_No_Tax = `%_Corred_Pairs_No_Tax`
)
pos_info <- read_csv("prop_pos_by_kid_t1.csv")
conc_info <- read_csv("conc_by_kid_t1.csv")

# Start from the semantic-density table and left-join each kid-level table onto
# it in turn. No `by` is supplied, so every join matches on whatever column
# names the two tables have in common.
# NOTE(review): the starting table is read from an absolute, machine-specific
# path, unlike the relative paths above - confirm and make it relative.
kid_info <- list(mlu_info, trans_info, mcrae_info, pos_info, conc_info) %>%
  purrr::reduce(
    left_join,
    .init = read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/4_semantic_density/semantic_density_df.csv")
  ) %>%
  mutate(corpus_id = as.factor(corpus_id))

# Disabled alternative frequency source: the SUBTLEXus corpus file, with its
# `Word` and `Lg10WF` columns renamed to `word` and `log_freq`.
#freq <- read_tsv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/1_mtld_measure/data/control_variables/SUBTLEXus_corpus.txt") %>%
 # rename(word = Word,
 #        log_freq = Lg10WF) %>%
 # select(word, log_freq)

# Active frequency source. The code below joins this table on `word` and reads
# a `log_freq` column from it, so the file must provide both.
freq <- read_csv("childes_adult_word_freq.csv")

Embedding models

# Embedding table read from the CHILDES model directory; its `word` column is
# renamed so both models expose the word label as `target_word`.
word2vec_model_childes <- rename(
  read_csv("3_train_childes_model/childes_adult_w2v.txt"),
  target_word = word
)
# Second embedding table, used below as the "wiki" model.
# NOTE(review): the file name says fast_text while the variable says word2vec,
# and no rename is applied, so the file presumably already has a `target_word`
# column - confirm both.
word2vec_model_wiki <- read_feather("fast_text_childes_words_600_900.feather")

Get filtered version of types for each kid

# Build each child's t1 vocabulary:
#   1. keep t1 rows and sum counts per child and lower-cased word,
#   2. drop words produced fewer than MINWORDSFORVOCAB times,
#   3. attach the adult frequency of each word.
# The result stays grouped by target_child_id (summarize() drops only the last
# grouping level).
types_clean <- all_types %>%
  filter(tbin == "t1") %>%
  group_by(target_child_id, gloss_clean = tolower(gloss)) %>%
  summarize(count = sum(count)) %>%
  filter(count >= MINWORDSFORVOCAB) %>%
  transmute(gloss_clean, log_count = log(count)) %>%
  left_join(freq, by = c("gloss_clean" = "word")) %>%
  # The log count and log frequency are each stored twice, under _w1 and _w2
  # names, and the unsuffixed originals are then dropped.
  mutate(
    log_count_w1 = log_count,
    log_count_w2 = log_count,
    log_freq_w1 = log_freq,
    log_freq_w2 = log_freq
  ) %>%
  select(-log_count, -log_freq)
# Concreteness ratings: keep only the word and its mean rating (Conc.M).
# NOTE(review): absolute, machine-specific path - confirm and make it relative.
conc <- read_csv("/Users/mollylewis/Documents/research/Projects/2_published/ref_complex/corpus/brysbaert_database/brysbaert_corpus.csv") %>%
  select(Word, Conc.M)

# Attach the concreteness rating once; both the median computation and the
# final filter below work from this joined table.
types_clean <- types_clean %>%
  left_join(conc, by = c("gloss_clean" = "Word"))

# One row per word type, so the median is taken over distinct words rather
# than over child-by-word rows.
distinct_types <- types_clean %>%
  ungroup() %>%
  distinct(gloss_clean, .keep_all = TRUE)

median_conc <- median(distinct_types$Conc.M, na.rm = TRUE)

# Keep only words at or below the median concreteness. filter() also drops
# rows whose Conc.M is NA, i.e. words with no concreteness rating.
types_clean <- types_clean %>%
  filter(Conc.M <= median_conc)

Get distance measure by kid at t1 in both wikipedia model and adult childes model

# Summarise the pairwise embedding similarity of one child's vocabulary.
#
# Args:
#   id:    child identifier; copied into `target_child_id` of the result.
#   data:  that child's word table; must contain `gloss_clean` and
#          `log_freq_w1`.
#   model: embedding table with a `target_word` column; every other column is
#          treated as an embedding dimension.
#
# Returns:
#   A one-row data.frame with target_child_id, mean_dist_t1, median_dist_t1,
#   var_dist_t1, n_t1 (number of the child's words found in the model) and
#   median_freq_t1. If none of the child's words are in the model, the
#   measures are NA and n_t1 is 0.
get_vocab_measure_by_kid3 <- function(id, data, model) {
  # Embedding rows for the words this child produced.
  this_kids_model <- model %>%
    filter(target_word %in% data$gloss_clean)

  # The child's words that actually have an embedding.
  words_in_model <- data %>%
    filter(gloss_clean %in% this_kids_model$target_word)

  # No overlap between vocabulary and model: nothing to compare.
  if (nrow(this_kids_model) == 0) {
    return(data.frame(target_child_id = id,
                      mean_dist_t1 = NA_real_,
                      median_dist_t1 = NA_real_,
                      var_dist_t1 = NA_real_,
                      n_t1 = 0L,
                      median_freq_t1 = NA_real_))
  }

  # Drop the word label by name rather than by position, so the result does
  # not depend on where `target_word` sits in the model table.
  word_vectors <- as.matrix(select(this_kids_model, -target_word))

  # coop::cosine() compares the columns of a matrix, hence the transpose to
  # words-in-columns. The result is a words x words cosine *similarity* matrix;
  # the summaries below run over the whole matrix, diagonal included.
  word_word_dists <- coop::cosine(t(word_vectors))

  # var() on a matrix gives the covariance matrix of its columns; its mean is
  # used as a single spread measure. Exactly zero (or undefined) becomes NA.
  mean_var <- mean(var(word_word_dists))
  var_dist <- if (is.na(mean_var) || mean_var == 0) NA_real_ else mean_var

  data.frame(target_child_id = id,
             mean_dist_t1 = mean(word_word_dists),
             median_dist_t1 = median(word_word_dists),
             var_dist_t1 = var_dist,
             n_t1 = nrow(word_word_dists),
             # NOTE(review): named "median" but computed with mean(); kept
             # as-is because downstream results depend on it - confirm intent.
             median_freq_t1 = mean(words_in_model$log_freq_w1, na.rm = TRUE))
}

# One row per child, with that child's word table in the `data` list-column.
nested_data_by_kid <- nest(types_clean, -target_child_id)

# Run get_vocab_measure_by_kid3() for every child against one embedding model
# and append `suffix` to every measure column (all columns except the id).
measure_vocab_by_model <- function(nested, model, suffix) {
  measures <- map2_df(nested$target_child_id,
                      nested$data,
                      get_vocab_measure_by_kid3,
                      model)
  is_measure <- names(measures) != "target_child_id"
  names(measures)[is_measure] <- paste0(names(measures)[is_measure], suffix)
  measures
}

# Same measures computed once per embedding model; columns end in _wiki or
# _childes respectively.
vocab_measures_wiki <- measure_vocab_by_model(
  nested_data_by_kid, word2vec_model_wiki, "_wiki"
)

vocab_measures_childes <- measure_vocab_by_model(
  nested_data_by_kid, word2vec_model_childes, "_childes"
)

# Combine the two per-model tables, one row per child. The child id is the
# only column the two tables share, so it is named explicitly as the key.
# The mean measures are log-transformed and the raw means dropped.
vocab_measures <- left_join(vocab_measures_wiki, vocab_measures_childes,
                            by = "target_child_id") %>%
  mutate(mean_dist_t1_wiki_log = log(mean_dist_t1_wiki),
         mean_dist_t1_childes_log = log(mean_dist_t1_childes)) %>%
  select(-mean_dist_t1_wiki, -mean_dist_t1_childes)

Mean distance in childes vs. mean distance in wiki model at t1

# Scatter plot of each child's log mean measure in the CHILDES model (x)
# against the wiki model (y), with a fitted linear trend line.
ggplot(vocab_measures, aes(x = mean_dist_t1_childes_log, y = mean_dist_t1_wiki_log)) +
  geom_point() +
  geom_smooth(method = "lm")

# Pearson correlation test between the two log mean measures; the result is
# stored and then printed.
embedding_corr  <- cor.test(vocab_measures$mean_dist_t1_childes_log, vocab_measures$mean_dist_t1_wiki_log)
embedding_corr
## 
##  Pearson's product-moment correlation
## 
## data:  vocab_measures$mean_dist_t1_childes_log and vocab_measures$mean_dist_t1_wiki_log
## t = 5.0439, df = 83, p-value = 2.641e-06
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3024613 0.6322227
## sample estimates:
##       cor 
## 0.4843625

The two measures are significantly, though only moderately, correlated across the two embedding models (r = 0.48, p < .001).

Other kid-level variables at t1

Get merged dataframe

# Merge the embedding measures with the kid-level covariates used in the
# models below. The child id is the only shared column, so it is the join key.
vocab_df <- vocab_measures %>%
  filter(n_t1_wiki > 2) %>% # can't look at median distance when there's only two words
  left_join(
    kid_info %>%
      select(target_child_id,
             log_mtld_t1, log_mtld_t2,
             age_t1, age_diff,
             log_transcript_length_t1, log_transcript_length_t2,
             log_num_trigrams_t1, log_num_trigrams_t2,
             mlu_m_t1, mlu_m_t2,
             target_child_sex,
             conc_t1,
             # n_clusters,
             # mean_trans_prob_t1,
             # prop_na_trans_t1,
             Per_Corred_Pairs_No_Tax,
             Num_Corred_Pairs_No_Tax,
             prop_noun_t1,
             prop_verb_t1,
             corpus_id, collection_name,
             age_t2),
    by = "target_child_id"
  )

descriptive stats

Correlation between variables

# Drop the id and the categorical columns so only the remaining measures are
# correlated.
df_no_corrs <- select(
  vocab_df,
  -c(target_child_id, target_child_sex, corpus_id, collection_name)
)
  #filter_all(all_vars(!is.na(.)))

# Pairwise correlations, using every complete pair of observations.
corr_mat <- cor(df_no_corrs, use = "pairwise.complete.obs")

# Matching matrix of p-values, used to blank non-significant cells.
p.mat <- corrplot::cor.mtest(
  df_no_corrs,
  conf.level = .95,
  use = "pairwise.complete.obs"
)[["p"]]

# 100-step red-white-blue ramp, reversed.
cols <- rev(colorRampPalette(c("red", "white", "blue"))(100))

# Coloured correlation matrix with coefficients printed in black; cells with
# p above .05 are left blank and the diagonal is hidden.
corrplot::corrplot(
  corr_mat,
  method = "color", col = cols,
  type = "full", order = "original", #order = "hclust"
  diag = FALSE,
  addCoef.col = "black", number.cex = .7,
  p.mat = p.mat, sig.level = .05, insig = "blank",
  tl.col = "black", tl.srt = 90
)

#age_t1 
#log_transcript_length_t1
#mlu_m_t1*
#mean_trans_prob_t1
#prop_noun_t1
#median_freq_t1_wiki

Predicting t2 measures

Wikipedia

MTLD

# Mixed model predicting log MTLD at t2 from the wiki-model measure and the t1
# covariates, with a random intercept per corpus. All variables are z-scored
# with scale(); rows missing Num_Corred_Pairs_No_Tax are removed beforehand.
summary(
  lme4::lmer(
    scale(log_mtld_t2) ~
      scale(mean_dist_t1_wiki_log) +
      scale(Num_Corred_Pairs_No_Tax) +
      scale(age_t1) +
      scale(prop_noun_t1) +
      scale(median_freq_t1_wiki) +
      scale(log_mtld_t1) +
      scale(mlu_m_t1) +
      scale(log_transcript_length_t1) +
      (1 | corpus_id),
    data = filter(vocab_df, !is.na(Num_Corred_Pairs_No_Tax))
  )
)
## Linear mixed model fit by REML ['lmerMod']
## Formula: 
## scale(log_mtld_t2) ~ scale(mean_dist_t1_wiki_log) + scale(Num_Corred_Pairs_No_Tax) +  
##     scale(age_t1) + scale(prop_noun_t1) + scale(median_freq_t1_wiki) +  
##     scale(log_mtld_t1) + scale(mlu_m_t1) + scale(log_transcript_length_t1) +  
##     (1 | corpus_id)
##    Data: filter(vocab_df, !is.na(Num_Corred_Pairs_No_Tax))
## 
## REML criterion at convergence: 85.3
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -2.03026 -0.56609  0.03706  0.43285  2.65390 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  corpus_id (Intercept) 0.05095  0.2257  
##  Residual              0.17907  0.4232  
## Number of obs: 52, groups:  corpus_id, 19
## 
## Fixed effects:
##                                  Estimate Std. Error t value
## (Intercept)                      0.085367   0.091519   0.933
## scale(mean_dist_t1_wiki_log)    -0.078080   0.158004  -0.494
## scale(Num_Corred_Pairs_No_Tax)   0.209218   0.074875   2.794
## scale(age_t1)                   -0.253571   0.083444  -3.039
## scale(prop_noun_t1)              0.140604   0.089169   1.577
## scale(median_freq_t1_wiki)       0.105907   0.107449   0.986
## scale(log_mtld_t1)               0.862979   0.091664   9.415
## scale(mlu_m_t1)                 -0.009338   0.104150  -0.090
## scale(log_transcript_length_t1)  0.080539   0.169678   0.475
## 
## Correlation of Fixed Effects:
##             (Intr) s(__1__ s(N_C_ sc(_1) scl(p__1) s(__1_) scl(l__1)
## scl(m__1__)  0.035                                                  
## s(N_C_P_N_T  0.070  0.488                                           
## scale(g_t1)  0.038  0.143   0.004                                   
## scl(prp__1) -0.046 -0.266  -0.417  0.026                            
## scl(md__1_) -0.038 -0.363  -0.318 -0.079  0.470                     
## scl(lg_m_1)  0.025 -0.230  -0.173 -0.163  0.161    -0.013           
## scl(ml_m_1) -0.023  0.127   0.050  0.319  0.095    -0.124  -0.566   
## scl(lg___1) -0.049  0.624   0.279 -0.171 -0.219     0.149  -0.058   
##             scl(m__1)
## scl(m__1__)          
## s(N_C_P_N_T          
## scale(g_t1)          
## scl(prp__1)          
## scl(md__1_)          
## scl(lg_m_1)          
## scl(ml_m_1)          
## scl(lg___1) -0.368

MLU

 # Mixed model predicting MLU at t2 from the wiki-model measure and the t1
 # covariates, with a random intercept per corpus. All variables are z-scored
 # with scale(); rows missing Num_Corred_Pairs_No_Tax are removed beforehand.
 summary(
   lme4::lmer(
     scale(mlu_m_t2) ~
       scale(mean_dist_t1_wiki_log) +
       scale(Num_Corred_Pairs_No_Tax) +
       scale(age_t1) +
       scale(prop_noun_t1) +
       scale(median_freq_t1_wiki) +
       scale(log_mtld_t1) +
       scale(mlu_m_t1) +
       scale(log_transcript_length_t1) +
       (1 | corpus_id),
     data = filter(vocab_df, !is.na(Num_Corred_Pairs_No_Tax))
   )
 )
## Linear mixed model fit by REML ['lmerMod']
## Formula: 
## scale(mlu_m_t2) ~ scale(mean_dist_t1_wiki_log) + scale(Num_Corred_Pairs_No_Tax) +  
##     scale(age_t1) + scale(prop_noun_t1) + scale(median_freq_t1_wiki) +  
##     scale(log_mtld_t1) + scale(mlu_m_t1) + scale(log_transcript_length_t1) +  
##     (1 | corpus_id)
##    Data: filter(vocab_df, !is.na(Num_Corred_Pairs_No_Tax))
## 
## REML criterion at convergence: 111.1
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -2.29033 -0.46106  0.02629  0.33578  2.62574 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  corpus_id (Intercept) 0.0000   0.0000  
##  Residual              0.3753   0.6126  
## Number of obs: 52, groups:  corpus_id, 19
## 
## Fixed effects:
##                                  Estimate Std. Error t value
## (Intercept)                      0.004453   0.085519   0.052
## scale(mean_dist_t1_wiki_log)    -0.124233   0.211848  -0.586
## scale(Num_Corred_Pairs_No_Tax)   0.036079   0.099970   0.361
## scale(age_t1)                   -0.302221   0.101093  -2.990
## scale(prop_noun_t1)              0.248935   0.117273   2.123
## scale(median_freq_t1_wiki)       0.190358   0.137726   1.382
## scale(log_mtld_t1)               0.150910   0.120814   1.249
## scale(mlu_m_t1)                  0.629587   0.139059   4.527
## scale(log_transcript_length_t1)  0.049437   0.223102   0.222
## 
## Correlation of Fixed Effects:
##             (Intr) s(__1__ s(N_C_ sc(_1) scl(p__1) s(__1_) scl(l__1)
## scl(m__1__) -0.015                                                  
## s(N_C_P_N_T -0.018  0.441                                           
## scale(g_t1) -0.048  0.142  -0.017                                   
## scl(prp__1) -0.040 -0.168  -0.335  0.015                            
## scl(md__1_) -0.017 -0.300  -0.190 -0.104  0.388                     
## scl(lg_m_1)  0.096 -0.195  -0.140 -0.267  0.101    -0.092           
## scl(ml_m_1) -0.082  0.072  -0.049  0.429  0.233     0.006  -0.553   
## scl(lg___1)  0.021  0.720   0.389 -0.200 -0.280     0.110  -0.085   
##             scl(m__1)
## scl(m__1__)          
## s(N_C_P_N_T          
## scale(g_t1)          
## scl(prp__1)          
## scl(md__1_)          
## scl(lg_m_1)          
## scl(ml_m_1)          
## scl(lg___1) -0.327

Trigrams

  # Mixed model predicting the log number of trigrams at t2 from the wiki-model
  # measure and the t1 covariates (including t1 trigrams), with a random
  # intercept per corpus. All variables are z-scored with scale(); rows missing
  # Num_Corred_Pairs_No_Tax are removed beforehand.
  summary(
    lme4::lmer(
      scale(log_num_trigrams_t2) ~
        scale(mean_dist_t1_wiki_log) +
        scale(Num_Corred_Pairs_No_Tax) +
        scale(age_t1) +
        scale(prop_noun_t1) +
        scale(median_freq_t1_wiki) +
        scale(log_num_trigrams_t1) +
        scale(mlu_m_t1) +
        scale(log_mtld_t1) +
        scale(log_transcript_length_t1) +
        (1 | corpus_id),
      data = filter(vocab_df, !is.na(Num_Corred_Pairs_No_Tax))
    )
  )
## Linear mixed model fit by REML ['lmerMod']
## Formula: 
## scale(log_num_trigrams_t2) ~ scale(mean_dist_t1_wiki_log) + scale(Num_Corred_Pairs_No_Tax) +  
##     scale(age_t1) + scale(prop_noun_t1) + scale(median_freq_t1_wiki) +  
##     scale(log_num_trigrams_t1) + scale(mlu_m_t1) + scale(log_mtld_t1) +  
##     scale(log_transcript_length_t1) + (1 | corpus_id)
##    Data: filter(vocab_df, !is.na(Num_Corred_Pairs_No_Tax))
## 
## REML criterion at convergence: 73.8
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -2.05287 -0.47097 -0.04895  0.40862  2.48630 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  corpus_id (Intercept) 0.1075   0.3278  
##  Residual              0.1124   0.3352  
## Number of obs: 52, groups:  corpus_id, 19
## 
## Fixed effects:
##                                 Estimate Std. Error t value
## (Intercept)                      0.14733    0.10175   1.448
## scale(mean_dist_t1_wiki_log)    -0.29307    0.15524  -1.888
## scale(Num_Corred_Pairs_No_Tax)   0.04356    0.06599   0.660
## scale(age_t1)                    0.11322    0.08008   1.414
## scale(prop_noun_t1)              0.11816    0.07772   1.520
## scale(median_freq_t1_wiki)       0.27125    0.10982   2.470
## scale(log_num_trigrams_t1)       0.43371    0.20208   2.146
## scale(mlu_m_t1)                  0.08648    0.08766   0.987
## scale(log_mtld_t1)               0.01803    0.09730   0.185
## scale(log_transcript_length_t1)  0.11683    0.21123   0.553
## 
## Correlation of Fixed Effects:
##              (Intr) s(__1__ s(N_C_ sc(_1) scl(p__1) s(__1_) scl(lg_n__1)
## scl(m__1__)   0.130                                                     
## s(N_C_P_N_T   0.134  0.575                                              
## scale(g_t1)   0.013  0.094  -0.008                                      
## scl(prp__1)  -0.092 -0.392  -0.498  0.030                               
## scl(md__1_)  -0.127 -0.557  -0.478 -0.039  0.528                        
## scl(lg_n__1)  0.177  0.510   0.313 -0.059 -0.224    -0.554              
## scl(ml_m_1)  -0.021  0.061   0.045  0.249  0.059    -0.071  -0.116      
## scl(lg_m_1)  -0.100 -0.459  -0.302 -0.039  0.266     0.299  -0.560      
## scl(lg_t__1) -0.169  0.002  -0.065 -0.038  0.014     0.467  -0.706      
##              scl(m__1) scl(l__1)
## scl(m__1__)                     
## s(N_C_P_N_T                     
## scale(g_t1)                     
## scl(prp__1)                     
## scl(md__1_)                     
## scl(lg_n__1)                    
## scl(ml_m_1)                     
## scl(lg_m_1)  -0.389             
## scl(lg_t__1) -0.187     0.352

Childes

MTLD

# Mixed model predicting log MTLD at t2 from the CHILDES-model measure and the
# t1 covariates, with a random intercept per corpus. All variables are z-scored
# with scale(); rows missing Num_Corred_Pairs_No_Tax are removed beforehand.
summary(
  lme4::lmer(
    scale(log_mtld_t2) ~
      scale(mean_dist_t1_childes_log) +
      scale(Num_Corred_Pairs_No_Tax) +
      scale(age_t1) +
      scale(prop_noun_t1) +
      scale(median_freq_t1_childes) +
      scale(log_mtld_t1) +
      scale(mlu_m_t1) +
      scale(log_transcript_length_t1) +
      (1 | corpus_id),
    data = filter(vocab_df, !is.na(Num_Corred_Pairs_No_Tax))
  )
)
## Linear mixed model fit by REML ['lmerMod']
## Formula: 
## scale(log_mtld_t2) ~ scale(mean_dist_t1_childes_log) + scale(Num_Corred_Pairs_No_Tax) +  
##     scale(age_t1) + scale(prop_noun_t1) + scale(median_freq_t1_childes) +  
##     scale(log_mtld_t1) + scale(mlu_m_t1) + scale(log_transcript_length_t1) +  
##     (1 | corpus_id)
##    Data: filter(vocab_df, !is.na(Num_Corred_Pairs_No_Tax))
## 
## REML criterion at convergence: 84.9
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -2.03398 -0.50750  0.07519  0.42842  2.40186 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  corpus_id (Intercept) 0.03747  0.1936  
##  Residual              0.18095  0.4254  
## Number of obs: 52, groups:  corpus_id, 19
## 
## Fixed effects:
##                                 Estimate Std. Error t value
## (Intercept)                      0.07326    0.08591   0.853
## scale(mean_dist_t1_childes_log)  0.13751    0.12334   1.115
## scale(Num_Corred_Pairs_No_Tax)   0.23724    0.06692   3.545
## scale(age_t1)                   -0.24769    0.08019  -3.089
## scale(prop_noun_t1)              0.10953    0.08924   1.227
## scale(median_freq_t1_childes)   -0.05672    0.17218  -0.329
## scale(log_mtld_t1)               0.85186    0.08830   9.648
## scale(mlu_m_t1)                 -0.04476    0.10632  -0.421
## scale(log_transcript_length_t1)  0.08222    0.14332   0.574
## 
## Correlation of Fixed Effects:
##             (Intr) s(__1__ s(N_C_ sc(_1) scl(p__1) s(__1_) scl(l__1)
## scl(m__1__) -0.005                                                  
## s(N_C_P_N_T  0.052  0.235                                           
## scale(g_t1)  0.031 -0.035  -0.085                                   
## scl(prp__1) -0.035 -0.300  -0.379  0.073                            
## scl(md__1_) -0.013 -0.822  -0.278  0.012  0.466                     
## scl(lg_m_1)  0.042 -0.083  -0.092 -0.151  0.127     0.010           
## scl(ml_m_1) -0.030 -0.262  -0.084  0.322  0.216     0.175  -0.515   
## scl(lg___1) -0.070 -0.438  -0.117 -0.303  0.050     0.622   0.141   
##             scl(m__1)
## scl(m__1__)          
## s(N_C_P_N_T          
## scale(g_t1)          
## scl(prp__1)          
## scl(md__1_)          
## scl(lg_m_1)          
## scl(ml_m_1)          
## scl(lg___1) -0.385

MLU

 # Mixed model predicting MLU at t2 from the CHILDES-model measure and the t1
 # covariates, with a random intercept per corpus. All variables are z-scored
 # with scale(); rows missing Num_Corred_Pairs_No_Tax are removed beforehand.
 summary(
   lme4::lmer(
     scale(mlu_m_t2) ~
       scale(mean_dist_t1_childes_log) +
       scale(Num_Corred_Pairs_No_Tax) +
       scale(age_t1) +
       scale(prop_noun_t1) +
       scale(median_freq_t1_childes) +
       scale(log_mtld_t1) +
       scale(mlu_m_t1) +
       scale(log_transcript_length_t1) +
       (1 | corpus_id),
     data = filter(vocab_df, !is.na(Num_Corred_Pairs_No_Tax))
   )
 )
## Linear mixed model fit by REML ['lmerMod']
## Formula: 
## scale(mlu_m_t2) ~ scale(mean_dist_t1_childes_log) + scale(Num_Corred_Pairs_No_Tax) +  
##     scale(age_t1) + scale(prop_noun_t1) + scale(median_freq_t1_childes) +  
##     scale(log_mtld_t1) + scale(mlu_m_t1) + scale(log_transcript_length_t1) +  
##     (1 | corpus_id)
##    Data: filter(vocab_df, !is.na(Num_Corred_Pairs_No_Tax))
## 
## REML criterion at convergence: 111.7
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -2.21791 -0.38342  0.00357  0.38874  2.67770 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  corpus_id (Intercept) 0.002809 0.0530  
##  Residual              0.374400 0.6119  
## Number of obs: 52, groups:  corpus_id, 19
## 
## Fixed effects:
##                                 Estimate Std. Error t value
## (Intercept)                      0.00868    0.08776   0.099
## scale(mean_dist_t1_childes_log)  0.04576    0.16432   0.278
## scale(Num_Corred_Pairs_No_Tax)   0.07069    0.09294   0.761
## scale(age_t1)                   -0.29495    0.10107  -2.918
## scale(prop_noun_t1)              0.22428    0.12418   1.806
## scale(median_freq_t1_childes)    0.11488    0.23470   0.489
## scale(log_mtld_t1)               0.13141    0.11942   1.100
## scale(mlu_m_t1)                  0.62898    0.14266   4.409
## scale(log_transcript_length_t1)  0.11494    0.17958   0.640
## 
## Correlation of Fixed Effects:
##             (Intr) s(__1__ s(N_C_ sc(_1) scl(p__1) s(__1_) scl(l__1)
## scl(m__1__)  0.009                                                  
## s(N_C_P_N_T -0.004  0.257                                           
## scale(g_t1) -0.037 -0.084  -0.108                                   
## scl(prp__1) -0.042 -0.363  -0.361  0.071                            
## scl(md__1_) -0.021 -0.830  -0.251  0.038  0.488                     
## scl(lg_m_1)  0.089 -0.117  -0.091 -0.230  0.114     0.015           
## scl(ml_m_1) -0.076 -0.222  -0.137  0.424  0.300     0.194  -0.508   
## scl(lg___1)  0.024 -0.491  -0.038 -0.334 -0.005     0.646   0.135   
##             scl(m__1)
## scl(m__1__)          
## s(N_C_P_N_T          
## scale(g_t1)          
## scl(prp__1)          
## scl(md__1_)          
## scl(lg_m_1)          
## scl(ml_m_1)          
## scl(lg___1) -0.360

Trigrams

  # Mixed model predicting the log number of trigrams at t2 from the
  # CHILDES-model measure and the t1 covariates (including t1 trigrams), with a
  # random intercept per corpus. All variables are z-scored with scale(); rows
  # missing Num_Corred_Pairs_No_Tax are removed beforehand.
  summary(
    lme4::lmer(
      scale(log_num_trigrams_t2) ~
        scale(mean_dist_t1_childes_log) +
        scale(Num_Corred_Pairs_No_Tax) +
        scale(age_t1) +
        scale(prop_noun_t1) +
        scale(median_freq_t1_childes) +
        scale(log_num_trigrams_t1) +
        scale(mlu_m_t1) +
        scale(log_mtld_t1) +
        scale(log_transcript_length_t1) +
        (1 | corpus_id),
      data = filter(vocab_df, !is.na(Num_Corred_Pairs_No_Tax))
    )
  )
## Linear mixed model fit by REML ['lmerMod']
## Formula: scale(log_num_trigrams_t2) ~ scale(mean_dist_t1_childes_log) +  
##     scale(Num_Corred_Pairs_No_Tax) + scale(age_t1) + scale(prop_noun_t1) +  
##     scale(median_freq_t1_childes) + scale(log_num_trigrams_t1) +  
##     scale(mlu_m_t1) + scale(log_mtld_t1) + scale(log_transcript_length_t1) +  
##     (1 | corpus_id)
##    Data: filter(vocab_df, !is.na(Num_Corred_Pairs_No_Tax))
## 
## REML criterion at convergence: 76.3
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.68203 -0.37379  0.01322  0.30205  2.55065 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  corpus_id (Intercept) 0.1048   0.3238  
##  Residual              0.1194   0.3455  
## Number of obs: 52, groups:  corpus_id, 19
## 
## Fixed effects:
##                                  Estimate Std. Error t value
## (Intercept)                      0.158287   0.101586   1.558
## scale(mean_dist_t1_childes_log)  0.154551   0.116916   1.322
## scale(Num_Corred_Pairs_No_Tax)   0.131709   0.056935   2.313
## scale(age_t1)                    0.132754   0.081013   1.639
## scale(prop_noun_t1)              0.038993   0.075285   0.518
## scale(median_freq_t1_childes)   -0.002295   0.152196  -0.015
## scale(log_num_trigrams_t1)       0.546293   0.186268   2.933
## scale(mlu_m_t1)                  0.062748   0.093429   0.672
## scale(log_mtld_t1)              -0.053639   0.088453  -0.606
## scale(log_transcript_length_t1)  0.124281   0.215672   0.576
## 
## Correlation of Fixed Effects:
##              (Intr) s(__1__ s(N_C_ sc(_1) scl(p__1) s(__1_) scl(lg_n__1)
## scl(m__1__)  -0.074                                                     
## s(N_C_P_N_T   0.055  0.228                                              
## scale(g_t1)  -0.002  0.062  -0.060                                      
## scl(prp__1)  -0.027 -0.233  -0.393  0.057                               
## scl(md__1_)   0.016 -0.794  -0.313 -0.038  0.420                        
## scl(lg_n__1)  0.147 -0.287  -0.041 -0.138  0.041     0.007              
## scl(ml_m_1)  -0.010 -0.272  -0.052  0.224  0.151     0.191  -0.083      
## scl(lg_m_1)  -0.051  0.078  -0.036  0.004  0.088    -0.022  -0.428      
## scl(lg_t__1) -0.171 -0.019  -0.076 -0.043  0.012     0.358  -0.783      
##              scl(m__1) scl(l__1)
## scl(m__1__)                     
## s(N_C_P_N_T                     
## scale(g_t1)                     
## scl(prp__1)                     
## scl(md__1_)                     
## scl(lg_n__1)                    
## scl(ml_m_1)                     
## scl(lg_m_1)  -0.414             
## scl(lg_t__1) -0.171     0.396

Spaghetti plot

# Children that have both a Num_Corred_Pairs_No_Tax value and a t1 MLU.
plot_df <- filter(
  vocab_df,
  !is.na(Num_Corred_Pairs_No_Tax),
  !is.na(mlu_m_t1)
)

# Output file for the figure; the pdf()/dev.off() calls are currently disabled.
OUTFILE <- "spaghetti_plot.pdf"
#pdf(OUTFILE)
# One line per child joining its log MTLD at Time 1 and Time 2, with the
# individual observations overlaid as red points.
plot_df %>%
  select(`Time 1` = log_mtld_t1,
         `Time 2` = log_mtld_t2,
         target_child_id) %>%
  gather("timepoint", "value", -target_child_id) %>%
  ggplot(aes(x = timepoint, y = value, group = target_child_id)) +
  geom_line() +
  geom_point(color = "red") +
  xlab("") +
  ylab("Vocabulary size estimate (log MTLD)") +
  theme(
    axis.line.x = element_blank(),
    axis.line.y = element_line(size = 1),
    axis.ticks = element_blank(),
    axis.title.y = element_text(size = 14),
    axis.text.x = element_text(size = 14, color = "black"),
    axis.text.y = element_text(size = 12)
  )

#dev.off()