Semantic space by-kid analyses

Get df with all measures
Regressions

library(knitr)

# Global knitr chunk options for this report. Spelled with TRUE/FALSE rather
# than T/F, because T and F are ordinary variables that can be reassigned.
opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE,
               error = FALSE, cache = FALSE, tidy = FALSE)

library(tidyverse)
library(langcog)
library(data.table)
library(feather)

# Make the classic theme with a 10-point base font the default for all plots.
theme_classic(base_size = 10) %>%
  theme_set()

QUESTION: Do the distance properties of a kid’s vocabulary at t1 predict vocabulary outcomes at t2?

For each kid’s vocabulary, we get the coordinates of each word in semantic space using the fastText English Wikipedia model. The predictors are the mean, median, and variance of the pairwise distances between words for each kid.

Vocabulary outcome measures: MTLD at t2, MTLD change, number of trigrams at t2, and frequency of trigrams at t2.

In previous versions of this analysis I transformed the 300-D wikipedia model into tsne coordinates for the words in all kids’ vocabs. This analysis suggested an effect whereby vocabs with larger mean distance at t1 predicted mtld at t2. However, the tsne coordinates were probabilistic (changed with each run of the model), and thus these results were not consistent.

So, here I’m doing distance calculations using cosine on the full 300-D space. The effects depend on the cutoff for counting a kid as knowing a word, with larger effects for bigger cutoffs.

Broadly, the pattern of findings is that vocabularies with high distance and low variability have high MTLD at t2. This is also true for number of trigrams. For trigram frequency, the opposite is true: low distance and high variability predict more frequent trigrams at t2.

# Minimum summed t1 count a word needs in order to be kept in a child's
# vocabulary (applied as `count >= MINWORDSFORVOCAB` when building types_clean).
MINWORDSFORVOCAB <- 5

The min words for vocab here is 5.

Get df with all measures

Read in data

# Input tables used below:
#   all_types      - word types per child (filtered on tbin, gloss, count later)
#   fasttext_model - word embedding table (has a target_word column)
#   groups_info    - per-child MTLD and age measures
#   trigrams       - per-child trigram measures
all_types <- read_csv(
  file = "../1_mtld_measure/data/target_types_for_MTLD_kids_600_900.csv"
)
fasttext_model <- read_feather(
  path = "fast_text_childes_words_600_900.feather"
)
groups_info <- read_csv(
  file = "../1_mtld_measure/data/groups_info_600_900_corrected.csv"
)
trigrams <- read_csv(
  file = "../2_trigrams/mtld_continuous_trigram_by_kid_MIN1.csv"
)

Get filtered version of types for each kid

# Per-child t1 vocabulary: keep only t1 rows, lower-case each gloss, total the
# counts per (child, lower-cased gloss), and drop words whose total count is
# below MINWORDSFORVOCAB. The result is still grouped by target_child_id.
types_clean <- all_types %>%
  filter(tbin == "t1") %>%
  group_by(target_child_id, gloss_clean = tolower(gloss)) %>%
  summarize(count = sum(count)) %>%
  filter(count >= MINWORDSFORVOCAB)

Get vocab measures by kids

# Summarise the pairwise cosine structure of one child's t1 vocabulary.
#
# Arguments:
#   id    - identifier of the child; copied to the output as target_child_id.
#   data  - that child's word table; only its gloss_clean column is read.
#   model - embedding table with a target_word column. The first column is
#           dropped before computing cosines (assumes it is the word label and
#           all remaining columns are numeric dimensions -- TODO confirm).
#
# Returns a one-row data.frame with columns target_child_id, mean_dist_t1,
# median_dist_t1, var_dist_t1 and n_t1 (number of the child's words found in
# the model).
get_vocab_measure_by_kid <- function(id, data, model){
  # Keep only the embedding rows for words in this child's vocabulary.
  this_kids_model <- model %>%
    filter(target_word %in% data$gloss_clean)

  # coop::cosine() compares columns, so transpose to get one column per word.
  # NOTE(review): coop::cosine() returns cosine *similarity*, not distance; the
  # "_dist_" names are kept because downstream code depends on them. The
  # summaries below are taken over the full square matrix, i.e. they include
  # the diagonal (each word with itself) and count every pair twice.
  all_dists <- coop::cosine(t(this_kids_model[,-1]))

  # var() on a matrix gives the covariance matrix of its columns; the measure
  # is the mean entry of that matrix. Compute it once and map an exact zero or
  # an undefined value to a numeric NA, so the later log() cannot yield -Inf
  # and the column type stays double.
  mean_var <- mean(var(all_dists))
  var_dist <- if (is.na(mean_var) || mean_var == 0) NA_real_ else mean_var

  data.frame(target_child_id = id,
             mean_dist_t1 = mean(all_dists),
             median_dist_t1 = median(all_dists),
             var_dist_t1 = var_dist,
             n_t1 = nrow(all_dists))
}

# One row per child, with that child's remaining columns packed into the
# list-column `data`.
nested_data_by_kid <- types_clean %>%
  nest(-target_child_id)

# Apply get_vocab_measure_by_kid() to each (child id, word table) pair, passing
# the embedding table as `model`, and row-bind the one-row results.
vocab_measures <- map2_df(
  .x = nested_data_by_kid$target_child_id,
  .y = nested_data_by_kid$data,
  .f = get_vocab_measure_by_kid,
  model = fasttext_model
)

Merge in other variables

# Assemble the per-child analysis table: vocabulary measures joined with the
# MTLD/age measures and the trigram measures, plus log-transformed versions of
# the predictors, with every numeric column standardised at the end.
# Join keys are given explicitly: target_child_id is the only column shared by
# each pair of tables, so this matches the previous implicit natural join but
# will not silently change if a same-named column is added later.
vocab_df <- vocab_measures %>%
  left_join(groups_info %>% select(delta_resid_group, target_child_id, mtld_t1,
                                   mtld_t2, age_t1, age_t2, mtld_diff, age_diff),
            by = "target_child_id") %>%
  # NOTE(review): log() of an NA or non-positive value is NA/NaN/-Inf; the
  # models using log_var_dist_t1 report 12 observations dropped for
  # missingness, presumably from this column -- confirm.
  mutate(log_mtld_t2 = log(mtld_t2 + 1),
         log_mtld_t1 = log(mtld_t1 + 1),
         log_median_dist_t1 = log(median_dist_t1),
         log_mean_dist_t1 = log(mean_dist_t1),
         log_var_dist_t1 = log(var_dist_t1),
         log_n_t1 = log(n_t1)) %>%
  left_join(trigrams %>% select(target_child_id, log_num_trigrams_t1, log_num_trigrams_t2,
                                mean_log_freq_trigrams_t1, mean_log_freq_trigrams_t2),
            by = "target_child_id") %>%
  # Centre and scale every numeric column, so regression coefficients are on a
  # standardised scale. NOTE(review): this also rescales target_child_id if it
  # is stored as a number.
  mutate_if(is.numeric, scale)

Regressions

Predicting MTLD at t2

# log MTLD at t2 on log median distance at t1, controlling for both ages,
# log MTLD at t1 and log vocabulary size; print the model summary.
summary(lm(
  log_mtld_t2 ~ log_median_dist_t1 + age_t1 + age_t2 + log_mtld_t1 + log_n_t1,
  data = vocab_df
))

## 
## Call:
## lm(formula = log_mtld_t2 ~ log_median_dist_t1 + age_t1 + age_t2 + 
##     log_mtld_t1 + log_n_t1, data = vocab_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.28147 -0.40165  0.02427  0.37627  1.92425 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -2.002e-16  6.921e-02   0.000  1.00000    
## log_median_dist_t1  3.581e-01  1.053e-01   3.402  0.00103 ** 
## age_t1             -3.464e-01  1.083e-01  -3.200  0.00194 ** 
## age_t2              9.943e-02  9.035e-02   1.101  0.27425    
## log_mtld_t1         8.025e-01  7.894e-02  10.166  2.7e-16 ***
## log_n_t1            2.738e-01  1.137e-01   2.408  0.01825 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6565 on 84 degrees of freedom
## Multiple R-squared:  0.5932, Adjusted R-squared:  0.569 
## F-statistic:  24.5 on 5 and 84 DF,  p-value: 3.941e-15

# Same model as above with log mean distance at t1 as the predictor.
summary(lm(
  log_mtld_t2 ~ log_mean_dist_t1 + age_t1 + age_t2 + log_mtld_t1 + log_n_t1,
  data = vocab_df
))

## 
## Call:
## lm(formula = log_mtld_t2 ~ log_mean_dist_t1 + age_t1 + age_t2 + 
##     log_mtld_t1 + log_n_t1, data = vocab_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.31538 -0.37129  0.01006  0.35365  1.87462 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -2.389e-16  6.868e-02   0.000 1.000000    
## log_mean_dist_t1  6.022e-01  1.668e-01   3.611 0.000517 ***
## age_t1           -2.996e-01  1.109e-01  -2.702 0.008333 ** 
## age_t2            1.030e-01  8.970e-02   1.148 0.254211    
## log_mtld_t1       7.983e-01  7.839e-02  10.184  2.5e-16 ***
## log_n_t1          5.251e-01  1.594e-01   3.294 0.001448 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6515 on 84 degrees of freedom
## Multiple R-squared:  0.5993, Adjusted R-squared:  0.5755 
## F-statistic: 25.13 on 5 and 84 DF,  p-value: 2.107e-15

# log MTLD at t2 on log distance variance at t1 plus the same covariates.
summary(lm(
  log_mtld_t2 ~ log_var_dist_t1 + age_t1 + age_t2 + log_mtld_t1 + log_n_t1,
  data = vocab_df
))

## 
## Call:
## lm(formula = log_mtld_t2 ~ log_var_dist_t1 + age_t1 + age_t2 + 
##     log_mtld_t1 + log_n_t1, data = vocab_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.1134 -0.3250  0.1191  0.3211  1.6121 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -0.09753    0.06419  -1.519   0.1330    
## log_var_dist_t1 -0.13047    0.06354  -2.053   0.0437 *  
## age_t1          -0.52486    0.09713  -5.404 8.04e-07 ***
## age_t2           0.04589    0.07721   0.594   0.5541    
## log_mtld_t1      0.86319    0.06696  12.891  < 2e-16 ***
## log_n_t1         0.22562    0.08748   2.579   0.0119 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5435 on 72 degrees of freedom
##   (12 observations deleted due to missingness)
## Multiple R-squared:  0.7132, Adjusted R-squared:  0.6933 
## F-statistic: 35.81 on 5 and 72 DF,  p-value: < 2.2e-16

# log MTLD at t2 on log median distance and log distance variance together,
# plus the same covariates.
summary(lm(
  log_mtld_t2 ~ log_median_dist_t1 + log_var_dist_t1 +
    age_t1 + age_t2 + log_mtld_t1 + log_n_t1,
  data = vocab_df
))

## 
## Call:
## lm(formula = log_mtld_t2 ~ log_median_dist_t1 + log_var_dist_t1 + 
##     age_t1 + age_t2 + log_mtld_t1 + log_n_t1, data = vocab_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.11689 -0.35389  0.03871  0.34048  0.98794 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         0.02571    0.07157   0.359 0.720453    
## log_median_dist_t1  0.49711    0.15487   3.210 0.001995 ** 
## log_var_dist_t1    -0.23322    0.06782  -3.439 0.000982 ***
## age_t1             -0.43551    0.09554  -4.558  2.1e-05 ***
## age_t2              0.04743    0.07266   0.653 0.516002    
## log_mtld_t1         0.81387    0.06486  12.548  < 2e-16 ***
## log_n_t1            0.36088    0.09248   3.902 0.000214 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5114 on 71 degrees of freedom
##   (12 observations deleted due to missingness)
## Multiple R-squared:  0.7496, Adjusted R-squared:  0.7284 
## F-statistic: 35.42 on 6 and 71 DF,  p-value: < 2.2e-16

Predicting change in MTLD

# Change in MTLD on log median distance at t1, controlling for both ages,
# log MTLD at t1 and log vocabulary size; print the model summary.
summary(lm(
  mtld_diff ~ log_median_dist_t1 + age_t1 + age_t2 + log_mtld_t1 + log_n_t1,
  data = vocab_df
))

## 
## Call:
## lm(formula = mtld_diff ~ log_median_dist_t1 + age_t1 + age_t2 + 
##     log_mtld_t1 + log_n_t1, data = vocab_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.64460 -0.32687 -0.07873  0.30634  2.54348 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         4.519e-17  7.910e-02   0.000  1.00000    
## log_median_dist_t1  3.362e-01  1.203e-01   2.794  0.00645 ** 
## age_t1             -2.601e-01  1.237e-01  -2.102  0.03857 *  
## age_t2              1.295e-02  1.033e-01   0.125  0.90052    
## log_mtld_t1        -5.064e-01  9.022e-02  -5.613  2.5e-07 ***
## log_n_t1            3.647e-01  1.300e-01   2.806  0.00623 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7504 on 84 degrees of freedom
## Multiple R-squared:  0.4686, Adjusted R-squared:  0.437 
## F-statistic: 14.81 on 5 and 84 DF,  p-value: 2.098e-10

# Change in MTLD on log mean distance at t1 plus the same covariates.
summary(lm(
  mtld_diff ~ log_mean_dist_t1 + age_t1 + age_t2 + log_mtld_t1 + log_n_t1,
  data = vocab_df
))

## 
## Call:
## lm(formula = mtld_diff ~ log_mean_dist_t1 + age_t1 + age_t2 + 
##     log_mtld_t1 + log_n_t1, data = vocab_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.6249 -0.3321 -0.1006  0.3529  2.5163 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       8.518e-18  7.899e-02   0.000  1.00000    
## log_mean_dist_t1  5.443e-01  1.918e-01   2.838  0.00569 ** 
## age_t1           -2.209e-01  1.275e-01  -1.732  0.08686 .  
## age_t2            1.573e-02  1.032e-01   0.152  0.87919    
## log_mtld_t1      -5.098e-01  9.016e-02  -5.654  2.1e-07 ***
## log_n_t1          5.844e-01  1.834e-01   3.187  0.00202 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7494 on 84 degrees of freedom
## Multiple R-squared:   0.47,  Adjusted R-squared:  0.4385 
## F-statistic:  14.9 on 5 and 84 DF,  p-value: 1.882e-10

# Change in MTLD on log distance variance at t1 plus the same covariates.
summary(lm(
  mtld_diff ~ log_var_dist_t1 + age_t1 + age_t2 + log_mtld_t1 + log_n_t1,
  data = vocab_df
))

## 
## Call:
## lm(formula = mtld_diff ~ log_var_dist_t1 + age_t1 + age_t2 + 
##     log_mtld_t1 + log_n_t1, data = vocab_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.37187 -0.32006 -0.04821  0.29294  2.43359 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -0.08138    0.07431  -1.095  0.27712    
## log_var_dist_t1 -0.10328    0.07356  -1.404  0.16464    
## age_t1          -0.48542    0.11245  -4.317 4.97e-05 ***
## age_t2          -0.04858    0.08939  -0.543  0.58848    
## log_mtld_t1     -0.47154    0.07753  -6.082 5.16e-08 ***
## log_n_t1         0.33751    0.10128   3.332  0.00136 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6292 on 72 degrees of freedom
##   (12 observations deleted due to missingness)
## Multiple R-squared:  0.5475, Adjusted R-squared:  0.516 
## F-statistic: 17.42 on 5 and 72 DF,  p-value: 2.876e-11

# Change in MTLD on log median distance and log distance variance together,
# plus the same covariates.
summary(lm(
  mtld_diff ~ log_median_dist_t1 + log_var_dist_t1 +
    age_t1 + age_t2 + log_mtld_t1 + log_n_t1,
  data = vocab_df
))

## 
## Call:
## lm(formula = mtld_diff ~ log_median_dist_t1 + log_var_dist_t1 + 
##     age_t1 + age_t2 + log_mtld_t1 + log_n_t1, data = vocab_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.31959 -0.27768 -0.04133  0.28865  1.78013 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         0.04765    0.08395   0.568 0.572084    
## log_median_dist_t1  0.52048    0.18166   2.865 0.005479 ** 
## log_var_dist_t1    -0.21086    0.07955  -2.651 0.009902 ** 
## age_t1             -0.39187    0.11207  -3.497 0.000817 ***
## age_t2             -0.04697    0.08522  -0.551 0.583264    
## log_mtld_t1        -0.52319    0.07608  -6.877 1.97e-09 ***
## log_n_t1            0.47914    0.10848   4.417 3.51e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5999 on 71 degrees of freedom
##   (12 observations deleted due to missingness)
## Multiple R-squared:  0.5944, Adjusted R-squared:  0.5601 
## F-statistic: 17.34 on 6 and 71 DF,  p-value: 3.081e-12

Predicting number of trigrams at t2

# log number of trigrams at t2 on log median distance at t1, controlling for
# both ages, log vocabulary size and log number of trigrams at t1.
summary(lm(
  log_num_trigrams_t2 ~ log_median_dist_t1 + age_t1 + age_t2 +
    log_n_t1 + log_num_trigrams_t1,
  data = vocab_df
))

## 
## Call:
## lm(formula = log_num_trigrams_t2 ~ log_median_dist_t1 + age_t1 + 
##     age_t2 + log_n_t1 + log_num_trigrams_t1, data = vocab_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.2068 -0.2661 -0.0222  0.2195  1.4840 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -1.905e-16  4.963e-02   0.000 1.000000    
## log_median_dist_t1   2.065e-01  7.549e-02   2.736 0.007594 ** 
## age_t1              -1.560e-01  7.589e-02  -2.055 0.042939 *  
## age_t2               1.569e-01  6.476e-02   2.423 0.017543 *  
## log_n_t1             5.262e-01  1.354e-01   3.886 0.000202 ***
## log_num_trigrams_t1  6.891e-01  1.332e-01   5.174 1.54e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4708 on 84 degrees of freedom
## Multiple R-squared:  0.7908, Adjusted R-squared:  0.7783 
## F-statistic:  63.5 on 5 and 84 DF,  p-value: < 2.2e-16

# log number of trigrams at t2 on log mean distance at t1 plus the same
# covariates.
summary(lm(
  log_num_trigrams_t2 ~ log_mean_dist_t1 + age_t1 + age_t2 +
    log_n_t1 + log_num_trigrams_t1,
  data = vocab_df
))

## 
## Call:
## lm(formula = log_num_trigrams_t2 ~ log_mean_dist_t1 + age_t1 + 
##     age_t2 + log_n_t1 + log_num_trigrams_t1, data = vocab_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.27610 -0.25718 -0.01588  0.23621  1.51041 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -2.146e-16  4.942e-02   0.000  1.00000    
## log_mean_dist_t1     3.441e-01  1.198e-01   2.872  0.00516 ** 
## age_t1              -1.292e-01  7.797e-02  -1.656  0.10136    
## age_t2               1.588e-01  6.452e-02   2.462  0.01587 *  
## log_n_t1             6.773e-01  1.575e-01   4.301 4.57e-05 ***
## log_num_trigrams_t1  6.784e-01  1.325e-01   5.119 1.92e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4689 on 84 degrees of freedom
## Multiple R-squared:  0.7925, Adjusted R-squared:  0.7802 
## F-statistic: 64.17 on 5 and 84 DF,  p-value: < 2.2e-16

# log number of trigrams at t2 on log distance variance at t1, controlling
# for both ages, log number of trigrams at t1 and log vocabulary size.
summary(lm(
  log_num_trigrams_t2 ~ log_var_dist_t1 + age_t1 + age_t2 +
    log_num_trigrams_t1 + log_n_t1,
  data = vocab_df
))

## 
## Call:
## lm(formula = log_num_trigrams_t2 ~ log_var_dist_t1 + age_t1 + 
##     age_t2 + log_num_trigrams_t1 + log_n_t1, data = vocab_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.10949 -0.23278 -0.01548  0.18671  1.30409 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -0.06777    0.05436  -1.247   0.2166    
## log_var_dist_t1     -0.10714    0.05309  -2.018   0.0473 *  
## age_t1              -0.10729    0.07982  -1.344   0.1831    
## age_t2               0.14153    0.06484   2.183   0.0323 *  
## log_num_trigrams_t1  0.78474    0.15680   5.005 3.83e-06 ***
## log_n_t1             0.35367    0.14289   2.475   0.0157 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4559 on 72 degrees of freedom
##   (12 observations deleted due to missingness)
## Multiple R-squared:  0.7766, Adjusted R-squared:  0.7611 
## F-statistic: 50.06 on 5 and 72 DF,  p-value: < 2.2e-16

# log number of trigrams at t2 on log mean distance and log distance variance
# together, plus both ages, log number of trigrams at t1 and log vocabulary
# size.
# NOTE(review): the other combined models in this report pair the *median*
# distance with the variance; this one uses the *mean* -- confirm that is
# intended.
summary(lm(
  log_num_trigrams_t2 ~ log_mean_dist_t1 + log_var_dist_t1 +
    age_t1 + age_t2 + log_num_trigrams_t1 + log_n_t1,
  data = vocab_df
))

## 
## Call:
## lm(formula = log_num_trigrams_t2 ~ log_mean_dist_t1 + log_var_dist_t1 + 
##     age_t1 + age_t2 + log_num_trigrams_t1 + log_n_t1, data = vocab_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.14654 -0.26724 -0.03388  0.15642  1.44963 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -0.006587   0.059983  -0.110  0.91286    
## log_mean_dist_t1     0.404762   0.185794   2.179  0.03269 *  
## log_var_dist_t1     -0.125612   0.052449  -2.395  0.01926 *  
## age_t1              -0.039452   0.083824  -0.471  0.63933    
## age_t2               0.142021   0.063214   2.247  0.02777 *  
## log_num_trigrams_t1  0.711600   0.156517   4.546 2.19e-05 ***
## log_n_t1             0.644929   0.193087   3.340  0.00134 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4445 on 71 degrees of freedom
##   (12 observations deleted due to missingness)
## Multiple R-squared:  0.7906, Adjusted R-squared:  0.7729 
## F-statistic: 44.68 on 6 and 71 DF,  p-value: < 2.2e-16

Predicting frequency of trigrams at t2

# Mean log trigram frequency at t2 on log median distance at t1, controlling
# for both ages, log vocabulary size and mean log trigram frequency at t1.
summary(lm(
  mean_log_freq_trigrams_t2 ~ log_median_dist_t1 + age_t1 + age_t2 +
    log_n_t1 + mean_log_freq_trigrams_t1,
  data = vocab_df
))

## 
## Call:
## lm(formula = mean_log_freq_trigrams_t2 ~ log_median_dist_t1 + 
##     age_t1 + age_t2 + log_n_t1 + mean_log_freq_trigrams_t1, data = vocab_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.04813 -0.24661 -0.02453  0.23372  1.13363 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               -1.831e-16  4.512e-02   0.000  1.00000    
## log_median_dist_t1        -2.276e-01  6.857e-02  -3.318  0.00134 ** 
## age_t1                     5.656e-02  6.835e-02   0.828  0.41029    
## age_t2                    -7.765e-02  5.895e-02  -1.317  0.19132    
## log_n_t1                  -8.079e-01  9.160e-02  -8.821 1.36e-13 ***
## mean_log_freq_trigrams_t1  3.773e-01  7.743e-02   4.873 5.12e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4281 on 84 degrees of freedom
## Multiple R-squared:  0.8271, Adjusted R-squared:  0.8168 
## F-statistic: 80.34 on 5 and 84 DF,  p-value: < 2.2e-16

# Mean log trigram frequency at t2 on log mean distance at t1 plus the same
# covariates.
summary(lm(
  mean_log_freq_trigrams_t2 ~ log_mean_dist_t1 + age_t1 + age_t2 +
    log_n_t1 + mean_log_freq_trigrams_t1,
  data = vocab_df
))

## 
## Call:
## lm(formula = mean_log_freq_trigrams_t2 ~ log_mean_dist_t1 + age_t1 + 
##     age_t2 + log_n_t1 + mean_log_freq_trigrams_t1, data = vocab_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.07987 -0.24162  0.00621  0.25563  1.04165 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               -1.558e-16  4.500e-02   0.000  1.00000    
## log_mean_dist_t1          -3.715e-01  1.093e-01  -3.398  0.00104 ** 
## age_t1                     2.683e-02  7.063e-02   0.380  0.70501    
## age_t2                    -8.024e-02  5.881e-02  -1.364  0.17613    
## log_n_t1                  -9.727e-01  1.202e-01  -8.094 3.92e-12 ***
## mean_log_freq_trigrams_t1  3.576e-01  7.740e-02   4.621 1.37e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4269 on 84 degrees of freedom
## Multiple R-squared:  0.828,  Adjusted R-squared:  0.8178 
## F-statistic: 80.89 on 5 and 84 DF,  p-value: < 2.2e-16

# Mean log trigram frequency at t2 on log distance variance at t1, controlling
# for both ages, mean log trigram frequency at t1 and log vocabulary size.
summary(lm(
  mean_log_freq_trigrams_t2 ~ log_var_dist_t1 + age_t1 + age_t2 +
    mean_log_freq_trigrams_t1 + log_n_t1,
  data = vocab_df
))

## 
## Call:
## lm(formula = mean_log_freq_trigrams_t2 ~ log_var_dist_t1 + age_t1 + 
##     age_t2 + mean_log_freq_trigrams_t1 + log_n_t1, data = vocab_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.9163 -0.2312 -0.0329  0.2621  0.8651 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                0.082883   0.046007   1.802   0.0758 .  
## log_var_dist_t1            0.076645   0.045694   1.677   0.0978 .  
## age_t1                    -0.003519   0.067853  -0.052   0.9588    
## age_t2                    -0.075266   0.055355  -1.360   0.1782    
## mean_log_freq_trigrams_t1  0.618268   0.111355   5.552 4.45e-07 ***
## log_n_t1                  -0.534587   0.102774  -5.202 1.78e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3894 on 72 degrees of freedom
##   (12 observations deleted due to missingness)
## Multiple R-squared:  0.8459, Adjusted R-squared:  0.8352 
## F-statistic: 79.02 on 5 and 72 DF,  p-value: < 2.2e-16

# Mean log trigram frequency at t2 on log median distance and log distance
# variance together, plus both ages, mean log trigram frequency at t1 and log
# vocabulary size.
summary(lm(
  mean_log_freq_trigrams_t2 ~ log_median_dist_t1 + log_var_dist_t1 +
    age_t1 + age_t2 + mean_log_freq_trigrams_t1 + log_n_t1,
  data = vocab_df
))

## 
## Call:
## lm(formula = mean_log_freq_trigrams_t2 ~ log_median_dist_t1 + 
##     log_var_dist_t1 + age_t1 + age_t2 + mean_log_freq_trigrams_t1 + 
##     log_n_t1, data = vocab_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.95111 -0.15412  0.01941  0.25468  0.85431 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               -0.0008849  0.0506540  -0.017  0.98611    
## log_median_dist_t1        -0.3444604  0.1079156  -3.192  0.00211 ** 
## log_var_dist_t1            0.1512095  0.0489626   3.088  0.00287 ** 
## age_t1                    -0.0535003  0.0657891  -0.813  0.41882    
## age_t2                    -0.0755962  0.0521291  -1.450  0.15141    
## mean_log_freq_trigrams_t1  0.6097192  0.1048987   5.812 1.61e-07 ***
## log_n_t1                  -0.6332007  0.1015949  -6.233 2.90e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3667 on 71 degrees of freedom
##   (12 observations deleted due to missingness)
## Multiple R-squared:  0.8652, Adjusted R-squared:  0.8538 
## F-statistic: 75.95 on 6 and 71 DF,  p-value: < 2.2e-16