library(knitr)

# Global knitr chunk options for this document. The logical values are spelled
# out as TRUE/FALSE because T and F are ordinary variables that can be
# reassigned, whereas TRUE and FALSE are reserved words.
opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE,
               error = FALSE, cache = FALSE, tidy = FALSE)

library(tidyverse)
library(langcog)
library(data.table)
library(feather)

# Minimum summed count a word needs in order to be kept in a child's
# vocabulary (applied as the `count >=` threshold when the types are filtered).
MINWORDSFORVOCAB <- 1

# Default ggplot2 theme for every plot in this document.
theme_set(theme_classic(base_size = 10))

A word must occur at least 1 time (`MINWORDSFORVOCAB`) to be counted as part of a child's vocabulary.

Read in data

# Outputs of the MTLD step: word types per child and per-child group info.
all_types <- read_csv(
  "../1_mtld_measure/data/target_types_for_MTLD_kids_600_900.csv"
)
groups_info <- read_csv(
  "../1_mtld_measure/data/groups_info_600_900_corrected.csv"
)

# Per-child trigram measures from the trigram step.
trigrams <- read_csv("../2_trigrams/mtld_continuous_trigram_by_kid_MIN1.csv")

# Word frequency norms from the SUBTLEXus corpus file; the two columns used
# downstream are renamed to `word` and `log_freq`.
# NOTE(review): this is an absolute, machine-specific path -- the script will
# only run where this file exists at exactly this location.
freq <- rename(
  read_tsv("/Users/mollylewis/Documents/research/Projects/ref_complex/Papers/RC_old/analysis/data/corpus/SUBTLEXus_corpus.txt"),
  word = Word,
  log_freq = Lg10WF
)

# Semantic density norms: the CSV text is downloaded with RCurl and parsed by
# read_csv, hyphenated column names are replaced with syntactic ones, and only
# the columns from `word` through `semantic_density` are kept.
# NOTE(review): the URL embeds an access token -- confirm it is still valid and
# consider whether it should be committed with the script.
density_norms <- read_csv(RCurl::getURL("https://raw.githubusercontent.com/billdthompson/semantic-density-norms/master/results/en-semantic-densities-N100000.csv?token=AF32iZ4ROE3EvwU8sZ5PVztiNF7PyLaRks5bBF6awA%3D%3D"))
density_norms <- rename(
  density_norms,
  semantic_density = `semantic-density`,
  neighb_count = `neighbour-count`,
  neighb_conc = `neighbour-concentration`,
  centrality = `global-centrality`
)
density_norms <- select(density_norms, word:semantic_density)

Get filtered version of types for each kid

# Keep only the t1 rows, lower-case each gloss, and add up the counts per child
# and lower-cased gloss. Words whose summed count falls below MINWORDSFORVOCAB
# are dropped. The lower-cased gloss is created directly inside group_by().
types_clean <- all_types %>%
  filter(tbin == "t1") %>%
  group_by(target_child_id, gloss_clean = tolower(gloss)) %>%
  summarize(count = sum(count)) %>%
  filter(count >= MINWORDSFORVOCAB)

Get mean density at t1

#' Summarise one child's t1 vocabulary against density and frequency norms
#'
#' @param id Identifier of the child; stored in the `target_child_id` column
#'   of the result.
#' @param data Data frame of the child's words with a `gloss_clean` column.
#' @param density_norms Data frame with a `word` column; every other column is
#'   averaged with `mean()`, so those columns must be numeric.
#' @param freq_norms Data frame with `word` and `log_freq` columns.
#' @return A one-row data frame with `target_child_id` first, followed by the
#'   mean of each norm column over the rows of `density_norms` whose `word`
#'   appears in `data$gloss_clean`, then `words_in_norms_t1` (number of such
#'   rows), `total_words_t1` (number of rows in `data`), and
#'   `mean_log_word_freq_t1` (mean `log_freq` of the child's words, ignoring
#'   words without a frequency entry).
get_density_by_kid <- function(id, data, density_norms, freq_norms) {
  total_words_t1 <- nrow(data)

  # Words missing from freq_norms get NA after the join and are skipped by
  # na.rm = TRUE (spelled out: T is a reassignable variable, TRUE is not).
  kid_freq <- data %>%
    left_join(freq_norms, by = c("gloss_clean" = "word")) %>%
    summarize(mean_log_freq = mean(log_freq, na.rm = TRUE))

  # Norm rows for the words this child produced; `word` is dropped so that only
  # the measure columns remain to be averaged.
  kid_norms <- density_norms %>%
    filter(word %in% data$gloss_clean) %>%
    select(-word)

  kid_norms %>%
    summarize_all(mean) %>%
    mutate(target_child_id = id,
           words_in_norms_t1 = nrow(kid_norms),
           total_words_t1 = total_words_t1,
           mean_log_word_freq_t1 = kid_freq$mean_log_freq) %>%
    select(target_child_id, everything())
}

# One row per child, with that child's word types nested in the `data` column.
nested_data_by_kid <- nest(types_clean, -target_child_id)

# Call get_density_by_kid() once per child (id paired with its nested word
# types) and row-bind the one-row results into a single data frame.
vocab_measures <- map2_df(
  .x = nested_data_by_kid$target_child_id,
  .y = nested_data_by_kid$data,
  .f = get_density_by_kid,
  density_norms = density_norms,
  freq_norms = freq
)

Merge in other variables

# Combine the per-child vocabulary measures with the group/MTLD info and the
# trigram measures, add log-transformed predictors, and standardize every
# numeric column for the regressions.
#
# Both joins name their key explicitly: target_child_id is the only column the
# joined tables share, and stating it keeps the join from silently picking up
# extra keys if a column with a matching name is added to either table.
vocab_df <- vocab_measures %>%
  left_join(groups_info %>%
              select(delta_resid_group, target_child_id, mtld_t1, mtld_t2,
                     age_t1, age_t2, mtld_diff, age_diff),
            by = "target_child_id") %>%
  mutate(log_mtld_t2 = log(mtld_t2 + 1),
         log_mtld_t1 = log(mtld_t1 + 1),
         log_total_words_t1 = log(total_words_t1),
         log_word_in_norms_t1 = log(words_in_norms_t1),
         log_semantic_density = log(semantic_density),
         log_centrality = log(centrality)) %>%
  left_join(trigrams %>%
              select(target_child_id, log_num_trigrams_t1, log_num_trigrams_t2,
                     mean_log_freq_trigrams_t1, mean_log_freq_trigrams_t2),
            by = "target_child_id") %>%
  # scale() returns a one-column matrix; as.numeric() stores the standardized
  # values as plain numeric vectors instead of matrix columns.
  # NOTE(review): is.numeric also matches target_child_id if it was read as a
  # number, in which case the id column is standardized too -- confirm that is
  # acceptable for any later use of vocab_df.
  mutate_if(is.numeric, function(x) as.numeric(scale(x)))

Regressions

Predicting log_mtld_t2

# log MTLD at t2 predicted from log semantic density, controlling for age at
# both time points, log MTLD at t1, and the log number of t1 words in the norms.
summary(
  lm(log_mtld_t2 ~ log_semantic_density + age_t1 + age_t2 + log_mtld_t1 +
       log_word_in_norms_t1,
     data = vocab_df)
)
## 
## Call:
## lm(formula = log_mtld_t2 ~ log_semantic_density + age_t1 + age_t2 + 
##     log_mtld_t1 + log_word_in_norms_t1, data = vocab_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.42334 -0.47777  0.00001  0.37107  2.43015 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -1.539e-16  7.220e-02   0.000 1.000000    
## log_semantic_density -1.396e-01  7.430e-02  -1.878 0.063415 .  
## age_t1               -4.388e-01  1.157e-01  -3.794 0.000261 ***
## age_t2                1.048e-01  9.664e-02   1.085 0.280876    
## log_mtld_t1           7.674e-01  8.729e-02   8.792  6.3e-14 ***
## log_word_in_norms_t1  8.785e-02  1.144e-01   0.768 0.444627    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7256 on 95 degrees of freedom
## Multiple R-squared:  0.4998, Adjusted R-squared:  0.4735 
## F-statistic: 18.98 on 5 and 95 DF,  p-value: 4.783e-13
# Same model of log MTLD at t2, with log centrality in place of log semantic
# density as the predictor of interest.
summary(
  lm(log_mtld_t2 ~ log_centrality + age_t1 + age_t2 + log_mtld_t1 +
       log_word_in_norms_t1,
     data = vocab_df)
)
## 
## Call:
## lm(formula = log_mtld_t2 ~ log_centrality + age_t1 + age_t2 + 
##     log_mtld_t1 + log_word_in_norms_t1, data = vocab_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.50395 -0.49163 -0.00277  0.40475  2.43579 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           5.489e-16  7.238e-02   0.000 1.000000    
## log_centrality       -1.295e-01  7.423e-02  -1.745 0.084286 .  
## age_t1               -4.531e-01  1.155e-01  -3.923 0.000165 ***
## age_t2                1.075e-01  9.694e-02   1.109 0.270233    
## log_mtld_t1           7.771e-01  8.845e-02   8.786  6.5e-14 ***
## log_word_in_norms_t1  7.000e-02  1.153e-01   0.607 0.545052    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7274 on 95 degrees of freedom
## Multiple R-squared:  0.4973, Adjusted R-squared:  0.4709 
## F-statistic:  18.8 on 5 and 95 DF,  p-value: 6.001e-13

Predicting mtld_diff

# mtld_diff predicted from log semantic density, controlling for age at both
# time points, log MTLD at t1, and the log number of t1 words in the norms.
summary(
  lm(mtld_diff ~ log_semantic_density + age_t1 + age_t2 + log_mtld_t1 +
       log_word_in_norms_t1,
     data = vocab_df)
)
## 
## Call:
## lm(formula = mtld_diff ~ log_semantic_density + age_t1 + age_t2 + 
##     log_mtld_t1 + log_word_in_norms_t1, data = vocab_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.6087 -0.4461 -0.1579  0.3458  2.9015 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -2.056e-16  7.742e-02   0.000  1.00000    
## log_semantic_density -1.072e-01  7.967e-02  -1.346  0.18151    
## age_t1               -3.851e-01  1.240e-01  -3.105  0.00251 ** 
## age_t2                8.649e-03  1.036e-01   0.083  0.93366    
## log_mtld_t1          -4.845e-01  9.359e-02  -5.177 1.26e-06 ***
## log_word_in_norms_t1  2.016e-01  1.227e-01   1.643  0.10375    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7781 on 95 degrees of freedom
## Multiple R-squared:  0.4249, Adjusted R-squared:  0.3946 
## F-statistic: 14.04 on 5 and 95 DF,  p-value: 2.862e-10
# Same model of mtld_diff, with log centrality in place of log semantic
# density as the predictor of interest.
summary(
  lm(mtld_diff ~ log_centrality + age_t1 + age_t2 + log_mtld_t1 +
       log_word_in_norms_t1,
     data = vocab_df)
)
## 
## Call:
## lm(formula = mtld_diff ~ log_centrality + age_t1 + age_t2 + log_mtld_t1 + 
##     log_word_in_norms_t1, data = vocab_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.5846 -0.4209 -0.1351  0.3698  2.9262 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           1.891e-16  7.794e-02   0.000  1.00000    
## log_centrality       -5.778e-02  7.993e-02  -0.723  0.47151    
## age_t1               -3.980e-01  1.244e-01  -3.200  0.00187 ** 
## age_t2                8.091e-03  1.044e-01   0.078  0.93839    
## log_mtld_t1          -4.863e-01  9.524e-02  -5.106 1.69e-06 ***
## log_word_in_norms_t1  1.940e-01  1.241e-01   1.564  0.12126    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7833 on 95 degrees of freedom
## Multiple R-squared:  0.4171, Adjusted R-squared:  0.3865 
## F-statistic:  13.6 on 5 and 95 DF,  p-value: 5.269e-10

Predicting number of trigrams

# Log number of trigrams at t2 predicted from log semantic density, controlling
# for age at both time points, the log number of t1 words in the norms, the log
# number of trigrams at t1, and mean log word frequency at t1.
summary(
  lm(log_num_trigrams_t2 ~ log_semantic_density + age_t1 + age_t2 +
       log_word_in_norms_t1 + log_num_trigrams_t1 + mean_log_word_freq_t1,
     data = vocab_df)
)
## 
## Call:
## lm(formula = log_num_trigrams_t2 ~ log_semantic_density + age_t1 + 
##     age_t2 + log_word_in_norms_t1 + log_num_trigrams_t1 + mean_log_word_freq_t1, 
##     data = vocab_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.08120 -0.24573 -0.03159  0.25992  1.17499 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           -2.403e-16  4.540e-02   0.000  1.00000    
## log_semantic_density   1.211e-02  4.642e-02   0.261  0.79482    
## age_t1                -2.124e-01  7.265e-02  -2.923  0.00434 ** 
## age_t2                 1.232e-01  6.073e-02   2.028  0.04537 *  
## log_word_in_norms_t1  -3.436e-01  3.305e-01  -1.040  0.30110    
## log_num_trigrams_t1    1.129e+00  2.842e-01   3.971  0.00014 ***
## mean_log_word_freq_t1 -4.898e-01  9.428e-02  -5.195 1.18e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4563 on 94 degrees of freedom
## Multiple R-squared:  0.8043, Adjusted R-squared:  0.7918 
## F-statistic: 64.38 on 6 and 94 DF,  p-value: < 2.2e-16
# Same model of the log number of trigrams at t2, with log centrality in place
# of log semantic density as the predictor of interest.
summary(
  lm(log_num_trigrams_t2 ~ log_centrality + age_t1 + age_t2 +
       log_word_in_norms_t1 + log_num_trigrams_t1 + mean_log_word_freq_t1,
     data = vocab_df)
)
## 
## Call:
## lm(formula = log_num_trigrams_t2 ~ log_centrality + age_t1 + 
##     age_t2 + log_word_in_norms_t1 + log_num_trigrams_t1 + mean_log_word_freq_t1, 
##     data = vocab_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.00954 -0.24090  0.00498  0.24206  1.17050 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           -7.609e-16  4.387e-02   0.000 1.000000    
## log_centrality         1.217e-01  4.675e-02   2.604 0.010704 *  
## age_t1                -2.134e-01  6.979e-02  -3.058 0.002900 ** 
## age_t2                 1.143e-01  5.875e-02   1.946 0.054601 .  
## log_word_in_norms_t1  -2.667e-01  3.206e-01  -0.832 0.407483    
## log_num_trigrams_t1    1.035e+00  2.770e-01   3.738 0.000319 ***
## mean_log_word_freq_t1 -5.133e-01  9.154e-02  -5.608 2.05e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4408 on 94 degrees of freedom
## Multiple R-squared:  0.8173, Adjusted R-squared:  0.8057 
## F-statistic: 70.09 on 6 and 94 DF,  p-value: < 2.2e-16

Predicting mean log frequency of trigrams

# Mean log trigram frequency at t2 predicted from log semantic density,
# controlling for age at both time points, the log number of t1 words in the
# norms, mean log trigram frequency at t1, and mean log word frequency at t1.
summary(
  lm(mean_log_freq_trigrams_t2 ~ log_semantic_density + age_t1 + age_t2 +
       log_word_in_norms_t1 + mean_log_freq_trigrams_t1 +
       mean_log_word_freq_t1,
     data = vocab_df)
)
## 
## Call:
## lm(formula = mean_log_freq_trigrams_t2 ~ log_semantic_density + 
##     age_t1 + age_t2 + log_word_in_norms_t1 + mean_log_freq_trigrams_t1 + 
##     mean_log_word_freq_t1, data = vocab_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.93735 -0.27290 -0.03837  0.28297  1.10832 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                7.445e-16  4.310e-02   0.000  1.00000    
## log_semantic_density       3.695e-02  4.410e-02   0.838  0.40417    
## age_t1                     9.694e-02  6.894e-02   1.406  0.16296    
## age_t2                    -4.996e-02  5.767e-02  -0.866  0.38860    
## log_word_in_norms_t1      -5.996e-01  1.055e-01  -5.681 1.49e-07 ***
## mean_log_freq_trigrams_t1  1.911e-01  6.882e-02   2.776  0.00663 ** 
## mean_log_word_freq_t1      3.878e-01  6.931e-02   5.596 2.16e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4332 on 94 degrees of freedom
## Multiple R-squared:  0.8236, Adjusted R-squared:  0.8124 
## F-statistic: 73.16 on 6 and 94 DF,  p-value: < 2.2e-16
# Same model of mean log trigram frequency at t2, with log centrality in place
# of log semantic density as the predictor of interest.
summary(
  lm(mean_log_freq_trigrams_t2 ~ log_centrality + age_t1 + age_t2 +
       log_word_in_norms_t1 + mean_log_freq_trigrams_t1 +
       mean_log_word_freq_t1,
     data = vocab_df)
)
## 
## Call:
## lm(formula = mean_log_freq_trigrams_t2 ~ log_centrality + age_t1 + 
##     age_t2 + log_word_in_norms_t1 + mean_log_freq_trigrams_t1 + 
##     mean_log_word_freq_t1, data = vocab_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.97242 -0.27562 -0.05022  0.23494  1.06416 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                8.385e-16  4.307e-02   0.000  1.00000    
## log_centrality            -4.152e-02  4.560e-02  -0.911  0.36483    
## age_t1                     1.044e-01  6.851e-02   1.524  0.13093    
## age_t2                    -4.470e-02  5.772e-02  -0.774  0.44066    
## log_word_in_norms_t1      -5.809e-01  1.064e-01  -5.458 3.90e-07 ***
## mean_log_freq_trigrams_t1  1.971e-01  6.885e-02   2.864  0.00517 ** 
## mean_log_word_freq_t1      4.084e-01  7.214e-02   5.660 1.63e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4329 on 94 degrees of freedom
## Multiple R-squared:  0.8239, Adjusted R-squared:  0.8126 
## F-statistic: 73.28 on 6 and 94 DF,  p-value: < 2.2e-16