Semantic density analyses

Regressions
Exploring individual words
Centrality at t1 vs. t2

library(knitr)

# Global knitr chunk options: show code, hide messages/warnings, stop on
# errors, and do not cache or reformat chunks. Spelled with TRUE/FALSE because
# T and F are ordinary variables that can be reassigned.
opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE,
               error = FALSE, cache = FALSE, tidy = FALSE)

library(tidyverse)
library(langcog)
library(data.table)
library(feather)

# Use the classic ggplot2 theme with a 10pt base font for every plot below.
theme_set(theme_classic(base_size = 10))

# Minimum summed count a word type needs, per child, to be kept in that
# child's vocabulary (applied as `count >= MINWORDSFORVOCAB` below).
MINWORDSFORVOCAB <- 5

A word type must have a total count of at least 5 (`MINWORDSFORVOCAB`) for a child to be included in that child's vocabulary.

Read in data

# Word-type counts per child (columns used below: target_child_id, tbin,
# gloss, count).
all_types <- read_csv(
  "../1_mtld_measure/data/target_types_for_MTLD_kids_600_900.csv"
)

# Child-level MTLD and age variables.
groups_info <- read_csv(
  "../1_mtld_measure/data/groups_info_600_900_corrected.csv"
)

# Per-child trigram measures.
trigrams <- read_csv(
  "../2_trigrams/mtld_continuous_trigram_by_kid_MIN1.csv"
)

# Word frequency norms, with the two columns used below given snake_case
# names.
# NOTE(review): this is an absolute path on one user's machine, so the chunk
# will not run elsewhere -- consider moving the file into the project.
freq <- rename(
  read_tsv("/Users/mollylewis/Documents/research/Projects/ref_complex/Papers/RC_old/analysis/data/corpus/SUBTLEXus_corpus.txt"),
  word = Word,
  log_freq = Lg10WF
)

# Download the semantic density norms, give the hyphenated columns
# syntactic names, and keep the columns from `word` through
# `semantic_density`.
# NOTE(review): the URL embeds an access token; if it has expired the download
# will fail -- confirm and consider reading a local copy instead.
density_norms <- "https://raw.githubusercontent.com/billdthompson/semantic-density-norms/master/results/en-semantic-densities-N100000.csv?token=AF32iZ4ROE3EvwU8sZ5PVztiNF7PyLaRks5bBF6awA%3D%3D" %>%
  RCurl::getURL() %>%
  read_csv() %>%
  rename(
    semantic_density = `semantic-density`,
    neighb_count = `neighbour-count`,
    neighb_conc = `neighbour-concentration`,
    centrality = `global-centrality`
  ) %>%
  select(word:semantic_density)

Get filtered version of types for each kid

# Per-child vocabulary at t1: lower-case each gloss, sum its counts within
# child, keep words that reach the MINWORDSFORVOCAB threshold, and nest the
# surviving rows into one data frame per child.
nested_data_by_kid_t1 <- all_types %>%
  filter(tbin == "t1") %>%
  group_by(target_child_id, gloss_clean = tolower(gloss)) %>%
  summarize(count = sum(count)) %>%
  filter(count >= MINWORDSFORVOCAB) %>%
  nest(-target_child_id)

# Per-child vocabulary labelled t2: lower-case each gloss, sum its counts
# within child, keep words that reach the MINWORDSFORVOCAB threshold, and nest
# the surviving rows into one data frame per child.
# NOTE(review): unlike the t1 pipeline there is no filter(tbin == "t2") here,
# so counts are summed over every tbin present in all_types. Confirm whether a
# cumulative vocabulary is intended or the filter was dropped by accident; the
# regression output in this document was produced with the current behaviour.
nested_data_by_kid_t2 <- all_types %>%
  group_by(target_child_id, gloss_clean = tolower(gloss)) %>%
  summarize(count = sum(count)) %>%
  filter(count >= MINWORDSFORVOCAB) %>%
  nest(-target_child_id)

Get mean density and centrality at t1 and t2

# Summarise the norm profile of one child's vocabulary.
#
# Arguments:
#   id            Child identifier; stored in the `target_child_id` column.
#   data          Data frame of the child's word types, one row per type, with
#                 a `gloss_clean` column.
#   density_norms Data frame of word-level norms with a `word` column; every
#                 other column is averaged, so those columns must be numeric.
#   freq_norms    Data frame with `word` and `log_freq` columns.
#
# Returns a one-row data frame with `target_child_id` first, followed by the
# mean of each norm column over the rows of `density_norms` whose word is in
# the child's vocabulary, `words_in_norms` (number of such rows),
# `total_words` (number of rows in `data`), and `mean_log_word_freq` (mean
# `log_freq` over the child's words, ignoring words with no frequency entry).
get_density_by_kid <- function(id, data, density_norms, freq_norms){
  total_words_t1 <- nrow(data)

  # Words missing from freq_norms get NA after the left join and are dropped
  # from the mean. TRUE is spelled out because `T` can be reassigned.
  this_kids_freq <- data %>%
    left_join(freq_norms, by = c("gloss_clean" = "word")) %>%
    summarize(mean_log_freq = mean(log_freq, na.rm = TRUE))

  # Norm rows for this child's words; the word label is dropped so that only
  # the numeric norm columns are left for averaging.
  this_kids_model <- density_norms %>%
    filter(word %in% data$gloss_clean) %>%
    select(-word)

  this_kids_model %>%
    summarize_all(mean) %>%
    mutate(target_child_id = id,
           words_in_norms = nrow(this_kids_model),
           total_words = total_words_t1,
           mean_log_word_freq = this_kids_freq$mean_log_freq) %>%
    select(target_child_id, everything())
}

# t1 vocab measures: one row per child from get_density_by_kid(), with the
# count and frequency columns suffixed _t1, log-transformed versions of the
# density, centrality and count columns added, and the raw norm means dropped.
vocab_measures_t1 <- map2_df(.x = nested_data_by_kid_t1$target_child_id,
                             .y = nested_data_by_kid_t1$data,
                             .f = get_density_by_kid,
                             density_norms = density_norms,
                             freq_norms = freq) %>%
  rename(
    words_in_norms_t1 = words_in_norms,
    total_words_t1 = total_words,
    mean_log_word_freq_t1 = mean_log_word_freq
  ) %>%
  mutate(
    log_density_t1 = log(semantic_density),
    log_centrality_t1 = log(centrality),
    log_total_words_t1 = log(total_words_t1),
    log_word_in_norms_t1 = log(words_in_norms_t1)
  ) %>%
  select(-c(centrality, neighb_count, neighb_conc, semantic_density))

# t2 vocab measures: one row per child from get_density_by_kid(), with the
# count and frequency columns suffixed _t2, log-transformed versions of the
# density, centrality and count columns added, and the raw norm means dropped.
vocab_measures_t2 <- map2_df(.x = nested_data_by_kid_t2$target_child_id,
                             .y = nested_data_by_kid_t2$data,
                             .f = get_density_by_kid,
                             density_norms = density_norms,
                             freq_norms = freq) %>%
  rename(
    words_in_norms_t2 = words_in_norms,
    total_words_t2 = total_words,
    mean_log_word_freq_t2 = mean_log_word_freq
  ) %>%
  mutate(
    log_density_t2 = log(semantic_density),
    log_centrality_t2 = log(centrality),
    log_total_words_t2 = log(total_words_t2),
    log_word_in_norms_t2 = log(words_in_norms_t2)
  ) %>%
  select(-c(centrality, neighb_count, neighb_conc, semantic_density))

# Combine the t1 and t2 measures, keeping children present at either time.
# The join key is implicit: every column name shared by the two tables
# (expected to be just target_child_id -- verify).
vocab_measures <- vocab_measures_t1 %>%
  full_join(vocab_measures_t2)

Merge in other variables

# Attach the MTLD/age variables and the trigram measures to the vocabulary
# measures (both joins use the shared column names as keys), add
# log(mtld + 1) at each time point, and drop the raw MTLD columns.
vocab_df <- vocab_measures %>%
  left_join(
    select(groups_info, delta_resid_group, target_child_id, mtld_t1, mtld_t2,
           age_t1, age_t2, mtld_diff, age_diff)
  ) %>%
  mutate(log_mtld_t2 = log(mtld_t2 + 1),
         log_mtld_t1 = log(mtld_t1 + 1)) %>%
  left_join(
    select(trigrams, target_child_id, log_num_trigrams_t1, log_num_trigrams_t2,
           mean_log_freq_trigrams_t1, mean_log_freq_trigrams_t2)
  ) %>%
  select(-c(mtld_t1, mtld_t2))
# Disabled: mutate_if(is.numeric, scale) # scale everything for regressions

# Save the merged analysis table next to this document.
write_csv(vocab_df, "semantic_density_df.csv")

Regressions

Predicting log_mtld_t2

# Regress log_mtld_t2 on log_density_t1, controlling for log_mtld_t1,
# age_diff and age_t1.
summary(
  lm(log_mtld_t2 ~ log_density_t1 + log_mtld_t1 + age_diff + age_t1,
     data = vocab_df)
)

## 
## Call:
## lm(formula = log_mtld_t2 ~ log_density_t1 + log_mtld_t1 + age_diff + 
##     age_t1, data = vocab_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.50028 -0.18077 -0.01099  0.18045  0.86402 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     2.328138   1.867447   1.247    0.216    
## log_density_t1  0.077231   0.076130   1.014    0.313    
## log_mtld_t1     0.616120   0.064274   9.586 3.52e-15 ***
## age_diff        0.001303   0.001714   0.760    0.449    
## age_t1         -0.002648   0.002371  -1.117    0.267    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2763 on 85 degrees of freedom
##   (11 observations deleted due to missingness)
## Multiple R-squared:  0.5407, Adjusted R-squared:  0.5191 
## F-statistic: 25.02 on 4 and 85 DF,  p-value: 1.042e-13

# Regress log_mtld_t2 on log_centrality_t1, controlling for log_mtld_t1,
# age_diff and age_t1.
summary(
  lm(log_mtld_t2 ~ log_centrality_t1 + log_mtld_t1 + age_diff + age_t1,
     data = vocab_df)
)

## 
## Call:
## lm(formula = log_mtld_t2 ~ log_centrality_t1 + log_mtld_t1 + 
##     age_diff + age_t1, data = vocab_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.49851 -0.17139 -0.01161  0.16973  0.88446 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        3.554766   2.167594   1.640    0.105    
## log_centrality_t1  0.373229   0.490407   0.761    0.449    
## log_mtld_t1        0.612818   0.066079   9.274  1.5e-14 ***
## age_diff           0.001207   0.001728   0.698    0.487    
## age_t1            -0.002637   0.002383  -1.107    0.272    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.277 on 85 degrees of freedom
##   (11 observations deleted due to missingness)
## Multiple R-squared:  0.5383, Adjusted R-squared:  0.5166 
## F-statistic: 24.78 on 4 and 85 DF,  p-value: 1.297e-13

# Regress log_mtld_t2 on log_density_t1 and log_centrality_t1 together,
# controlling for log_mtld_t1, age_diff and age_t1.
summary(
  lm(log_mtld_t2 ~ log_density_t1 + log_centrality_t1 + log_mtld_t1 +
       age_diff + age_t1,
     data = vocab_df)
)

## 
## Call:
## lm(formula = log_mtld_t2 ~ log_density_t1 + log_centrality_t1 + 
##     log_mtld_t1 + age_diff + age_t1, data = vocab_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.50194 -0.18118 -0.01173  0.18022  0.86345 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        1.947093   3.210738   0.606    0.546    
## log_density_t1     0.094071   0.138228   0.681    0.498    
## log_centrality_t1 -0.129955   0.888090  -0.146    0.884    
## log_mtld_t1        0.618599   0.066831   9.256 1.81e-14 ***
## age_diff           0.001342   0.001745   0.769    0.444    
## age_t1            -0.002625   0.002390  -1.098    0.275    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2779 on 84 degrees of freedom
##   (11 observations deleted due to missingness)
## Multiple R-squared:  0.5408, Adjusted R-squared:  0.5135 
## F-statistic: 19.79 on 5 and 84 DF,  p-value: 5.562e-13

Predicting mtld_diff

# Regress mtld_diff on log_density_t1, controlling for age_diff, age_t1 and
# log_mtld_t1.
summary(
  lm(mtld_diff ~ log_density_t1 + age_diff + age_t1 + log_mtld_t1,
     data = vocab_df)
)

## 
## Call:
## lm(formula = mtld_diff ~ log_density_t1 + age_diff + age_t1 + 
##     log_mtld_t1, data = vocab_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -18.9888  -2.9122  -0.8178   2.8028  17.0447 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    48.58482   36.67253   1.325    0.189    
## log_density_t1  2.13300    1.49502   1.427    0.157    
## age_diff       -0.01362    0.03366  -0.405    0.687    
## age_t1         -0.06136    0.04657  -1.318    0.191    
## log_mtld_t1    -6.73635    1.26220  -5.337 7.73e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.425 on 85 degrees of freedom
##   (11 observations deleted due to missingness)
## Multiple R-squared:  0.4186, Adjusted R-squared:  0.3912 
## F-statistic:  15.3 on 4 and 85 DF,  p-value: 1.842e-09

# Regress mtld_diff on log_centrality_t1, controlling for age_diff, age_t1
# and log_mtld_t1.
summary(
  lm(mtld_diff ~ log_centrality_t1 + age_diff + age_t1 + log_mtld_t1,
     data = vocab_df)
)

## 
## Call:
## lm(formula = mtld_diff ~ log_centrality_t1 + age_diff + age_t1 + 
##     log_mtld_t1, data = vocab_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -19.4459  -3.1278  -0.8113   2.6113  17.9299 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       73.10696   42.85411   1.706   0.0917 .  
## log_centrality_t1  6.30725    9.69552   0.651   0.5171    
## age_diff          -0.01472    0.03417  -0.431   0.6678    
## age_t1            -0.05891    0.04711  -1.251   0.2145    
## log_mtld_t1       -6.68263    1.30640  -5.115 1.91e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.476 on 85 degrees of freedom
##   (11 observations deleted due to missingness)
## Multiple R-squared:  0.4076, Adjusted R-squared:  0.3797 
## F-statistic: 14.62 on 4 and 85 DF,  p-value: 3.976e-09

# Regress mtld_diff on density and centrality at both time points, controlling
# for age_diff, age_t1, log_mtld_t1 and the log vocabulary size at t1 and t2.
summary(
  lm(mtld_diff ~ log_density_t1 + log_centrality_t1 + age_diff + age_t1 +
       log_mtld_t1 + log_density_t2 + log_centrality_t2 +
       log_total_words_t1 + log_total_words_t2,
     data = vocab_df)
)

## 
## Call:
## lm(formula = mtld_diff ~ log_density_t1 + log_centrality_t1 + 
##     age_diff + age_t1 + log_mtld_t1 + log_density_t2 + log_centrality_t2 + 
##     log_total_words_t1 + log_total_words_t2, data = vocab_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.4474  -3.1576  -0.7623   1.8200  20.8240 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        114.535358  93.363938   1.227   0.2235    
## log_density_t1       5.243340   2.752231   1.905   0.0604 .  
## log_centrality_t1  -34.898206  18.706609  -1.866   0.0658 .  
## age_diff            -0.008057   0.034329  -0.235   0.8150    
## age_t1              -0.052893   0.045525  -1.162   0.2487    
## log_mtld_t1         -7.875184   1.365537  -5.767 1.46e-07 ***
## log_density_t2      -4.557817   5.365463  -0.849   0.3982    
## log_centrality_t2   69.836508  32.616111   2.141   0.0353 *  
## log_total_words_t1   0.462883   1.036437   0.447   0.6564    
## log_total_words_t2   0.518431   1.039351   0.499   0.6193    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.256 on 80 degrees of freedom
##   (11 observations deleted due to missingness)
## Multiple R-squared:  0.4863, Adjusted R-squared:  0.4286 
## F-statistic: 8.416 on 9 and 80 DF,  p-value: 9.898e-09

Predicting num trigrams

# Regress log_num_trigrams_t2 on density at t1 and t2, controlling for
# log_num_trigrams_t1, age, number of words found in the norms, and mean log
# word frequency at both time points.
summary(
  lm(log_num_trigrams_t2 ~ log_density_t1 + log_num_trigrams_t1 + age_diff +
       age_t1 + log_word_in_norms_t1 + log_word_in_norms_t2 +
       mean_log_word_freq_t1 + mean_log_word_freq_t2 + log_density_t2,
     data = vocab_df)
)

## 
## Call:
## lm(formula = log_num_trigrams_t2 ~ log_density_t1 + log_num_trigrams_t1 + 
##     age_diff + age_t1 + log_word_in_norms_t1 + log_word_in_norms_t2 + 
##     mean_log_word_freq_t1 + mean_log_word_freq_t2 + log_density_t2, 
##     data = vocab_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.06441 -0.27453 -0.02732  0.19873  1.59324 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            2.154147   3.917191   0.550 0.583965    
## log_density_t1         0.281318   0.152799   1.841 0.069460 .  
## log_num_trigrams_t1    0.246264   0.061293   4.018 0.000136 ***
## age_diff               0.004315   0.003216   1.342 0.183616    
## age_t1                -0.002379   0.004289  -0.555 0.580778    
## log_word_in_norms_t1  -0.361960   0.116247  -3.114 0.002594 ** 
## log_word_in_norms_t2   1.136285   0.106095  10.710  < 2e-16 ***
## mean_log_word_freq_t1  0.052644   0.142748   0.369 0.713294    
## mean_log_word_freq_t2  0.170032   0.217109   0.783 0.435934    
## log_density_t2         0.132887   0.346978   0.383 0.702786    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4625 on 77 degrees of freedom
##   (14 observations deleted due to missingness)
## Multiple R-squared:  0.9233, Adjusted R-squared:  0.9143 
## F-statistic:   103 on 9 and 77 DF,  p-value: < 2.2e-16

# Regress log_num_trigrams_t2 on centrality at t1 and t2, controlling for
# log_num_trigrams_t1, age, number of words found in the norms, and mean log
# word frequency at both time points.
summary(
  lm(log_num_trigrams_t2 ~ log_centrality_t1 + log_num_trigrams_t1 +
       age_diff + age_t1 + log_word_in_norms_t1 + log_word_in_norms_t2 +
       mean_log_word_freq_t1 + mean_log_word_freq_t2 + log_centrality_t2,
     data = vocab_df)
)

## 
## Call:
## lm(formula = log_num_trigrams_t2 ~ log_centrality_t1 + log_num_trigrams_t1 + 
##     age_diff + age_t1 + log_word_in_norms_t1 + log_word_in_norms_t2 + 
##     mean_log_word_freq_t1 + mean_log_word_freq_t2 + log_centrality_t2, 
##     data = vocab_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.85743 -0.27263  0.00408  0.17887  1.79782 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           23.329689   5.670422   4.114 9.65e-05 ***
## log_centrality_t1      2.129950   1.050609   2.027  0.04609 *  
## log_num_trigrams_t1    0.245509   0.056075   4.378 3.72e-05 ***
## age_diff               0.003976   0.002971   1.338  0.18480    
## age_t1                -0.001419   0.003938  -0.360  0.71960    
## log_word_in_norms_t1  -0.241495   0.110509  -2.185  0.03191 *  
## log_word_in_norms_t2   0.893284   0.113694   7.857 1.92e-11 ***
## mean_log_word_freq_t1  0.009115   0.150576   0.061  0.95189    
## mean_log_word_freq_t2 -0.250680   0.239485  -1.047  0.29849    
## log_centrality_t2      6.751832   2.409092   2.803  0.00641 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4227 on 77 degrees of freedom
##   (14 observations deleted due to missingness)
## Multiple R-squared:  0.936,  Adjusted R-squared:  0.9285 
## F-statistic:   125 on 9 and 77 DF,  p-value: < 2.2e-16

# Regress log_num_trigrams_t2 on centrality and density at both time points,
# controlling for log_num_trigrams_t1, age, number of words found in the
# norms, and mean log word frequency at both time points.
# The formula previously listed log_centrality_t2 twice; the repeated term
# added nothing to the model (it appears once in the coefficient table), so
# it is written once here.
summary(
  lm(log_num_trigrams_t2 ~ log_centrality_t1 + log_centrality_t2 +
       log_density_t1 + log_density_t2 + log_num_trigrams_t1 + age_diff +
       age_t1 + log_word_in_norms_t1 + log_word_in_norms_t2 +
       mean_log_word_freq_t1 + mean_log_word_freq_t2,
     data = vocab_df)
)

## 
## Call:
## lm(formula = log_num_trigrams_t2 ~ log_centrality_t1 + log_centrality_t2 + 
##     log_density_t1 + log_density_t2 + log_num_trigrams_t1 + age_diff + 
##     age_t1 + log_word_in_norms_t1 + log_word_in_norms_t2 + mean_log_word_freq_t1 + 
##     mean_log_word_freq_t2 + log_centrality_t2, data = vocab_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.91379 -0.19204  0.00719  0.18141  1.41606 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           42.185327   9.243930   4.564 1.93e-05 ***
## log_centrality_t1      0.963013   1.980311   0.486 0.628177    
## log_centrality_t2     13.541510   3.746847   3.614 0.000543 ***
## log_density_t1         0.097392   0.259600   0.375 0.708598    
## log_density_t2        -1.192939   0.484249  -2.463 0.016052 *  
## log_num_trigrams_t1    0.210312   0.056193   3.743 0.000354 ***
## age_diff               0.003224   0.002903   1.110 0.270345    
## age_t1                -0.001968   0.003859  -0.510 0.611593    
## log_word_in_norms_t1  -0.155477   0.112751  -1.379 0.172014    
## log_word_in_norms_t2   0.849467   0.111794   7.598 6.91e-11 ***
## mean_log_word_freq_t1  0.134994   0.173001   0.780 0.437666    
## mean_log_word_freq_t2 -0.447029   0.261319  -1.711 0.091277 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4106 on 75 degrees of freedom
##   (14 observations deleted due to missingness)
## Multiple R-squared:  0.9411, Adjusted R-squared:  0.9325 
## F-statistic:   109 on 11 and 75 DF,  p-value: < 2.2e-16

Predicting freq trigrams

# Regress mean_log_freq_trigrams_t2 on density at t1 and t2, controlling for
# log_num_trigrams_t1, age, number of words found in the norms, and mean log
# word frequency at both time points.
# NOTE(review): the t1 control is log_num_trigrams_t1, while
# mean_log_freq_trigrams_t1 is joined into vocab_df but never used -- confirm
# which baseline was intended.
summary(
  lm(mean_log_freq_trigrams_t2 ~ log_density_t1 + log_num_trigrams_t1 +
       age_diff + age_t1 + log_word_in_norms_t1 + log_word_in_norms_t2 +
       mean_log_word_freq_t1 + mean_log_word_freq_t2 + log_density_t2,
     data = vocab_df)
)

## 
## Call:
## lm(formula = mean_log_freq_trigrams_t2 ~ log_density_t1 + log_num_trigrams_t1 + 
##     age_diff + age_t1 + log_word_in_norms_t1 + log_word_in_norms_t2 + 
##     mean_log_word_freq_t1 + mean_log_word_freq_t2 + log_density_t2, 
##     data = vocab_df)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.179659 -0.032371 -0.002094  0.035780  0.150014 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            0.4536048  0.4956081   0.915   0.3629    
## log_density_t1        -0.0038107  0.0193324  -0.197   0.8443    
## log_num_trigrams_t1   -0.0133411  0.0077549  -1.720   0.0894 .  
## age_diff              -0.0001089  0.0004069  -0.268   0.7896    
## age_t1                 0.0002166  0.0005426   0.399   0.6909    
## log_word_in_norms_t1   0.0230250  0.0147078   1.565   0.1216    
## log_word_in_norms_t2  -0.1103722  0.0134232  -8.222 3.79e-12 ***
## mean_log_word_freq_t1  0.0010097  0.0180606   0.056   0.9556    
## mean_log_word_freq_t2  0.0098051  0.0274689   0.357   0.7221    
## log_density_t2         0.0842354  0.0439001   1.919   0.0587 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.05852 on 77 degrees of freedom
##   (14 observations deleted due to missingness)
## Multiple R-squared:  0.8948, Adjusted R-squared:  0.8825 
## F-statistic: 72.74 on 9 and 77 DF,  p-value: < 2.2e-16

# Regress mean_log_freq_trigrams_t2 on centrality at t1 and t2, controlling
# for log_num_trigrams_t1, age, number of words found in the norms, and mean
# log word frequency at both time points.
# NOTE(review): the t1 control is log_num_trigrams_t1, while
# mean_log_freq_trigrams_t1 is joined into vocab_df but never used -- confirm
# which baseline was intended.
summary(
  lm(mean_log_freq_trigrams_t2 ~ log_centrality_t1 + log_num_trigrams_t1 +
       age_diff + age_t1 + log_word_in_norms_t1 + log_word_in_norms_t2 +
       mean_log_word_freq_t1 + mean_log_word_freq_t2 + log_centrality_t2,
     data = vocab_df)
)

## 
## Call:
## lm(formula = mean_log_freq_trigrams_t2 ~ log_centrality_t1 + 
##     log_num_trigrams_t1 + age_diff + age_t1 + log_word_in_norms_t1 + 
##     log_word_in_norms_t2 + mean_log_word_freq_t1 + mean_log_word_freq_t2 + 
##     log_centrality_t2, data = vocab_df)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.236366 -0.028185 -0.001167  0.027665  0.139214 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            0.3655768  0.7971563   0.459   0.6478    
## log_centrality_t1     -0.1132254  0.1476962  -0.767   0.4457    
## log_num_trigrams_t1   -0.0164908  0.0078831  -2.092   0.0397 *  
## age_diff              -0.0001359  0.0004177  -0.325   0.7458    
## age_t1                 0.0001285  0.0005536   0.232   0.8171    
## log_word_in_norms_t1   0.0207918  0.0155355   1.338   0.1847    
## log_word_in_norms_t2  -0.0915469  0.0159832  -5.728 1.88e-07 ***
## mean_log_word_freq_t1  0.0076215  0.0211682   0.360   0.7198    
## mean_log_word_freq_t2  0.0428827  0.0336671   1.274   0.2066    
## log_centrality_t2     -0.1532772  0.3386737  -0.453   0.6521    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.05943 on 77 degrees of freedom
##   (14 observations deleted due to missingness)
## Multiple R-squared:  0.8915, Adjusted R-squared:  0.8788 
## F-statistic: 70.29 on 9 and 77 DF,  p-value: < 2.2e-16

Exploring individual words

# t1 word types per child: lower-case each gloss, sum its counts within child,
# and keep words that reach the MINWORDSFORVOCAB threshold. The result has one
# row per child-word pair.
types_clean <- all_types %>%
  filter(tbin == "t1") %>%
  group_by(target_child_id, gloss_clean = tolower(gloss)) %>%
  summarize(count = sum(count)) %>%
  filter(count >= MINWORDSFORVOCAB)
 
# For each word, n is its number of rows in types_clean (one row per
# child-word pair, so the number of children with that word). Keep words with
# n of at least 5 that have a centrality norm, then bin them into deciles of
# centrality and of semantic density.
word_by_decile <- count(ungroup(types_clean), gloss_clean) %>%
  filter(n >= 5) %>%
  left_join(density_norms, by = c("gloss_clean" = "word")) %>%
  filter(!is.na(centrality)) %>%
  mutate(
    decile_centrality = ntile(centrality, 10),
    decile_density = ntile(semantic_density, 10)
  )

# Signed and absolute gap between each word's centrality decile and density
# decile, sorted so the largest mismatches come first. Columns 2 through 6
# are dropped by position.
word_by_decile_diff <- word_by_decile %>%
  # Disabled: filter(decile_centrality != decile_density) %>%
  mutate(
    decile_diff = decile_centrality - decile_density,
    abs_decile_diff = abs(decile_diff)
  ) %>%
  # Disabled: filter(abs_decile_diff > 4) %>%
  select(-(2:6)) %>%
  arrange(desc(abs_decile_diff))

# Render the result as an interactive table.
DT::datatable(word_by_decile_diff)

Centrality at t1 vs. t2

# Scatter plot of log centrality at t1 against t2, one point per row of
# vocab_measures, with a fitted linear trend.
vocab_measures %>%
  ggplot(aes(x = log_centrality_t1, y = log_centrality_t2)) +
  geom_point() +
  geom_smooth(method = "lm")

# Same plot for log semantic density.
vocab_measures %>%
  ggplot(aes(x = log_density_t1, y = log_density_t2)) +
  geom_point() +
  geom_smooth(method = "lm")

Semantic density analyses

2018-05-16

Regressions

Predicting log_mtld_t2

Predicting mtld_diff

Predicting num trigrams

Predicting freq trigrams

Exploring individual words

Centrality at t1 vs. t2