Regression analysis for each word

library(knitr)

# Global knitr chunk options: echo the code, hide messages and warnings,
# and disable caching and code tidying.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can
# be reassigned, whereas TRUE and FALSE are reserved words.
opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE,
               error = FALSE, cache = FALSE, tidy = FALSE)

library(tidyverse)
library(feather)
library(langcog)
library(modelr)
library(broom)
library(corrplot)


# Make the classic theme (base font size 10) the default for every ggplot
# drawn after this point.
theme_set(theme_classic(base_size = 10))

Predicting mtld at t2

MODEL: lm(log_mtld_t2 ~ know_word_at_t1 + log_mtld_t1 + age_t1 + age_diff + log(n_transcripts_t1) + log(n_transcripts_t2), complete_df)

# Load the per-word coefficient table and lower-case the word labels, so they
# line up with the lower-cased glosses they are joined against further down.
word_coeffs_min5_t2 <- mutate(
  read_csv("data/word_coeffs_log_mtld_t2.csv"),
  word = tolower(word)
)

# Histogram of the per-word t-values with red reference lines at t = -2 and
# t = 2.
# The reference lines are given as constants instead of being mapped through
# aes(): a constant inside aes() is recycled over the layer data, so one
# overplotted line is drawn per row of word_coeffs_min5_t2.
# bins = 30 is the ggplot2 default, written out explicitly.
ggplot(word_coeffs_min5_t2, aes(x = t)) +
  geom_histogram(bins = 30) +
  geom_vline(xintercept = c(-2, 2), color = "red") +
  ggtitle("t-distribution ") +
  theme_classic()

 # Interactive table of every word, sorted from the largest t-value down.
 DT::datatable(arrange(word_coeffs_min5_t2, desc(t)))

Sanity Check

Do kids who have a high mean t have high mtld at t2? Yes.

# Minimum summed count a word needs at t1 to be kept for a child
# (presumably the threshold for counting the word as "known" - confirm).
MINWORDSFORVOCAB <- 5

# NOTE(review): absolute, machine-specific path; the file will not be found on
# another machine without editing it.
all_types <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/1_mtld_measure/data/target_types_for_MTLD_kids_600_900.csv")

# Per child, total count of each lower-cased gloss in the t1 bin, keeping only
# glosses at or above the threshold.
# ungroup() drops the target_child_id grouping that summarize() would
# otherwise leave attached, so later verbs do not operate per child by accident.
word_counts <- all_types %>%
  filter(tbin == "t1") %>%
  mutate(gloss_clean = tolower(gloss)) %>%
  group_by(target_child_id, gloss_clean) %>%
  summarize(count = sum(count)) %>%
  ungroup() %>%
  filter(count >= MINWORDSFORVOCAB)

# For every child, the sum and mean of the t-values of the words retained in
# word_counts.
# Words without a coefficient get t = NA from the left join and are skipped by
# na.rm = TRUE (spelled out, since T is a reassignable variable).
# The columns gloss_clean and count need no explicit removal: summarize()
# keeps only the grouping column and the two summaries.
# NOTE(review): the word list printed later in this file shows repeated
# entries (e.g. "mommy"); if word_coeffs_min5_t2 itself holds repeated words
# after lower-casing, those words are matched more than once here - confirm.
t1_word_counts_with_ts <- word_counts %>%
  left_join(select(word_coeffs_min5_t2, word, t),
            by = c("gloss_clean" = "word")) %>%
  group_by(target_child_id) %>%
  summarize(sum_t = sum(t, na.rm = TRUE),
            mean_t = mean(t, na.rm = TRUE))

# Per-child MTLD and age table: natural log of MTLD at both time points, the
# two ages, the age gap between t2 and t1, and the corpus name.
mtld_age <- transmute(
  read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/1_mtld_measure/data/groups_info_600_900_corrected.csv"),
  target_child_id,
  log_mtld_t1 = log(mtld_t1),
  log_mtld_t2 = log(mtld_t2),
  age_t1,
  age_t2,
  age_diff = age_t2 - age_t1,
  corpus_name
)

# Attach the MTLD/age columns to each child's t summary, then drop the id.
# target_child_id is the only column the two tables share, so the join key is
# named explicitly instead of being inferred by a natural join.
t1_word_counts_with_ts_mtld <- t1_word_counts_with_ts %>%
  left_join(mtld_age, by = "target_child_id") %>%
  select(-target_child_id)

# Scatter plot of each child's mean word t-value against log MTLD at t2,
# overlaid with a fitted linear regression line.
ggplot(t1_word_counts_with_ts_mtld, aes(mean_t, log_mtld_t2)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

Does it hold when controlling for MTLD at t1, transcript length, mean word frequency, and age? Yes.

# Per-child transcript lengths, log-transformed at both time points; only the
# logged columns (plus whatever other columns the file holds) are kept.
# Both raw columns are dropped. The second one previously lacked its minus
# sign, so it was re-selected rather than removed and stayed in the table.
transcript_length <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/transcript_length_by_kid.csv") %>%
  mutate(log_transcript_length_t1 = log(transcript_length_t1),
         log_transcript_length_t2 = log(transcript_length_t2)) %>%
  select(-transcript_length_t1, -transcript_length_t2)

# Per-child frequency information, read from a machine-specific absolute path.
freq_info <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/frequency_based_on_input_by_kid.csv")

# Rebuild the modelling table one join at a time: t summaries, then MTLD/age,
# then transcript length, then frequency. Each step is a natural left join on
# whatever columns the two tables have in common.
t1_word_counts_with_ts_mtld <- left_join(t1_word_counts_with_ts, mtld_age)
t1_word_counts_with_ts_mtld <- left_join(t1_word_counts_with_ts_mtld,
                                         transcript_length)
t1_word_counts_with_ts_mtld <- left_join(t1_word_counts_with_ts_mtld,
                                         freq_info)

# Does mean_t predict log MTLD at t2 beyond t1 MTLD, transcript length at both
# time points, mean frequency at both time points, and age?
summary(
  lm(
    log_mtld_t2 ~ log_mtld_t1 + mean_t +
      log_transcript_length_t1 + log_transcript_length_t2 +
      mean_freq_t1 + mean_freq_t2 + age_diff + age_t1,
    data = t1_word_counts_with_ts_mtld
  )
)

## 
## Call:
## lm(formula = log_mtld_t2 ~ log_mtld_t1 + mean_t + log_transcript_length_t1 + 
##     log_transcript_length_t2 + mean_freq_t1 + mean_freq_t2 + 
##     age_diff + age_t1, data = t1_word_counts_with_ts_mtld)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.52342 -0.09859 -0.00054  0.08263  0.82875 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               4.427e+00  1.381e+00   3.206  0.00195 ** 
## log_mtld_t1               5.490e-01  4.701e-02  11.678  < 2e-16 ***
## mean_t                    3.087e-01  5.644e-02   5.470 5.13e-07 ***
## log_transcript_length_t1 -9.636e-02  2.870e-02  -3.357  0.00121 ** 
## log_transcript_length_t2  1.124e-01  2.319e-02   4.848 6.13e-06 ***
## mean_freq_t1             -1.587e-02  2.874e-02  -0.552  0.58230    
## mean_freq_t2              8.300e-02  3.984e-02   2.084  0.04044 *  
## age_diff                 -7.947e-05  1.347e-03  -0.059  0.95309    
## age_t1                   -5.850e-03  1.755e-03  -3.332  0.00131 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1974 on 79 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.8025, Adjusted R-squared:  0.7825 
## F-statistic: 40.13 on 8 and 79 DF,  p-value: < 2.2e-16

# Same model as the previous one but without mean_freq_t2.
summary(
  lm(
    log_mtld_t2 ~ log_mtld_t1 + mean_t +
      log_transcript_length_t1 + log_transcript_length_t2 +
      mean_freq_t1 + age_diff + age_t1,
    data = t1_word_counts_with_ts_mtld
  )
)

## 
## Call:
## lm(formula = log_mtld_t2 ~ log_mtld_t1 + mean_t + log_transcript_length_t1 + 
##     log_transcript_length_t2 + mean_freq_t1 + age_diff + age_t1, 
##     data = t1_word_counts_with_ts_mtld)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.5082 -0.1446 -0.0081  0.1051  0.9293 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               4.7411776  1.6413724   2.889 0.004949 ** 
## log_mtld_t1               0.5597619  0.0555810  10.071 5.40e-16 ***
## mean_t                    0.3696902  0.0630847   5.860 9.29e-08 ***
## log_transcript_length_t1 -0.1318427  0.0338742  -3.892 0.000201 ***
## log_transcript_length_t2  0.1058127  0.0278141   3.804 0.000273 ***
## mean_freq_t1             -0.0344203  0.0342822  -1.004 0.318319    
## age_diff                  0.0005582  0.0016055   0.348 0.728951    
## age_t1                   -0.0046047  0.0021082  -2.184 0.031803 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.24 on 82 degrees of freedom
## Multiple R-squared:  0.7108, Adjusted R-squared:  0.6861 
## F-statistic:  28.8 on 7 and 82 DF,  p-value: < 2.2e-16

Predicting t-value of a word with other measures

# Word-level predictor tables that are joined onto the per-word coefficients
# below. The models further down use the columns log_freq, centrality, density
# and mean_dist.
# NOTE(review): which file supplies which column is not visible here; going by
# the file names, freq presumably holds log_freq, density_norms holds
# density/centrality, and embedding_dist holds mean_dist - confirm.
# NOTE(review): the first two paths are absolute and machine-specific.
freq <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/childes_adult_word_freq.csv")

density_norms <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/bills_density_norms.csv")

embedding_dist <- read_csv("data/childes_embedding_dist_by_word.csv")

# Concreteness table reduced to two columns: Word (column renamed to `word`,
# values untouched) and Conc.M.
concreteness <- select(
  read_csv("/Users/mollylewis/Documents/research/Projects/2_published/ref_complex/corpus/brysbaert_database/brysbaert_corpus.csv"),
  word = Word,
  Conc.M
)

# Concept table reduced to `word` and Mean_Distinct_No_Tax.
# `word` is the lower-cased Concept with everything from the first underscore
# onwards removed; str_extract() does this for the whole column at once.
# The two needed columns are selected by name. The earlier intermediate select
# by position (14:33) was discarded by the final select anyway, and would
# break if the file's column order changed.
# NOTE(review): stripping the "_..." suffix can map several concepts onto the
# same word, which would duplicate rows when this table is joined by word -
# confirm whether that is intended.
concepts <- read_tsv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/CONCS_brm.txt") %>%
  mutate(word = str_extract(tolower(Concept), "^[^_]*")) %>%
  select(word, Mean_Distinct_No_Tax)

# Fold every word-level predictor table onto the coefficient table, in order,
# using natural left joins (keys are whatever columns each pair shares).
# NOTE(review): the word list printed later contains repeated words, so at
# least one of these inputs appears to have non-unique keys - confirm.
word_coeffs_min5_t2_with_vars <- purrr::reduce(
  list(density_norms, freq, embedding_dist, concepts, concreteness),
  left_join,
  .init = mutate(word_coeffs_min5_t2, word = tolower(word))
)

# Word t-value regressed on centrality, with log frequency in the model.
summary(
  lm(t ~ centrality + log_freq, data = word_coeffs_min5_t2_with_vars)
)

## 
## Call:
## lm(formula = t ~ centrality + log_freq, data = word_coeffs_min5_t2_with_vars)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.27076 -0.42523  0.04564  0.46364  1.93046 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.117495   0.149479  -0.786    0.432    
## centrality   0.673556   0.977164   0.689    0.491    
## log_freq     0.061839   0.007709   8.022 1.89e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6183 on 1746 degrees of freedom
##   (468 observations deleted due to missingness)
## Multiple R-squared:  0.03934,    Adjusted R-squared:  0.03824 
## F-statistic: 35.75 on 2 and 1746 DF,  p-value: 6.065e-16

# Word t-value regressed on density, with log frequency in the model.
summary(
  lm(t ~ density + log_freq, data = word_coeffs_min5_t2_with_vars)
)

## 
## Call:
## lm(formula = t ~ density + log_freq, data = word_coeffs_min5_t2_with_vars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.2701 -0.4243  0.0505  0.4580  1.9063 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.843e-02  5.458e-02  -0.338    0.736    
## density     -2.347e-06  2.050e-05  -0.114    0.909    
## log_freq     6.313e-02  7.492e-03   8.427   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6184 on 1746 degrees of freedom
##   (468 observations deleted due to missingness)
## Multiple R-squared:  0.03909,    Adjusted R-squared:  0.03799 
## F-statistic: 35.51 on 2 and 1746 DF,  p-value: 7.642e-16

# Word t-value regressed on mean_dist, with log frequency in the model.
summary(
  lm(t ~ mean_dist + log_freq, data = word_coeffs_min5_t2_with_vars)
)

## 
## Call:
## lm(formula = t ~ mean_dist + log_freq, data = word_coeffs_min5_t2_with_vars)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.33427 -0.43370  0.07415  0.44955  1.91892 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.103186   0.054171   1.905  0.05696 .  
## mean_dist   0.608182   0.099960   6.084 1.43e-09 ***
## log_freq    0.029573   0.009193   3.217  0.00132 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6173 on 1807 degrees of freedom
##   (407 observations deleted due to missingness)
## Multiple R-squared:  0.05997,    Adjusted R-squared:  0.05893 
## F-statistic: 57.64 on 2 and 1807 DF,  p-value: < 2.2e-16

# Standardized model with the mean_dist x log_freq interaction.
# scale() is applied inside the formula, i.e. to each full column before rows
# with missing values are dropped by lm().
summary(
  lm(
    scale(t) ~ scale(mean_dist) * scale(log_freq),
    data = word_coeffs_min5_t2_with_vars
  )
)

## 
## Call:
## lm(formula = scale(t) ~ scale(mean_dist) * scale(log_freq), data = word_coeffs_min5_t2_with_vars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.7817 -0.6848  0.1468  0.7016  2.9206 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      -0.06764    0.03001  -2.253 0.024349 *  
## scale(mean_dist)                  0.12739    0.03286   3.877 0.000110 ***
## scale(log_freq)                   0.10828    0.03292   3.289 0.001026 ** 
## scale(mean_dist):scale(log_freq)  0.11552    0.03307   3.493 0.000489 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.987 on 1806 degrees of freedom
##   (407 observations deleted due to missingness)
## Multiple R-squared:  0.06628,    Adjusted R-squared:  0.06473 
## F-statistic: 42.73 on 3 and 1806 DF,  p-value: < 2.2e-16

mean_dist x freq interaction:

# Visualize the interaction: split words into two log-frequency bins with
# ntile(), drop words without a frequency, and plot t against mean_dist with a
# separate linear fit per bin.
word_coeffs_min5_t2_with_vars %>%
  mutate(freq_bin = as.factor(ntile(log_freq, 2))) %>%
  filter(!is.na(freq_bin)) %>%
  # filter(mean_dist > .3) %>%
  ggplot(aes(mean_dist, t, group = freq_bin, color = freq_bin)) +
  geom_point() +
  geom_smooth(method = "lm")

# Words with mean_dist below .3 and n_know above 4.
word_coeffs_min5_t2_with_vars %>%
  filter(mean_dist < .3, n_know > 4) %>%
  pull(word)

##   [1] "toys"       "uh"         "i"          "cake"       "cookie"    
##   [6] "mommy"      "um"         "mommy"      "purple"     "huh"       
##  [11] "doggie"     "hold"       "blanket"    "hi"         "all_gone"  
##  [16] "boom"       "broke"      "clock"      "cream"      "doll"      
##  [21] "dolly"      "draw"       "eh"         "eye"        "fall"      
##  [26] "fix"        "flower"     "girl"       "hammer"     "happy"     
##  [31] "horsie"     "ice+cream"  "jump"       "mama"       "moo"       
##  [36] "penny"      "pocket"     "puzzle"     "swimming"   "tower"     
##  [41] "turtle"     "wake"       "block"      "cow"        "daddy"     
##  [46] "pants"      "wanna"      "bunny"      "bye"        "cry"       
##  [51] "daddy"      "daddy's"    "ha"         "ma"         "rabbit"    
##  [56] "tiger"      "woof"       "light"      "pooh"       "bath"      
##  [61] "boots"      "close"      "da"         "egg"        "monkey"    
##  [66] "row"        "uhoh"       "whee"       "dada"       "dinosaur"  
##  [71] "quack"      "star"       "wee"        "bike"       "circle"    
##  [76] "la"         "snake"      "triangle"   "bricks"     "microphone"
##  [81] "ow"         "spider"     "paint"      "seesaw"     "kitty"     
##  [86] "yummy"      "tail"       "fit"        "fly"        "frog"      
##  [91] "leg"        "penguin"    "tractor"    "panda"      "sheep"     
##  [96] "dada"       "birdie"     "mum"        "mummy"      "mummy's"   
## [101] "mummie"     "mummie"     "er"         "neenaw"

# Words with mean_dist below .3 and n_know above 4.
# NOTE(review): this chunk applies the same filters as the chunk just before
# it and prints the same list - confirm whether a different cutoff was meant.
word_coeffs_min5_t2_with_vars %>%
  filter(mean_dist < .3, n_know > 4) %>%
  pull(word)

##   [1] "toys"       "uh"         "i"          "cake"       "cookie"    
##   [6] "mommy"      "um"         "mommy"      "purple"     "huh"       
##  [11] "doggie"     "hold"       "blanket"    "hi"         "all_gone"  
##  [16] "boom"       "broke"      "clock"      "cream"      "doll"      
##  [21] "dolly"      "draw"       "eh"         "eye"        "fall"      
##  [26] "fix"        "flower"     "girl"       "hammer"     "happy"     
##  [31] "horsie"     "ice+cream"  "jump"       "mama"       "moo"       
##  [36] "penny"      "pocket"     "puzzle"     "swimming"   "tower"     
##  [41] "turtle"     "wake"       "block"      "cow"        "daddy"     
##  [46] "pants"      "wanna"      "bunny"      "bye"        "cry"       
##  [51] "daddy"      "daddy's"    "ha"         "ma"         "rabbit"    
##  [56] "tiger"      "woof"       "light"      "pooh"       "bath"      
##  [61] "boots"      "close"      "da"         "egg"        "monkey"    
##  [66] "row"        "uhoh"       "whee"       "dada"       "dinosaur"  
##  [71] "quack"      "star"       "wee"        "bike"       "circle"    
##  [76] "la"         "snake"      "triangle"   "bricks"     "microphone"
##  [81] "ow"         "spider"     "paint"      "seesaw"     "kitty"     
##  [86] "yummy"      "tail"       "fit"        "fly"        "frog"      
##  [91] "leg"        "penguin"    "tractor"    "panda"      "sheep"     
##  [96] "dada"       "birdie"     "mum"        "mummy"      "mummy's"   
## [101] "mummie"     "mummie"     "er"         "neenaw"

Adding in concreteness (alongside frequency):

# Word t-value regressed on concreteness (Conc.M) and log frequency.
summary(
  lm(t ~ Conc.M + log_freq, data = word_coeffs_min5_t2_with_vars)
)

## 
## Call:
## lm(formula = t ~ Conc.M + log_freq, data = word_coeffs_min5_t2_with_vars)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.29963 -0.42188  0.04347  0.44444  1.85038 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.60635    0.11726  -5.171 2.68e-07 ***
## Conc.M       0.07078    0.01746   4.054 5.33e-05 ***
## log_freq     0.11107    0.01002  11.084  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6041 on 1331 degrees of freedom
##   (883 observations deleted due to missingness)
## Multiple R-squared:  0.0848, Adjusted R-squared:  0.08342 
## F-statistic: 61.66 on 2 and 1331 DF,  p-value: < 2.2e-16

# Word t-value regressed on mean_dist, concreteness and log frequency together.
summary(
  lm(
    t ~ mean_dist + Conc.M + log_freq,
    data = word_coeffs_min5_t2_with_vars
  )
)

## 
## Call:
## lm(formula = t ~ mean_dist + Conc.M + log_freq, data = word_coeffs_min5_t2_with_vars)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.40051 -0.41944  0.05586  0.42202  1.86395 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.51575    0.12211  -4.224 2.57e-05 ***
## mean_dist    0.61505    0.11123   5.530 3.86e-08 ***
## Conc.M       0.08230    0.01752   4.697 2.91e-06 ***
## log_freq     0.07633    0.01236   6.175 8.82e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5965 on 1319 degrees of freedom
##   (894 observations deleted due to missingness)
## Multiple R-squared:  0.1064, Adjusted R-squared:  0.1043 
## F-statistic: 52.33 on 3 and 1319 DF,  p-value: < 2.2e-16

# Concreteness plus the mean_dist x log_freq interaction (unscaled predictors).
summary(
  lm(
    t ~ Conc.M + mean_dist * log_freq,
    data = word_coeffs_min5_t2_with_vars
  )
)

## 
## Call:
## lm(formula = t ~ Conc.M + mean_dist * log_freq, data = word_coeffs_min5_t2_with_vars)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.39693 -0.40173  0.06233  0.41745  1.80106 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -0.41342    0.13083  -3.160  0.00161 ** 
## Conc.M              0.09192    0.01805   5.091 4.07e-07 ***
## mean_dist          -0.33660    0.45460  -0.740  0.45917    
## log_freq            0.05207    0.01669   3.119  0.00185 ** 
## mean_dist:log_freq  0.13685    0.06339   2.159  0.03104 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5956 on 1318 degrees of freedom
##   (894 observations deleted due to missingness)
## Multiple R-squared:  0.1095, Adjusted R-squared:  0.1068 
## F-statistic: 40.52 on 4 and 1318 DF,  p-value: < 2.2e-16

Concreteness and mean distance both predict t-score: words that are similar to other words and highly concrete have large t-scores.