Regression analysis for each word

library(knitr)

# Global knitr chunk options: echo the code, hide messages and warnings,
# stop on errors (error = FALSE), and disable caching and code tidying.
# TRUE/FALSE are spelled out because T and F are ordinary variables that
# can be reassigned.
opts_chunk$set(
  echo = TRUE, message = FALSE, warning = FALSE,
  error = FALSE, cache = FALSE, tidy = FALSE
)

library(tidyverse)
library(feather)
library(langcog)
library(modelr)
library(broom)
library(corrplot)


# Default ggplot2 theme for the document: classic theme, base font size 10.
theme_set(
  theme_classic(base_size = 10)
)

Predicting mtld diff

MODEL: lm(mtld_diff ~ know_word_at_t1 + age_t1 + age_diff + log(n_transcripts_t1) + log(n_transcripts_t2), complete_df)

# Per-word coefficient table read from CSV, restricted to rows with
# n_know > 3, with `word` lower-cased so it can be matched against other
# lower-cased word lists.
# NOTE(review): the object name says "min5" but the cutoff used here is
# n_know > 3 - confirm which threshold is intended.
word_coeffs_min5_t2 <-
  read_csv("data/word_coeffs_log_mtld_diff_600_900.csv") %>%
  filter(n_know > 3) %>%
  mutate(word = tolower(word))

# Histogram of the per-word t-values with red reference lines at t = -2 and
# t = 2. The reference positions are passed as the `xintercept` parameter
# rather than as constants inside aes(): a constant inside aes() is evaluated
# against the inherited plot data, so the same line gets drawn once per row.
ggplot(word_coeffs_min5_t2, aes(x = t)) +
  geom_histogram() +
  geom_vline(xintercept = c(-2, 2), color = "red") +
  ggtitle("t-distribution ") +
  theme_classic()

 # Interactive table of the per-word coefficients, largest t first.
 word_coeffs_min5_t2 %>%
   arrange(desc(t)) %>%
   DT::datatable()

Sanity Check

Do kids who have a high mean t also have a high mtld diff? Yes.

# Long table with (at least) target_child_id, tbin, gloss and count columns.
all_types <- read_csv(
  "/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/1_mtld_measure/data/target_types_for_MTLD_kids_600_900.csv"
)

# Minimum summed t1 count a word needs to be kept for a child.
MINWORDSFORVOCAB <- 5

# For each child, sum `count` per lower-cased gloss within the t1 bin and
# keep only the words that reach MINWORDSFORVOCAB.
word_counts <- all_types %>%
  filter(tbin == "t1") %>%
  group_by(target_child_id, gloss_clean = tolower(gloss)) %>%
  summarize(count = sum(count)) %>%
  filter(count >= MINWORDSFORVOCAB)

# Attach each word's t-value to the child-by-word table and collapse to one
# row per child holding the sum and mean of the t-values of that child's
# words. Words without a t-value (no match in word_coeffs_min5_t2) are
# ignored via na.rm; a child with no matched words gets sum_t = 0 and
# mean_t = NaN.
# The former select(-gloss_clean, -count) step was dropped: summarize()
# only keeps the grouping column and the two summaries anyway.
t1_word_counts_with_ts <- word_counts %>%
  left_join(
    select(word_coeffs_min5_t2, word, t),
    by = c("gloss_clean" = "word")
  ) %>%
  group_by(target_child_id) %>%
  summarize(
    sum_t = sum(t, na.rm = TRUE),
    mean_t = mean(t, na.rm = TRUE)
  )

# Per-child outcome and age variables:
#   mtld_diff = log(mtld_t2) - log(mtld_t1)
#   age_diff  = age_t2 - age_t1
mtld_age <- read_csv(
  "/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/1_mtld_measure/data/groups_info_600_900_corrected.csv"
) %>%
  mutate(
    age_diff = age_t2 - age_t1,
    mtld_diff = log(mtld_t2) - log(mtld_t1)
  ) %>%
  select(
    target_child_id, mtld_diff,
    age_t1, age_t2, age_diff, corpus_name
  )

# Add the mtld/age variables to the per-child t summaries, then drop the id.
# The join key is stated explicitly: target_child_id is the only column the
# two tables share, so this matches the previous implicit natural join but
# no longer depends on which column names happen to coincide.
t1_word_counts_with_ts_mtld <- t1_word_counts_with_ts %>%
  left_join(mtld_age, by = "target_child_id") %>%
  select(-target_child_id)

# Scatter of mtld_diff against each child's mean t, with a linear fit.
ggplot(t1_word_counts_with_ts_mtld, aes(x = mean_t, y = mtld_diff)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

Does it hold controlling for stuff? Yes.

# Per-child transcript lengths, kept on the log scale only.
# Both raw length columns are dropped. The previous
# select(-transcript_length_t1, transcript_length_t2) was missing the minus
# on the second column, so the raw t2 length was silently kept.
transcript_length <- read_csv(
  "/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/transcript_length_by_kid.csv"
) %>%
  mutate(
    log_transcript_length_t1 = log(transcript_length_t1),
    log_transcript_length_t2 = log(transcript_length_t2)
  ) %>%
  select(-transcript_length_t1, -transcript_length_t2)

# Per-child frequency covariates (supplies mean_freq_t1 for the model below).
freq_info <- read_csv(
  "/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/frequency_based_on_input_by_kid.csv"
)

# Rebuild the per-child table, this time keeping the id and left-joining
# every covariate table in turn (natural joins on the shared columns).
t1_word_counts_with_ts_mtld <- purrr::reduce(
  list(t1_word_counts_with_ts, mtld_age, transcript_length, freq_info),
  left_join
)


# mtld_diff regressed on mean_t with transcript length, input frequency and
# age as covariates.
summary(
  lm(
    mtld_diff ~ mean_t + log_transcript_length_t1 + mean_freq_t1 +
      age_diff + age_t1,
    data = t1_word_counts_with_ts_mtld
  )
)

## 
## Call:
## lm(formula = mtld_diff ~ mean_t + log_transcript_length_t1 + 
##     mean_freq_t1 + age_diff + age_t1, data = t1_word_counts_with_ts_mtld)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.75136 -0.13993  0.01353  0.15027  0.76755 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               2.503187   1.882216   1.330    0.187    
## mean_t                    0.640568   0.091862   6.973 6.57e-10 ***
## log_transcript_length_t1 -0.024128   0.020037  -1.204    0.232    
## mean_freq_t1             -0.005324   0.040999  -0.130    0.897    
## age_diff                  0.002071   0.001827   1.134    0.260    
## age_t1                   -0.003750   0.002403  -1.561    0.122    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2812 on 84 degrees of freedom
## Multiple R-squared:  0.6548, Adjusted R-squared:  0.6343 
## F-statistic: 31.87 on 5 and 84 DF,  p-value: < 2.2e-16

Predicting t-value of a word with other measures

# Word-level predictor tables, each joined onto the per-word t-values below.
freq <- read_csv(
  "/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/childes_adult_word_freq.csv"
)

density_norms <- read_csv(
  "/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/bills_density_norms.csv"
)

embedding_dist <- read_csv(
  "data/childes_embedding_dist_by_word.csv"
)
embedding_dist_wiki <- read_csv(
  "data/wiki_embedding_dist_by_word.csv"
)

# Concreteness table reduced to the word (renamed from `Word`) and its
# Conc.M column.
concreteness <- read_csv(
  "/Users/mollylewis/Documents/research/Projects/2_published/ref_complex/corpus/brysbaert_database/brysbaert_corpus.csv"
) %>%
  select(word = Word, Conc.M)

# Concept norms reduced to a lower-cased word and Mean_Distinct_No_Tax.
# Concept labels are lower-cased and cut at the first "_", keeping only the
# leading piece as the word.
# NOTE(review): Concept values that differ only after the first "_" collapse
# to the same `word`; if that occurs, the later left_join of this table would
# duplicate rows for that word - confirm against the data.
concepts <- read_tsv(
  "/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/CONCS_brm.txt"
) %>%
  select(Concept, Familiarity, Length_Syllables, Bigram, 14:33) %>%
  mutate(Concept = map_chr(str_split(tolower(Concept), "_"), 1)) %>%
  select(word = Concept, Mean_Distinct_No_Tax)

# Per-word t-values with every word-level predictor attached. The tables are
# left-joined one after another, in this order, using natural joins on
# whatever columns they share with the accumulated result.
word_coeffs_min5_t2_with_vars <- purrr::reduce(
  list(
    mutate(word_coeffs_min5_t2, word = tolower(word)),
    density_norms,
    freq,
    embedding_dist,
    concepts,
    concreteness,
    embedding_dist_wiki
  ),
  left_join
)

# Word t-value as a function of centrality, controlling for log frequency.
summary(
  lm(t ~ centrality + log_freq, data = word_coeffs_min5_t2_with_vars)
)

## 
## Call:
## lm(formula = t ~ centrality + log_freq, data = word_coeffs_min5_t2_with_vars)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.54563 -0.44258 -0.00751  0.48376  1.74976 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.2397     0.3050   7.344 9.69e-13 ***
## centrality   -6.9504     2.2071  -3.149  0.00175 ** 
## log_freq     -0.0582     0.0253  -2.300  0.02188 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6868 on 453 degrees of freedom
##   (25 observations deleted due to missingness)
## Multiple R-squared:  0.06239,    Adjusted R-squared:  0.05825 
## F-statistic: 15.07 on 2 and 453 DF,  p-value: 4.601e-07

# Word t-value as a function of density, controlling for log frequency.
summary(
  lm(t ~ density + log_freq, data = word_coeffs_min5_t2_with_vars)
)

## 
## Call:
## lm(formula = t ~ density + log_freq, data = word_coeffs_min5_t2_with_vars)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.54810 -0.44759  0.00077  0.50423  1.79157 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.476e+00  1.881e-01   7.847  3.1e-14 ***
## density     -9.545e-05  4.854e-05  -1.966  0.04985 *  
## log_freq    -8.698e-02  2.275e-02  -3.824  0.00015 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6913 on 453 degrees of freedom
##   (25 observations deleted due to missingness)
## Multiple R-squared:  0.04997,    Adjusted R-squared:  0.04578 
## F-statistic: 11.91 on 2 and 453 DF,  p-value: 9.057e-06

# Word t-value as a function of mean_dist, controlling for log frequency.
summary(
  lm(t ~ mean_dist + log_freq, data = word_coeffs_min5_t2_with_vars)
)

## 
## Call:
## lm(formula = t ~ mean_dist + log_freq, data = word_coeffs_min5_t2_with_vars)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.39917 -0.43970 -0.00758  0.49611  1.70313 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.73742    0.18541   9.371  < 2e-16 ***
## mean_dist    0.97602    0.20361   4.794 2.20e-06 ***
## log_freq    -0.15994    0.02386  -6.702 5.87e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6847 on 473 degrees of freedom
##   (5 observations deleted due to missingness)
## Multiple R-squared:  0.09298,    Adjusted R-squared:  0.08914 
## F-statistic: 24.24 on 2 and 473 DF,  p-value: 9.475e-11

# Standardized model: mean_dist, log frequency and their interaction.
summary(
  lm(
    scale(t) ~ scale(mean_dist) * scale(log_freq),
    data = word_coeffs_min5_t2_with_vars
  )
)

## 
## Call:
## lm(formula = scale(t) ~ scale(mean_dist) * scale(log_freq), data = word_coeffs_min5_t2_with_vars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.3284 -0.5935  0.0191  0.6864  2.3896 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       0.04529    0.05114   0.886  0.37633    
## scale(mean_dist)                  0.19255    0.05723   3.364  0.00083 ***
## scale(log_freq)                  -0.31306    0.05124  -6.109 2.09e-09 ***
## scale(mean_dist):scale(log_freq) -0.09115    0.05698  -1.600  0.11037    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9551 on 472 degrees of freedom
##   (5 observations deleted due to missingness)
## Multiple R-squared:  0.09787,    Adjusted R-squared:  0.09213 
## F-statistic: 17.07 on 3 and 472 DF,  p-value: 1.537e-10

mean_dist_wiki x freq interaction:

# Linear fit of t on mean_dist, drawn separately for the two halves of
# log_freq (ntile split into 2 bins). Words without a log_freq value get an
# NA bin and are removed before plotting.
word_coeffs_min5_t2_with_vars %>%
  mutate(freq_bin = as.factor(ntile(log_freq, 2))) %>%
  filter(!is.na(freq_bin)) %>%
  # Optional restriction, currently disabled:
  # filter(mean_dist > .3) %>%
  ggplot(aes(x = mean_dist, y = t, group = freq_bin, color = freq_bin)) +
  # Raw points, currently disabled:
  # geom_point() +
  geom_smooth(method = "lm")

Adding in frequency:

# Word t-value as a function of concreteness, controlling for log frequency.
summary(
  lm(t ~ Conc.M + log_freq, data = word_coeffs_min5_t2_with_vars)
)

## 
## Call:
## lm(formula = t ~ Conc.M + log_freq, data = word_coeffs_min5_t2_with_vars)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.49239 -0.44428  0.01359  0.46540  1.64502 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.41745    0.36765   1.135 0.256844    
## Conc.M       0.13644    0.03877   3.519 0.000482 ***
## log_freq    -0.03191    0.02994  -1.066 0.287263    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6846 on 410 degrees of freedom
##   (68 observations deleted due to missingness)
## Multiple R-squared:  0.07065,    Adjusted R-squared:  0.06612 
## F-statistic: 15.59 on 2 and 410 DF,  p-value: 2.995e-07

# All word-level predictors entered together.
summary(
  lm(
    t ~ mean_dist + Conc.M + log_freq + centrality + density,
    data = word_coeffs_min5_t2_with_vars
  )
)

## 
## Call:
## lm(formula = t ~ mean_dist + Conc.M + log_freq + centrality + 
##     density, data = word_coeffs_min5_t2_with_vars)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.49301 -0.43717  0.01895  0.44462  1.52685 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.195e+00  6.260e-01   3.506 0.000506 ***
## mean_dist    9.781e-01  2.213e-01   4.421 1.26e-05 ***
## Conc.M       1.033e-01  3.878e-02   2.664 0.008032 ** 
## log_freq    -4.745e-02  3.400e-02  -1.396 0.163573    
## centrality  -1.214e+01  4.109e+00  -2.954 0.003324 ** 
## density      1.439e-04  8.601e-05   1.673 0.094997 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6633 on 407 degrees of freedom
##   (68 observations deleted due to missingness)
## Multiple R-squared:  0.1338, Adjusted R-squared:  0.1232 
## F-statistic: 12.58 on 5 and 407 DF,  p-value: 2.274e-11

# Concreteness plus the mean_dist by log-frequency interaction.
summary(
  lm(
    t ~ Conc.M + mean_dist * log_freq,
    data = word_coeffs_min5_t2_with_vars
  )
)

## 
## Call:
## lm(formula = t ~ Conc.M + mean_dist * log_freq, data = word_coeffs_min5_t2_with_vars)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.45635 -0.39665  0.00323  0.45785  1.57175 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)   
## (Intercept)        -0.15389    0.56920  -0.270  0.78701   
## Conc.M              0.12283    0.03829   3.208  0.00144 **
## mean_dist           3.70784    1.53816   2.411  0.01637 * 
## log_freq            0.02669    0.07096   0.376  0.70701   
## mean_dist:log_freq -0.36399    0.20226  -1.800  0.07266 . 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6683 on 408 degrees of freedom
##   (68 observations deleted due to missingness)
## Multiple R-squared:  0.1186, Adjusted R-squared:   0.11 
## F-statistic: 13.73 on 4 and 408 DF,  p-value: 1.64e-10

Concreteness and mean distance both predict t score - words that are similar to other words and highly concrete have large t-scores.

Regression analysis for each word - mtld- diff

Predicting mtld diff

Sanity Check

Predicting t-value of a word with other measures