Regression analysis for each word

library(knitr)

# Global knitr chunk options. The logical values are spelled out as
# TRUE/FALSE because the T/F shortcuts are ordinary variables that can be
# reassigned.
opts_chunk$set(
  echo = TRUE, message = FALSE, warning = FALSE,
  error = FALSE, cache = FALSE, tidy = FALSE
)

library(tidyverse)
library(feather)
library(langcog)
library(modelr)
library(broom)
library(corrplot)


# Default ggplot2 theme for the plots in this report: classic theme with a
# base font size of 10.
theme_set(
  theme_classic(base_size = 10)
)

Predicting mtld diff

MODEL: lm(mtld_diff ~ know_word_at_t1 + age_t1 + age_diff + log(n_transcripts_t1) + log(n_transcripts_t2), complete_df)

# Load the per-word coefficient table and lower-case the `word` column so it
# can be matched against lower-cased word lists.
word_coeffs_min5_t2 <- mutate(
  read_csv("data/word_coeffs_log_mtld_diff.csv"),
  word = tolower(word)
)

# Histogram of the per-word t values with red reference lines at t = -2 and
# t = 2.
# The reference lines are given as fixed xintercept values rather than through
# aes(), so each line is drawn once instead of once per row of the data.
ggplot(word_coeffs_min5_t2, aes(x = t)) +
  geom_histogram() +
  ggtitle("t-distribution ") +
  geom_vline(xintercept = c(-2, 2), color = "red") +
  theme_classic()

 # Interactive table of the word coefficients, sorted by descending t.
 DT::datatable(arrange(word_coeffs_min5_t2, desc(t)))

Sanity Check

Do kids who have a high mean t also have a high mtld diff? Yes.

# Word-type counts per child and time bin.
all_types <- read_csv(
  file = "/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/1_mtld_measure/data/target_types_for_MTLD_kids_600_900.csv"
)

# Minimum summed t1 count for a (child, word) pair to be kept.
MINWORDSFORVOCAB <- 5

# For each child, total the t1 counts of every lower-cased gloss and keep the
# glosses that reach the threshold.
word_counts <- all_types %>%
  filter(tbin == "t1") %>%
  group_by(target_child_id, gloss_clean = tolower(gloss)) %>%
  summarize(count = sum(count)) %>%
  filter(count >= MINWORDSFORVOCAB)

# Attach each word's t value to the children's t1 words, then summarise per
# child. Words with no matching coefficient get NA from the left join and are
# excluded from the sum and mean via na.rm = TRUE (spelled out rather than the
# reassignable T shortcut).
t1_word_counts_with_ts <- word_counts %>%
  left_join(select(word_coeffs_min5_t2, word, t),
            by = c("gloss_clean" = "word")) %>%
  select(-gloss_clean, -count) %>%
  group_by(target_child_id) %>%
  summarize(sum_t = sum(t, na.rm = TRUE),
            mean_t = mean(t, na.rm = TRUE))

# Per-child outcome and age variables.
#   mtld_diff: log(mtld_t2) - log(mtld_t1)
#   age_diff:  age_t2 - age_t1
mtld_age <- read_csv(
  file = "/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/1_mtld_measure/data/groups_info_600_900_corrected.csv"
) %>%
  mutate(
    age_diff = age_t2 - age_t1,
    mtld_diff = log(mtld_t2) - log(mtld_t1)
  ) %>%
  select(
    target_child_id, mtld_diff,
    age_t1, age_t2, age_diff, corpus_name
  )

# Add the MTLD/age variables to the per-child t summaries and drop the id.
# The join key is stated explicitly: target_child_id is the only column the
# two tables share, so this is the same key an implicit join would use, but it
# no longer depends on which columns happen to overlap.
t1_word_counts_with_ts_mtld <- t1_word_counts_with_ts %>%
  left_join(mtld_age, by = "target_child_id") %>%
  select(-target_child_id)

# Scatter plot of mtld_diff against each child's mean t, with a linear fit.
ggplot(t1_word_counts_with_ts_mtld, aes(x = mean_t, y = mtld_diff)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

Does it hold controlling for stuff? Yes.

# Transcript lengths per child, kept on the log scale only.
# Both raw length columns are dropped: each one needs its own minus sign,
# otherwise select() treats the unsigned name as a column to keep.
transcript_length <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/transcript_length_by_kid.csv") %>%
  mutate(log_transcript_length_t1 = log(transcript_length_t1),
         log_transcript_length_t2 = log(transcript_length_t2)) %>%
  select(-transcript_length_t1, -transcript_length_t2)

# Frequency information per child.
freq_info <- read_csv(
  file = "/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/frequency_based_on_input_by_kid.csv"
)

# Rebuild the child-level table by left-joining, in order, the MTLD/age
# variables, transcript lengths and frequency information onto the per-child
# t summaries. Each join uses the columns the two tables have in common.
t1_word_counts_with_ts_mtld <- reduce(
  list(mtld_age, transcript_length, freq_info),
  left_join,
  .init = t1_word_counts_with_ts
)


# Child-level model: mtld_diff predicted by mean_t, with log transcript length
# at t1, mean frequency at t1, age difference and age at t1 as covariates.
summary(
  lm(
    mtld_diff ~ mean_t + log_transcript_length_t1 + mean_freq_t1 +
      age_diff + age_t1,
    data = t1_word_counts_with_ts_mtld
  )
)

## 
## Call:
## lm(formula = mtld_diff ~ mean_t + log_transcript_length_t1 + 
##     mean_freq_t1 + age_diff + age_t1, data = t1_word_counts_with_ts_mtld)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.67042 -0.11145  0.01607  0.13055  0.70517 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               2.262868   1.728390   1.309    0.194    
## mean_t                    0.630668   0.073593   8.570 4.36e-13 ***
## log_transcript_length_t1 -0.016340   0.018246  -0.896    0.373    
## mean_freq_t1             -0.020235   0.036793  -0.550    0.584    
## age_diff                  0.002421   0.001676   1.444    0.152    
## age_t1                   -0.003248   0.002208  -1.471    0.145    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2581 on 84 degrees of freedom
## Multiple R-squared:  0.7092, Adjusted R-squared:  0.6919 
## F-statistic: 40.98 on 5 and 84 DF,  p-value: < 2.2e-16

Predicting t-value of a word with other measures

# Word-level predictor tables, read here and joined onto the word
# coefficients below.
freq <- read_csv(
  file = "/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/childes_adult_word_freq.csv"
)

density_norms <- read_csv(
  file = "/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/bills_density_norms.csv"
)

embedding_dist <- read_csv(file = "data/childes_embedding_dist_by_word.csv")
embedding_dist_wiki <- read_csv(file = "data/wiki_embedding_dist_by_word.csv")

# Concreteness table: keep only the word (renamed from `Word`) and the
# `Conc.M` column.
concreteness <- read_csv(
  file = "/Users/mollylewis/Documents/research/Projects/2_published/ref_complex/corpus/brysbaert_database/brysbaert_corpus.csv"
) %>%
  select(word = Word, Conc.M)

# Concept table: keep the Mean_Distinct_No_Tax column keyed by word.
# Columns are selected by name so the result does not depend on column
# positions in the file.
# Concept labels are lower-cased and truncated at the first "_" (only the
# first piece of the split is kept).
# NOTE(review): truncation can map several Concept labels onto the same word,
# which would duplicate rows in later joins on `word` -- confirm against data.
concepts <- read_tsv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/CONCS_brm.txt") %>%
  select(Concept, Mean_Distinct_No_Tax) %>%
  mutate(Concept = map_chr(str_split(tolower(Concept), "_"), 1)) %>%
  select(word = Concept, Mean_Distinct_No_Tax)

# Word coefficients with all word-level predictors attached. Starting from the
# coefficient table (word lower-cased), each predictor table is left-joined in
# turn on whatever columns it shares with the accumulated result.
word_coeffs_min5_t2_with_vars <- reduce(
  list(
    density_norms,
    freq,
    embedding_dist,
    concepts,
    concreteness,
    embedding_dist_wiki
  ),
  left_join,
  .init = mutate(word_coeffs_min5_t2, word = tolower(word))
)

# Word-level model: t predicted by centrality and log frequency.
summary(
  lm(t ~ centrality + log_freq, data = word_coeffs_min5_t2_with_vars)
)

## 
## Call:
## lm(formula = t ~ centrality + log_freq, data = word_coeffs_min5_t2_with_vars)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.46325 -0.49197 -0.00968  0.41383  1.96802 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.711764   0.165616   4.298 1.82e-05 ***
## centrality  -3.806389   1.082654  -3.516  0.00045 ***
## log_freq     0.038240   0.008541   4.477 8.06e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.685 on 1746 degrees of freedom
##   (468 observations deleted due to missingness)
## Multiple R-squared:  0.0149, Adjusted R-squared:  0.01377 
## F-statistic:  13.2 on 2 and 1746 DF,  p-value: 2.033e-06

# Word-level model: t predicted by density and log frequency.
summary(
  lm(t ~ density + log_freq, data = word_coeffs_min5_t2_with_vars)
)

## 
## Call:
## lm(formula = t ~ density + log_freq, data = word_coeffs_min5_t2_with_vars)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.35097 -0.48872 -0.01159  0.42221  1.99847 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.247e-01  6.054e-02   3.711 0.000213 ***
## density     -6.371e-05  2.274e-05  -2.801 0.005145 ** 
## log_freq     3.175e-02  8.310e-03   3.821 0.000137 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6859 on 1746 degrees of freedom
##   (468 observations deleted due to missingness)
## Multiple R-squared:  0.01237,    Adjusted R-squared:  0.01123 
## F-statistic: 10.93 on 2 and 1746 DF,  p-value: 1.917e-05

# Word-level model: t predicted by mean_dist and log frequency.
summary(
  lm(t ~ mean_dist + log_freq, data = word_coeffs_min5_t2_with_vars)
)

## 
## Call:
## lm(formula = t ~ mean_dist + log_freq, data = word_coeffs_min5_t2_with_vars)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.32109 -0.50519 -0.00879  0.41700  1.95176 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.337719   0.060511   5.581 2.75e-08 ***
## mean_dist    0.596690   0.111659   5.344 1.03e-07 ***
## log_freq    -0.008804   0.010269  -0.857    0.391    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6896 on 1807 degrees of freedom
##   (407 observations deleted due to missingness)
## Multiple R-squared:  0.02066,    Adjusted R-squared:  0.01958 
## F-statistic: 19.06 on 2 and 1807 DF,  p-value: 6.414e-09

# Same predictors with every variable passed through scale(), plus the
# mean_dist x log_freq interaction.
summary(
  lm(
    scale(t) ~ scale(mean_dist) * scale(log_freq),
    data = word_coeffs_min5_t2_with_vars
  )
)

## 
## Call:
## lm(formula = scale(t) ~ scale(mean_dist) * scale(log_freq), data = word_coeffs_min5_t2_with_vars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.3289 -0.7120 -0.0151  0.6120  2.8415 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       0.05273    0.03030   1.740   0.0819 .  
## scale(mean_dist)                  0.18765    0.03317   5.657 1.79e-08 ***
## scale(log_freq)                  -0.02968    0.03324  -0.893   0.3721    
## scale(mean_dist):scale(log_freq) -0.06538    0.03338  -1.958   0.0503 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9963 on 1806 degrees of freedom
##   (407 observations deleted due to missingness)
## Multiple R-squared:  0.02274,    Adjusted R-squared:  0.02111 
## F-statistic: 14.01 on 3 and 1806 DF,  p-value: 4.996e-09

mean_dist_wiki x freq interaction:

# Linear fits of t on mean_dist, drawn separately for the two ntile() bins of
# log_freq; rows without a bin (missing log_freq) are dropped first.
word_coeffs_min5_t2_with_vars %>%
  mutate(freq_bin = as.factor(ntile(log_freq, 2))) %>%
  filter(!is.na(freq_bin)) %>%
  # Disabled option: filter(mean_dist > .3) %>%
  ggplot(aes(mean_dist, t, group = freq_bin, color = freq_bin)) +
  # Disabled option: geom_point() +
  geom_smooth(method = "lm")

Adding in frequency:

# Word-level model: t predicted by concreteness (Conc.M) and log frequency.
summary(
  lm(t ~ Conc.M + log_freq, data = word_coeffs_min5_t2_with_vars)
)

## 
## Call:
## lm(formula = t ~ Conc.M + log_freq, data = word_coeffs_min5_t2_with_vars)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.29559 -0.47488  0.00457  0.42154  1.80524 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.82518    0.12641  -6.528 9.46e-11 ***
## Conc.M       0.16031    0.01882   8.517  < 2e-16 ***
## log_freq     0.08531    0.01080   7.897 5.92e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6513 on 1331 degrees of freedom
##   (883 observations deleted due to missingness)
## Multiple R-squared:  0.06665,    Adjusted R-squared:  0.06525 
## F-statistic: 47.52 on 2 and 1331 DF,  p-value: < 2.2e-16

# Word-level model: t predicted by mean_dist, concreteness (Conc.M) and log
# frequency together.
summary(
  lm(t ~ mean_dist + Conc.M + log_freq, data = word_coeffs_min5_t2_with_vars)
)

## 
## Call:
## lm(formula = t ~ mean_dist + Conc.M + log_freq, data = word_coeffs_min5_t2_with_vars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.2724 -0.4600 -0.0003  0.4233  1.8071 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.76295    0.13141  -5.806 8.01e-09 ***
## mean_dist    0.66310    0.11969   5.540 3.65e-08 ***
## Conc.M       0.17530    0.01885   9.298  < 2e-16 ***
## log_freq     0.05123    0.01330   3.850 0.000124 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6419 on 1319 degrees of freedom
##   (894 observations deleted due to missingness)
## Multiple R-squared:  0.09323,    Adjusted R-squared:  0.09117 
## F-statistic:  45.2 on 3 and 1319 DF,  p-value: < 2.2e-16

# Word-level model: concreteness (Conc.M) plus mean_dist, log frequency and
# their interaction.
summary(
  lm(t ~ Conc.M + mean_dist * log_freq, data = word_coeffs_min5_t2_with_vars)
)

## 
## Call:
## lm(formula = t ~ Conc.M + mean_dist * log_freq, data = word_coeffs_min5_t2_with_vars)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.24721 -0.46230 -0.00811  0.41635  1.74155 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -0.86333    0.14083  -6.130 1.16e-09 ***
## Conc.M              0.16587    0.01943   8.535  < 2e-16 ***
## mean_dist           1.59661    0.48936   3.263  0.00113 ** 
## log_freq            0.07503    0.01797   4.175 3.18e-05 ***
## mean_dist:log_freq -0.13424    0.06824  -1.967  0.04936 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6412 on 1318 degrees of freedom
##   (894 observations deleted due to missingness)
## Multiple R-squared:  0.09588,    Adjusted R-squared:  0.09314 
## F-statistic: 34.94 on 4 and 1318 DF,  p-value: < 2.2e-16

Concreteness and mean distance both predict t score - words that are similar to other words and highly concrete have large t-scores.

Regression analysis for each word - mtld- diff

Predicting mtld diff

Sanity Check

Predicting t-value of a word with other measures