Regression analysis for each word

library(knitr)

# Global knitr chunk options for the report: code is echoed, messages and
# warnings are hidden, and caching / tidying are switched off.
# Spelled-out TRUE/FALSE are used because T and F are ordinary variables
# that can be reassigned.
opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE,
               error = FALSE, cache = FALSE, tidy = FALSE)

library(tidyverse)
library(feather)
library(langcog)
library(modelr)
library(broom)
library(corrplot)


# Make the classic ggplot2 theme (base font size 10) the default for all plots.
theme_set(theme_classic(base_size = 10))

Predicting mtld diff

MODEL: lm(mtld_diff ~ know_word_at_t1 + age_t1 + age_diff + log(n_transcripts_t1) + log(n_transcripts_t2), complete_df)

# Load the per-word coefficient table and lower-case `word` so it can be
# matched against lower-cased glosses and word-level norms further down.
word_coeffs_min5_t2 <- mutate(
  read_csv("data/word_coeffs_log_mtld_diff_900_1200.csv"),
  word = tolower(word)
)

# Histogram of the per-word t-values, with red reference lines at t = -2 and
# t = 2.
# The reference lines are given as fixed `xintercept` values rather than inside
# aes(): a constant inside aes() is recycled over the inherited data, so the
# same line would be drawn once per row.
ggplot(word_coeffs_min5_t2, aes(x = t)) +
  geom_histogram() +
  ggtitle("t-distribution ") +
  geom_vline(xintercept = c(-2, 2), color = "red") +
  theme_classic()

 # Interactive table of the word coefficients, largest t-value first.
 DT::datatable(arrange(word_coeffs_min5_t2, -t))

Sanity Check

Do kids who have a high mean t also have a high mtld diff? Yes.

# Per-child word counts; the columns used below are target_child_id, tbin,
# gloss and count.
all_types <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/1_mtld_measure/data/target_types_for_MTLD_kids_900_1200.csv")

# Minimum summed count at t1 for a word to be kept for a child.
MINWORDSFORVOCAB <- 5

# For each child, total the t1 counts of every lower-cased gloss and keep the
# words that reach the MINWORDSFORVOCAB threshold.
word_counts <- all_types %>%
  filter(tbin == "t1") %>%
  mutate(gloss_clean = tolower(gloss)) %>%
  group_by(target_child_id, gloss_clean) %>%
  summarize(count = sum(count)) %>%
  # summarize() only peels off the last grouping variable; drop the remaining
  # target_child_id grouping so later steps start from an ungrouped table.
  ungroup() %>%
  filter(count >= MINWORDSFORVOCAB)

# Attach each known word's t-value to the child's t1 vocabulary, then collapse
# to one row per child with the sum and mean of those t-values.
# Words without a coefficient get NA from the left join and are ignored by the
# na.rm = TRUE aggregates (spelled out, since T can be reassigned).
t1_word_counts_with_ts <- word_counts %>%
  left_join(word_coeffs_min5_t2 %>% select(word, t),
            by = c("gloss_clean" = "word")) %>%
  select(-gloss_clean, -count) %>%
  group_by(target_child_id) %>%
  summarize(sum_t = sum(t, na.rm = TRUE),
            mean_t = mean(t, na.rm = TRUE))

# Child-level outcome table: change in log MTLD between t1 and t2, plus the
# ages at both time points, the age gap and the corpus name.
mtld_age <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/1_mtld_measure/data/groups_info_900_1200.csv") %>%
  mutate(
    age_diff = age_t2 - age_t1,
    mtld_diff = log(mtld_t2) - log(mtld_t1)
  ) %>%
  select(
    target_child_id,
    mtld_diff,
    age_t1,
    age_t2,
    age_diff,
    corpus_name
  )

# Add the MTLD/age measures to each child's t-value summary, then drop the id.
# The join key is stated explicitly: target_child_id is the only column the two
# tables have in common, so this matches the implicit natural join while making
# the key visible.
t1_word_counts_with_ts_mtld <- t1_word_counts_with_ts %>%
  left_join(mtld_age, by = "target_child_id") %>%
  select(-target_child_id)

# Scatter plot of each child's mean t-value against their MTLD change, with a
# fitted linear trend.
ggplot(t1_word_counts_with_ts_mtld, aes(x = mean_t, y = mtld_diff)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

Does it hold controlling for stuff? Yes.

# Transcript lengths at t1 and t2, kept only on the log scale: the raw length
# columns are removed once their logs have been computed.
transcript_length <- read_csv("data/transcript_lengths_900_1200.csv") %>%
  mutate(
    log_transcript_length_t1 = log(transcript_length_t1),
    log_transcript_length_t2 = log(transcript_length_t2),
    transcript_length_t1 = NULL,
    transcript_length_t2 = NULL
  )

# Per-child frequency information (supplies mean_freq_t1 for the model below).
freq_info <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/frequency_based_on_input_by_kid.csv")

# Rebuild the child-level table, this time keeping the id and adding the
# control variables one table at a time. Each join uses whatever columns the
# two tables share.
t1_word_counts_with_ts_mtld <- left_join(t1_word_counts_with_ts, mtld_age)
t1_word_counts_with_ts_mtld <- left_join(t1_word_counts_with_ts_mtld,
                                         transcript_length)
t1_word_counts_with_ts_mtld <- left_join(t1_word_counts_with_ts_mtld,
                                         freq_info)


# Does mean_t still predict the MTLD change once transcript length, mean
# frequency, the age gap and age at t1 are in the model?
summary(
  lm(
    mtld_diff ~ mean_t + log_transcript_length_t1 + mean_freq_t1 +
      age_diff + age_t1,
    data = t1_word_counts_with_ts_mtld
  )
)

## 
## Call:
## lm(formula = mtld_diff ~ mean_t + log_transcript_length_t1 + 
##     mean_freq_t1 + age_diff + age_t1, data = t1_word_counts_with_ts_mtld)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.36888 -0.09112  0.01384  0.12711  0.37634 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -0.7867705  1.8470995  -0.426  0.67197    
## mean_t                    0.5878772  0.0816525   7.200 2.92e-09 ***
## log_transcript_length_t1 -0.0612665  0.0187258  -3.272  0.00194 ** 
## mean_freq_t1              0.0204980  0.0358907   0.571  0.57047    
## age_diff                  0.0031656  0.0010069   3.144  0.00280 ** 
## age_t1                    0.0008346  0.0016958   0.492  0.62475    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1811 on 50 degrees of freedom
##   (19 observations deleted due to missingness)
## Multiple R-squared:  0.5211, Adjusted R-squared:  0.4732 
## F-statistic: 10.88 on 5 and 50 DF,  p-value: 4.06e-07

Predicting t-value of a word with other measures

# Word-level predictor tables that are merged onto the coefficients below.

# Adult word frequencies.
freq <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/childes_adult_word_freq.csv")

# Density norms.
density_norms <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/bills_density_norms.csv")

# Embedding distances by word, from the CHILDES and wiki files respectively.
embedding_dist <- read_csv("data/childes_embedding_dist_by_word.csv")
embedding_dist_wiki <- read_csv("data/wiki_embedding_dist_by_word.csv")

# Concreteness ratings: keep the word (renamed from `Word`) and Conc.M.
concreteness <- select(
  read_csv("/Users/mollylewis/Documents/research/Projects/2_published/ref_complex/corpus/brysbaert_database/brysbaert_corpus.csv"),
  word = Word,
  Conc.M
)

# Concept norms reduced to one distinctiveness measure per word.
# The concept label is lower-cased and cut at the first underscore; this is
# done with a single vectorised str_extract() call instead of splitting and
# plucking element by element.
# NOTE(review): cutting at the underscore may map several concepts onto the
# same word, in which case the later join by word would duplicate rows -
# confirm against the data.
concepts <- read_tsv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/CONCS_brm.txt") %>%
  select(Concept, Familiarity, Length_Syllables, Bigram, 14:33) %>%
  mutate(Concept = str_extract(tolower(Concept), "^[^_]*")) %>%
  rename(word = Concept) %>%
  select(word, Mean_Distinct_No_Tax)

# Merge every word-level predictor onto the coefficient table, one table at a
# time. Each join uses whatever columns the two tables share.
word_coeffs_min5_t2_with_vars <- mutate(word_coeffs_min5_t2,
                                        word = tolower(word))
word_coeffs_min5_t2_with_vars <- left_join(word_coeffs_min5_t2_with_vars,
                                           density_norms)
word_coeffs_min5_t2_with_vars <- left_join(word_coeffs_min5_t2_with_vars,
                                           freq)
word_coeffs_min5_t2_with_vars <- left_join(word_coeffs_min5_t2_with_vars,
                                           embedding_dist)
word_coeffs_min5_t2_with_vars <- left_join(word_coeffs_min5_t2_with_vars,
                                           concepts)
word_coeffs_min5_t2_with_vars <- left_join(word_coeffs_min5_t2_with_vars,
                                           concreteness)
word_coeffs_min5_t2_with_vars <- left_join(word_coeffs_min5_t2_with_vars,
                                           embedding_dist_wiki)

# t-value regressed on centrality, controlling for log frequency.
summary(
  lm(t ~ centrality + log_freq, data = word_coeffs_min5_t2_with_vars)
)

## 
## Call:
## lm(formula = t ~ centrality + log_freq, data = word_coeffs_min5_t2_with_vars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.3332 -0.4722  0.0530  0.5172  2.7504 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.121865   0.117518   1.037    0.300    
## centrality  -0.481560   0.752292  -0.640    0.522    
## log_freq     0.035115   0.006074   5.781  8.2e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6235 on 2941 degrees of freedom
##   (874 observations deleted due to missingness)
## Multiple R-squared:  0.01129,    Adjusted R-squared:  0.01061 
## F-statistic: 16.79 on 2 and 2941 DF,  p-value: 5.636e-08

# t-value regressed on density, controlling for log frequency.
summary(
  lm(t ~ density + log_freq, data = word_coeffs_min5_t2_with_vars)
)

## 
## Call:
## lm(formula = t ~ density + log_freq, data = word_coeffs_min5_t2_with_vars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.3233 -0.4730  0.0486  0.5162  2.7456 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 4.424e-02  4.130e-02   1.071    0.284    
## density     5.836e-06  1.546e-05   0.378    0.706    
## log_freq    3.445e-02  5.979e-03   5.763 9.15e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6235 on 2941 degrees of freedom
##   (874 observations deleted due to missingness)
## Multiple R-squared:  0.0112, Adjusted R-squared:  0.01052 
## F-statistic: 16.65 on 2 and 2941 DF,  p-value: 6.441e-08

# t-value regressed on mean embedding distance, controlling for log frequency.
summary(
  lm(t ~ mean_dist + log_freq, data = word_coeffs_min5_t2_with_vars)
)

## 
## Call:
## lm(formula = t ~ mean_dist + log_freq, data = word_coeffs_min5_t2_with_vars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.2431 -0.4692  0.0187  0.4493  2.9133 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.50299    0.08664   5.805 7.86e-09 ***
## mean_dist    0.94611    0.11679   8.101 1.14e-15 ***
## log_freq    -0.04901    0.01332  -3.679 0.000243 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6857 on 1466 degrees of freedom
##   (2349 observations deleted due to missingness)
## Multiple R-squared:  0.0439, Adjusted R-squared:  0.04259 
## F-statistic: 33.65 on 2 and 1466 DF,  p-value: 5.122e-15

# Standardised model with the mean_dist x log_freq interaction.
summary(
  lm(
    scale(t) ~ scale(mean_dist) * scale(log_freq),
    data = word_coeffs_min5_t2_with_vars
  )
)

## 
## Call:
## lm(formula = scale(t) ~ scale(mean_dist) * scale(log_freq), data = word_coeffs_min5_t2_with_vars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.8457 -0.7070 -0.0037  0.7027  4.4561 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       0.35259    0.04432   7.955 3.55e-15 ***
## scale(mean_dist)                  0.52026    0.04589  11.338  < 2e-16 ***
## scale(log_freq)                  -0.09576    0.04528  -2.115   0.0346 *  
## scale(mean_dist):scale(log_freq) -0.34773    0.04471  -7.777 1.39e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.079 on 1465 degrees of freedom
##   (2349 observations deleted due to missingness)
## Multiple R-squared:  0.08181,    Adjusted R-squared:  0.07992 
## F-statistic: 43.51 on 3 and 1465 DF,  p-value: < 2.2e-16

mean_dist_wiki x freq interaction:

# Linear fit of t on mean_dist, drawn separately for the two halves of a split
# on log frequency; words without a frequency value are left out.
word_coeffs_min5_t2_with_vars %>%
  mutate(freq_bin = as.factor(ntile(log_freq, 2))) %>%
  filter(!is.na(freq_bin)) %>%
  ggplot(aes(mean_dist, t, group = freq_bin, color = freq_bin)) +
  geom_smooth(method = "lm")

Adding in frequency:

# t-value regressed on concreteness, controlling for log frequency.
summary(
  lm(t ~ Conc.M + log_freq, data = word_coeffs_min5_t2_with_vars)
)

## 
## Call:
## lm(formula = t ~ Conc.M + log_freq, data = word_coeffs_min5_t2_with_vars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.2363 -0.4773  0.0329  0.4822  2.6643 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.23234    0.08558  -2.715  0.00668 ** 
## Conc.M       0.07127    0.01403   5.079 4.12e-07 ***
## log_freq     0.03835    0.00781   4.910 9.77e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6346 on 2162 degrees of freedom
##   (1653 observations deleted due to missingness)
## Multiple R-squared:  0.01717,    Adjusted R-squared:  0.01626 
## F-statistic: 18.89 on 2 and 2162 DF,  p-value: 7.368e-09

# Mean distance and concreteness entered together, controlling for log
# frequency.
summary(
  lm(t ~ mean_dist + Conc.M + log_freq, data = word_coeffs_min5_t2_with_vars)
)

## 
## Call:
## lm(formula = t ~ mean_dist + Conc.M + log_freq, data = word_coeffs_min5_t2_with_vars)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -3.13123 -0.47106  0.00945  0.42901  2.82603 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.04772    0.17179   0.278   0.7813    
## mean_dist    1.04724    0.13208   7.929 5.17e-15 ***
## Conc.M       0.10196    0.02197   4.641 3.85e-06 ***
## log_freq    -0.04085    0.01714  -2.384   0.0173 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.686 on 1163 degrees of freedom
##   (2651 observations deleted due to missingness)
## Multiple R-squared:  0.06452,    Adjusted R-squared:  0.0621 
## F-statistic: 26.74 on 3 and 1163 DF,  p-value: < 2.2e-16

# Concreteness plus the mean_dist x log_freq interaction.
summary(
  lm(t ~ Conc.M + mean_dist * log_freq, data = word_coeffs_min5_t2_with_vars)
)

## 
## Call:
## lm(formula = t ~ Conc.M + mean_dist * log_freq, data = word_coeffs_min5_t2_with_vars)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.96065 -0.45094 -0.02017  0.42259  2.68916 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -0.72874    0.20271  -3.595 0.000338 ***
## Conc.M              0.07597    0.02187   3.474 0.000532 ***
## mean_dist           5.19652    0.61625   8.433  < 2e-16 ***
## log_freq            0.09920    0.02638   3.760 0.000178 ***
## mean_dist:log_freq -0.59622    0.08657  -6.887 9.31e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6727 on 1162 degrees of freedom
##   (2651 observations deleted due to missingness)
## Multiple R-squared:  0.1012, Adjusted R-squared:  0.09811 
## F-statistic: 32.71 on 4 and 1162 DF,  p-value: < 2.2e-16

Concreteness and mean distance both predict t-score: words that are similar to other words and highly concrete have large t-scores.

Regression analysis for each word - mtld- diff (900)

Predicting mtld diff

Sanity Check

Predicting t-value of a word with other measures