Regression analysis for each word (900 - 1200)

library(knitr)

# Global knitr chunk options: echo the code, hide messages and warnings,
# halt on errors instead of printing them, and turn off caching and tidying.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable names.
opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE,
               error = FALSE, cache = FALSE, tidy = FALSE)

library(tidyverse)
library(feather)
library(langcog)
library(modelr)
library(broom)
library(corrplot)


# Make the classic ggplot2 theme (10-pt base font) the default for subsequent
# plots; a plot that adds its own complete theme replaces this default.
theme_set(theme_classic(base_size = 10))

Predicting mtld at t2

MODEL: lm(log_mtld_t2 ~ know_word_at_t1 + log_mtld_t1 + age_t1 + age_diff + log(n_transcripts_t1) + log(n_transcripts_t2), complete_df)

# Per-word coefficient table (presumably one row per word from the MODEL
# described above -- confirm against the script that writes this CSV).
# Words are lower-cased so they match the lower-cased glosses joined later.
word_coeffs_min5_t2 <-
  read_csv("data/word_coeffs_log_mtld_t2_900_1200.csv") %>%
  mutate(word = tolower(word))

# Histogram of the per-word t-values with reference lines at -2 and +2.
# The intercepts are constants, so they are passed directly rather than via
# aes() (which would draw one overlapping line per data row). No per-plot
# theme is added, so the global theme_set() default (base_size = 10) applies.
ggplot(word_coeffs_min5_t2, aes(x = t)) +
  geom_histogram() +
  ggtitle("t-distribution ") +
  geom_vline(xintercept = c(-2, 2), color = "red")

# Interactive table of all words, largest t-value first.
word_coeffs_min5_t2 %>%
  arrange(desc(t)) %>%
  DT::datatable()

Sanity Check

Do kids who have a high mean t have high mtld at t2? Yes.

# NOTE(review): the input paths in this chunk are absolute and
# machine-specific; the chunk only runs where these files exist.
all_types <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/1_mtld_measure/data/target_types_for_MTLD_kids_900_1200.csv")

# Minimum summed count of a word at t1 for it to be kept for a child
# (presumably the threshold for counting the word as in the child's vocabulary).
MINWORDSFORVOCAB <- 5

# Total t1 count of each lower-cased word per child, keeping only words that
# reach the threshold. ungroup() removes the grouping that summarize() would
# otherwise leave on target_child_id.
word_counts <- all_types %>%
  filter(tbin == "t1") %>%
  mutate(gloss_clean = tolower(gloss)) %>%
  group_by(target_child_id, gloss_clean) %>%
  summarize(count = sum(count)) %>%
  ungroup() %>%
  filter(count >= MINWORDSFORVOCAB)

# Sum and mean of the word-level t-values over each child's retained t1 words.
# Words with no coefficient get NA from the left join and are ignored by
# na.rm = TRUE.
t1_word_counts_with_ts <- word_counts %>%
  left_join(word_coeffs_min5_t2 %>% select(word, t),
            by = c("gloss_clean" = "word")) %>%
  group_by(target_child_id) %>%
  summarize(sum_t = sum(t, na.rm = TRUE),
            mean_t = mean(t, na.rm = TRUE))

# Per-child MTLD (log-transformed) and age measures at both time points.
mtld_age <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/1_mtld_measure/data/groups_info_900_1200.csv") %>%
  mutate(log_mtld_t1 = log(mtld_t1),
         log_mtld_t2 = log(mtld_t2),
         age_diff = age_t2 - age_t1) %>%
  select(target_child_id, log_mtld_t1, log_mtld_t2,
         age_t1, age_t2, age_diff, corpus_name)

# Regress the t1-to-t2 change in log MTLD on the t1 level. The data argument
# is named in full rather than relying on partial matching of `d`.
lm(I(log_mtld_t2 - log_mtld_t1) ~ log_mtld_t1, data = mtld_age) %>%
  summary()

## 
## Call:
## lm(formula = I(log_mtld_t2 - log_mtld_t1) ~ log_mtld_t1, data = mtld_age)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.57373 -0.16164 -0.01849  0.11300  0.97127 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.32370    0.21936   6.034 5.83e-08 ***
## log_mtld_t1 -0.39670    0.07634  -5.196 1.73e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2786 on 74 degrees of freedom
## Multiple R-squared:  0.2673, Adjusted R-squared:  0.2574 
## F-statistic:    27 on 1 and 74 DF,  p-value: 1.735e-06

# Attach each child's MTLD and age measures to their t-value summaries.
# The two tables share only target_child_id, so the key is stated explicitly
# instead of being inferred from the common column names.
t1_word_counts_with_ts_mtld <- t1_word_counts_with_ts %>%
  left_join(mtld_age, by = "target_child_id") %>%
  select(-target_child_id)

# Mean t-value of a child's t1 words against log MTLD at t2, with a linear
# fit. No per-plot theme is added, so the global theme_set() default
# (base_size = 10) is used.
t1_word_counts_with_ts_mtld %>%
  ggplot(aes(x = mean_t, y = log_mtld_t2)) +
  geom_point() +
  geom_smooth(method = "lm")

Does it hold controlling for stuff? Yes.

# Log transcript lengths at both time points. Rows without a t1 length are
# dropped, and the raw length columns are removed once logged.
transcript_length <- read_csv("data/transcript_lengths_900_1200.csv") %>%
  filter(!is.na(transcript_length_t1)) %>%
  mutate(log_transcript_length_t1 = log(transcript_length_t1),
         log_transcript_length_t2 = log(transcript_length_t2)) %>%
  select(-transcript_length_t1, -transcript_length_t2)

# Frequency covariates (presumably one row per child; the model below uses
# mean_freq_t1 and mean_freq_t2). Absolute, machine-specific path.
freq_info <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/frequency_based_on_input_by_kid.csv")

# Left-join the covariate tables onto the t-value summaries, in order.
# NOTE(review): these are natural joins on whatever columns the tables share;
# the schemas of the last two tables are not visible here -- confirm the keys.
t1_word_counts_with_ts_mtld <- Reduce(
  left_join,
  list(t1_word_counts_with_ts, mtld_age, transcript_length, freq_info)
)

# log MTLD at t2 on mean t-value, controlling for t1 MTLD, transcript length,
# frequency and age.
summary(
  lm(
    log_mtld_t2 ~ log_mtld_t1 + mean_t +
      log_transcript_length_t1 + log_transcript_length_t2 +
      mean_freq_t1 + mean_freq_t2 +
      age_diff + age_t1,
    data = t1_word_counts_with_ts_mtld
  )
)

## 
## Call:
## lm(formula = log_mtld_t2 ~ log_mtld_t1 + mean_t + log_transcript_length_t1 + 
##     log_transcript_length_t2 + mean_freq_t1 + mean_freq_t2 + 
##     age_diff + age_t1, data = t1_word_counts_with_ts_mtld)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.45415 -0.05452 -0.00522  0.05512  0.27382 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               4.5835420  1.8494874   2.478   0.0169 *  
## log_mtld_t1               0.7126842  0.0488464  14.590  < 2e-16 ***
## mean_t                    0.8691092  0.0773202  11.240 8.68e-15 ***
## log_transcript_length_t1 -0.0858475  0.0371749  -2.309   0.0255 *  
## log_transcript_length_t2  0.0650993  0.0343015   1.898   0.0640 .  
## mean_freq_t1             -0.0081557  0.0288969  -0.282   0.7790    
## mean_freq_t2              0.0540937  0.0343828   1.573   0.1225    
## age_diff                  0.0004607  0.0011180   0.412   0.6822    
## age_t1                   -0.0040387  0.0017191  -2.349   0.0232 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1325 on 46 degrees of freedom
##   (20 observations deleted due to missingness)
## Multiple R-squared:  0.9129, Adjusted R-squared:  0.8977 
## F-statistic: 60.23 on 8 and 46 DF,  p-value: < 2.2e-16

# Controlled model again, this time leaving out the t2 covariates
# (log_transcript_length_t2 and mean_freq_t2 are not in the formula).
summary(
  lm(
    log_mtld_t2 ~ log_mtld_t1 + mean_t + log_transcript_length_t1 +
      mean_freq_t1 + age_diff + age_t1,
    data = t1_word_counts_with_ts_mtld
  )
)

## 
## Call:
## lm(formula = log_mtld_t2 ~ log_mtld_t1 + mean_t + log_transcript_length_t1 + 
##     mean_freq_t1 + age_diff + age_t1, data = t1_word_counts_with_ts_mtld)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.37755 -0.05138 -0.00632  0.05580  0.35392 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               2.1978162  1.4566440   1.509  0.13777    
## log_mtld_t1               0.7252569  0.0501639  14.458  < 2e-16 ***
## mean_t                    0.8351910  0.0774632  10.782 1.56e-14 ***
## log_transcript_length_t1 -0.0187220  0.0125815  -1.488  0.14314    
## mean_freq_t1              0.0116275  0.0276513   0.421  0.67596    
## age_diff                  0.0021180  0.0007697   2.752  0.00829 ** 
## age_t1                   -0.0015466  0.0013211  -1.171  0.24738    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1377 on 49 degrees of freedom
##   (19 observations deleted due to missingness)
## Multiple R-squared:    0.9,  Adjusted R-squared:  0.8878 
## F-statistic: 73.52 on 6 and 49 DF,  p-value: < 2.2e-16

Predicting t-value of a word with other measures

# Word-level predictor tables. NOTE(review): several paths are absolute and
# machine-specific.
freq <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/childes_adult_word_freq.csv")

density_norms <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/bills_density_norms.csv")

embedding_dist <- read_csv("data/childes_embedding_dist_by_word.csv")
embedding_dist_wiki <- read_csv("data/wiki_embedding_dist_by_word.csv")

# Concreteness ratings: keep the word (renamed to match the join key) and
# the Conc.M column.
concreteness <- read_csv("/Users/mollylewis/Documents/research/Projects/2_published/ref_complex/corpus/brysbaert_database/brysbaert_corpus.csv") %>%
  select(word = Word, Conc.M)

# Concept norms: keep only the lower-cased concept name up to its first "_"
# and the Mean_Distinct_No_Tax column. Columns are selected by name; the
# earlier positional 14:33 selection was discarded immediately and would
# break if the file's column order changed.
# NOTE(review): truncating at "_" can map several concepts onto one word,
# which would duplicate rows in the left join below -- verify against the data.
concepts <- read_tsv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/CONCS_brm.txt") %>%
  transmute(word = map_chr(str_split(tolower(Concept), "_"), 1),
            Mean_Distinct_No_Tax)

# Attach every predictor table to the per-word coefficients.
# NOTE(review): these are natural joins on the shared column names
# (presumably just `word`); confirm no other column names collide.
word_coeffs_min5_t2_with_vars <- word_coeffs_min5_t2 %>%
  mutate(word = tolower(word)) %>%
  left_join(density_norms) %>%
  left_join(freq) %>%
  left_join(embedding_dist) %>%
  left_join(concepts) %>%
  left_join(concreteness) %>%
  left_join(embedding_dist_wiki)

# Word t-value on centrality, controlling for log frequency.
lm(t ~ centrality + log_freq, data = word_coeffs_min5_t2_with_vars) %>%
  summary()

## 
## Call:
## lm(formula = t ~ centrality + log_freq, data = word_coeffs_min5_t2_with_vars)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.74015 -0.41475 -0.00537  0.47439  3.15955 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)   
## (Intercept) -0.334350   0.111138  -3.008  0.00265 **
## centrality   1.778131   0.711449   2.499  0.01250 * 
## log_freq     0.017601   0.005744   3.064  0.00220 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5896 on 2941 degrees of freedom
##   (874 observations deleted due to missingness)
## Multiple R-squared:  0.006401,   Adjusted R-squared:  0.005725 
## F-statistic: 9.473 on 2 and 2941 DF,  p-value: 7.925e-05

# Word t-value on density, controlling for log frequency.
summary(
  lm(t ~ density + log_freq, data = word_coeffs_min5_t2_with_vars)
)

## 
## Call:
## lm(formula = t ~ density + log_freq, data = word_coeffs_min5_t2_with_vars)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.76203 -0.41553 -0.00503  0.47510  3.15874 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.101e-01  3.905e-02  -2.819 0.004854 ** 
## density      3.556e-05  1.462e-05   2.433 0.015037 *  
## log_freq     2.031e-02  5.654e-03   3.591 0.000334 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5897 on 2941 degrees of freedom
##   (874 observations deleted due to missingness)
## Multiple R-squared:  0.006291,   Adjusted R-squared:  0.005615 
## F-statistic: 9.309 on 2 and 2941 DF,  p-value: 9.331e-05

# Word t-value on mean embedding distance, controlling for log frequency.
summary(
  lm(t ~ mean_dist + log_freq, data = word_coeffs_min5_t2_with_vars)
)

## 
## Call:
## lm(formula = t ~ mean_dist + log_freq, data = word_coeffs_min5_t2_with_vars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.7544 -0.4427 -0.0094  0.4266  3.3028 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.07691    0.08274   0.930    0.353    
## mean_dist    0.63495    0.11153   5.693 1.51e-08 ***
## log_freq    -0.01770    0.01272  -1.391    0.164    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6549 on 1466 degrees of freedom
##   (2349 observations deleted due to missingness)
## Multiple R-squared:  0.02533,    Adjusted R-squared:  0.024 
## F-statistic: 19.05 on 2 and 1466 DF,  p-value: 6.823e-09

# Same predictors with a mean_dist x log_freq interaction; every variable is
# passed through scale() inside the formula.
summary(
  lm(
    scale(t) ~ scale(mean_dist) * scale(log_freq),
    data = word_coeffs_min5_t2_with_vars
  )
)

## 
## Call:
## lm(formula = scale(t) ~ scale(mean_dist) * scale(log_freq), data = word_coeffs_min5_t2_with_vars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.5469 -0.7553 -0.0291  0.7156  5.5098 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       0.08401    0.04533   1.854   0.0640 .  
## scale(mean_dist)                  0.26603    0.04692   5.669 1.73e-08 ***
## scale(log_freq)                  -0.04383    0.04630  -0.947   0.3439    
## scale(mean_dist):scale(log_freq) -0.09512    0.04573  -2.080   0.0377 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.103 on 1465 degrees of freedom
##   (2349 observations deleted due to missingness)
## Multiple R-squared:  0.0282, Adjusted R-squared:  0.02621 
## F-statistic: 14.17 on 3 and 1465 DF,  p-value: 4.181e-09

# Word t-value on the Wikipedia-based mean distance, controlling for log
# frequency.
summary(
  lm(t ~ mean_dist_wiki + log_freq, data = word_coeffs_min5_t2_with_vars)
)

## 
## Call:
## lm(formula = t ~ mean_dist_wiki + log_freq, data = word_coeffs_min5_t2_with_vars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.7089 -0.4466 -0.0233  0.4599  3.1649 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)  
## (Intercept)    -0.24671    0.12675  -1.946   0.0518 .
## mean_dist_wiki  0.97687    0.58143   1.680   0.0931 .
## log_freq        0.01529    0.01037   1.474   0.1408  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6538 on 1502 degrees of freedom
##   (2313 observations deleted due to missingness)
## Multiple R-squared:  0.003992,   Adjusted R-squared:  0.002665 
## F-statistic:  3.01 on 2 and 1502 DF,  p-value: 0.0496

# Wikipedia-based distance with a log_freq interaction; every variable is
# passed through scale() inside the formula.
summary(
  lm(
    scale(t) ~ scale(mean_dist_wiki) * scale(log_freq),
    data = word_coeffs_min5_t2_with_vars
  )
)

## 
## Call:
## lm(formula = scale(t) ~ scale(mean_dist_wiki) * scale(log_freq), 
##     data = word_coeffs_min5_t2_with_vars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.5705 -0.7533 -0.0398  0.7756  5.3371 
## 
## Coefficients:
##                                        Estimate Std. Error t value
## (Intercept)                           -0.036964   0.037605  -0.983
## scale(mean_dist_wiki)                  0.049271   0.036922   1.334
## scale(log_freq)                        0.054686   0.037213   1.470
## scale(mean_dist_wiki):scale(log_freq) -0.001219   0.035613  -0.034
##                                       Pr(>|t|)
## (Intercept)                              0.326
## scale(mean_dist_wiki)                    0.182
## scale(log_freq)                          0.142
## scale(mean_dist_wiki):scale(log_freq)    0.973
## 
## Residual standard error: 1.103 on 1501 degrees of freedom
##   (2313 observations deleted due to missingness)
## Multiple R-squared:  0.003992,   Adjusted R-squared:  0.002002 
## F-statistic: 2.006 on 3 and 1501 DF,  p-value: 0.1113

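For Wikipedia distances, neither the main effect of distance nor the frequency-by-distance interaction is reliable in the models above (interaction p = 0.97). The interaction only holds for the non-Wikipedia distances (mean_dist; p = 0.038), which also show a main effect of distance.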

mean_dist_wiki x freq interaction:

# Linear fit of t on Wikipedia-based distance, drawn separately for the lower
# and upper halves of log frequency (ntile() with two buckets). Words with no
# frequency get an NA bin and are dropped. Only the fitted lines are drawn;
# the raw points are not plotted.
word_coeffs_min5_t2_with_vars %>%
  mutate(freq_bin = as.factor(ntile(log_freq, 2))) %>%
  drop_na(freq_bin) %>%
  ggplot(aes(x = mean_dist_wiki,
             y = t,
             group = freq_bin,
             color = freq_bin)) +
  geom_smooth(method = "lm")

Adding in frequency:

# Word t-value on concreteness (Conc.M), controlling for log frequency.
summary(
  lm(t ~ Conc.M + log_freq, data = word_coeffs_min5_t2_with_vars)
)

## 
## Call:
## lm(formula = t ~ Conc.M + log_freq, data = word_coeffs_min5_t2_with_vars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.7614 -0.4271 -0.0090  0.4775  3.1622 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)  
## (Intercept) -0.068885   0.081003  -0.850   0.3952  
## Conc.M       0.004330   0.013281   0.326   0.7444  
## log_freq     0.018153   0.007392   2.456   0.0141 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6006 on 2162 degrees of freedom
##   (1653 observations deleted due to missingness)
## Multiple R-squared:  0.002892,   Adjusted R-squared:  0.001969 
## F-statistic: 3.135 on 2 and 2162 DF,  p-value: 0.04369

# Mean distance and concreteness entered together, controlling for log
# frequency.
summary(
  lm(
    t ~ mean_dist + Conc.M + log_freq,
    data = word_coeffs_min5_t2_with_vars
  )
)

## 
## Call:
## lm(formula = t ~ mean_dist + Conc.M + log_freq, data = word_coeffs_min5_t2_with_vars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.7129 -0.4435 -0.0153  0.4149  3.2831 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.07517    0.16393  -0.459    0.647    
## mean_dist    0.67447    0.12604   5.351 1.05e-07 ***
## Conc.M       0.03251    0.02096   1.551    0.121    
## log_freq    -0.01526    0.01635  -0.933    0.351    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6546 on 1163 degrees of freedom
##   (2651 observations deleted due to missingness)
## Multiple R-squared:  0.02787,    Adjusted R-squared:  0.02536 
## F-statistic: 11.11 on 3 and 1163 DF,  p-value: 3.402e-07

# Concreteness plus the mean_dist x log_freq interaction (main effects of
# both interacting terms are included by `*`).
summary(
  lm(
    t ~ Conc.M + mean_dist * log_freq,
    data = word_coeffs_min5_t2_with_vars
  )
)

## 
## Call:
## lm(formula = t ~ Conc.M + mean_dist * log_freq, data = word_coeffs_min5_t2_with_vars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.6540 -0.4373 -0.0215  0.4182  3.2358 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -0.34329    0.19683  -1.744 0.081399 .  
## Conc.M              0.02354    0.02124   1.108 0.267930    
## mean_dist           2.10729    0.59837   3.522 0.000445 ***
## log_freq            0.03310    0.02561   1.292 0.196490    
## mean_dist:log_freq -0.20589    0.08406  -2.449 0.014462 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6532 on 1162 degrees of freedom
##   (2651 observations deleted due to missingness)
## Multiple R-squared:  0.03286,    Adjusted R-squared:  0.02953 
## F-statistic: 9.871 on 4 and 1162 DF,  p-value: 7.44e-08

Concreteness and mean distance both predict t-score - words that are similar to other words and highly concrete have large t-scores.

Regression analysis for each word (900 - 1200)

Predicting mtld at t2

Sanity Check

Predicting t-value of a word with other measures