Regression analysis for each word

Predicting mtld diff

Sanity Check

Predicting t-value of a word with other measures

# Setup: knitr chunk options, package loading, and the default ggplot2 theme.
library(knitr)

# Spell out TRUE/FALSE: T and F are ordinary variables that can be reassigned,
# whereas TRUE and FALSE are reserved words.
opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE,
               error = FALSE, cache = FALSE, tidy = FALSE)

library(tidyverse)
library(feather)
library(langcog)
library(modelr)
library(broom)
library(corrplot)


# Session-wide default theme; individual plots may still add their own theme.
theme_set(theme_classic(base_size = 10))

Predicting mtld diff

MODEL: lm(mtld_diff ~ know_word_at_t1 + age_t1 + age_diff + log(n_transcripts_t1) + log(n_transcripts_t2), complete_df)

# Per-word regression coefficients for the mtld-diff model. Words are
# lower-cased and only rows with n_know > 2 are kept.
word_coeffs_min5_t2 <- read_csv("data/word_coeffs_log_mtld_diff_24_30.csv") %>%
  mutate(word = tolower(word)) %>%
  filter(n_know > 2)

# Histogram of the per-word t-values with red reference lines at t = 2 and
# t = -2. The reference positions are passed as constants rather than through
# aes(): an aes() mapping is evaluated against the layer data, so a constant
# inside aes() draws one overlapping line per row of the data.
ggplot(word_coeffs_min5_t2, aes(t)) +
  geom_histogram() +
  ggtitle("t-distribution ") +
  geom_vline(xintercept = c(2, -2), color = "red") +
  theme_classic()

# Interactive table of all retained words, largest t first.
word_coeffs_min5_t2 %>%
  arrange(desc(t)) %>%
  DT::datatable()

Show entries

Search:

	word	Estimate	SE	t	n_know
1	mummie	0.383032912983459	0.119040405636	3.21767143632467	18
2	oh	0.347123331014364	0.139173957165264	2.49416872297572	44
3	dear	0.376140632788464	0.160630211837299	2.34165558574656	13
4	fall	0.332443078459379	0.161590314944604	2.05732056759309	11
5	uh	0.233263834699509	0.123531444271937	1.8882952115901	21
6	fish	0.320020133482659	0.185065206742266	1.7292290599407	11
7	balloon	0.364146672766893	0.221247295034181	1.64588079013863	6
8	teddy	0.277073377948581	0.170576145234168	1.62433837139544	11
9	monkey	0.29210010898967	0.18097344046775	1.61404959885107	13
10	gone	0.198259351444858	0.125852283246179	1.57533376694521	20

Showing 1 to 10 of 591 entries

Previous1 2 3 4 5…60Next

Sanity Check

Do kids who have a high mean t also have a high mtld diff? Yes.

# Word-type counts by child and time bin. The column names are supplied here
# rather than read from the file.
all_types <- read_csv("data/types_by_kid_24_30.csv",
                      col_names = c("collection_name", "corpus_id",
                                    "target_child_name", "target_child_id",
                                    "tbin", "gloss", "count"))

# Minimum summed count a word needs at t1 to be kept for a child.
MINWORDSFORVOCAB <- 5

# Sum the count of each lower-cased word per child within the t1 bin and keep
# the words that reach the threshold. ungroup() removes the child-level
# grouping that summarize() would otherwise leave attached to the result.
word_counts <- all_types %>%
  filter(tbin == "t1") %>%
  mutate(gloss_clean = tolower(gloss)) %>%
  group_by(target_child_id, gloss_clean) %>%
  summarize(count = sum(count)) %>%
  ungroup() %>%
  filter(count >= MINWORDSFORVOCAB)

# Attach each word's t-value and aggregate per child. Words that have no row
# in the coefficient table get NA from the left join and are skipped by
# na.rm = TRUE.
t1_word_counts_with_ts <- word_counts %>%
  left_join(word_coeffs_min5_t2 %>% select(word, t),
            by = c("gloss_clean" = "word")) %>%
  select(-gloss_clean, -count) %>%
  group_by(target_child_id) %>%
  summarize(sum_t = sum(t, na.rm = TRUE),
            mean_t = mean(t, na.rm = TRUE))

# Per-child outcome table: mtld_diff is log(mtld_t2) - log(mtld_t1); the age
# and corpus columns are carried along unchanged.
mtld_age <- read_csv("data/mtld_by_kid_24_30.csv") %>%
  mutate(log_mtld_t1 = log(mtld_t1),
         log_mtld_t2 = log(mtld_t2),
         mtld_diff = log_mtld_t2 - log_mtld_t1) %>%
  select(target_child_id, mtld_diff,
         age_t1, age_t2, age_diff, corpus_name)

# target_child_id is the only column the two tables share, so name it as the
# key instead of relying on an implicit natural join.
t1_word_counts_with_ts_mtld <- t1_word_counts_with_ts %>%
  left_join(mtld_age, by = "target_child_id") %>%
  select(-target_child_id)

# Scatterplot of mean_t against mtld_diff with a linear fit.
t1_word_counts_with_ts_mtld %>%
  ggplot(aes(x = mean_t, y = mtld_diff)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

Does it hold when controlling for the number of transcripts at t1, age at t1, and the age difference? Yes.

# Transcript-count table read from data/n_transcript_by_kid_24_30.csv.
n_transcripts <- read_csv("data/n_transcript_by_kid_24_30.csv")

# Rebuild the per-child table, this time keeping target_child_id and adding
# the transcript counts. Both joins match on whatever columns the tables share.
# NOTE(review): the columns of n_transcripts are not visible here -- confirm
# the shared key(s) before making the join explicit.
t1_word_counts_with_ts_mtld <- left_join(
  left_join(t1_word_counts_with_ts, mtld_age),
  n_transcripts
)

# Regress mtld_diff on mean_t together with the transcript count at t1, the
# age difference, and age at t1, then print the model summary.
summary(
  lm(mtld_diff ~ mean_t + n_transcripts_t1 +
       # mean_freq_t1 +
       age_diff + age_t1,
     data = t1_word_counts_with_ts_mtld)
)

## 
## Call:
## lm(formula = mtld_diff ~ mean_t + n_transcripts_t1 + age_diff + 
##     age_t1, data = t1_word_counts_with_ts_mtld)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.05399 -0.17631  0.02414  0.22274  0.65504 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -5.551437   5.591685  -0.993    0.325    
## mean_t            0.558648   0.076519   7.301 8.37e-10 ***
## n_transcripts_t1 -0.003508   0.003968  -0.884    0.380    
## age_diff          0.004247   0.004965   0.855    0.396    
## age_t1            0.007040   0.006783   1.038    0.304    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3514 on 59 degrees of freedom
## Multiple R-squared:  0.4938, Adjusted R-squared:  0.4594 
## F-statistic: 14.39 on 4 and 59 DF,  p-value: 2.956e-08

Predicting t-value of a word with other measures

# Root of the local research-projects tree. Every norm file below lives under
# it, so the location is defined once; set the VOCAB_SEEDS_PROJECTS_DIR
# environment variable to run the analysis on another machine. The default
# reproduces the original absolute paths exactly.
PROJECTS_DIR <- Sys.getenv(
  "VOCAB_SEEDS_PROJECTS_DIR",
  unset = "/Users/mollylewis/Documents/research/Projects"
)
KID_VOCABS_DATA <- file.path(
  PROJECTS_DIR, "1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data"
)
BY_WORD_DATA <- file.path(
  PROJECTS_DIR, "1_in_progress/VOCAB_SEEDS/analyses/6_by_word_analyses/data"
)
REF_COMPLEX_CORPUS <- file.path(PROJECTS_DIR, "2_published/ref_complex/corpus")

# Word-frequency table, without its `n` column.
freq <- read_csv(file.path(KID_VOCABS_DATA, "childes_adult_word_freq.csv")) %>%
  select(-n)

# Density norms, used as read.
density_norms <- read_csv(file.path(KID_VOCABS_DATA, "bills_density_norms.csv"))

# Age-of-acquisition ratings: keep the word and its mean rating.
aoa_norms <- read_csv(
  file.path(PROJECTS_DIR,
            "1_in_progress/next_kids/stimuli_selection",
            "AoA_ratings_Kuperman_et_al_BRM.csv")
) %>%
  select(Word, Rating.Mean) %>%
  rename(word = Word,
         adult_aoa_estimate = Rating.Mean)

# Per-word embedding distance; mean_dist_wiki is renamed to mean_dist.
embedding_dist <- read_csv(
  file.path(BY_WORD_DATA, "wiki_embedding_dist_by_word.csv")
) %>%
  rename(mean_dist = mean_dist_wiki)

# Concreteness ratings: keep the word and Conc.M.
concreteness <- read_csv(
  file.path(REF_COMPLEX_CORPUS, "brysbaert_database/brysbaert_corpus.csv")
) %>%
  rename(word = Word) %>%
  select(word, Conc.M)

# Concept norms. Each Concept is lower-cased and cut at the first "_", then
# only word and Mean_Distinct_No_Tax are kept (the latter must be among the
# positionally selected columns 14:33).
# NOTE(review): cutting at "_" can map several concepts onto the same word,
# which would duplicate rows in a later left join on word -- confirm.
concepts <- read_tsv(file.path(KID_VOCABS_DATA, "CONCS_brm.txt")) %>%
  select(Concept, Familiarity, Length_Syllables, Bigram, 14:33) %>%
  mutate(Concept = tolower(Concept),
         Concept = map_chr(Concept, ~ pluck(str_split(., "_"), 1, 1))) %>%
  rename(word = Concept) %>%
  select(word, Mean_Distinct_No_Tax)

# Dominant part of speech, recoded to "v" for "Verb" and "o" for anything
# else (missing values stay missing).
pos <- read_tsv(
  file.path(KID_VOCABS_DATA,
            "SUBTLEX-US\ frequency\ list\ with\ PoS\ information\ text\ version.txt")
) %>%
  select(Word, Dom_PoS_SUBTLEX) %>%
  rename(word = Word,
         pos = Dom_PoS_SUBTLEX) %>%
  mutate(pos = ifelse(pos == "Verb", "v", "o"))

# Glasgow norms: every column whose name contains "_M", minus AOA_M and CNC_M.
glasgow <- read_csv(
  file.path(PROJECTS_DIR,
            "1_in_progress/IATLANG/data/study1a/raw/GlasgowNorms.csv")
) %>%
  select(word, contains("_M")) %>%
  select(-AOA_M, -CNC_M)

# Warriner et al. ratings: the V, A and D mean columns.
ar_va <- read_csv(file.path(BY_WORD_DATA, "Ratings_Warriner_et_al.csv")) %>%
  select(Word, V.Mean.Sum, A.Mean.Sum, D.Mean.Sum) %>%
  rename(word = Word)

# Complexity norms: keep the word and its complexity value.
complexity <- read_csv(
  file.path(REF_COMPLEX_CORPUS, "MRC_database/complexity_norms.csv")
) %>%
  select(word, complexity)

# Fold the norm tables onto the lower-cased word coefficients with successive
# left joins (each matching on the columns the two tables share), in the order
# listed. Then add the word's character length and keep words with
# n_know >= 5.
word_coeffs_min5_t2_with_vars <- purrr::reduce(
  list(density_norms, freq, embedding_dist, concepts, concreteness,
       aoa_norms, pos, ar_va, glasgow, complexity),
  left_join,
  .init = mutate(word_coeffs_min5_t2, word = tolower(word))
) %>%
  mutate(word_length = nchar(word)) %>%
  filter(n_know >= 5)

# Columns entering the correlation analysis: everything except the word
# label, the Estimate and SE columns, and pos.
df_corrs <- select(word_coeffs_min5_t2_with_vars,
                   -word, -Estimate, -SE, -pos)
# filter_all(all_vars(!is.na(.)))

# Pairwise correlations, each computed on the rows complete for that pair.
corr_mat <- cor(df_corrs, use = "pairwise.complete.obs")

# Matrix of p-values for the same pairs of columns.
p.mat <- corrplot::cor.mtest(
  df_corrs,
  conf.level = .95,
  use = "pairwise.complete.obs"
)$p

# 100-step red -> white -> blue ramp, reversed so it runs blue -> white -> red.
cols <- rev(colorRampPalette(c("red", "white", "blue"))(100))

# Full, hierarchically clustered correlation plot without the diagonal.
# Coefficients are printed in black; cells that are not significant at
# sig.level are left blank.
corrplot::corrplot(
  corr_mat,
  method = "color", col = cols,
  type = "full", order = "hclust", diag = FALSE,
  addCoef.col = "black", number.cex = .7,
  p.mat = p.mat, sig.level = .05, insig = "blank",
  tl.col = "black", tl.srt = 90
)

## USE THIS MODEL
# Regress each word's t-value on mean_dist, Conc.M, centrality, pos,
# word_length and log_freq, then print the model summary.
summary(
  lm(t ~ mean_dist + Conc.M + centrality + pos + word_length + log_freq,
     data = word_coeffs_min5_t2_with_vars)
)

## 
## Call:
## lm(formula = t ~ mean_dist + Conc.M + centrality + pos + word_length + 
##     log_freq, data = word_coeffs_min5_t2_with_vars)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.53985 -0.46376 -0.05939  0.42980  2.74676 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.22082    0.58094   5.544 6.48e-08 ***
## mean_dist     4.40994    2.22275   1.984   0.0482 *  
## Conc.M        0.01265    0.04542   0.278   0.7809    
## centrality  -17.31602    4.03617  -4.290 2.41e-05 ***
## posv          0.12541    0.09216   1.361   0.1746    
## word_length   0.06226    0.03318   1.877   0.0616 .  
## log_freq     -0.17942    0.04113  -4.362 1.77e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6641 on 300 degrees of freedom
##   (41 observations deleted due to missingness)
## Multiple R-squared:  0.3448, Adjusted R-squared:  0.3317 
## F-statistic: 26.32 on 6 and 300 DF,  p-value: < 2.2e-16

Regression analysis for each word - mtld diff

Predicting mtld diff

Sanity Check

Predicting t-value of a word with other measures