library(knitr)
# Global knitr chunk options: echo the code; keep messages and warnings out of
# the rendered output; error = FALSE; no caching and no code tidying.
# Spelled with TRUE/FALSE because T and F are ordinary, reassignable variables.
opts_chunk$set(
  echo = TRUE, message = FALSE, warning = FALSE,
  error = FALSE, cache = FALSE, tidy = FALSE
)
library(tidyverse)
library(feather)
library(langcog)
library(modelr)
library(broom)
library(corrplot)
# Default ggplot2 theme for the session.
theme_set(theme_classic(base_size = 10))

# MODEL: lm(log_mtld_t2 ~ know_word_at_t1 + log_mtld_t1 + age_t1 + age_diff +
#   log(n_transcripts_t1) + log(n_transcripts_t2), complete_df)
# NOTE(review): the coefficient file read below is presumably the per-word
# output of the model noted above -- confirm against the script that wrote it.

# Per-word coefficients; `word` is lower-cased so it can be matched against
# the lower-cased glosses used later in the file.
word_coeffs_min5_t2 <- read_csv("data/word_coeffs_log_mtld_t2.csv") %>%
  mutate(word = tolower(word))
# Histogram of the per-word t values with red reference lines at t = -2 and
# t = 2. The lines are passed as a plain parameter rather than through aes(),
# so one line is drawn per value instead of one per data row.
ggplot(word_coeffs_min5_t2, aes(t)) +
  geom_histogram() +
  ggtitle("t-distribution ") +
  geom_vline(xintercept = c(-2, 2), color = "red") +
  theme_classic()

# Interactive table of all words, largest t first.
word_coeffs_min5_t2 %>%
  arrange(-t) %>%
  DT::datatable()

# Do kids who have a high mean t have high mtld at t2? Yes.
# Type counts read from a machine-specific absolute path.
all_types <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/1_mtld_measure/data/target_types_for_MTLD_kids_600_900.csv")

# Minimum summed t1 count for a (child, word) pair to be kept.
MINWORDSFORVOCAB <- 5

# One row per child x lower-cased gloss for the "t1" bin, keeping only pairs
# whose summed count reaches the threshold. The result is still grouped by
# target_child_id (summarize() drops only the last grouping variable).
word_counts <- all_types %>%
  mutate(gloss_clean = tolower(gloss)) %>%
  filter(tbin == "t1") %>%
  group_by(target_child_id, gloss_clean) %>%
  summarize(count = sum(count)) %>%
  filter(count >= MINWORDSFORVOCAB)
# Per-child sum and mean of the word-level t values over the words kept in
# `word_counts`. Words with no matching coefficient get t = NA from the left
# join and are skipped by na.rm = TRUE.
# NOTE(review): `word` was lower-cased upstream; if that left duplicate `word`
# values in word_coeffs_min5_t2, this join repeats rows per child and those
# words are counted more than once -- confirm uniqueness.
t1_word_counts_with_ts <- word_counts %>%
  left_join(
    select(word_coeffs_min5_t2, word, t),
    by = c("gloss_clean" = "word")
  ) %>%
  select(-gloss_clean, -count) %>%
  group_by(target_child_id) %>%
  summarize(
    sum_t = sum(t, na.rm = TRUE),
    mean_t = mean(t, na.rm = TRUE)
  )
# Per-child MTLD and age information (machine-specific absolute path).
# Adds log-transformed MTLD at both time points and the age gap between them,
# then keeps only the columns used downstream.
mtld_age <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/1_mtld_measure/data/groups_info_600_900_corrected.csv") %>%
  mutate(
    log_mtld_t1 = log(mtld_t1),
    log_mtld_t2 = log(mtld_t2),
    age_diff = age_t2 - age_t1
  ) %>%
  select(
    target_child_id,
    log_mtld_t1, log_mtld_t2,
    age_t1, age_t2, age_diff,
    corpus_name
  )
# Attach MTLD/age information to the per-child t summaries. The key is stated
# explicitly: target_child_id is the only column the two tables share.
t1_word_counts_with_ts_mtld <- t1_word_counts_with_ts %>%
  left_join(mtld_age, by = "target_child_id") %>%
  select(-target_child_id)

# Scatter of mean t at t1 against log MTLD at t2, with a linear fit.
t1_word_counts_with_ts_mtld %>%
  ggplot(aes(x = mean_t, y = log_mtld_t2)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

# Does it hold controlling for stuff? Yes.
# Per-child transcript lengths, log-transformed. Both raw columns are dropped;
# the original wrote `transcript_length_t2` without the leading minus, which
# left the raw t2 column in the table.
transcript_length <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/transcript_length_by_kid.csv") %>%
  mutate(
    log_transcript_length_t1 = log(transcript_length_t1),
    log_transcript_length_t2 = log(transcript_length_t2)
  ) %>%
  select(-transcript_length_t1, -transcript_length_t2)

# Per-child input-frequency measures (the model below uses mean_freq_t1 and
# mean_freq_t2 from the joined table).
freq_info <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/frequency_based_on_input_by_kid.csv")

# Rebuild the per-child table, this time keeping target_child_id and adding
# the control variables.
# NOTE(review): the last two joins use the natural (shared-column) key because
# the columns of those files are not visible here; presumably that is
# target_child_id -- confirm and make it explicit.
t1_word_counts_with_ts_mtld <- t1_word_counts_with_ts %>%
  left_join(mtld_age, by = "target_child_id") %>%
  left_join(transcript_length) %>%
  left_join(freq_info)

# log MTLD at t2 regressed on mean_t together with t1 MTLD, transcript
# lengths, input frequency and age terms.
lm(
  log_mtld_t2 ~ log_mtld_t1 + mean_t + log_transcript_length_t1 +
    log_transcript_length_t2 + mean_freq_t1 + mean_freq_t2 +
    age_diff + age_t1,
  data = t1_word_counts_with_ts_mtld
) %>%
  summary()
##
## Call:
## lm(formula = log_mtld_t2 ~ log_mtld_t1 + mean_t + log_transcript_length_t1 +
## log_transcript_length_t2 + mean_freq_t1 + mean_freq_t2 +
## age_diff + age_t1, data = t1_word_counts_with_ts_mtld)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.52342 -0.09859 -0.00054 0.08263 0.82875
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.427e+00 1.381e+00 3.206 0.00195 **
## log_mtld_t1 5.490e-01 4.701e-02 11.678 < 2e-16 ***
## mean_t 3.087e-01 5.644e-02 5.470 5.13e-07 ***
## log_transcript_length_t1 -9.636e-02 2.870e-02 -3.357 0.00121 **
## log_transcript_length_t2 1.124e-01 2.319e-02 4.848 6.13e-06 ***
## mean_freq_t1 -1.587e-02 2.874e-02 -0.552 0.58230
## mean_freq_t2 8.300e-02 3.984e-02 2.084 0.04044 *
## age_diff -7.947e-05 1.347e-03 -0.059 0.95309
## age_t1 -5.850e-03 1.755e-03 -3.332 0.00131 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1974 on 79 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.8025, Adjusted R-squared: 0.7825
## F-statistic: 40.13 on 8 and 79 DF, p-value: < 2.2e-16
# Same model as the previous fit, without the mean_freq_t2 term.
lm(
  log_mtld_t2 ~ log_mtld_t1 + mean_t + log_transcript_length_t1 +
    log_transcript_length_t2 + mean_freq_t1 +
    age_diff + age_t1,
  data = t1_word_counts_with_ts_mtld
) %>%
  summary()
##
## Call:
## lm(formula = log_mtld_t2 ~ log_mtld_t1 + mean_t + log_transcript_length_t1 +
## log_transcript_length_t2 + mean_freq_t1 + age_diff + age_t1,
## data = t1_word_counts_with_ts_mtld)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.5082 -0.1446 -0.0081 0.1051 0.9293
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.7411776 1.6413724 2.889 0.004949 **
## log_mtld_t1 0.5597619 0.0555810 10.071 5.40e-16 ***
## mean_t 0.3696902 0.0630847 5.860 9.29e-08 ***
## log_transcript_length_t1 -0.1318427 0.0338742 -3.892 0.000201 ***
## log_transcript_length_t2 0.1058127 0.0278141 3.804 0.000273 ***
## mean_freq_t1 -0.0344203 0.0342822 -1.004 0.318319
## age_diff 0.0005582 0.0016055 0.348 0.728951
## age_t1 -0.0046047 0.0021082 -2.184 0.031803 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.24 on 82 degrees of freedom
## Multiple R-squared: 0.7108, Adjusted R-squared: 0.6861
## F-statistic: 28.8 on 7 and 82 DF, p-value: < 2.2e-16
# Word-level predictor tables. All but the embedding distances are read from
# machine-specific absolute paths.
freq <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/childes_adult_word_freq.csv")
density_norms <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/bills_density_norms.csv")
embedding_dist <- read_csv("data/childes_embedding_dist_by_word.csv")

# Concreteness ratings: keep the word and its Conc.M column.
concreteness <- read_csv("/Users/mollylewis/Documents/research/Projects/2_published/ref_complex/corpus/brysbaert_database/brysbaert_corpus.csv") %>%
  rename(word = Word) %>%
  select(word, Conc.M)

# Concept norms: lower-case the concept label, keep only the text before the
# first underscore as `word`, and retain Mean_Distinct_No_Tax.
concepts <- read_tsv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/CONCS_brm.txt") %>%
  select(Concept, Familiarity, Length_Syllables, Bigram, 14:33) %>%
  mutate(
    Concept = tolower(Concept),
    Concept = map_chr(str_split(Concept, "_"), 1)
  ) %>%
  rename(word = Concept) %>%
  select(word, Mean_Distinct_No_Tax)
# Attach every word-level predictor table to the per-word coefficients by
# left-joining them in turn (natural join on the shared columns), starting
# from the coefficients with `word` lower-cased.
# NOTE(review): join keys are implicit; presumably `word` -- confirm. Any
# table with repeated keys will duplicate coefficient rows.
word_coeffs_min5_t2_with_vars <- reduce(
  list(density_norms, freq, embedding_dist, concepts, concreteness),
  left_join,
  .init = mutate(word_coeffs_min5_t2, word = tolower(word))
)
# Word-level t regressed on centrality and log frequency.
lm(t ~ centrality + log_freq, data = word_coeffs_min5_t2_with_vars) %>%
  summary()
##
## Call:
## lm(formula = t ~ centrality + log_freq, data = word_coeffs_min5_t2_with_vars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.27076 -0.42523 0.04564 0.46364 1.93046
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.117495 0.149479 -0.786 0.432
## centrality 0.673556 0.977164 0.689 0.491
## log_freq 0.061839 0.007709 8.022 1.89e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6183 on 1746 degrees of freedom
## (468 observations deleted due to missingness)
## Multiple R-squared: 0.03934, Adjusted R-squared: 0.03824
## F-statistic: 35.75 on 2 and 1746 DF, p-value: 6.065e-16
# Word-level t regressed on density and log frequency.
lm(t ~ density + log_freq, data = word_coeffs_min5_t2_with_vars) %>%
  summary()
##
## Call:
## lm(formula = t ~ density + log_freq, data = word_coeffs_min5_t2_with_vars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.2701 -0.4243 0.0505 0.4580 1.9063
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.843e-02 5.458e-02 -0.338 0.736
## density -2.347e-06 2.050e-05 -0.114 0.909
## log_freq 6.313e-02 7.492e-03 8.427 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6184 on 1746 degrees of freedom
## (468 observations deleted due to missingness)
## Multiple R-squared: 0.03909, Adjusted R-squared: 0.03799
## F-statistic: 35.51 on 2 and 1746 DF, p-value: 7.642e-16
# Word-level t regressed on mean_dist and log frequency.
lm(t ~ mean_dist + log_freq, data = word_coeffs_min5_t2_with_vars) %>%
  summary()
##
## Call:
## lm(formula = t ~ mean_dist + log_freq, data = word_coeffs_min5_t2_with_vars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.33427 -0.43370 0.07415 0.44955 1.91892
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.103186 0.054171 1.905 0.05696 .
## mean_dist 0.608182 0.099960 6.084 1.43e-09 ***
## log_freq 0.029573 0.009193 3.217 0.00132 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6173 on 1807 degrees of freedom
## (407 observations deleted due to missingness)
## Multiple R-squared: 0.05997, Adjusted R-squared: 0.05893
## F-statistic: 57.64 on 2 and 1807 DF, p-value: < 2.2e-16
# Standardized version with the mean_dist x log_freq interaction.
lm(
  scale(t) ~ scale(mean_dist) * scale(log_freq),
  data = word_coeffs_min5_t2_with_vars
) %>%
  summary()
##
## Call:
## lm(formula = scale(t) ~ scale(mean_dist) * scale(log_freq), data = word_coeffs_min5_t2_with_vars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.7817 -0.6848 0.1468 0.7016 2.9206
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.06764 0.03001 -2.253 0.024349 *
## scale(mean_dist) 0.12739 0.03286 3.877 0.000110 ***
## scale(log_freq) 0.10828 0.03292 3.289 0.001026 **
## scale(mean_dist):scale(log_freq) 0.11552 0.03307 3.493 0.000489 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.987 on 1806 degrees of freedom
## (407 observations deleted due to missingness)
## Multiple R-squared: 0.06628, Adjusted R-squared: 0.06473
## F-statistic: 42.73 on 3 and 1806 DF, p-value: < 2.2e-16
mean_dist x freq interaction:
# Plot t against mean_dist separately for the lower and upper half of
# log_freq (ntile into 2 bins); words without a frequency bin are dropped.
word_coeffs_min5_t2_with_vars %>%
  mutate(freq_bin = as.factor(ntile(log_freq, 2))) %>%
  filter(!is.na(freq_bin)) %>%
  #filter(mean_dist > .3) %>%
  ggplot(aes(x = mean_dist, y = t, group = freq_bin, color = freq_bin)) +
  geom_point() +
  geom_smooth(method = "lm")

# Words with mean_dist below .3 and n_know above 4.
word_coeffs_min5_t2_with_vars %>%
  filter(mean_dist < .3, n_know > 4) %>%
  data.frame() %>%
  pull(word)
## [1] "toys" "uh" "i" "cake" "cookie"
## [6] "mommy" "um" "mommy" "purple" "huh"
## [11] "doggie" "hold" "blanket" "hi" "all_gone"
## [16] "boom" "broke" "clock" "cream" "doll"
## [21] "dolly" "draw" "eh" "eye" "fall"
## [26] "fix" "flower" "girl" "hammer" "happy"
## [31] "horsie" "ice+cream" "jump" "mama" "moo"
## [36] "penny" "pocket" "puzzle" "swimming" "tower"
## [41] "turtle" "wake" "block" "cow" "daddy"
## [46] "pants" "wanna" "bunny" "bye" "cry"
## [51] "daddy" "daddy's" "ha" "ma" "rabbit"
## [56] "tiger" "woof" "light" "pooh" "bath"
## [61] "boots" "close" "da" "egg" "monkey"
## [66] "row" "uhoh" "whee" "dada" "dinosaur"
## [71] "quack" "star" "wee" "bike" "circle"
## [76] "la" "snake" "triangle" "bricks" "microphone"
## [81] "ow" "spider" "paint" "seesaw" "kitty"
## [86] "yummy" "tail" "fit" "fly" "frog"
## [91] "leg" "penguin" "tractor" "panda" "sheep"
## [96] "dada" "birdie" "mum" "mummy" "mummy's"
## [101] "mummie" "mummie" "er" "neenaw"
# Words with mean_dist below .3 and n_know above 4.
word_coeffs_min5_t2_with_vars %>%
  filter(mean_dist < .3, n_know > 4) %>%
  data.frame() %>%
  pull(word)
## [1] "toys" "uh" "i" "cake" "cookie"
## [6] "mommy" "um" "mommy" "purple" "huh"
## [11] "doggie" "hold" "blanket" "hi" "all_gone"
## [16] "boom" "broke" "clock" "cream" "doll"
## [21] "dolly" "draw" "eh" "eye" "fall"
## [26] "fix" "flower" "girl" "hammer" "happy"
## [31] "horsie" "ice+cream" "jump" "mama" "moo"
## [36] "penny" "pocket" "puzzle" "swimming" "tower"
## [41] "turtle" "wake" "block" "cow" "daddy"
## [46] "pants" "wanna" "bunny" "bye" "cry"
## [51] "daddy" "daddy's" "ha" "ma" "rabbit"
## [56] "tiger" "woof" "light" "pooh" "bath"
## [61] "boots" "close" "da" "egg" "monkey"
## [66] "row" "uhoh" "whee" "dada" "dinosaur"
## [71] "quack" "star" "wee" "bike" "circle"
## [76] "la" "snake" "triangle" "bricks" "microphone"
## [81] "ow" "spider" "paint" "seesaw" "kitty"
## [86] "yummy" "tail" "fit" "fly" "frog"
## [91] "leg" "penguin" "tractor" "panda" "sheep"
## [96] "dada" "birdie" "mum" "mummy" "mummy's"
## [101] "mummie" "mummie" "er" "neenaw"
Adding in frequency:
# Word-level t regressed on concreteness (Conc.M) and log frequency.
lm(t ~ Conc.M + log_freq, data = word_coeffs_min5_t2_with_vars) %>%
  summary()
##
## Call:
## lm(formula = t ~ Conc.M + log_freq, data = word_coeffs_min5_t2_with_vars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.29963 -0.42188 0.04347 0.44444 1.85038
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.60635 0.11726 -5.171 2.68e-07 ***
## Conc.M 0.07078 0.01746 4.054 5.33e-05 ***
## log_freq 0.11107 0.01002 11.084 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6041 on 1331 degrees of freedom
## (883 observations deleted due to missingness)
## Multiple R-squared: 0.0848, Adjusted R-squared: 0.08342
## F-statistic: 61.66 on 2 and 1331 DF, p-value: < 2.2e-16
# mean_dist, concreteness and log frequency entered together.
lm(t ~ mean_dist + Conc.M + log_freq, data = word_coeffs_min5_t2_with_vars) %>%
  summary()
##
## Call:
## lm(formula = t ~ mean_dist + Conc.M + log_freq, data = word_coeffs_min5_t2_with_vars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.40051 -0.41944 0.05586 0.42202 1.86395
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.51575 0.12211 -4.224 2.57e-05 ***
## mean_dist 0.61505 0.11123 5.530 3.86e-08 ***
## Conc.M 0.08230 0.01752 4.697 2.91e-06 ***
## log_freq 0.07633 0.01236 6.175 8.82e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5965 on 1319 degrees of freedom
## (894 observations deleted due to missingness)
## Multiple R-squared: 0.1064, Adjusted R-squared: 0.1043
## F-statistic: 52.33 on 3 and 1319 DF, p-value: < 2.2e-16
# Concreteness plus the mean_dist x log_freq interaction.
lm(t ~ Conc.M + mean_dist * log_freq, data = word_coeffs_min5_t2_with_vars) %>%
  summary()
##
## Call:
## lm(formula = t ~ Conc.M + mean_dist * log_freq, data = word_coeffs_min5_t2_with_vars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.39693 -0.40173 0.06233 0.41745 1.80106
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.41342 0.13083 -3.160 0.00161 **
## Conc.M 0.09192 0.01805 5.091 4.07e-07 ***
## mean_dist -0.33660 0.45460 -0.740 0.45917
## log_freq 0.05207 0.01669 3.119 0.00185 **
## mean_dist:log_freq 0.13685 0.06339 2.159 0.03104 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5956 on 1318 degrees of freedom
## (894 observations deleted due to missingness)
## Multiple R-squared: 0.1095, Adjusted R-squared: 0.1068
## F-statistic: 40.52 on 4 and 1318 DF, p-value: < 2.2e-16
Concreteness and mean distance both predict t-score: words that are similar to other words and highly concrete have large t-scores.