library(knitr)
# Global knitr chunk options for the report. The logicals are spelled out as
# TRUE/FALSE because T and F are ordinary variables that can be reassigned.
opts_chunk$set(
  echo = TRUE, message = FALSE, warning = FALSE,
  error = FALSE, cache = FALSE, tidy = FALSE
)
library(tidyverse)
library(feather)
library(langcog)
library(modelr)
library(broom)
library(corrplot)
# Default ggplot2 theme for the session: classic theme, 10pt base size.
theme_set(theme_classic(base_size = 10))
# MODEL: lm(log_mtld_t2 ~ know_word_at_t1 + log_mtld_t1 + age_t1 + age_diff + log(n_transcripts_t1) + log(n_transcripts_t2), complete_df)
# Per-word coefficients. `word` is lower-cased so it can be matched against
# the lower-cased glosses used further down in this file.
word_coeffs_min5_t2 <- read_csv("data/word_coeffs_log_mtld_t2_900_1200.csv") %>%
  mutate(word = tolower(word))

# Histogram of the per-word t values with red reference lines at -2 and 2.
# The reference positions are constants, so they are passed as a parameter
# rather than through aes() (which would draw one line per data row).
ggplot(word_coeffs_min5_t2, aes(t)) +
  geom_histogram() +
  ggtitle("t-distribution ") +
  geom_vline(xintercept = c(-2, 2), color = "red") +
  theme_classic()

# Interactive table of all words, largest t first.
word_coeffs_min5_t2 %>%
  arrange(desc(t)) %>%
  DT::datatable()
# Do kids who have a high mean t have high mtld at t2? Yes.
# Type counts by child, time bin (`tbin`) and gloss.
all_types <- read_csv(
  "/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/1_mtld_measure/data/target_types_for_MTLD_kids_900_1200.csv"
)

# Minimum total t1 count for a word to be kept for a child.
MINWORDSFORVOCAB <- 5

# Total t1 count of each (case-folded) word per child, keeping only words
# that reach the threshold above. The result stays grouped by
# target_child_id, as summarize() peels off only the last grouping variable.
word_counts <- all_types %>%
  filter(tbin == "t1") %>%
  group_by(target_child_id, gloss_clean = tolower(gloss)) %>%
  summarize(count = sum(count)) %>%
  filter(MINWORDSFORVOCAB <= count)
# Per-child summary of the t values of the words in that child's t1
# vocabulary. Words with no matching coefficient get t = NA from the left
# join and are ignored by na.rm = TRUE; a child with no matched words at all
# therefore gets sum_t = 0 and mean_t = NaN.
t1_word_counts_with_ts <- word_counts %>%
  left_join(
    word_coeffs_min5_t2 %>% select(word, t),
    by = c("gloss_clean" = "word")
  ) %>%
  select(-gloss_clean, -count) %>%
  group_by(target_child_id) %>%
  summarize(
    sum_t = sum(t, na.rm = TRUE),
    mean_t = mean(t, na.rm = TRUE)
  )
# Per-child MTLD and age information: log-transform MTLD at both time
# points, compute the age gap between them, and keep only the columns used
# in the analyses below.
mtld_age <- read_csv(
  "/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/1_mtld_measure/data/groups_info_900_1200.csv"
) %>%
  transmute(
    target_child_id,
    log_mtld_t1 = log(mtld_t1),
    log_mtld_t2 = log(mtld_t2),
    age_t1,
    age_t2,
    age_diff = age_t2 - age_t1,
    corpus_name
  )
# Regress the t1 -> t2 change in log MTLD on the t1 level. The data argument
# is spelled out in full instead of relying on partial matching (`d =`).
lm(I(log_mtld_t2 - log_mtld_t1) ~ log_mtld_t1, data = mtld_age) %>%
  summary()
##
## Call:
## lm(formula = I(log_mtld_t2 - log_mtld_t1) ~ log_mtld_t1, data = mtld_age)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.57373 -0.16164 -0.01849 0.11300 0.97127
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.32370 0.21936 6.034 5.83e-08 ***
## log_mtld_t1 -0.39670 0.07634 -5.196 1.73e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2786 on 74 degrees of freedom
## Multiple R-squared: 0.2673, Adjusted R-squared: 0.2574
## F-statistic: 27 on 1 and 74 DF, p-value: 1.735e-06
# Attach MTLD/age information to the per-child t summaries. The join key is
# named explicitly: target_child_id is the only column the two tables share.
t1_word_counts_with_ts_mtld <- t1_word_counts_with_ts %>%
  left_join(mtld_age, by = "target_child_id") %>%
  select(-target_child_id)

# Mean t of a child's t1 vocabulary against log MTLD at t2, with a linear fit.
t1_word_counts_with_ts_mtld %>%
  ggplot(aes(x = mean_t, y = log_mtld_t2)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()
# Does it hold controlling for stuff? Yes.
# Transcript lengths: drop rows with no t1 length, log-transform both time
# points, and keep only the logged versions.
transcript_length <- read_csv("data/transcript_lengths_900_1200.csv") %>%
  filter(!is.na(transcript_length_t1)) %>%
  mutate(
    log_transcript_length_t1 = log(transcript_length_t1),
    log_transcript_length_t2 = log(transcript_length_t2)
  ) %>%
  select(-c(transcript_length_t1, transcript_length_t2))
# Frequency covariates (the model below uses mean_freq_t1 and mean_freq_t2).
freq_info <- read_csv(
  "/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/frequency_based_on_input_by_kid.csv"
)

# Rebuild the analysis table, this time keeping target_child_id and adding
# the transcript-length and frequency covariates.
# NOTE(review): the last two joins still use dplyr's natural join because the
# columns of those CSVs are not visible here; name the keys once confirmed.
t1_word_counts_with_ts_mtld <- t1_word_counts_with_ts %>%
  left_join(mtld_age, by = "target_child_id") %>%
  left_join(transcript_length) %>%
  left_join(freq_info)
# Full control model: log MTLD at t2 on mean_t plus t1 MTLD, transcript
# lengths, frequencies and ages.
summary(lm(
  log_mtld_t2 ~ log_mtld_t1 + mean_t +
    log_transcript_length_t1 + log_transcript_length_t2 +
    mean_freq_t1 + mean_freq_t2 +
    age_diff + age_t1,
  data = t1_word_counts_with_ts_mtld
))
##
## Call:
## lm(formula = log_mtld_t2 ~ log_mtld_t1 + mean_t + log_transcript_length_t1 +
## log_transcript_length_t2 + mean_freq_t1 + mean_freq_t2 +
## age_diff + age_t1, data = t1_word_counts_with_ts_mtld)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.45415 -0.05452 -0.00522 0.05512 0.27382
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.5835420 1.8494874 2.478 0.0169 *
## log_mtld_t1 0.7126842 0.0488464 14.590 < 2e-16 ***
## mean_t 0.8691092 0.0773202 11.240 8.68e-15 ***
## log_transcript_length_t1 -0.0858475 0.0371749 -2.309 0.0255 *
## log_transcript_length_t2 0.0650993 0.0343015 1.898 0.0640 .
## mean_freq_t1 -0.0081557 0.0288969 -0.282 0.7790
## mean_freq_t2 0.0540937 0.0343828 1.573 0.1225
## age_diff 0.0004607 0.0011180 0.412 0.6822
## age_t1 -0.0040387 0.0017191 -2.349 0.0232 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1325 on 46 degrees of freedom
## (20 observations deleted due to missingness)
## Multiple R-squared: 0.9129, Adjusted R-squared: 0.8977
## F-statistic: 60.23 on 8 and 46 DF, p-value: < 2.2e-16
# Reduced control model: same as the full model but without the t2
# covariates (log_transcript_length_t2 and mean_freq_t2).
summary(lm(
  log_mtld_t2 ~ log_mtld_t1 + mean_t + log_transcript_length_t1 +
    mean_freq_t1 +
    age_diff + age_t1,
  data = t1_word_counts_with_ts_mtld
))
##
## Call:
## lm(formula = log_mtld_t2 ~ log_mtld_t1 + mean_t + log_transcript_length_t1 +
## mean_freq_t1 + age_diff + age_t1, data = t1_word_counts_with_ts_mtld)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.37755 -0.05138 -0.00632 0.05580 0.35392
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.1978162 1.4566440 1.509 0.13777
## log_mtld_t1 0.7252569 0.0501639 14.458 < 2e-16 ***
## mean_t 0.8351910 0.0774632 10.782 1.56e-14 ***
## log_transcript_length_t1 -0.0187220 0.0125815 -1.488 0.14314
## mean_freq_t1 0.0116275 0.0276513 0.421 0.67596
## age_diff 0.0021180 0.0007697 2.752 0.00829 **
## age_t1 -0.0015466 0.0013211 -1.171 0.24738
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1377 on 49 degrees of freedom
## (19 observations deleted due to missingness)
## Multiple R-squared: 0.9, Adjusted R-squared: 0.8878
## F-statistic: 73.52 on 6 and 49 DF, p-value: < 2.2e-16
# Word-level predictor tables, each joined onto the per-word coefficients
# further down.
freq <- read_csv(
  "/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/childes_adult_word_freq.csv"
)
density_norms <- read_csv(
  "/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/bills_density_norms.csv"
)
embedding_dist <- read_csv("data/childes_embedding_dist_by_word.csv")
embedding_dist_wiki <- read_csv("data/wiki_embedding_dist_by_word.csv")

# Concreteness ratings: keep the word (renamed from `Word`) and Conc.M only.
concreteness <- read_csv(
  "/Users/mollylewis/Documents/research/Projects/2_published/ref_complex/corpus/brysbaert_database/brysbaert_corpus.csv"
) %>%
  select(word = Word, Conc.M)
# Concept norms reduced to a (word, Mean_Distinct_No_Tax) lookup. `word` is
# the lower-cased Concept with everything from the first "_" onwards removed.
# The earlier positional select (columns 14:33) is dropped: it was brittle and
# every column it kept except these two was discarded again two steps later.
# NOTE(review): stripping the "_" suffix can map several Concept entries to
# the same word, which would duplicate rows when this table is left-joined;
# confirm `word` is unique here.
concepts <- read_tsv(
  "/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/CONCS_brm.txt"
) %>%
  mutate(word = map_chr(str_split(tolower(Concept), "_"), 1)) %>%
  select(word, Mean_Distinct_No_Tax)
# Per-word coefficients with every word-level predictor attached. The tables
# are left-joined one after another, in this order, using dplyr's default
# (natural-join) keys.
word_coeffs_min5_t2_with_vars <- purrr::reduce(
  list(
    word_coeffs_min5_t2 %>% mutate(word = tolower(word)),
    density_norms,
    freq,
    embedding_dist,
    concepts,
    concreteness,
    embedding_dist_wiki
  ),
  left_join
)
# Word t as a function of centrality, controlling for log frequency.
summary(lm(t ~ centrality + log_freq, data = word_coeffs_min5_t2_with_vars))
##
## Call:
## lm(formula = t ~ centrality + log_freq, data = word_coeffs_min5_t2_with_vars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.74015 -0.41475 -0.00537 0.47439 3.15955
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.334350 0.111138 -3.008 0.00265 **
## centrality 1.778131 0.711449 2.499 0.01250 *
## log_freq 0.017601 0.005744 3.064 0.00220 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5896 on 2941 degrees of freedom
## (874 observations deleted due to missingness)
## Multiple R-squared: 0.006401, Adjusted R-squared: 0.005725
## F-statistic: 9.473 on 2 and 2941 DF, p-value: 7.925e-05
# Word t as a function of density, controlling for log frequency.
summary(lm(t ~ density + log_freq, data = word_coeffs_min5_t2_with_vars))
##
## Call:
## lm(formula = t ~ density + log_freq, data = word_coeffs_min5_t2_with_vars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.76203 -0.41553 -0.00503 0.47510 3.15874
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.101e-01 3.905e-02 -2.819 0.004854 **
## density 3.556e-05 1.462e-05 2.433 0.015037 *
## log_freq 2.031e-02 5.654e-03 3.591 0.000334 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5897 on 2941 degrees of freedom
## (874 observations deleted due to missingness)
## Multiple R-squared: 0.006291, Adjusted R-squared: 0.005615
## F-statistic: 9.309 on 2 and 2941 DF, p-value: 9.331e-05
# Word t as a function of mean_dist, controlling for log frequency.
summary(lm(t ~ mean_dist + log_freq, data = word_coeffs_min5_t2_with_vars))
##
## Call:
## lm(formula = t ~ mean_dist + log_freq, data = word_coeffs_min5_t2_with_vars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.7544 -0.4427 -0.0094 0.4266 3.3028
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.07691 0.08274 0.930 0.353
## mean_dist 0.63495 0.11153 5.693 1.51e-08 ***
## log_freq -0.01770 0.01272 -1.391 0.164
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6549 on 1466 degrees of freedom
## (2349 observations deleted due to missingness)
## Multiple R-squared: 0.02533, Adjusted R-squared: 0.024
## F-statistic: 19.05 on 2 and 1466 DF, p-value: 6.823e-09
# Standardized model with the mean_dist x log_freq interaction.
summary(lm(
  scale(t) ~ scale(mean_dist) * scale(log_freq),
  data = word_coeffs_min5_t2_with_vars
))
##
## Call:
## lm(formula = scale(t) ~ scale(mean_dist) * scale(log_freq), data = word_coeffs_min5_t2_with_vars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.5469 -0.7553 -0.0291 0.7156 5.5098
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.08401 0.04533 1.854 0.0640 .
## scale(mean_dist) 0.26603 0.04692 5.669 1.73e-08 ***
## scale(log_freq) -0.04383 0.04630 -0.947 0.3439
## scale(mean_dist):scale(log_freq) -0.09512 0.04573 -2.080 0.0377 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.103 on 1465 degrees of freedom
## (2349 observations deleted due to missingness)
## Multiple R-squared: 0.0282, Adjusted R-squared: 0.02621
## F-statistic: 14.17 on 3 and 1465 DF, p-value: 4.181e-09
# Word t as a function of mean_dist_wiki, controlling for log frequency.
summary(lm(
  t ~ mean_dist_wiki + log_freq,
  data = word_coeffs_min5_t2_with_vars
))
##
## Call:
## lm(formula = t ~ mean_dist_wiki + log_freq, data = word_coeffs_min5_t2_with_vars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.7089 -0.4466 -0.0233 0.4599 3.1649
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.24671 0.12675 -1.946 0.0518 .
## mean_dist_wiki 0.97687 0.58143 1.680 0.0931 .
## log_freq 0.01529 0.01037 1.474 0.1408
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6538 on 1502 degrees of freedom
## (2313 observations deleted due to missingness)
## Multiple R-squared: 0.003992, Adjusted R-squared: 0.002665
## F-statistic: 3.01 on 2 and 1502 DF, p-value: 0.0496
# Standardized model with the mean_dist_wiki x log_freq interaction.
summary(lm(
  scale(t) ~ scale(mean_dist_wiki) * scale(log_freq),
  data = word_coeffs_min5_t2_with_vars
))
##
## Call:
## lm(formula = scale(t) ~ scale(mean_dist_wiki) * scale(log_freq),
## data = word_coeffs_min5_t2_with_vars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.5705 -0.7533 -0.0398 0.7756 5.3371
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) -0.036964 0.037605 -0.983
## scale(mean_dist_wiki) 0.049271 0.036922 1.334
## scale(log_freq) 0.054686 0.037213 1.470
## scale(mean_dist_wiki):scale(log_freq) -0.001219 0.035613 -0.034
## Pr(>|t|)
## (Intercept) 0.326
## scale(mean_dist_wiki) 0.182
## scale(log_freq) 0.142
## scale(mean_dist_wiki):scale(log_freq) 0.973
##
## Residual standard error: 1.103 on 1501 degrees of freedom
## (2313 observations deleted due to missingness)
## Multiple R-squared: 0.003992, Adjusted R-squared: 0.002002
## F-statistic: 2.006 on 3 and 1501 DF, p-value: 0.1113
For wikipedia distances, freq-dist interaction holds, no main effect of dist.
mean_dist_wiki x freq interaction:
# Linear fit of t on mean_dist_wiki, drawn separately for the two log_freq
# bins. ntile() splits the words into two equal-sized bins by rank; words
# with a missing log_freq get an NA bin and are dropped.
word_coeffs_min5_t2_with_vars %>%
  mutate(freq_bin = as.factor(ntile(log_freq, 2))) %>%
  filter(!is.na(freq_bin)) %>%
  # filter(mean_dist > .3) %>%
  ggplot(aes(
    x = mean_dist_wiki, y = t,
    group = freq_bin, color = freq_bin
  )) +
  # geom_point() +
  geom_smooth(method = "lm")
# Adding in frequency:
# Word t as a function of concreteness (Conc.M), controlling for log
# frequency.
summary(lm(t ~ Conc.M + log_freq, data = word_coeffs_min5_t2_with_vars))
##
## Call:
## lm(formula = t ~ Conc.M + log_freq, data = word_coeffs_min5_t2_with_vars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.7614 -0.4271 -0.0090 0.4775 3.1622
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.068885 0.081003 -0.850 0.3952
## Conc.M 0.004330 0.013281 0.326 0.7444
## log_freq 0.018153 0.007392 2.456 0.0141 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6006 on 2162 degrees of freedom
## (1653 observations deleted due to missingness)
## Multiple R-squared: 0.002892, Adjusted R-squared: 0.001969
## F-statistic: 3.135 on 2 and 2162 DF, p-value: 0.04369
# mean_dist and concreteness entered together, controlling for log frequency.
summary(lm(
  t ~ mean_dist + Conc.M + log_freq,
  data = word_coeffs_min5_t2_with_vars
))
##
## Call:
## lm(formula = t ~ mean_dist + Conc.M + log_freq, data = word_coeffs_min5_t2_with_vars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.7129 -0.4435 -0.0153 0.4149 3.2831
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.07517 0.16393 -0.459 0.647
## mean_dist 0.67447 0.12604 5.351 1.05e-07 ***
## Conc.M 0.03251 0.02096 1.551 0.121
## log_freq -0.01526 0.01635 -0.933 0.351
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6546 on 1163 degrees of freedom
## (2651 observations deleted due to missingness)
## Multiple R-squared: 0.02787, Adjusted R-squared: 0.02536
## F-statistic: 11.11 on 3 and 1163 DF, p-value: 3.402e-07
# Concreteness plus the mean_dist x log_freq interaction.
summary(lm(
  t ~ Conc.M + mean_dist * log_freq,
  data = word_coeffs_min5_t2_with_vars
))
##
## Call:
## lm(formula = t ~ Conc.M + mean_dist * log_freq, data = word_coeffs_min5_t2_with_vars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.6540 -0.4373 -0.0215 0.4182 3.2358
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.34329 0.19683 -1.744 0.081399 .
## Conc.M 0.02354 0.02124 1.108 0.267930
## mean_dist 2.10729 0.59837 3.522 0.000445 ***
## log_freq 0.03310 0.02561 1.292 0.196490
## mean_dist:log_freq -0.20589 0.08406 -2.449 0.014462 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6532 on 1162 degrees of freedom
## (2651 observations deleted due to missingness)
## Multiple R-squared: 0.03286, Adjusted R-squared: 0.02953
## F-statistic: 9.871 on 4 and 1162 DF, p-value: 7.44e-08
Concreteness and mean distance both predict t-score: words that are similar to other words and highly concrete have large t-scores.