library(knitr)
# Global knitr chunk options: echo the code; suppress messages, warnings and
# error capture; no caching or tidying. Spelled with TRUE/FALSE because the
# T/F shorthands are ordinary variables that can be reassigned.
opts_chunk$set(
  echo = TRUE, message = FALSE, warning = FALSE,
  error = FALSE, cache = FALSE, tidy = FALSE
)
library(tidyverse)
library(feather)
library(langcog)
library(modelr)
library(broom)
library(corrplot)
# Default ggplot2 theme for every plot in this report.
theme_set(theme_classic(base_size = 10))

# NOTE(review): the model below appears to be the per-word model whose
# coefficients are loaded next -- confirm against the script that wrote the CSV.
# MODEL: lm(mtld_diff ~ know_word_at_t1 + age_t1 + age_diff + log(n_transcripts_t1) + log(n_transcripts_t2), complete_df)
# Load the per-word coefficient table, lower-case the word column, and keep
# only rows with n_know greater than 3.
word_coeffs_min5_t2 <- read_csv("data/word_coeffs_log_mtld_diff_600_900.csv")
word_coeffs_min5_t2 <- mutate(word_coeffs_min5_t2, word = tolower(word))
word_coeffs_min5_t2 <- filter(word_coeffs_min5_t2, n_know > 3)
# Histogram of the word-level t values, with red reference lines at t = -2
# and t = 2.
ggplot(word_coeffs_min5_t2, aes(x = t)) +
  geom_histogram() +
  ggtitle("t-distribution ") +
  # Constant intercepts are passed outside aes() so each line is drawn once
  # instead of once per data row.
  geom_vline(xintercept = c(-2, 2), color = "red") +
  theme_classic()

# Interactive table of the same words, sorted from largest to smallest t.
word_coeffs_min5_t2 %>%
  arrange(desc(t)) %>%
  DT::datatable()

# Do kids who have a high mean t have high mtld diff at? Yes.
# Threshold applied to a word's summed t1 count (per child) for the word to
# be kept in word_counts.
MINWORDSFORVOCAB <- 5

all_types <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/1_mtld_measure/data/target_types_for_MTLD_kids_600_900.csv")

# For the t1 bin only: total count of each lower-cased gloss per child,
# keeping child/word pairs that reach the threshold.
word_counts <- all_types %>%
  filter(tbin == "t1") %>%
  group_by(target_child_id, gloss_clean = tolower(gloss)) %>%
  summarize(count = sum(count)) %>%
  filter(count >= MINWORDSFORVOCAB)
# Per-child sum and mean of the word-level t values over that child's t1
# words. Words with no match in word_coeffs_min5_t2 get NA from the left join
# and are skipped by na.rm; a child with no matching words gets sum_t = 0 and
# mean_t = NaN.
t1_word_counts_with_ts <- word_counts %>%
  left_join(select(word_coeffs_min5_t2, word, t),
            by = c("gloss_clean" = "word")) %>%
  select(-gloss_clean, -count) %>%
  group_by(target_child_id) %>%
  summarize(sum_t = sum(t, na.rm = TRUE),
            mean_t = mean(t, na.rm = TRUE))
# Per-child outcome table: change in log MTLD from t1 to t2, ages at both
# time points, the age gap, and the corpus name.
mtld_age <- transmute(
  read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/1_mtld_measure/data/groups_info_600_900_corrected.csv"),
  target_child_id,
  mtld_diff = log(mtld_t2) - log(mtld_t1),
  age_t1,
  age_t2,
  age_diff = age_t2 - age_t1,
  corpus_name
)
# Attach the MTLD/age outcomes to the per-child t summaries, then drop the
# child id. The join key is named explicitly: target_child_id is the only
# column the two tables share.
t1_word_counts_with_ts_mtld <- t1_word_counts_with_ts %>%
  left_join(mtld_age, by = "target_child_id") %>%
  select(-target_child_id)
# Scatter plot of each child's mean t against their change in log MTLD, with
# a linear fit overlaid.
t1_word_counts_with_ts_mtld %>%
  ggplot(aes(x = mean_t, y = mtld_diff)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

# Does it hold controlling for stuff? Yes.
# Per-child transcript lengths on the log scale. Both raw length columns are
# dropped once their logs exist; the second term previously lacked its minus
# sign, so transcript_length_t2 was silently kept.
transcript_length <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/transcript_length_by_kid.csv") %>%
  mutate(log_transcript_length_t1 = log(transcript_length_t1),
         log_transcript_length_t2 = log(transcript_length_t2)) %>%
  select(-transcript_length_t1, -transcript_length_t2)
freq_info <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/frequency_based_on_input_by_kid.csv")

# Rebuild the per-child table, this time keeping the child id and adding the
# transcript-length and input-frequency covariates.
# NOTE(review): the last two joins still use natural keys because the columns
# of those tables are not visible here -- presumably target_child_id; confirm
# and name the key explicitly.
t1_word_counts_with_ts_mtld <- t1_word_counts_with_ts %>%
  left_join(mtld_age, by = "target_child_id") %>%
  left_join(transcript_length) %>%
  left_join(freq_info)
# Child-level regression: change in log MTLD on mean t, with log t1
# transcript length, mean t1 frequency, age gap and t1 age as covariates.
summary(
  lm(
    formula = mtld_diff ~ mean_t + log_transcript_length_t1 +
      mean_freq_t1 +
      age_diff + age_t1,
    data = t1_word_counts_with_ts_mtld
  )
)
##
## Call:
## lm(formula = mtld_diff ~ mean_t + log_transcript_length_t1 +
## mean_freq_t1 + age_diff + age_t1, data = t1_word_counts_with_ts_mtld)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.75136 -0.13993 0.01353 0.15027 0.76755
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.503187 1.882216 1.330 0.187
## mean_t 0.640568 0.091862 6.973 6.57e-10 ***
## log_transcript_length_t1 -0.024128 0.020037 -1.204 0.232
## mean_freq_t1 -0.005324 0.040999 -0.130 0.897
## age_diff 0.002071 0.001827 1.134 0.260
## age_t1 -0.003750 0.002403 -1.561 0.122
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2812 on 84 degrees of freedom
## Multiple R-squared: 0.6548, Adjusted R-squared: 0.6343
## F-statistic: 31.87 on 5 and 84 DF, p-value: < 2.2e-16
# Word-level predictor tables joined onto the coefficient table further down.
# NOTE(review): column contents are not visible here; the later models use
# log_freq, density, centrality and mean_dist, presumably from these files.
freq <- readr::read_csv(
  file = "/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/childes_adult_word_freq.csv"
)
density_norms <- readr::read_csv(
  file = "/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/bills_density_norms.csv"
)
embedding_dist <- readr::read_csv(
  file = "data/childes_embedding_dist_by_word.csv"
)
embedding_dist_wiki <- readr::read_csv(
  file = "data/wiki_embedding_dist_by_word.csv"
)
# Concreteness ratings: keep only the word (renamed from Word) and its
# Conc.M column.
concreteness <- select(
  read_csv("/Users/mollylewis/Documents/research/Projects/2_published/ref_complex/corpus/brysbaert_database/brysbaert_corpus.csv"),
  word = Word,
  Conc.M
)
# Concept norms reduced to one row per word with its Mean_Distinct_No_Tax.
# Concept labels are lower-cased and everything from the first "_" onward is
# removed. That stripping can map several concept rows onto the same word,
# which would duplicate rows of the coefficient table in the later left_join,
# so rows are collapsed by averaging within word.
# NOTE(review): assumes Mean_Distinct_No_Tax is numeric and that averaging
# across the collapsed rows is acceptable -- confirm.
concepts <- read_tsv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/CONCS_brm.txt") %>%
  # Columns are selected by name; the earlier positional 14:33 selection only
  # fed this one column and would break if the file layout changed.
  select(word = Concept, Mean_Distinct_No_Tax) %>%
  mutate(word = str_remove(tolower(word), "_.*")) %>%
  group_by(word) %>%
  summarize(Mean_Distinct_No_Tax = mean(Mean_Distinct_No_Tax, na.rm = TRUE))
# Left-join every word-level predictor table onto the coefficient table, in
# order, using natural (shared-column) keys.
# NOTE(review): if two predictor tables share a non-key column name, later
# natural joins will also match on that column -- verify the join keys.
word_coeffs_min5_t2_with_vars <- Reduce(
  left_join,
  list(
    mutate(word_coeffs_min5_t2, word = tolower(word)),
    density_norms,
    freq,
    embedding_dist,
    concepts,
    concreteness,
    embedding_dist_wiki
  )
)
# Word-level regression of t on centrality, controlling for log frequency.
summary(
  lm(formula = t ~ centrality + log_freq, data = word_coeffs_min5_t2_with_vars)
)
##
## Call:
## lm(formula = t ~ centrality + log_freq, data = word_coeffs_min5_t2_with_vars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.54563 -0.44258 -0.00751 0.48376 1.74976
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.2397 0.3050 7.344 9.69e-13 ***
## centrality -6.9504 2.2071 -3.149 0.00175 **
## log_freq -0.0582 0.0253 -2.300 0.02188 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6868 on 453 degrees of freedom
## (25 observations deleted due to missingness)
## Multiple R-squared: 0.06239, Adjusted R-squared: 0.05825
## F-statistic: 15.07 on 2 and 453 DF, p-value: 4.601e-07
# Word-level regression of t on density, controlling for log frequency.
summary(
  lm(formula = t ~ density + log_freq, data = word_coeffs_min5_t2_with_vars)
)
##
## Call:
## lm(formula = t ~ density + log_freq, data = word_coeffs_min5_t2_with_vars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.54810 -0.44759 0.00077 0.50423 1.79157
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.476e+00 1.881e-01 7.847 3.1e-14 ***
## density -9.545e-05 4.854e-05 -1.966 0.04985 *
## log_freq -8.698e-02 2.275e-02 -3.824 0.00015 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6913 on 453 degrees of freedom
## (25 observations deleted due to missingness)
## Multiple R-squared: 0.04997, Adjusted R-squared: 0.04578
## F-statistic: 11.91 on 2 and 453 DF, p-value: 9.057e-06
# Word-level regression of t on mean_dist, controlling for log frequency.
summary(
  lm(formula = t ~ mean_dist + log_freq, data = word_coeffs_min5_t2_with_vars)
)
##
## Call:
## lm(formula = t ~ mean_dist + log_freq, data = word_coeffs_min5_t2_with_vars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.39917 -0.43970 -0.00758 0.49611 1.70313
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.73742 0.18541 9.371 < 2e-16 ***
## mean_dist 0.97602 0.20361 4.794 2.20e-06 ***
## log_freq -0.15994 0.02386 -6.702 5.87e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6847 on 473 degrees of freedom
## (5 observations deleted due to missingness)
## Multiple R-squared: 0.09298, Adjusted R-squared: 0.08914
## F-statistic: 24.24 on 2 and 473 DF, p-value: 9.475e-11
# Standardised version of the mean_dist model, including the
# mean_dist x log_freq interaction. scale() is applied inside the formula.
summary(
  lm(
    formula = scale(t) ~ scale(mean_dist) * scale(log_freq),
    data = word_coeffs_min5_t2_with_vars
  )
)
##
## Call:
## lm(formula = scale(t) ~ scale(mean_dist) * scale(log_freq), data = word_coeffs_min5_t2_with_vars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.3284 -0.5935 0.0191 0.6864 2.3896
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.04529 0.05114 0.886 0.37633
## scale(mean_dist) 0.19255 0.05723 3.364 0.00083 ***
## scale(log_freq) -0.31306 0.05124 -6.109 2.09e-09 ***
## scale(mean_dist):scale(log_freq) -0.09115 0.05698 -1.600 0.11037
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9551 on 472 degrees of freedom
## (5 observations deleted due to missingness)
## Multiple R-squared: 0.09787, Adjusted R-squared: 0.09213
## F-statistic: 17.07 on 3 and 472 DF, p-value: 1.537e-10
mean_dist_wiki x freq interaction:
# Linear fit of t on mean_dist, drawn separately for the lower and upper
# halves of log_freq (ntile with 2 bins). Words with missing log_freq are
# dropped.
# NOTE(review): the heading above mentions mean_dist_wiki but this plot uses
# mean_dist -- confirm which column is intended.
word_coeffs_min5_t2_with_vars %>%
  mutate(freq_bin = as.factor(ntile(log_freq, 2))) %>%
  filter(!is.na(freq_bin)) %>%
  # filter(mean_dist > .3) %>%
  ggplot(aes(x = mean_dist, y = t,
             group = freq_bin, color = freq_bin)) +
  # geom_point() +
  geom_smooth(method = "lm")

# Adding in frequency:
# Word-level regression of t on concreteness (Conc.M), controlling for log
# frequency.
summary(
  lm(formula = t ~ Conc.M + log_freq, data = word_coeffs_min5_t2_with_vars)
)
##
## Call:
## lm(formula = t ~ Conc.M + log_freq, data = word_coeffs_min5_t2_with_vars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.49239 -0.44428 0.01359 0.46540 1.64502
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.41745 0.36765 1.135 0.256844
## Conc.M 0.13644 0.03877 3.519 0.000482 ***
## log_freq -0.03191 0.02994 -1.066 0.287263
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6846 on 410 degrees of freedom
## (68 observations deleted due to missingness)
## Multiple R-squared: 0.07065, Adjusted R-squared: 0.06612
## F-statistic: 15.59 on 2 and 410 DF, p-value: 2.995e-07
# Word-level regression of t on all five predictors entered additively.
summary(
  lm(
    formula = t ~ mean_dist + Conc.M + log_freq + centrality + density,
    data = word_coeffs_min5_t2_with_vars
  )
)
##
## Call:
## lm(formula = t ~ mean_dist + Conc.M + log_freq + centrality +
## density, data = word_coeffs_min5_t2_with_vars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.49301 -0.43717 0.01895 0.44462 1.52685
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.195e+00 6.260e-01 3.506 0.000506 ***
## mean_dist 9.781e-01 2.213e-01 4.421 1.26e-05 ***
## Conc.M 1.033e-01 3.878e-02 2.664 0.008032 **
## log_freq -4.745e-02 3.400e-02 -1.396 0.163573
## centrality -1.214e+01 4.109e+00 -2.954 0.003324 **
## density 1.439e-04 8.601e-05 1.673 0.094997 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6633 on 407 degrees of freedom
## (68 observations deleted due to missingness)
## Multiple R-squared: 0.1338, Adjusted R-squared: 0.1232
## F-statistic: 12.58 on 5 and 407 DF, p-value: 2.274e-11
# Word-level regression of t on concreteness plus the mean_dist x log_freq
# interaction (unstandardised).
summary(
  lm(
    formula = t ~ Conc.M + mean_dist * log_freq,
    data = word_coeffs_min5_t2_with_vars
  )
)
##
## Call:
## lm(formula = t ~ Conc.M + mean_dist * log_freq, data = word_coeffs_min5_t2_with_vars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.45635 -0.39665 0.00323 0.45785 1.57175
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.15389 0.56920 -0.270 0.78701
## Conc.M 0.12283 0.03829 3.208 0.00144 **
## mean_dist 3.70784 1.53816 2.411 0.01637 *
## log_freq 0.02669 0.07096 0.376 0.70701
## mean_dist:log_freq -0.36399 0.20226 -1.800 0.07266 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6683 on 408 degrees of freedom
## (68 observations deleted due to missingness)
## Multiple R-squared: 0.1186, Adjusted R-squared: 0.11
## F-statistic: 13.73 on 4 and 408 DF, p-value: 1.64e-10
Concreteness and mean distance both predict t score - words that are similar to other words and highly concrete have large t-scores.