# Setup: load packages, set the knitr chunk defaults and the ggplot2 theme.
library(knitr)
# TRUE/FALSE are spelled out: T and F are ordinary variables that can be
# reassigned, whereas TRUE and FALSE are reserved words.
opts_chunk$set(
  echo = TRUE, message = FALSE, warning = FALSE,
  error = FALSE, cache = FALSE, tidy = FALSE
)
library(tidyverse)
library(feather)
library(langcog)
library(modelr)
library(broom)
library(corrplot)
theme_set(theme_classic(base_size = 10))

# MODEL: lm(mtld_diff ~ know_word_at_t1 + age_t1 + age_diff + log(n_transcripts_t1) + log(n_transcripts_t2), complete_df)
# Per-word coefficients. `word` is lower-cased so that it can be matched
# against the lower-cased glosses built further down.
word_coeffs_min5_t2 <- mutate(
  read_csv("data/word_coeffs_log_mtld_diff.csv"),
  word = tolower(word)
)
# Histogram of the per-word t values with reference lines at t = -2 and t = 2.
# The cutoffs are passed as a plain argument instead of through aes(): a
# constant inside aes() is evaluated against the layer's data, so the same
# line would be drawn once for every row.
ggplot(word_coeffs_min5_t2, aes(t)) +
  geom_histogram() +
  ggtitle("t-distribution ") +
  geom_vline(xintercept = c(-2, 2), color = "red") +
  theme_classic()

# Interactive table of the words, largest t first.
word_coeffs_min5_t2 %>%
  arrange(desc(t)) %>%
  DT::datatable()

# Do kids who have a high mean t have a high mtld diff? Yes.
# Word-type counts per child and time bin.
# NOTE(review): absolute path on one user's machine; not portable as written.
all_types <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/1_mtld_measure/data/target_types_for_MTLD_kids_600_900.csv")

# Minimum summed t1 count for a (child, gloss) pair to be kept.
MINWORDSFORVOCAB <- 5

# Summed t1 count of every lower-cased gloss per child, restricted to pairs
# that reach MINWORDSFORVOCAB.
word_counts <- filter(all_types, tbin == "t1") %>%
  mutate(gloss_clean = tolower(gloss)) %>%
  group_by(target_child_id, gloss_clean) %>%
  summarize(count = sum(count)) %>%
  filter(count >= MINWORDSFORVOCAB)
# Attach the per-word t value to every retained t1 word, then summarise the
# t values per child as a sum and a mean. Words without a coefficient get NA
# from the left join and are skipped by na.rm = TRUE (spelled out rather than
# the reassignable shorthand T).
t1_word_counts_with_ts <- word_counts %>%
  left_join(
    select(word_coeffs_min5_t2, word, t),
    by = c("gloss_clean" = "word")
  ) %>%
  select(-gloss_clean, -count) %>%
  group_by(target_child_id) %>%
  summarize(
    sum_t = sum(t, na.rm = TRUE),
    mean_t = mean(t, na.rm = TRUE)
  )
# Per-child change in log MTLD between t1 and t2, plus ages and corpus name.
# mtld_diff is log(mtld_t2) - log(mtld_t1); age_diff is age_t2 - age_t1.
# NOTE(review): absolute path on one user's machine; not portable as written.
mtld_age <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/1_mtld_measure/data/groups_info_600_900_corrected.csv") %>%
  transmute(
    target_child_id,
    mtld_diff = log(mtld_t2) - log(mtld_t1),
    age_t1,
    age_t2,
    age_diff = age_t2 - age_t1,
    corpus_name
  )
# Add the MTLD/age measures to the per-child t summaries. The join key is
# named explicitly: target_child_id is the only column the two tables share,
# so this matches the implicit natural join without relying on (or printing a
# message about) whichever columns happen to overlap. The id is dropped once
# the rows are matched.
t1_word_counts_with_ts_mtld <- t1_word_counts_with_ts %>%
  left_join(mtld_age, by = "target_child_id") %>%
  select(-target_child_id)
# Scatter of mtld_diff against mean_t, one point per row, with a linear fit.
ggplot(t1_word_counts_with_ts_mtld, aes(x = mean_t, y = mtld_diff)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

# Does it hold controlling for stuff? Yes.
# Transcript lengths per child, kept on the log scale only.
# The original call was select(-transcript_length_t1, transcript_length_t2):
# without the second minus sign the raw t2 column was re-selected instead of
# dropped, so it stayed in the table. Both raw columns are now removed.
# NOTE(review): absolute path on one user's machine; not portable as written.
transcript_length <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/transcript_length_by_kid.csv") %>%
  mutate(
    log_transcript_length_t1 = log(transcript_length_t1),
    log_transcript_length_t2 = log(transcript_length_t2)
  ) %>%
  select(-transcript_length_t1, -transcript_length_t2)
# Per-child frequency information.
# NOTE(review): absolute path on one user's machine; not portable as written.
freq_info <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/frequency_based_on_input_by_kid.csv")

# Rebuild the per-child table, this time keeping the child id and left-joining
# the MTLD/age measures, transcript lengths and frequency info in that order.
# Each join uses whatever columns the two tables have in common.
t1_word_counts_with_ts_mtld <- purrr::reduce(
  list(mtld_age, transcript_length, freq_info),
  left_join,
  .init = t1_word_counts_with_ts
)
# Regress mtld_diff on mean_t together with log t1 transcript length, mean t1
# frequency, age difference and age at t1, and print the model summary.
summary(
  lm(
    mtld_diff ~ mean_t + log_transcript_length_t1 +
      mean_freq_t1 +
      age_diff + age_t1,
    data = t1_word_counts_with_ts_mtld
  )
)
##
## Call:
## lm(formula = mtld_diff ~ mean_t + log_transcript_length_t1 +
## mean_freq_t1 + age_diff + age_t1, data = t1_word_counts_with_ts_mtld)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.67042 -0.11145 0.01607 0.13055 0.70517
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.262868 1.728390 1.309 0.194
## mean_t 0.630668 0.073593 8.570 4.36e-13 ***
## log_transcript_length_t1 -0.016340 0.018246 -0.896 0.373
## mean_freq_t1 -0.020235 0.036793 -0.550 0.584
## age_diff 0.002421 0.001676 1.444 0.152
## age_t1 -0.003248 0.002208 -1.471 0.145
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2581 on 84 degrees of freedom
## Multiple R-squared: 0.7092, Adjusted R-squared: 0.6919
## F-statistic: 40.98 on 5 and 84 DF, p-value: < 2.2e-16
# Word-level predictor tables that are later joined onto the word coefficients.
# NOTE(review): several of these are absolute paths on one user's machine.
freq <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/childes_adult_word_freq.csv")
density_norms <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/bills_density_norms.csv")
embedding_dist <- read_csv("data/childes_embedding_dist_by_word.csv")
embedding_dist_wiki <- read_csv("data/wiki_embedding_dist_by_word.csv")

# Keep only the word and its Conc.M value; `Word` is renamed to `word` while
# selecting.
concreteness <- select(
  read_csv("/Users/mollylewis/Documents/research/Projects/2_published/ref_complex/corpus/brysbaert_database/brysbaert_corpus.csv"),
  word = Word,
  Conc.M
)
# Concept norms reduced to one value per word. Concept labels are lower-cased
# and cut at the first underscore, so only the part before it is kept as the
# word.
# NOTE(review): Mean_Distinct_No_Tax is presumably among columns 14:33; the
# positional range breaks if the file's column order changes.
concepts <- read_tsv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/CONCS_brm.txt") %>%
  select(Concept, Familiarity, Length_Syllables, Bigram, 14:33) %>%
  mutate(Concept = map_chr(str_split(tolower(Concept), "_"), 1)) %>%
  select(word = Concept, Mean_Distinct_No_Tax)
# Word coefficients with every word-level predictor attached. The tables are
# left-joined one after another, in the listed order, on whatever columns each
# shares with the running result.
word_coeffs_min5_t2_with_vars <- purrr::reduce(
  list(
    density_norms,
    freq,
    embedding_dist,
    concepts,
    concreteness,
    embedding_dist_wiki
  ),
  left_join,
  .init = mutate(word_coeffs_min5_t2, word = tolower(word))
)
# Word-level model: t predicted by centrality and log_freq.
summary(
  lm(t ~ centrality + log_freq, data = word_coeffs_min5_t2_with_vars)
)
##
## Call:
## lm(formula = t ~ centrality + log_freq, data = word_coeffs_min5_t2_with_vars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.46325 -0.49197 -0.00968 0.41383 1.96802
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.711764 0.165616 4.298 1.82e-05 ***
## centrality -3.806389 1.082654 -3.516 0.00045 ***
## log_freq 0.038240 0.008541 4.477 8.06e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.685 on 1746 degrees of freedom
## (468 observations deleted due to missingness)
## Multiple R-squared: 0.0149, Adjusted R-squared: 0.01377
## F-statistic: 13.2 on 2 and 1746 DF, p-value: 2.033e-06
# Word-level model: t predicted by density and log_freq.
summary(
  lm(t ~ density + log_freq, data = word_coeffs_min5_t2_with_vars)
)
##
## Call:
## lm(formula = t ~ density + log_freq, data = word_coeffs_min5_t2_with_vars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.35097 -0.48872 -0.01159 0.42221 1.99847
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.247e-01 6.054e-02 3.711 0.000213 ***
## density -6.371e-05 2.274e-05 -2.801 0.005145 **
## log_freq 3.175e-02 8.310e-03 3.821 0.000137 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6859 on 1746 degrees of freedom
## (468 observations deleted due to missingness)
## Multiple R-squared: 0.01237, Adjusted R-squared: 0.01123
## F-statistic: 10.93 on 2 and 1746 DF, p-value: 1.917e-05
# Word-level model: t predicted by mean_dist and log_freq.
summary(
  lm(t ~ mean_dist + log_freq, data = word_coeffs_min5_t2_with_vars)
)
##
## Call:
## lm(formula = t ~ mean_dist + log_freq, data = word_coeffs_min5_t2_with_vars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.32109 -0.50519 -0.00879 0.41700 1.95176
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.337719 0.060511 5.581 2.75e-08 ***
## mean_dist 0.596690 0.111659 5.344 1.03e-07 ***
## log_freq -0.008804 0.010269 -0.857 0.391
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6896 on 1807 degrees of freedom
## (407 observations deleted due to missingness)
## Multiple R-squared: 0.02066, Adjusted R-squared: 0.01958
## F-statistic: 19.06 on 2 and 1807 DF, p-value: 6.414e-09
# Same predictors with all variables standardised via scale(), plus the
# mean_dist x log_freq interaction.
summary(
  lm(
    scale(t) ~ scale(mean_dist) * scale(log_freq),
    data = word_coeffs_min5_t2_with_vars
  )
)
##
## Call:
## lm(formula = scale(t) ~ scale(mean_dist) * scale(log_freq), data = word_coeffs_min5_t2_with_vars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.3289 -0.7120 -0.0151 0.6120 2.8415
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.05273 0.03030 1.740 0.0819 .
## scale(mean_dist) 0.18765 0.03317 5.657 1.79e-08 ***
## scale(log_freq) -0.02968 0.03324 -0.893 0.3721
## scale(mean_dist):scale(log_freq) -0.06538 0.03338 -1.958 0.0503 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9963 on 1806 degrees of freedom
## (407 observations deleted due to missingness)
## Multiple R-squared: 0.02274, Adjusted R-squared: 0.02111
## F-statistic: 14.01 on 3 and 1806 DF, p-value: 4.996e-09
mean_dist_wiki x freq interaction:
# t against mean_dist with a separate linear fit for each of two equal-sized
# log_freq bins; words without a log_freq value get no bin and are dropped.
word_coeffs_min5_t2_with_vars %>%
  mutate(freq_bin = as.factor(ntile(log_freq, 2))) %>%
  filter(!is.na(freq_bin)) %>%
  # filter(mean_dist > .3) %>%
  ggplot(aes(x = mean_dist, y = t, group = freq_bin, color = freq_bin)) +
  # geom_point() +
  geom_smooth(method = "lm")

# Adding in frequency:
# Word-level model: t predicted by Conc.M and log_freq.
summary(
  lm(t ~ Conc.M + log_freq, data = word_coeffs_min5_t2_with_vars)
)
##
## Call:
## lm(formula = t ~ Conc.M + log_freq, data = word_coeffs_min5_t2_with_vars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.29559 -0.47488 0.00457 0.42154 1.80524
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.82518 0.12641 -6.528 9.46e-11 ***
## Conc.M 0.16031 0.01882 8.517 < 2e-16 ***
## log_freq 0.08531 0.01080 7.897 5.92e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6513 on 1331 degrees of freedom
## (883 observations deleted due to missingness)
## Multiple R-squared: 0.06665, Adjusted R-squared: 0.06525
## F-statistic: 47.52 on 2 and 1331 DF, p-value: < 2.2e-16
# Word-level model: t predicted by mean_dist, Conc.M and log_freq together.
summary(
  lm(
    t ~ mean_dist + Conc.M + log_freq,
    data = word_coeffs_min5_t2_with_vars
  )
)
##
## Call:
## lm(formula = t ~ mean_dist + Conc.M + log_freq, data = word_coeffs_min5_t2_with_vars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.2724 -0.4600 -0.0003 0.4233 1.8071
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.76295 0.13141 -5.806 8.01e-09 ***
## mean_dist 0.66310 0.11969 5.540 3.65e-08 ***
## Conc.M 0.17530 0.01885 9.298 < 2e-16 ***
## log_freq 0.05123 0.01330 3.850 0.000124 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6419 on 1319 degrees of freedom
## (894 observations deleted due to missingness)
## Multiple R-squared: 0.09323, Adjusted R-squared: 0.09117
## F-statistic: 45.2 on 3 and 1319 DF, p-value: < 2.2e-16
# Word-level model: Conc.M plus mean_dist, log_freq and their interaction.
summary(
  lm(
    t ~ Conc.M + mean_dist * log_freq,
    data = word_coeffs_min5_t2_with_vars
  )
)
##
## Call:
## lm(formula = t ~ Conc.M + mean_dist * log_freq, data = word_coeffs_min5_t2_with_vars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.24721 -0.46230 -0.00811 0.41635 1.74155
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.86333 0.14083 -6.130 1.16e-09 ***
## Conc.M 0.16587 0.01943 8.535 < 2e-16 ***
## mean_dist 1.59661 0.48936 3.263 0.00113 **
## log_freq 0.07503 0.01797 4.175 3.18e-05 ***
## mean_dist:log_freq -0.13424 0.06824 -1.967 0.04936 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6412 on 1318 degrees of freedom
## (894 observations deleted due to missingness)
## Multiple R-squared: 0.09588, Adjusted R-squared: 0.09314
## F-statistic: 34.94 on 4 and 1318 DF, p-value: < 2.2e-16
Concreteness and mean distance both predict t-score - words that are similar to other words and highly concrete have large t-scores.