library(knitr)
# Global knitr chunk options: echo the code, suppress messages and warnings,
# and disable caching and automatic code tidying. `error = FALSE` is set
# explicitly as well. Spelled with TRUE/FALSE because T and F are ordinary
# variables that can be reassigned.
opts_chunk$set(
  echo = TRUE, message = FALSE, warning = FALSE,
  error = FALSE, cache = FALSE, tidy = FALSE
)
library(tidyverse)
library(feather)
library(langcog)
library(modelr)
library(broom)
library(corrplot)
theme_set(theme_classic(base_size = 10))
MODEL: lm(mtld_diff ~ know_word_at_t1 + age_t1 + age_diff + log(n_transcripts_t1) + log(n_transcripts_t2), complete_df)
# Per-word coefficient table (one `t` value per `word`). Words are
# lower-cased so they line up with the lower-cased child glosses used later.
word_coeffs_min5_t2 <- mutate(
  read_csv("data/word_coeffs_log_mtld_diff_900_1200.csv"),
  word = tolower(word)
)
# Histogram of the per-word t values, with red reference lines at t = -2 and
# t = 2.
ggplot(word_coeffs_min5_t2, aes(t)) +
  geom_histogram() +
  ggtitle("t-distribution ") +
  # The intercepts are constants, so they are passed outside aes(): each
  # reference line is drawn once instead of once per row of the data.
  geom_vline(xintercept = c(-2, 2), color = "red") +
  theme_classic()

# Interactive table of all words, sorted from largest to smallest t.
word_coeffs_min5_t2 %>%
  arrange(-t) %>%
  DT::datatable()
Do kids who have a high mean t have high mtld diff at? Yes.
# Word-type counts per child and time bin.
# NOTE(review): absolute path under /Users/mollylewis; it will not resolve on
# other machines.
all_types <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/1_mtld_measure/data/target_types_for_MTLD_kids_900_1200.csv")

# Minimum summed count at t1 for a word to be kept for a child.
MINWORDSFORVOCAB <- 5

# Words each child produced at least MINWORDSFORVOCAB times in the t1 bin.
# Glosses are lower-cased first, so case variants of a word are pooled.
word_counts <- all_types %>%
  filter(tbin == "t1") %>%
  mutate(gloss_clean = tolower(gloss)) %>%
  group_by(target_child_id, gloss_clean) %>%
  summarize(count = sum(count)) %>%
  # summarize() only drops the last grouping variable; ungroup so the result
  # does not stay silently grouped by target_child_id.
  ungroup() %>%
  filter(count >= MINWORDSFORVOCAB)
# For each child, the sum and mean of the per-word t values over the words
# kept in word_counts. Words with no matching coefficient get t = NA from the
# left join and are skipped by na.rm. A child with no matched words therefore
# ends up with sum_t = 0 and mean_t = NaN.
t1_word_counts_with_ts <- word_counts %>%
  left_join(word_coeffs_min5_t2 %>% select(word, t),
            by = c("gloss_clean" = "word")) %>%
  select(-gloss_clean, -count) %>%
  group_by(target_child_id) %>%
  summarize(sum_t = sum(t, na.rm = TRUE),
            mean_t = mean(t, na.rm = TRUE))
# Per-child outcome table: change in log MTLD from t1 to t2, the ages at both
# time points, the gap between them, and the corpus name.
mtld_age <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/1_mtld_measure/data/groups_info_900_1200.csv") %>%
  mutate(
    age_diff = age_t2 - age_t1,
    mtld_diff = log(mtld_t2) - log(mtld_t1)
  ) %>%
  select(
    target_child_id, mtld_diff,
    age_t1, age_t2, age_diff, corpus_name
  )
# Attach the MTLD/age outcomes to each child's t summary, then drop the id.
# The join key is stated explicitly: target_child_id is the only column the
# two tables share, and naming it keeps the join from silently changing if
# either table later gains another same-named column.
t1_word_counts_with_ts_mtld <- t1_word_counts_with_ts %>%
  left_join(mtld_age, by = "target_child_id") %>%
  select(-target_child_id)
# Child-level scatter of MTLD change against mean t, with a linear fit.
ggplot(t1_word_counts_with_ts_mtld, aes(x = mean_t, y = mtld_diff)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()
Does it hold controlling for stuff? Yes.
# Transcript lengths at t1 and t2, kept on the log scale only: the raw length
# columns are removed once their logs have been computed.
transcript_length <- mutate(
  read_csv("data/transcript_lengths_900_1200.csv"),
  log_transcript_length_t1 = log(transcript_length_t1),
  log_transcript_length_t2 = log(transcript_length_t2),
  transcript_length_t1 = NULL,
  transcript_length_t2 = NULL
)
# Frequency information used as a control (supplies mean_freq_t1 for the
# model that follows, presumably one row per child; verify against the file).
freq_info <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/frequency_based_on_input_by_kid.csv")

# Child-level modelling table: the t summaries with outcomes, transcript
# lengths and frequency info left-joined on in that order. Each join uses
# whatever columns the two tables have in common.
t1_word_counts_with_ts_mtld <- purrr::reduce(
  list(t1_word_counts_with_ts, mtld_age, transcript_length, freq_info),
  left_join
)
# MTLD change regressed on mean t, controlling for log transcript length at
# t1, mean frequency at t1, the age gap, and age at t1.
summary(
  lm(
    mtld_diff ~ mean_t + log_transcript_length_t1 +
      mean_freq_t1 +
      age_diff + age_t1,
    data = t1_word_counts_with_ts_mtld
  )
)
##
## Call:
## lm(formula = mtld_diff ~ mean_t + log_transcript_length_t1 +
## mean_freq_t1 + age_diff + age_t1, data = t1_word_counts_with_ts_mtld)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.36888 -0.09112 0.01384 0.12711 0.37634
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.7867705 1.8470995 -0.426 0.67197
## mean_t 0.5878772 0.0816525 7.200 2.92e-09 ***
## log_transcript_length_t1 -0.0612665 0.0187258 -3.272 0.00194 **
## mean_freq_t1 0.0204980 0.0358907 0.571 0.57047
## age_diff 0.0031656 0.0010069 3.144 0.00280 **
## age_t1 0.0008346 0.0016958 0.492 0.62475
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1811 on 50 degrees of freedom
## (19 observations deleted due to missingness)
## Multiple R-squared: 0.5211, Adjusted R-squared: 0.4732
## F-statistic: 10.88 on 5 and 50 DF, p-value: 4.06e-07
# Word-level predictor tables. Each one is later left-joined onto the per-word
# coefficient table using implicit (natural) join keys.
# NOTE(review): presumably every table carries a `word` column as the key;
# confirm against the files.
# NOTE(review): the first two paths are absolute under /Users/mollylewis and
# will not resolve on other machines.
freq <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/childes_adult_word_freq.csv")
density_norms <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/bills_density_norms.csv")
# Embedding-distance tables, one per file (file names say CHILDES and wiki).
embedding_dist <- read_csv("data/childes_embedding_dist_by_word.csv")
embedding_dist_wiki <- read_csv("data/wiki_embedding_dist_by_word.csv")
# Concreteness table reduced to two columns: the word (renamed from `Word`)
# and its Conc.M value.
concreteness <- select(
  read_csv("/Users/mollylewis/Documents/research/Projects/2_published/ref_complex/corpus/brysbaert_database/brysbaert_corpus.csv"),
  word = Word,
  Conc.M
)
# Concept table reduced to a lower-cased `word` key and Mean_Distinct_No_Tax.
# Only that one measure is kept, so it is selected by name; the earlier
# positional pre-selection (columns 14:33) was redundant and would break if
# the file's column order changed.
concepts <- read_tsv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/CONCS_brm.txt") %>%
  select(word = Concept, Mean_Distinct_No_Tax) %>%
  # Keep only the part of each label before its first underscore; the split
  # is done once over the whole column.
  mutate(word = map_chr(str_split(tolower(word), "_"), 1))
# NOTE(review): if several labels share the same text before "_", `word`
# repeats here, and a later left join on `word` would duplicate the matching
# rows. Verify against the file.
# Per-word coefficients with every word-level predictor table left-joined on,
# in this order: density norms, frequency, CHILDES embedding distance,
# concepts, concreteness, wiki embedding distance. Each join uses whatever
# columns the two tables have in common.
word_coeffs_min5_t2_with_vars <- purrr::reduce(
  list(
    mutate(word_coeffs_min5_t2, word = tolower(word)),
    density_norms,
    freq,
    embedding_dist,
    concepts,
    concreteness,
    embedding_dist_wiki
  ),
  left_join
)
# Per-word t regressed on centrality, controlling for log frequency.
summary(
  lm(t ~ centrality + log_freq, data = word_coeffs_min5_t2_with_vars)
)
##
## Call:
## lm(formula = t ~ centrality + log_freq, data = word_coeffs_min5_t2_with_vars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.3332 -0.4722 0.0530 0.5172 2.7504
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.121865 0.117518 1.037 0.300
## centrality -0.481560 0.752292 -0.640 0.522
## log_freq 0.035115 0.006074 5.781 8.2e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6235 on 2941 degrees of freedom
## (874 observations deleted due to missingness)
## Multiple R-squared: 0.01129, Adjusted R-squared: 0.01061
## F-statistic: 16.79 on 2 and 2941 DF, p-value: 5.636e-08
# Per-word t regressed on density, controlling for log frequency.
summary(
  lm(t ~ density + log_freq, data = word_coeffs_min5_t2_with_vars)
)
##
## Call:
## lm(formula = t ~ density + log_freq, data = word_coeffs_min5_t2_with_vars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.3233 -0.4730 0.0486 0.5162 2.7456
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.424e-02 4.130e-02 1.071 0.284
## density 5.836e-06 1.546e-05 0.378 0.706
## log_freq 3.445e-02 5.979e-03 5.763 9.15e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6235 on 2941 degrees of freedom
## (874 observations deleted due to missingness)
## Multiple R-squared: 0.0112, Adjusted R-squared: 0.01052
## F-statistic: 16.65 on 2 and 2941 DF, p-value: 6.441e-08
# Per-word t regressed on mean_dist, controlling for log frequency.
summary(
  lm(t ~ mean_dist + log_freq, data = word_coeffs_min5_t2_with_vars)
)
##
## Call:
## lm(formula = t ~ mean_dist + log_freq, data = word_coeffs_min5_t2_with_vars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.2431 -0.4692 0.0187 0.4493 2.9133
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.50299 0.08664 5.805 7.86e-09 ***
## mean_dist 0.94611 0.11679 8.101 1.14e-15 ***
## log_freq -0.04901 0.01332 -3.679 0.000243 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6857 on 1466 degrees of freedom
## (2349 observations deleted due to missingness)
## Multiple R-squared: 0.0439, Adjusted R-squared: 0.04259
## F-statistic: 33.65 on 2 and 1466 DF, p-value: 5.122e-15
# Standardized model of t on mean_dist, log frequency, and their interaction.
# The scale() calls sit inside the formula, so each variable is standardized
# as part of fitting the model.
summary(
  lm(
    scale(t) ~ scale(mean_dist) * scale(log_freq),
    data = word_coeffs_min5_t2_with_vars
  )
)
##
## Call:
## lm(formula = scale(t) ~ scale(mean_dist) * scale(log_freq), data = word_coeffs_min5_t2_with_vars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.8457 -0.7070 -0.0037 0.7027 4.4561
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.35259 0.04432 7.955 3.55e-15 ***
## scale(mean_dist) 0.52026 0.04589 11.338 < 2e-16 ***
## scale(log_freq) -0.09576 0.04528 -2.115 0.0346 *
## scale(mean_dist):scale(log_freq) -0.34773 0.04471 -7.777 1.39e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.079 on 1465 degrees of freedom
## (2349 observations deleted due to missingness)
## Multiple R-squared: 0.08181, Adjusted R-squared: 0.07992
## F-statistic: 43.51 on 3 and 1465 DF, p-value: < 2.2e-16
mean_dist_wiki x freq interaction:
# t against mean_dist with a separate linear fit for each half of the log
# frequency distribution (two-way ntile split). Words with no log_freq are
# left out.
word_coeffs_min5_t2_with_vars %>%
  filter(!is.na(log_freq)) %>%
  mutate(freq_bin = as.factor(ntile(log_freq, 2))) %>%
  # filter(mean_dist > .3) %>%
  ggplot(aes(
    x = mean_dist, y = t,
    group = freq_bin, color = freq_bin
  )) +
  # geom_point() +
  geom_smooth(method = "lm")
Adding in frequency:
# Per-word t regressed on concreteness (Conc.M), controlling for log frequency.
summary(
  lm(t ~ Conc.M + log_freq, data = word_coeffs_min5_t2_with_vars)
)
##
## Call:
## lm(formula = t ~ Conc.M + log_freq, data = word_coeffs_min5_t2_with_vars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.2363 -0.4773 0.0329 0.4822 2.6643
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.23234 0.08558 -2.715 0.00668 **
## Conc.M 0.07127 0.01403 5.079 4.12e-07 ***
## log_freq 0.03835 0.00781 4.910 9.77e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6346 on 2162 degrees of freedom
## (1653 observations deleted due to missingness)
## Multiple R-squared: 0.01717, Adjusted R-squared: 0.01626
## F-statistic: 18.89 on 2 and 2162 DF, p-value: 7.368e-09
# Per-word t regressed on mean_dist and concreteness together, controlling
# for log frequency.
summary(
  lm(t ~ mean_dist + Conc.M + log_freq, data = word_coeffs_min5_t2_with_vars)
)
##
## Call:
## lm(formula = t ~ mean_dist + Conc.M + log_freq, data = word_coeffs_min5_t2_with_vars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.13123 -0.47106 0.00945 0.42901 2.82603
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.04772 0.17179 0.278 0.7813
## mean_dist 1.04724 0.13208 7.929 5.17e-15 ***
## Conc.M 0.10196 0.02197 4.641 3.85e-06 ***
## log_freq -0.04085 0.01714 -2.384 0.0173 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.686 on 1163 degrees of freedom
## (2651 observations deleted due to missingness)
## Multiple R-squared: 0.06452, Adjusted R-squared: 0.0621
## F-statistic: 26.74 on 3 and 1163 DF, p-value: < 2.2e-16
# Per-word t regressed on concreteness plus the mean_dist x log frequency
# interaction (with both main effects).
summary(
  lm(t ~ Conc.M + mean_dist * log_freq, data = word_coeffs_min5_t2_with_vars)
)
##
## Call:
## lm(formula = t ~ Conc.M + mean_dist * log_freq, data = word_coeffs_min5_t2_with_vars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.96065 -0.45094 -0.02017 0.42259 2.68916
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.72874 0.20271 -3.595 0.000338 ***
## Conc.M 0.07597 0.02187 3.474 0.000532 ***
## mean_dist 5.19652 0.61625 8.433 < 2e-16 ***
## log_freq 0.09920 0.02638 3.760 0.000178 ***
## mean_dist:log_freq -0.59622 0.08657 -6.887 9.31e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6727 on 1162 degrees of freedom
## (2651 observations deleted due to missingness)
## Multiple R-squared: 0.1012, Adjusted R-squared: 0.09811
## F-statistic: 32.71 on 4 and 1162 DF, p-value: < 2.2e-16
Concreteness and mean distance both predict t score - words that are similar to other words and highly concrete have large t-scores.