library(knitr)
opts_chunk$set(echo = T, message = F, warning = F,
error = F, cache = F, tidy = F)
library(tidyverse)
library(feather)
library(langcog)
library(modelr)
library(broom)
library(corrplot)
theme_set(theme_classic(base_size = 10))
MODEL: lm(mtld_diff ~ know_word_at_t1 + age_t1 + age_diff + log(n_transcripts_t1) + log(n_transcripts_t2), complete_df)
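The csv read in next holds, for each word, the coefficient on know_word_at_t1 from the model above (columns word, Estimate, SE, t, n_know). A hedged sketch of how such per-word coefficients could be produced, assuming child_df (i.e., something like complete_df) has one row per child with the covariates above and t1_types holds t1 word counts by child; the helper name get_word_coeff is illustrative, not taken from the original pipeline:
get_word_coeff <- function(target_word, child_df, t1_types) {
  # children who produced this word at t1
  knowers <- t1_types %>%
    filter(tolower(gloss) == target_word) %>%
    pull(target_child_id) %>%
    unique()

  child_df %>%
    mutate(know_word_at_t1 = target_child_id %in% knowers) %>%
    lm(mtld_diff ~ know_word_at_t1 + age_t1 + age_diff +
         log(n_transcripts_t1) + log(n_transcripts_t2), data = .) %>%
    broom::tidy() %>%
    filter(term == "know_word_at_t1TRUE") %>%
    transmute(word = target_word,
              Estimate = estimate,
              SE = std.error,
              t = statistic,
              n_know = length(knowers))
}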
word_coeffs_min5_t2 <- read_csv("data/word_coeffs_log_mtld_diff_24_30.csv") %>%
mutate(word = tolower(word)) %>%
filter(n_know > 2)
ggplot(word_coeffs_min5_t2, aes(t)) +
geom_histogram() +
ggtitle("t-distribution ") +
geom_vline(aes(xintercept = 2), color = "red") +
geom_vline(aes(xintercept = -2), color = "red") +
theme_classic()
word_coeffs_min5_t2 %>%
arrange(-t) %>%
DT::datatable()
Do kids whose t1 vocabularies have a high mean t-value also show a larger MTLD difference? Yes.
all_types <- read_csv("data/types_by_kid_24_30.csv",
col_names = c("collection_name", "corpus_id", "target_child_name",
"target_child_id", "tbin", "gloss" ,"count"))
# a word counts toward a child's t1 vocabulary only if the child produces it at least this many times
MINWORDSFORVOCAB <- 5
word_counts <- all_types %>%
filter(tbin == "t1") %>%
mutate(gloss_clean = tolower(gloss)) %>%
group_by(target_child_id, gloss_clean) %>%
summarize(count = sum(count)) %>%
filter(count >= MINWORDSFORVOCAB)
t1_word_counts_with_ts <- word_counts %>%
left_join(word_coeffs_min5_t2 %>% select(word, t),
by = c("gloss_clean" = "word")) %>%
select(-gloss_clean, -count) %>%
group_by(target_child_id) %>%
summarize(sum_t = sum(t, na.rm = T),
mean_t = mean(t, na.rm = T))
mtld_age <- read_csv("data/mtld_by_kid_24_30.csv") %>%
mutate(log_mtld_t1 = log(mtld_t1),
log_mtld_t2 = log(mtld_t2),
mtld_diff = log_mtld_t2 - log_mtld_t1) %>%
select(target_child_id, mtld_diff,
age_t1, age_t2, age_diff, corpus_name)
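mtld_diff is the change in log MTLD between the two age bins, i.e. log(mtld_t2 / mtld_t1). The MTLD values themselves come from the precomputed file above; for reference, here is a simplified forward-pass sketch of MTLD (McCarthy & Jarvis, 2010), which measures how many tokens it takes, on average, for the running type-token ratio to fall to .72 (the published measure averages forward and backward passes):
mtld_forward <- function(tokens, ttr_threshold = .72) {
  factors <- 0
  types_in_factor <- character(0)
  tokens_in_factor <- 0
  for (tok in tokens) {
    tokens_in_factor <- tokens_in_factor + 1
    types_in_factor <- union(types_in_factor, tok)
    ttr <- length(types_in_factor) / tokens_in_factor
    if (ttr <= ttr_threshold) {   # factor complete: count it and reset
      factors <- factors + 1
      types_in_factor <- character(0)
      tokens_in_factor <- 0
    }
  }
  if (tokens_in_factor > 0) {     # partial credit for the leftover run
    ttr <- length(types_in_factor) / tokens_in_factor
    factors <- factors + (1 - ttr) / (1 - ttr_threshold)
  }
  length(tokens) / factors
}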
t1_word_counts_with_ts_mtld <- t1_word_counts_with_ts %>%
left_join(mtld_age) %>%
select(-target_child_id)
t1_word_counts_with_ts_mtld %>%
ggplot(aes(x = mean_t, y = mtld_diff)) +
geom_point() +
geom_smooth(method = "lm") +
theme_classic()
Does the relationship hold after controlling for number of transcripts and age? Yes.
n_transcripts <- read_csv("data/n_transcript_by_kid_24_30.csv")
t1_word_counts_with_ts_mtld <- t1_word_counts_with_ts %>%
left_join(mtld_age) %>%
left_join(n_transcripts)
lm(mtld_diff ~ mean_t + n_transcripts_t1 +
# mean_freq_t1 +
age_diff + age_t1, data = t1_word_counts_with_ts_mtld) %>%
summary()
##
## Call:
## lm(formula = mtld_diff ~ mean_t + n_transcripts_t1 + age_diff +
## age_t1, data = t1_word_counts_with_ts_mtld)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.05399 -0.17631 0.02414 0.22274 0.65504
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.551437 5.591685 -0.993 0.325
## mean_t 0.558648 0.076519 7.301 8.37e-10 ***
## n_transcripts_t1 -0.003508 0.003968 -0.884 0.380
## age_diff 0.004247 0.004965 0.855 0.396
## age_t1 0.007040 0.006783 1.038 0.304
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3514 on 59 degrees of freedom
## Multiple R-squared: 0.4938, Adjusted R-squared: 0.4594
## F-statistic: 14.39 on 4 and 59 DF, p-value: 2.956e-08
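As a visual check (not part of the original pipeline; modelr is already loaded), residualize mtld_diff on the covariates and plot the residuals against mean_t:
covariate_fit <- lm(mtld_diff ~ n_transcripts_t1 + age_diff + age_t1,
                    data = t1_word_counts_with_ts_mtld)

t1_word_counts_with_ts_mtld %>%
  modelr::add_residuals(covariate_fit, var = "mtld_diff_resid") %>%
  ggplot(aes(x = mean_t, y = mtld_diff_resid)) +
  geom_point() +
  geom_smooth(method = "lm")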
freq <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/childes_adult_word_freq.csv") %>%
select(-n)
density_norms <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/bills_density_norms.csv")
aoa_norms <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/next_kids/stimuli_selection/AoA_ratings_Kuperman_et_al_BRM.csv") %>%
select(Word, Rating.Mean) %>%
rename(word = Word,
adult_aoa_estimate = Rating.Mean)
embedding_dist <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/6_by_word_analyses/data/wiki_embedding_dist_by_word.csv") %>%
rename(mean_dist = mean_dist_wiki)
concreteness <- read_csv("/Users/mollylewis/Documents/research/Projects/2_published/ref_complex/corpus/brysbaert_database/brysbaert_corpus.csv") %>%
rename(word = Word) %>%
select(word, Conc.M)
concepts <- read_tsv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/CONCS_brm.txt") %>%
select(Concept, Familiarity, Length_Syllables, Bigram, 14:33) %>%
mutate(Concept = tolower(Concept),
Concept = map_chr(Concept, ~ pluck(str_split(., "_"),1,1))) %>%
rename(word = Concept) %>%
select(word, Mean_Distinct_No_Tax)
pos <- read_tsv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/SUBTLEX-US frequency list with PoS information text version.txt") %>%
select(Word, Dom_PoS_SUBTLEX) %>%
rename(word = Word,
pos = Dom_PoS_SUBTLEX) %>%
mutate(pos = ifelse(pos == "Verb", "v", "o"))
glasgow <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/IATLANG/data/study1a/raw/GlasgowNorms.csv") %>%
select(word, contains("_M")) %>%
select(-AOA_M, -CNC_M)
ar_va <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/6_by_word_analyses/data/Ratings_Warriner_et_al.csv") %>%
select(Word, V.Mean.Sum, A.Mean.Sum, D.Mean.Sum) %>%
rename(word = Word)
complexity <- read_csv("/Users/mollylewis/Documents/research/Projects/2_published/ref_complex/corpus/MRC_database/complexity_norms.csv") %>%
select(word, complexity)
# merge word-level norms (frequency, AoA, concreteness, embedding distance, etc.) onto the by-word coefficients
word_coeffs_min5_t2_with_vars <- word_coeffs_min5_t2 %>%
mutate(word = tolower(word)) %>%
left_join(density_norms) %>%
left_join(freq) %>%
left_join(embedding_dist) %>%
left_join(concepts) %>%
left_join(concreteness) %>%
left_join(aoa_norms) %>%
left_join(pos) %>%
left_join(ar_va) %>%
left_join(glasgow) %>%
left_join(complexity) %>%
mutate(word_length = nchar(word)) %>%
filter(n_know >= 5)
df_corrs <- word_coeffs_min5_t2_with_vars %>%
select(-word, -Estimate, -SE, -pos)
#filter_all(all_vars(!is.na(.)))
corr_mat <- cor(df_corrs,
use = "pairwise.complete.obs")
p.mat <- corrplot::cor.mtest(df_corrs,
conf.level = .95,
use = "pairwise.complete.obs")$p
cols <- rev(colorRampPalette(c("red", "white", "blue"))(100))
corrplot::corrplot(corr_mat, method = "color", col = cols,
type = "full", order = "hclust", number.cex = .7,
addCoef.col = "black", insig = "blank",
p.mat = p.mat, sig.level = .05,
tl.col = "black", tl.srt = 90,
diag = FALSE)
## Preferred model: predicting by-word t-values from word properties
lm(t ~ mean_dist + Conc.M + centrality + pos + word_length + log_freq,
   data = word_coeffs_min5_t2_with_vars) %>%
summary()
##
## Call:
## lm(formula = t ~ mean_dist + Conc.M + centrality + pos + word_length +
## log_freq, data = word_coeffs_min5_t2_with_vars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.53985 -0.46376 -0.05939 0.42980 2.74676
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.22082 0.58094 5.544 6.48e-08 ***
## mean_dist 4.40994 2.22275 1.984 0.0482 *
## Conc.M 0.01265 0.04542 0.278 0.7809
## centrality -17.31602 4.03617 -4.290 2.41e-05 ***
## posv 0.12541 0.09216 1.361 0.1746
## word_length 0.06226 0.03318 1.877 0.0616 .
## log_freq -0.17942 0.04113 -4.362 1.77e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6641 on 300 degrees of freedom
## (41 observations deleted due to missingness)
## Multiple R-squared: 0.3448, Adjusted R-squared: 0.3317
## F-statistic: 26.32 on 6 and 300 DF, p-value: < 2.2e-16
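Summary: higher mean_dist, lower centrality, lower log_freq, and (marginally) greater word_length are associated with higher by-word t-values; Conc.M and pos show no reliable effects.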