# Notebook setup: load packages and set knitr chunk defaults
# (echo code; hide messages, warnings and errors; no caching or tidying).
library(knitr)
opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE,
               error = FALSE, cache = FALSE, tidy = FALSE)
library(tidyverse)
library(langcog)
library(data.table)
library(feather)
theme_set(theme_classic(base_size = 10))

# Threshold compared against each word type's summed count when the
# per-child type list is filtered further down in this file.
MINWORDSFORVOCAB <- 5
# The min words for vocab here is 5.
# Read the per-child word types and the per-child covariate tables.
all_types <- read_csv("../1_mtld_measure/data/target_types_for_MTLD_kids_600_900.csv")
mlu_info <- read_csv("mlu_by_kid.csv")
trans_info <- read_csv("t1_transitional_probs_in_vocab_missing0.csv")
# The column name starts with `%`, so give it a syntactic name.
mcrae_info <- read_csv("mcrae_vocab_by_kid_t1.txt") %>%
  rename(Per_Corred_Pairs_No_Tax = `%_Corred_Pairs_No_Tax`)
pos_info <- read_csv("prop_pos_by_kid_t1.csv")
conc_info <- read_csv("conc_by_kid_t1.csv")

# Merge every covariate table onto the semantic-density table.
# NOTE(review): the joins give no `by`, so they match on whatever column
# names the tables share -- presumably the child id; confirm against the CSVs.
# NOTE(review): this path is machine-specific (absolute).
kid_info <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/4_semantic_density/semantic_density_df.csv") %>%
  left_join(mlu_info) %>%
  left_join(trans_info) %>%
  left_join(mcrae_info) %>%
  left_join(pos_info) %>%
  left_join(conc_info) %>%
  mutate(corpus_id = as.factor(corpus_id))

# Previous frequency source (SUBTLEX-US), kept for reference:
#freq <- read_tsv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/1_mtld_measure/data/control_variables/SUBTLEXus_corpus.txt") %>%
# rename(word = Word,
# log_freq = Lg10WF) %>%
# select(word, log_freq)
freq <- read_csv("childes_adult_word_freq.csv")

# Embedding models
# Load the two word-embedding tables. The CHILDES table's `word` column is
# renamed to `target_word`, the column name the distance function filters on.
word2vec_model_childes <- read_csv("3_train_childes_model/childes_adult_w2v.txt") %>%
  rename(target_word = word)
word2vec_model_wiki <- read_feather("fast_text_childes_words_600_900.feather")

# Get filtered version of types for each kid
# Per-child word types at t1: lower-case the gloss, sum counts per
# (child, word), keep words whose summed count reaches MINWORDSFORVOCAB,
# and attach log frequency from `freq`.
# summarize() drops only the last grouping variable, so the result stays
# grouped by target_child_id.
types_clean <- all_types %>%
  filter(tbin == "t1") %>%
  mutate(gloss_clean = tolower(gloss)) %>%
  group_by(target_child_id, gloss_clean) %>%
  summarize(count = sum(count)) %>%
  filter(count >= MINWORDSFORVOCAB) %>%
  mutate(log_count = log(count)) %>%
  select(-count) %>%
  left_join(freq, by = c("gloss_clean" = "word")) %>%
  # Duplicate count/frequency under _w1 and _w2 names, then drop the originals.
  mutate(log_count_w1 = log_count,
         log_count_w2 = log_count,
         log_freq_w1 = log_freq,
         log_freq_w2 = log_freq) %>%
  select(-log_count, -log_freq)

# Concreteness ratings (Brysbaert database).
# NOTE(review): this path is machine-specific (absolute).
conc <- read_csv("/Users/mollylewis/Documents/research/Projects/2_published/ref_complex/corpus/brysbaert_database/brysbaert_corpus.csv") %>%
  select(Word, Conc.M)

# Median concreteness over the distinct words across all children
# (one row per word, so frequent words are not over-weighted).
distinct_types <- types_clean %>%
  left_join(conc, by = c("gloss_clean" = "Word")) %>%
  ungroup() %>%
  distinct(gloss_clean, .keep_all = TRUE)
median_conc <- median(distinct_types$Conc.M, na.rm = TRUE)

# Keep only words at or below the median concreteness; filter() also drops
# words with no concreteness rating (NA comparison).
types_clean <- types_clean %>%
  left_join(conc, by = c("gloss_clean" = "Word")) %>%
  filter(Conc.M <= median_conc)

# Get distance measure by kid at t1 in both wikipedia model and adult childes model
# Summarise the pairwise embedding similarities of one child's vocabulary.
#
# Args:
#   id:    value stored in the `target_child_id` column of the result.
#   data:  data frame of this child's word types; needs the columns
#          `gloss_clean` and `log_freq_w1`.
#   model: embedding table with a `target_word` column. Every column except
#          the first is used as an embedding dimension (presumably the first
#          column is `target_word` -- confirm for both models).
#
# Returns:
#   A one-row data.frame with target_child_id, mean_dist_t1, median_dist_t1,
#   var_dist_t1, n_t1 (number of the child's words found in the model) and
#   median_freq_t1.
get_vocab_measure_by_kid3 <- function(id, data, model) {
  # Restrict the model to this child's words, and the child's words to those
  # the model covers.
  this_kids_model <- model %>%
    filter(target_word %in% data$gloss_clean)
  words_in_model <- data %>%
    filter(gloss_clean %in% this_kids_model$target_word)

  # Word-by-word cosine matrix (words become columns after the transpose).
  # NOTE(review): coop::cosine returns similarities, not distances, and the
  # summaries below run over the full matrix, diagonal and both triangles
  # included -- confirm this is intended.
  word_word_dists <- coop::cosine(t(this_kids_model[, -1]))

  # var() on a matrix gives the covariance matrix of its columns; average its
  # entries once and report NA when that average is exactly zero or missing.
  mean_var <- mean(var(word_word_dists))
  var_dist <- if (is.na(mean_var) || mean_var == 0) NA else mean_var

  data.frame(target_child_id = id,
             mean_dist_t1 = mean(word_word_dists),
             median_dist_t1 = median(word_word_dists),
             var_dist_t1 = var_dist,
             n_t1 = nrow(word_word_dists),
             # NOTE(review): named "median" but computed with mean(); left
             # unchanged so downstream results stay the same.
             median_freq_t1 = mean(words_in_model$log_freq_w1, na.rm = TRUE))
}
# One row per child, with that child's word types in a list-column `data`.
nested_data_by_kid <- nest(types_clean, -target_child_id)

# Vocabulary measures per child under each embedding model; columns get a
# model-specific suffix so the two tables can be joined side by side.
vocab_measures_wiki <- map2_df(nested_data_by_kid$target_child_id,
                               nested_data_by_kid$data,
                               get_vocab_measure_by_kid3,
                               word2vec_model_wiki) %>%
  rename(mean_dist_t1_wiki = mean_dist_t1,
         median_dist_t1_wiki = median_dist_t1,
         var_dist_t1_wiki = var_dist_t1,
         n_t1_wiki = n_t1,
         median_freq_t1_wiki = median_freq_t1)

vocab_measures_childes <- map2_df(nested_data_by_kid$target_child_id,
                                  nested_data_by_kid$data,
                                  get_vocab_measure_by_kid3,
                                  word2vec_model_childes) %>%
  rename(mean_dist_t1_childes = mean_dist_t1,
         median_dist_t1_childes = median_dist_t1,
         var_dist_t1_childes = var_dist_t1,
         n_t1_childes = n_t1,
         median_freq_t1_childes = median_freq_t1)

# target_child_id is the only column the two tables share after renaming,
# so name it explicitly as the join key.
vocab_measures <- left_join(vocab_measures_wiki, vocab_measures_childes,
                            by = "target_child_id") %>%
  mutate(mean_dist_t1_wiki_log = log(mean_dist_t1_wiki),
         mean_dist_t1_childes_log = log(mean_dist_t1_childes)) %>%
  select(-mean_dist_t1_wiki, -mean_dist_t1_childes)

# Agreement between the two models' (log) mean measures.
ggplot(vocab_measures, aes(x = mean_dist_t1_childes_log, y = mean_dist_t1_wiki_log)) +
  geom_point() +
  geom_smooth(method = "lm")

embedding_corr <- cor.test(vocab_measures$mean_dist_t1_childes_log, vocab_measures$mean_dist_t1_wiki_log)
embedding_corr
##
## Pearson's product-moment correlation
##
## data: vocab_measures$mean_dist_t1_childes_log and vocab_measures$mean_dist_t1_wiki_log
## t = 5.0439, df = 83, p-value = 2.641e-06
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3024613 0.6322227
## sample estimates:
## cor
## 0.4843625
The two embedding-based measures are moderately and significantly correlated (r = 0.48, p < .001).
Get merged dataframe
# Attach per-child covariates to the vocabulary measures. target_child_id is
# the only column vocab_measures shares with the selected kid_info columns,
# so it is named explicitly as the join key.
vocab_df <- vocab_measures %>%
  filter(n_t1_wiki > 2) %>% # can't look at median distance when there's only two words
  left_join(kid_info %>% select(target_child_id,
                                log_mtld_t1,
                                log_mtld_t2,
                                age_t1, age_diff,
                                log_transcript_length_t1, log_transcript_length_t2,
                                log_num_trigrams_t1, log_num_trigrams_t2,
                                mlu_m_t1, mlu_m_t2,
                                target_child_sex,
                                conc_t1,
                                # n_clusters,
                                # mean_trans_prob_t1 ,
                                # prop_na_trans_t1,
                                Per_Corred_Pairs_No_Tax,
                                Num_Corred_Pairs_No_Tax,
                                prop_noun_t1,
                                prop_verb_t1,
                                corpus_id, collection_name,
                                age_t2),
            by = "target_child_id")

# Descriptive stats
# Drop the id / sex / corpus / collection columns before correlating.
df_no_corrs <- select(
  vocab_df,
  -c(target_child_id, target_child_sex, corpus_id, collection_name)
)
#filter_all(all_vars(!is.na(.)))

# Pairwise-complete correlation matrix and the matching p-value matrix.
corr_mat <- cor(df_no_corrs, use = "pairwise.complete.obs")
p.mat <- corrplot::cor.mtest(
  df_no_corrs,
  conf.level = .95,
  use = "pairwise.complete.obs"
)[["p"]]

# 100-step red-white-blue ramp, reversed.
cols <- colorRampPalette(c("red", "white", "blue"))(100) %>%
  rev()

# Coloured correlation matrix; cells with p >= .05 are left blank.
corrplot::corrplot(
  corr_mat,
  p.mat = p.mat, sig.level = .05, insig = "blank",
  method = "color", col = cols,
  type = "full", order = "original", #order = "hclust"
  diag = FALSE,
  addCoef.col = "black", number.cex = .7,
  tl.col = "black", tl.srt = 90
)
#age_t1
#log_transcript_length_t1
#mlu_m_t1*
#mean_trans_prob_t1
#prop_noun_t1
#median_freq_t1_wiki

# Mixed model: MTLD at t2 predicted from the wiki-embedding measure plus t1
# covariates, with a random intercept per corpus. Rows with a missing
# Num_Corred_Pairs_No_Tax are excluded.
lme4::lmer(scale(log_mtld_t2) ~
             scale(mean_dist_t1_wiki_log) +
             scale(Num_Corred_Pairs_No_Tax) +
             scale(age_t1) +
             scale(prop_noun_t1) +
             scale(median_freq_t1_wiki) +
             scale(log_mtld_t1) +
             scale(mlu_m_t1) +
             scale(log_transcript_length_t1) +
             (1|corpus_id),
           data = filter(vocab_df, !is.na(Num_Corred_Pairs_No_Tax))) %>%
  summary()
## Linear mixed model fit by REML ['lmerMod']
## Formula:
## scale(log_mtld_t2) ~ scale(mean_dist_t1_wiki_log) + scale(Num_Corred_Pairs_No_Tax) +
## scale(age_t1) + scale(prop_noun_t1) + scale(median_freq_t1_wiki) +
## scale(log_mtld_t1) + scale(mlu_m_t1) + scale(log_transcript_length_t1) +
## (1 | corpus_id)
## Data: filter(vocab_df, !is.na(Num_Corred_Pairs_No_Tax))
##
## REML criterion at convergence: 85.3
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.03026 -0.56609 0.03706 0.43285 2.65390
##
## Random effects:
## Groups Name Variance Std.Dev.
## corpus_id (Intercept) 0.05095 0.2257
## Residual 0.17907 0.4232
## Number of obs: 52, groups: corpus_id, 19
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 0.085367 0.091519 0.933
## scale(mean_dist_t1_wiki_log) -0.078080 0.158004 -0.494
## scale(Num_Corred_Pairs_No_Tax) 0.209218 0.074875 2.794
## scale(age_t1) -0.253571 0.083444 -3.039
## scale(prop_noun_t1) 0.140604 0.089169 1.577
## scale(median_freq_t1_wiki) 0.105907 0.107449 0.986
## scale(log_mtld_t1) 0.862979 0.091664 9.415
## scale(mlu_m_t1) -0.009338 0.104150 -0.090
## scale(log_transcript_length_t1) 0.080539 0.169678 0.475
##
## Correlation of Fixed Effects:
## (Intr) s(__1__ s(N_C_ sc(_1) scl(p__1) s(__1_) scl(l__1)
## scl(m__1__) 0.035
## s(N_C_P_N_T 0.070 0.488
## scale(g_t1) 0.038 0.143 0.004
## scl(prp__1) -0.046 -0.266 -0.417 0.026
## scl(md__1_) -0.038 -0.363 -0.318 -0.079 0.470
## scl(lg_m_1) 0.025 -0.230 -0.173 -0.163 0.161 -0.013
## scl(ml_m_1) -0.023 0.127 0.050 0.319 0.095 -0.124 -0.566
## scl(lg___1) -0.049 0.624 0.279 -0.171 -0.219 0.149 -0.058
## scl(m__1)
## scl(m__1__)
## s(N_C_P_N_T
## scale(g_t1)
## scl(prp__1)
## scl(md__1_)
## scl(lg_m_1)
## scl(ml_m_1)
## scl(lg___1) -0.368
# Mixed model: MLU at t2 predicted from the wiki-embedding measure plus t1
# covariates, with a random intercept per corpus. Rows with a missing
# Num_Corred_Pairs_No_Tax are excluded.
summary(
  lme4::lmer(
    scale(mlu_m_t2) ~
      scale(mean_dist_t1_wiki_log) +
      scale(Num_Corred_Pairs_No_Tax) +
      scale(age_t1) +
      scale(prop_noun_t1) +
      scale(median_freq_t1_wiki) +
      scale(log_mtld_t1) +
      scale(mlu_m_t1) +
      scale(log_transcript_length_t1) +
      (1|corpus_id),
    data = filter(vocab_df, !is.na(Num_Corred_Pairs_No_Tax))
  )
)
## Linear mixed model fit by REML ['lmerMod']
## Formula:
## scale(mlu_m_t2) ~ scale(mean_dist_t1_wiki_log) + scale(Num_Corred_Pairs_No_Tax) +
## scale(age_t1) + scale(prop_noun_t1) + scale(median_freq_t1_wiki) +
## scale(log_mtld_t1) + scale(mlu_m_t1) + scale(log_transcript_length_t1) +
## (1 | corpus_id)
## Data: filter(vocab_df, !is.na(Num_Corred_Pairs_No_Tax))
##
## REML criterion at convergence: 111.1
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.29033 -0.46106 0.02629 0.33578 2.62574
##
## Random effects:
## Groups Name Variance Std.Dev.
## corpus_id (Intercept) 0.0000 0.0000
## Residual 0.3753 0.6126
## Number of obs: 52, groups: corpus_id, 19
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 0.004453 0.085519 0.052
## scale(mean_dist_t1_wiki_log) -0.124233 0.211848 -0.586
## scale(Num_Corred_Pairs_No_Tax) 0.036079 0.099970 0.361
## scale(age_t1) -0.302221 0.101093 -2.990
## scale(prop_noun_t1) 0.248935 0.117273 2.123
## scale(median_freq_t1_wiki) 0.190358 0.137726 1.382
## scale(log_mtld_t1) 0.150910 0.120814 1.249
## scale(mlu_m_t1) 0.629587 0.139059 4.527
## scale(log_transcript_length_t1) 0.049437 0.223102 0.222
##
## Correlation of Fixed Effects:
## (Intr) s(__1__ s(N_C_ sc(_1) scl(p__1) s(__1_) scl(l__1)
## scl(m__1__) -0.015
## s(N_C_P_N_T -0.018 0.441
## scale(g_t1) -0.048 0.142 -0.017
## scl(prp__1) -0.040 -0.168 -0.335 0.015
## scl(md__1_) -0.017 -0.300 -0.190 -0.104 0.388
## scl(lg_m_1) 0.096 -0.195 -0.140 -0.267 0.101 -0.092
## scl(ml_m_1) -0.082 0.072 -0.049 0.429 0.233 0.006 -0.553
## scl(lg___1) 0.021 0.720 0.389 -0.200 -0.280 0.110 -0.085
## scl(m__1)
## scl(m__1__)
## s(N_C_P_N_T
## scale(g_t1)
## scl(prp__1)
## scl(md__1_)
## scl(lg_m_1)
## scl(ml_m_1)
## scl(lg___1) -0.327
# Mixed model: number of trigrams at t2 predicted from the wiki-embedding
# measure plus t1 covariates, with a random intercept per corpus. Rows with
# a missing Num_Corred_Pairs_No_Tax are excluded.
summary(
  lme4::lmer(
    scale(log_num_trigrams_t2) ~
      scale(mean_dist_t1_wiki_log) +
      scale(Num_Corred_Pairs_No_Tax) +
      scale(age_t1) +
      scale(prop_noun_t1) +
      scale(median_freq_t1_wiki) +
      scale(log_num_trigrams_t1) +
      scale(mlu_m_t1) +
      scale(log_mtld_t1) +
      scale(log_transcript_length_t1) +
      (1|corpus_id),
    data = filter(vocab_df, !is.na(Num_Corred_Pairs_No_Tax))
  )
)
## Linear mixed model fit by REML ['lmerMod']
## Formula:
## scale(log_num_trigrams_t2) ~ scale(mean_dist_t1_wiki_log) + scale(Num_Corred_Pairs_No_Tax) +
## scale(age_t1) + scale(prop_noun_t1) + scale(median_freq_t1_wiki) +
## scale(log_num_trigrams_t1) + scale(mlu_m_t1) + scale(log_mtld_t1) +
## scale(log_transcript_length_t1) + (1 | corpus_id)
## Data: filter(vocab_df, !is.na(Num_Corred_Pairs_No_Tax))
##
## REML criterion at convergence: 73.8
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.05287 -0.47097 -0.04895 0.40862 2.48630
##
## Random effects:
## Groups Name Variance Std.Dev.
## corpus_id (Intercept) 0.1075 0.3278
## Residual 0.1124 0.3352
## Number of obs: 52, groups: corpus_id, 19
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 0.14733 0.10175 1.448
## scale(mean_dist_t1_wiki_log) -0.29307 0.15524 -1.888
## scale(Num_Corred_Pairs_No_Tax) 0.04356 0.06599 0.660
## scale(age_t1) 0.11322 0.08008 1.414
## scale(prop_noun_t1) 0.11816 0.07772 1.520
## scale(median_freq_t1_wiki) 0.27125 0.10982 2.470
## scale(log_num_trigrams_t1) 0.43371 0.20208 2.146
## scale(mlu_m_t1) 0.08648 0.08766 0.987
## scale(log_mtld_t1) 0.01803 0.09730 0.185
## scale(log_transcript_length_t1) 0.11683 0.21123 0.553
##
## Correlation of Fixed Effects:
## (Intr) s(__1__ s(N_C_ sc(_1) scl(p__1) s(__1_) scl(lg_n__1)
## scl(m__1__) 0.130
## s(N_C_P_N_T 0.134 0.575
## scale(g_t1) 0.013 0.094 -0.008
## scl(prp__1) -0.092 -0.392 -0.498 0.030
## scl(md__1_) -0.127 -0.557 -0.478 -0.039 0.528
## scl(lg_n__1) 0.177 0.510 0.313 -0.059 -0.224 -0.554
## scl(ml_m_1) -0.021 0.061 0.045 0.249 0.059 -0.071 -0.116
## scl(lg_m_1) -0.100 -0.459 -0.302 -0.039 0.266 0.299 -0.560
## scl(lg_t__1) -0.169 0.002 -0.065 -0.038 0.014 0.467 -0.706
## scl(m__1) scl(l__1)
## scl(m__1__)
## s(N_C_P_N_T
## scale(g_t1)
## scl(prp__1)
## scl(md__1_)
## scl(lg_n__1)
## scl(ml_m_1)
## scl(lg_m_1) -0.389
## scl(lg_t__1) -0.187 0.352
# Mixed model: MTLD at t2 predicted from the CHILDES-embedding measure plus
# t1 covariates, with a random intercept per corpus. Rows with a missing
# Num_Corred_Pairs_No_Tax are excluded.
summary(
  lme4::lmer(
    scale(log_mtld_t2) ~
      scale(mean_dist_t1_childes_log) +
      scale(Num_Corred_Pairs_No_Tax) +
      scale(age_t1) +
      scale(prop_noun_t1) +
      scale(median_freq_t1_childes) +
      scale(log_mtld_t1) +
      scale(mlu_m_t1) +
      scale(log_transcript_length_t1) +
      (1|corpus_id),
    data = filter(vocab_df, !is.na(Num_Corred_Pairs_No_Tax))
  )
)
## Linear mixed model fit by REML ['lmerMod']
## Formula:
## scale(log_mtld_t2) ~ scale(mean_dist_t1_childes_log) + scale(Num_Corred_Pairs_No_Tax) +
## scale(age_t1) + scale(prop_noun_t1) + scale(median_freq_t1_childes) +
## scale(log_mtld_t1) + scale(mlu_m_t1) + scale(log_transcript_length_t1) +
## (1 | corpus_id)
## Data: filter(vocab_df, !is.na(Num_Corred_Pairs_No_Tax))
##
## REML criterion at convergence: 84.9
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.03398 -0.50750 0.07519 0.42842 2.40186
##
## Random effects:
## Groups Name Variance Std.Dev.
## corpus_id (Intercept) 0.03747 0.1936
## Residual 0.18095 0.4254
## Number of obs: 52, groups: corpus_id, 19
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 0.07326 0.08591 0.853
## scale(mean_dist_t1_childes_log) 0.13751 0.12334 1.115
## scale(Num_Corred_Pairs_No_Tax) 0.23724 0.06692 3.545
## scale(age_t1) -0.24769 0.08019 -3.089
## scale(prop_noun_t1) 0.10953 0.08924 1.227
## scale(median_freq_t1_childes) -0.05672 0.17218 -0.329
## scale(log_mtld_t1) 0.85186 0.08830 9.648
## scale(mlu_m_t1) -0.04476 0.10632 -0.421
## scale(log_transcript_length_t1) 0.08222 0.14332 0.574
##
## Correlation of Fixed Effects:
## (Intr) s(__1__ s(N_C_ sc(_1) scl(p__1) s(__1_) scl(l__1)
## scl(m__1__) -0.005
## s(N_C_P_N_T 0.052 0.235
## scale(g_t1) 0.031 -0.035 -0.085
## scl(prp__1) -0.035 -0.300 -0.379 0.073
## scl(md__1_) -0.013 -0.822 -0.278 0.012 0.466
## scl(lg_m_1) 0.042 -0.083 -0.092 -0.151 0.127 0.010
## scl(ml_m_1) -0.030 -0.262 -0.084 0.322 0.216 0.175 -0.515
## scl(lg___1) -0.070 -0.438 -0.117 -0.303 0.050 0.622 0.141
## scl(m__1)
## scl(m__1__)
## s(N_C_P_N_T
## scale(g_t1)
## scl(prp__1)
## scl(md__1_)
## scl(lg_m_1)
## scl(ml_m_1)
## scl(lg___1) -0.385
# Mixed model: MLU at t2 predicted from the CHILDES-embedding measure plus
# t1 covariates, with a random intercept per corpus. Rows with a missing
# Num_Corred_Pairs_No_Tax are excluded.
summary(
  lme4::lmer(
    scale(mlu_m_t2) ~
      scale(mean_dist_t1_childes_log) +
      scale(Num_Corred_Pairs_No_Tax) +
      scale(age_t1) +
      scale(prop_noun_t1) +
      scale(median_freq_t1_childes) +
      scale(log_mtld_t1) +
      scale(mlu_m_t1) +
      scale(log_transcript_length_t1) +
      (1|corpus_id),
    data = filter(vocab_df, !is.na(Num_Corred_Pairs_No_Tax))
  )
)
## Linear mixed model fit by REML ['lmerMod']
## Formula:
## scale(mlu_m_t2) ~ scale(mean_dist_t1_childes_log) + scale(Num_Corred_Pairs_No_Tax) +
## scale(age_t1) + scale(prop_noun_t1) + scale(median_freq_t1_childes) +
## scale(log_mtld_t1) + scale(mlu_m_t1) + scale(log_transcript_length_t1) +
## (1 | corpus_id)
## Data: filter(vocab_df, !is.na(Num_Corred_Pairs_No_Tax))
##
## REML criterion at convergence: 111.7
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.21791 -0.38342 0.00357 0.38874 2.67770
##
## Random effects:
## Groups Name Variance Std.Dev.
## corpus_id (Intercept) 0.002809 0.0530
## Residual 0.374400 0.6119
## Number of obs: 52, groups: corpus_id, 19
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 0.00868 0.08776 0.099
## scale(mean_dist_t1_childes_log) 0.04576 0.16432 0.278
## scale(Num_Corred_Pairs_No_Tax) 0.07069 0.09294 0.761
## scale(age_t1) -0.29495 0.10107 -2.918
## scale(prop_noun_t1) 0.22428 0.12418 1.806
## scale(median_freq_t1_childes) 0.11488 0.23470 0.489
## scale(log_mtld_t1) 0.13141 0.11942 1.100
## scale(mlu_m_t1) 0.62898 0.14266 4.409
## scale(log_transcript_length_t1) 0.11494 0.17958 0.640
##
## Correlation of Fixed Effects:
## (Intr) s(__1__ s(N_C_ sc(_1) scl(p__1) s(__1_) scl(l__1)
## scl(m__1__) 0.009
## s(N_C_P_N_T -0.004 0.257
## scale(g_t1) -0.037 -0.084 -0.108
## scl(prp__1) -0.042 -0.363 -0.361 0.071
## scl(md__1_) -0.021 -0.830 -0.251 0.038 0.488
## scl(lg_m_1) 0.089 -0.117 -0.091 -0.230 0.114 0.015
## scl(ml_m_1) -0.076 -0.222 -0.137 0.424 0.300 0.194 -0.508
## scl(lg___1) 0.024 -0.491 -0.038 -0.334 -0.005 0.646 0.135
## scl(m__1)
## scl(m__1__)
## s(N_C_P_N_T
## scale(g_t1)
## scl(prp__1)
## scl(md__1_)
## scl(lg_m_1)
## scl(ml_m_1)
## scl(lg___1) -0.360
# Mixed model: number of trigrams at t2 predicted from the CHILDES-embedding
# measure plus t1 covariates, with a random intercept per corpus. Rows with
# a missing Num_Corred_Pairs_No_Tax are excluded.
summary(
  lme4::lmer(
    scale(log_num_trigrams_t2) ~
      scale(mean_dist_t1_childes_log) +
      scale(Num_Corred_Pairs_No_Tax) +
      scale(age_t1) +
      scale(prop_noun_t1) +
      scale(median_freq_t1_childes) +
      scale(log_num_trigrams_t1) +
      scale(mlu_m_t1) +
      scale(log_mtld_t1) +
      scale(log_transcript_length_t1) +
      (1|corpus_id),
    data = filter(vocab_df, !is.na(Num_Corred_Pairs_No_Tax))
  )
)
## Linear mixed model fit by REML ['lmerMod']
## Formula: scale(log_num_trigrams_t2) ~ scale(mean_dist_t1_childes_log) +
## scale(Num_Corred_Pairs_No_Tax) + scale(age_t1) + scale(prop_noun_t1) +
## scale(median_freq_t1_childes) + scale(log_num_trigrams_t1) +
## scale(mlu_m_t1) + scale(log_mtld_t1) + scale(log_transcript_length_t1) +
## (1 | corpus_id)
## Data: filter(vocab_df, !is.na(Num_Corred_Pairs_No_Tax))
##
## REML criterion at convergence: 76.3
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.68203 -0.37379 0.01322 0.30205 2.55065
##
## Random effects:
## Groups Name Variance Std.Dev.
## corpus_id (Intercept) 0.1048 0.3238
## Residual 0.1194 0.3455
## Number of obs: 52, groups: corpus_id, 19
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 0.158287 0.101586 1.558
## scale(mean_dist_t1_childes_log) 0.154551 0.116916 1.322
## scale(Num_Corred_Pairs_No_Tax) 0.131709 0.056935 2.313
## scale(age_t1) 0.132754 0.081013 1.639
## scale(prop_noun_t1) 0.038993 0.075285 0.518
## scale(median_freq_t1_childes) -0.002295 0.152196 -0.015
## scale(log_num_trigrams_t1) 0.546293 0.186268 2.933
## scale(mlu_m_t1) 0.062748 0.093429 0.672
## scale(log_mtld_t1) -0.053639 0.088453 -0.606
## scale(log_transcript_length_t1) 0.124281 0.215672 0.576
##
## Correlation of Fixed Effects:
## (Intr) s(__1__ s(N_C_ sc(_1) scl(p__1) s(__1_) scl(lg_n__1)
## scl(m__1__) -0.074
## s(N_C_P_N_T 0.055 0.228
## scale(g_t1) -0.002 0.062 -0.060
## scl(prp__1) -0.027 -0.233 -0.393 0.057
## scl(md__1_) 0.016 -0.794 -0.313 -0.038 0.420
## scl(lg_n__1) 0.147 -0.287 -0.041 -0.138 0.041 0.007
## scl(ml_m_1) -0.010 -0.272 -0.052 0.224 0.151 0.191 -0.083
## scl(lg_m_1) -0.051 0.078 -0.036 0.004 0.088 -0.022 -0.428
## scl(lg_t__1) -0.171 -0.019 -0.076 -0.043 0.012 0.358 -0.783
## scl(m__1) scl(l__1)
## scl(m__1__)
## s(N_C_P_N_T
## scale(g_t1)
## scl(prp__1)
## scl(md__1_)
## scl(lg_n__1)
## scl(ml_m_1)
## scl(lg_m_1) -0.414
## scl(lg_t__1) -0.171 0.396
# Children with both Num_Corred_Pairs_No_Tax and mlu_m_t1 present.
plot_df <- filter(
  vocab_df,
  !is.na(Num_Corred_Pairs_No_Tax),
  !is.na(mlu_m_t1)
)

OUTFILE <- "spaghetti_plot.pdf"
#pdf(OUTFILE)

# Spaghetti plot: one line per child joining log MTLD at time 1 and time 2,
# with red points on top of the lines.
plot_df %>%
  select(`Time 1` = log_mtld_t1,
         `Time 2` = log_mtld_t2,
         target_child_id) %>%
  gather("timepoint", "value", -target_child_id) %>%
  ggplot(aes(x = timepoint, y = value, group = target_child_id)) +
  geom_line() +
  geom_point(color = "red") +
  labs(x = "", y = "Vocabulary size estimate (log MTLD)") +
  theme(axis.line.x = element_blank(),
        axis.line.y = element_line(size = 1),
        axis.ticks = element_blank(),
        axis.title.y = element_text(size = 14),
        axis.text.x = element_text(size = 14, color = "black"),
        axis.text.y = element_text(size = 12))
#dev.off()