library(knitr)
opts_chunk$set(echo = T, message = F, warning = F,
error = F, cache = F, tidy = F)
library(tidyverse)
library(langcog)
library(data.table)
library(feather)
theme_set(theme_classic(base_size = 10))MINWORDSFORVOCAB <- 5The min words for vocab here is 5.
Read in data
all_types <- read_csv("../1_mtld_measure/data/target_types_for_MTLD_kids_600_900.csv")
word2vec_model <- read_csv("3_train_childes_model/childes_kid_adult_w2v.txt") %>%
rename(target_word = word)
mlu_info <- read_csv("mlu_by_kid.csv")
trans_info <- read_csv("t1_transitional_probs_in_vocab_missing0.csv")
mcrae_info <- read_csv("mcrae_vocab_by_kid_t1.txt")
pos_info <- read_csv("prop_pos_by_kid_t1.csv")
kid_info <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/4_semantic_density/semantic_density_df.csv") %>%
left_join(mlu_info) %>%
left_join(trans_info) %>%
left_join(mcrae_info)%>%
left_join(pos_info) %>%
mutate(corpus_id = as.factor(corpus_id))
freq <- read_tsv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/1_mtld_measure/data/control_variables/SUBTLEXus_corpus.txt") %>%
rename(word = Word,
log_freq = Lg10WF) %>%
select(word, log_freq)Get filtered version of types for each kid
types_clean <- all_types %>%
filter(tbin == "t1") %>%
mutate(gloss_clean = tolower(gloss)) %>%
group_by(target_child_id, gloss_clean) %>%
summarize(count = sum(count)) %>%
filter(count >= MINWORDSFORVOCAB) %>%
mutate(log_count = log(count)) %>%
select(-count) %>%
left_join(freq, by= c("gloss_clean" = "word")) %>%
mutate(log_count_w1 = log_count,
log_count_w2 = log_count,
log_freq_w1 = log_freq,
log_freq_w2 = log_freq) %>%
select(-log_count, -log_freq)Get vocab measures by kids
get_vocab_measure_by_kid3 <- function(id, data, model){
this_kids_model <- model %>%
filter(target_word %in% data$gloss_clean)
words_in_model <- data %>%
filter(gloss_clean %in% this_kids_model$target_word)
# get pairwise distances
word_word_dists <- coop::cosine(t(this_kids_model[,-1]))
data.frame(target_child_id = id,
mean_dist_t1 = mean(word_word_dists),
median_dist_t1 = median(word_word_dists),
var_dist_t1 = ifelse(mean(var(word_word_dists)) == 0, NA,
mean(var(word_word_dists))),
n_t1 = nrow(word_word_dists),
median_freq_t1 = median(words_in_model$log_freq_w1, na.rm = T))
}
nested_data_by_kid <- nest(types_clean, -target_child_id)
vocab_measures <- map2_df(nested_data_by_kid$target_child_id,
nested_data_by_kid$data,
get_vocab_measure_by_kid3,
word2vec_model)Merge in other variables
vocab_df <- vocab_measures %>%
filter(n_t1 > 2) %>%
mutate_at(vars(mean_dist_t1, median_dist_t1, n_t1), log) %>%
left_join(kid_info %>% select(target_child_id,
log_mtld_t1,
log_mtld_t2,
age_t1, age_diff ,
log_transcript_length_t1, log_transcript_length_t2,
log_num_trigrams_t1, log_num_trigrams_t2,
mlu_m_t1, mlu_m_t2,
target_child_sex,
mean_trans_prob_t1 ,
prop_na_trans_t1,
Num_Corred_Pairs_No_Tax,
Density_No_Tax,
prop_noun_t1,
prop_verb_t1,
corpus_id, collection_name)) %>%
mutate(Density_No_Tax = ifelse(is.na(Density_No_Tax), 0, Density_No_Tax),
Num_Corred_Pairs_No_Tax = ifelse(is.na(Num_Corred_Pairs_No_Tax), 0,
Num_Corred_Pairs_No_Tax))df_no_corrs <- vocab_df %>%
select(-target_child_id, -target_child_sex, -corpus_id, -collection_name)
#filter_all(all_vars(!is.na(.)))
corr_mat <- cor(df_no_corrs,
use = "pairwise.complete.obs")
p.mat <- corrplot::cor.mtest(df_no_corrs,
conf.level = .95,
use = "pairwise.complete.obs")$p
cols <- rev(colorRampPalette(c("red", "white", "blue"))(100))
corrplot::corrplot(corr_mat, method = "color", col = cols,
type = "full", order = "hclust", number.cex = .7,
addCoef.col = "black", insig = "blank",
p.mat = p.mat, sig.level = .05,
tl.col = "black", tl.srt = 90,
diag = FALSE)lme4::lmer(scale(log_mtld_t2) ~
scale(median_dist_t1) +
#scale(var_dist_t1) +
scale(Num_Corred_Pairs_No_Tax) +
scale(age_t1) +
scale(age_diff) +
scale(log_mtld_t1) +
scale(prop_noun_t1) +
#target_child_sex +
scale(mlu_m_t1)+
scale(median_freq_t1) +
scale(log_transcript_length_t1) +
scale(log_transcript_length_t2) + (1|corpus_id),
data = vocab_df) %>%
summary()## Linear mixed model fit by REML ['lmerMod']
## Formula:
## scale(log_mtld_t2) ~ scale(median_dist_t1) + scale(Num_Corred_Pairs_No_Tax) +
## scale(age_t1) + scale(age_diff) + scale(log_mtld_t1) + scale(prop_noun_t1) +
## scale(mlu_m_t1) + scale(median_freq_t1) + scale(log_transcript_length_t1) +
## scale(log_transcript_length_t2) + (1 | corpus_id)
## Data: vocab_df
##
## REML criterion at convergence: 124.2
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.23905 -0.65873 0.02381 0.52434 2.78393
##
## Random effects:
## Groups Name Variance Std.Dev.
## corpus_id (Intercept) 0.03748 0.1936
## Residual 0.22049 0.4696
## Number of obs: 72, groups: corpus_id, 20
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) -0.005949 0.093468 -0.064
## scale(median_dist_t1) 0.181153 0.070643 2.564
## scale(Num_Corred_Pairs_No_Tax) 0.178496 0.073247 2.437
## scale(age_t1) -0.576090 0.207228 -2.780
## scale(age_diff) 0.037736 0.206346 0.183
## scale(log_mtld_t1) 0.921051 0.087638 10.510
## scale(prop_noun_t1) 0.033994 0.083522 0.407
## scale(mlu_m_t1) -0.148997 0.101315 -1.471
## scale(median_freq_t1) -0.070956 0.104437 -0.679
## scale(log_transcript_length_t1) 0.021193 0.196265 0.108
## scale(log_transcript_length_t2) 0.262450 0.197004 1.332
##
## Correlation of Fixed Effects:
## (Intr) scl(mdn_d_1) s(N_C_ sc(_1) scl(_) scl(l__1) scl(p__1)
## scl(mdn_d_1) -0.046
## s(N_C_P_N_T -0.009 0.188
## scale(g_t1) 0.205 -0.093 -0.016
## scal(g_dff) 0.180 -0.055 0.010 0.876
## scl(lg_m_1) -0.027 0.087 -0.080 -0.065 0.101
## scl(prp__1) 0.091 -0.067 -0.363 0.163 0.149 0.109
## scl(ml_m_1) -0.065 -0.204 -0.038 0.059 -0.112 -0.529 -0.136
## scl(mdn_f_1) 0.047 -0.347 -0.073 -0.110 -0.122 -0.192 0.464
## scl(lg___1) 0.173 0.189 -0.027 0.229 0.348 0.122 0.187
## scl(lg___2) -0.328 -0.174 -0.082 -0.416 -0.355 -0.015 -0.158
## scl(ml__1) scl(mdn_f_1) s(___1
## scl(mdn_d_1)
## s(N_C_P_N_T
## scale(g_t1)
## scal(g_dff)
## scl(lg_m_1)
## scl(prp__1)
## scl(ml_m_1)
## scl(mdn_f_1) -0.140
## scl(lg___1) -0.533 0.109
## scl(lg___2) 0.159 0.160 -0.750
lme4::lmer(scale(log_num_trigrams_t2) ~ scale(median_dist_t1) +
# scale(var_dist_t1)+
scale(Num_Corred_Pairs_No_Tax) +
#scale(Density_No_Tax) +
#scale(mean_trans_prob_t1) +
#scale(-prop_na_trans_t1) +
scale(age_t1) +
scale(age_diff) +
scale(log_num_trigrams_t1) +
scale(prop_noun_t1) +
#target_child_sex +
scale(mlu_m_t1)+
scale(median_freq_t1) +
scale(log_transcript_length_t1) +
scale(log_transcript_length_t2)+ (1|corpus_id),
data = vocab_df) %>%
summary()## Linear mixed model fit by REML ['lmerMod']
## Formula:
## scale(log_num_trigrams_t2) ~ scale(median_dist_t1) + scale(Num_Corred_Pairs_No_Tax) +
## scale(age_t1) + scale(age_diff) + scale(log_num_trigrams_t1) +
## scale(prop_noun_t1) + scale(mlu_m_t1) + scale(median_freq_t1) +
## scale(log_transcript_length_t1) + scale(log_transcript_length_t2) +
## (1 | corpus_id)
## Data: vocab_df
##
## REML criterion at convergence: 31.5
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.97344 -0.66564 -0.02972 0.54788 1.78412
##
## Random effects:
## Groups Name Variance Std.Dev.
## corpus_id (Intercept) 0.04133 0.2033
## Residual 0.04098 0.2024
## Number of obs: 72, groups: corpus_id, 20
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 0.06333 0.06913 0.916
## scale(median_dist_t1) 0.03943 0.03178 1.241
## scale(Num_Corred_Pairs_No_Tax) 0.07135 0.03306 2.158
## scale(age_t1) 0.02426 0.10931 0.222
## scale(age_diff) 0.23735 0.10486 2.264
## scale(log_num_trigrams_t1) 0.93501 0.09099 10.276
## scale(prop_noun_t1) 0.16975 0.03713 4.572
## scale(mlu_m_t1) 0.01527 0.04164 0.367
## scale(median_freq_t1) 0.07087 0.04717 1.503
## scale(log_transcript_length_t1) -0.59953 0.13118 -4.570
## scale(log_transcript_length_t2) 0.76921 0.11156 6.895
##
## Correlation of Fixed Effects:
## (Intr) scl(mdn_d_1) s(N_C_ sc(_1) scl(_) scl(lg_n__1)
## scl(mdn_d_1) -0.099
## s(N_C_P_N_T -0.011 0.213
## scale(g_t1) 0.220 -0.100 -0.015
## scal(g_dff) 0.242 -0.097 -0.017 0.863
## scl(lg_n__1) 0.092 -0.114 -0.169 0.072 0.136
## scl(prp__1) 0.135 -0.107 -0.381 0.187 0.169 0.105
## scl(ml_m_1) -0.085 -0.149 -0.016 -0.018 -0.083 -0.320
## scl(mdn_f_1) 0.062 -0.324 -0.058 -0.062 -0.088 -0.205
## scl(lg_t__1) 0.064 0.138 0.087 0.179 0.146 -0.636
## scl(lg___2) -0.378 -0.040 -0.092 -0.432 -0.321 0.055
## scl(p__1) scl(ml__1) scl(mdn_f_1) scl(lg_t__1)
## scl(mdn_d_1)
## s(N_C_P_N_T
## scale(g_t1)
## scal(g_dff)
## scl(lg_n__1)
## scl(prp__1)
## scl(ml_m_1) -0.143
## scl(mdn_f_1) 0.467 -0.193
## scl(lg_t__1) 0.093 -0.185 0.272
## scl(lg___2) -0.192 0.145 0.004 -0.615
get_vocab_measure_by_kid2 <- function(id, data, model){
this_kids_model <- model %>%
filter(target_word %in% data$gloss_clean)
words_in_model <- data %>%
filter(gloss_clean %in% this_kids_model$target_word)
# get pairwise distances
word_word_dists <- coop::cosine(t(this_kids_model[,-1]))
wide_word_word_dists <- word_word_dists %>%
as.data.frame() %>%
mutate(word1 = words_in_model$gloss_clean) %>%
select(word1, everything())
names(wide_word_word_dists) = c("word1", words_in_model$gloss_clean)
long_word_word_dists_wiki <- wide_word_word_dists %>%
gather("word2", "cos_dist_wiki", -word1) %>%
select(word1, word2, everything())
# merge in word counts and frequency infor for each word
word_dists_with_meta <- long_word_word_dists_wiki %>%
left_join(data %>% select(-log_count_w2, -log_freq_w2),
by = c("word1" = "gloss_clean" )) %>%
left_join(data %>% select(-log_count_w1, -log_freq_w1),
by = c("word2" = "gloss_clean")) %>%
rowwise() %>%
mutate(mean_log_count = mean(c(log_count_w1, log_count_w2), na.rm = T),
mean_log_freq = mean(c(log_freq_w1, log_freq_w2), na.rm = T)) %>%
select(word1, word2, cos_dist_wiki, mean_log_count, mean_log_freq)
try({
model <- lm(cos_dist_wiki~ mean_log_count + mean_log_freq,
data = word_dists_with_meta)
dists_with_resids <- word_dists_with_meta %>%
add_residuals(model, var = "resid_cos_dist_wiki") %>%
select(-mean_log_count, -mean_log_freq) %>%
select(-word2) %>%
group_by(word1) %>%
summarize_all(mean, na.rm = T)
df_return <- data.frame(target_child_id = id,
mean_dist_t1 = mean(dists_with_resids$cos_dist_wiki),
median_dist_t1 = median(dists_with_resids$cos_dist_wiki),
var_dist_t1 = ifelse(mean(var(dists_with_resids$cos_dist_wiki)) == 0, NA,
mean(var(dists_with_resids$cos_dist_wiki))),
mean_r_dist_t1 = mean(dists_with_resids$resid_cos_dist_wiki),
median_r_dist_t1 = median(dists_with_resids$resid_cos_dist_wiki),
var_r_dist_t1 = ifelse(mean(var(dists_with_resids$resid_cos_dist_wiki)) == 0, NA,
mean(var(dists_with_resids$resid_cos_dist_wiki))),
n_t1 = nrow(dists_with_resids))
}, {
dists_with_resids <- word_dists_with_meta %>%
select(-mean_log_count, -mean_log_freq) %>%
select(-word2) %>%
group_by(word1) %>%
summarize_all(mean, na.rm = T)
df_return <- data.frame(target_child_id = id,
mean_dist_t1 = mean(dists_with_resids$cos_dist_wiki),
median_dist_t1 = median(dists_with_resids$cos_dist_wiki),
var_dist_t1 = ifelse(mean(var(dists_with_resids$cos_dist_wiki)) == 0, NA,
mean(var(dists_with_resids$cos_dist_wiki))),
mean_r_dist_t1 = NA,
median_r_dist_t1 = NA,
var_r_dist_t1 = NA,
n_t1 = nrow(dists_with_resids))
})
df_return
}
get_vocab_measure_by_kid1 <- function(id, data, model){
this_kids_model <- model %>%
filter(target_word %in% data$gloss_clean)
words_in_model <- data %>%
filter(gloss_clean %in% this_kids_model$target_word)
# get pairwise distances
word_word_dists <- coop::cosine(t(this_kids_model[,-1])) %>%
rowMeans()
data.frame(target_child_id = id,
mean_dist_t1 = mean(word_word_dists),
median_dist_t1 = median(word_word_dists),
var_dist_t1 = ifelse(mean(var(word_word_dists)) == 0, NA,
mean(var(word_word_dists))),
n_t1 = length(word_word_dists),
median_freq = median(words_in_model$log_freq_w1, na.rm = T))
}
vocab_df <- vocab_measures %>%
mutate_at(vars(mean_dist_t1, median_dist_t1, var_dist_t1,
mean_r_dist_t1, median_r_dist_t1, var_r_dist_t1, n_t1),
function(x) {log(x + 10)}) %>%
left_join(kid_info %>% select(target_child_id, log_mtld_t1,
log_mtld_t2, age_t1, age_t2, mtld_diff, age_diff,
mean_log_word_freq_t1, log_transcript_length_t1, log_transcript_length_t2, log_num_trigrams_t2,
log_num_trigrams_t1)) %>%
mutate_if(is.numeric, scale) # scale everything for regressions
vocab_df %>%
select(-target_child_id) %>%
gather("var", "value") %>%
ggplot(aes( x= value)) +
geom_histogram(scales = "free") +
facet_wrap(~var)