library(knitr)
opts_chunk$set(echo = T, message = F, warning = F,
error = F, cache = F, tidy = F)
library(tidyverse)
library(langcog)
library(data.table)
library(feather)
theme_set(theme_classic(base_size = 10))
MINWORDSFORVOCAB <- 5
The min words for vocab here is 5.
Read in data
all_types <- read_csv("../1_mtld_measure/data/target_types_for_MTLD_kids_600_900.csv")
fasttext_model <- read_feather("fast_text_childes_words_600_900.feather")
mlu_info <- read_csv("mlu_by_kid.csv")
trans_info <- read_csv("t1_transitional_probs_in_vocab_missing0.csv")
mcrae_info <- read_csv("mcrae_vocab_by_kid_t1.txt")
pos_info <- read_csv("prop_pos_by_kid_t1.csv")
kid_info <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/4_semantic_density/semantic_density_df.csv") %>%
left_join(mlu_info) %>%
left_join(trans_info) %>%
left_join(mcrae_info)%>%
left_join(pos_info) %>%
mutate(corpus_id = as.factor(corpus_id))
freq <- read_tsv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/1_mtld_measure/data/control_variables/SUBTLEXus_corpus.txt") %>%
rename(word = Word,
log_freq = Lg10WF) %>%
select(word, log_freq)
Get filtered version of types for each kid
types_clean <- all_types %>%
filter(tbin == "t1") %>%
mutate(gloss_clean = tolower(gloss)) %>%
group_by(target_child_id, gloss_clean) %>%
summarize(count = sum(count)) %>%
filter(count >= MINWORDSFORVOCAB) %>%
mutate(log_count = log(count)) %>%
select(-count) %>%
left_join(freq, by= c("gloss_clean" = "word")) %>%
mutate(log_count_w1 = log_count,
log_count_w2 = log_count,
log_freq_w1 = log_freq,
log_freq_w2 = log_freq) %>%
select(-log_count, -log_freq)
Get vocab measures by kids
get_vocab_measure_by_kid3 <- function(id, data, model){
this_kids_model <- model %>%
filter(target_word %in% data$gloss_clean)
words_in_model <- data %>%
filter(gloss_clean %in% this_kids_model$target_word)
# get pairwise distances
word_word_dists <- coop::cosine(t(this_kids_model[,-1]))
data.frame(target_child_id = id,
mean_dist_t1 = mean(word_word_dists),
median_dist_t1 = median(word_word_dists),
var_dist_t1 = ifelse(mean(var(word_word_dists)) == 0, NA,
mean(var(word_word_dists))),
n_t1 = nrow(word_word_dists),
median_freq_t1 = median(words_in_model$log_freq_w1, na.rm = T))
}
nested_data_by_kid <- nest(types_clean, -target_child_id)
vocab_measures <- map2_df(nested_data_by_kid$target_child_id,
nested_data_by_kid$data,
get_vocab_measure_by_kid3,
fasttext_model)
Merge in other variables
vocab_df <- vocab_measures %>%
filter(n_t1 > 2) %>%
mutate_at(vars(mean_dist_t1, median_dist_t1, n_t1), log) %>%
left_join(kid_info %>% select(target_child_id,
log_mtld_t1,
log_mtld_t2,
age_t1, age_diff ,
log_transcript_length_t1, log_transcript_length_t2,
log_num_trigrams_t1, log_num_trigrams_t2,
mlu_m_t1, mlu_m_t2,
target_child_sex,
mean_trans_prob_t1 ,
prop_na_trans_t1,
Num_Corred_Pairs_No_Tax,
Density_No_Tax,
prop_noun_t1,
prop_verb_t1,
corpus_id, collection_name)) %>%
mutate(Density_No_Tax = ifelse(is.na(Density_No_Tax), 0, Density_No_Tax),
Num_Corred_Pairs_No_Tax = ifelse(is.na(Num_Corred_Pairs_No_Tax), 0,
Num_Corred_Pairs_No_Tax))
df_no_corrs <- vocab_df %>%
select(-target_child_id, -target_child_sex, -corpus_id, -collection_name)
#filter_all(all_vars(!is.na(.)))
corr_mat <- cor(df_no_corrs,
use = "pairwise.complete.obs")
p.mat <- corrplot::cor.mtest(df_no_corrs,
conf.level = .95,
use = "pairwise.complete.obs")$p
cols <- rev(colorRampPalette(c("red", "white", "blue"))(100))
corrplot::corrplot(corr_mat, method = "color", col = cols,
type = "full", order = "hclust", number.cex = .7,
addCoef.col = "black", insig = "blank",
p.mat = p.mat, sig.level = .05,
tl.col = "black", tl.srt = 90,
diag = FALSE)
lme4::lmer(scale(log_mtld_t2) ~
scale(median_dist_t1) +
scale(var_dist_t1) +
scale(Num_Corred_Pairs_No_Tax) +
scale(age_t1) +
scale(age_diff) +
scale(log_mtld_t1) +
scale(prop_noun_t1) +
#target_child_sex +
scale(mlu_m_t1)+
scale(median_freq_t1) +
scale(log_transcript_length_t1) +
scale(log_transcript_length_t2) + (1|corpus_id),
data = vocab_df) %>%
summary()
## Linear mixed model fit by REML ['lmerMod']
## Formula:
## scale(log_mtld_t2) ~ scale(median_dist_t1) + scale(var_dist_t1) +
## scale(Num_Corred_Pairs_No_Tax) + scale(age_t1) + scale(age_diff) +
## scale(log_mtld_t1) + scale(prop_noun_t1) + scale(mlu_m_t1) +
## scale(median_freq_t1) + scale(log_transcript_length_t1) +
## scale(log_transcript_length_t2) + (1 | corpus_id)
## Data: vocab_df
##
## REML criterion at convergence: 128.9
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.4753 -0.6087 0.1334 0.5244 1.9933
##
## Random effects:
## Groups Name Variance Std.Dev.
## corpus_id (Intercept) 0.03198 0.1788
## Residual 0.20852 0.4566
## Number of obs: 76, groups: corpus_id, 20
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 0.02948 0.08600 0.343
## scale(median_dist_t1) 0.20577 0.08048 2.557
## scale(var_dist_t1) -0.23302 0.07277 -3.202
## scale(Num_Corred_Pairs_No_Tax) 0.15455 0.06890 2.243
## scale(age_t1) -0.49872 0.19268 -2.588
## scale(age_diff) -0.02190 0.18046 -0.121
## scale(log_mtld_t1) 0.84304 0.08544 9.867
## scale(prop_noun_t1) -0.02771 0.08186 -0.339
## scale(mlu_m_t1) -0.06818 0.09104 -0.749
## scale(median_freq_t1) 0.09973 0.08994 1.109
## scale(log_transcript_length_t1) -0.05012 0.17626 -0.284
## scale(log_transcript_length_t2) 0.33864 0.18184 1.862
##
## Correlation of Fixed Effects:
## (Intr) scl(mdn_d_1) scl(v__1) s(N_C_ sc(_1) scl(_) scl(l__1)
## scl(mdn_d_1) 0.038
## scl(vr_d_1) -0.086 -0.461
## s(N_C_P_N_T 0.049 0.188 -0.082
## scale(g_t1) 0.162 0.185 -0.094 0.055
## scal(g_dff) 0.093 0.117 -0.043 0.053 0.886
## scl(lg_m_1) -0.060 -0.140 0.095 -0.125 -0.103 0.044
## scl(prp__1) 0.005 -0.073 0.235 -0.346 0.144 0.116 0.164
## scl(ml_m_1) -0.004 -0.131 0.093 -0.007 0.041 -0.062 -0.558
## scl(mdn_f_1) 0.055 -0.119 -0.169 -0.021 -0.148 -0.151 -0.123
## scl(lg___1) 0.125 0.332 -0.219 -0.026 0.299 0.348 0.100
## scl(lg___2) -0.293 -0.235 0.191 -0.101 -0.453 -0.346 0.015
## scl(p__1) scl(ml__1) scl(mdn_f_1) s(___1
## scl(mdn_d_1)
## scl(vr_d_1)
## s(N_C_P_N_T
## scale(g_t1)
## scal(g_dff)
## scl(lg_m_1)
## scl(prp__1)
## scl(ml_m_1) -0.068
## scl(mdn_f_1) 0.344 -0.171
## scl(lg___1) 0.111 -0.442 0.108
## scl(lg___2) -0.120 0.076 0.124 -0.750
lme4::lmer(scale(log_num_trigrams_t2) ~ scale(median_dist_t1) +
scale(var_dist_t1)+
scale(Num_Corred_Pairs_No_Tax) +
#scale(Density_No_Tax) +
#scale(mean_trans_prob_t1) +
#scale(-prop_na_trans_t1) +
scale(age_t1) +
scale(age_diff) +
scale(log_num_trigrams_t1) +
scale(prop_noun_t1) +
#target_child_sex +
scale(mlu_m_t1)+
scale(median_freq_t1) +
scale(log_transcript_length_t1) +
scale(log_transcript_length_t2)+ (1|corpus_id),
data = vocab_df) %>%
summary()
## Linear mixed model fit by REML ['lmerMod']
## Formula:
## scale(log_num_trigrams_t2) ~ scale(median_dist_t1) + scale(var_dist_t1) +
## scale(Num_Corred_Pairs_No_Tax) + scale(age_t1) + scale(age_diff) +
## scale(log_num_trigrams_t1) + scale(prop_noun_t1) + scale(mlu_m_t1) +
## scale(median_freq_t1) + scale(log_transcript_length_t1) +
## scale(log_transcript_length_t2) + (1 | corpus_id)
## Data: vocab_df
##
## REML criterion at convergence: 41.8
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.7435 -0.5423 -0.1087 0.5222 2.2717
##
## Random effects:
## Groups Name Variance Std.Dev.
## corpus_id (Intercept) 0.02016 0.1420
## Residual 0.05112 0.2261
## Number of obs: 76, groups: corpus_id, 20
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 0.02983 0.05516 0.541
## scale(median_dist_t1) 0.03452 0.04018 0.859
## scale(var_dist_t1) -0.02244 0.03658 -0.613
## scale(Num_Corred_Pairs_No_Tax) 0.07187 0.03520 2.042
## scale(age_t1) -0.02685 0.10490 -0.256
## scale(age_diff) 0.14723 0.09693 1.519
## scale(log_num_trigrams_t1) 0.88649 0.09888 8.965
## scale(prop_noun_t1) 0.15127 0.04134 3.659
## scale(mlu_m_t1) 0.05532 0.04241 1.305
## scale(median_freq_t1) 0.07460 0.04609 1.618
## scale(log_transcript_length_t1) -0.67432 0.12868 -5.240
## scale(log_transcript_length_t2) 0.84564 0.10369 8.156
##
## Correlation of Fixed Effects:
## (Intr) scl(mdn_d_1) scl(v__1) s(N_C_ sc(_1) scl(_)
## scl(mdn_d_1) 0.045
## scl(vr_d_1) -0.105 -0.459
## s(N_C_P_N_T 0.044 0.153 -0.072
## scale(g_t1) 0.201 0.153 -0.120 0.034
## scal(g_dff) 0.144 0.120 -0.071 0.025 0.884
## scl(lg_n__1) 0.048 -0.021 0.056 -0.228 0.064 0.118
## scl(prp__1) 0.034 -0.041 0.214 -0.371 0.166 0.133
## scl(ml_m_1) -0.060 -0.227 0.147 0.026 -0.066 -0.096
## scl(mdn_f_1) 0.049 -0.128 -0.169 0.022 -0.129 -0.151
## scl(lg_t__1) 0.077 0.252 -0.223 0.148 0.210 0.168
## scl(lg___2) -0.349 -0.200 0.221 -0.126 -0.466 -0.324
## scl(lg_n__1) scl(p__1) scl(ml__1) scl(mdn_f_1) scl(lg_t__1)
## scl(mdn_d_1)
## scl(vr_d_1)
## s(N_C_P_N_T
## scale(g_t1)
## scal(g_dff)
## scl(lg_n__1)
## scl(prp__1) 0.227
## scl(ml_m_1) -0.426 -0.082
## scl(mdn_f_1) -0.234 0.298 -0.154
## scl(lg_t__1) -0.683 -0.087 -0.008 0.257
## scl(lg___2) 0.067 -0.093 0.056 0.051 -0.594
get_vocab_measure_by_kid2 <- function(id, data, model){
this_kids_model <- model %>%
filter(target_word %in% data$gloss_clean)
words_in_model <- data %>%
filter(gloss_clean %in% this_kids_model$target_word)
# get pairwise distances
word_word_dists <- coop::cosine(t(this_kids_model[,-1]))
wide_word_word_dists <- word_word_dists %>%
as.data.frame() %>%
mutate(word1 = words_in_model$gloss_clean) %>%
select(word1, everything())
names(wide_word_word_dists) = c("word1", words_in_model$gloss_clean)
long_word_word_dists_wiki <- wide_word_word_dists %>%
gather("word2", "cos_dist_wiki", -word1) %>%
select(word1, word2, everything())
# merge in word counts and frequency infor for each word
word_dists_with_meta <- long_word_word_dists_wiki %>%
left_join(data %>% select(-log_count_w2, -log_freq_w2),
by = c("word1" = "gloss_clean" )) %>%
left_join(data %>% select(-log_count_w1, -log_freq_w1),
by = c("word2" = "gloss_clean")) %>%
rowwise() %>%
mutate(mean_log_count = mean(c(log_count_w1, log_count_w2), na.rm = T),
mean_log_freq = mean(c(log_freq_w1, log_freq_w2), na.rm = T)) %>%
select(word1, word2, cos_dist_wiki, mean_log_count, mean_log_freq)
try({
model <- lm(cos_dist_wiki~ mean_log_count + mean_log_freq,
data = word_dists_with_meta)
dists_with_resids <- word_dists_with_meta %>%
add_residuals(model, var = "resid_cos_dist_wiki") %>%
select(-mean_log_count, -mean_log_freq) %>%
select(-word2) %>%
group_by(word1) %>%
summarize_all(mean, na.rm = T)
df_return <- data.frame(target_child_id = id,
mean_dist_t1 = mean(dists_with_resids$cos_dist_wiki),
median_dist_t1 = median(dists_with_resids$cos_dist_wiki),
var_dist_t1 = ifelse(mean(var(dists_with_resids$cos_dist_wiki)) == 0, NA,
mean(var(dists_with_resids$cos_dist_wiki))),
mean_r_dist_t1 = mean(dists_with_resids$resid_cos_dist_wiki),
median_r_dist_t1 = median(dists_with_resids$resid_cos_dist_wiki),
var_r_dist_t1 = ifelse(mean(var(dists_with_resids$resid_cos_dist_wiki)) == 0, NA,
mean(var(dists_with_resids$resid_cos_dist_wiki))),
n_t1 = nrow(dists_with_resids))
}, {
dists_with_resids <- word_dists_with_meta %>%
select(-mean_log_count, -mean_log_freq) %>%
select(-word2) %>%
group_by(word1) %>%
summarize_all(mean, na.rm = T)
df_return <- data.frame(target_child_id = id,
mean_dist_t1 = mean(dists_with_resids$cos_dist_wiki),
median_dist_t1 = median(dists_with_resids$cos_dist_wiki),
var_dist_t1 = ifelse(mean(var(dists_with_resids$cos_dist_wiki)) == 0, NA,
mean(var(dists_with_resids$cos_dist_wiki))),
mean_r_dist_t1 = NA,
median_r_dist_t1 = NA,
var_r_dist_t1 = NA,
n_t1 = nrow(dists_with_resids))
})
df_return
}
get_vocab_measure_by_kid1 <- function(id, data, model){
this_kids_model <- model %>%
filter(target_word %in% data$gloss_clean)
words_in_model <- data %>%
filter(gloss_clean %in% this_kids_model$target_word)
# get pairwise distances
word_word_dists <- coop::cosine(t(this_kids_model[,-1])) %>%
rowMeans()
data.frame(target_child_id = id,
mean_dist_t1 = mean(word_word_dists),
median_dist_t1 = median(word_word_dists),
var_dist_t1 = ifelse(mean(var(word_word_dists)) == 0, NA,
mean(var(word_word_dists))),
n_t1 = length(word_word_dists),
median_freq = median(words_in_model$log_freq_w1, na.rm = T))
}
vocab_df <- vocab_measures %>%
mutate_at(vars(mean_dist_t1, median_dist_t1, var_dist_t1,
mean_r_dist_t1, median_r_dist_t1, var_r_dist_t1, n_t1),
function(x) {log(x + 10)}) %>%
left_join(kid_info %>% select(target_child_id, log_mtld_t1,
log_mtld_t2, age_t1, age_t2, mtld_diff, age_diff,
mean_log_word_freq_t1, log_transcript_length_t1, log_transcript_length_t2, log_num_trigrams_t2,
log_num_trigrams_t1)) %>%
mutate_if(is.numeric, scale) # scale everything for regressions
vocab_df %>%
select(-target_child_id) %>%
gather("var", "value") %>%
ggplot(aes( x= value)) +
geom_histogram(scales = "free") +
facet_wrap(~var)