library(knitr)
# knitr chunk defaults for the whole document.
opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE,
               error = FALSE, cache = TRUE, tidy = FALSE)
library(tidyverse)
library(langcog)
library(feather)
library(lme4)
theme_set(theme_classic(base_size = 10))

# Item key, with item ids stored as character.
item_key <- read_csv("data/item_key.csv") %>%
  mutate(num_item_id = as.character(num_item_id))

# Category for each item id. Columns are selected by name rather than by
# position so a change in the CSV column order cannot break the script.
item_data <- read_csv("data/item_data.csv") %>%
  select(num_item_id, category) %>%
  mutate(num_item_id = as.character(num_item_id))

# Hypernym score per uni_lemma ("feet" excluded), with the item id and
# category attached; character columns are converted to factors.
word_bank_hyper <- read_csv("data/wordbank_hypernyms.csv") %>%
  filter(uni_lemma != "feet") %>%
  select(uni_lemma, hypernyms) %>%
  left_join(item_key %>% select(uni_lemma, num_item_id), by = "uni_lemma") %>%
  left_join(item_data, by = "num_item_id") %>%
  select(num_item_id, uni_lemma, category, hypernyms) %>%
  mutate_if(is.character, as.factor) %>%
  ungroup()

# Normalize hypernyms by category
# Z-score hypernym counts within each category.
word_bank_hyper_norm <- word_bank_hyper %>%
  group_by(category) %>%
  # scale() returns a one-column matrix; as.numeric() stores a plain vector.
  mutate(hypernyms_scaled_cat = as.numeric(scale(hypernyms))) %>%
  mutate_if(is.character, as.factor) %>%
  ungroup()

# Histogram of category-scaled hypernym scores, one panel per category.
word_bank_hyper_norm %>%
  ggplot(aes(x = hypernyms_scaled_cat, fill = category)) +
  geom_histogram(binwidth = 1) +
  facet_wrap(~ category) +
  theme(legend.position = "none")

# Histogram of raw hypernym scores, one panel per category.
word_bank_hyper_norm %>%
  ggplot(aes(x = hypernyms, fill = category)) +
  geom_histogram(binwidth = 1) +
  facet_wrap(~ category) +
  theme(legend.position = "none")

# Normalize hypernyms by POS
# NOTE(review): absolute, machine-specific path; this will not resolve on
# another machine as written.
POS <- "/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/SUBTLEX-US\ frequency\ list\ with\ PoS\ information\ text\ version.txt"

# Word and its dominant part of speech.
pos_data <- read_tsv(POS) %>%
  select(Word, Dom_PoS_SUBTLEX) %>%
  rename(pos_dom = Dom_PoS_SUBTLEX,
         word = Word)

# Z-score hypernym counts within coarse part-of-speech classes:
# "n" (Noun), "v" (Verb), "o" (anything else, including unmatched words).
word_bank_hyper_norm_pos <- word_bank_hyper %>%
  # Keep the text before the first "(" and trim it. str_extract() is
  # vectorised, so no rowwise() pass is needed.
  mutate(item_clean = str_trim(str_extract(as.character(uni_lemma),
                                           "^[^(]*"))) %>%
  left_join(pos_data, by = c("item_clean" = "word")) %>%
  mutate(pos_cat = case_when(pos_dom == "Noun" ~ "n",
                             pos_dom == "Verb" ~ "v",
                             TRUE ~ "o"),
         pos_cat = as.factor(pos_cat)) %>%
  group_by(pos_cat) %>%
  # scale() returns a one-column matrix; as.numeric() stores a plain vector.
  mutate(hypernyms_scaled_pos = as.numeric(scale(hypernyms))) %>%
  mutate_if(is.character, as.factor) %>%
  ungroup()
# Histogram of POS-scaled hypernym scores, one panel per POS class.
word_bank_hyper_norm_pos %>%
  ggplot(aes(x = hypernyms_scaled_pos, fill = pos_cat)) +
  geom_histogram(binwidth = 1) +
  facet_wrap(~ pos_cat) +
  theme(legend.position = "none")

# Longitudinal MCDI data: the train and test samples are stacked, unused
# columns are dropped, and rows are ordered by child and session.
mcdi_path <- "../7_mcdi/data/train_sample_longitud_mcdi.csv"
mcdi_path2 <- "../7_mcdi/data/test_sample_longitud_mcdi.csv"
cdi_data <- read_csv(mcdi_path) %>%
  bind_rows(read_csv(mcdi_path2)) %>%
  select(-study_id, -study, -birthday, -session_date, -total_num_sessions,
         -num_langs, -hard_of_hearing, -mcdi_type, -languages,
         -extra_categories) %>%
  arrange(child_id, session_num)
# get produced words by kid
# Rows with value > 0 are kept, the item id is attached, and character
# columns become factors. item_clean is the item text before the first "(",
# trimmed.
produced_words <- cdi_data %>%
  filter(value > 0) %>%
  select(-value) %>%
  left_join(item_key %>% select(item, num_item_id), by = "item") %>%
  mutate_if(is.character, as.factor) %>%
  # Vectorised, so the result is an ordinary (non-rowwise) data frame and
  # downstream distinct()/count()/summarize() calls see no row-wise grouping.
  mutate(item_clean = str_trim(str_extract(as.character(item), "^[^(]*")))

# Get hypernym/frequency by kid
# get mean hypernyms score by kid
# One row per child and session with the mean raw, category-scaled and
# POS-scaled hypernym scores and the mean log frequency of produced words.
# NOTE(review): `freqs` is not defined in the visible part of this file;
# presumably it supplies `word` and `log_freq` -- confirm where it is built.
hypernyms_score_by_kid <- produced_words %>%
  left_join(freqs, by = c("item_clean" = "word")) %>%
  left_join(word_bank_hyper_norm, by = "num_item_id") %>%
  left_join(word_bank_hyper_norm_pos %>%
              select("num_item_id", "hypernyms_scaled_pos"),
            by = "num_item_id") %>%
  group_by(child_id, session_num) %>%
  summarize(mean_hypernyms_cat_scaled = mean(hypernyms_scaled_cat, na.rm = TRUE),
            mean_hypernyms_pos_scaled = mean(hypernyms_scaled_pos, na.rm = TRUE),
            mean_hypernyms = mean(hypernyms, na.rm = TRUE),
            mean_freq = mean(log_freq, na.rm = TRUE))

# Get time point data
# timepoint data
# One row per child and session, plus the age, percentile and vocabulary size
# at that child's next session and the change between the two sessions.
demographic_data <- cdi_data %>%
  select(-item, -value) %>%
  distinct(child_id, session_num, .keep_all = TRUE) %>%
  group_by(child_id) %>%
  # order_by makes "next session" explicit instead of relying on row order.
  # (A bare `by = "session_num"` inside mutate() only creates a column named
  # `by`; it does not order anything.)
  mutate(subsequent_age = lead(age, order_by = session_num),
         subsequent_percentile = lead(percentile, order_by = session_num),
         subsequent_words_spoken = lead(words_spoken, order_by = session_num),
         delta_age = subsequent_age - age,
         delta_percentile = subsequent_percentile - percentile,
         delta_words_spoken = subsequent_words_spoken - words_spoken) %>%
  ungroup()
# join together
# Keys are spelled out so the join cannot silently pick up extra shared columns.
full_df <- hypernyms_score_by_kid %>%
  left_join(demographic_data, by = c("child_id", "session_num"))

# Change in words spoken as a function of mean raw hypernym score, age change
# and current vocabulary, with a random intercept and session_num slope per
# child.
lmer(delta_words_spoken ~ mean_hypernyms + delta_age + words_spoken +
       (session_num | child_id), full_df) %>%
  summary()
## Linear mixed model fit by REML ['lmerMod']
## Formula: delta_words_spoken ~ mean_hypernyms + delta_age + words_spoken +
## (session_num | child_id)
## Data: full_df
##
## REML criterion at convergence: 14641.5
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -7.4376 -0.4704 -0.0910 0.3027 8.4565
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## child_id (Intercept) 832.72 28.857
## session_num 22.19 4.711 -0.99
## Residual 1710.75 41.361
## Number of obs: 1410, groups: child_id, 224
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 172.622829 18.777273 9.193
## mean_hypernyms -16.322634 1.943406 -8.399
## delta_age 28.573616 1.434408 19.920
## words_spoken -0.065999 0.009487 -6.957
##
## Correlation of Fixed Effects:
## (Intr) mn_hyp delt_g
## mn_hyprnyms -0.989
## delta_age -0.106 0.018
## words_spokn -0.788 0.726 0.030
# Change in words spoken as a function of mean raw hypernym score, age change,
# mean frequency and current vocabulary; random intercept and session_num
# slope per child.
summary(
  lmer(delta_words_spoken ~ mean_hypernyms + delta_age + mean_freq +
         words_spoken + (session_num | child_id),
       data = full_df)
)
## Linear mixed model fit by REML ['lmerMod']
## Formula: delta_words_spoken ~ mean_hypernyms + delta_age + mean_freq +
## words_spoken + (session_num | child_id)
## Data: full_df
##
## REML criterion at convergence: 14565.1
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -7.6865 -0.4549 -0.1072 0.3253 8.2787
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## child_id (Intercept) 655.13 25.596
## session_num 16.71 4.087 -1.00
## Residual 1659.17 40.733
## Number of obs: 1410, groups: child_id, 224
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 436.144058 35.317034 12.349
## mean_hypernyms -13.549667 1.903433 -7.119
## delta_age 28.335029 1.397449 20.276
## mean_freq -35.904194 4.158070 -8.635
## words_spoken -0.096033 0.009697 -9.903
##
## Correlation of Fixed Effects:
## (Intr) mn_hyp delt_g mn_frq
## mn_hyprnyms -0.343
## delta_age -0.081 0.012
## mean_freq -0.859 -0.180 0.031
## words_spokn -0.684 0.608 0.038 0.355
# Change in words spoken as a function of mean category-scaled hypernym score,
# age change and current vocabulary; random intercept and session_num slope
# per child.
summary(
  lmer(delta_words_spoken ~ mean_hypernyms_cat_scaled + delta_age +
         words_spoken + (session_num | child_id),
       data = full_df)
)
## Linear mixed model fit by REML ['lmerMod']
## Formula: delta_words_spoken ~ mean_hypernyms_cat_scaled + delta_age +
## words_spoken + (session_num | child_id)
## Data: full_df
##
## REML criterion at convergence: 14623
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -7.5325 -0.4715 -0.1077 0.3273 8.2366
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## child_id (Intercept) 891.26 29.854
## session_num 24.63 4.963 -1.00
## Residual 1689.16 41.099
## Number of obs: 1410, groups: child_id, 224
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 31.454280 3.194012 9.848
## mean_hypernyms_cat_scaled -65.536412 7.133650 -9.187
## delta_age 28.390982 1.430550 19.846
## words_spoken -0.039919 0.007373 -5.414
##
## Correlation of Fixed Effects:
## (Intr) mn_h__ delt_g
## mn_hyprny__ -0.514
## delta_age -0.528 0.028
## words_spokn -0.768 0.485 0.034
## convergence code: 0
## boundary (singular) fit: see ?isSingular
# Change in words spoken as a function of mean category-scaled hypernym score,
# age change, mean frequency and current vocabulary; random intercept and
# session_num slope per child.
summary(
  lmer(delta_words_spoken ~ mean_hypernyms_cat_scaled + delta_age +
         mean_freq + words_spoken + (session_num | child_id),
       data = full_df)
)
## Linear mixed model fit by REML ['lmerMod']
## Formula: delta_words_spoken ~ mean_hypernyms_cat_scaled + delta_age +
## mean_freq + words_spoken + (session_num | child_id)
## Data: full_df
##
## REML criterion at convergence: 14569.9
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -7.7032 -0.4409 -0.1124 0.3313 8.0944
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## child_id (Intercept) 758.81 27.546
## session_num 20.27 4.502 -1.00
## Residual 1650.42 40.625
## Number of obs: 1410, groups: child_id, 224
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 277.045967 34.984817 7.919
## mean_hypernyms_cat_scaled -47.820527 7.436007 -6.431
## delta_age 28.226577 1.404331 20.100
## mean_freq -30.874200 4.382563 -7.045
## words_spoken -0.066156 0.008054 -8.214
##
## Correlation of Fixed Effects:
## (Intr) mn_h__ delt_g mn_frq
## mn_hyprny__ 0.303
## delta_age -0.072 0.017
## mean_freq -0.996 -0.348 0.025
## words_spokn -0.512 0.249 0.042 0.453
## convergence code: 0
## boundary (singular) fit: see ?isSingular
# Change in words spoken as a function of mean POS-scaled hypernym score, age
# change and current vocabulary; random intercept and session_num slope per
# child.
summary(
  lmer(delta_words_spoken ~ mean_hypernyms_pos_scaled + delta_age +
         words_spoken + (session_num | child_id),
       data = full_df)
)
## Linear mixed model fit by REML ['lmerMod']
## Formula: delta_words_spoken ~ mean_hypernyms_pos_scaled + delta_age +
## words_spoken + (session_num | child_id)
## Data: full_df
##
## REML criterion at convergence: 14626
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -7.4953 -0.4853 -0.0978 0.3216 8.2689
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## child_id (Intercept) 786.79 28.050
## session_num 21.78 4.667 -1.00
## Residual 1709.44 41.345
## Number of obs: 1410, groups: child_id, 224
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 37.87321 3.62096 10.459
## mean_hypernyms_pos_scaled -60.18991 6.58832 -9.136
## delta_age 28.76300 1.42817 20.140
## words_spoken -0.05081 0.00804 -6.320
##
## Correlation of Fixed Effects:
## (Intr) mn_h__ delt_g
## mn_hyprny__ -0.659
## delta_age -0.456 0.007
## words_spokn -0.811 0.605 0.023
## convergence code: 0
## boundary (singular) fit: see ?isSingular
# Change in words spoken as a function of mean POS-scaled hypernym score, age
# change, mean frequency and current vocabulary; random intercept and
# session_num slope per child.
summary(
  lmer(delta_words_spoken ~ mean_hypernyms_pos_scaled + delta_age +
         mean_freq + words_spoken + (session_num | child_id),
       data = full_df)
)
## Linear mixed model fit by REML ['lmerMod']
## Formula: delta_words_spoken ~ mean_hypernyms_pos_scaled + delta_age +
## mean_freq + words_spoken + (session_num | child_id)
## Data: full_df
##
## REML criterion at convergence: 14566.1
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -7.6817 -0.4663 -0.1044 0.3179 8.1152
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## child_id (Intercept) 661.21 25.71
## session_num 17.64 4.20 -1.00
## Residual 1661.73 40.76
## Number of obs: 1410, groups: child_id, 224
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 293.515790 34.060085 8.618
## mean_hypernyms_pos_scaled -45.915045 6.741945 -6.810
## delta_age 28.501014 1.398737 20.376
## mean_freq -32.253621 4.279563 -7.537
## words_spoken -0.076954 0.008499 -9.054
##
## Correlation of Fixed Effects:
## (Intr) mn_h__ delt_g mn_frq
## mn_hyprny__ 0.232
## delta_age -0.080 -0.003
## mean_freq -0.995 -0.299 0.033
## words_spokn -0.468 0.415 0.034 0.393
## convergence code: 0
## boundary (singular) fit: see ?isSingular
# NOTE(review): both CSV paths below are absolute and machine-specific.

# t value per item id.
t_values <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/9_t_value_interpolation/data/word_coeffs_cdi_24_30.csv") %>%
  left_join(item_key, by = c("word" = "item")) %>%
  select(num_item_id, t) %>%
  mutate(num_item_id = as.factor(num_item_id))

# Log frequency per item id; rows with no matching item id are dropped.
# `word` on the item_key side is the uni_lemma text before the first "(",
# trimmed (vectorised, so no rowwise() pass is needed).
# NOTE(review): this join has no explicit `by`; presumably `word` is the only
# shared column -- confirm against the CSV header and pin it.
freq <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/childes_adult_word_freq.csv") %>%
  left_join(item_key %>%
              mutate(word = str_trim(str_extract(uni_lemma, "^[^(]*")))) %>%
  filter(!is.na(num_item_id)) %>%
  select(num_item_id, log_freq) %>%
  mutate(num_item_id = as.factor(num_item_id))

# Word-level table: hypernym scores (raw, category-scaled, POS-scaled),
# t value and log frequency, all keyed on num_item_id.
word_coeffs_min5_t2_with_vars_childes <- word_bank_hyper_norm %>%
  left_join(word_bank_hyper_norm_pos %>%
              select("num_item_id", "hypernyms_scaled_pos"),
            by = "num_item_id") %>%
  left_join(t_values, by = "num_item_id") %>%
  left_join(freq, by = "num_item_id")

# t against raw hypernym score, with a linear fit.
word_coeffs_min5_t2_with_vars_childes %>%
  ggplot(aes(x = hypernyms, y = t)) +
  geom_point() +
  geom_smooth(method = "lm")

# Restored call: the correlation output that follows names these two columns.
cor.test(word_coeffs_min5_t2_with_vars_childes$t,
         word_coeffs_min5_t2_with_vars_childes$hypernyms)
##
## Pearson's product-moment correlation
##
## data: word_coeffs_min5_t2_with_vars_childes$t and word_coeffs_min5_t2_with_vars_childes$hypernyms
## t = -1.0453, df = 447, p-value = 0.2965
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1412745 0.0433610
## sample estimates:
## cor
## -0.04937859
# t against category-scaled hypernym score, with a linear fit.
word_coeffs_min5_t2_with_vars_childes %>%
  ggplot(aes(x = hypernyms_scaled_cat, y = t)) +
  geom_point() +
  geom_smooth(method = "lm")

# Pearson correlation between t and the category-scaled hypernym score.
cor.test(word_coeffs_min5_t2_with_vars_childes$t,
         word_coeffs_min5_t2_with_vars_childes$hypernyms_scaled_cat)
##
## Pearson's product-moment correlation
##
## data: word_coeffs_min5_t2_with_vars_childes$t and word_coeffs_min5_t2_with_vars_childes$hypernyms_scaled_cat
## t = -0.38411, df = 443, p-value = 0.7011
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.11101496 0.07483735
## sample estimates:
## cor
## -0.01824642
# t against POS-scaled hypernym score, with a linear fit.
word_coeffs_min5_t2_with_vars_childes %>%
  ggplot(aes(x = hypernyms_scaled_pos, y = t)) +
  geom_point() +
  geom_smooth(method = "lm")

# Pearson correlation between t and the POS-scaled hypernym score.
cor.test(word_coeffs_min5_t2_with_vars_childes$t,
         word_coeffs_min5_t2_with_vars_childes$hypernyms_scaled_pos)
##
## Pearson's product-moment correlation
##
## data: word_coeffs_min5_t2_with_vars_childes$t and word_coeffs_min5_t2_with_vars_childes$hypernyms_scaled_pos
## t = -2.8264, df = 447, p-value = 0.004918
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.22232073 -0.04046012
## sample estimates:
## cor
## -0.1325054
# Get t by kid
# NOTE(review): absolute, machine-specific path.
cdi_coefficients <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/9_t_value_interpolation/data/word_coeffs_cdi_24_30.csv")

# One row per child and session: mean t of the produced words and mean
# words_spoken, joined to the per-session hypernym/frequency means.
by_kid_data <- produced_words %>%
  left_join(cdi_coefficients, by = c("item_clean" = "word")) %>%
  group_by(child_id, session_num) %>%
  summarize(mean_t_t1 = mean(t, na.rm = TRUE),
            words_spoken = mean(words_spoken)) %>%
  left_join(hypernyms_score_by_kid, by = c("child_id", "session_num"))
# Label each produced-word row by age window (t1: ages 23-25, t2: ages 29-31)
# and keep only rows that fall in one of the two windows.
tidy_words <- produced_words %>%
  mutate(tbin = case_when(age >= 23 & age <= 25 ~ "t1",
                          age >= 29 & age <= 31 ~ "t2",
                          TRUE ~ "other")) %>%
  filter(tbin != "other")

# Children that have data in both windows.
good_kids <- tidy_words %>%
  distinct(tbin, child_id) %>%
  count(child_id) %>%
  filter(n == 2) %>%
  pull(child_id)

# Mean percentile per child in each window, spread wide, plus the
# t2 - t1 difference.
word_data <- tidy_words %>%
  filter(child_id %in% good_kids) %>%
  group_by(child_id, tbin) %>%
  summarize(percentile = mean(percentile)) %>%
  spread(tbin, percentile) %>%
  rename(perc_t1 = t1, perc_t2 = t2) %>%
  mutate(perc_diff = perc_t2 - perc_t1)

# Mean age per child in each window, spread wide, plus the t2 - t1 difference.
age_data <- tidy_words %>%
  filter(child_id %in% good_kids) %>%
  group_by(child_id, tbin) %>%
  summarize(age = mean(age)) %>%
  spread(tbin, age) %>%
  rename(age_t1 = t1, age_t2 = t2) %>%
  mutate(age_diff = age_t2 - age_t1)
# Percentile change (t2 - t1) as a function of mean hypernym score, mean t,
# age difference and t1 percentile, with a random intercept per child.
# NOTE(review): by_kid_data has one row per child and session, so joining on
# child_id alone repeats each child's perc_diff across all of that child's
# sessions (599 observations for 59 children in the output below, which ends
# with "convergence code: 3"). Confirm whether by_kid_data should first be
# restricted to each child's t1 session.
word_data %>%
  left_join(age_data, by = "child_id") %>%
  left_join(by_kid_data, by = "child_id") %>%
  ungroup() %>%
  mutate(child_id = as.factor(child_id)) %>%
  lme4::lmer(scale(perc_diff) ~ scale(mean_hypernyms) + scale(mean_t_t1) +
               scale(age_diff) + scale(perc_t1) + (1 | child_id),
             data = .) %>%
  summary()
## Linear mixed model fit by REML ['lmerMod']
## Formula: scale(perc_diff) ~ scale(mean_hypernyms) + scale(mean_t_t1) +
## scale(age_diff) + scale(perc_t1) + (1 | child_id)
## Data: .
##
## REML criterion at convergence: -10378.9
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -7.777e-05 -1.013e-05 9.740e-07 8.215e-06 7.713e-05
##
## Random effects:
## Groups Name Variance Std.Dev.
## child_id (Intercept) 8.197e-02 0.2863106
## Residual 1.769e-10 0.0000133
## Number of obs: 599, groups: child_id, 59
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) -3.718e-02 3.731e-02 -0.996
## scale(mean_hypernyms) -5.719e-11 7.205e-07 0.000
## scale(mean_t_t1) 1.406e-10 7.342e-07 0.000
## scale(age_diff) 2.800e-02 3.634e-02 0.771
## scale(perc_t1) -3.716e-01 3.724e-02 -9.980
##
## Correlation of Fixed Effects:
## (Intr) scl(m_) s(__1) scl(g_)
## scl(mn_hyp) 0.000
## scl(mn_t_1) 0.000 -0.318
## scal(g_dff) -0.038 0.000 0.000
## scl(prc_t1) 0.025 0.000 0.000 0.014
## convergence code: 3
# Same predictors in an ordinary linear model (no random effect).
# NOTE(review): by_kid_data has one row per child and session, so joining on
# child_id alone repeats each child's perc_diff once per session; the rows
# are not independent observations. Confirm whether by_kid_data should first
# be restricted to each child's t1 session.
left_join(word_data, age_data, by = "child_id") %>%
  left_join(by_kid_data, by = "child_id") %>%
  lm(scale(perc_diff) ~ scale(mean_hypernyms) + scale(mean_t_t1) +
       scale(age_diff) + scale(perc_t1), data = .) %>%
  summary()
##
## Call:
## lm(formula = scale(perc_diff) ~ scale(mean_hypernyms) + scale(mean_t_t1) +
## scale(age_diff) + scale(perc_t1), data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.56806 -0.57689 0.03646 0.45645 2.64890
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.002566 0.034228 -0.075 0.94027
## scale(mean_hypernyms) -0.126250 0.039102 -3.229 0.00131 **
## scale(mean_t_t1) 0.389288 0.036342 10.712 < 2e-16 ***
## scale(age_diff) 0.037901 0.034272 1.106 0.26923
## scale(perc_t1) -0.374138 0.038931 -9.610 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8377 on 594 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.3052, Adjusted R-squared: 0.3005
## F-statistic: 65.22 on 4 and 594 DF, p-value: < 2.2e-16