library(knitr)
opts_chunk$set(echo = T, message = F, warning = F,
error = F, cache = F, tidy = F)
library(tidyverse)
library(langcog)
library(feather)
library(broom)
library(lme4)
theme_set(theme_classic(base_size = 10))item_key <- read_csv("data/item_key.csv") %>%
mutate(num_item_id = as.character(num_item_id))
item_data <- read_csv("data/item_data.csv") %>%
select(1,4) %>%
select(num_item_id, category) %>%
mutate(num_item_id = as.character(num_item_id))
word_bank_hyper <- read_csv("data/wordbank_hypernyms.csv") %>%
filter(uni_lemma != "feet") %>%
select(uni_lemma, hypernyms) %>%
left_join(item_key %>% select(uni_lemma, num_item_id), by = "uni_lemma") %>%
left_join(item_data, by = "num_item_id") %>%
select(num_item_id, uni_lemma, category, hypernyms) %>%
mutate_if(is.character, as.factor) %>%
ungroup()mcdi_path <- "../7_mcdi/data/train_sample_longitud_mcdi.csv"
mcdi_path2 <- "../7_mcdi/data/test_sample_longitud_mcdi.csv"
cdi_data <- read_csv(mcdi_path) %>%
bind_rows(read_csv(mcdi_path2)) %>%
select(-study_id, -study, -birthday, -session_date, -total_num_sessions,
-num_langs, -hard_of_hearing, -mcdi_type, -languages, -extra_categories, -gender) %>%
arrange(child_id, session_num)
# get produced words by kid
produced_words <- read_csv("data/produced_words_tidy.csv") %>%
mutate_if(is.character, as.factor) %>%
mutate(num_item_id = as.factor(num_item_id))
freqs <- read_csv("../3_kid_vocabs/data/childes_adult_word_freq.csv") %>%
select(word, log_freq)Get age bins. 1 month bins 16 - 32 months.
cdi_data %>%
distinct(child_id, age) %>%
mutate(age_bin = cut(age, breaks = seq(16, 32, 1))) %>%
ggplot(aes(x = age_bin)) +
geom_histogram(stat = "count")# get mean hypernyms score by kid
hypernyms_score_by_kid <- produced_words %>%
mutate(age_bin = cut(age, breaks = seq(16, 32, 1))) %>%
filter(!is.na(age_bin)) %>%
left_join(word_bank_hyper, by = "num_item_id") %>%
left_join(freqs, by = c("item_clean" = "word")) %>%
group_by(child_id, session_num, age_bin) %>%
summarize(mean_hypernyms = mean(hypernyms, na.rm = TRUE),
mean_freq = mean(log_freq, na.rm = T)) %>%
group_by(child_id, age_bin) %>% # deal with any cases where a kid has two sessions per age bin
summarize(mean_hypernyms = mean(mean_hypernyms, na.rm = TRUE),
mean_freq = mean(mean_freq, na.rm = T)) #Get time point data
# timepoint data
demographic_data <- cdi_data %>%
select(-item, -value) %>%
distinct(child_id, session_num, .keep_all = T) %>%
select(-session_num) %>%
mutate(age_bin = cut(age, breaks = seq(16, 32, 1))) %>%
filter(!is.na(age_bin)) %>%
group_by(child_id, age_bin) %>%
summarize_all(mean)
lag_session_words_spoken <- demographic_data %>%
mutate(one_subsequent_words_spoken = lead(words_spoken, 1), by = "age_bin",
two_subsequent_words_spoken = lead(words_spoken, 2), by = "age_bin",
three_subsequent_words_spoken = lead(words_spoken, 3), by = "age_bin",
four_subsequent_words_spoken = lead(words_spoken, 4), by = "age_bin",
five_subsequent_words_spoken = lead(words_spoken, 5), by = "age_bin",
one_delta = one_subsequent_words_spoken - words_spoken,
two_delta = two_subsequent_words_spoken - words_spoken,
three_delta = three_subsequent_words_spoken - words_spoken,
four_delta = four_subsequent_words_spoken - words_spoken,
five_delta = five_subsequent_words_spoken - words_spoken) %>%
select(child_id, age_bin, words_spoken, contains("delta")) %>%
gather("lag", "delta_words", -child_id, -age_bin, -words_spoken)
lag_session_age <- demographic_data %>%
mutate(one_subsequent_age = lead(age, 1), by = "age_bin",
two_subsequent_age = lead(age, 2), by = "age_bin",
three_subsequent_age = lead(age, 3), by = "age_bin",
four_subsequent_age = lead(age, 4), by = "age_bin",
five_subsequent_age = lead(age, 5), by = "age_bin",
one_delta = one_subsequent_age - age,
two_delta = two_subsequent_age - age,
three_delta = three_subsequent_age - age,
four_delta = four_subsequent_age - age,
five_delta = five_subsequent_age - age) %>%
select(child_id, age_bin, contains("delta")) %>%
gather("lag", "delta_age", -child_id, -age_bin)
outcome_age_bin_readable <- demographic_data %>%
ungroup()%>%
distinct(age_bin) %>%
rename(outcome_age_bin = age_bin) %>%
mutate(outcome_age_bin_numeric = as.numeric(outcome_age_bin))
# join together
full_df <- hypernyms_score_by_kid %>%
left_join(lag_session_words_spoken) %>%
left_join(lag_session_age) %>%
mutate(lag_num = case_when(lag == "one_delta" ~ 1,
lag == "two_delta" ~ 2,
lag == "three_delta" ~ 3,
lag == "four_delta" ~ 4,
lag == "five_delta" ~ 5),
predictor_age_bin_numeric = as.numeric(age_bin)) %>%
filter(!is.na(delta_words)) %>% # get rid of cases where delta session doesn't exisit
rename(predictor_age_bin = age_bin) %>%
mutate(outcome_age_bin_numeric = predictor_age_bin_numeric + lag_num) %>%
left_join(outcome_age_bin_readable, by = "outcome_age_bin_numeric") %>%
select(child_id, predictor_age_bin, predictor_age_bin_numeric, lag, lag_num, outcome_age_bin, outcome_age_bin_numeric, mean_hypernyms, mean_freq, words_spoken, delta_words, delta_age)The data:
## Observations: 3,410
## Variables: 12
## Groups: child_id [194]
## $ child_id <chr> "4139_LTP102", "4139_LTP102", "4139_LT…
## $ predictor_age_bin <fct> "(16,17]", "(16,17]", "(16,17]", "(16,…
## $ predictor_age_bin_numeric <dbl> 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 4, 4, 4,…
## $ lag <chr> "one_delta", "two_delta", "three_delta…
## $ lag_num <dbl> 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3,…
## $ outcome_age_bin <fct> "(17,18]", "(18,19]", "(19,20]", "(20,…
## $ outcome_age_bin_numeric <dbl> 2, 3, 4, 5, 6, 4, 5, 6, 7, 8, 5, 6, 7,…
## $ mean_hypernyms <dbl> 9.397260, 9.397260, 9.397260, 9.397260…
## $ mean_freq <dbl> 7.814661, 7.814661, 7.814661, 7.814661…
## $ words_spoken <dbl> 94, 94, 94, 94, 94, 153, 153, 153, 153…
## $ delta_words <dbl> 59, 73, 98, 143, 194, 14, 39, 84, 135,…
## $ delta_age <dbl> 1.1176857, 2.2024984, 3.1886917, 4.142…
lm(delta_words ~ mean_hypernyms + delta_age + mean_freq + words_spoken + predictor_age_bin_numeric,
data = full_df %>% filter(lag_num == 1)) %>%
summary()##
## Call:
## lm(formula = delta_words ~ mean_hypernyms + delta_age + mean_freq +
## words_spoken + predictor_age_bin_numeric, data = full_df %>%
## filter(lag_num == 1))
##
## Residuals:
## Min 1Q Median 3Q Max
## -310.78 -25.20 -5.98 16.58 399.21
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 501.58041 42.33756 11.847 < 2e-16 ***
## mean_hypernyms -14.94045 2.45741 -6.080 1.7e-09 ***
## delta_age 41.04791 2.39883 17.112 < 2e-16 ***
## mean_freq -43.68708 5.11556 -8.540 < 2e-16 ***
## words_spoken -0.12135 0.01381 -8.786 < 2e-16 ***
## predictor_age_bin_numeric 0.26137 0.64673 0.404 0.686
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 47.15 on 1022 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.3016, Adjusted R-squared: 0.2981
## F-statistic: 88.25 on 5 and 1022 DF, p-value: < 2.2e-16
lm(delta_words ~ mean_hypernyms + lag_num + delta_age + mean_freq + words_spoken + predictor_age_bin_numeric,
data = full_df) %>%
summary()##
## Call:
## lm(formula = delta_words ~ mean_hypernyms + lag_num + delta_age +
## mean_freq + words_spoken + predictor_age_bin_numeric, data = full_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -340.04 -45.26 -5.56 36.50 465.12
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 889.60733 37.75619 23.562 < 2e-16 ***
## mean_hypernyms -28.29170 2.20311 -12.842 < 2e-16 ***
## lag_num 8.08063 2.28280 3.540 0.000406 ***
## delta_age 38.62866 1.59895 24.159 < 2e-16 ***
## mean_freq -74.91100 4.52319 -16.562 < 2e-16 ***
## words_spoken -0.30367 0.01342 -22.626 < 2e-16 ***
## predictor_age_bin_numeric 2.16025 0.65342 3.306 0.000956 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 80.65 on 3397 degrees of freedom
## (6 observations deleted due to missingness)
## Multiple R-squared: 0.5691, Adjusted R-squared: 0.5684
## F-statistic: 747.8 on 6 and 3397 DF, p-value: < 2.2e-16
With lag interaction term
lm(delta_words ~ mean_hypernyms*lag_num + delta_age + mean_freq + words_spoken + predictor_age_bin_numeric,
data = full_df) %>%
summary()##
## Call:
## lm(formula = delta_words ~ mean_hypernyms * lag_num + delta_age +
## mean_freq + words_spoken + predictor_age_bin_numeric, data = full_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -351.44 -45.34 -5.08 36.32 468.28
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1025.36066 46.66692 21.972 < 2e-16 ***
## mean_hypernyms -42.60266 3.64531 -11.687 < 2e-16 ***
## lag_num -38.58993 9.75858 -3.954 7.83e-05 ***
## delta_age 38.33434 1.59464 24.039 < 2e-16 ***
## mean_freq -76.04209 4.51369 -16.847 < 2e-16 ***
## words_spoken -0.31041 0.01345 -23.086 < 2e-16 ***
## predictor_age_bin_numeric 2.03122 0.65173 3.117 0.00184 **
## mean_hypernyms:lag_num 5.37255 1.09242 4.918 9.16e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 80.38 on 3396 degrees of freedom
## (6 observations deleted due to missingness)
## Multiple R-squared: 0.5722, Adjusted R-squared: 0.5713
## F-statistic: 648.8 on 7 and 3396 DF, p-value: < 2.2e-16
These are from fitting a separate model for each predictor age x outocme age combo.
by_session_lag_hypernym_beta <- full_df %>%
# mutate_at(vars("mean_hypernyms", "mean_freq", "words_spoken", "delta_age", "delta_words"), scale) %>%
group_by(predictor_age_bin, outcome_age_bin, lag_num) %>%
nest() %>%
mutate(temp = map(data, ~ tidy(lm(delta_words ~ mean_hypernyms + delta_age +
mean_freq + words_spoken, data = .x))),
num_obs = map_dbl(data, nrow)) %>%
select(-data) %>%
unnest() %>%
drop_na() %>%
filter(term == "mean_hypernyms") %>%
mutate(is_significant = p.value < .05)
ggplot(by_session_lag_hypernym_beta,
aes(x = predictor_age_bin,
y = estimate,
group = outcome_age_bin,
shape = is_significant)) +
geom_hline(aes(yintercept = 0), lty = 2, color = "grey") +
geom_line() +
ylab("estimate of effect of mean hyperyms on delta words") +
geom_point(size = 2) +
facet_wrap(~outcome_age_bin) +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
legend.position = "bottom")ggplot(by_session_lag_hypernym_beta,
aes(x = outcome_age_bin,
y = estimate,
group = lag_num,
shape = is_significant)) +
geom_hline(aes(yintercept = 0), lty = 2, color = "grey") +
facet_wrap(~lag_num) +
geom_line() +
ylab("estimate of effect of mean hyperyms on delta words") +
geom_point(aes(color = as.factor(lag_num)), size = 2) +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
legend.position = "bottom")lmer_model <-
lmer(delta_words ~ mean_hypernyms + delta_age + mean_freq + words_spoken + predictor_age_bin_numeric+ (1|child_id), full_df %>% filter(lag_num == 1))
summary(lmer_model)## Linear mixed model fit by REML ['lmerMod']
## Formula:
## delta_words ~ mean_hypernyms + delta_age + mean_freq + words_spoken +
## predictor_age_bin_numeric + (1 | child_id)
## Data: full_df %>% filter(lag_num == 1)
##
## REML criterion at convergence: 10826
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -6.5876 -0.5270 -0.1242 0.3521 8.4408
##
## Random effects:
## Groups Name Variance Std.Dev.
## child_id (Intercept) 10.22 3.196
## Residual 2213.34 47.046
## Number of obs: 1028, groups: child_id, 194
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 505.2929 42.5572 11.873
## mean_hypernyms -15.0373 2.4698 -6.088
## delta_age 41.0234 2.3988 17.101
## mean_freq -44.0367 5.1527 -8.546
## words_spoken -0.1228 0.0139 -8.836
## predictor_age_bin_numeric 0.3018 0.6505 0.464
##
## Correlation of Fixed Effects:
## (Intr) mn_hyp delt_g mn_frq wrds_s
## mn_hyprnyms -0.311
## delta_age -0.078 -0.008
## mean_freq -0.837 -0.251 0.016
## words_spokn -0.590 0.512 -0.013 0.319
## prdctr_g_b_ 0.000 0.043 -0.042 -0.080 -0.503
lmer_model <-
lmer(delta_words ~ mean_hypernyms+lag_num + delta_age + mean_freq + words_spoken + predictor_age_bin_numeric + (1|child_id), full_df)
summary(lmer_model)## Linear mixed model fit by REML ['lmerMod']
## Formula: delta_words ~ mean_hypernyms + lag_num + delta_age + mean_freq +
## words_spoken + predictor_age_bin_numeric + (1 | child_id)
## Data: full_df
##
## REML criterion at convergence: 37927.2
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.4979 -0.5075 0.0018 0.5173 5.1389
##
## Random effects:
## Groups Name Variance Std.Dev.
## child_id (Intercept) 9875 99.38
## Residual 3323 57.65
## Number of obs: 3404, groups: child_id, 194
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 794.12344 44.64368 17.788
## mean_hypernyms -29.29887 2.39281 -12.245
## lag_num 14.00618 2.03003 6.899
## delta_age 34.34593 1.45091 23.672
## mean_freq -65.73870 5.61926 -11.699
## words_spoken -0.80073 0.01761 -45.463
## predictor_age_bin_numeric 26.33288 0.91816 28.680
##
## Correlation of Fixed Effects:
## (Intr) mn_hyp lag_nm delt_g mn_frq wrds_s
## mn_hyprnyms -0.194
## lag_num -0.015 0.001
## delta_age -0.013 0.012 -0.918
## mean_freq -0.851 -0.321 -0.012 0.012
## words_spokn -0.128 0.210 0.026 -0.019 0.034
## prdctr_g_b_ -0.261 0.175 0.073 -0.019 0.124 -0.771
with lag interaction term:
lmer_model <-
lmer(delta_words ~ mean_hypernyms*lag_num + delta_age + mean_freq + words_spoken + predictor_age_bin_numeric + (1|child_id), full_df)
summary(lmer_model)## Linear mixed model fit by REML ['lmerMod']
## Formula: delta_words ~ mean_hypernyms * lag_num + delta_age + mean_freq +
## words_spoken + predictor_age_bin_numeric + (1 | child_id)
## Data: full_df
##
## REML criterion at convergence: 37879.6
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.4711 -0.5086 0.0020 0.5125 4.2589
##
## Random effects:
## Groups Name Variance Std.Dev.
## child_id (Intercept) 9677 98.37
## Residual 3281 57.28
## Number of obs: 3404, groups: child_id, 194
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 961.49558 50.65418 18.982
## mean_hypernyms -45.54624 3.36481 -13.536
## lag_num -34.10388 7.33943 -4.647
## delta_age 33.91757 1.44289 23.507
## mean_freq -68.52758 5.59674 -12.244
## words_spoken -0.81646 0.01765 -46.247
## predictor_age_bin_numeric 26.31629 0.91193 28.858
## mean_hypernyms:lag_num 5.56527 0.81634 6.817
##
## Correlation of Fixed Effects:
## (Intr) mn_hyp lag_nm delt_g mn_frq wrds_s prd___
## mn_hyprnyms -0.462
## lag_num -0.468 0.681
## delta_age -0.033 0.040 -0.210
## mean_freq -0.778 -0.175 0.066 0.016
## words_spokn -0.176 0.242 0.136 -0.013 0.043
## prdctr_g_b_ -0.227 0.123 0.019 -0.019 0.124 -0.764
## mn_hyprny:_ 0.483 -0.708 -0.962 -0.044 -0.072 -0.134 0.001