library(knitr)
opts_chunk$set(echo = T, message = F, warning = F,
error = F, cache = F, tidy = F)
library(tidyverse)
library(langcog)
library(feather)
library(broom)
library(lme4)
theme_set(theme_classic(base_size = 10))item_key <- read_csv("data/item_key.csv") %>%
mutate(num_item_id = as.character(num_item_id))
item_data <- read_csv("data/item_data.csv") %>%
select(1,4) %>%
select(num_item_id, category) %>%
mutate(num_item_id = as.character(num_item_id))
word_bank_hyper <- read_csv("data/wordbank_hypernyms.csv") %>%
filter(uni_lemma != "feet") %>%
select(uni_lemma, hypernyms) %>%
left_join(item_key %>% select(uni_lemma, num_item_id), by = "uni_lemma") %>%
left_join(item_data, by = "num_item_id") %>%
select(num_item_id, uni_lemma, category, hypernyms) %>%
mutate_if(is.character, as.factor) %>%
ungroup()mcdi_path <- "../7_mcdi/data/train_sample_longitud_mcdi.csv"
mcdi_path2 <- "../7_mcdi/data/test_sample_longitud_mcdi.csv"
cdi_data <- read_csv(mcdi_path) %>%
bind_rows(read_csv(mcdi_path2)) %>%
select(-study_id, -study, -birthday, -session_date, -total_num_sessions,
-num_langs, -hard_of_hearing, -mcdi_type, -languages, -extra_categories) %>%
arrange(child_id, session_num)
# get produced words by kid
produced_words <- read_csv("data/produced_words_tidy.csv") %>%
mutate_if(is.character, as.factor) %>%
mutate(num_item_id = as.factor(num_item_id))
freqs <- read_csv("../3_kid_vocabs/data/childes_adult_word_freq.csv") %>%
select(word, log_freq)# get mean hypernyms score by kid
hypernyms_score_by_kid <- produced_words %>%
left_join(word_bank_hyper, by = "num_item_id") %>%
left_join(freqs, by = c("item_clean" = "word")) %>%
group_by(child_id, session_num) %>%
summarize(mean_hypernyms = mean(hypernyms, na.rm = TRUE),
mean_freq = mean(log_freq, na.rm = T))#Get time point data
# timepoint data
demographic_data <- cdi_data %>%
select(-item, -value) %>%
distinct(child_id, session_num, .keep_all = T) %>%
group_by(child_id)
lag_session_words_spoken <- demographic_data %>%
mutate(one_subsequent_words_spoken = lead(words_spoken, 1), by = "session_num",
two_subsequent_words_spoken = lead(words_spoken, 2), by = "session_num",
three_subsequent_words_spoken = lead(words_spoken, 3), by = "session_num",
four_subsequent_words_spoken = lead(words_spoken, 4), by = "session_num",
five_subsequent_words_spoken = lead(words_spoken, 5), by = "session_num",
one_delta = one_subsequent_words_spoken - words_spoken,
two_delta = two_subsequent_words_spoken - words_spoken,
three_delta = three_subsequent_words_spoken - words_spoken,
four_delta = four_subsequent_words_spoken - words_spoken,
five_delta = five_subsequent_words_spoken - words_spoken) %>%
select(child_id, session_num, words_spoken, contains("delta")) %>%
gather("lag", "delta_words", -child_id, -session_num, -words_spoken)
lag_session_age <- demographic_data %>%
mutate(one_subsequent_age = lead(age, 1), by = "session_num",
two_subsequent_age = lead(age, 2), by = "session_num",
three_subsequent_age = lead(age, 3), by = "session_num",
four_subsequent_age = lead(age, 4), by = "session_num",
five_subsequent_age = lead(age, 5), by = "session_num",
one_delta = one_subsequent_age - age,
two_delta = two_subsequent_age - age,
three_delta = three_subsequent_age - age,
four_delta = four_subsequent_age - age,
five_delta = five_subsequent_age - age) %>%
select(child_id, session_num, contains("delta")) %>%
gather("lag", "delta_age", -child_id, -session_num)
# join together
full_df <- hypernyms_score_by_kid %>%
left_join(lag_session_words_spoken) %>%
left_join(lag_session_age)The data:
## Observations: 6,990
## Variables: 8
## Groups: child_id [195]
## $ child_id <chr> "4139_LTP102", "4139_LTP102", "4139_LTP102", "413…
## $ session_num <dbl> 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4…
## $ mean_hypernyms <dbl> 9.397260, 9.397260, 9.397260, 9.397260, 9.397260,…
## $ mean_freq <dbl> 7.814661, 7.814661, 7.814661, 7.814661, 7.814661,…
## $ words_spoken <dbl> 94, 94, 94, 94, 94, 153, 153, 153, 153, 153, 167,…
## $ lag <chr> "one_delta", "two_delta", "three_delta", "four_de…
## $ delta_words <dbl> 59, 73, 98, 143, 194, 14, 39, 84, 135, 194, 25, 7…
## $ delta_age <dbl> 1.1176857, 2.2024984, 3.1886917, 4.1420118, 5.193…
Betas from fixed effect model: lm(delta_words ~ mean_hypernyms + delta_age + mean_freq + words_spoken)
by_session_lag_hypernym_beta <- full_df %>%
mutate_at(vars("mean_hypernyms", "mean_freq", "words_spoken", "delta_age", "delta_words"), scale) %>%
drop_na() %>%
group_by(session_num, lag) %>%
nest() %>%
mutate(temp = map(data, ~ tidy(lm(delta_words ~ mean_hypernyms + delta_age +
mean_freq + words_spoken, data = .x))),
num_obs = map_dbl(data, nrow)) %>%
select(-data) %>%
unnest() %>%
mutate(lag_num = case_when(lag == "one_delta" ~ 1,
lag == "two_delta" ~ 2,
lag == "three_delta" ~ 3,
lag == "four_delta" ~ 4,
lag == "five_delta" ~ 5))
by_session_lag_hypernym_beta %>%
ggplot(aes(x = session_num, y = estimate,
group = lag_num, color = as.factor(lag_num))) +
facet_grid(~term) +
geom_smooth(method = "lm", alpha = .2)Mean hypernym betas only:
by_session_lag_hypernym_beta %>%
filter(term == "mean_hypernyms") %>%
ggplot(aes(x = session_num, y = estimate,
group = lag_num, color = as.factor(lag_num))) +
geom_point(aes(size = num_obs)) +
facet_wrap(~lag_num) +
geom_smooth(method = "lm", alpha = .2)Mixed-effect model suggests interaction between lag number at mean_hypernyms:
lmer_model <- full_df %>%
mutate(lag_num = case_when(lag == "one_delta" ~ 1,
lag == "two_delta" ~ 2,
lag == "three_delta" ~ 3,
lag == "four_delta" ~ 4,
lag == "five_delta" ~ 5)) %>%
mutate_at(vars("mean_hypernyms", "mean_freq", "words_spoken",
"delta_age", "delta_words"), scale) %>%
mutate(lag = fct_inorder(lag)) %>%
lmer(delta_words ~ mean_hypernyms*lag + delta_age + mean_freq + words_spoken + session_num + (session_num|child_id), .)
summary(lmer_model)## Linear mixed model fit by REML ['lmerMod']
## Formula: delta_words ~ mean_hypernyms * lag + delta_age + mean_freq +
## words_spoken + session_num + (session_num | child_id)
## Data: .
##
## REML criterion at convergence: 5314.4
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.5654 -0.6463 -0.0068 0.6389 3.8399
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## child_id (Intercept) 0.18703 0.4325
## session_num 0.01267 0.1126 -0.49
## Residual 0.16726 0.4090
## Number of obs: 4217, groups: child_id, 182
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) -1.87957 0.06182 -30.403
## mean_hypernyms -0.16241 0.01744 -9.312
## lagtwo_delta 0.16698 0.02336 7.147
## lagthree_delta 0.33181 0.03451 9.615
## lagfour_delta 0.47324 0.04728 10.009
## lagfive_delta 0.58754 0.06137 9.574
## delta_age 0.53354 0.01943 27.462
## mean_freq -0.04989 0.00933 -5.347
## words_spoken -1.13999 0.02853 -39.959
## session_num 0.34665 0.01388 24.966
## mean_hypernyms:lagtwo_delta 0.07114 0.01908 3.729
## mean_hypernyms:lagthree_delta 0.11356 0.02062 5.506
## mean_hypernyms:lagfour_delta 0.18450 0.02235 8.254
## mean_hypernyms:lagfive_delta 0.24628 0.02484 9.916
Models at each lag level. One:
full_df %>%
mutate_at(vars("mean_hypernyms", "mean_freq", "words_spoken",
"delta_age", "delta_words"), scale) %>%
filter(lag == "one_delta") %>%
lmer(delta_words ~ mean_hypernyms + delta_age + mean_freq + words_spoken + session_num + (session_num|child_id), .) %>%
summary()## Linear mixed model fit by REML ['lmerMod']
## Formula:
## delta_words ~ mean_hypernyms + delta_age + mean_freq + words_spoken +
## session_num + (session_num | child_id)
## Data: .
##
## REML criterion at convergence: 1188.5
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.9438 -0.6039 -0.1276 0.4673 4.4141
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## child_id (Intercept) 0.027621 0.16620
## session_num 0.001837 0.04286 -0.75
## Residual 0.134442 0.36666
## Number of obs: 1187, groups: child_id, 182
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) -0.64127 0.05234 -12.251
## mean_hypernyms -0.04228 0.01955 -2.163
## delta_age 0.61645 0.03338 18.466
## mean_freq -0.06815 0.01336 -5.099
## words_spoken -0.33934 0.03482 -9.747
## session_num 0.08007 0.01032 7.760
##
## Correlation of Fixed Effects:
## (Intr) mn_hyp delt_g mn_frq wrds_s
## mn_hyprnyms -0.021
## delta_age 0.355 0.039
## mean_freq -0.045 -0.038 0.040
## words_spokn 0.539 0.437 -0.296 0.115
## session_num -0.753 0.063 0.266 0.077 -0.699
Two:
full_df %>%
mutate_at(vars("mean_hypernyms", "mean_freq", "words_spoken",
"delta_age", "delta_words"), scale) %>%
filter(lag == "two_delta") %>%
lmer(delta_words ~ mean_hypernyms + delta_age + mean_freq + words_spoken + session_num + (session_num|child_id), .) %>%
summary()## Linear mixed model fit by REML ['lmerMod']
## Formula:
## delta_words ~ mean_hypernyms + delta_age + mean_freq + words_spoken +
## session_num + (session_num | child_id)
## Data: .
##
## REML criterion at convergence: 1298.4
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.2540 -0.6182 -0.0482 0.5383 3.9998
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## child_id (Intercept) 0.11976 0.34607
## session_num 0.00482 0.06942 -1.00
## Residual 0.18176 0.42634
## Number of obs: 1005, groups: child_id, 182
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) -0.91734 0.07257 -12.641
## mean_hypernyms -0.06035 0.02539 -2.377
## delta_age 0.75687 0.03155 23.990
## mean_freq -0.08882 0.01737 -5.114
## words_spoken -0.75956 0.04992 -15.214
## session_num 0.14459 0.01342 10.772
##
## Correlation of Fixed Effects:
## (Intr) mn_hyp delt_g mn_frq wrds_s
## mn_hyprnyms -0.022
## delta_age -0.172 0.104
## mean_freq -0.041 -0.040 0.042
## words_spokn 0.752 0.413 -0.230 0.129
## session_num -0.960 0.064 0.318 0.058 -0.697
## convergence code: 0
## boundary (singular) fit: see ?isSingular
Three:
full_df %>%
mutate_at(vars("mean_hypernyms", "mean_freq", "words_spoken",
"delta_age", "delta_words"), scale) %>%
filter(lag == "three_delta") %>%
lmer(delta_words ~ mean_hypernyms + delta_age + mean_freq + words_spoken + session_num + (session_num|child_id), .) %>%
summary()## Linear mixed model fit by REML ['lmerMod']
## Formula:
## delta_words ~ mean_hypernyms + delta_age + mean_freq + words_spoken +
## session_num + (session_num | child_id)
## Data: .
##
## REML criterion at convergence: 1086.2
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.0095 -0.5584 -0.0271 0.5818 3.3505
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## child_id (Intercept) 0.258582 0.50851
## session_num 0.009163 0.09572 -1.00
## Residual 0.171194 0.41376
## Number of obs: 823, groups: child_id, 150
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) -1.52507 0.09623 -15.849
## mean_hypernyms -0.13650 0.02815 -4.848
## delta_age 0.72991 0.03477 20.994
## mean_freq -0.11300 0.01975 -5.721
## words_spoken -1.31578 0.05906 -22.278
## session_num 0.23122 0.01626 14.218
##
## Correlation of Fixed Effects:
## (Intr) mn_hyp delt_g mn_frq wrds_s
## mn_hyprnyms -0.073
## delta_age -0.253 0.091
## mean_freq -0.037 -0.060 0.065
## words_spokn 0.738 0.375 -0.140 0.139
## session_num -0.971 0.102 0.154 0.054 -0.661
## convergence code: 0
## boundary (singular) fit: see ?isSingular
Four:
full_df %>%
mutate_at(vars("mean_hypernyms", "mean_freq", "words_spoken",
"delta_age", "delta_words"), scale) %>%
filter(lag == "four_delta") %>%
lmer(delta_words ~ mean_hypernyms + delta_age + mean_freq + words_spoken + session_num + (session_num|child_id), .) %>%
summary()## Linear mixed model fit by REML ['lmerMod']
## Formula:
## delta_words ~ mean_hypernyms + delta_age + mean_freq + words_spoken +
## session_num + (session_num | child_id)
## Data: .
##
## REML criterion at convergence: 908.9
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.7890 -0.5336 -0.0265 0.5693 3.6510
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## child_id (Intercept) 0.40675 0.6378
## session_num 0.01512 0.1230 -1.00
## Residual 0.15998 0.4000
## Number of obs: 673, groups: child_id, 144
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) -1.94974 0.12202 -15.979
## mean_hypernyms -0.11254 0.03065 -3.672
## delta_age 0.71695 0.04197 17.083
## mean_freq -0.06116 0.02140 -2.857
## words_spoken -1.60968 0.06816 -23.617
## session_num 0.28842 0.01935 14.908
##
## Correlation of Fixed Effects:
## (Intr) mn_hyp delt_g mn_frq wrds_s
## mn_hyprnyms -0.071
## delta_age -0.395 0.014
## mean_freq -0.049 -0.072 0.082
## words_spokn 0.704 0.363 -0.088 0.148
## session_num -0.922 0.118 0.081 0.057 -0.628
## convergence code: 0
## boundary (singular) fit: see ?isSingular
Five:
full_df %>%
mutate_at(vars("mean_hypernyms", "mean_freq", "words_spoken",
"delta_age", "delta_words"), scale) %>%
filter(lag == "five_delta") %>%
lmer(delta_words ~ mean_hypernyms + delta_age + mean_freq + words_spoken + session_num + (session_num|child_id), .) %>%
summary()## Linear mixed model fit by REML ['lmerMod']
## Formula:
## delta_words ~ mean_hypernyms + delta_age + mean_freq + words_spoken +
## session_num + (session_num | child_id)
## Data: .
##
## REML criterion at convergence: 741.2
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.6762 -0.4856 -0.0083 0.5061 3.1931
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## child_id (Intercept) 0.57071 0.7555
## session_num 0.02588 0.1609 -0.97
## Residual 0.13704 0.3702
## Number of obs: 529, groups: child_id, 130
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) -1.82079 0.17480 -10.417
## mean_hypernyms -0.08209 0.03312 -2.479
## delta_age 0.44382 0.05932 7.482
## mean_freq -0.01950 0.02327 -0.838
## words_spoken -1.75628 0.08636 -20.336
## session_num 0.32853 0.02563 12.820
##
## Correlation of Fixed Effects:
## (Intr) mn_hyp delt_g mn_frq wrds_s
## mn_hyprnyms -0.092
## delta_age -0.559 0.083
## mean_freq -0.046 -0.047 0.091
## words_spokn 0.648 0.323 -0.005 0.170
## session_num -0.800 0.105 0.018 0.034 -0.638
## convergence code: 0
## Model failed to converge with max|grad| = 0.0505712 (tol = 0.002, component 1)
Excluding lag 3:
full_df %>%
mutate_at(vars("mean_hypernyms", "mean_freq", "words_spoken",
"delta_age", "delta_words"), scale) %>%
filter(lag != "three_delta") %>%
lmer(delta_words ~ mean_hypernyms*lag + delta_age + mean_freq + words_spoken + session_num + (session_num|child_id), .) %>%
summary()## Linear mixed model fit by REML ['lmerMod']
## Formula: delta_words ~ mean_hypernyms * lag + delta_age + mean_freq +
## words_spoken + session_num + (session_num | child_id)
## Data: .
##
## REML criterion at convergence: 4529.2
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.3868 -0.6393 -0.0254 0.6445 3.8335
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## child_id (Intercept) 0.162925 0.40364
## session_num 0.008279 0.09099 -0.70
## Residual 0.182166 0.42681
## Number of obs: 3394, groups: child_id, 182
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) -1.13075 0.07238 -15.622
## mean_hypernyms 0.10028 0.02370 4.231
## lagfour_delta -0.06910 0.03456 -1.999
## lagone_delta -0.39801 0.06769 -5.880
## lagtwo_delta -0.28002 0.05350 -5.234
## delta_age 0.59213 0.02203 26.883
## mean_freq -0.04509 0.01051 -4.290
## words_spoken -0.97980 0.03085 -31.758
## session_num 0.26522 0.01260 21.050
## mean_hypernyms:lagfour_delta -0.06174 0.02858 -2.160
## mean_hypernyms:lagone_delta -0.25527 0.02597 -9.828
## mean_hypernyms:lagtwo_delta -0.17807 0.02650 -6.719
##
## Correlation of Fixed Effects:
## (Intr) mn_hyp lgfr_d lgn_dl lgtw_d delt_g mn_frq wrds_s
## mn_hyprnyms -0.191
## lagfour_dlt -0.472 0.395
## lagone_delt -0.617 0.208 0.659
## lagtwo_delt -0.606 0.261 0.695 0.941
## delta_age -0.545 0.038 0.449 0.908 0.846
## mean_freq -0.050 -0.078 0.013 0.014 0.013 0.039
## words_spokn 0.596 0.162 -0.038 -0.085 -0.067 -0.047 0.108
## session_num -0.638 0.015 -0.034 -0.068 -0.070 -0.051 0.062 -0.616
## mn_hyprnyms:lgf_ 0.157 -0.676 -0.527 -0.173 -0.217 -0.003 0.005 0.017
## mn_hyprnyms:lgn_ 0.189 -0.717 -0.386 -0.253 -0.283 -0.033 0.068 0.080
## mn_hyprnyms:lgt_ 0.162 -0.711 -0.361 -0.197 -0.289 -0.001 0.048 0.049
## sssn_n mn_hyprnyms:lgf_ mn_hyprnyms:lgn_
## mn_hyprnyms
## lagfour_dlt
## lagone_delt
## lagtwo_delt
## delta_age
## mean_freq
## words_spokn
## session_num
## mn_hyprnyms:lgf_ 0.007
## mn_hyprnyms:lgn_ 0.043 0.637
## mn_hyprnyms:lgt_ 0.033 0.623 0.713