library(knitr)
opts_chunk$set(echo = T, message = F, warning = F,
error = F, cache = F, tidy = F)
library(tidyverse)
library(feather)
library(langcog)
library(broom)
library(lme4)
theme_set(theme_classic(base_size = 10))
Load mcdi and t value data
t_values <- read_csv("data/word_coeffs_log_mtld_t2.csv") %>%
select(word, t) %>%
mutate(word = tolower(word))
mcdi_path <- "../7_mcdi/data/train_sample_longitud_mcdi.csv"
mcdi_path2 <- "../7_mcdi/data/test_sample_longitud_mcdi.csv"
Get t-score by kid
cdi_data <- read_csv(mcdi_path) %>%
bind_rows(read_csv(mcdi_path2)) %>%
select(-study_id, -study, -birthday, -session_date, -total_num_sessions,
-num_langs, -hard_of_hearing, -mcdi_type, -languages, -extra_categories) %>%
arrange(child_id, session_num)
# get produced words by kid
produced_words <- cdi_data %>%
filter(value > 0) %>%
select(-value)
# get mean t score by kid
t_score_by_kid <- produced_words %>%
left_join(t_values, by = c("item"= "word")) %>% # need to tidy this, some words missing!
group_by(child_id, session_num) %>%
summarize(mean_t = mean(t, na.rm = TRUE))
Get time point data
# timepoint data
demographic_data <- cdi_data %>%
select(-item, -value) %>%
distinct(child_id, session_num, .keep_all = T) %>%
group_by(child_id) %>%
mutate(subsequent_age = lead(age), by = "session_num",
subsequent_percentile = lead(percentile), by = "session_num",
subsequent_words_spoken = lead(words_spoken), by = "session_num",
delta_age = subsequent_age - age,
delta_percentile = subsequent_percentile - percentile,
delta_words_spoken = subsequent_words_spoken - words_spoken) %>%
select( -by)
# join together
full_df <- t_score_by_kid %>%
left_join(demographic_data)
Fit mixed effect model predicting change in words at t2 as a function of mean t-score at t1, controlling for change in age. Mean t-score at t1 predicts change in vocabulary size at t2.
lmer(delta_words_spoken ~ mean_t + delta_age + words_spoken + (session_num|child_id), full_df) %>%
summary()
## Linear mixed model fit by REML ['lmerMod']
## Formula:
## delta_words_spoken ~ mean_t + delta_age + words_spoken + (session_num |
## child_id)
## Data: full_df
##
## REML criterion at convergence: 14693.2
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -7.4867 -0.4560 -0.1029 0.3097 8.1779
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## child_id (Intercept) 1089.68 33.010
## session_num 27.81 5.273 -0.99
## Residual 1716.04 41.425
## Number of obs: 1412, groups: child_id, 224
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) -17.180983 6.419417 -2.676
## mean_t 70.271043 11.493447 6.114
## delta_age 28.061913 1.452022 19.326
## words_spoken -0.027867 0.007283 -3.826
##
## Correlation of Fixed Effects:
## (Intr) mean_t delt_g
## mean_t -0.895
## delta_age -0.245 -0.019
## words_spokn 0.055 -0.385 0.032
## fit warnings:
## Some predictor variables are on very different scales: consider rescaling