library(knitr)
# Chunk defaults for the knitted report. TRUE/FALSE are spelled out because
# T and F are ordinary variables that can be reassigned.
opts_chunk$set(
  echo = TRUE, message = FALSE, warning = FALSE,
  error = FALSE, cache = FALSE, tidy = FALSE
)
library(tidyverse)
library(langcog)
library(data.table)
library(feather)
library(lme4)
library(broom)
# Default ggplot2 theme for the figures produced below.
theme_set(theme_classic(base_size = 10))

# Follow-up score files; each uses a different id column name, which is
# renamed to child_id so the tables can be joined later.
cog6 <- read_csv("data/followUp_long4GL_FU6mo.csv") %>%
  rename(child_id = code1)

cog12 <- read_csv("data/followUp_long4GL_FU12mo.csv") %>%
  rename(
    child_id = code2,
    seq_learning = "sequence learning"
  )

# Read as tab-separated (despite the .csv extension), skipping the first two
# lines; columns whose cleaned name contains "percent" are dropped.
leiter <- read_tsv("data/LongWave3-LeiterR4GL.csv", skip = 2) %>%
  janitor::clean_names() %>%
  select(-contains("percent")) %>%
  rename(child_id = code)

# child_id is reduced to the second "_"-separated piece of the original id.
# rowwise() is needed because the split/index is done one id at a time.
normative_order <- read_csv("../13_word_order/normative_corrs.csv") %>%
  rowwise() %>%
  mutate(child_id = unlist(str_split(child_id, "_"))[2])
# Combine both follow-up tables with the Leiter scores (joins use all shared
# column names), then clean up the two score columns that contain text codes.
cog_data <- full_join(cog6, cog12) %>%
  left_join(leiter) %>%
  mutate_if(is.character, as.factor) %>%
  janitor::clean_names() %>%
  select(-age2) %>%
  mutate(
    # The columns are factors at this point. ifelse() on a factor returns the
    # integer level codes, not the labels, so convert back to character before
    # recoding; otherwise as.numeric() yields level codes instead of scores.
    material1 = as.numeric(
      ifelse(material1 == "Fuss", NA, as.character(material1))
    ),
    cdi1 = as.numeric(ifelse(cdi1 == ">99", "99", as.character(cdi1)))
  ) %>%
  # Drop every remaining column whose name contains "2".
  select(-contains("2"))

# Item lookup tables; ids are stored as character so they join consistently.
item_key <- read_csv("../11_hypernyms/data/item_key.csv") %>%
  mutate(num_item_id = as.character(num_item_id))

item_data <- read_csv("../11_hypernyms/data/item_data.csv") %>%
  select(1, 4) %>%
  select(num_item_id, category) %>%
  mutate(num_item_id = as.character(num_item_id))
# Word-level norms.
# NOTE(review): the variable names say SUBTLEX-US but the file name is
# SUBTLEX-NL -- confirm which norms are intended.
SUBTLEXUS_PATH <- "../11_hypernyms/data/SUBTLEX-NL.cd-above2.with-pos.txt"
subtlexus_norms <- read_tsv(SUBTLEXUS_PATH) %>%
  janitor::clean_names() %>%
  select(word, lg10cd) %>%
  rename(log_subt_cd = lg10cd)

SEMANTIC_DIV_PATH <- "../11_hypernyms/data/semD.txt"
semantic_diversity <- read_tsv(SEMANTIC_DIV_PATH) %>%
  janitor::clean_names() %>%
  select(word, sem_d)

# One row per uni_lemma with its hypernym value, item id, category and the
# two norms joined on the lemma text. "feet" is excluded.
word_bank_hyper <- read_csv("../11_hypernyms/data/wordbank_hypernyms.csv") %>%
  filter(uni_lemma != "feet") %>%
  select(uni_lemma, hypernyms) %>%
  left_join(item_key %>% select(uni_lemma, num_item_id), by = "uni_lemma") %>%
  left_join(item_data, by = "num_item_id") %>%
  select(num_item_id, uni_lemma, category, hypernyms) %>%
  mutate_if(is.character, as.factor) %>%
  left_join(subtlexus_norms, by = c("uni_lemma" = "word")) %>%
  left_join(semantic_diversity, by = c("uni_lemma" = "word")) %>%
  ungroup()

# CDI data: the two sample files are stacked, administrative columns are
# removed, and rows are ordered by child and session.
mcdi_path <- "../7_mcdi/data/train_sample_longitud_mcdi.csv"
mcdi_path2 <- "../7_mcdi/data/test_sample_longitud_mcdi.csv"
cdi_data <- read_csv(mcdi_path) %>%
  bind_rows(read_csv(mcdi_path2)) %>%
  select(
    -study_id, -study, -birthday, -session_date, -total_num_sessions,
    -num_langs, -hard_of_hearing, -mcdi_type, -languages, -extra_categories
  ) %>%
  arrange(child_id, session_num)
# get produced words by kid
# (the commented code below is how the cached file read further down was built)
# produced_words <- cdi_data %>%
# filter(value > 0) %>%
# select(-value) %>%
# left_join(item_key %>% select(item, num_item_id), by = "item") %>%
# mutate(item_clean = map_chr(item, ~str_trim(str_split(., "\\(")[[1]][1])))
# write_csv(produced_words, "produced_words_cache.csv")
produced_words <- read_csv("produced_words_cache.csv") %>%
  mutate_if(is.character, as.factor)

freqs <- read_csv("../3_kid_vocabs/data/childes_adult_word_freq.csv") %>%
  select(word, log_freq)

# get mean hypernyms score by kid
# One row per child and session: means of the word-level measures over the
# produced words, ignoring words with a missing value for a given measure.
hypernyms_score_by_kid <- produced_words %>%
  left_join(freqs, by = c("item_clean" = "word")) %>%
  mutate(num_item_id = as.factor(num_item_id)) %>%
  left_join(word_bank_hyper, by = "num_item_id") %>%
  group_by(child_id, session_num) %>%
  summarize(
    mean_hypernyms = mean(hypernyms, na.rm = TRUE),
    mean_freq = mean(log_freq, na.rm = TRUE),
    mean_sem_d = mean(sem_d, na.rm = TRUE),
    mean_log_subt_cd = mean(log_subt_cd, na.rm = TRUE)
  )
# Get time point data
# One row per child and session, with the next session's age, percentile and
# vocabulary size attached, plus the change from this session to the next.
# lead() is ordered explicitly by session_num within each child, so the result
# does not depend on the incoming row order. The last session of each child
# has NA in the subsequent_* and delta_* columns.
demographic_data <- cdi_data %>%
  select(-item, -value) %>%
  distinct(child_id, session_num, .keep_all = TRUE) %>%
  group_by(child_id) %>%
  mutate(
    subsequent_age = lead(age, order_by = session_num),
    subsequent_percentile = lead(percentile, order_by = session_num),
    subsequent_words_spoken = lead(words_spoken, order_by = session_num),
    delta_age = subsequent_age - age,
    delta_percentile = subsequent_percentile - percentile,
    delta_words_spoken = subsequent_words_spoken - words_spoken
  )
# join together,
# Session-level table: word-level means + session demographics, then the
# cognitive scores and normative-order correlations. child_id is reduced to
# the second "_"-separated piece before the last two joins.
full_df <- hypernyms_score_by_kid %>%
  left_join(demographic_data) %>%
  rowwise() %>%
  mutate(child_id = unlist(str_split(child_id, "_"))[2]) %>%
  left_join(cog_data) %>%
  left_join(normative_order)

# Child-level table: the row with the highest session_num for each child,
# without the between-session change columns.
child_level_full_df <- full_df %>%
  group_by(child_id) %>%
  arrange(-session_num) %>%
  slice(1) %>%
  select(-contains("delta"), -contains("subsequent"))

# Numeric columns only (first column dropped), for the correlation matrix.
clean_df <- child_level_full_df[, -1] %>%
  select_if(is.numeric) %>%
  select(-correlation_with_normative)

# Pairwise correlations in long format: one row per (v2, v1) pair.
long_corr <- cor(clean_df, use = "pairwise.complete.obs") %>%
  as.data.frame() %>%
  rownames_to_column("v2") %>%
  gather("v1", "estimate", -v2)
# P-values for the same variable pairs, reshaped to long format. The p matrix
# is labelled with the column names of clean_df on both dimensions.
long_p <- corrplot::cor.mtest(clean_df, use = "pairwise.complete.obs")$p %>%
  as.data.frame(row.names = names(clean_df)) %>%
  setNames(names(clean_df)) %>%
  rownames_to_column("v2") %>%
  gather("v1", "p", -v2)

# Plot-ready table: blank label and NA estimate on the diagonal; the fill
# value is the estimate where p < .05 and 0 otherwise.
corr_df <- full_join(long_corr, long_p) %>%
  mutate(
    estimate_char = case_when(
      v1 == v2 ~ "",
      TRUE ~ as.character(round(estimate, 2))
    ),
    estimate = case_when(
      v1 == v2 ~ NA_real_,
      TRUE ~ estimate
    ),
    estimate_color = case_when(
      p < .05 ~ estimate,
      TRUE ~ 0
    )
  )
# Heatmap of the pairwise correlations; each tile is labelled with its
# rounded estimate and filled by estimate_color.
ggplot(corr_df, aes(v1, fct_rev(v2), fill = estimate_color)) +
  geom_tile() + # rectangles for each correlation
  # add actual correlation value in the rectangle
  geom_text(aes(label = estimate_char), size = 3) +
  scale_fill_gradient2(
    low = "blue", mid = "white", high = "red",
    midpoint = 0, space = "Lab", guide = "colourbar",
    name = "Pearson's r"
  ) +
  ggtitle("Pairwise Correlation Coefficients") +
  theme_classic(base_size = 12) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1), # , hjust = .95, vjust = .2),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    axis.ticks = element_blank(),
    legend.position = "none"
  )

# Glossary of score abbreviations:
#   fv_composite        - fundamental visual composite
#   bvr_iq              - brief visual and reasoning iq
#   fvr_iq              - full visual and reasoning iq
#   vrss_classification - visual and reasoning scaled scores, classification
#   vrss_classification - visual and reasoning scaled scores, matching
# NOTE(review): the last entry repeats the key "vrss_classification" but
# describes matching; it probably refers to a different column -- confirm.

# Mean cdi1 (with bootstrapped interval) for each card_sort1 group.
cog_data %>%
  group_by(card_sort1) %>%
  multi_boot_standard(col = "cdi1", na.rm = TRUE) %>%
  ggplot(aes(x = card_sort1, y = mean)) +
  geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
  theme_classic()

# cdi1 regressed on age1 and card-sort class, restricted to the two classes
# "Perseverator" and "Switcher".
lm(cdi1 ~ age1 + card_sort1, cog_data %>%
  filter(card_sort1 %in% c("Perseverator", "Switcher"))) %>%
  summary()
##
## Call:
## lm(formula = cdi1 ~ age1 + card_sort1, data = cog_data %>% filter(card_sort1 %in%
## c("Perseverator", "Switcher")))
##
## Residuals:
## Min 1Q Median 3Q Max
## -27.207 -10.166 -0.879 9.641 70.659
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 105.871 74.682 1.418 0.161
## age1 -2.178 2.073 -1.050 0.297
## card_sort1Switcher 2.084 3.623 0.575 0.567
##
## Residual standard error: 15.37 on 71 degrees of freedom
## (4 observations deleted due to missingness)
## Multiple R-squared: 0.01953, Adjusted R-squared: -0.008093
## F-statistic: 0.707 on 2 and 71 DF, p-value: 0.4966
### Session level analysis
# Sessions from children whose card_sort1 is "Perseverator" or "Switcher".
full_df_complete_card_sort <- filter(
  full_df,
  card_sort1 %in% c("Perseverator", "Switcher")
)

# words_spoken by age and card-sort class, with a by-child random intercept
# and random slope for session_num.
summary(
  lmer(
    words_spoken ~ age + card_sort1 + (session_num | child_id),
    full_df_complete_card_sort
  )
)
## Linear mixed model fit by REML ['lmerMod']
## Formula: words_spoken ~ age + card_sort1 + (session_num | child_id)
## Data: full_df_complete_card_sort
##
## REML criterion at convergence: 8089.5
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.8347 -0.5443 -0.0360 0.5257 3.3082
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## child_id (Intercept) 15876 126.00
## session_num 163 12.77 -0.28
## Residual 3516 59.30
## Number of obs: 701, groups: child_id, 78
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) -694.001 35.957 -19.301
## age 42.955 1.539 27.911
## card_sort1Switcher 36.137 28.155 1.283
##
## Correlation of Fixed Effects:
## (Intr) age
## age -0.852
## crd_srt1Swt -0.338 -0.015
# Change in words_spoken to the next session, predicted from the session's
# mean hypernym score, the age gap, mean frequency and current vocabulary
# size; by-child random intercept and session_num slope. Fit on all sessions.
summary(
  lmer(
    delta_words_spoken ~ mean_hypernyms + delta_age + mean_freq +
      words_spoken + (session_num | child_id),
    full_df
  )
)
## Linear mixed model fit by REML ['lmerMod']
## Formula: delta_words_spoken ~ mean_hypernyms + delta_age + mean_freq +
## words_spoken + (session_num | child_id)
## Data: full_df
##
## REML criterion at convergence: 12532.4
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -7.0944 -0.4855 -0.1355 0.3020 9.5272
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## child_id (Intercept) 219.515 14.816
## session_num 6.959 2.638 -1.00
## Residual 1948.642 44.143
## Number of obs: 1201, groups: child_id, 195
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 431.93255 38.00127 11.366
## mean_hypernyms -12.20446 2.14382 -5.693
## delta_age 32.33158 1.89772 17.037
## mean_freq -37.17925 4.60125 -8.080
## words_spoken -0.09603 0.01078 -8.910
##
## Correlation of Fixed Effects:
## (Intr) mn_hyp delt_g mn_frq
## mn_hyprnyms -0.299
## delta_age -0.077 0.007
## mean_freq -0.848 -0.246 0.020
## words_spokn -0.682 0.600 -0.022 0.337
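Models with ppvt/evt1/card_sort controls (the first model below has no added controls; it is the baseline model refit on the rows with complete evt1, ppvt1 and card_sort1 data):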
# evt1, ppvt1 and card_sort1 are not model terms here: they are selected only
# so that drop_na() restricts the fit to rows where all three are observed.
summary(
  lmer(
    delta_words_spoken ~ mean_hypernyms + delta_age + mean_freq +
      words_spoken + (session_num | child_id),
    full_df %>%
      select(
        delta_words_spoken, mean_hypernyms, delta_age, mean_freq,
        words_spoken, session_num, child_id, evt1, ppvt1, card_sort1
      ) %>%
      drop_na()
  )
)
## Linear mixed model fit by REML ['lmerMod']
## Formula: delta_words_spoken ~ mean_hypernyms + delta_age + mean_freq +
## words_spoken + (session_num | child_id)
## Data:
## full_df %>% select(delta_words_spoken, mean_hypernyms, delta_age,
## mean_freq, words_spoken, session_num, child_id, evt1, ppvt1,
## card_sort1) %>% drop_na()
##
## REML criterion at convergence: 1254.7
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.6136 -0.4293 -0.0288 0.3374 4.9939
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## child_id (Intercept) 326.7 18.07
## session_num 167.5 12.94 -1.00
## Residual 2024.3 44.99
## Number of obs: 120, groups: child_id, 28
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 158.70969 116.90438 1.358
## mean_hypernyms -18.55960 6.16866 -3.009
## delta_age 60.73374 4.44608 13.660
## mean_freq -0.23211 16.28301 -0.014
## words_spoken -0.05560 0.05073 -1.096
##
## Correlation of Fixed Effects:
## (Intr) mn_hyp delt_g mn_frq
## mn_hyprnyms 0.031
## delta_age -0.095 0.011
## mean_freq -0.882 -0.495 0.042
## words_spokn -0.525 0.455 -0.060 0.223
# Same predictors plus evt1 and ppvt1 as fixed effects.
summary(
  lmer(
    delta_words_spoken ~ mean_hypernyms + delta_age + mean_freq +
      words_spoken + evt1 + ppvt1 + (session_num | child_id),
    full_df
  )
)
## Linear mixed model fit by REML ['lmerMod']
## Formula: delta_words_spoken ~ mean_hypernyms + delta_age + mean_freq +
## words_spoken + evt1 + ppvt1 + (session_num | child_id)
## Data: full_df
##
## REML criterion at convergence: 1248.9
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.6359 -0.3674 -0.0918 0.3946 5.2510
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## child_id (Intercept) 371.4 19.27
## session_num 133.4 11.55 -1.00
## Residual 2037.9 45.14
## Number of obs: 120, groups: child_id, 28
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 123.74759 125.45094 0.986
## mean_hypernyms -14.40879 6.08351 -2.368
## delta_age 60.98202 4.33804 14.057
## mean_freq -8.93135 16.65902 -0.536
## words_spoken -0.07938 0.04929 -1.610
## evt1 -0.01531 0.47007 -0.033
## ppvt1 0.81051 0.42354 1.914
##
## Correlation of Fixed Effects:
## (Intr) mn_hyp delt_g mn_frq wrds_s evt1
## mn_hyprnyms -0.016
## delta_age -0.123 0.039
## mean_freq -0.844 -0.479 0.042
## words_spokn -0.484 0.403 -0.037 0.276
## evt1 -0.383 0.061 0.089 0.202 0.063
## ppvt1 0.209 0.093 -0.037 -0.282 -0.217 -0.691
# Same predictors plus card_sort1, fit only on rows whose card_sort1 is
# "Perseverator" or "Switcher".
summary(
  lmer(
    delta_words_spoken ~ mean_hypernyms + delta_age + mean_freq +
      words_spoken + card_sort1 + (session_num | child_id),
    full_df %>% filter(card_sort1 %in% c("Perseverator", "Switcher"))
  )
)
## Linear mixed model fit by REML ['lmerMod']
## Formula: delta_words_spoken ~ mean_hypernyms + delta_age + mean_freq +
## words_spoken + card_sort1 + (session_num | child_id)
## Data: full_df %>% filter(card_sort1 %in% c("Perseverator", "Switcher"))
##
## REML criterion at convergence: 6642.5
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -6.1187 -0.4412 -0.1157 0.2502 8.2647
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## child_id (Intercept) 465.98 21.587
## session_num 8.36 2.891 -1.00
## Residual 2524.06 50.240
## Number of obs: 622, groups: child_id, 78
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 520.69535 64.06687 8.127
## mean_hypernyms -18.73407 3.51829 -5.325
## delta_age 32.22009 2.70866 11.895
## mean_freq -39.71647 7.67984 -5.172
## words_spoken -0.14297 0.01657 -8.630
## card_sort1Switcher 2.26162 4.34456 0.521
##
## Correlation of Fixed Effects:
## (Intr) mn_hyp delt_g mn_frq wrds_s
## mn_hyprnyms -0.319
## delta_age -0.055 0.023
## mean_freq -0.855 -0.211 -0.006
## words_spokn -0.673 0.615 -0.002 0.333
## crd_srt1Swt -0.099 -0.013 -0.036 0.088 -0.038
# Same predictors plus fvr_iq as a fixed effect.
summary(
  lmer(
    delta_words_spoken ~ mean_hypernyms + delta_age + mean_freq +
      words_spoken + fvr_iq + (session_num | child_id),
    data = full_df
  )
)
## Linear mixed model fit by REML ['lmerMod']
## Formula: delta_words_spoken ~ mean_hypernyms + delta_age + mean_freq +
## words_spoken + fvr_iq + (session_num | child_id)
## Data: full_df
##
## REML criterion at convergence: 1206.2
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.5590 -0.4804 -0.0328 0.4801 4.7561
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## child_id (Intercept) 84.5475 9.1950
## session_num 0.6426 0.8016 1.00
## Residual 1510.9627 38.8711
## Number of obs: 120, groups: child_id, 28
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 105.01040 136.08449 0.772
## mean_hypernyms -17.67392 5.28962 -3.341
## delta_age 51.10270 3.34181 15.292
## mean_freq -14.71250 14.12959 -1.041
## words_spoken -0.10722 0.03427 -3.128
## fvr_iq 1.36726 0.47780 2.862
##
## Correlation of Fixed Effects:
## (Intr) mn_hyp delt_g mn_frq wrds_s
## mn_hyprnyms -0.037
## delta_age -0.071 0.105
## mean_freq -0.837 -0.348 0.016
## words_spokn -0.319 0.540 0.041 0.209
## fvr_iq -0.631 -0.123 -0.032 0.287 -0.196
Last session predicting IQ
# fvr_iq regressed on mean_freq, words_spoken and age1, using the child-level
# table (one row per child).
summary(
  lm(
    fvr_iq ~ mean_freq + words_spoken + age1,
    data = child_level_full_df
  )
)
##
## Call:
## lm(formula = fvr_iq ~ mean_freq + words_spoken + age1, data = child_level_full_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.486 -3.898 -1.574 4.291 19.390
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 178.378346 137.296225 1.299 0.2062
## mean_freq -8.725724 6.912827 -1.262 0.2190
## words_spoken 0.024143 0.008284 2.915 0.0076 **
## age1 0.212794 3.023778 0.070 0.9445
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.141 on 24 degrees of freedom
## (167 observations deleted due to missingness)
## Multiple R-squared: 0.4366, Adjusted R-squared: 0.3662
## F-statistic: 6.2 on 3 and 24 DF, p-value: 0.002852
# As above, with mean_hypernyms in place of words_spoken.
summary(
  lm(
    fvr_iq ~ mean_freq + mean_hypernyms + age1,
    data = child_level_full_df
  )
)
##
## Call:
## lm(formula = fvr_iq ~ mean_freq + mean_hypernyms + age1, data = child_level_full_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.7670 -5.6233 -0.9193 4.0780 20.8633
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 339.626 128.725 2.638 0.0144 *
## mean_freq -15.911 6.589 -2.415 0.0237 *
## mean_hypernyms -4.026 1.968 -2.046 0.0519 .
## age1 -1.545 3.105 -0.498 0.6233
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.741 on 24 degrees of freedom
## (167 observations deleted due to missingness)
## Multiple R-squared: 0.3505, Adjusted R-squared: 0.2693
## F-statistic: 4.317 on 3 and 24 DF, p-value: 0.01436
#cor.test(child_level_full_df$mean_freq, child_level_full_df$mean_hypernyms)
#cor.test(child_level_full_df$words_spoken, child_level_full_df$mean_hypernyms)
#cor.test(child_level_full_df$words_spoken, child_level_full_df$mean_hypernyms)

# For each of three scores (fvr_iq, fr_composite, ms_composite): correlate the
# score with words_spoken separately within sessions 1-9, using only rows
# where the score is present, and plot the estimate with its confidence
# interval, coloured by the number of rows in that session.
# n_by_session is reassigned for each score.

n_by_session <- full_df %>%
  filter(!is.na(fvr_iq)) %>%
  count(session_num)
fvr_cor_by_session <- full_df %>%
  group_by(session_num) %>%
  filter(!is.na(fvr_iq)) %>%
  nest() %>%
  filter(session_num %in% 1:9) %>%
  mutate(test = map(data, ~ tidy(cor.test(.x$fvr_iq, .x$words_spoken)))) %>%
  select(-data) %>%
  # Name the list-column explicitly; a bare unnest() is deprecated.
  unnest(test) %>%
  left_join(n_by_session)
ggplot(fvr_cor_by_session, aes(x = session_num, y = estimate, color = n)) +
  ylab("fvr_iq ~ words_spoken correleation") +
  geom_pointrange(aes(ymin = conf.low, ymax = conf.high))

n_by_session <- full_df %>%
  filter(!is.na(fr_composite)) %>%
  count(session_num)
fr_cor_by_session <- full_df %>%
  group_by(session_num) %>%
  filter(!is.na(fr_composite)) %>%
  nest() %>%
  filter(session_num %in% 1:9) %>%
  mutate(test = map(data, ~ tidy(cor.test(.x$fr_composite, .x$words_spoken)))) %>%
  select(-data) %>%
  unnest(test) %>%
  left_join(n_by_session)
ggplot(fr_cor_by_session, aes(x = session_num, y = estimate, color = n)) +
  ylab("fluid reasoning ~ words_spoken correleation") +
  geom_pointrange(aes(ymin = conf.low, ymax = conf.high))

n_by_session <- full_df %>%
  filter(!is.na(ms_composite)) %>%
  count(session_num)
ms_cor_by_session <- full_df %>%
  group_by(session_num) %>%
  filter(!is.na(ms_composite)) %>%
  nest() %>%
  filter(session_num %in% 1:9) %>%
  mutate(test = map(data, ~ tidy(cor.test(.x$ms_composite, .x$words_spoken)))) %>%
  select(-data) %>%
  unnest(test) %>%
  left_join(n_by_session)
ggplot(ms_cor_by_session, aes(x = session_num, y = estimate, color = n)) +
  ylab("memory screen ~ words_spoken correleation") +
  geom_pointrange(aes(ymin = conf.low, ymax = conf.high))

# words_spoken by correlation_with_normative and session number, with a
# by-child random intercept and session_num slope.
lmer(words_spoken ~ correlation_with_normative + session_num + (session_num|child_id), data = full_df ) %>%
  summary()
## Linear mixed model fit by REML ['lmerMod']
## Formula:
## words_spoken ~ correlation_with_normative + session_num + (session_num |
## child_id)
## Data: full_df
##
## REML criterion at convergence: 7871.6
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.1743 -0.6072 -0.0198 0.5162 4.1825
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## child_id (Intercept) 19082.8 138.14
## session_num 144.3 12.01 -0.59
## Residual 3151.7 56.14
## Number of obs: 692, groups: child_id, 62
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) -473.557 110.749 -4.276
## correlation_with_normative 777.871 169.328 4.594
## session_num 47.263 1.645 28.729
##
## Correlation of Fixed Effects:
## (Intr) crrl__
## crrltn_wth_ -0.987
## session_num -0.098 -0.001