library(knitr)
opts_chunk$set(echo = T, message = F, warning = F,
error = F, cache = T, tidy = F)
library(tidyverse)
library(langcog)
library(feather)
library(lme4)
theme_set(theme_classic(base_size = 10))item_key <- read_csv("data/item_key.csv") %>%
mutate(num_item_id = as.character(num_item_id))
item_data <- read_csv("data/item_data.csv") %>%
select(1,4) %>%
select(num_item_id, category) %>%
mutate(num_item_id = as.character(num_item_id))
SUBTLEXUS_PATH <- "data/SUBTLEX-NL.cd-above2.with-pos.txt"
subtlexus_norms <- read_tsv(SUBTLEXUS_PATH) %>%
janitor::clean_names() %>%
select(word, lg10cd) %>%
rename(log_subt_cd = lg10cd)
SEMANTIC_DIV_PATH <- "data/semD.txt"
semantic_diversity <- read_tsv(SEMANTIC_DIV_PATH) %>%
janitor::clean_names() %>%
select(word, sem_d)
word_bank_hyper <- read_csv("data/wordbank_hypernyms.csv") %>%
filter(uni_lemma != "feet") %>%
select(uni_lemma, hypernyms) %>%
left_join(item_key %>% select(uni_lemma, num_item_id), by = "uni_lemma") %>%
left_join(item_data, by = "num_item_id") %>%
select(num_item_id, uni_lemma, category, hypernyms) %>%
mutate_if(is.character, as.factor) %>%
left_join(subtlexus_norms, by = c("uni_lemma" = "word")) %>%
left_join(semantic_diversity, by = c("uni_lemma" = "word")) %>%
ungroup()ggplot(word_bank_hyper, aes(x = sem_d, y = hypernyms)) +
geom_point() +
geom_smooth(method = "lm")cor.test(word_bank_hyper$sem_d, word_bank_hyper$hypernyms)##
## Pearson's product-moment correlation
##
## data: word_bank_hyper$sem_d and word_bank_hyper$hypernyms
## t = -10.331, df = 379, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.5436068 -0.3864686
## sample estimates:
## cor
## -0.4687378
ggplot(word_bank_hyper, aes(x = log_subt_cd, y = hypernyms)) +
geom_point() +
geom_smooth(method = "lm")cor.test(word_bank_hyper$log_subt_cd, word_bank_hyper$hypernyms)##
## Pearson's product-moment correlation
##
## data: word_bank_hyper$log_subt_cd and word_bank_hyper$hypernyms
## t = 1.3131, df = 321, p-value = 0.1901
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.0363272 0.1807786
## sample estimates:
## cor
## 0.07309155
ggplot(word_bank_hyper, aes(x = sem_d, y = log_subt_cd)) +
geom_point() +
geom_smooth(method = "lm")cor.test(word_bank_hyper$sem_d, word_bank_hyper$log_subt_cd)##
## Pearson's product-moment correlation
##
## data: word_bank_hyper$sem_d and word_bank_hyper$log_subt_cd
## t = 1.6073, df = 329, p-value = 0.109
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.01972118 0.19421922
## sample estimates:
## cor
## 0.08826686
mcdi_path <- "../7_mcdi/data/train_sample_longitud_mcdi.csv"
mcdi_path2 <- "../7_mcdi/data/test_sample_longitud_mcdi.csv"
cdi_data <- read_csv(mcdi_path) %>%
bind_rows(read_csv(mcdi_path2)) %>%
select(-study_id, -study, -birthday, -session_date, -total_num_sessions,
-num_langs, -hard_of_hearing, -mcdi_type, -languages, -extra_categories) %>%
arrange(child_id, session_num)
clean_items <- cdi_data %>%
distinct(item) %>%
rowwise()%>%
mutate(item_clean = str_trim(str_split(item, "\\(")[[1]][1]))
# get produced words by kid
produced_words <- cdi_data %>%
filter(value > 0) %>%
select(-value) %>%
left_join(item_key %>% select(item, num_item_id), by = "item") %>%
mutate_if(is.character, as.factor) %>%
left_join(clean_items)
#write_csv(produced_words, "data/produced_words_tidy.csv")#produced_words <- read_csv("data/produced_words_tidy.csv") %>%
# mutate_if(is.character, as.factor)
freqs <- read_csv("../3_kid_vocabs/data/childes_adult_word_freq.csv") %>%
select(word, log_freq)# get mean hypernyms score by kid
hypernyms_score_by_kid <- produced_words %>%
left_join(freqs, by = c("item_clean" = "word")) %>%
left_join(word_bank_hyper, by = "num_item_id") %>%
group_by(child_id, session_num) %>%
summarize(
mean_hypernyms = mean(hypernyms, na.rm = T),
mean_freq = mean(log_freq, na.rm = T),
mean_sem_d = mean(sem_d, na.rm = T),
mean_log_subt_cd = mean(log_subt_cd, na.rm = T))#Get time point data
# timepoint data
demographic_data <- cdi_data %>%
select(-item, -value) %>%
distinct(child_id, session_num, .keep_all = T) %>%
group_by(child_id) %>%
mutate(subsequent_age = lead(age), by = "session_num",
subsequent_percentile = lead(percentile), by = "session_num",
subsequent_words_spoken = lead(words_spoken), by = "session_num",
delta_age = subsequent_age - age,
delta_percentile = subsequent_percentile - percentile,
delta_words_spoken = subsequent_words_spoken - words_spoken) %>%
select(-by)
# join together
full_df <- hypernyms_score_by_kid %>%
left_join(demographic_data) Models
lmer(delta_words_spoken ~ mean_hypernyms + delta_age + mean_freq + words_spoken + session_num + (session_num|child_id), full_df) %>%
summary()## Linear mixed model fit by REML ['lmerMod']
## Formula: delta_words_spoken ~ mean_hypernyms + delta_age + mean_freq +
## words_spoken + session_num + (session_num | child_id)
## Data: full_df
##
## REML criterion at convergence: 12529.6
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -7.1168 -0.4896 -0.1355 0.2990 9.4906
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## child_id (Intercept) 244.231 15.628
## session_num 6.574 2.564 -1.00
## Residual 1943.152 44.081
## Number of obs: 1201, groups: child_id, 195
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 439.48463 38.29184 11.477
## mean_hypernyms -12.32378 2.15716 -5.713
## delta_age 31.92327 1.91363 16.682
## mean_freq -38.15424 4.65113 -8.203
## words_spoken -0.10633 0.01208 -8.806
## session_num 0.90468 0.61017 1.483
##
## Correlation of Fixed Effects:
## (Intr) mn_hyp delt_g mn_frq wrds_s
## mn_hyprnyms -0.298
## delta_age -0.082 0.006
## mean_freq -0.848 -0.246 0.031
## words_spokn -0.625 0.527 0.040 0.336
## session_num 0.040 0.018 -0.129 -0.083 -0.455
lmer(delta_words_spoken ~ mean_sem_d + delta_age + mean_freq + + words_spoken +session_num + (session_num|child_id), full_df ) %>%
summary()## Linear mixed model fit by REML ['lmerMod']
## Formula:
## delta_words_spoken ~ mean_sem_d + delta_age + mean_freq + +words_spoken +
## session_num + (session_num | child_id)
## Data: full_df
##
## REML criterion at convergence: 12540.9
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -7.1267 -0.4887 -0.1248 0.3042 9.1894
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## child_id (Intercept) 288.437 16.983
## session_num 8.454 2.908 -1.00
## Residual 1959.912 44.271
## Number of obs: 1201, groups: child_id, 195
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 160.29958 66.04919 2.427
## mean_sem_d 127.58856 33.32163 3.829
## delta_age 32.10020 1.92512 16.674
## mean_freq -42.84973 4.55703 -9.403
## words_spoken -0.08531 0.01124 -7.590
## session_num 0.95470 0.62441 1.529
##
## Correlation of Fixed Effects:
## (Intr) mn_sm_ delt_g mn_frq wrds_s
## mean_sem_d -0.830
## delta_age -0.056 0.012
## mean_freq -0.618 0.077 0.034
## words_spokn 0.025 -0.388 0.036 0.492
## session_num 0.018 0.009 -0.131 -0.081 -0.498
## fit warnings:
## Some predictor variables are on very different scales: consider rescaling
lmer(delta_words_spoken ~ mean_sem_d + mean_hypernyms + delta_age + mean_freq + words_spoken +session_num + (session_num|child_id), full_df ) %>%
summary()## Linear mixed model fit by REML ['lmerMod']
## Formula: delta_words_spoken ~ mean_sem_d + mean_hypernyms + delta_age +
## mean_freq + words_spoken + session_num + (session_num | child_id)
## Data: full_df
##
## REML criterion at convergence: 12518
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -7.1135 -0.4849 -0.1244 0.3041 9.4596
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## child_id (Intercept) 225.73 15.024
## session_num 5.95 2.439 -1.00
## Residual 1945.14 44.104
## Number of obs: 1201, groups: child_id, 195
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 334.17038 76.32705 4.378
## mean_sem_d 59.21143 36.77396 1.610
## mean_hypernyms -10.68012 2.39794 -4.454
## delta_age 31.96669 1.91341 16.707
## mean_freq -38.49742 4.64703 -8.284
## words_spoken -0.10935 0.01217 -8.989
## session_num 0.92165 0.60664 1.519
##
## Correlation of Fixed Effects:
## (Intr) mn_sm_ mn_hyp delt_g mn_frq wrds_s
## mean_sem_d -0.866
## mn_hyprnyms -0.515 0.441
## delta_age -0.055 0.016 0.012
## mean_freq -0.392 -0.037 -0.239 0.030
## words_spokn -0.191 -0.137 0.408 0.038 0.337
## session_num 0.008 0.014 0.022 -0.129 -0.084 -0.455
## fit warnings:
## Some predictor variables are on very different scales: consider rescaling
lmer(delta_words_spoken ~ mean_log_subt_cd + delta_age + mean_freq + + words_spoken + session_num + (session_num|child_id), full_df) %>%
summary()## Linear mixed model fit by REML ['lmerMod']
## Formula: delta_words_spoken ~ mean_log_subt_cd + delta_age + mean_freq +
## +words_spoken + session_num + (session_num | child_id)
## Data: full_df
##
## REML criterion at convergence: 12555.5
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -7.1095 -0.4700 -0.1195 0.3007 9.1973
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## child_id (Intercept) 355.25 18.848
## session_num 11.24 3.353 -1.00
## Residual 1964.78 44.326
## Number of obs: 1201, groups: child_id, 195
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 379.25563 39.13249 9.692
## mean_log_subt_cd -15.20996 13.46592 -1.130
## delta_age 32.10693 1.93282 16.611
## mean_freq -42.38422 4.70243 -9.013
## words_spoken -0.06734 0.01046 -6.441
## session_num 0.88173 0.64257 1.372
##
## Correlation of Fixed Effects:
## (Intr) mn_l__ delt_g mn_frq wrds_s
## mn_lg_sbt_c -0.316
## delta_age -0.071 -0.026
## mean_freq -0.847 -0.230 0.038
## words_spokn -0.560 0.026 0.042 0.549
## session_num 0.033 0.026 -0.133 -0.085 -0.523
## fit warnings:
## Some predictor variables are on very different scales: consider rescaling