item_key <- read_csv("data/item_key.csv") %>%
mutate(num_item_id = as.character(num_item_id))
item_data <- read_csv("data/item_data.csv") %>%
select(1,4) %>%
select(num_item_id, category) %>%
mutate(num_item_id = as.character(num_item_id))
word_bank_hyper <- read_csv("data/wordbank_hypernyms.csv") %>%
filter(uni_lemma != "feet") %>%
select(uni_lemma, hypernyms) %>%
left_join(item_key %>% select(uni_lemma, num_item_id), by = "uni_lemma") %>%
left_join(item_data, by = "num_item_id") %>%
select(num_item_id, uni_lemma, category, hypernyms) %>%
mutate_if(is.character, as.factor) %>%
ungroup()
mcdi_path <- "../7_mcdi/data/train_sample_longitud_mcdi.csv"
mcdi_path2 <- "../7_mcdi/data/test_sample_longitud_mcdi.csv"
cdi_data <- read_csv(mcdi_path) %>%
bind_rows(read_csv(mcdi_path2)) %>%
select(-study_id, -study, -birthday, -session_date, -total_num_sessions,
-num_langs, -hard_of_hearing, -mcdi_type, -languages, -extra_categories) %>%
arrange(child_id, session_num)
# get produced words by kid
#produced_words <- cdi_data %>%
# filter(value > 0) %>%
# select(-value) %>%
# left_join(item_key %>% select(item, num_item_id), by = "item") %>%
# mutate_if(is.character, as.factor) %>%
# rowwise() %>%
# mutate(item_clean = str_trim(str_split(item, "\\(")[[1]][1]))
# get produced words by kid
produced_words <- read_csv("data/produced_words_tidy.csv") %>%
mutate_if(is.character, as.factor) %>%
mutate(num_item_id = as.factor(num_item_id))
# get mean hypernyms score by kid
hypernyms_score_by_kid <- produced_words %>%
left_join(freqs, by = c("item_clean" = "word")) %>%
left_join(word_bank_hyper, by = "num_item_id") %>%
left_join(human_ratings, by = "num_item_id") %>%
group_by(child_id, session_num) %>%
summarize(mean_hypernyms = mean(hypernyms, na.rm = TRUE),
mean_freq = mean(log_freq, na.rm = T),
mean_human_ratings = mean(mean_rating, na.rm = T))
#Get time point data
# timepoint data
demographic_data <- cdi_data %>%
select(-item, -value) %>%
distinct(child_id, session_num, .keep_all = T) %>%
group_by(child_id) %>%
mutate(subsequent_age = lead(age), by = "session_num",
subsequent_words_spoken = lead(words_spoken), by = "session_num",
delta_age = subsequent_age - age,
delta_words_spoken = subsequent_words_spoken - words_spoken) %>%
select(child_id, session_num, age, subsequent_age, delta_age, words_spoken, subsequent_words_spoken, delta_words_spoken)
# join together
full_df <- hypernyms_score_by_kid %>%
left_join(demographic_data)
summary(full_df)
## child_id session_num mean_hypernyms mean_freq
## Length:1398 Min. : 1.00 Min. : 7.068 Min. :6.945
## Class :character 1st Qu.: 2.00 1st Qu.: 7.764 1st Qu.:7.517
## Mode :character Median : 4.00 Median : 8.252 Median :7.644
## Mean : 5.04 Mean : 8.541 Mean :7.756
## 3rd Qu.: 7.00 3rd Qu.: 9.036 3rd Qu.:7.871
## Max. :12.00 Max. :13.400 Max. :9.660
## NA's :2
## mean_human_ratings age subsequent_age delta_age
## Min. :2.030 Min. :15.38 Min. :16.60 Min. :-5.0625
## 1st Qu.:2.509 1st Qu.:20.05 1st Qu.:20.78 1st Qu.: 0.8876
## Median :2.597 Median :22.12 Median :22.75 Median : 1.0191
## Mean :2.585 Mean :22.60 Mean :23.19 Mean : 1.1138
## 3rd Qu.:2.673 3rd Qu.:24.94 3rd Qu.:25.41 3rd Qu.: 1.2820
## Max. :3.250 Max. :33.07 Max. :33.07 Max. : 7.1663
## NA's :195 NA's :195
## words_spoken subsequent_words_spoken delta_words_spoken
## Min. : 2.0 Min. : 3.0 Min. :-286.00
## 1st Qu.: 83.0 1st Qu.:111.0 1st Qu.: 17.00
## Median :249.0 Median :297.0 Median : 36.00
## Mean :285.0 Mean :312.1 Mean : 48.46
## 3rd Qu.:471.8 3rd Qu.:500.0 3rd Qu.: 66.00
## Max. :680.0 Max. :680.0 Max. : 545.00
## NA's :195 NA's :195
resid_model <- lm(delta_words_spoken ~ delta_age + words_spoken + mean_freq,
data = full_df)
kid_level <- full_df %>%
modelr::add_residuals(resid_model,"delta_words_spoken_resid") %>%
select(child_id, delta_words_spoken_resid, mean_hypernyms) %>%
group_by(child_id) %>%
summarize_all(mean, na.rm = TRUE)
ggplot(kid_level, aes(x = mean_hypernyms,
y = delta_words_spoken_resid)) +
geom_point(alpha = .8) +
geom_smooth(alpha = .2) +
geom_smooth(method = "lm", color = "red", alpha = .2) +
ylab("N new words at t (residualized)") +
xlab("Mean hypernym level of vocabulary at t-1") +
ggtitle("Hypernym level at previous timepoint\npredicts number of new words learned") +
theme_classic(base_size = 14)
## Linear mixed model fit by REML ['lmerMod']
## Formula: delta_words_spoken ~ mean_hypernyms + delta_age + words_spoken +
## mean_freq + (session_num | child_id)
## Data: full_df
## Control: lmerControl(optimizer = "bobyqa")
##
## REML criterion at convergence: 12532.4
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -7.0944 -0.4855 -0.1355 0.3020 9.5272
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## child_id (Intercept) 219.515 14.816
## session_num 6.959 2.638 -1.00
## Residual 1948.642 44.143
## Number of obs: 1201, groups: child_id, 195
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 431.93255 38.00127 11.366
## mean_hypernyms -12.20446 2.14382 -5.693
## delta_age 32.33158 1.89772 17.037
## words_spoken -0.09603 0.01078 -8.910
## mean_freq -37.17925 4.60125 -8.080
##
## Correlation of Fixed Effects:
## (Intr) mn_hyp delt_g wrds_s
## mn_hyprnyms -0.299
## delta_age -0.077 0.007
## words_spokn -0.682 0.600 -0.022
## mean_freq -0.848 -0.246 0.020 0.337
## convergence code: 0
## boundary (singular) fit: see ?isSingular
## Linear mixed model fit by REML ['lmerMod']
## Formula: delta_words_spoken ~ mean_hypernyms * words_spoken + delta_age +
## mean_freq + (session_num | child_id)
## Data: full_df
## Control: lmerControl(optimizer = "bobyqa")
##
## REML criterion at convergence: 12466.8
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -6.9779 -0.4766 -0.1440 0.3318 9.6623
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## child_id (Intercept) 6.895 2.626
## session_num 3.782 1.945 -1.00
## Residual 1835.309 42.841
## Number of obs: 1201, groups: child_id, 195
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 269.94535 40.68661 6.635
## mean_hypernyms -17.47718 2.12860 -8.211
## words_spoken -1.34238 0.13767 -9.751
## delta_age 32.29388 1.83954 17.555
## mean_freq -13.52618 5.20383 -2.599
## mean_hypernyms:words_spoken 0.16612 0.01825 9.101
##
## Correlation of Fixed Effects:
## (Intr) mn_hyp wrds_s delt_g mn_frq
## mn_hyprnyms -0.114
## words_spokn 0.408 0.303
## delta_age -0.070 0.006 -0.005
## mean_freq -0.882 -0.362 -0.490 0.018
## mn_hyprny:_ -0.454 -0.260 -0.997 0.003 0.513
## fit warnings:
## Some predictor variables are on very different scales: consider rescaling
## convergence code: 0
## boundary (singular) fit: see ?isSingular
## Linear mixed model fit by REML ['lmerMod']
## Formula:
## delta_words_spoken ~ mean_human_ratings + delta_age + words_spoken +
## mean_freq + (session_num | child_id)
## Data: full_df
## Control: lmerControl(optimizer = "bobyqa")
##
## REML criterion at convergence: 12565.2
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -7.0949 -0.4738 -0.1249 0.3069 9.3995
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## child_id (Intercept) 299.84 17.316
## session_num 9.72 3.118 -1.00
## Residual 1954.96 44.215
## Number of obs: 1203, groups: child_id, 195
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 300.54703 40.26124 7.465
## mean_human_ratings 60.92191 15.62467 3.899
## delta_age 32.20772 1.90643 16.894
## words_spoken -0.09338 0.01251 -7.467
## mean_freq -54.05892 5.27929 -10.240
##
## Correlation of Fixed Effects:
## (Intr) mn_hm_ delt_g wrds_s
## mn_hmn_rtng -0.411
## delta_age -0.062 -0.020
## words_spokn -0.130 -0.711 -0.008
## mean_freq -0.563 -0.519 0.029 0.744
## fit warnings:
## Some predictor variables are on very different scales: consider rescaling
## convergence code: 0
## boundary (singular) fit: see ?isSingular
Both:
## Linear mixed model fit by REML ['lmerMod']
## Formula:
## delta_words_spoken ~ mean_hypernyms + mean_human_ratings + delta_age +
## words_spoken + scale(mean_freq) + (session_num | child_id)
## Data: full_df
## Control: lmerControl(optimizer = "bobyqa")
##
## REML criterion at convergence: 12517.8
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -7.0932 -0.4806 -0.1293 0.3077 9.6416
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## child_id (Intercept) 210.059 14.493
## session_num 6.085 2.467 -1.00
## Residual 1938.999 44.034
## Number of obs: 1201, groups: child_id, 195
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) -4.25817 53.40857 -0.080
## mean_hypernyms -9.99964 2.27482 -4.396
## mean_human_ratings 52.63135 17.39825 3.025
## delta_age 32.23877 1.89272 17.033
## words_spoken -0.12024 0.01313 -9.160
## scale(mean_freq) -17.66341 2.11649 -8.346
##
## Correlation of Fixed Effects:
## (Intr) mn_hyp mn_hm_ delt_g wrds_s
## mn_hyprnyms -0.668
## mn_hmn_rtng -0.924 0.340
## delta_age -0.030 0.003 -0.012
## words_spokn 0.324 0.263 -0.580 -0.010
## scl(mn_frq) 0.601 -0.389 -0.594 0.024 0.564
## fit warnings:
## Some predictor variables are on very different scales: consider rescaling
## convergence code: 0
## boundary (singular) fit: see ?isSingular