The goal of this markdown is to show demographic data analyses and percentile tables for the YNMO short form CDI data.
Read in data.
ynmo1 <- suppressMessages(read_xlsx(path="raw_data/YNMO/Ynmo Norms Part 1 Sept 2022 - Final.xlsx",
sheet="Part 1 responses")) |>
mutate(part = 1)
ynmo2 <- suppressMessages(read_xlsx(path="raw_data/YNMO/Ynmo Norms Part 2 Jan 2023 - Final .xlsx",
sheet=" Part 2 responses")) |>
mutate(part = 2)
ynmo3 <- suppressMessages(read_xlsx(path="raw_data/YNMO/Ynmo Norms Part 3 Jan 2023 -Final.xlsx",
sheet="Part 3 Responses - Ynmo app")) |>
mutate(part = 3)
raw_ynmo <- bind_rows(ynmo1, ynmo2, ynmo3)
Consider ages in the removed data.
qplot(raw_ynmo$age_in_months, fill = as.factor(raw_ynmo$part), binwidth = 6)
## Warning: `qplot()` was deprecated in ggplot2 3.4.0.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 3 rows containing non-finite values (`stat_bin()`).
ynmo <- raw_ynmo |> slice(-1)
demo <- ynmo |>
select(subject_ID : disability_or_language_disorders_other_family)
ynmo_wg <- ynmo |>
filter(!(`cat...117` %in% c("yes","no"))) |>
mutate(form = "WG")
ynmo_ws <- ynmo |>
filter(`cat...117` %in% c("yes","no")) |>
mutate(form = "WS")
Overall totals.
bind_rows(ynmo_wg, ynmo_ws) |>
group_by(form) |>
count() |>
knitr::kable()
form | n |
---|---|
WG | 93 |
WS | 320 |
By part and form totals.
bind_rows(ynmo_wg, ynmo_ws) |>
group_by(part, form) |>
count() |>
knitr::kable()
part | form | n |
---|---|---|
1 | WG | 56 |
1 | WS | 150 |
2 | WG | 6 |
2 | WS | 36 |
3 | WG | 31 |
3 | WS | 134 |
There is not enough data on WG to move froward yet.
ynmo_wg_long <- ynmo_wg |>
select(subject_ID, age_in_months, sex, `cat...25`:`also...116`) |>
pivot_longer( `cat...25`:`also...116`, names_to = "item", values_to = "value") |>
mutate(produces = ifelse(value %in% "produces", 1, 0),
understands = ifelse(value %in% c("understands","produces"), 1, 0))
ynmo_wg_ms <- ynmo_wg_long |>
group_by(subject_ID, age_in_months, sex) |>
summarise(understands = sum(understands),
produces = sum(produces))
## `summarise()` has grouped output by 'subject_ID', 'age_in_months'. You can
## override using the `.groups` argument.
92 items on wg, 100 on WS
max_vocab = 92
quantiles = c(10, 25, 50, 75, 90)
ynmo_wg_ms$understands_01 <- .001 + ynmo_wg_ms$understands/(max_vocab+1)
ynmo_wg_ms$produces_01 <- .001 + ynmo_wg_ms$produces/(max_vocab+1)
prod_mod_data <- filter(ynmo_wg_ms, age_in_months >= 8, age_in_months < 24)
ynmo_wg_prod_mod <- gamlss(produces_01 ~ pbm(age_in_months, lambda = 10000),
sigma.formula = ~ pbm(age_in_months, lambda = 10000),
family = BE,
control = gamlss.control(c.crit = .1),
data = prod_mod_data)
## GAMLSS-RS iteration 1: Global Deviance = -455.4926
## GAMLSS-RS iteration 2: Global Deviance = -463.355
## GAMLSS-RS iteration 3: Global Deviance = -468.6023
## GAMLSS-RS iteration 4: Global Deviance = -472.226
## GAMLSS-RS iteration 5: Global Deviance = -474.528
## GAMLSS-RS iteration 6: Global Deviance = -475.8397
## GAMLSS-RS iteration 7: Global Deviance = -476.5316
## GAMLSS-RS iteration 8: Global Deviance = -476.8731
## GAMLSS-RS iteration 9: Global Deviance = -477.0334
## GAMLSS-RS iteration 10: Global Deviance = -477.106
centiles <- centiles.pred(ynmo_wg_prod_mod, cent = quantiles,
xname = "age_in_months", xvalues = 8:18) |>
rename(age_in_months = x) |>
pivot_longer(-age_in_months, names_to = "percentile", values_to = "predicted") |>
mutate(predicted = (predicted * (max_vocab+1)) - .001)
## Warning in predict.gamlss(obj, what = "mu", newdata = newx, type = "response", : There is a discrepancy between the original and the re-fit
## used to achieve 'safe' predictions
##
## new prediction
## new prediction
ggplot(prod_mod_data, aes(x = age_in_months, y = produces)) +
geom_jitter(width = .2, height = 0, alpha = .2) +
geom_line(data = centiles, aes(x = age_in_months, y = predicted, col = percentile)) +
xlim(8,18)
## Warning: Removed 1 rows containing missing values (`geom_point()`).
comp_mod_data <- filter(ynmo_wg_ms, age_in_months >= 8, age_in_months < 24)
ynmo_wg_comp_mod <- gamlss(understands_01 ~ pbm(age_in_months, lambda = 10000),
sigma.formula = ~ pbm(age_in_months, lambda = 10000),
family = BE,
control = gamlss.control(c.crit = .1),
data = comp_mod_data)
## GAMLSS-RS iteration 1: Global Deviance = -140.4853
## GAMLSS-RS iteration 2: Global Deviance = -140.4908
centiles <- centiles.pred(ynmo_wg_comp_mod, cent = quantiles,
xname = "age_in_months", xvalues = 8:18) |>
rename(age_in_months = x) |>
pivot_longer(-age_in_months, names_to = "percentile", values_to = "predicted") |>
mutate(predicted = (predicted * (max_vocab+1)) - .001)
## new prediction
## new prediction
ggplot(ynmo_wg_ms, aes(x = age_in_months, y = understands)) +
geom_jitter(width = .2, height = 0, alpha = .2) +
geom_line(data = centiles, aes(x = age_in_months, y = predicted, col = percentile)) + xlim(8,18)
## Warning: Removed 5 rows containing missing values (`geom_point()`).
ynmo_ws_long <- ynmo_ws |>
select(subject_ID, age_in_months, sex, `cat...117`:`if.2`, combine_words) |>
pivot_longer( `cat...117`:`if.2`, names_to = "item", values_to = "value") |>
mutate(produces = ifelse(value %in% "yes", 1, 0))
ynmo_ws_ms <- ynmo_ws_long |>
group_by(subject_ID, age_in_months, sex) |>
summarise(produces = sum(produces), combine_words = combine_words[1])
## `summarise()` has grouped output by 'subject_ID', 'age_in_months'. You can
## override using the `.groups` argument.
ggplot(filter(ynmo_ws_ms, age_in_months >= 16, age_in_months <= 36),
aes(x = age_in_months, y = produces)) +
geom_jitter(width = .2, height = 0, alpha = .2) +
geom_smooth()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
Consider sex effects.
ggplot(filter(ynmo_ws_ms, age_in_months >= 16, age_in_months <= 36),
aes(x = age_in_months, y = produces, col = sex)) +
geom_jitter(width = .2, height = 0, alpha = .2) +
geom_smooth()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
Now with percentiles.
max_vocab = 100
quantiles = c(10, 25, 50, 75, 90)
ynmo_ws_ms$produces_01 <- .001 + ynmo_ws_ms$produces/(max_vocab+1)
ws_mod_data <- filter(ynmo_ws_ms, age_in_months >= 16, age_in_months <= 36)
ynmo_ws_prod_mod <- gamlss(produces_01 ~ pbm(age_in_months, lambda = 10000),
sigma.formula = ~ pbm(age_in_months, lambda = 10000),
family = BE,
control = gamlss.control(c.crit = .1),
data = ws_mod_data)
## GAMLSS-RS iteration 1: Global Deviance = -164.0857
## GAMLSS-RS iteration 2: Global Deviance = -167.8864
## GAMLSS-RS iteration 3: Global Deviance = -167.87
centiles <- centiles.pred(ynmo_ws_prod_mod, cent = quantiles,
xname = "age_in_months", xvalues = 16:36) |>
rename(age_in_months = x) |>
pivot_longer(-age_in_months, names_to = "percentile", values_to = "predicted") |>
mutate(predicted = (predicted * (max_vocab+1)) - .001)
## new prediction
## Warning in predict.gamlss(obj, what = "sigma", newdata = newx, type = "response", : There is a discrepancy between the original and the re-fit
## used to achieve 'safe' predictions
##
## new prediction
ggplot(ynmo_ws_ms, aes(x = age_in_months, y = produces)) +
geom_jitter(width = .2, height = 0, alpha = .2) +
geom_line(data = centiles, aes(x = age_in_months, y = predicted, col = percentile)) +
xlim(16,36) +
ylab("Total productive vocabulary") +
xlab("Age (months)") +
scale_color_discrete(name = "Percentile") +
ggthemes::theme_few()
## Warning: Removed 25 rows containing missing values (`geom_point()`).
ggsave(here("graphs","percentiles.png"), width = 6, height = 4)
## Warning: Removed 26 rows containing missing values (`geom_point()`).
ynmo_ws_prod_sex_mod <- gamlss(produces_01 ~ pbm(age_in_months, lambda = 10000) * sex,
sigma.formula = ~ pbm(age_in_months, lambda = 10000),
family = BE,
control = gamlss.control(c.crit = .1),
data = ws_mod_data)
## GAMLSS-RS iteration 1: Global Deviance = -174.8319
## GAMLSS-RS iteration 2: Global Deviance = -179.108
## GAMLSS-RS iteration 3: Global Deviance = -179.1156
pred_data <- expand_grid(age_in_months = 16:36,
sex = c("male","female"))
pred_data$pred <- predict(ynmo_ws_prod_sex_mod,
type = "response",
newdata = pred_data)
## new prediction
pred_data <- pred_data |>
mutate(vocab_pred = (pred * (max_vocab+1)) - .001,
sex = ifelse(sex == "female","Female","Male"))
ggplot(ynmo_ws_ms |>
mutate(sex = ifelse(sex == "female","Female","Male")),
aes(x = age_in_months, y = produces, col = sex)) +
geom_jitter(width = .2, height = 0, alpha = .2) +
geom_line(data = pred_data, aes(x = age_in_months,
y = vocab_pred, col = sex)) +
xlim(16,36) +
ylab("Total productive vocabulary") +
xlab("Age (months)") +
ggthemes::scale_color_solarized(name = "Sex") +
ggthemes::theme_few()
## Warning: Removed 24 rows containing missing values (`geom_point()`).
ggsave(here("graphs","sex.png"), width = 6, height = 4)
## Warning: Removed 24 rows containing missing values (`geom_point()`).
combines_data <- ws_mod_data |>
mutate(`not yet` = ifelse(combine_words == "not yet", 1, 0),
sometimes = ifelse(combine_words == "sometimes", 1, 0),
often = ifelse(combine_words == "often", 1, 0)) |>
pivot_longer(`not yet`:often, names_to = "combines_words",
values_to = "value") |>
mutate(combines_words = factor(combines_words, levels = c("not yet","sometimes","often")))
ggplot(combines_data,
aes(x = age_in_months, y = value, col = combines_words)) +
geom_smooth(method = "glm", formula = y ~ x, method.args = list(family = "binomial")) +
ylab("Proportion of children") +
xlab("Age (months)") +
ggthemes::scale_color_solarized(name = "Combines words") +
ggthemes::theme_few()
ggsave(here("graphs","combines.png"), width = 6, height = 4)
ynmo_ws_prod_combine_mod <- gamlss(produces_01 ~ pbm(age_in_months, lambda = 10000) + combine_words,
sigma.formula = ~ pbm(age_in_months, lambda = 10000),
family = BE,
control = gamlss.control(c.crit = .1),
data = ws_mod_data)
## GAMLSS-RS iteration 1: Global Deviance = -358.0044
## GAMLSS-RS iteration 2: Global Deviance = -362.5437
## GAMLSS-RS iteration 3: Global Deviance = -362.9447
## GAMLSS-RS iteration 4: Global Deviance = -362.9871
pred_data <- expand_grid(age_in_months = 16:36,
combine_words = c("not yet","sometimes","often"))
pred_data$pred <- predict(ynmo_ws_prod_combine_mod,
type = "response",
newdata = pred_data)
## new prediction
pred_data <- pred_data |>
mutate(vocab_pred = (pred * (max_vocab+1)) - .001) |>
mutate(combine_words = factor(combine_words,
levels = c("not yet","sometimes","often")))
ggplot(ynmo_ws_ms |>
mutate(combine_words = factor(combine_words,
levels = c("not yet",
"sometimes","often"))),
aes(x = age_in_months, y = produces,
col = combine_words)) +
geom_jitter(width = .2, height = 0, alpha = .2) +
geom_line(data = pred_data, aes(x = age_in_months,
y = vocab_pred, col = combine_words)) +
xlim(16,36) +
ylab("Total productive vocabulary") +
xlab("Age (months)") +
ggthemes::scale_color_solarized(name = "Combines words") +
ggthemes::theme_few()
## Warning: Removed 25 rows containing missing values (`geom_point()`).
ggsave(here("graphs","combine_vocab.png"), width = 6, height = 4)
## Warning: Removed 26 rows containing missing values (`geom_point()`).
summary(ynmo_ws_prod_combine_mod)
## ******************************************************************
## Family: c("BE", "Beta")
##
## Call: gamlss(formula = produces_01 ~ pbm(age_in_months, lambda = 10000) +
## combine_words, sigma.formula = ~pbm(age_in_months,
## lambda = 10000), family = BE, data = ws_mod_data,
## control = gamlss.control(c.crit = 0.1))
##
## Fitting method: RS()
##
## ------------------------------------------------------------------
## Mu link function: logit
## Mu Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.6822 0.1126 -14.943 < 2e-16 ***
## combine_wordsoften 2.5078 0.1478 16.969 < 2e-16 ***
## combine_wordssometimes 1.2445 0.1491 8.344 2.73e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## ------------------------------------------------------------------
## Sigma link function: logit
## Sigma Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.15130 0.05639 -2.683 0.00771 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## ------------------------------------------------------------------
## NOTE: Additive smoothing terms exist in the formulas:
## i) Std. Error for smoothers are for the linear effect only.
## ii) Std. Error for the linear terms maybe are not accurate.
## ------------------------------------------------------------------
## No. of observations in the fit: 304
## Degrees of Freedom for the fit: 6.896795
## Residual Deg. of Freedom: 297.1032
## at cycle: 4
##
## Global Deviance: -362.9871
## AIC: -349.1935
## SBC: -323.5579
## ******************************************************************
Compute MADM.
ynmo_ws_ms |>
mutate(age_binned = floor(age_in_months / 2)*2) |>
group_by(age_binned) |>
summarise(madm = mean(abs(produces - median(produces))) / median(produces)) |>
ggplot(aes(x = age_binned, y = madm)) +
geom_point() +
geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'
Output percentile tables
all_quants = seq(5,95,5)
cents <- centiles.pred(ynmo_ws_prod_mod, cent = all_quants,
xname = "age_in_months", xvalues = 16:36) |>
rename(age_in_months = x) |>
pivot_longer(-age_in_months, names_to = "percentile", values_to = "produces") |>
mutate(produces = round((produces * (max_vocab+1)) - .001),
produces = ifelse(produces > 100, 100, produces)) |>
pivot_wider(names_from = "percentile", values_from = "produces")
## new prediction
## Warning in predict.gamlss(obj, what = "sigma", newdata = newx, type = "response", : There is a discrepancy between the original and the re-fit
## used to achieve 'safe' predictions
##
## new prediction
write_csv(cents, "wg_centiles_2023-05-04.csv")
## Registered S3 method overwritten by 'bit':
## method from
## print.ri gamlss
Output percentile tables
ws_mod_data_m <- filter(ws_mod_data, sex == "male")
ynmo_ws_prod_mod_m <- gamlss(produces_01 ~ pbm(age_in_months, lambda = 10000),
sigma.formula = ~ pbm(age_in_months, lambda = 10000),
family = BE,
control = gamlss.control(c.crit = .1),
data = ws_mod_data_m)
## GAMLSS-RS iteration 1: Global Deviance = -105.0276
## GAMLSS-RS iteration 2: Global Deviance = -106.1758
## GAMLSS-RS iteration 3: Global Deviance = -106.2372
ws_mod_data_f <- filter(ws_mod_data, sex == "female")
ynmo_ws_prod_mod_f <- gamlss(produces_01 ~ pbm(age_in_months, lambda = 10000),
sigma.formula = ~ pbm(age_in_months, lambda = 10000),
family = BE,
control = gamlss.control(c.crit = .1),
data = ws_mod_data_f)
## GAMLSS-RS iteration 1: Global Deviance = -74.8683
## GAMLSS-RS iteration 2: Global Deviance = -76.7965
## GAMLSS-RS iteration 3: Global Deviance = -76.8003
cents_m <- centiles.pred(ynmo_ws_prod_mod_m, cent = all_quants,
xname = "age_in_months", xvalues = 16:36) |>
rename(age_in_months = x) |>
pivot_longer(-age_in_months, names_to = "percentile", values_to = "produces") |>
mutate(produces = round((produces * (max_vocab+1)) - .001),
produces = ifelse(produces > 100, 100, produces)) |>
pivot_wider(names_from = "percentile", values_from = "produces")
## Warning in predict.gamlss(obj, what = "mu", newdata = newx, type = "response", : There is a discrepancy between the original and the re-fit
## used to achieve 'safe' predictions
##
## new prediction
## Warning in predict.gamlss(obj, what = "sigma", newdata = newx, type = "response", : There is a discrepancy between the original and the re-fit
## used to achieve 'safe' predictions
##
## new prediction
cents_f <- centiles.pred(ynmo_ws_prod_mod_f, cent = all_quants,
xname = "age_in_months", xvalues = 16:36) |>
rename(age_in_months = x) |>
pivot_longer(-age_in_months, names_to = "percentile", values_to = "produces") |>
mutate(produces = round((produces * (max_vocab+1)) - .001),
produces = ifelse(produces > 100, 100, produces)) |>
pivot_wider(names_from = "percentile", values_from = "produces")
## new prediction
## Warning in predict.gamlss(obj, what = "sigma", newdata = newx, type = "response", : There is a discrepancy between the original and the re-fit
## used to achieve 'safe' predictions
##
## new prediction
write_csv(cents_m, "wg_centiles_male_2023-05-04.csv")
write_csv(cents_f, "wg_centiles_female_2023-05-04.csv")