The goal of this markdown is to show demographic data analyses and percentile tables for the YNMO short form CDI data.

Preliminaries

Read in data.

ynmo1 <- suppressMessages(read_xlsx(path="raw_data/YNMO/Ynmo Norms Part 1 Sept 2022 - Final.xlsx", 
                                       sheet="Part 1 responses")) |>
  mutate(part = 1)
ynmo2 <- suppressMessages(read_xlsx(path="raw_data/YNMO/Ynmo Norms Part 2 Jan 2023 - Final .xlsx", 
                                       sheet=" Part 2 responses")) |>
  mutate(part = 2)
ynmo3 <- suppressMessages(read_xlsx(path="raw_data/YNMO/Ynmo Norms Part 3 Jan 2023 -Final.xlsx", 
                                       sheet="Part 3 Responses - Ynmo app")) |>
  mutate(part = 3)

raw_ynmo <- bind_rows(ynmo1, ynmo2, ynmo3)

Consider ages in the removed data.

qplot(raw_ynmo$age_in_months, fill = as.factor(raw_ynmo$part), binwidth = 6)

## Warning: `qplot()` was deprecated in ggplot2 3.4.0.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: Removed 3 rows containing non-finite values (`stat_bin()`).

ynmo <- raw_ynmo |> slice(-1) 

demo <- ynmo |>
  select(subject_ID : disability_or_language_disorders_other_family)

ynmo_wg <- ynmo |>
  filter(!(`cat...117` %in% c("yes","no"))) |>
  mutate(form = "WG")

ynmo_ws <- ynmo |>
  filter(`cat...117` %in% c("yes","no")) |>
  mutate(form = "WS")

Overall totals.

bind_rows(ynmo_wg, ynmo_ws) |>
  group_by(form) |>
  count() |>
  knitr::kable()

form	n
WG	93
WS	320

By part and form totals.

bind_rows(ynmo_wg, ynmo_ws) |>
  group_by(part, form) |>
  count() |>
  knitr::kable()

part	form	n
1	WG	56
1	WS	150
2	WG	6
2	WS	36
3	WG	31
3	WS	134

Words and Gestures

There is not enough data on WG to move froward yet.

ynmo_wg_long <- ynmo_wg |>
  select(subject_ID, age_in_months, sex, `cat...25`:`also...116`) |>
  pivot_longer( `cat...25`:`also...116`, names_to = "item", values_to = "value") |>
  mutate(produces = ifelse(value %in% "produces", 1, 0), 
         understands = ifelse(value %in% c("understands","produces"), 1, 0))


ynmo_wg_ms <- ynmo_wg_long |>
  group_by(subject_ID, age_in_months, sex) |>
  summarise(understands = sum(understands), 
            produces = sum(produces))

## `summarise()` has grouped output by 'subject_ID', 'age_in_months'. You can
## override using the `.groups` argument.

Production

92 items on wg, 100 on WS

max_vocab = 92
quantiles = c(10, 25, 50, 75, 90)

ynmo_wg_ms$understands_01 <- .001 + ynmo_wg_ms$understands/(max_vocab+1) 
ynmo_wg_ms$produces_01 <- .001 + ynmo_wg_ms$produces/(max_vocab+1) 

prod_mod_data <- filter(ynmo_wg_ms, age_in_months >= 8, age_in_months < 24)

ynmo_wg_prod_mod <- gamlss(produces_01 ~ pbm(age_in_months, lambda = 10000),
              sigma.formula = ~ pbm(age_in_months, lambda = 10000),
              family = BE, 
              control = gamlss.control(c.crit = .1),
              data = prod_mod_data)

## GAMLSS-RS iteration 1: Global Deviance = -455.4926 
## GAMLSS-RS iteration 2: Global Deviance = -463.355 
## GAMLSS-RS iteration 3: Global Deviance = -468.6023 
## GAMLSS-RS iteration 4: Global Deviance = -472.226 
## GAMLSS-RS iteration 5: Global Deviance = -474.528 
## GAMLSS-RS iteration 6: Global Deviance = -475.8397 
## GAMLSS-RS iteration 7: Global Deviance = -476.5316 
## GAMLSS-RS iteration 8: Global Deviance = -476.8731 
## GAMLSS-RS iteration 9: Global Deviance = -477.0334 
## GAMLSS-RS iteration 10: Global Deviance = -477.106

centiles <- centiles.pred(ynmo_wg_prod_mod, cent = quantiles,
              xname = "age_in_months", xvalues = 8:18) |>
  rename(age_in_months = x) |>
  pivot_longer(-age_in_months, names_to = "percentile", values_to = "predicted") |>
  mutate(predicted = (predicted * (max_vocab+1)) - .001)

## Warning in predict.gamlss(obj, what = "mu", newdata = newx, type = "response", : There is a discrepancy  between the original and the re-fit 
##  used to achieve 'safe' predictions 
##

## new prediction 
## new prediction

ggplot(prod_mod_data, aes(x = age_in_months, y = produces)) + 
  geom_jitter(width = .2, height = 0, alpha = .2) + 
  geom_line(data = centiles, aes(x = age_in_months, y = predicted, col = percentile)) +
  xlim(8,18)

## Warning: Removed 1 rows containing missing values (`geom_point()`).

Comprehension

comp_mod_data <- filter(ynmo_wg_ms, age_in_months >= 8, age_in_months < 24)

ynmo_wg_comp_mod <- gamlss(understands_01 ~ pbm(age_in_months, lambda = 10000),
              sigma.formula = ~ pbm(age_in_months, lambda = 10000),
              family = BE, 
              control = gamlss.control(c.crit = .1),
              data = comp_mod_data)

## GAMLSS-RS iteration 1: Global Deviance = -140.4853 
## GAMLSS-RS iteration 2: Global Deviance = -140.4908

centiles <- centiles.pred(ynmo_wg_comp_mod, cent = quantiles,
              xname = "age_in_months", xvalues = 8:18) |>
  rename(age_in_months = x) |>
  pivot_longer(-age_in_months, names_to = "percentile", values_to = "predicted") |>
  mutate(predicted = (predicted * (max_vocab+1)) - .001)

## new prediction 
## new prediction

ggplot(ynmo_wg_ms, aes(x = age_in_months, y = understands)) + 
  geom_jitter(width = .2, height = 0, alpha = .2) + 
  geom_line(data = centiles, aes(x = age_in_months, y = predicted, col = percentile)) + xlim(8,18)

## Warning: Removed 5 rows containing missing values (`geom_point()`).

Words and Sentences

ynmo_ws_long <- ynmo_ws |>
  select(subject_ID, age_in_months, sex, `cat...117`:`if.2`, combine_words) |>
  pivot_longer( `cat...117`:`if.2`, names_to = "item", values_to = "value") |>
  mutate(produces = ifelse(value %in% "yes", 1, 0))


ynmo_ws_ms <- ynmo_ws_long |>
  group_by(subject_ID, age_in_months, sex) |>
  summarise(produces = sum(produces), combine_words = combine_words[1])

## `summarise()` has grouped output by 'subject_ID', 'age_in_months'. You can
## override using the `.groups` argument.

ggplot(filter(ynmo_ws_ms, age_in_months >= 16, age_in_months <= 36), 
       aes(x = age_in_months, y = produces)) + 
  geom_jitter(width = .2, height = 0, alpha = .2) + 
  geom_smooth()

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

Consider sex effects.

ggplot(filter(ynmo_ws_ms, age_in_months >= 16, age_in_months <= 36), 
       aes(x = age_in_months, y = produces, col = sex)) + 
  geom_jitter(width = .2, height = 0, alpha = .2) + 
  geom_smooth()

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

Now with percentiles.

max_vocab = 100
quantiles = c(10, 25, 50, 75, 90)

ynmo_ws_ms$produces_01 <- .001 + ynmo_ws_ms$produces/(max_vocab+1) 


ws_mod_data <- filter(ynmo_ws_ms, age_in_months >= 16, age_in_months <= 36)

ynmo_ws_prod_mod <- gamlss(produces_01 ~ pbm(age_in_months, lambda = 10000),
              sigma.formula = ~ pbm(age_in_months, lambda = 10000),
              family = BE, 
              control = gamlss.control(c.crit = .1),
              data = ws_mod_data)

## GAMLSS-RS iteration 1: Global Deviance = -164.0857 
## GAMLSS-RS iteration 2: Global Deviance = -167.8864 
## GAMLSS-RS iteration 3: Global Deviance = -167.87

centiles <- centiles.pred(ynmo_ws_prod_mod, cent = quantiles,
              xname = "age_in_months", xvalues = 16:36) |>
  rename(age_in_months = x) |>
  pivot_longer(-age_in_months, names_to = "percentile", values_to = "predicted") |>
  mutate(predicted = (predicted * (max_vocab+1)) - .001)

## new prediction

## Warning in predict.gamlss(obj, what = "sigma", newdata = newx, type = "response", : There is a discrepancy  between the original and the re-fit 
##  used to achieve 'safe' predictions 
##

## new prediction

ggplot(ynmo_ws_ms, aes(x = age_in_months, y = produces)) + 
  geom_jitter(width = .2, height = 0, alpha = .2) + 
  geom_line(data = centiles, aes(x = age_in_months, y = predicted, col = percentile)) + 
  xlim(16,36) +
  ylab("Total productive vocabulary") + 
  xlab("Age (months)") +
  scale_color_discrete(name = "Percentile") + 
  ggthemes::theme_few()

## Warning: Removed 25 rows containing missing values (`geom_point()`).

ggsave(here("graphs","percentiles.png"), width = 6, height = 4)

## Warning: Removed 26 rows containing missing values (`geom_point()`).

ynmo_ws_prod_sex_mod <- gamlss(produces_01 ~ pbm(age_in_months, lambda = 10000) * sex,
              sigma.formula = ~ pbm(age_in_months, lambda = 10000),
              family = BE, 
              control = gamlss.control(c.crit = .1),
              data = ws_mod_data)

## GAMLSS-RS iteration 1: Global Deviance = -174.8319 
## GAMLSS-RS iteration 2: Global Deviance = -179.108 
## GAMLSS-RS iteration 3: Global Deviance = -179.1156

pred_data <- expand_grid(age_in_months = 16:36, 
                         sex = c("male","female"))
pred_data$pred <- predict(ynmo_ws_prod_sex_mod, 
                          type = "response",
                          newdata = pred_data)

## new prediction

pred_data <- pred_data |>
  mutate(vocab_pred = (pred * (max_vocab+1)) - .001, 
         sex = ifelse(sex == "female","Female","Male")) 


ggplot(ynmo_ws_ms |>
         mutate(sex = ifelse(sex == "female","Female","Male")),
       aes(x = age_in_months, y = produces, col = sex)) + 
  geom_jitter(width = .2, height = 0, alpha = .2) + 
  geom_line(data = pred_data, aes(x = age_in_months, 
                                  y = vocab_pred, col = sex)) + 
  xlim(16,36) +
  ylab("Total productive vocabulary") + 
  xlab("Age (months)") +
  ggthemes::scale_color_solarized(name = "Sex") + 
  ggthemes::theme_few()

## Warning: Removed 24 rows containing missing values (`geom_point()`).

ggsave(here("graphs","sex.png"), width = 6, height = 4)

## Warning: Removed 24 rows containing missing values (`geom_point()`).

Combining words

combines_data <- ws_mod_data |>
  mutate(`not yet` = ifelse(combine_words == "not yet", 1, 0),
         sometimes = ifelse(combine_words == "sometimes", 1, 0), 
         often = ifelse(combine_words == "often", 1, 0)) |>
  pivot_longer(`not yet`:often, names_to = "combines_words", 
               values_to = "value") |>
  mutate(combines_words = factor(combines_words, levels = c("not yet","sometimes","often")))

ggplot(combines_data, 
       aes(x = age_in_months, y = value, col = combines_words)) + 
  geom_smooth(method = "glm", formula = y ~ x, method.args = list(family = "binomial")) + 
  ylab("Proportion of children") + 
  xlab("Age (months)") +
  ggthemes::scale_color_solarized(name = "Combines words") + 
  ggthemes::theme_few()

ggsave(here("graphs","combines.png"), width = 6, height = 4)

ynmo_ws_prod_combine_mod <- gamlss(produces_01 ~ pbm(age_in_months, lambda = 10000) + combine_words,
              sigma.formula = ~ pbm(age_in_months, lambda = 10000),
              family = BE, 
              control = gamlss.control(c.crit = .1),
              data = ws_mod_data)

## GAMLSS-RS iteration 1: Global Deviance = -358.0044 
## GAMLSS-RS iteration 2: Global Deviance = -362.5437 
## GAMLSS-RS iteration 3: Global Deviance = -362.9447 
## GAMLSS-RS iteration 4: Global Deviance = -362.9871

pred_data <- expand_grid(age_in_months = 16:36, 
                         combine_words = c("not yet","sometimes","often"))
pred_data$pred <- predict(ynmo_ws_prod_combine_mod, 
                          type = "response",
                          newdata = pred_data)

## new prediction

pred_data <- pred_data |>
  mutate(vocab_pred = (pred * (max_vocab+1)) - .001) |>
  mutate(combine_words = factor(combine_words, 
  levels = c("not yet","sometimes","often")))


ggplot(ynmo_ws_ms |>
  mutate(combine_words = factor(combine_words, 
                                levels = c("not yet",
                                           "sometimes","often"))), 
  aes(x = age_in_months, y = produces, 
                       col = combine_words)) + 
  geom_jitter(width = .2, height = 0, alpha = .2) + 
  geom_line(data = pred_data, aes(x = age_in_months, 
                                  y = vocab_pred, col = combine_words)) + 
  xlim(16,36) +
  ylab("Total productive vocabulary") + 
  xlab("Age (months)") +
  ggthemes::scale_color_solarized(name = "Combines words") + 
  ggthemes::theme_few()

## Warning: Removed 25 rows containing missing values (`geom_point()`).

ggsave(here("graphs","combine_vocab.png"), width = 6, height = 4)

## Warning: Removed 26 rows containing missing values (`geom_point()`).

summary(ynmo_ws_prod_combine_mod)

## ******************************************************************
## Family:  c("BE", "Beta") 
## 
## Call:  gamlss(formula = produces_01 ~ pbm(age_in_months, lambda = 10000) +  
##     combine_words, sigma.formula = ~pbm(age_in_months,  
##     lambda = 10000), family = BE, data = ws_mod_data,  
##     control = gamlss.control(c.crit = 0.1)) 
## 
## Fitting method: RS() 
## 
## ------------------------------------------------------------------
## Mu link function:  logit
## Mu Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             -1.6822     0.1126 -14.943  < 2e-16 ***
## combine_wordsoften       2.5078     0.1478  16.969  < 2e-16 ***
## combine_wordssometimes   1.2445     0.1491   8.344 2.73e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## ------------------------------------------------------------------
## Sigma link function:  logit
## Sigma Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept) -0.15130    0.05639  -2.683  0.00771 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## ------------------------------------------------------------------
## NOTE: Additive smoothing terms exist in the formulas: 
##  i) Std. Error for smoothers are for the linear effect only. 
## ii) Std. Error for the linear terms maybe are not accurate. 
## ------------------------------------------------------------------
## No. of observations in the fit:  304 
## Degrees of Freedom for the fit:  6.896795
##       Residual Deg. of Freedom:  297.1032 
##                       at cycle:  4 
##  
## Global Deviance:     -362.9871 
##             AIC:     -349.1935 
##             SBC:     -323.5579 
## ******************************************************************

other analysis

Compute MADM.

ynmo_ws_ms |>
  mutate(age_binned = floor(age_in_months / 2)*2) |>
  group_by(age_binned) |>
  summarise(madm = mean(abs(produces - median(produces))) / median(produces)) |>
  ggplot(aes(x = age_binned, y = madm)) + 
  geom_point() + 
  geom_smooth(method = "lm")

## `geom_smooth()` using formula = 'y ~ x'

Output percentile tables

all_quants = seq(5,95,5)

cents <- centiles.pred(ynmo_ws_prod_mod, cent = all_quants,
              xname = "age_in_months", xvalues = 16:36) |>
  rename(age_in_months = x) |>
  pivot_longer(-age_in_months, names_to = "percentile", values_to = "produces") |>
  mutate(produces = round((produces * (max_vocab+1)) - .001), 
         produces = ifelse(produces > 100, 100, produces))  |>
  pivot_wider(names_from = "percentile", values_from = "produces")

## new prediction

## Warning in predict.gamlss(obj, what = "sigma", newdata = newx, type = "response", : There is a discrepancy  between the original and the re-fit 
##  used to achieve 'safe' predictions 
##

## new prediction

write_csv(cents, "wg_centiles_2023-05-04.csv")

## Registered S3 method overwritten by 'bit':
##   method   from  
##   print.ri gamlss