---
title: "Modelling ceo ~ "
author: "SU"
date: 2024-05-29
date-modified: last-modified
language:
title-block-published: "CREATED"
title-block-modified: "UPDATED"
format:
html:
toc: true
toc-expand: 3
code-fold: true
code-tools: true
editor: visual
execute:
echo: false
cache: true
warning: false
message: false
---
# Packages
```{r}
# Load required libraries with pacman; installs them if not already installed
pacman::p_load(tidyverse, # tools for data science
janitor, # for data cleaning and tables
here, # for reproducible research
gtsummary, # for tables
sjPlot, # for the figures
report,
viridis,
easystats, # check https://easystats.github.io/easystats/
scales,
lubridate,
MASS, # for negative binomial regression
pscl, # for zeo inflated model
geepack # for GEE
)
```
```{r}
theme_set(theme_minimal())
```
```{r}
# df <- read_rds(here("data", "df.rds"))
df <- read_rds(here("data", "df_for_modelling.rds"))
```
# Preparing the data
## Relevel for the modelling
```{r}
df <- df |>
mutate(cantidad_de_momentos_de_azucar_al_dia = fct_relevel(cantidad_de_momentos_de_azucar_al_dia,
"1 - 3 veces/día"),
frecuencia_de_cepillado = fct_relevel(frecuencia_de_cepillado,
"2 o más veces/día"),
cepillado_nocturno = fct_relevel(cepillado_nocturno, "Siempre"))
```
```{r}
df <- df |>
mutate(uso_de_pasta_con_1450_ppm = fct_relevel(uso_de_pasta_con_1450_ppm, "Si"))
```
```{r}
# check this part, to correct missing data
# set.seed(123)
# posiciones_no <- which(df$uso_de_pasta_con_1450_ppm == "No")
# df$uso_de_pasta_con_1450_ppm[sample(posiciones_no, length(posiciones_no) / 2
# )] <- "Si" # Cambiar aleatoriamente la mitad
# rm(posiciones_no)
```
```{r}
# Define the levels in the desired order
levels_order <- c("Nunca", "Nunca / Ocasionalmente", "Nunca / A veces", "A veces / Siempre", "Siempre", "Siempre / Más de una vez al día")
# Relevel the variable
df <- df %>%
mutate(cantidad_de_veces_que_se_duerme_con_mamadera = fct_relevel(cantidad_de_veces_que_se_duerme_con_mamadera, levels_order))
```
```{r}
rm(levels_order)
```
```{r}
df <- df %>%
mutate(cepillado_nocturno = str_trim(cepillado_nocturno)) |>
mutate(cepillado_nocturno = fct_recode(cepillado_nocturno,
"A veces / Nunca" = "A veces/Nunca"),
cepillado_nocturno = fct_relevel(cepillado_nocturno,
"Siempre"))
```
```{r}
df <- df |>
mutate(
cantidad_de_veces_que_se_duerme_con_mamadera = case_when(
cantidad_de_veces_que_se_duerme_con_mamadera %in% c("Nunca", "Nunca / Ocasionalmente") ~ "Nunca / rara vez",
cantidad_de_veces_que_se_duerme_con_mamadera %in% c("Nunca / A veces", "A veces / Siempre", "Siempre", "Siempre / Más de una vez al día") ~ "Frecuentemente / siempre",
TRUE ~ cantidad_de_veces_que_se_duerme_con_mamadera # Keep any other categories unchanged, if they exist
)
)
```
```{r}
df <- df |>
mutate(cantidad_de_veces_que_se_duerme_con_mamadera = fct_relevel(cantidad_de_veces_que_se_duerme_con_mamadera,
"Nunca / rara vez",
"Frecuentemente / siempre"))
```
# Models
# FINAL MODELLING
## Prepare the data
```{r}
# glimpse(df)
```
Create a new Lactancia materna final with three levels
```{r}
df$tipo_de_alimentacion_agrupada <- dplyr::case_when(
df$tipo_de_alimentacion_infantil_final == "Lactancia Materna Exclusiva" ~ "Lactancia Materna Exclusiva",
df$tipo_de_alimentacion_infantil_final %in% c("Lactancia Materna Predominante", "Formula Lactea Predominante") ~ "Mixta",
df$tipo_de_alimentacion_infantil_final == "Formula Lactea Exclusiva" ~ "Formula Lactea Exclusiva"
)
# order
df$tipo_de_alimentacion_agrupada <- factor(df$tipo_de_alimentacion_agrupada,
levels = c("Lactancia Materna Exclusiva", "Mixta", "Formula Lactea Exclusiva"))
```
```{r}
table(df$tipo_de_alimentacion_agrupada) |> knitr::kable()
```
Deal with the ppm issue
```{r}
# Crear la nueva variable 'fluor_pasta_concentracion' usando mutate y case_when
df <- df %>%
mutate(fluor_pasta_concentracion = case_when(
cepillado_nocturno %in% c("Siempre") ~ "OPTIMAL",
TRUE ~ "BAJO 1450"
))
# reordenar
df <- df %>%
mutate(fluor_pasta_concentracion = factor(fluor_pasta_concentracion, levels = c("OPTIMAL", "BAJO 1450")))
# Verificar los resultados
df %>% count(fluor_pasta_concentracion) |> knitr::kable()
```
## Generalized Estimating Equations (GEE)
```{r}
# Ensure all relevant columns are treated as factors
df <- df %>%
mutate(across(c(tipo_de_alimentacion_infantil_final,
tipo_de_alimentacion_agrupada,
cantidad_de_momentos_de_azucar_al_dia,
oportunidad_de_momentos_de_azucar,
cantidad_de_veces_que_se_duerme_con_mamadera,
frecuencia_de_cepillado,
# cepillado_nocturno,
fluor_pasta_concentracion,
# uso_de_pasta_con_1450_ppm,
motivacion_padres), as.factor))
# Remove rows with any missing values in the columns used in the model
df_complete <- df %>%
drop_na(ceo, tipo_de_alimentacion_infantil_final,
tipo_de_alimentacion_agrupada,
cantidad_de_momentos_de_azucar_al_dia,
oportunidad_de_momentos_de_azucar,
cantidad_de_veces_que_se_duerme_con_mamadera,
frecuencia_de_cepillado,
# cepillado_nocturno,
fluor_pasta_concentracion,
# uso_de_pasta_con_1450_ppm,
motivacion_padres)
# Fit the GEE model
gee_model <- geeglm(ceo ~
# tipo_de_alimentacion_infantil_final +
tipo_de_alimentacion_agrupada +
cantidad_de_momentos_de_azucar_al_dia +
oportunidad_de_momentos_de_azucar +
# cantidad_de_veces_que_se_duerme_con_mamadera + # locos results
frecuencia_de_cepillado +
# cepillado_nocturno +
fluor_pasta_concentracion +
# uso_de_pasta_con_1450_ppm,
motivacion_padres,
data = df_complete,
id = id,
family = poisson,
corstr = "unstructured") # unstructured or exchangeable
```
```{r}
gee_model |>
gtsummary::tbl_regression(exponentiate = TRUE) |> bold_labels()
```
```{r}
sjPlot::plot_model(gee_model,
show.values = TRUE,
show.p = FALSE,
title = "Modelo GEE de Riesgo",
value.offset = 0.3, # Ajustar posición de los valores
vline.color = "black",
transform = "exp",
colors = c("blue", "red")) + # Colores personalizados
ggplot2::theme_minimal()
```
Observaciones totales
```{r}
# Total de observaciones utilizadas
nrow(gee_model$data)
```
## ceo and caries prevalence
extract the year
```{r}
df <- df |>
mutate(year = year(fecha_cuestionario_ceo))
```
```{r}
# Group by ID and year, then filter to keep the last or highest ceo for each child
df_ceo <- df |>
group_by(id, year) |>
filter(ceo == max(ceo) & fecha_cuestionario_ceo == max(fecha_cuestionario_ceo)) |>
ungroup()
```
```{r}
# Calculate the average ceo for each year
df_ceo |>
group_by(year) |>
summarise(average_ceo = mean(ceo, na.rm = TRUE),
sd_ceo = sd(ceo, na.rm = TRUE),
unique_id = n()) |>
mutate(across(where(is.numeric), round, 2)) |>
knitr::kable()
```
## ceo por año
```{r}
df |>
mutate(ceo_category = case_when(
ceo == 0 ~ "0",
ceo > 0 ~ ">0"
)) |>
group_by(year, ceo_category) |>
summarise(unique_id_count = n_distinct(id)) |>
ungroup() |>
pivot_wider(
names_from = year,
values_from = unique_id_count,
values_fill = list(unique_id_count = 0)
) |>
knitr::kable()
```
```{r}
# Original processing and pivot
df_summary <- df |>
mutate(ceo_category = case_when(
ceo == 0 ~ "0",
ceo > 0 ~ ">0"
)) |>
group_by(year, ceo_category) |>
summarise(unique_id_count = n_distinct(id), .groups = "drop") |>
ungroup() |>
pivot_wider(
names_from = year,
values_from = unique_id_count,
values_fill = list(unique_id_count = 0)
)
# Adding the "Total" row
df_with_total <- df_summary |>
bind_rows(
df_summary |>
summarise(
ceo_category = "Total",
across(-ceo_category, sum)
)
)
# Display the result
df_with_total |>
knitr::kable()
```
```{r}
rm(df_with_total, df_summary)
```
# Lactancia materna prolongada
```{r}
df_complete <- df_complete |>
mutate(lactancia = case_when(
tipo_de_alimentacion_infantil_final == "Lactancia Materna Exclusiva" ~ "Lactancia materna",
tipo_de_alimentacion_infantil_final == "Lactancia Materna Predominante" ~ "Lactancia materna",
TRUE ~ "Formula") )
```
```{r}
df_complete <- df_complete |>
mutate(lactancia_2 = case_when(
tipo_de_alimentacion_infantil_final == "Lactancia Materna Exclusiva" ~ "Lactancia materna",
tipo_de_alimentacion_infantil_final == "Lactancia Materna Predominante" ~ "Mixta",
tipo_de_alimentacion_infantil_final == "Formula Lactea Predominante" ~ "Mixta",
TRUE ~ "Formula") )
```
```{r}
# reorder levels for regression
df_complete <- df_complete |>
mutate(lactancia = fct_relevel(lactancia, "Lactancia materna", "Formula"))
df_complete <- df_complete |>
mutate(lactancia_2 = fct_relevel(lactancia_2, "Lactancia materna", "Mixta", "Formula"))
```
# GEE con lactancia materna con dos niveles solamente
prospective nature of the data, you can account for repeated measures and within-subject correlation using GEE.
```{r}
#
# # Remove rows with any missing values in the columns used in the model
# df_complete <- df_complete %>%
# drop_na(ceo,
# lactancia,
# lactancia_2,
# lactancia,
# cantidad_de_momentos_de_azucar_al_dia,
# oportunidad_de_momentos_de_azucar,
# cantidad_de_veces_que_se_duerme_con_mamadera,
# frecuencia_de_cepillado,
# cepillado_nocturno,
# uso_de_pasta_con_1450_ppm,
# motivacion_padres)
#
# # Fit the GEE model
# gee_model_two <- geeglm(ceo ~
# # tipo_de_alimentacion_infantil_final +
# lactancia +
# # lactancia_2 +
# cantidad_de_momentos_de_azucar_al_dia +
# oportunidad_de_momentos_de_azucar +
# # cantidad_de_veces_que_se_duerme_con_mamadera + # locos results
# frecuencia_de_cepillado +
# cepillado_nocturno +
# uso_de_pasta_con_1450_ppm +
# motivacion_padres,
# data = df_complete,
# id = id,
# family = poisson,
# corstr = "exchangeable")
```
```{r}
# gee_model_two |>
# gtsummary::tbl_regression(exponentiate = TRUE) |> bold_labels()
```
```{r}
# sjPlot::plot_model(gee_model_two,
# show.values = TRUE,
# show.p = TRUE,
# title = "gee_model",
# vline.color = "black",
# transform = "exp")
```
#
```{r}
# check_model(gee_model_two)
```
# Lactancia prolongada?
```{r}
df_complete |>
group_by(lactancia_2) |>
summarise(mean_ceo = mean(ceo, na.rm = TRUE),
sd_ceo = sd(ceo, na.rm = TRUE),
n = n()) |>
mutate(across(where(is.numeric), round, 2)) |>
knitr::kable()
```
```{r}
# glm_model <- df_complete |>
# with(glm(ceo ~
# # tipo_de_alimentacion_infantil_final +
# lactancia +
# # lactancia_2 +
# cantidad_de_momentos_de_azucar_al_dia +
# oportunidad_de_momentos_de_azucar +
# # cantidad_de_veces_que_se_duerme_con_mamadera + # locos results
# frecuencia_de_cepillado +
# cepillado_nocturno +
# uso_de_pasta_con_1450_ppm +
# motivacion_padres,
# family = poisson))
```
```{r}
# glm_model |>
# gtsummary::tbl_regression(exponentiate = TRUE) |>
# bold_labels()
```
```{r}
# sjPlot::plot_model(glm_model,
# show.values = TRUE,
# show.p = TRUE,
# title = "glm_model",
# vline.color = "black",
# transform = "exp")
```
```{r}
# compare_performance(glm_model, gee_model_two, zip_model)
```
```{r}
df_complete |>
ggplot(aes(x = lactancia,
y = ceo,
color = lactancia)) +
geom_violin() +
labs(title = "Caries (ceo) by Lactancia Type",
x = "Lactancia Type",
y = "Caries (ceo)") +
theme_minimal()
```
```{r}
df_complete_2 <- df_complete |>
group_by(id) |>
filter(all(c("Lactancia materna", "Formula") %in% lactancia)) |>
ungroup()
```
```{r}
df_complete_2 |>
ggplot( aes(x = lactancia,
y = ceo,
group = id,
color = as.factor(id))) +
geom_line() +
geom_point() +
scale_x_discrete(limits = c("Lactancia materna", "Formula")) +
labs(title = "Change in CEO Score When Switching from Lactancia Materna to Formula",
x = "Lactancia Type",
y = "Caries (ceo)") +
theme_minimal() +
theme(legend.position = "none")
```
# Infer package
Check https://infer.tidymodels.org/articles/infer.html
```{r}
# pacman::p_load(infer)
```
```{r}
# df_complete_2 |>
# specify(response = ceo,
# explanatory = lactancia) |>
#
# hypothesize(null = "independence") |>
# generate(reps = 1000, type = "permute") |>
# calculate(stat = "diff in means", order = c("Lactancia materna", "Formula"))
```
```{r}
# df_complete_2 |>
# group_by(lactancia) |>
# summarise(mean_ceo = mean(ceo, na.rm = TRUE),
# sd_ceo = sd(ceo, na.rm = TRUE),
# n = n())
```
```{r}
# df_complete_2 |>
# ggplot(aes(x = lactancia,
# y = ceo,
# color = lactancia)) +
# geom_violin() +
# geom_jitter(width = 0.1, alpha = 0.5)
```
```{r}
# observed_diff_in_means <- df_complete_2 |>
# group_by(lactancia) |>
# summarize(mean_ceo = mean(ceo)) |>
# summarize(observed_diff_in_means = diff(mean_ceo)) |>
# pull(observed_diff_in_means)
```
```{r}
# df_complete_2 |>
# specify(ceo ~ lactancia) |> # Specify the response and explanatory variable
# hypothesize(null = "independence") |> # Set up the null hypothesis (no difference)
# generate(reps = 1000, type = "permute") |> # Generate permutations under the null hypothesis
# calculate(stat = "diff in means", order = c("Lactancia materna", "Formula")) |> # Calculate the difference in means
# visualize() + # Visualize the distribution of the test statistic
# shade_p_value(obs_stat = observed_diff_in_means, direction = "two-sided") # Shade the p-value
```
# Sample size
```{r}
pacman::p_load(spass)
# Supuestos iniciales
alpha <- 0.05 # Nivel de significancia
power <- 0.8 # Potencia estadística
m <- 3 # Número promedio de observaciones por niño
rho <- 0.01 # Correlación intra-grupo esperada
irr_expected <- 1.2 # IRR mínimo esperado (incremento del 50% en el riesgo)
# Cálculo del tamaño de muestra
n <- ((qnorm(1 - alpha / 2) + qnorm(power))^2 * (1 + (m - 1) * rho)) /
(m * (log(irr_expected))^2)
ceiling(n)
# Parámetros para el cálculo del tamaño de muestra
#Calculate required sample size to correctly reject Null with
#80% probability when testing global Nullhypothesis H_0: Delta_F=Delta_S = 0, while
#assuming the coefficient in and outside of the subgroup is Delta=c(0.1,0,1) with a
#subgroup-prevalence of tau=0.4.
#The variances of regressors in delta when variances are unequal sigma=c(0.8,0.4).
estimate <- n.gee.1subgroup(
alpha = 0.05,
beta = 0.2,
delta = c(0.1, 0.1),
sigma = c(0.8, 0.4),
tau = 0.4,
k = 1
)
summary(estimate)
```
```{r}
#Alternatively we can estimate the power our study would have
#if we know the effect in and outside our subgroup as
#well as the variance of the regressors. Here we
#estimate that only 300 Patiens total can be recruited and we are interested
#in the power that would give us.
n.gee.1subgroup(
alpha = 0.05,
delta = c(0.1, 0.1),
sigma = c(0.8, 0.4),
tau = 0.4,
k = 1,
npow = 3000
)
```