# se cargará el parquete epirhandbook
# -> paquete que proporciona datos y ejemplos del Epidemiologist R Handbook
if (requireNamespace("epirhandbook", quietly = TRUE)) {
library(epirhandbook)
} else {
message(
"El paquete 'epirhandbook' no está instalado.\n",
"Si desea usar los datos del Epidemiologist R Handbook, ejecute en la consola:\n",
" install.packages('remotes')\n",
" remotes::install_github('appliedepi/epirhandbook')\n"
)
}
# Paquetes de trabajo general
paquetes_basicos <- c("tidyverse", "ggplot2", "readxl", "dplyr")
instalar_si_falta <- function(pk) {
if (!requireNamespace(pk, quietly = TRUE)) {
install.packages(pk)
}
library(pk, character.only = TRUE)
}
invisible(lapply(paquetes_basicos, instalar_si_falta))
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Se dejará como ejemplo y estará desactivado
(eval=FALSE) para que el documento HTML se pueda compilar
sin depender de interacción con RStudio ni de archivos locales. Úselo
manualmente en su sesión si desea descargar los datos.
# Ejecutar manualmente en una sesión interactiva de RStudio para descargar los datos:
# (creará una carpeta y descargará los archivos de ejemplo)
library(epirhandbook)
epirhandbook::get_data("all")
## ✔ File(s) successfully saved here: /Users/jesushernandez/Documents/BASE DE TRABAJO R
# Ejemplo: lectura desde Excel (ajustar ruta según la carpeta donde descargó los datos)
#en una carpeta "EPIDATA/linelist_cleaned.xlsx"
En la base de datos linelist_cleaned cada fila representa un caso y cada columna corresponde a una variable epidemiológica o clínica (edad, sexo, fecha de inicio de síntomas, desenlace, etc.).
Se descargó el archivo xlsx y se almacenó en un drive externo.
'/Volumes/JESUS SSD/EPIDATA/linelist_cleaned.xlsx'
## [1] "/Volumes/JESUS SSD/EPIDATA/linelist_cleaned.xlsx"
library(readxl)
ruta_linelist <- "/Volumes/JESUS SSD/EPIDATA/linelist_cleaned.xlsx"
if (!file.exists(ruta_linelist)) {
stop(
paste0(
"No se encontró el archivo", ruta_linelist, ".\n",
"Verifique la ruta o descargue la base con epirhandbook::get_data('all')."
)
)
}
linelist <- readxl::read_excel(ruta_linelist)
# vistazo general de las primeras filas
head(linelist)
# Dimensiones de la base (número de filas y columnas)
dim(linelist)
## [1] 5888 30
# nombres de las variables
names(linelist)
## [1] "case_id" "generation" "date_infection"
## [4] "date_onset" "date_hospitalisation" "date_outcome"
## [7] "outcome" "gender" "age"
## [10] "age_unit" "age_years" "age_cat"
## [13] "age_cat5" "hospital" "lon"
## [16] "lat" "infector" "source"
## [19] "wt_kg" "ht_cm" "ct_blood"
## [22] "fever" "chills" "cough"
## [25] "aches" "vomit" "temp"
## [28] "time_admission" "bmi" "days_onset_hosp"
# resumen general de las variables
summary(linelist)
## case_id generation date_infection
## Length:5888 Min. : 0.00 Min. :2014-03-19 00:00:00.0
## Class :character 1st Qu.:13.00 1st Qu.:2014-09-06 00:00:00.0
## Mode :character Median :16.00 Median :2014-10-11 00:00:00.0
## Mean :16.56 Mean :2014-10-22 21:47:24.2
## 3rd Qu.:20.00 3rd Qu.:2014-12-05 00:00:00.0
## Max. :37.00 Max. :2015-04-27 00:00:00.0
## NA's :2087
## date_onset date_hospitalisation
## Min. :2014-04-07 00:00:00.00 Min. :2014-04-17 00:00:00.00
## 1st Qu.:2014-09-16 00:00:00.00 1st Qu.:2014-09-19 00:00:00.00
## Median :2014-10-23 00:00:00.00 Median :2014-10-23 00:00:00.00
## Mean :2014-11-03 01:16:26.93 Mean :2014-11-03 14:33:49.89
## 3rd Qu.:2014-12-19 00:00:00.00 3rd Qu.:2014-12-17 00:00:00.00
## Max. :2015-04-30 00:00:00.00 Max. :2015-04-30 00:00:00.00
## NA's :256
## date_outcome outcome gender
## Min. :2014-04-19 00:00:00.00 Length:5888 Length:5888
## 1st Qu.:2014-09-26 00:00:00.00 Class :character Class :character
## Median :2014-11-01 00:00:00.00 Mode :character Mode :character
## Mean :2014-11-12 21:21:48.55
## 3rd Qu.:2014-12-28 00:00:00.00
## Max. :2015-06-04 00:00:00.00
## NA's :936
## age age_unit age_years age_cat
## Min. : 0.00 Length:5888 Min. : 0.00 Length:5888
## 1st Qu.: 6.00 Class :character 1st Qu.: 6.00 Class :character
## Median :13.00 Mode :character Median :13.00 Mode :character
## Mean :16.07 Mean :16.02
## 3rd Qu.:23.00 3rd Qu.:23.00
## Max. :84.00 Max. :84.00
## NA's :86 NA's :86
## age_cat5 hospital lon lat
## Length:5888 Length:5888 Min. :-13.27 Min. :8.446
## Class :character Class :character 1st Qu.:-13.25 1st Qu.:8.461
## Mode :character Mode :character Median :-13.23 Median :8.469
## Mean :-13.23 Mean :8.470
## 3rd Qu.:-13.22 3rd Qu.:8.480
## Max. :-13.21 Max. :8.492
##
## infector source wt_kg ht_cm
## Length:5888 Length:5888 Min. :-11.00 Min. : 4
## Class :character Class :character 1st Qu.: 41.00 1st Qu.: 91
## Mode :character Mode :character Median : 54.00 Median :129
## Mean : 52.64 Mean :125
## 3rd Qu.: 66.00 3rd Qu.:159
## Max. :111.00 Max. :295
##
## ct_blood fever chills cough
## Min. :16.00 Length:5888 Length:5888 Length:5888
## 1st Qu.:20.00 Class :character Class :character Class :character
## Median :22.00 Mode :character Mode :character Mode :character
## Mean :21.21
## 3rd Qu.:22.00
## Max. :26.00
##
## aches vomit temp time_admission
## Length:5888 Length:5888 Min. :35.20 Length:5888
## Class :character Class :character 1st Qu.:38.20 Class :character
## Mode :character Mode :character Median :38.80 Mode :character
## Mean :38.56
## 3rd Qu.:39.20
## Max. :40.80
## NA's :149
## bmi days_onset_hosp
## Min. :-1200.00 Min. : 0.000
## 1st Qu.: 24.56 1st Qu.: 1.000
## Median : 32.12 Median : 1.000
## Mean : 46.89 Mean : 2.059
## 3rd Qu.: 50.01 3rd Qu.: 3.000
## Max. : 1250.00 Max. :22.000
## NA's :256
library(tidyverse) # se debe llamar la libreria tidyverse para poder utilizar tibble
variables_resumen <- tibble(
variable = names(linelist),
tipo = sapply(linelist, class)
)
knitr::kable(
variables_resumen,
caption = "Resumen de variables en la base linelist_cleaned: nombre y tipo de dato.",
longtable = TRUE
)
| variable | tipo |
|---|---|
| case_id | character |
| generation | numeric |
| date_infection | POSIXct, POSIXt |
| date_onset | POSIXct, POSIXt |
| date_hospitalisation | POSIXct, POSIXt |
| date_outcome | POSIXct, POSIXt |
| outcome | character |
| gender | character |
| age | numeric |
| age_unit | character |
| age_years | numeric |
| age_cat | character |
| age_cat5 | character |
| hospital | character |
| lon | numeric |
| lat | numeric |
| infector | character |
| source | character |
| wt_kg | numeric |
| ht_cm | numeric |
| ct_blood | numeric |
| fever | character |
| chills | character |
| cough | character |
| aches | character |
| vomit | character |
| temp | numeric |
| time_admission | character |
| bmi | numeric |
| days_onset_hosp | numeric |
Selección de variables clave para la descripción inicial
Es útil identificar variables clave como:
# ejemplo de selección de variables (se deberá ajustar a la base real)
vars_clave <- c("id", "age", "gender", "date_onset", "date_report", "outcome")
vars_clave[!vars_clave %in% names(linelist)]
## [1] "id" "date_report"
# crear una versión reducida con las variables que sí existen
linelist_reducida <- linelist |>
dplyr::select(any_of(vars_clave))
head(linelist_reducida)
names(linelist_reducida)
## [1] "age" "gender" "date_onset" "outcome"
linelist_reducida |>
summarise(
n_casos = sum(!is.na(age)),
edad_min = min(age, na.rm = TRUE),
edad_max = max(age, na.rm = TRUE),
edad_media = mean(age, na.rm = TRUE),
edad_mediana = median(age, na.rm = TRUE)
) |>
knitr::kable(
digits = 1,
caption = "Resumen descriptivo de la edad en la base linelist."
)
| n_casos | edad_min | edad_max | edad_media | edad_mediana |
|---|---|---|---|---|
| 5802 | 0 | 84 | 16.1 | 13 |
linelist_reducida |>
ggplot(aes(x = age)) +
geom_histogram(binwidth = 5, boundary = 0, closed = "left") +
labs(title = "Distribución de la edad en casos registrados (linelist)", x = "Edad (años)", y = "Frecuencia"
)
## Warning: Removed 86 rows containing non-finite outside the scale range
## (`stat_bin()`).
linelist_reducida |>
count(gender) |>
mutate(prop = n / sum(n)) |>
knitr::kable(
digits = 3,
caption = "Distribución de casos por sexo (frecuencia y propagación)."
)
| gender | n | prop |
|---|---|---|
| f | 2807 | 0.477 |
| m | 2803 | 0.476 |
| NA | 278 | 0.047 |
linelist_reducida |>
count(gender) |>
mutate(prop = n / sum(n)) |>
ggplot(aes(x = gender, y = prop)) +
geom_col() +
scale_y_continuous(labels = scales::percent) +
labs(
title = "Proporción de casos por sexo (linelist)",
x = "Sexo",
y = "Proporción de casos"
)
linelist_reducida |>
ggplot(aes(x = gender, y = age)) +
geom_boxplot() +
labs(
title = "Distribución de la edad por sexo",
x = "Sexo",
y = "Edad (años)"
)
## Warning: Removed 86 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
Para la caracterización temporal del brote o evento, se construye una curva epidémica, agrupando los casos por fecha (o semana) de inicio de síntomas.
library(lubridate)
linelist_fechas <- linelist_reducida |>
mutate(
date_onset = as.Date(date_onset),
semana_onset = floor_date(date_onset, unit = "week", week_start = 1)
)
# Visita rápida para comprobar la transformación
linelist_fechas |>
dplyr::select(date_onset, semana_onset) |>
head()
linelist_fechas |>
count(semana_onset) |>
ggplot(aes(x = semana_onset, y = n)) +
geom_col() +
labs(
title = "Curva epidémica semanal según fecha de inicio de síntomas", x = "Semana de inicio de síntomas", y = "Número de casos"
)
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_col()`).
linelist_fechas |>
filter(!is.na(gender)) |>
count(semana_onset, gender) |>
ggplot(aes(x = semana_onset, y = n, fill = gender)) +
geom_col(position = "stack") +
labs(
title = "Curva epidémica semanal estratificada por sexo", x = "Semana de inicio de síntomas", y = "Número de casos", fill = "Sexo"
)
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_col()`).
Si la base incluye una variable de desenlace (por ejemplo, outcome con categorías como “Died” / “Recovered”), es posible estimar proporciones de letalidad y explorar diferencias por edad o sexo.
library("dplyr")
if ("outcome" %in% names(linelist_reducida)) {
linelist_reducida |>
count(outcome) |>
mutate(prop = n / sum(n)) |>
knitr::kable(
digits = 3,
caption = "Distribución de desenlaces clínicos (outcome) en la base linelist"
)
}
| outcome | n | prop |
|---|---|---|
| Death | 2582 | 0.439 |
| Recover | 1983 | 0.337 |
| NA | 1323 | 0.225 |
Si la base incluye una variable de desenlace (por ejemplo, outcome con categorías como “Death” / “Recover”), es posible estimar proporciones de letalidad y explorar diferencias por edad o sexo/género.
if (all(c("outcome", "sex") %in% names(linelist_reducida))) {
linelist_reducida |>
mutate(muerto = ifelse(outcome %in% c("Died", "Dead", "Fallecido"), 1, 0)) |>
group_by(sex) |>
summarise(
n = n(),
muertes = sum(muerto, na.rm = TRUE),
letalidad = muertes / n
) |>
knitr::kable(
digits = 3,
caption = "Letalidad (proporción de fallecidos) por sexo, según variable outcome."
)
}
Si la base incluye una variable de desenlace (por ejemplo, outcome
con categorías como "Death" / "Recover"), es posible
estimar proporciones de letalidad y explorar diferencias por edad o
sexo/género.
if("outcome" %in% names(linelist_reducida)) {
linelist_reducida |>
count(outcome) |>
mutate(prop = n / sum(n)) |>
knitr::kable(
digits = 3,
caption = "Distribución de desenlaces clínicos (outcome) en la base linelist."
)
}
| outcome | n | prop |
|---|---|---|
| Death | 2582 | 0.439 |
| Recover | 1983 | 0.337 |
| NA | 1323 | 0.225 |
En la base linelist_cleaned, la variable disponible es
gender (y no sex), por lo que conviene usarla como estrato.
if(all(c("outcome", "gender") %in% names(linelist_reducida))){
linelist_reducida |>
mutate(muerto = ifelse(outcome %in% c("Death", "Died", "Dead", "Fallecido"), 1, 0)) |>
group_by(gender) |>
summarise(
n = n(),
muertes = sum(muerto, na.rm = TRUE),
letalidad = muertes / n
) |>
knitr::kable(
digits = 3,
caption = "Letalidad (proporción de fallecidos) por género, según la variable outcome."
)
}
| gender | n | muertes | letalidad |
|---|---|---|---|
| f | 2807 | 1227 | 0.437 |
| m | 2803 | 1228 | 0.438 |
| NA | 278 | 127 | 0.457 |
gtsummary para tablas clínicasEl paquete gtsummary facilita la construcción de tablas descriptivas y de resultados de modelos, con formato adecuado para informes y manuscritos.
if (!requireNamespace("gtsummary", quietly = TRUE)) {
install.packages("gtsummary")
}
library(gtsummary)
gtsummary (edad y
género)if (all(c("age", "gender") %in% names(linelist_reducida))) {
linelist_reducida |>
select(age, gender) |>
tbl_summary(
by = gender,
statistic = list(
all_continuous() ~ "{mean} ({sd})",
all_categorical() ~ "{n} ({p}%)"
),
missing = "no"
) |>
add_n() |>
modify_header(label ~ "Variable") |>
modify_caption("**Resumen descriptivo de edad por género (gtsummary)**")
}
## 278 missing rows in the "gender" column have been removed.
| Variable | N | f N = 2,8071 |
m N = 2,8031 |
|---|---|---|---|
| age | 5,610 | 13 (10) | 20 (14) |
| 1 Mean (SD) | |||
Primero se crea una variable binaria de fallecimiento (muerto), luego se resume su distribución por género:
if (all(c("outcome", "gender") %in% names(linelist_reducida))) {
linelist_tab_out <- linelist_reducida |>
mutate(
genero = gender,
muerto = dplyr::if_else(
outcome %in% c("Death", "Died", "Dead", "Fallecido"),
"Sí",
"No",
missing = "No"
)
)
linelist_tab_out |>
select(genero, muerto) |>
tbl_summary(
by = genero,
statistic = all_categorical() ~"{n} ({p}%)",
missing = "no"
) |>
modify_header(label ~ "Variable") |>
modify_caption("**Distribución de fallecimiento (muerto = Sí) por género (gtsummary)**")
}
## 278 missing rows in the "genero" column have been removed.
| Variable | f N = 2,8071 |
m N = 2,8031 |
|---|---|---|
| muerto | ||
| No | 1,580 (56%) | 1,575 (56%) |
| Sí | 1,227 (44%) | 1,228 (44%) |
| 1 n (%) | ||
En muchos análisis epidemiológicos resulta útil estimar la letalidad por grupos de edad, para identificar un gradiente de riesgo.
library(dplyr)
if (all(c("outcome", "age") %in% names(linelist_reducida))) {
linelist_grupos <- linelist_reducida |>
mutate(
muerto = ifelse(outcome %in% c("Death", "Dead", "Fallecido"), 1, 0),
grupo_edad = cut(
age,
breaks = c(-Inf, 4, 14, 24, 44, 64, Inf),
labels = c("0-4", "5-14", "15-24", "25-44", "45-64", "65+")
)
)
tabla_letalidad_edad <- linelist_grupos |>
group_by(grupo_edad) |>
summarise(
n = n(),
muertes = sum(muerto, na.rm = TRUE),
letalidad = muertes / n
)
knitr::kable(
tabla_letalidad_edad,
digits = 3,
caption = "Letalidad por grupos de edad en la base linelist"
)
}
| grupo_edad | n | muertes | letalidad |
|---|---|---|---|
| 0-4 | 1072 | 464 | 0.433 |
| 5-14 | 2051 | 920 | 0.449 |
| 15-24 | 1387 | 611 | 0.441 |
| 25-44 | 1098 | 477 | 0.434 |
| 45-64 | 175 | 70 | 0.400 |
| 65+ | 19 | 8 | 0.421 |
| NA | 86 | 32 | 0.372 |
library(ggplot2)
if (exists("tabla_letalidad_edad")) {
tabla_letalidad_edad |>
ggplot(aes(x = grupo_edad, y = letalidad)) +
geom_col() +
scale_y_continuous(labels = scales::percent) +
labs(
title = "Letalidad por grupo de edad",
x = "Grupo de edad (años)",
y = "Letalidad (%)"
)
}
library(dplyr)
library(gtsummary)
if (all(c("outcome", "age", "gender") %in% names(linelist_reducida))) {
linelist_grupos_gt <- linelist_reducida |>
mutate(
grupo_edad = cut(
age,
breaks = c(-Inf, 4, 14, 24, 44, 64, Inf),
labels = c("0-4", "5-14", "15-24", "25-44", "45-64", "65+")
),
muerto = dplyr::if_else(
outcome %in% c("Death", "Died", "Dead", "Fallecido"),
"Sí",
"No",
missing = "No"
)
)
linelist_grupos_gt |>
select(grupo_edad, gender, muerto) |>
tbl_summary(
by = grupo_edad,
statistic = all_categorical() ~"{n} ({p}%)",
missing = "no"
) |>
modify_caption("**Distribución de fallecidos por grupo de edad y género (gtsummary)**")
}
## 86 missing rows in the "grupo_edad" column have been removed.
| Characteristic | 0-4 N = 1,0721 |
5-14 N = 2,0511 |
15-24 N = 1,3871 |
25-44 N = 1,0981 |
45-64 N = 1751 |
65+ N = 191 |
|---|---|---|---|---|---|---|
| gender | ||||||
| f | 626 (60%) | 1,169 (59%) | 667 (49%) | 335 (31%) | 10 (5.9%) | 0 (0%) |
| m | 409 (40%) | 800 (41%) | 682 (51%) | 736 (69%) | 159 (94%) | 17 (100%) |
| muerto | ||||||
| No | 608 (57%) | 1,131 (55%) | 776 (56%) | 621 (57%) | 105 (60%) | 11 (58%) |
| Sí | 464 (43%) | 920 (45%) | 611 (44%) | 477 (43%) | 70 (40%) | 8 (42%) |
| 1 n (%) | ||||||
library(ggplot2)
if (all(c("date_onset", "outcome") %in% names(linelist_reducida))) {
linelist_fechas_out <- linelist_reducida |>
mutate(
date_onset = as.Date(date_onset),
semana_onset = lubridate::floor_date(date_onset, unit = "week", week_start = 1)
)
linelist_fechas_out |>
filter(!is.na(outcome)) |>
count(semana_onset, outcome) |>
ggplot(aes(x = semana_onset, y = n, fill = outcome)) +
geom_col(position = "stack") +
labs(
title = "Curva epidémica semanal estratificada por desenlace (outcome)",
x = "Semana de inicio de síntomas",
y = "Número de casos",
fill = "Desenlace"
)
}
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_col()`).
if (all(c("date_onset", "outcome", "gender") %in% names(linelist_reducida))) {
linelist_fechas_go <- linelist_reducida |>
mutate(
date_onset = as.Date(date_onset),
semana_onset = lubridate::floor_date(date_onset, unit = "week", week_start = 1)
)
linelist_fechas_go |>
filter(!is.na(outcome), !is.na(gender)) |>
count(semana_onset, gender, outcome) |>
ggplot(aes(x = semana_onset, y = n, fill = outcome)) +
geom_col(position = "stack") +
facet_wrap(~ gender) +
labs(
title = "Curva epidémica semanal por género y desenlace",
x = "Semana de inicio de síntomas",
y = "Número de casos",
fill = "Desenlace"
)
}
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_col()`).
Como ejemplo introductorio de análisis multivariable, se puede ajustar un modelo de regresión logística para explorar la asociación entre edad, género y probabilidad de fallecer. Aquí se muestra tanto el resumen clásico como su presentación con gtsummary.
if (all(c("outcome", "age", "gender") %in% names(linelist_reducida))) {
linelist_modelo <- linelist_reducida |>
mutate(
muerto = ifelse(outcome %in% c("Death", "Died", "Dead", "Fallecido"), 1, 0)
) |>
filter(!is.na(muerto), !is.na(age), !is.na(gender))
if (!requireNamespace("broom", quietly = TRUE)){
install.packages("broom")
}
library(broom)
modelo_logit <- glm(
muerto ~ age + gender,
data = linelist_modelo,
family = binomial(link = "logit")
)
summary(modelo_logit)
}
##
## Call:
## glm(formula = muerto ~ age + gender, family = binomial(link = "logit"),
## data = linelist_modelo)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.228825 0.047277 -4.840 1.3e-06 ***
## age -0.001899 0.002220 -0.856 0.392
## genderm 0.017081 0.055954 0.305 0.760
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7689.5 on 5609 degrees of freedom
## Residual deviance: 7688.8 on 5607 degrees of freedom
## AIC: 7694.8
##
## Number of Fisher Scoring iterations: 3
library(gtsummary)
if (exists("modelo_logit")) {
tbl_regression(
modelo_logit,
exponentiate = TRUE
) |>
modify_header(label ~ "Variable") |>
modify_caption("**Modelo de regresión logística para mortalidad, ajustado por edad y género (OR e IC 95%)")
}
| Variable | OR | 95% CI | p-value |
|---|---|---|---|
| age | 1.00 | 0.99, 1.00 | 0.4 |
| gender | |||
| f | — | — | |
| m | 1.02 | 0.91, 1.14 | 0.8 |
| Abbreviations: CI = Confidence Interval, OR = Odds Ratio | |||
if (all(c("age", "outcome") %in% names(linelist_reducida))) {
linelist_reducida |>
# Se crearán grupos de edad por decenios (0–9, 10–19, 20–29)
mutate(grupo_decenio = cut(
age,
breaks = seq(0, max(age, na.rm = TRUE) + 10, by = 10),
right = FALSE,
include.lowest = TRUE
)) |>
# Calcular casos, muertes y letalidad
group_by(grupo_decenio) |>
summarise(
casos = n(),
muertes = sum(outcome %in% c("Death", "Died", "Dead", "Fallecido"), na.rm = TRUE),
letalidad = muertes / casos
) |>
# Mostrar tabla
knitr::kable(
digits = 3,
caption = "Letalidad por grupos de edad (decenios)"
)
}
| grupo_decenio | casos | muertes | letalidad |
|---|---|---|---|
| [0,10) | 2180 | 946 | 0.434 |
| [10,20) | 1687 | 761 | 0.451 |
| [20,30) | 1078 | 478 | 0.443 |
| [30,40) | 530 | 233 | 0.440 |
| [40,50) | 226 | 96 | 0.425 |
| [50,60) | 70 | 26 | 0.371 |
| [60,70) | 25 | 7 | 0.280 |
| [70,80) | 5 | 3 | 0.600 |
| [80,90] | 1 | 0 | 0.000 |
| NA | 86 | 32 | 0.372 |
Se presenta en forma de gráfica utilizando ggplot2
if (all(c("age", "outcome") %in% names(linelist_reducida))) {
linelist_reducida |>
# Se crearán grupos de edad por decenios (0–9, 10–19, 20–29)
mutate(grupo_decenio = cut(
age,
breaks = seq(0, max(age, na.rm = TRUE) + 10, by = 10),
right = FALSE,
include.lowest = TRUE
)) |>
# Calcular casos, muertes y letalidad
group_by(grupo_decenio) |>
summarise(
casos = n(),
muertes = sum(outcome %in% c("Death", "Died", "Dead", "Fallecido"), na.rm = TRUE),
letalidad = muertes / casos
) |>
# SE INTENTARÁ HACER UN GRÁFICO CON GGPLOT PARA VER EN QUE RANGO DE EDAD SE PRESENTA MAYOR LETALIDAD
ggplot(aes(x = grupo_decenio, y = letalidad)) +
geom_col() +
labs(
title = "Letalidad por grupos de edad (decenios)",
x = "Rango de edad (decenios)",
y = "Letalidad"
) +
theme_classic()
}
fever, o estado nutricional aproximado
con bmi) dentro del modelo logístico.# se corroborarán los nombres de las columnas en la base de datos del modelo logistico.
linelist
# se hará la inclusión de las variables
# temp, fever, bmi
if (all(c("outcome", "age", "gender", "temp", "fever", "bmi") %in% names(linelist))) {
linelist_modelo <- linelist |>
mutate(
muerto = ifelse(outcome %in% c("Death", "Died", "Dead", "Fallecido"), 1, 0),
fever_bin = ifelse(fever == "yes", 1, 0)
) |>
filter(
!is.na(muerto),
!is.na(age),
!is.na(gender),
!is.na(temp),
!is.na(fever_bin),
!is.na(bmi)
)
if (!requireNamespace("broom", quietly = TRUE)) {
install.packages("broom")
}
library(broom)
modelo_logit <- glm(
muerto ~ age + gender + temp + fever + bmi,
data = linelist_modelo,
family = binomial(link = "logit")
)
summary(modelo_logit)
}
##
## Call:
## glm(formula = muerto ~ age + gender + temp + fever + bmi, family = binomial(link = "logit"),
## data = linelist_modelo)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.806681 2.202191 0.820 0.412
## age -0.003417 0.002489 -1.372 0.170
## genderm 0.026142 0.058050 0.450 0.652
## temp -0.055266 0.059518 -0.929 0.353
## feveryes 0.206588 0.141412 1.461 0.144
## bmi -0.001087 0.000571 -1.903 0.057 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7157.2 on 5222 degrees of freedom
## Residual deviance: 7150.0 on 5217 degrees of freedom
## AIC: 7162
##
## Number of Fisher Scoring iterations: 4
# La variable muerto será binaria
linelist$muerto <- ifelse(
linelist$outcome %in% c("Death","Died","Dead","Fallecido"),
1,
0
)
# Se filtrarán los NA
linelist_modelo <- linelist[
!is.na(linelist$muerto) &
!is.na(linelist$age) &
!is.na(linelist$fever) &
!is.na(linelist$bmi),
]
# Ajustar modelo logístico
modelo_logit <- glm(
muerto ~ age + gender + fever + bmi,
data = linelist_modelo,
family = binomial(link = "logit")
)
summary(modelo_logit)
##
## Call:
## glm(formula = muerto ~ age + gender + fever + bmi, family = binomial(link = "logit"),
## data = linelist_modelo)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.2084753 0.0869116 -2.399 0.0165 *
## age -0.0040133 0.0024653 -1.628 0.1035
## genderm 0.0125150 0.0573061 0.218 0.8271
## feveryes 0.0762064 0.0700005 1.089 0.2763
## bmi -0.0011129 0.0005683 -1.958 0.0502 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7357.7 on 5370 degrees of freedom
## Residual deviance: 7351.4 on 5366 degrees of freedom
## (182 observations deleted due to missingness)
## AIC: 7361.4
##
## Number of Fisher Scoring iterations: 4
PRESENTACIÓN DEL MÓDELO
library(gtsummary)
if (exists("modelo_logit")) {
tbl_regression(
modelo_logit,
exponentiate = TRUE
) |>
modify_header(label ~ "Variable") |>
modify_caption(
"**Modelo de regresión logística para mortalidad, ajustado por edad, género, fiebre (fever) y estado nutricional (BMI) — OR e IC 95%**"
)
}
| Variable | OR | 95% CI | p-value |
|---|---|---|---|
| age | 1.00 | 0.99, 1.00 | 0.10 |
| gender | |||
| f | — | — | |
| m | 1.01 | 0.91, 1.13 | 0.8 |
| fever | |||
| no | — | — | |
| yes | 1.08 | 0.94, 1.24 | 0.3 |
| bmi | 1.00 | 1.00, 1.00 | 0.050 |
| Abbreviations: CI = Confidence Interval, OR = Odds Ratio | |||
Construir tablas 2×2 específicas (por ejemplo, exposición = fiebre sí/no, desenlace = muerte sí/no) y comparar:
Riesgo de muerte.
Razón de riesgos.
Diferencia de riesgos.
library(dplyr)
library(tidyr)
# Construcción de la tabla 2×2
tabla_2x2 <- linelist |>
filter(!is.na(fever), !is.na(muerto)) |>
count(fever, muerto) |>
pivot_wider(names_from = muerto, values_from = n, values_fill = 0)
tabla_2x2
# Extraer cada celda
a <- tabla_2x2$`1`[tabla_2x2$fever == "yes"] # Muertes con fiebre
b <- tabla_2x2$`0`[tabla_2x2$fever == "yes"] # Vivos con fiebre
c <- tabla_2x2$`1`[tabla_2x2$fever == "no"] # Muertes sin fiebre
d <- tabla_2x2$`0`[tabla_2x2$fever == "no"] # Vivos sin fiebre
# Calcular riesgos
riesgo_fiebre <- a / (a + b)
riesgo_no_fiebre <- c / (c + d)
# Medidas epidemiológicas
RR <- riesgo_fiebre / riesgo_no_fiebre
RD <- riesgo_fiebre - riesgo_no_fiebre
list(
riesgo_con_fiebre = riesgo_fiebre,
riesgo_sin_fiebre = riesgo_no_fiebre,
razon_de_riesgos = RR,
diferencia_de_riesgos = RD
)
## $riesgo_con_fiebre
## [1] 0.4400967
##
## $riesgo_sin_fiebre
## [1] 0.4201835
##
## $razon_de_riesgos
## [1] 1.047392
##
## $diferencia_de_riesgos
## [1] 0.01991324
hospital) o por
categoría de edad (age_cat, age_cat5).# HOSPITALES
library(gtsummary)
tbl_hospital <- linelist |>
select(hospital, age, gender, fever, temp, bmi, muerto) |>
tbl_summary(
by = hospital, # estratificación
missing = "ifany", # mostrar NA si existen
statistic = list(
all_continuous() ~ "{mean} ({sd})",
all_categorical() ~ "{n} ({p}%)"
)
) |>
add_p() |>
modify_caption("**Tabla descriptiva estratificada por hospital**")
tbl_hospital
| Characteristic | Central Hospital N = 4541 |
Military Hospital N = 8961 |
Missing N = 1,4691 |
Other N = 8851 |
Port Hospital N = 1,7621 |
St. Mark’s Maternity Hospital (SMMH) N = 4221 |
p-value2 |
|---|---|---|---|---|---|---|---|
| age | 16 (12) | 16 (12) | 16 (13) | 16 (12) | 16 (13) | 16 (12) | >0.9 |
| Unknown | 9 | 12 | 28 | 12 | 23 | 2 | |
| gender | 0.7 | ||||||
| f | 202 (47%) | 421 (49%) | 700 (50%) | 418 (50%) | 857 (51%) | 209 (52%) | |
| m | 230 (53%) | 435 (51%) | 696 (50%) | 423 (50%) | 823 (49%) | 196 (48%) | |
| Unknown | 22 | 40 | 73 | 44 | 82 | 17 | |
| fever | 351 (81%) | 708 (83%) | 1,144 (81%) | 676 (79%) | 1,350 (80%) | 320 (80%) | 0.4 |
| Unknown | 20 | 44 | 55 | 33 | 75 | 22 | |
| temp | 38.52 (0.96) | 38.62 (0.95) | 38.58 (0.97) | 38.53 (1.01) | 38.55 (0.98) | 38.51 (0.98) | 0.2 |
| Unknown | 4 | 23 | 38 | 23 | 49 | 12 | |
| bmi | 46 (41) | 47 (46) | 48 (50) | 45 (70) | 47 (55) | 48 (69) | 0.9 |
| muerto | 193 (43%) | 399 (45%) | 611 (42%) | 395 (45%) | 785 (45%) | 199 (47%) | 0.3 |
| 1 Mean (SD); n (%) | |||||||
| 2 Kruskal-Wallis rank sum test; Pearson’s Chi-squared test | |||||||
# AGE_CAT
tbl_agecat <- linelist |>
select(age_cat, age, gender, fever, temp, bmi, muerto) |>
tbl_summary(
by = age_cat,
missing = "ifany",
statistic = list(
all_continuous() ~ "{mean} ({sd})",
all_categorical() ~ "{n} ({p}%)"
)
) |>
add_p() |>
modify_caption("**Tabla descriptiva estratificada por categoría de edad (age_cat)**")
## 86 missing rows in the "age_cat" column have been removed.
## The following errors were returned during `modify_caption()`:
## ✖ For variable `fever` (`age_cat`) and "estimate", "p.value", "conf.low", and
## "conf.high" statistics: FEXACT error 7(location). LDSTP=17610 is too small
## for this problem, (pastp=762.583, ipn_0:=ipoin[itp=52]=4316,
## stp[ipn_0]=763.007). Increase workspace or consider using
## 'simulate.p.value=TRUE'
## ✖ For variable `gender` (`age_cat`) and "estimate", "p.value", "conf.low", and
## "conf.high" statistics: FEXACT error 7(location). LDSTP=17610 is too small
## for this problem, (pastp=410.112, ipn_0:=ipoin[itp=246]=5459,
## stp[ipn_0]=419.544). Increase workspace or consider using
## 'simulate.p.value=TRUE'
## ✖ For variable `muerto` (`age_cat`) and "estimate", "p.value", "conf.low", and
## "conf.high" statistics: FEXACT error 7(location). LDSTP=17580 is too small
## for this problem, (pastp=1086.01, ipn_0:=ipoin[itp=468]=842,
## stp[ipn_0]=1084.05). Increase workspace or consider using
## 'simulate.p.value=TRUE'
tbl_agecat
| Characteristic | 0-4 N = 1,0951 |
10-14 N = 9411 |
15-19 N = 7431 |
20-29 N = 1,0731 |
30-49 N = 7541 |
5-9 N = 1,0951 |
50-69 N = 951 |
70+ N = 61 |
p-value2 |
|---|---|---|---|---|---|---|---|---|---|
| age | 2 (3) | 12 (1) | 17 (1) | 24 (3) | 37 (5) | 7 (1) | 57 (6) | 75 (5) | <0.001 |
| gender | |||||||||
| f | 640 (61%) | 518 (57%) | 359 (50%) | 468 (45%) | 179 (24%) | 641 (61%) | 2 (2.2%) | 0 (0%) | |
| m | 416 (39%) | 383 (43%) | 364 (50%) | 575 (55%) | 557 (76%) | 412 (39%) | 91 (98%) | 5 (100%) | |
| Unknown | 39 | 40 | 20 | 30 | 18 | 42 | 2 | 1 | |
| fever | 831 (80%) | 706 (80%) | 574 (81%) | 860 (83%) | 576 (79%) | 860 (82%) | 64 (71%) | 6 (100%) | |
| Unknown | 52 | 53 | 31 | 34 | 28 | 46 | 5 | 0 | |
| temp | 38.53 (0.98) | 38.52 (1.01) | 38.59 (1.00) | 38.62 (0.97) | 38.53 (0.96) | 38.57 (0.95) | 38.36 (1.10) | 38.90 (0.54) | 0.13 |
| Unknown | 28 | 20 | 22 | 35 | 16 | 26 | 2 | 0 | |
| bmi | 107 (105) | 33 (8) | 33 (12) | 27 (7) | 22 (5) | 48 (21) | 18 (3) | 17 (4) | <0.001 |
| muerto | 471 (43%) | 438 (47%) | 323 (43%) | 477 (44%) | 329 (44%) | 476 (43%) | 33 (35%) | 3 (50%) | |
| 1 Mean (SD); n (%) | |||||||||
| 2 Kruskal-Wallis rank sum test; NA | |||||||||
# AGE_CAT 5
tbl_agecat5 <- linelist |>
select(age_cat5, age, gender, fever, temp, bmi, muerto) |>
tbl_summary(
by = age_cat5,
missing = "ifany",
statistic = list(
all_continuous() ~ "{mean} ({sd})",
all_categorical() ~ "{n} ({p}%)"
)
) |>
add_p() |>
modify_caption("**Tabla descriptiva estratificada por categoría de edad (age_cat5)**")
## 86 missing rows in the "age_cat5" column have been removed.
## The following errors were returned during `modify_caption()`:
## ✖ For variable `fever` (`age_cat5`) and "estimate", "p.value", "conf.low", and
## "conf.high" statistics: FEXACT error 7(location). LDSTP=17610 is too small
## for this problem, (pastp=33.4524, ipn_0:=ipoin[itp=321]=2231,
## stp[ipn_0]=36.4824). Increase workspace or consider using
## 'simulate.p.value=TRUE'
## ✖ For variable `gender` (`age_cat5`) and "estimate", "p.value", "conf.low", and
## "conf.high" statistics: FEXACT error 7(location). LDSTP=17580 is too small
## for this problem, (pastp=51.1508, ipn_0:=ipoin[itp=253]=16905,
## stp[ipn_0]=43.1484). Increase workspace or consider using
## 'simulate.p.value=TRUE'
## ✖ For variable `muerto` (`age_cat5`) and "estimate", "p.value", "conf.low", and
## "conf.high" statistics: FEXACT error 7(location). LDSTP=17550 is too small
## for this problem, (pastp=55.8074, ipn_0:=ipoin[itp=346]=959,
## stp[ipn_0]=53.9459). Increase workspace or consider using
## 'simulate.p.value=TRUE'
tbl_agecat5
| Characteristic | 0-4 N = 1,0951 |
10-14 N = 9411 |
15-19 N = 7431 |
20-24 N = 6381 |
25-29 N = 4351 |
30-34 N = 3261 |
35-39 N = 2021 |
40-44 N = 1331 |
45-49 N = 931 |
5-9 N = 1,0951 |
50-54 N = 401 |
55-59 N = 301 |
60-64 N = 121 |
65-69 N = 131 |
70-74 N = 41 |
75-79 N = 11 |
80-84 N = 11 |
p-value2 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| age | 2 (3) | 12 (1) | 17 (1) | 22 (1) | 27 (1) | 32 (1) | 37 (1) | 42 (1) | 47 (1) | 7 (1) | 51 (1) | 57 (1) | 62 (1) | 67 (2) | 72 (1) | 76 (NA) | 84 (NA) | <0.001 |
| gender | ||||||||||||||||||
| f | 640 (61%) | 518 (57%) | 359 (50%) | 305 (49%) | 163 (39%) | 104 (33%) | 42 (21%) | 25 (19%) | 8 (9.1%) | 641 (61%) | 2 (5.1%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (NA%) | 0 (0%) | |
| m | 416 (39%) | 383 (43%) | 364 (50%) | 316 (51%) | 259 (61%) | 213 (67%) | 157 (79%) | 107 (81%) | 80 (91%) | 412 (39%) | 37 (95%) | 30 (100%) | 12 (100%) | 12 (100%) | 4 (100%) | 0 (NA%) | 1 (100%) | |
| Unknown | 39 | 40 | 20 | 17 | 13 | 9 | 3 | 1 | 5 | 42 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | |
| fever | 831 (80%) | 706 (80%) | 574 (81%) | 506 (82%) | 354 (84%) | 255 (81%) | 156 (80%) | 99 (77%) | 66 (78%) | 860 (82%) | 24 (62%) | 22 (81%) | 9 (75%) | 9 (75%) | 4 (100%) | 1 (100%) | 1 (100%) | |
| Unknown | 52 | 53 | 31 | 19 | 15 | 10 | 6 | 4 | 8 | 46 | 1 | 3 | 0 | 1 | 0 | 0 | 0 | |
| temp | 38.53 (0.98) | 38.52 (1.01) | 38.59 (1.00) | 38.62 (0.99) | 38.63 (0.95) | 38.55 (0.93) | 38.53 (0.93) | 38.50 (1.03) | 38.51 (1.04) | 38.57 (0.95) | 38.12 (1.22) | 38.41 (0.91) | 38.65 (1.03) | 38.66 (1.11) | 38.73 (0.53) | 39.60 (NA) | 38.90 (NA) | 0.3 |
| Unknown | 28 | 20 | 22 | 22 | 13 | 7 | 3 | 3 | 3 | 26 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | |
| bmi | 107 (105) | 33 (8) | 33 (12) | 28 (8) | 26 (6) | 23 (5) | 22 (5) | 20 (4) | 19 (4) | 48 (21) | 19 (3) | 18 (3) | 15 (2) | 16 (3) | 18 (5) | 14 (NA) | 13 (NA) | <0.001 |
| muerto | 471 (43%) | 438 (47%) | 323 (43%) | 287 (45%) | 190 (44%) | 151 (46%) | 82 (41%) | 54 (41%) | 42 (45%) | 476 (43%) | 14 (35%) | 12 (40%) | 2 (17%) | 5 (38%) | 2 (50%) | 1 (100%) | 0 (0%) | |
| 1 Mean (SD); n (%) | ||||||||||||||||||
| 2 Kruskal-Wallis rank sum test; NA | ||||||||||||||||||
# CREAR UN VECTOR DE DATOS
datos <- c(100, 200, 300, 400, 500)
# CALCULAR EL TOTAL Y PROMEDIO
total <- sum(datos)
promedio <- mean(datos)
# MOSTRAR RESULTADOS
cat("Total:", total, "\n")
## Total: 1500
# Total: 1500
cat("Promedio:", promedio, "\n")
## Promedio: 300
# Promedio: 300
# GRÁFICAR LOS DATOS CON ggplot2
library("ggplot2")
library("dplyr")
df <- data.frame(x = 1:5, y = datos)
df
# se llama la función de ggplot, data frame llamado df, aesthetic (aes) el eje x va a gráficar lo que esté en x, en el eje y lo que esté en y.
ggplot(df, aes(x = x, y = y)) +
geom_bar(stat = "identity", fill = "#A2C4E6") +
labs(title = "Gráfico de barras", x = "Índice", y = "Valores")
df <- df %>% mutate(x = c("jorge", "miguel", "manuel", "claudia", "ivan"),
# factor obliga al codigo a seguir el orden de nombres dado, no reordenarlo por alfabetico.
x = factor(x, levels = c("jorge", "miguel", "manuel", "claudia", "ivan")))
df
ggplot(df, aes(x = x, y = y)) +
geom_bar(stat = "identity", fill = "#A2C4E6") +
labs(title = "Gráfico de barras", x = "Índice", y = "Valores")