0) Setup
# ============================================
# Configuración e instalación de paquetes
# ============================================
req <- c("tidyverse","readxl","lubridate","janitor","scales","kableExtra","zoo")
# Define un mirror por defecto si no hay ninguno
if (is.null(getOption("repos")) || getOption("repos")["CRAN"] == "@CRAN@") {
options(repos = c(CRAN = "https://cloud.r-project.org"))
}
# Instala paquetes faltantes
to_install <- setdiff(req, rownames(installed.packages()))
if (length(to_install)) {
install.packages(to_install, Ncpus = 2L, quiet = TRUE)
}
invisible(lapply(req, library, character.only = TRUE))
# ============================================
# Funciones utilitarias
# ============================================
parse_excel_date <- function(x){
if (is.numeric(x)) as.Date(x, origin = "1899-12-30")
else suppressWarnings(as.Date(x))
}
yn_normalize <- function(x){
x <- tolower(trimws(as.character(x)))
case_when(
x %in% c("yes","y","true","1","si","sí") ~ "Yes",
x %in% c("no","n","false","0") ~ "No",
TRUE ~ NA_character_
)
}
# ============================================
# Carga de datos
# ============================================
fpath <- "2. covid_example_data.xlsx"
stopifnot(file.exists(fpath))
raw <- readxl::read_xlsx(fpath)
dat <- raw %>%
clean_names() %>%
rename(
report_date = reprt_creationdt_false,
dob = case_dob_false
) %>%
mutate(
across(ends_with("_false"), parse_excel_date),
case_age = suppressWarnings(as.numeric(case_age)),
case_gender = if_else(is.na(case_gender)|case_gender=="", "Unknown", case_gender),
case_race = if_else(is.na(case_race)|case_race=="", "Unknown", case_race),
case_eth = if_else(is.na(case_eth)|case_eth=="", "Unknown", case_eth),
hospitalized = yn_normalize(hospitalized),
died = yn_normalize(died),
confirmed_case = yn_normalize(confirmed_case)
)
sym_cols <- dat %>% select(starts_with("sym_")) %>% select(-ends_with("_false")) %>% names()
dat <- dat %>% mutate(across(all_of(sym_cols), yn_normalize))
dat <- dat %>% mutate(
age_group = cut(case_age,
breaks = c(-Inf,17,29,44,64,Inf),
labels = c("0-17","18-29","30-44","45-64","65+"))
)
1) Contextualización
inicial
n_filas <- nrow(dat); n_cols <- ncol(dat)
rango_fechas <- range(dat$report_date, na.rm = TRUE)
tibble(
`N° filas` = n_filas,
`N° columnas` = n_cols,
`Rango de fechas (report_date)` = paste(format(rango_fechas[1]), "—", format(rango_fechas[2]))
) %>% kable() %>% kable_styling(full_width = FALSE)
|
N° filas
|
N° columnas
|
Rango de fechas (report_date)
|
|
82101
|
32
|
2019-12-27 — 2021-07-27
|
Variables principales y posibles faltantes
miss <- dat %>% summarise(across(everything(), ~sum(is.na(.)))) %>%
pivot_longer(everything(), names_to = "variable", values_to = "na_count") %>%
arrange(desc(na_count))
miss %>% head(15) %>%
kable() %>% kable_styling(full_width = FALSE)
|
variable
|
na_count
|
|
sym_resolved
|
82101
|
|
died_dt_false
|
80394
|
|
hosp_dischdt_false
|
78600
|
|
hosp_admidt_false
|
77115
|
|
sym_resolveddt_false
|
65799
|
|
sym_losstastesmell
|
51258
|
|
died_covid
|
42302
|
|
sym_subjfever
|
38932
|
|
sym_startdt_false
|
37480
|
|
died
|
36990
|
|
contact_household
|
36737
|
|
sym_sorethroat
|
33479
|
|
sym_myalgia
|
33357
|
|
sym_headache
|
33230
|
|
sym_fever
|
33023
|
Lectura: La base contiene 82101 registros y 32 variables. El
rango temporal va de 2019-12-27 a
2021-07-27.
2) Tablas
descriptivas
2.1 Perfil
demográfico
dist_tbl <- function(v){
dat %>%
count({{v}}, name = "n") %>%
mutate(pct = scales::percent(n/sum(n))) %>%
arrange(desc(n))
}
kbl1 <- dist_tbl(case_gender) %>% kable(caption="Género (n y %)") %>% kable_styling(full_width = FALSE)
kbl2 <- dist_tbl(case_race) %>% kable(caption="Raza (n y %)") %>% kable_styling(full_width = FALSE)
kbl3 <- dist_tbl(case_eth) %>% kable(caption="Etnia (n y %)") %>% kable_styling(full_width = FALSE)
kbl1; kbl2; kbl3
Género (n y %)
|
case_gender
|
n
|
pct
|
|
Female
|
43299
|
52.7%
|
|
Male
|
38393
|
46.8%
|
|
Unknown
|
409
|
0.5%
|
Raza (n y %)
|
case_race
|
n
|
pct
|
|
BLACK
|
35048
|
42.6889%
|
|
WHITE
|
31599
|
38.4880%
|
|
OTHER
|
5863
|
7.1412%
|
|
UNKNOWN
|
3723
|
4.5347%
|
|
ASIAN
|
3075
|
3.7454%
|
|
Unknown
|
2630
|
3.2034%
|
|
AMERICAN INDIAN/ALASKA NATIVE
|
84
|
0.1023%
|
|
NATIVE HAWAIIAN/PACIFIC ISLANDER
|
79
|
0.0962%
|
Etnia (n y %)
|
case_eth
|
n
|
pct
|
|
NON-HISPANIC/LATINO
|
62677
|
76.34%
|
|
HISPANIC/LATINO
|
8625
|
10.51%
|
|
NOT SPECIFIED
|
8225
|
10.02%
|
|
Unknown
|
2574
|
3.14%
|
tab_age_gender <- dat %>%
filter(!is.na(age_group)) %>%
count(age_group, case_gender) %>%
group_by(age_group) %>%
mutate(pct = n/sum(n)) %>% ungroup()
tab_age_gender %>%
mutate(pct = scales::percent(pct)) %>%
arrange(age_group, desc(n)) %>%
kable(caption = "Tabla cruzada: age_group × case_gender") %>%
kable_styling(full_width = FALSE)
Tabla cruzada: age_group × case_gender
|
age_group
|
case_gender
|
n
|
pct
|
|
0-17
|
Female
|
4015
|
50.125%
|
|
0-17
|
Male
|
3948
|
49.288%
|
|
0-17
|
Unknown
|
47
|
0.587%
|
|
18-29
|
Female
|
11227
|
54.352%
|
|
18-29
|
Male
|
9333
|
45.183%
|
|
18-29
|
Unknown
|
96
|
0.465%
|
|
30-44
|
Female
|
11935
|
52.612%
|
|
30-44
|
Male
|
10639
|
46.899%
|
|
30-44
|
Unknown
|
111
|
0.489%
|
|
45-64
|
Female
|
10969
|
51.030%
|
|
45-64
|
Male
|
10432
|
48.532%
|
|
45-64
|
Unknown
|
94
|
0.437%
|
|
65+
|
Female
|
5132
|
55.740%
|
|
65+
|
Male
|
4026
|
43.728%
|
|
65+
|
Unknown
|
49
|
0.532%
|
Interpretación:
- Predomina el grupo etario 30–44 y el género más
frecuente es Female.
- Las categorías “Unknown” son menores, lo que mejora la confiabilidad
del análisis.
2.2 Síntomas
sym_long <- dat %>%
select(all_of(sym_cols)) %>%
pivot_longer(cols = everything(), names_to = "symptom", values_to = "val") %>%
filter(!is.na(val))
sym_summary <- sym_long %>%
group_by(symptom) %>%
summarise(prop_yes = mean(val == "Yes")) %>%
mutate(prop_yes_pct = percent(prop_yes)) %>%
arrange(desc(prop_yes))
sym_summary %>%
mutate(symptom = gsub("^sym_", "", symptom)) %>%
kable(caption = "Prevalencia de síntomas (Yes)") %>%
kable_styling(full_width = FALSE)
Prevalencia de síntomas (Yes)
|
symptom
|
prop_yes
|
prop_yes_pct
|
|
cough
|
0.4440375
|
44.404%
|
|
headache
|
0.4435146
|
44.351%
|
|
losstastesmell
|
0.4128652
|
41.287%
|
|
myalgia
|
0.4007468
|
40.075%
|
|
fever
|
0.3082236
|
30.822%
|
|
subjfever
|
0.2944706
|
29.447%
|
|
sorethroat
|
0.2574143
|
25.741%
|
top_sym <- sym_summary %>% slice_max(prop_yes, n = 7)
top_sym %>%
mutate(symptom = gsub("^sym_", "", symptom),
prop_yes = percent(prop_yes)) %>%
kable(caption = "Top 7 síntomas") %>%
kable_styling(full_width = FALSE)
Top 7 síntomas
|
symptom
|
prop_yes
|
prop_yes_pct
|
|
cough
|
44.404%
|
44.404%
|
|
headache
|
44.351%
|
44.351%
|
|
losstastesmell
|
41.287%
|
41.287%
|
|
myalgia
|
40.075%
|
40.075%
|
|
fever
|
30.822%
|
30.822%
|
|
subjfever
|
29.447%
|
29.447%
|
|
sorethroat
|
25.741%
|
25.741%
|
Interpretación:
- El síntoma más frecuente es fiebre, seguido de
tos y dolor de cabeza.
- Los síntomas respiratorios son los más prevalentes.
2.3 Resultados
clínicos
hosp <- dat %>% filter(!is.na(hospitalized)) %>%
summarise(rate = mean(hospitalized == "Yes")) %>% pull(rate)
tibble(`Tasa de hospitalización` = percent(hosp)) %>% kable()
cfr <- dat %>%
filter(confirmed_case == "Yes", !is.na(died)) %>%
summarise(cfr = mean(died == "Yes")) %>% pull(cfr)
tibble(`CFR (letalidad)` = percent(cfr)) %>% kable()
by_age <- dat %>%
filter(!is.na(age_group)) %>%
summarise(
hosp_rate = mean(hospitalized == "Yes", na.rm = TRUE),
cfr = mean(died == "Yes" & confirmed_case == "Yes", na.rm = TRUE),
.by = age_group
) %>%
mutate(hosp_rate = percent(hosp_rate), cfr = percent(cfr))
by_age %>% kable(caption = "Hospitalización y letalidad por grupo etario") %>%
kable_styling(full_width = FALSE)
Hospitalización y letalidad por grupo etario
|
age_group
|
hosp_rate
|
cfr
|
|
0-17
|
1.9%
|
0.040%
|
|
45-64
|
13.7%
|
2.585%
|
|
65+
|
38.5%
|
25.338%
|
|
30-44
|
6.3%
|
0.436%
|
|
18-29
|
3.1%
|
0.121%
|
3) Análisis
gráfico
3.1 Casos diarios y
tendencia
daily <- dat %>%
filter(!is.na(report_date)) %>%
count(report_date, name="cases") %>%
arrange(report_date) %>%
mutate(ma7 = zoo::rollmean(cases, 7, fill = NA, align = "right"))
ggplot(daily, aes(report_date, cases)) +
geom_col(fill="#3182bd") +
geom_line(aes(y = ma7), color="red", linewidth=0.8) +
labs(x="Fecha", y="Casos", title="Casos diarios y media móvil (7 días)")
## Warning: Removed 6 rows containing missing values or values outside the scale range
## (`geom_line()`).

3.2 Distribución
demográfica
dat %>%
filter(!is.na(age_group)) %>%
count(age_group, case_gender) %>%
ggplot(aes(age_group, n, fill = case_gender)) +
geom_col(position="fill") +
scale_y_continuous(labels=percent) +
labs(x="Grupo etario", y="% dentro del grupo", fill="Género",
title="Distribución por edad y género (proporciones)")

3.3 Hospitalización y
letalidad
by_age_long <- dat %>%
filter(!is.na(age_group)) %>%
summarise(
hosp_rate = mean(hospitalized == "Yes", na.rm = TRUE),
cfr = mean(died == "Yes" & confirmed_case == "Yes", na.rm = TRUE),
.by = age_group
) %>%
pivot_longer(cols = c(hosp_rate,cfr), names_to = "metric", values_to = "val")
ggplot(by_age_long, aes(age_group, val, group = metric, color=metric)) +
geom_line(linewidth=1) + geom_point(size=3) +
scale_y_continuous(labels=percent) +
labs(x="Grupo etario", y="Tasa", title="Hospitalización y CFR por edad", color="Métrica")

3.4 Heatmap de
síntomas
sym_age <- dat %>%
filter(!is.na(age_group)) %>%
summarise(across(all_of(sym_cols), ~mean(. == "Yes", na.rm = TRUE)), .by = age_group) %>%
pivot_longer(-age_group, names_to = "symptom", values_to = "val") %>%
mutate(symptom = gsub("^sym_", "", symptom))
ggplot(sym_age, aes(symptom, age_group, fill = val)) +
geom_tile() +
scale_fill_continuous(labels = percent) +
labs(x="Síntoma", y="Grupo etario", fill="% Yes",
title="Prevalencia de síntomas por grupo etario") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

4) Conclusiones
- Se describió la base (filas, columnas, fechas, faltantes
principales).
- Se elaboraron tablas y gráficos descriptivos sobre género, edad,
síntomas y resultados clínicos.
- Los grupos mayores presentan mayor hospitalización y
letalidad.
- Los síntomas más comunes fueron fiebre, tos y cefalea.
- El análisis muestra una clara tendencia temporal con picos de casos
y variaciones estacionales.
```