1 0) Setup

# ============================================
# Configuración e instalación de paquetes
# ============================================

req <- c("tidyverse","readxl","lubridate","janitor","scales","kableExtra","zoo")

# Define un mirror por defecto si no hay ninguno
if (is.null(getOption("repos")) || getOption("repos")["CRAN"] == "@CRAN@") {
  options(repos = c(CRAN = "https://cloud.r-project.org"))
}

# Instala paquetes faltantes
to_install <- setdiff(req, rownames(installed.packages()))
if (length(to_install)) {
  install.packages(to_install, Ncpus = 2L, quiet = TRUE)
}

invisible(lapply(req, library, character.only = TRUE))

# ============================================
# Funciones utilitarias
# ============================================

parse_excel_date <- function(x){
  if (is.numeric(x)) as.Date(x, origin = "1899-12-30")
  else suppressWarnings(as.Date(x))
}

yn_normalize <- function(x){
  x <- tolower(trimws(as.character(x)))
  case_when(
    x %in% c("yes","y","true","1","si","sí") ~ "Yes",
    x %in% c("no","n","false","0") ~ "No",
    TRUE ~ NA_character_
  )
}

# ============================================
# Carga de datos
# ============================================

fpath <- "2. covid_example_data.xlsx"
stopifnot(file.exists(fpath))

raw <- readxl::read_xlsx(fpath)

dat <- raw %>%
  clean_names() %>%
  rename(
    report_date = reprt_creationdt_false,
    dob = case_dob_false
  ) %>%
  mutate(
    across(ends_with("_false"), parse_excel_date),
    case_age = suppressWarnings(as.numeric(case_age)),
    case_gender = if_else(is.na(case_gender)|case_gender=="", "Unknown", case_gender),
    case_race   = if_else(is.na(case_race)|case_race=="", "Unknown", case_race),
    case_eth    = if_else(is.na(case_eth)|case_eth=="", "Unknown", case_eth),
    hospitalized = yn_normalize(hospitalized),
    died         = yn_normalize(died),
    confirmed_case = yn_normalize(confirmed_case)
  )

sym_cols <- dat %>% select(starts_with("sym_")) %>% select(-ends_with("_false")) %>% names()
dat <- dat %>% mutate(across(all_of(sym_cols), yn_normalize))

dat <- dat %>% mutate(
  age_group = cut(case_age,
                  breaks = c(-Inf,17,29,44,64,Inf),
                  labels = c("0-17","18-29","30-44","45-64","65+"))
)

2 1) Contextualización inicial

n_filas <- nrow(dat); n_cols <- ncol(dat)
rango_fechas <- range(dat$report_date, na.rm = TRUE)

tibble(
  `N° filas` = n_filas,
  `N° columnas` = n_cols,
  `Rango de fechas (report_date)` = paste(format(rango_fechas[1]), "—", format(rango_fechas[2]))
) %>% kable() %>% kable_styling(full_width = FALSE)
N° filas N° columnas Rango de fechas (report_date)
82101 32 2019-12-27 — 2021-07-27

Variables principales y posibles faltantes

miss <- dat %>% summarise(across(everything(), ~sum(is.na(.)))) %>%
  pivot_longer(everything(), names_to = "variable", values_to = "na_count") %>%
  arrange(desc(na_count))

miss %>% head(15) %>%
  kable() %>% kable_styling(full_width = FALSE)
variable na_count
sym_resolved 82101
died_dt_false 80394
hosp_dischdt_false 78600
hosp_admidt_false 77115
sym_resolveddt_false 65799
sym_losstastesmell 51258
died_covid 42302
sym_subjfever 38932
sym_startdt_false 37480
died 36990
contact_household 36737
sym_sorethroat 33479
sym_myalgia 33357
sym_headache 33230
sym_fever 33023

Lectura: La base contiene 82101 registros y 32 variables. El rango temporal va de 2019-12-27 a 2021-07-27.

3 2) Tablas descriptivas

3.1 2.1 Perfil demográfico

dist_tbl <- function(v){
  dat %>%
    count({{v}}, name = "n") %>%
    mutate(pct = scales::percent(n/sum(n))) %>%
    arrange(desc(n))
}

kbl1 <- dist_tbl(case_gender) %>% kable(caption="Género (n y %)") %>% kable_styling(full_width = FALSE)
kbl2 <- dist_tbl(case_race)   %>% kable(caption="Raza (n y %)") %>% kable_styling(full_width = FALSE)
kbl3 <- dist_tbl(case_eth)    %>% kable(caption="Etnia (n y %)") %>% kable_styling(full_width = FALSE)
kbl1; kbl2; kbl3
Género (n y %)
case_gender n pct
Female 43299 52.7%
Male 38393 46.8%
Unknown 409 0.5%
Raza (n y %)
case_race n pct
BLACK 35048 42.6889%
WHITE 31599 38.4880%
OTHER 5863 7.1412%
UNKNOWN 3723 4.5347%
ASIAN 3075 3.7454%
Unknown 2630 3.2034%
AMERICAN INDIAN/ALASKA NATIVE 84 0.1023%
NATIVE HAWAIIAN/PACIFIC ISLANDER 79 0.0962%
Etnia (n y %)
case_eth n pct
NON-HISPANIC/LATINO 62677 76.34%
HISPANIC/LATINO 8625 10.51%
NOT SPECIFIED 8225 10.02%
Unknown 2574 3.14%
tab_age_gender <- dat %>%
  filter(!is.na(age_group)) %>%
  count(age_group, case_gender) %>%
  group_by(age_group) %>%
  mutate(pct = n/sum(n)) %>% ungroup()

tab_age_gender %>%
  mutate(pct = scales::percent(pct)) %>%
  arrange(age_group, desc(n)) %>%
  kable(caption = "Tabla cruzada: age_group × case_gender") %>%
  kable_styling(full_width = FALSE)
Tabla cruzada: age_group × case_gender
age_group case_gender n pct
0-17 Female 4015 50.125%
0-17 Male 3948 49.288%
0-17 Unknown 47 0.587%
18-29 Female 11227 54.352%
18-29 Male 9333 45.183%
18-29 Unknown 96 0.465%
30-44 Female 11935 52.612%
30-44 Male 10639 46.899%
30-44 Unknown 111 0.489%
45-64 Female 10969 51.030%
45-64 Male 10432 48.532%
45-64 Unknown 94 0.437%
65+ Female 5132 55.740%
65+ Male 4026 43.728%
65+ Unknown 49 0.532%

Interpretación:
- Predomina el grupo etario 30–44 y el género más frecuente es Female.
- Las categorías “Unknown” son menores, lo que mejora la confiabilidad del análisis.

3.2 2.2 Síntomas

sym_long <- dat %>%
  select(all_of(sym_cols)) %>%
  pivot_longer(cols = everything(), names_to = "symptom", values_to = "val") %>%
  filter(!is.na(val))

sym_summary <- sym_long %>%
  group_by(symptom) %>%
  summarise(prop_yes = mean(val == "Yes")) %>%
  mutate(prop_yes_pct = percent(prop_yes)) %>%
  arrange(desc(prop_yes))

sym_summary %>%
  mutate(symptom = gsub("^sym_", "", symptom)) %>%
  kable(caption = "Prevalencia de síntomas (Yes)") %>%
  kable_styling(full_width = FALSE)
Prevalencia de síntomas (Yes)
symptom prop_yes prop_yes_pct
cough 0.4440375 44.404%
headache 0.4435146 44.351%
losstastesmell 0.4128652 41.287%
myalgia 0.4007468 40.075%
fever 0.3082236 30.822%
subjfever 0.2944706 29.447%
sorethroat 0.2574143 25.741%
top_sym <- sym_summary %>% slice_max(prop_yes, n = 7)
top_sym %>%
  mutate(symptom = gsub("^sym_", "", symptom),
         prop_yes = percent(prop_yes)) %>%
  kable(caption = "Top 7 síntomas") %>%
  kable_styling(full_width = FALSE)
Top 7 síntomas
symptom prop_yes prop_yes_pct
cough 44.404% 44.404%
headache 44.351% 44.351%
losstastesmell 41.287% 41.287%
myalgia 40.075% 40.075%
fever 30.822% 30.822%
subjfever 29.447% 29.447%
sorethroat 25.741% 25.741%

Interpretación:
- El síntoma más frecuente es fiebre, seguido de tos y dolor de cabeza.
- Los síntomas respiratorios son los más prevalentes.

3.3 2.3 Resultados clínicos

hosp <- dat %>% filter(!is.na(hospitalized)) %>%
  summarise(rate = mean(hospitalized == "Yes")) %>% pull(rate)
tibble(`Tasa de hospitalización` = percent(hosp)) %>% kable()
Tasa de hospitalización
11%
cfr <- dat %>%
  filter(confirmed_case == "Yes", !is.na(died)) %>%
  summarise(cfr = mean(died == "Yes")) %>% pull(cfr)
tibble(`CFR (letalidad)` = percent(cfr)) %>% kable()
CFR (letalidad)
4%
by_age <- dat %>%
  filter(!is.na(age_group)) %>%
  summarise(
    hosp_rate = mean(hospitalized == "Yes", na.rm = TRUE),
    cfr = mean(died == "Yes" & confirmed_case == "Yes", na.rm = TRUE),
    .by = age_group
  ) %>%
  mutate(hosp_rate = percent(hosp_rate), cfr = percent(cfr))

by_age %>% kable(caption = "Hospitalización y letalidad por grupo etario") %>%
  kable_styling(full_width = FALSE)
Hospitalización y letalidad por grupo etario
age_group hosp_rate cfr
0-17 1.9% 0.040%
45-64 13.7% 2.585%
65+ 38.5% 25.338%
30-44 6.3% 0.436%
18-29 3.1% 0.121%

4 3) Análisis gráfico

4.1 3.1 Casos diarios y tendencia

daily <- dat %>%
  filter(!is.na(report_date)) %>%
  count(report_date, name="cases") %>%
  arrange(report_date) %>%
  mutate(ma7 = zoo::rollmean(cases, 7, fill = NA, align = "right"))

ggplot(daily, aes(report_date, cases)) +
  geom_col(fill="#3182bd") +
  geom_line(aes(y = ma7), color="red", linewidth=0.8) +
  labs(x="Fecha", y="Casos", title="Casos diarios y media móvil (7 días)")
## Warning: Removed 6 rows containing missing values or values outside the scale range
## (`geom_line()`).

4.2 3.2 Distribución demográfica

dat %>%
  filter(!is.na(age_group)) %>%
  count(age_group, case_gender) %>%
  ggplot(aes(age_group, n, fill = case_gender)) +
  geom_col(position="fill") +
  scale_y_continuous(labels=percent) +
  labs(x="Grupo etario", y="% dentro del grupo", fill="Género",
       title="Distribución por edad y género (proporciones)")

4.3 3.3 Hospitalización y letalidad

by_age_long <- dat %>%
  filter(!is.na(age_group)) %>%
  summarise(
    hosp_rate = mean(hospitalized == "Yes", na.rm = TRUE),
    cfr = mean(died == "Yes" & confirmed_case == "Yes", na.rm = TRUE),
    .by = age_group
  ) %>%
  pivot_longer(cols = c(hosp_rate,cfr), names_to = "metric", values_to = "val")

ggplot(by_age_long, aes(age_group, val, group = metric, color=metric)) +
  geom_line(linewidth=1) + geom_point(size=3) +
  scale_y_continuous(labels=percent) +
  labs(x="Grupo etario", y="Tasa", title="Hospitalización y CFR por edad", color="Métrica")

4.4 3.4 Heatmap de síntomas

sym_age <- dat %>%
  filter(!is.na(age_group)) %>%
  summarise(across(all_of(sym_cols), ~mean(. == "Yes", na.rm = TRUE)), .by = age_group) %>%
  pivot_longer(-age_group, names_to = "symptom", values_to = "val") %>%
  mutate(symptom = gsub("^sym_", "", symptom))

ggplot(sym_age, aes(symptom, age_group, fill = val)) +
  geom_tile() +
  scale_fill_continuous(labels = percent) +
  labs(x="Síntoma", y="Grupo etario", fill="% Yes",
       title="Prevalencia de síntomas por grupo etario") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

5 4) Conclusiones

  • Se describió la base (filas, columnas, fechas, faltantes principales).
  • Se elaboraron tablas y gráficos descriptivos sobre género, edad, síntomas y resultados clínicos.
  • Los grupos mayores presentan mayor hospitalización y letalidad.
  • Los síntomas más comunes fueron fiebre, tos y cefalea.
  • El análisis muestra una clara tendencia temporal con picos de casos y variaciones estacionales.

```