============================================================

Variables:

Dependiente: tipo de tratamiento (derivado de las 2 últimas columnas)

Estadística: proporciones/frecuencias (categ.), media/DE o mediana/RIQ (num.),

X^2/Fisher para categóricas; t de Student para numéricas entre grupos.

OR por regresión logística cuando aplique.

============================================================

— 0) Paquetes para el proyecto ———————————

install_if_needed <- function(pkgs) {
  to_install <- pkgs[!pkgs %in% rownames(installed.packages())]
  if (length(to_install)) install.packages(to_install, dependencies = TRUE)
}
req <- c("tidyverse","readxl","janitor","stringr","forcats","gt","gtsummary","ggpubr","rstatix","fs","broom")
install_if_needed(req)
invisible(lapply(req, library, character.only = TRUE))

## Warning: package 'tidyverse' was built under R version 4.4.3

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

## Warning: package 'readxl' was built under R version 4.4.3

## Warning: package 'janitor' was built under R version 4.4.3

## 
## Adjuntando el paquete: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
## 
## 
## Adjuntando el paquete: 'rstatix'
## 
## The following object is masked from 'package:janitor':
## 
##     make_clean_names
## 
## The following object is masked from 'package:stats':
## 
##     filter

options(width = 120, scipen = 999)
fs::dir_create("outputs/tables"); fs::dir_create("outputs/figures"); fs::dir_create("outputs/logs")

— 1) Importación de la base de datos —————————————–

excel_path <- "C:/Users/fidel/OneDrive - UNIVERSIDAD AUTONOMA DE SINALOA/MDATOS/trabajo nuevocotix/Pacientes_Fractura_Filtrados_con_ID (2).xlsx"
stopifnot(file.exists(excel_path))
raw <- readxl::read_excel(excel_path, sheet = 1) %>% janitor::clean_names()

— 2) Identificar variables clave ————————–

Heurística: buscar columnas de tratamiento por nombre; si no, tomar las 2 últimas.

nm <- names(raw)
col_tx_cons <- grep("tratamiento.*conserv", nm, ignore.case = TRUE, value = TRUE)[1]
col_tx_quir <- grep("tratamiento.*quir",   nm, ignore.case = TRUE, value = TRUE)[1]
if (is.na(col_tx_cons) || is.na(col_tx_quir)) {
  # fallback estricto: últimas 2 columnas
  last2 <- tail(nm, 2)
  col_tx_cons <- last2[1]; col_tx_quir <- last2[2]
  message(sprintf("[Aviso] No se encontraron columnas de tratamiento por nombre. Se asumen las últimas dos: %s, %s", col_tx_cons, col_tx_quir))
}

— 3) Limpieza/armonización mínima ————————-

normalize_text <- function(x) stringr::str_to_upper(stringr::str_squish(stringr::str_trim(as.character(x))), locale = "es")

x <- raw %>%
  mutate(across(where(is.character), ~na_if(.x, ""))) %>%
  # Renombres suaves si existen
  rename(
    id      = any_of(c("id_paciente","id")),
    edad    = any_of(c("edad")),
    sexo    = any_of(c("sexo")),
    t_score = any_of(c("t_score","tscore","t_score_dxa")),
    segmento_fractura_afectado = any_of(c("segmento_fractura_afectado","segmento_afectado","segmento")),
    nivel_fractura_afectado    = any_of(c("nivel_fractura_afectado","nivel")),
    clas_ao_spine              = any_of(c("clasificacion_ao_spine_of","clas_ao","clas_ao_spine_of"))
  ) %>%
  mutate(
    # Sexo F/M
    sexo = factor(case_when(
      normalize_text(sexo) %in% c("F","FEMENINO") ~ "F",
      normalize_text(sexo) %in% c("M","MASCULINO") ~ "M",
      TRUE ~ NA_character_
    ), levels = c("F","M")),
    # Numéricos
    edad    = suppressWarnings(as.numeric(edad)),
    t_score = suppressWarnings(as.numeric(t_score)),
    # Segmento afectado (torácica/lumbar)
    segmento_afectado = case_when(
      str_detect(normalize_text(segmento_fractura_afectado), "TORAC|DOR|^T[0-9]|") ~ "TORÁCICA",
      str_detect(normalize_text(segmento_fractura_afectado), "LUMB|^L[0-9]") ~ "LUMBAR",
      TRUE ~ NA_character_
    ),
    # Clasificación AO-OF (extraer OF1-OF5)
    ao_raw = normalize_text(clas_ao_spine),
    ao_tmp = str_extract(ao_raw, "OF\\s*[1-5]|^[1-5]"),
    ao_tmp = str_replace_all(ao_tmp, "\\s+", ""),
    clas_ao_of = factor(case_when(
      ao_tmp %in% c("1","OF1") ~ "OF1",
      ao_tmp %in% c("2","OF2") ~ "OF2",
      ao_tmp %in% c("3","OF3") ~ "OF3",
      ao_tmp %in% c("4","OF4") ~ "OF4",
      ao_tmp %in% c("5","OF5") ~ "OF5",
      TRUE ~ NA_character_
    ), levels = paste0("OF", 1:5), ordered = TRUE)
  ) %>%
  select(-any_of(c("ao_raw","ao_tmp")))

— 4) Dependiente: tipo_tratamiento ————————

Normalizar binarios en las 2 columnas detectadas (SI/NO, 1/0, TRUE/FALSE)

norm_bin <- function(v) {
  vv <- normalize_text(v)
  out <- case_when(
    vv %in% c("1","SI","SÍ","TRUE","T","X","PRESENTE") ~ 1,
    vv %in% c("0","NO","FALSE","F","AUSENTE") ~ 0,
    is.na(v) ~ NA_real_,
    TRUE ~ suppressWarnings(as.numeric(v))
  )
  as.integer(out)
}

x <- x %>% mutate(
  tx_cons = norm_bin(.data[[col_tx_cons]]),
  tx_quir = norm_bin(.data[[col_tx_quir]]),
  tipo_tratamiento = factor(case_when(
    tx_cons == 1 & (is.na(tx_quir) | tx_quir == 0) ~ "Conservador",
    tx_quir == 1 & (is.na(tx_cons) | tx_cons == 0) ~ "Quirúrgico",
    TRUE ~ NA_character_
  ), levels = c("Conservador","Quirúrgico"))
)

— 5) Derivadas para análisis ——————————

x <- x %>% mutate(
  grupo_edad = cut(edad, breaks = c(50,60,70,80,120), right = FALSE,
                   labels = c("50–59","60–69","70–79","≥80"))
)

— 6) Control de calidad ———————————-

Duplicados por id

if ("id" %in% names(x)) {
  dup <- x %>% count(id) %>% filter(n > 1)
  if (nrow(dup) > 0) {
    readr::write_csv(x %>% filter(id %in% dup$id), "outputs/tables/01_duplicados.csv")
    x <- x %>% distinct(id, .keep_all = TRUE)
  }
}
# Rangos plausibles (ajustables al protocolo)
x <- x %>% mutate(
  edad    = if_else(!is.na(edad)    & (edad < 50 | edad > 110), NA_real_, edad),
  t_score = if_else(!is.na(t_score) & (t_score < -6 | t_score > 2), NA_real_, t_score)
)
# Faltantes
N <- nrow(x)
miss <- x %>% summarise(across(everything(), ~sum(is.na(.)))) %>%
  pivot_longer(everything(), names_to = "variable", values_to = "n_na") %>%
  mutate(pct = round(100*n_na/N,1)) %>% arrange(desc(pct))
readr::write_csv(miss, "outputs/tables/02_missingness.csv")

# --- 7) Descriptivos (Tabla global y por tratamiento) --------
# Global
tabla_global <- x %>%
  select(edad, grupo_edad, sexo, t_score, segmento_afectado, clas_ao_of, tipo_tratamiento) %>%
  gtsummary::tbl_summary(
    missing = "ifany",
    statistic = list(
      gtsummary::all_continuous() ~ "{mean} \\u00B1 {sd}",
      gtsummary::all_categorical() ~ "{n} ({p}%)"
    ),
    digits = gtsummary::all_continuous() ~ 1
  ) %>% gtsummary::bold_labels()
tabla_global

Characteristic	N = 174¹
edad	64.5 \u00B1 10.4
grupo_edad
50–59	53 (30%)
60–69	76 (44%)
70–79	23 (13%)
≥80	22 (13%)
sexo
F	124 (71%)
M	50 (29%)
t_score	-2.9 \u00B1 0.5
segmento_afectado
TORÁCICA	174 (100%)
clas_ao_of
OF1	38 (22%)
OF2	68 (39%)
OF3	34 (20%)
OF4	26 (15%)
OF5	8 (4.6%)
tipo_tratamiento
Conservador	132 (76%)
Quirúrgico	42 (24%)
¹ Mean \u00B1 SD; n (%)

suppressMessages(tabla_global %>% gtsummary::as_gt() %>% gt::gtsave("outputs/tables/10_descriptivo_global.html"))

# Por tipo de tratamiento (si hay ≥2 categorías)
{
  tabla_por_trat <- x %>%
    select(tipo_tratamiento, edad, t_score, grupo_edad, sexo, segmento_afectado, clas_ao_of) %>%
    gtsummary::tbl_summary(
      by = tipo_tratamiento, missing = "ifany",
      statistic = list(
        gtsummary::all_continuous() ~ "{mean} \\{sd}",
        gtsummary::all_categorical() ~ "{n} ({p}%)"
      ),
      digits = gtsummary::all_continuous() ~ 1
    ) %>%
    gtsummary::add_p(test = list(
      gtsummary::all_continuous() ~ "t.test",
      gtsummary::all_categorical() ~ "chisq.test"
    )) %>%
    gtsummary::bold_labels()
  suppressMessages(tabla_por_trat %>% gtsummary::as_gt() %>% gt::gtsave("outputs/tables/11_descriptivo_por_trat.html"))
}

## The following errors were returned during `add_p()`:
## ✖ For variable `segmento_afectado` (`tipo_tratamiento`) and "statistic", "p.value", and "parameter" statistics: 'x' and
##   'y' must have at least 2 levels
## The following warnings were returned during `add_p()`:
## ! For variable `clas_ao_of` (`tipo_tratamiento`) and "statistic", "p.value", and "parameter" statistics: Chi-squared
##   approximation may be incorrect

tabla_por_trat

Characteristic	Conservador N = 132¹	Quirúrgico N = 42¹	p-value²
edad	65.6 \10.8	61.2 \8.2	0.007
t_score	-2.9 \0.5	-3.1 \0.5	0.025
grupo_edad			0.015
50–59	35 (27%)	18 (43%)
60–69	59 (45%)	17 (40%)
70–79	16 (12%)	7 (17%)
≥80	22 (17%)	0 (0%)
sexo			0.3
F	97 (73%)	27 (64%)
M	35 (27%)	15 (36%)
segmento_afectado
TORÁCICA	132 (100%)	42 (100%)
clas_ao_of			<0.001
OF1	38 (29%)	0 (0%)
OF2	67 (51%)	1 (2.4%)
OF3	22 (17%)	12 (29%)
OF4	5 (3.8%)	21 (50%)
OF5	0 (0%)	8 (19%)
¹ Mean \SD; n (%)
² Welch Two Sample t-test; Pearson’s Chi-squared test

— 8) Pruebas específicas según protocolo ——————

Chi^2 / Fisher entre tratamiento y variables categóricas clave

cat_tests <- list()
cat_vars  <- c("sexo","segmento_afectado","clas_ao_of")
for (v in intersect(cat_vars, names(x))) {
  df <- x %>% filter(!is.na(.data[[v]]), !is.na(tipo_tratamiento))
  if (n_distinct(df[[v]]) >= 2 && n_distinct(df$tipo_tratamiento) >= 2 && nrow(df) > 0) {
    tab <- table(df[[v]], df$tipo_tratamiento)
    if (all(tab >= 5)) {
      tt <- suppressWarnings(broom::tidy(chisq.test(tab)))
      tt$test <- "chisq"
    } else {
      ft <- fisher.test(tab)
      tt <- tibble::tibble(statistic = unname(ft$estimate %||% NA_real_), p.value = ft$p.value, method = "Fisher")
      tt$test <- "fisher"
    }
    tt$variable <- v
    cat_tests[[v]] <- tt
  }
}
if (length(cat_tests)) readr::write_csv(bind_rows(cat_tests), "outputs/tables/12_tests_categoricas_vs_trat.csv")

# t de Student para edad y t_score vs tratamiento
cont_tests <- list()
for (v in c("edad","t_score")) {
  if (v %in% names(x)) {
    df <- x %>% filter(!is.na(.data[[v]]), !is.na(tipo_tratamiento))
    if (n_distinct(df$tipo_tratamiento) >= 2 && nrow(df) > 2) {
      tt <- broom::tidy(t.test(df[[v]] ~ df$tipo_tratamiento))
      tt$variable <- v
      cont_tests[[v]] <- tt
    }
  }
}
if (length(cont_tests)) readr::write_csv(bind_rows(cont_tests), "outputs/tables/13_ttests_vs_trat.csv")

cont_tests

## $edad
## # A tibble: 1 × 11
##   estimate estimate1 estimate2 statistic p.value parameter conf.low conf.high method                alternative variable
##      <dbl>     <dbl>     <dbl>     <dbl>   <dbl>     <dbl>    <dbl>     <dbl> <chr>                 <chr>       <chr>   
## 1     4.36      65.6      61.2      2.78 0.00665      90.7     1.24      7.48 Welch Two Sample t-t… two.sided   edad    
## 
## $t_score
## # A tibble: 1 × 11
##   estimate estimate1 estimate2 statistic p.value parameter conf.low conf.high method                alternative variable
##      <dbl>     <dbl>     <dbl>     <dbl>   <dbl>     <dbl>    <dbl>     <dbl> <chr>                 <chr>       <chr>   
## 1    0.189     -2.88     -3.07      2.29  0.0251      68.4   0.0243     0.353 Welch Two Sample t-t… two.sided   t_score

# OR (modelo logístico) para probabilidad de tratamiento quirúrgico vs sexo (si procede)
{
  df_or <- x %>% filter(!is.na(sexo), !is.na(tipo_tratamiento)) %>% mutate(
    quir = if_else(tipo_tratamiento == "Quirúrgico", 1, 0)
  )
  if (n_distinct(df_or$quir) == 2 && n_distinct(df_or$sexo) == 2) {
    fit <- glm(quir ~ sexo, data = df_or, family = binomial())
    or_tbl <- broom::tidy(fit, conf.int = TRUE, exponentiate = TRUE)
    readr::write_csv(or_tbl, "outputs/tables/14_or_quirurgico_vs_sexo.csv")
  }
}
or_tbl

## # A tibble: 2 × 7
##   term        estimate std.error statistic       p.value conf.low conf.high
##   <chr>          <dbl>     <dbl>     <dbl>         <dbl>    <dbl>     <dbl>
## 1 (Intercept)    0.278     0.218     -5.88 0.00000000417    0.178     0.420
## 2 sexoM          1.54      0.378      1.14 0.253            0.724     3.21

— 9) Figuras simples con p‑valor ————————–

Distribución por tratamiento

p_trat <- x %>% filter(!is.na(tipo_tratamiento)) %>% count(tipo_tratamiento) %>%
  ggplot(aes(x = tipo_tratamiento, y = n)) + geom_col() + theme_minimal() +
  labs(x = "Tipo de tratamiento", y = "Frecuencia")
try(ggsave("outputs/figures/01_tratamiento_bar.png", p_trat, width = 6, height = 4, dpi = 300), silent = TRUE)
p_trat

# Boxplots por tratamiento con p de t.test (edad, t_score)
{
  if ("edad" %in% names(x)) {
    p_age <- ggpubr::ggboxplot(x, x = "tipo_tratamiento", y = "edad", fill = "tipo_tratamiento") +
      ggpubr::stat_compare_means(method = "t.test", label = "p.format") +
      labs(x = "Tipo de tratamiento", y = "Edad (años)")
    try(ggsave("outputs/figures/02_edad_por_trat.png", p_age, width = 6, height = 4, dpi = 300), silent = TRUE)
  }
  if ("t_score" %in% names(x)) {
    p_ts <- ggpubr::ggboxplot(x, x = "tipo_tratamiento", y = "t_score", fill = "tipo_tratamiento") +
      ggpubr::stat_compare_means(method = "t.test", label = "p.format") +
      labs(x = "Tipo de tratamiento", y = "T-score")
    try(ggsave("outputs/figures/03_tscore_por_trat.png", p_ts, width = 6, height = 4, dpi = 300), silent = TRUE)
  }
}
p_age

# Barras: AO/OF por tratamiento con p (Chi^2/Fisher)
if (all(c("clas_ao_of","tipo_tratamiento") %in% names(x))) {
  dfp <- x %>% filter(!is.na(clas_ao_of), !is.na(tipo_tratamiento))
  if (nrow(dfp) > 0 && n_distinct(dfp$tipo_tratamiento) >= 2) {
    tab <- table(dfp$clas_ao_of, dfp$tipo_tratamiento)
    if (all(tab >= 5)) {
      pval <- suppressWarnings(chisq.test(tab)$p.value)
      subt <- paste0("Chi^2: p=", signif(pval, 3))
    } else {
      pval <- fisher.test(tab)$p.value
      subt <- paste0("Fisher: p=", signif(pval, 3))
    }
    p_of <- dfp %>% count(clas_ao_of, tipo_tratamiento) %>%
      ggplot(aes(x = clas_ao_of, y = n, fill = tipo_tratamiento)) + geom_col(position = position_dodge()) +
      theme_minimal() + labs(x = "AO/OF", y = "Casos", subtitle = subt)
    try(ggsave("outputs/figures/04_ao_of_por_trat.png", p_of, width = 7, height = 4, dpi = 300), silent = TRUE)
  }
}
p_of

Proyecto Dr. Contreras

MDatos

2025-08-13

============================================================

Variables:

Dependiente: tipo de tratamiento (derivado de las 2 últimas columnas)

Estadística: proporciones/frecuencias (categ.), media/DE o mediana/RIQ (num.),

X^2/Fisher para categóricas; t de Student para numéricas entre grupos.

OR por regresión logística cuando aplique.

============================================================

— 0) Paquetes para el proyecto ———————————

— 1) Importación de la base de datos —————————————–

— 2) Identificar variables clave ————————–

Heurística: buscar columnas de tratamiento por nombre; si no, tomar las 2 últimas.

— 3) Limpieza/armonización mínima ————————-

— 4) Dependiente: tipo_tratamiento ————————

Normalizar binarios en las 2 columnas detectadas (SI/NO, 1/0, TRUE/FALSE)

— 5) Derivadas para análisis ——————————

— 6) Control de calidad ———————————-

Duplicados por id

— 8) Pruebas específicas según protocolo ——————

Chi^2 / Fisher entre tratamiento y variables categóricas clave

— 9) Figuras simples con p‑valor ————————–

Distribución por tratamiento