## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the training set from CSV into `df_train`
df_train <- read_csv(
  "C:/Users/Jao/Documents/Doc/Cursos/Maestria Efectividad Clinica/2/4 Logistica/TH/train_TH.csv"
)
## Rows: 824 Columns: 36
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (36): Died, age, gender, hypertension, a_fib, CHD, diabetes, anemia, dep...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Normalización de datos

# Recode `gender` as a binary factor (1 stays 1, every other value becomes 0)
# and convert the outcome and the comorbidity indicators to factors.
# `gender` is read as a numeric column, so it is compared against the number 1
# rather than the string "1" to avoid relying on implicit coercion.
df_train <- df_train %>%
  mutate(
    gender = as.factor(ifelse(gender == 1, 1, 0)),
    across(
      c(
        Died, hypertension, a_fib, CHD, diabetes, anemia, depression,
        Hyperlipemia, CKD, COPD
      ),
      as.factor
    )
  )

# Lower-case all column names
names(df_train) <- tolower(names(df_train))

# Replace every space in the column names of `df` with an underscore.
#
# df: a data frame (or any named object).
# Returns `df` with the rewritten names; other characters are left untouched.
renombrar_columnas <- function(df) {
  nombres_limpios <- gsub(" ", "_", names(df))
  stats::setNames(df, nombres_limpios)
}

# Apply the renaming helper to the training data
df_train <- df_train %>%
  renombrar_columnas()

Evaluar NA

# Count the missing values across the whole data frame and report the total
total_na <- df_train %>%
  is.na() %>%
  sum()
cat("Total de valores NA en el dataframe:", total_na, "\n\n")
## Total de valores NA en el dataframe: 0
# Number of missing values in each column
na_por_variable <- df_train %>%
  is.na() %>%
  colSums()
print(na_por_variable)
##                     died                      age                   gender 
##                        0                        0                        0 
##             hypertension                    a_fib                      chd 
##                        0                        0                        0 
##                 diabetes                   anemia               depression 
##                        0                        0                        0 
##             hyperlipemia                      ckd                     copd 
##                        0                        0                        0 
##               heart_rate  systolic_blood_pressure diastolic_blood_pressure 
##                        0                        0                        0 
##         respiratory_rate              temperature                    sp_o2 
##                        0                        0                        0 
##             urine_output               hematocrit                leucocyte 
##                        0                        0                        0 
##                platelets                      inr                nt-probnp 
##                        0                        0                        0 
##               creatinine            urea_nitrogen                  glucose 
##                        0                        0                        0 
##          blood_potassium             blood_sodium            blood_calcium 
##                        0                        0                        0 
##                 chloride            magnesium_ion                       ph 
##                        0                        0                        0 
##              bicarbonate              lactic_acid                     pco2 
##                        0                        0                        0

Tabla 1

# Table 1: every variable summarised by outcome group (`died`).
# The stratifying variable is excluded from `vars`; listing it as a row
# variable only produces a degenerate "died = 1 (%)" row (0% vs 100%).
tabla_1 <- CreateTableOne(
  vars = setdiff(colnames(df_train), "died"),
  strata = "died",
  data = df_train
)
# kableone(tabla_1)

# Capture the formatted table as a matrix without printing it, then convert
# it to a data frame so it can be sorted.
tabla_1p <- print(tabla_1, printToggle = FALSE)
tabla_1p_df <- as.data.frame(tabla_1p)

# Sort rows by p-value (ascending).
tabla_1p_ordenada <- tabla_1p_df %>%
  mutate(
    # "<0.001" is not numeric; map it to a value just below 0.001 so these
    # rows sort ahead of rows reported as exactly 0.001.
    p_num = gsub("<0\\.001", "0.0009", p),
    # Cells that are not numbers (e.g. the `n` row) become NA and sort last.
    p_num = suppressWarnings(as.numeric(p_num))
  ) %>%
  arrange(p_num)

# Render the sorted table
knitr::kable(tabla_1p_ordenada, caption = "Tabla 1: Ordenada por valor p")
Tabla 1: Ordenada por valor p
0 1 p test p_num
died = 1 (%) 0 ( 0.0) 108 (100.0) <0.001 0.0009
systolic_blood_pressure (mean (SD)) 119.21 (17.05) 110.39 (16.75) <0.001 0.0009
temperature (mean (SD)) 36.70 (0.57) 36.48 (0.73) <0.001 0.0009
urine_output (mean (SD)) 1999.65 (1271.35) 1251.19 (1042.46) <0.001 0.0009
leucocyte (mean (SD)) 10.37 (4.52) 13.84 (7.96) <0.001 0.0009
inr (mean (SD)) 1.58 (0.75) 2.02 (1.40) <0.001 0.0009
nt-probnp (mean (SD)) 10814.14 (12818.97) 15554.25 (14867.09) <0.001 0.0009
urea_nitrogen (mean (SD)) 34.83 (20.81) 48.71 (28.71) <0.001 0.0009
blood_potassium (mean (SD)) 4.15 (0.38) 4.34 (0.58) <0.001 0.0009
blood_calcium (mean (SD)) 8.54 (0.55) 8.23 (0.63) <0.001 0.0009
ph (mean (SD)) 7.38 (0.06) 7.36 (0.08) <0.001 0.0009
bicarbonate (mean (SD)) 27.41 (5.01) 23.48 (5.32) <0.001 0.0009
lactic_acid (mean (SD)) 1.71 (0.75) 2.40 (1.36) <0.001 0.0009
anemia = 1 (%) 266 (37.2) 22 ( 20.4) 0.001 0.0010
ckd = 1 (%) 276 (38.5) 24 ( 22.2) 0.001 0.0010
platelets (mean (SD)) 248.72 (108.03) 214.03 (133.44) 0.003 0.0030
a_fib = 1 (%) 313 (43.7) 64 ( 59.3) 0.004 0.0040
magnesium_ion (mean (SD)) 2.10 (0.24) 2.18 (0.31) 0.004 0.0040
respiratory_rate (mean (SD)) 20.51 (3.93) 21.68 (4.27) 0.005 0.0050
heart_rate (mean (SD)) 83.80 (15.83) 88.24 (14.68) 0.006 0.0060
chloride (mean (SD)) 102.12 (5.28) 103.36 (6.04) 0.026 0.0260
hypertension = 1 (%) 521 (72.8) 67 ( 62.0) 0.029 0.0290
diastolic_blood_pressure (mean (SD)) 60.00 (10.84) 57.67 (9.21) 0.034 0.0340
copd = 1 (%) 64 ( 8.9) 3 ( 2.8) 0.046 0.0460
creatinine (mean (SD)) 1.60 (1.21) 1.84 (1.20) 0.046 0.0460
blood_sodium (mean (SD)) 139.02 (3.97) 138.18 (5.35) 0.052 0.0520
pco2 (mean (SD)) 45.49 (11.54) 43.16 (12.22) 0.053 0.0530
sp_o2 (mean (SD)) 96.35 (2.14) 95.90 (3.01) 0.054 0.0540
age (mean (SD)) 73.76 (13.01) 75.98 (14.01) 0.102 0.1020
depression = 1 (%) 90 (12.6) 8 ( 7.4) 0.166 0.1660
gender = 1 (%) 334 (46.6) 56 ( 51.9) 0.365 0.3650
diabetes = 1 (%) 310 (43.3) 42 ( 38.9) 0.448 0.4480
glucose (mean (SD)) 148.52 (50.57) 151.15 (52.98) 0.617 0.6170
hyperlipemia = 1 (%) 278 (38.8) 39 ( 36.1) 0.664 0.6640
hematocrit (mean (SD)) 31.91 (5.11) 31.97 (5.43) 0.913 0.9130
chd = 1 (%) 66 ( 9.2) 10 ( 9.3) 1.000 1.0000
n 716 108 NA

Análisis bivariado

library(dplyr)
library(broom) # Para tidy() que facilita extraer resultados de modelos

# Univariable screening: fit one logistic model `died ~ <predictor>` per
# column and keep the likelihood-ratio-test p-value of that predictor.

# Empty template so the result keeps its columns even with no predictors
p_values_df <- data.frame(
  Variable = character(),
  P_Value = numeric(),
  stringsAsFactors = FALSE
)

# Predictors: every column except the outcome (selected by name, not position)
variables <- setdiff(names(df_train), "died")

resultados <- lapply(variables, function(var) {
  tryCatch({
    # Back-quote the name so non-syntactic columns such as `nt-probnp` are
    # read as one variable instead of the expression `nt - probnp`.
    formula_uni <- as.formula(paste0("died ~ `", var, "`"))
    mod <- glm(formula_uni, data = df_train, family = "binomial")
    anova_result <- anova(mod, test = "LRT")

    data.frame(
      Variable = var,
      P_Value = anova_result$`Pr(>Chi)`[2],
      stringsAsFactors = FALSE
    )
  }, error = function(e) {
    # Keep the variable in the output with NA, but report why the fit failed
    # instead of hiding the error.
    warning(
      "No se pudo ajustar el modelo para '", var, "': ", conditionMessage(e),
      call. = FALSE
    )
    data.frame(Variable = var, P_Value = NA_real_, stringsAsFactors = FALSE)
  })
})

# Bind all per-variable rows at once instead of growing the frame in a loop
p_values_df <- bind_rows(p_values_df, resultados)

# Sort by p-value in ascending order (most significant first; NA last).
# scipen = 999 makes the p-values print in fixed rather than scientific
# notation; note this changes a session-wide option.
options(scipen = 999)
p_values_df_sorted <- p_values_df %>%
  arrange(P_Value)

print(p_values_df_sorted)
##                    Variable                P_Value
## 1               bicarbonate 0.00000000000001085484
## 2               lactic_acid 0.00000000001320984581
## 3              urine_output 0.00000000006626346501
## 4                 leucocyte 0.00000000585487299115
## 5             urea_nitrogen 0.00000002749205662082
## 6             blood_calcium 0.00000006304410498066
## 7   systolic_blood_pressure 0.00000017618452785106
## 8                       inr 0.00001124996104129712
## 9           blood_potassium 0.00002115186624546195
## 10                       ph 0.00003010890124866368
## 11                   anemia 0.00039788582968445168
## 12              temperature 0.00049234296151003001
## 13                      ckd 0.00068233347860687725
## 14                platelets 0.00141173179870201062
## 15                    a_fib 0.00253629680344145871
## 16         respiratory_rate 0.00537632999659311793
## 17            magnesium_ion 0.00591681699371985562
## 18               heart_rate 0.00664821532893986912
## 19                     copd 0.01385596144003607526
## 20             hypertension 0.02467889263042962258
## 21                 chloride 0.02494201485176068822
## 22 diastolic_blood_pressure 0.03128397193300556239
## 23                     pco2 0.04509818951166879447
## 24             blood_sodium 0.05717305259412443219
## 25                    sp_o2 0.06221886993549445577
## 26               creatinine 0.06222099522072049166
## 27                      age 0.09523123260360358278
## 28               depression 0.10264561218978675294
## 29                   gender 0.31303158113340368018
## 30                 diabetes 0.38633599555904540868
## 31             hyperlipemia 0.58740343737471800711
## 32                  glucose 0.61999139280857096246
## 33               hematocrit 0.91311483535898174679
## 34                      chd 0.98895161420733002089
## 35                nt-probnp                     NA

Analizar correlación

# Spearman correlation matrix of the numeric columns.
# vapply() always returns a logical vector, whereas sapply() changes its
# return type on empty input.
col_num <- vapply(df_train, is.numeric, logical(1))
df_num <- df_train[, col_num]
matriz_corr <- cor(df_num, method = "spearman")

library(formattable)
# Round the correlation matrix to three decimals for readability, convert it
# to a data frame and render it with knitr::kable.
matriz_corr_df <- matriz_corr %>%
  round(3) %>%
  as.data.frame()
knitr::kable(matriz_corr_df)
age heart_rate systolic_blood_pressure diastolic_blood_pressure respiratory_rate temperature sp_o2 urine_output hematocrit leucocyte platelets inr nt-probnp creatinine urea_nitrogen glucose blood_potassium blood_sodium blood_calcium chloride magnesium_ion ph bicarbonate lactic_acid pco2
age 1.000 -0.186 -0.056 -0.319 0.041 -0.174 0.054 -0.179 0.058 0.040 -0.016 0.070 0.120 0.019 0.107 -0.099 -0.071 0.105 -0.026 0.149 0.075 0.093 -0.071 0.080 -0.132
heart_rate -0.186 1.000 -0.155 0.337 0.322 0.140 -0.110 -0.023 -0.006 0.145 0.092 0.039 -0.013 -0.228 -0.187 0.053 -0.037 -0.072 -0.092 0.031 -0.077 0.008 -0.078 0.163 -0.113
systolic_blood_pressure -0.056 -0.155 1.000 0.333 -0.074 0.095 -0.050 0.249 0.018 -0.085 0.083 -0.273 -0.151 0.005 -0.030 0.087 -0.026 0.116 0.132 -0.012 -0.042 0.074 0.128 -0.168 0.113
diastolic_blood_pressure -0.319 0.337 0.333 1.000 0.128 0.011 -0.106 0.186 0.281 -0.053 0.075 -0.043 -0.050 -0.094 -0.121 0.097 -0.008 0.019 0.130 -0.065 -0.024 0.088 0.066 0.077 -0.021
respiratory_rate 0.041 0.322 -0.074 0.128 1.000 0.093 -0.240 0.041 0.110 0.150 0.110 0.049 0.039 -0.088 -0.021 0.087 -0.031 -0.024 0.028 -0.068 0.002 0.057 0.000 0.133 -0.049
temperature -0.174 0.140 0.095 0.011 0.093 1.000 0.030 0.171 -0.095 0.018 -0.003 -0.072 -0.188 -0.133 -0.202 -0.057 -0.182 0.075 -0.116 0.106 -0.099 0.129 0.021 -0.066 -0.014
sp_o2 0.054 -0.110 -0.050 -0.106 -0.240 0.030 1.000 -0.080 -0.243 -0.013 -0.086 0.060 0.181 0.076 0.027 -0.057 -0.068 -0.022 -0.146 0.162 -0.046 0.017 -0.187 0.065 -0.167
urine_output -0.179 -0.023 0.249 0.186 0.041 0.171 -0.080 1.000 0.120 -0.144 0.039 -0.095 -0.268 -0.149 -0.194 0.027 -0.113 0.012 0.173 -0.124 -0.058 0.169 0.237 -0.134 0.097
hematocrit 0.058 -0.006 0.018 0.281 0.110 -0.095 -0.243 0.120 1.000 0.070 -0.047 -0.024 -0.189 -0.181 -0.138 0.038 0.022 0.089 0.283 -0.125 0.098 0.013 0.195 0.127 0.099
leucocyte 0.040 0.145 -0.085 -0.053 0.150 0.018 -0.013 -0.144 0.070 1.000 0.269 0.084 0.095 -0.008 0.079 0.119 0.001 -0.052 -0.086 0.020 0.088 -0.037 -0.151 0.261 -0.116
platelets -0.016 0.092 0.083 0.075 0.110 -0.003 -0.086 0.039 -0.047 0.269 1.000 -0.013 -0.016 -0.182 -0.176 0.065 -0.001 -0.064 0.112 -0.140 -0.015 0.114 0.132 -0.045 0.022
inr 0.070 0.039 -0.273 -0.043 0.049 -0.072 0.060 -0.095 -0.024 0.084 -0.013 1.000 0.190 0.086 0.071 -0.085 -0.009 -0.064 -0.080 0.010 0.051 0.100 -0.098 0.203 -0.167
nt-probnp 0.120 -0.013 -0.151 -0.050 0.039 -0.188 0.181 -0.268 -0.189 0.095 -0.016 0.190 1.000 0.442 0.440 -0.024 0.096 -0.040 -0.107 0.050 0.093 -0.065 -0.288 0.107 -0.210
creatinine 0.019 -0.228 0.005 -0.094 -0.088 -0.133 0.076 -0.149 -0.181 -0.008 -0.182 0.086 0.442 1.000 0.781 0.064 0.322 -0.048 0.040 -0.003 0.255 -0.230 -0.320 0.003 -0.099
urea_nitrogen 0.107 -0.187 -0.030 -0.121 -0.021 -0.202 0.027 -0.194 -0.138 0.079 -0.176 0.071 0.440 0.781 1.000 0.150 0.370 0.032 0.078 0.017 0.382 -0.207 -0.238 0.015 -0.055
glucose -0.099 0.053 0.087 0.097 0.087 -0.057 -0.057 0.027 0.038 0.119 0.065 -0.085 -0.024 0.064 0.150 1.000 0.110 -0.005 0.085 -0.037 0.106 -0.025 0.000 0.193 -0.014
blood_potassium -0.071 -0.037 -0.026 -0.008 -0.031 -0.182 -0.068 -0.113 0.022 0.001 -0.001 -0.009 0.096 0.322 0.370 0.110 1.000 -0.216 0.110 -0.119 0.187 -0.393 -0.109 0.020 0.098
blood_sodium 0.105 -0.072 0.116 0.019 -0.024 0.075 -0.022 0.012 0.089 -0.052 -0.064 -0.064 -0.040 -0.048 0.032 -0.005 -0.216 1.000 0.070 0.566 0.095 -0.044 0.191 -0.151 0.202
blood_calcium -0.026 -0.092 0.132 0.130 0.028 -0.116 -0.146 0.173 0.283 -0.086 0.112 -0.080 -0.107 0.040 0.078 0.085 0.110 0.070 1.000 -0.254 0.156 0.067 0.288 -0.071 0.179
chloride 0.149 0.031 -0.012 -0.065 -0.068 0.106 0.162 -0.124 -0.125 0.020 -0.140 0.010 0.050 -0.003 0.017 -0.037 -0.119 0.566 -0.254 1.000 -0.031 -0.096 -0.533 0.037 -0.359
magnesium_ion 0.075 -0.077 -0.042 -0.024 0.002 -0.099 -0.046 -0.058 0.098 0.088 -0.015 0.051 0.093 0.255 0.382 0.106 0.187 0.095 0.156 -0.031 1.000 -0.026 0.008 0.030 0.028
ph 0.093 0.008 0.074 0.088 0.057 0.129 0.017 0.169 0.013 -0.037 0.114 0.100 -0.065 -0.230 -0.207 -0.025 -0.393 -0.044 0.067 -0.096 -0.026 1.000 0.131 0.064 -0.409
bicarbonate -0.071 -0.078 0.128 0.066 0.000 0.021 -0.187 0.237 0.195 -0.151 0.132 -0.098 -0.288 -0.320 -0.238 0.000 -0.109 0.191 0.288 -0.533 0.008 0.131 1.000 -0.281 0.704
lactic_acid 0.080 0.163 -0.168 0.077 0.133 -0.066 0.065 -0.134 0.127 0.261 -0.045 0.203 0.107 0.003 0.015 0.193 0.020 -0.151 -0.071 0.037 0.030 0.064 -0.281 1.000 -0.332
pco2 -0.132 -0.113 0.113 -0.021 -0.049 -0.014 -0.167 0.097 0.099 -0.116 0.022 -0.167 -0.210 -0.099 -0.055 -0.014 0.098 0.202 0.179 -0.359 0.028 -0.409 0.704 -0.332 1.000
# Scatter plot of lactic acid (y) against bicarbonate (x) with a linear fit
library(ggplot2)

ggplot(df_train, aes(x = bicarbonate, y = lactic_acid)) +
  geom_point(alpha = 0.6) + # scatter points
  geom_smooth(method = "lm", se = TRUE, color = "blue") + # linear fit with CI
  theme_minimal() +
  labs(
    title = "Correlación entre ac lactico y bicarbonato",
    # Labels now match the mapped aesthetics (they were swapped before)
    x = "Bicarbonato",
    y = "Ac Lactico"
  )
## `geom_smooth()` using formula = 'y ~ x'

# Spearman correlation between bicarbonate and lactic acid (complete pairs only)
with(
  df_train,
  cor(bicarbonate, lactic_acid, method = "spearman", use = "complete.obs")
)
## [1] -0.2812888

Evaluar la linealidad entre las variables continuas y el logit de died

# Bicarbonate: observed proportion of deaths per quintile
df_cont <- df_train %>%
  mutate(bicarbonate_q = ntile(bicarbonate, 5))

df_cont %>%
  group_by(bicarbonate_q) %>%
  # `died` is a factor with levels "0"/"1"; as.numeric() on it returns the
  # level index (1/2), so `as.numeric(died) == 1` would count survivors.
  # Compare against the level label instead.
  summarise(tar = mean(died == "1")) %>%
  ggplot(aes(x = bicarbonate_q, y = tar)) +
  geom_point() +
  geom_smooth(method = "lm", color = "blue")
## `geom_smooth()` using formula = 'y ~ x'

# Urine output: observed proportion of deaths per quintile
df_cont <- df_train %>%
  mutate(urine_output_q = ntile(urine_output, 5))

df_cont %>%
  group_by(urine_output_q) %>%
  # `died` is a factor with levels "0"/"1"; as.numeric() on it returns the
  # level index (1/2), so `as.numeric(died) == 1` would count survivors.
  # Compare against the level label instead.
  summarise(tar = mean(died == "1")) %>%
  ggplot(aes(x = urine_output_q, y = tar)) +
  geom_point() +
  geom_smooth(method = "lm", color = "blue")
## `geom_smooth()` using formula = 'y ~ x'

# Leucocytes: observed proportion of deaths per quintile
df_cont <- df_train %>%
  mutate(leucocyte_q = ntile(leucocyte, 5))

df_cont %>%
  group_by(leucocyte_q) %>%
  # `died` is a factor with levels "0"/"1"; as.numeric() on it returns the
  # level index (1/2), so `as.numeric(died) == 1` would count survivors.
  # Compare against the level label instead.
  summarise(tar = mean(died == "1")) %>%
  ggplot(aes(x = leucocyte_q, y = tar)) +
  geom_point() +
  geom_smooth(method = "lm", color = "blue")
## `geom_smooth()` using formula = 'y ~ x'

library(dplyr)
library(ggplot2)

# Plot, for each numeric variable, the proportion of `outcome == 1` within
# equal-sized groups of that variable, with a linear fit over the group means.
#
# df:        data frame holding the outcome and the candidate variables.
# outcome:   name (string) of the binary outcome column, coded 0/1 either as
#            a number or as a factor whose labels are "0"/"1".
# variables: character vector of column names; non-numeric ones are skipped.
# n_grupos:  number of groups passed to ntile() (default 5).
#
# Prints one ggplot per numeric variable and returns NULL invisibly.
# Note: the y axis is the raw proportion, not the log-odds.
evaluar_linealidad_logit <- function(df, outcome, variables, n_grupos = 5) {
  # Keep only the numeric variables from the requested list
  es_numerica <- vapply(df[variables], is.numeric, logical(1))
  vars_num <- variables[es_numerica]

  # TRUE where the outcome equals 1. A factor is decoded through its labels
  # first: as.numeric() on a factor returns the level index (1, 2, ...), which
  # would make `== 1` select the first level ("0") instead of the event.
  es_evento <- function(x) {
    if (is.factor(x)) {
      x <- as.character(x)
    }
    as.numeric(x) == 1
  }

  for (var in vars_num) {
    col_grupo <- paste0(var, "_q")

    # Assign each row to one of `n_grupos` groups of the variable, then
    # summarise the event proportion and the variable's mean per group.
    resumen <- df %>%
      mutate(!!col_grupo := ntile(.data[[var]], n_grupos)) %>%
      group_by(.data[[col_grupo]]) %>%
      summarise(
        proporcion = mean(es_evento(.data[[outcome]]), na.rm = TRUE),
        valor_medio = mean(.data[[var]], na.rm = TRUE),
        .groups = "drop"
      )

    # Event proportion against the group mean, with a linear regression line
    p <- ggplot(resumen, aes(x = valor_medio, y = proporcion)) +
      geom_point() +
      geom_smooth(method = "lm", color = "blue") +
      labs(
        title = paste("Evaluación de linealidad para", var),
        x = paste("Valor medio de", var, "(por grupo)"),
        y = paste("Proporción de", outcome, "=1")
      ) +
      theme_minimal()

    print(p)
  }

  invisible(NULL)
}

# Return the names of all numeric columns of `df`.
#
# df: a data frame (or named list of columns).
# Returns a character vector (empty when there are no numeric columns).
# The names are taken from the `df` argument, not from the global `df_train`.
obtener_variables_numericas <- function(df) {
  es_numerica <- vapply(df, is.numeric, logical(1))
  names(df)[es_numerica]
}

# Collect the numeric column names and draw one linearity plot for each
var_numericas <- obtener_variables_numericas(df_train)
evaluar_linealidad_logit(
  df = df_train,
  outcome = "died",
  variables = var_numericas
)
## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'