Take Home

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# Load the training set from its CSV file (absolute local path).
df_train <- read_csv(
  file = "C:/Users/Jao/Documents/Doc/Cursos/Maestria Efectividad Clinica/2/4 Logistica/TH/train_TH.csv"
)

## Rows: 824 Columns: 36
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (36): Died, age, gender, hypertension, a_fib, CHD, diabetes, anemia, dep...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Normalización de datos

# Recode 'gender' as a two-level factor (1 where the value equals "1", 0 otherwise)
# and, in the same step, turn the outcome and the comorbidity flags into factors.
df_train <- df_train %>%
  mutate(
    gender = as.factor(ifelse(gender == "1", 1, 0)),
    across(
      c(Died, hypertension, a_fib, CHD, diabetes, anemia, depression, Hyperlipemia, CKD, COPD),
      as.factor
    )
  )

# Lower-case every column name
colnames(df_train) <- tolower(colnames(df_train))

# Return `df` with every space in its column names replaced by an underscore.
# Only the names change; the columns themselves are left untouched.
renombrar_columnas <- function(df) {
  nombres_limpios <- gsub(" ", "_", names(df), fixed = TRUE)
  setNames(df, nombres_limpios)
}

# Replace spaces in the column names of the training data
df_train <- df_train %>% renombrar_columnas()

Evaluar NA

# Count the missing values across the whole data frame and report the total
total_na <- df_train %>%
  is.na() %>%
  sum()
cat("Total de valores NA en el dataframe:", total_na, "\n\n")

## Total de valores NA en el dataframe: 0

# Number of missing values in each column
na_por_variable <- df_train %>%
  is.na() %>%
  colSums()
print(na_por_variable)

##                     died                      age                   gender 
##                        0                        0                        0 
##             hypertension                    a_fib                      chd 
##                        0                        0                        0 
##                 diabetes                   anemia               depression 
##                        0                        0                        0 
##             hyperlipemia                      ckd                     copd 
##                        0                        0                        0 
##               heart_rate  systolic_blood_pressure diastolic_blood_pressure 
##                        0                        0                        0 
##         respiratory_rate              temperature                    sp_o2 
##                        0                        0                        0 
##             urine_output               hematocrit                leucocyte 
##                        0                        0                        0 
##                platelets                      inr                nt-probnp 
##                        0                        0                        0 
##               creatinine            urea_nitrogen                  glucose 
##                        0                        0                        0 
##          blood_potassium             blood_sodium            blood_calcium 
##                        0                        0                        0 
##                 chloride            magnesium_ion                       ph 
##                        0                        0                        0 
##              bicarbonate              lactic_acid                     pco2 
##                        0                        0                        0

Tabla 1

# Descriptive table of every column, stratified by the outcome 'died'
tabla_1 <- CreateTableOne(data = df_train, vars = names(df_train), strata = "died")
# kableone(tabla_1)

# Capture the formatted table (without printing it) so it can be sorted by p-value
tabla_1p <- print(tabla_1, printToggle = FALSE)

# The captured table is a character matrix; turn it into a data frame
tabla_1p_df <- as.data.frame(tabla_1p)

# Add a numeric copy of the p column and sort on it. "<0.001" is mapped to
# 0.0009 so it converts to a number and sorts ahead of 0.001; entries that are
# not numbers become NA and end up last.
tabla_1p_ordenada <- tabla_1p_df %>%
  mutate(p_num = suppressWarnings(as.numeric(gsub("<0\\.001", "0.0009", p)))) %>%
  arrange(p_num)

# Render the sorted table
knitr::kable(tabla_1p_ordenada, caption = "Tabla 1: Ordenada por valor p")

Tabla 1: Ordenada por valor p
	0	1	p	p_num
died = 1 (%)	0 ( 0.0)	108 (100.0)	<0.001	0.0009
systolic_blood_pressure (mean (SD))	119.21 (17.05)	110.39 (16.75)	<0.001	0.0009
temperature (mean (SD))	36.70 (0.57)	36.48 (0.73)	<0.001	0.0009
urine_output (mean (SD))	1999.65 (1271.35)	1251.19 (1042.46)	<0.001	0.0009
leucocyte (mean (SD))	10.37 (4.52)	13.84 (7.96)	<0.001	0.0009
inr (mean (SD))	1.58 (0.75)	2.02 (1.40)	<0.001	0.0009
nt-probnp (mean (SD))	10814.14 (12818.97)	15554.25 (14867.09)	<0.001	0.0009
urea_nitrogen (mean (SD))	34.83 (20.81)	48.71 (28.71)	<0.001	0.0009
blood_potassium (mean (SD))	4.15 (0.38)	4.34 (0.58)	<0.001	0.0009
blood_calcium (mean (SD))	8.54 (0.55)	8.23 (0.63)	<0.001	0.0009
ph (mean (SD))	7.38 (0.06)	7.36 (0.08)	<0.001	0.0009
bicarbonate (mean (SD))	27.41 (5.01)	23.48 (5.32)	<0.001	0.0009
lactic_acid (mean (SD))	1.71 (0.75)	2.40 (1.36)	<0.001	0.0009
anemia = 1 (%)	266 (37.2)	22 ( 20.4)	0.001	0.0010
ckd = 1 (%)	276 (38.5)	24 ( 22.2)	0.001	0.0010
platelets (mean (SD))	248.72 (108.03)	214.03 (133.44)	0.003	0.0030
a_fib = 1 (%)	313 (43.7)	64 ( 59.3)	0.004	0.0040
magnesium_ion (mean (SD))	2.10 (0.24)	2.18 (0.31)	0.004	0.0040
respiratory_rate (mean (SD))	20.51 (3.93)	21.68 (4.27)	0.005	0.0050
heart_rate (mean (SD))	83.80 (15.83)	88.24 (14.68)	0.006	0.0060
chloride (mean (SD))	102.12 (5.28)	103.36 (6.04)	0.026	0.0260
hypertension = 1 (%)	521 (72.8)	67 ( 62.0)	0.029	0.0290
diastolic_blood_pressure (mean (SD))	60.00 (10.84)	57.67 (9.21)	0.034	0.0340
copd = 1 (%)	64 ( 8.9)	3 ( 2.8)	0.046	0.0460
creatinine (mean (SD))	1.60 (1.21)	1.84 (1.20)	0.046	0.0460
blood_sodium (mean (SD))	139.02 (3.97)	138.18 (5.35)	0.052	0.0520
pco2 (mean (SD))	45.49 (11.54)	43.16 (12.22)	0.053	0.0530
sp_o2 (mean (SD))	96.35 (2.14)	95.90 (3.01)	0.054	0.0540
age (mean (SD))	73.76 (13.01)	75.98 (14.01)	0.102	0.1020
depression = 1 (%)	90 (12.6)	8 ( 7.4)	0.166	0.1660
gender = 1 (%)	334 (46.6)	56 ( 51.9)	0.365	0.3650
diabetes = 1 (%)	310 (43.3)	42 ( 38.9)	0.448	0.4480
glucose (mean (SD))	148.52 (50.57)	151.15 (52.98)	0.617	0.6170
hyperlipemia = 1 (%)	278 (38.8)	39 ( 36.1)	0.664	0.6640
hematocrit (mean (SD))	31.91 (5.11)	31.97 (5.43)	0.913	0.9130
chd = 1 (%)	66 ( 9.2)	10 ( 9.3)	1.000	1.0000
n	716	108		NA

Análisis bivariado

library(dplyr)
library(broom) # Provides tidy() for extracting model results

# Candidate predictors: every column except the outcome
variables <- setdiff(names(df_train), "died")

# Fit one univariable logistic model per predictor and keep the p-value of its
# likelihood-ratio test. Rows are collected in a list and bound once instead of
# growing a data frame inside a loop.
p_values_df <- bind_rows(lapply(variables, function(var) {
  # Backtick-quote the name so that non-syntactic columns such as "nt-probnp"
  # are parsed as a single variable rather than the expression nt - probnp.
  formula_str <- paste0("died ~ `", var, "`")

  p_value <- tryCatch({
    mod <- glm(as.formula(formula_str), data = df_train, family = "binomial")
    anova_result <- anova(mod, test = "LRT")
    anova_result$`Pr(>Chi)`[2]
  }, error = function(e) {
    # Keep going with NA for this variable, but report why the fit failed
    warning(
      "No se pudo ajustar el modelo para '", var, "': ", conditionMessage(e),
      call. = FALSE
    )
    NA_real_
  })

  data.frame(Variable = var, P_Value = p_value, stringsAsFactors = FALSE)
}))

# Sort by p-value in ascending order (smallest first)
options(scipen = 999) # strongly favour fixed over scientific notation when printing
p_values_df_sorted <- p_values_df %>%
  arrange(P_Value)

print(p_values_df_sorted)

##                    Variable                P_Value
## 1               bicarbonate 0.00000000000001085484
## 2               lactic_acid 0.00000000001320984581
## 3              urine_output 0.00000000006626346501
## 4                 leucocyte 0.00000000585487299115
## 5             urea_nitrogen 0.00000002749205662082
## 6             blood_calcium 0.00000006304410498066
## 7   systolic_blood_pressure 0.00000017618452785106
## 8                       inr 0.00001124996104129712
## 9           blood_potassium 0.00002115186624546195
## 10                       ph 0.00003010890124866368
## 11                   anemia 0.00039788582968445168
## 12              temperature 0.00049234296151003001
## 13                      ckd 0.00068233347860687725
## 14                platelets 0.00141173179870201062
## 15                    a_fib 0.00253629680344145871
## 16         respiratory_rate 0.00537632999659311793
## 17            magnesium_ion 0.00591681699371985562
## 18               heart_rate 0.00664821532893986912
## 19                     copd 0.01385596144003607526
## 20             hypertension 0.02467889263042962258
## 21                 chloride 0.02494201485176068822
## 22 diastolic_blood_pressure 0.03128397193300556239
## 23                     pco2 0.04509818951166879447
## 24             blood_sodium 0.05717305259412443219
## 25                    sp_o2 0.06221886993549445577
## 26               creatinine 0.06222099522072049166
## 27                      age 0.09523123260360358278
## 28               depression 0.10264561218978675294
## 29                   gender 0.31303158113340368018
## 30                 diabetes 0.38633599555904540868
## 31             hyperlipemia 0.58740343737471800711
## 32                  glucose 0.61999139280857096246
## 33               hematocrit 0.91311483535898174679
## 34                      chd 0.98895161420733002089
## 35                nt-probnp                     NA

Analizar correlación

# Keep only the numeric columns and compute their Spearman correlation matrix
col_num <- vapply(df_train, is.numeric, logical(1))
df_num <- df_train[, col_num]
matriz_corr <- cor(df_num, method = "spearman")

library(formattable)
# Round to three decimals for readability and convert the matrix to a data
# frame before rendering it as a table.
matriz_corr_df <- matriz_corr %>%
  round(3) %>%
  as.data.frame()
knitr::kable(matriz_corr_df)

	age	heart_rate	systolic_blood_pressure	diastolic_blood_pressure	respiratory_rate	temperature	sp_o2	urine_output	hematocrit	leucocyte	platelets	inr	nt-probnp	creatinine	urea_nitrogen	glucose	blood_potassium	blood_sodium	blood_calcium	chloride	magnesium_ion	ph	bicarbonate	lactic_acid	pco2
age	1.000	-0.186	-0.056	-0.319	0.041	-0.174	0.054	-0.179	0.058	0.040	-0.016	0.070	0.120	0.019	0.107	-0.099	-0.071	0.105	-0.026	0.149	0.075	0.093	-0.071	0.080	-0.132
heart_rate	-0.186	1.000	-0.155	0.337	0.322	0.140	-0.110	-0.023	-0.006	0.145	0.092	0.039	-0.013	-0.228	-0.187	0.053	-0.037	-0.072	-0.092	0.031	-0.077	0.008	-0.078	0.163	-0.113
systolic_blood_pressure	-0.056	-0.155	1.000	0.333	-0.074	0.095	-0.050	0.249	0.018	-0.085	0.083	-0.273	-0.151	0.005	-0.030	0.087	-0.026	0.116	0.132	-0.012	-0.042	0.074	0.128	-0.168	0.113
diastolic_blood_pressure	-0.319	0.337	0.333	1.000	0.128	0.011	-0.106	0.186	0.281	-0.053	0.075	-0.043	-0.050	-0.094	-0.121	0.097	-0.008	0.019	0.130	-0.065	-0.024	0.088	0.066	0.077	-0.021
respiratory_rate	0.041	0.322	-0.074	0.128	1.000	0.093	-0.240	0.041	0.110	0.150	0.110	0.049	0.039	-0.088	-0.021	0.087	-0.031	-0.024	0.028	-0.068	0.002	0.057	0.000	0.133	-0.049
temperature	-0.174	0.140	0.095	0.011	0.093	1.000	0.030	0.171	-0.095	0.018	-0.003	-0.072	-0.188	-0.133	-0.202	-0.057	-0.182	0.075	-0.116	0.106	-0.099	0.129	0.021	-0.066	-0.014
sp_o2	0.054	-0.110	-0.050	-0.106	-0.240	0.030	1.000	-0.080	-0.243	-0.013	-0.086	0.060	0.181	0.076	0.027	-0.057	-0.068	-0.022	-0.146	0.162	-0.046	0.017	-0.187	0.065	-0.167
urine_output	-0.179	-0.023	0.249	0.186	0.041	0.171	-0.080	1.000	0.120	-0.144	0.039	-0.095	-0.268	-0.149	-0.194	0.027	-0.113	0.012	0.173	-0.124	-0.058	0.169	0.237	-0.134	0.097
hematocrit	0.058	-0.006	0.018	0.281	0.110	-0.095	-0.243	0.120	1.000	0.070	-0.047	-0.024	-0.189	-0.181	-0.138	0.038	0.022	0.089	0.283	-0.125	0.098	0.013	0.195	0.127	0.099
leucocyte	0.040	0.145	-0.085	-0.053	0.150	0.018	-0.013	-0.144	0.070	1.000	0.269	0.084	0.095	-0.008	0.079	0.119	0.001	-0.052	-0.086	0.020	0.088	-0.037	-0.151	0.261	-0.116
platelets	-0.016	0.092	0.083	0.075	0.110	-0.003	-0.086	0.039	-0.047	0.269	1.000	-0.013	-0.016	-0.182	-0.176	0.065	-0.001	-0.064	0.112	-0.140	-0.015	0.114	0.132	-0.045	0.022
inr	0.070	0.039	-0.273	-0.043	0.049	-0.072	0.060	-0.095	-0.024	0.084	-0.013	1.000	0.190	0.086	0.071	-0.085	-0.009	-0.064	-0.080	0.010	0.051	0.100	-0.098	0.203	-0.167
nt-probnp	0.120	-0.013	-0.151	-0.050	0.039	-0.188	0.181	-0.268	-0.189	0.095	-0.016	0.190	1.000	0.442	0.440	-0.024	0.096	-0.040	-0.107	0.050	0.093	-0.065	-0.288	0.107	-0.210
creatinine	0.019	-0.228	0.005	-0.094	-0.088	-0.133	0.076	-0.149	-0.181	-0.008	-0.182	0.086	0.442	1.000	0.781	0.064	0.322	-0.048	0.040	-0.003	0.255	-0.230	-0.320	0.003	-0.099
urea_nitrogen	0.107	-0.187	-0.030	-0.121	-0.021	-0.202	0.027	-0.194	-0.138	0.079	-0.176	0.071	0.440	0.781	1.000	0.150	0.370	0.032	0.078	0.017	0.382	-0.207	-0.238	0.015	-0.055
glucose	-0.099	0.053	0.087	0.097	0.087	-0.057	-0.057	0.027	0.038	0.119	0.065	-0.085	-0.024	0.064	0.150	1.000	0.110	-0.005	0.085	-0.037	0.106	-0.025	0.000	0.193	-0.014
blood_potassium	-0.071	-0.037	-0.026	-0.008	-0.031	-0.182	-0.068	-0.113	0.022	0.001	-0.001	-0.009	0.096	0.322	0.370	0.110	1.000	-0.216	0.110	-0.119	0.187	-0.393	-0.109	0.020	0.098
blood_sodium	0.105	-0.072	0.116	0.019	-0.024	0.075	-0.022	0.012	0.089	-0.052	-0.064	-0.064	-0.040	-0.048	0.032	-0.005	-0.216	1.000	0.070	0.566	0.095	-0.044	0.191	-0.151	0.202
blood_calcium	-0.026	-0.092	0.132	0.130	0.028	-0.116	-0.146	0.173	0.283	-0.086	0.112	-0.080	-0.107	0.040	0.078	0.085	0.110	0.070	1.000	-0.254	0.156	0.067	0.288	-0.071	0.179
chloride	0.149	0.031	-0.012	-0.065	-0.068	0.106	0.162	-0.124	-0.125	0.020	-0.140	0.010	0.050	-0.003	0.017	-0.037	-0.119	0.566	-0.254	1.000	-0.031	-0.096	-0.533	0.037	-0.359
magnesium_ion	0.075	-0.077	-0.042	-0.024	0.002	-0.099	-0.046	-0.058	0.098	0.088	-0.015	0.051	0.093	0.255	0.382	0.106	0.187	0.095	0.156	-0.031	1.000	-0.026	0.008	0.030	0.028
ph	0.093	0.008	0.074	0.088	0.057	0.129	0.017	0.169	0.013	-0.037	0.114	0.100	-0.065	-0.230	-0.207	-0.025	-0.393	-0.044	0.067	-0.096	-0.026	1.000	0.131	0.064	-0.409
bicarbonate	-0.071	-0.078	0.128	0.066	0.000	0.021	-0.187	0.237	0.195	-0.151	0.132	-0.098	-0.288	-0.320	-0.238	0.000	-0.109	0.191	0.288	-0.533	0.008	0.131	1.000	-0.281	0.704
lactic_acid	0.080	0.163	-0.168	0.077	0.133	-0.066	0.065	-0.134	0.127	0.261	-0.045	0.203	0.107	0.003	0.015	0.193	0.020	-0.151	-0.071	0.037	0.030	0.064	-0.281	1.000	-0.332
pco2	-0.132	-0.113	0.113	-0.021	-0.049	-0.014	-0.167	0.097	0.099	-0.116	0.022	-0.167	-0.210	-0.099	-0.055	-0.014	0.098	0.202	0.179	-0.359	0.028	-0.409	0.704	-0.332	1.000

# Scatter plot of lactic acid (y) against bicarbonate (x) with a fitted straight line
library(ggplot2)

ggplot(df_train, aes(x = bicarbonate, y = lactic_acid)) +
  geom_point(alpha = 0.6) +  # scatter points
  geom_smooth(method = "lm", se = TRUE, color = "blue") +  # linear regression line
  theme_minimal() +
  labs(
    title = "Correlación entre ac lactico y bicarbonato",
    # Axis labels follow the aes() mapping: x is bicarbonate, y is lactic acid
    x = "Bicarbonato",
    y = "Ac Lactico"
  )

## `geom_smooth()` using formula = 'y ~ x'

# Spearman correlation between bicarbonate and lactic acid on complete pairs
with(
  df_train,
  cor(bicarbonate, lactic_acid, method = "spearman", use = "complete.obs")
)

## [1] -0.2812888

Evaluar linealidad entre variables continuas y logit died

# Bicarbonate: proportion of deaths within each quintile of the predictor
df_cont <- df_train %>%
  mutate(bicarbonate_q = ntile(bicarbonate, 5))

df_cont %>%
  group_by(bicarbonate_q) %>%
  # 'died' is a factor, so as.numeric() would return the level codes (1 for "0",
  # 2 for "1"); compare against the label "1" to count deaths.
  summarise(tar = mean(died == "1")) %>%
  ggplot(aes(x = bicarbonate_q, y = tar)) +
  geom_point() +
  geom_smooth(method = "lm", color = "blue")

## `geom_smooth()` using formula = 'y ~ x'

# Urine output: proportion of deaths within each quintile of the predictor
df_cont <- df_train %>%
  mutate(urine_output_q = ntile(urine_output, 5))

df_cont %>%
  group_by(urine_output_q) %>%
  # 'died' is a factor, so as.numeric() would return the level codes (1 for "0",
  # 2 for "1"); compare against the label "1" to count deaths.
  summarise(tar = mean(died == "1")) %>%
  ggplot(aes(x = urine_output_q, y = tar)) +
  geom_point() +
  geom_smooth(method = "lm", color = "blue")

## `geom_smooth()` using formula = 'y ~ x'

# Leucocyte: proportion of deaths within each quintile of the predictor
df_cont <- df_train %>%
  mutate(leucocyte_q = ntile(leucocyte, 5))

df_cont %>%
  group_by(leucocyte_q) %>%
  # 'died' is a factor, so as.numeric() would return the level codes (1 for "0",
  # 2 for "1"); compare against the label "1" to count deaths.
  summarise(tar = mean(died == "1")) %>%
  ggplot(aes(x = leucocyte_q, y = tar)) +
  geom_point() +
  geom_smooth(method = "lm", color = "blue")

## `geom_smooth()` using formula = 'y ~ x'

library(dplyr)
library(ggplot2)

# Plot, for each numeric predictor, the observed outcome proportion across
# quantile groups of that predictor.
#
# For every numeric column named in `variables`, the rows are split into
# `n_grupos` groups with ntile(). Within each group the proportion of rows whose
# outcome equals 1 and the mean of the predictor are computed, and a scatter
# plot with a fitted straight line is printed.
#
# Arguments:
#   df        data frame holding the outcome and the predictors.
#   outcome   name (string) of the outcome column; a row counts as an event when
#             its value, read as text, is "1" (works for 0/1 numerics and for
#             factors with labels "0"/"1").
#   variables character vector of candidate column names; non-numeric columns
#             are skipped.
#   n_grupos  number of groups passed to ntile() (default 5).
#
# Returns NULL invisibly; the function is called for the plots it prints.
evaluar_linealidad_logit <- function(df, outcome, variables, n_grupos = 5) {
  # Keep only the numeric variables; vapply() always yields a logical vector,
  # including when `variables` is empty.
  vars_num <- variables[vapply(df[variables], is.numeric, logical(1))]

  for (var in vars_num) {
    grupo <- paste0(var, "_q")

    # Assign each row to a quantile group of the current predictor
    df_cont <- df %>%
      mutate(!!grupo := ntile(.data[[var]], n_grupos))

    # Outcome proportion and mean predictor value per group.
    # The outcome is compared as text because as.numeric() on a factor returns
    # level codes (1, 2, ...) rather than the labels, which would count the
    # first level ("0") as the event.
    resumen <- df_cont %>%
      group_by(.data[[grupo]]) %>%
      summarise(
        proporcion = mean(as.character(.data[[outcome]]) == "1", na.rm = TRUE),
        valor_medio = mean(.data[[var]], na.rm = TRUE),
        .groups = "drop"
      )

    # Proportion against the group's mean predictor value, with a linear fit
    p <- ggplot(resumen, aes(x = valor_medio, y = proporcion)) +
      geom_point() +
      geom_smooth(method = "lm", color = "blue") +
      labs(
        title = paste("Evaluación de linealidad para", var),
        x = paste("Valor medio de", var, "(por grupo)"),
        y = paste("Proporción de", outcome, "=1")
      ) +
      theme_minimal()

    print(p)
  }

  invisible(NULL)
}

# Return the names of all numeric columns of `df`, in column order.
# Returns character(0) when `df` has no numeric columns.

obtener_variables_numericas <- function(df) {
  # Take the names from the argument itself, not from a global data frame, so
  # the result always matches the columns that were tested.
  names(df)[vapply(df, is.numeric, logical(1))]
}

# Collect the numeric column names of the training data
var_numericas <- obtener_variables_numericas(df_train)

# Draw the proportion-by-group plot of 'died' for each numeric variable
evaluar_linealidad_logit(df = df_train, outcome = "died", variables = var_numericas)

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

Take Home

Daniel Chang - Jao lee

2025-07-03