##instalación de paquetes útiles
Transformo las variables categoricas en factores
base$Sexo <- factor(base$Sexo, labels = c("Femenino", "Masculino"))
base$HCC <- factor(base$HCC, labels = c("No", "Si"))
base$DM <- factor(base$DM, labels = c("No", "Si"))
base$HTA <- factor(base$HTA, labels = c("No", "Si"))
base$EPS [base$EPS == 2 | base$EPS == 3 ]<- 1
base$EPS <- factor(base$EPS, labels = c("No", "Si"))
base$Ascitis <- factor(base$Ascitis, labels = c("No", "Si"))
base$AKI_previas <- factor(base$AKI_previas, labels = c("No", "Si"))
base$Sme_reperfusion <- factor(base$Sme_reperfusion, labels = c("No", "Si"))
base$Pobre_funcion <- factor(base$Pobre_funcion, labels = c("No", "Si"))
base$Relaparotomia <- factor(base$Relaparotomia, labels = c("No", "Si"))
base$Nefrotoxicos <- factor(base$Nefrotoxicos, labels = c("No", "Si"))
base$tecnica_quirurgica <- factor(base$tecnica_quirurgica)
Borro las variables que no vamos a utilizar para el modelo
Esta decisión es tomada bajo lineamientos teóricos establecidos en investigaciones previas y bajo razonamientos clínicos.
base <- base [, - c(4,18)] # Saco grupo sanguineo (4) y MELD_NA (18)
base <- base [, - c(10,12,35)] # Saco grado de ascitis (10) y grados de AKI previas (12) y grado de AKI 48 hs (35)
base <- base [, - c(33:37)] # Saco TTR (33), basiliximab (34) dias de UTI (35), internacion (36), muerte (37)
base <- base [, - c(3)] # Saco Causa de cirrosis (3)
Chequeo de datos faltantes en la base.
Utilizamos la libreria mice para buscar datos faltantes
library(mice)
md.pattern(base)
## Sexo Edad_Tx HCC BMI DM HTA EPS Ascitis AKI_previas RIN BT Na Cistatina
## 149 1 1 1 1 1 1 1 1 1 1 1 1 1
## 14 1 1 1 1 1 1 1 1 1 1 1 1 1
## 0 0 0 0 0 0 0 0 0 0 0 0 0
## Creatinina WIT_min CIT_hs Sme_reperfusion Inotropicos_qx Globulos_Rojos
## 149 1 1 1 1 1 1
## 14 1 1 1 1 1 1
## 0 0 0 0 0 0
## Hemoderivados AST ALT tecnica_quirurgica Pico_Lactato Nefrotoxicos
## 149 1 1 1 1 1 1
## 14 1 1 1 1 1 1
## 0 0 0 0 0 0
## edad_donante BMI_donante Pobre_funcion Relaparotomia AKI_48hs Alb
## 149 1 1 1 1 1 1 0
## 14 1 1 1 1 1 0 1
## 0 0 0 0 0 14 14
El gráfico nos dice que: Hay 149 observaciones completas, sin celdas vacías o datos faltantes Hay 14 observaciones que le faltan solo la variable albúmina
Debido a que la variable Albúmina es la que presenta datos faltantes , realizo un análisis descriptivo breve en función de dicha variable para determinar si hay una tendencia en la falta de ese dato.
library(arsenal)
base$miss.univ<-ifelse(!is.na(base$Alb), 0, 1)
tab1 <- tableby(miss.univ ~ ., data=base)
summary(tab1, text=TRUE)
##
##
## | | 0 (N=149) | 1 (N=14) | Total (N=163) | p value|
## |:------------------|:-------------------:|:-------------------:|:-------------------:|-------:|
## |Sexo | | | | 0.418|
## |- Femenino | 59 (39.6%) | 4 (28.6%) | 63 (38.7%) | |
## |- Masculino | 90 (60.4%) | 10 (71.4%) | 100 (61.3%) | |
## |Edad_Tx | | | | 0.332|
## |- Mean (SD) | 52.644 (12.278) | 55.929 (9.286) | 52.926 (12.062) | |
## |- Range | 21.000 - 73.000 | 42.000 - 68.000 | 21.000 - 73.000 | |
## |HCC | | | | 0.078|
## |- No | 99 (66.4%) | 6 (42.9%) | 105 (64.4%) | |
## |- Si | 50 (33.6%) | 8 (57.1%) | 58 (35.6%) | |
## |BMI | | | | 0.218|
## |- Mean (SD) | 27.821 (5.227) | 29.586 (3.386) | 27.973 (5.111) | |
## |- Range | 17.800 - 41.600 | 20.400 - 34.000 | 17.800 - 41.600 | |
## |DM | | | | 0.670|
## |- No | 114 (76.5%) | 10 (71.4%) | 124 (76.1%) | |
## |- Si | 35 (23.5%) | 4 (28.6%) | 39 (23.9%) | |
## |HTA | | | | 0.159|
## |- No | 135 (90.6%) | 11 (78.6%) | 146 (89.6%) | |
## |- Si | 14 (9.4%) | 3 (21.4%) | 17 (10.4%) | |
## |EPS | | | | 0.098|
## |- No | 72 (48.3%) | 10 (71.4%) | 82 (50.3%) | |
## |- Si | 77 (51.7%) | 4 (28.6%) | 81 (49.7%) | |
## |Ascitis | | | | 0.329|
## |- No | 45 (30.2%) | 6 (42.9%) | 51 (31.3%) | |
## |- Si | 104 (69.8%) | 8 (57.1%) | 112 (68.7%) | |
## |AKI_previas | | | | 0.088|
## |- No | 107 (71.8%) | 13 (92.9%) | 120 (73.6%) | |
## |- Si | 42 (28.2%) | 1 (7.1%) | 43 (26.4%) | |
## |RIN | | | | 0.404|
## |- Mean (SD) | 1.824 (0.834) | 1.636 (0.399) | 1.808 (0.807) | |
## |- Range | 0.900 - 7.870 | 1.200 - 2.500 | 0.900 - 7.870 | |
## |BT | | | | 0.024|
## |- Mean (SD) | 7.113 (8.676) | 1.821 (1.771) | 6.659 (8.440) | |
## |- Range | 0.460 - 50.730 | 0.400 - 7.100 | 0.400 - 50.730 | |
## |Na | | | | 0.350|
## |- Mean (SD) | 134.564 (5.226) | 135.929 (5.030) | 134.681 (5.209) | |
## |- Range | 120.000 - 148.000 | 124.000 - 144.000 | 120.000 - 148.000 | |
## |Alb | | | | |
## |- N-Miss | 0 | 14 | 14 | |
## |- Mean (SD) | 2.872 (0.650) | NA | 2.872 (0.650) | |
## |- Range | 1.300 - 4.760 | NA | 1.300 - 4.760 | |
## |Cistatina | | | | 0.861|
## |- Mean (SD) | 1.482 (0.504) | 1.459 (0.219) | 1.480 (0.486) | |
## |- Range | 0.490 - 3.660 | 1.170 - 1.870 | 0.490 - 3.660 | |
## |Creatinina | | | | 0.855|
## |- Mean (SD) | 0.855 (0.415) | 0.875 (0.181) | 0.856 (0.400) | |
## |- Range | 0.320 - 4.200 | 0.560 - 1.170 | 0.320 - 4.200 | |
## |WIT_min | | | | 0.674|
## |- Mean (SD) | 45.215 (9.916) | 44.071 (6.889) | 45.117 (9.682) | |
## |- Range | 11.000 - 77.000 | 30.000 - 55.000 | 11.000 - 77.000 | |
## |CIT_hs | | | | 0.726|
## |- Mean (SD) | 7.392 (4.610) | 6.957 (1.305) | 7.355 (4.424) | |
## |- Range | 4.100 - 60.000 | 4.400 - 9.300 | 4.100 - 60.000 | |
## |Sme_reperfusion | | | | 0.108|
## |- No | 73 (49.0%) | 10 (71.4%) | 83 (50.9%) | |
## |- Si | 76 (51.0%) | 4 (28.6%) | 80 (49.1%) | |
## |Inotropicos_qx | | | | 0.324|
## |- Mean (SD) | 0.174 (0.381) | 0.071 (0.267) | 0.166 (0.373) | |
## |- Range | 0.000 - 1.000 | 0.000 - 1.000 | 0.000 - 1.000 | |
## |Globulos_Rojos | | | | 0.009|
## |- Mean (SD) | 2.497 (2.811) | 0.500 (0.760) | 2.325 (2.753) | |
## |- Range | 0.000 - 16.000 | 0.000 - 2.000 | 0.000 - 16.000 | |
## |Hemoderivados | | | | 0.006|
## |- Mean (SD) | 10.933 (12.687) | 1.357 (1.781) | 10.110 (12.432) | |
## |- Range | 0.000 - 50.000 | 0.000 - 5.000 | 0.000 - 50.000 | |
## |AST | | | | 0.279|
## |- Mean (SD) | 1953.047 (1688.017) | 2461.286 (1501.993) | 1996.699 (1674.693) | |
## |- Range | 221.000 - 8636.000 | 520.000 - 5305.000 | 221.000 - 8636.000 | |
## |ALT | | | | 0.224|
## |- Mean (SD) | 1087.168 (1362.046) | 1542.929 (1001.852) | 1126.313 (1338.581) | |
## |- Range | 109.000 - 13356.000 | 205.000 - 2873.000 | 109.000 - 13356.000 | |
## |tecnica_quirurgica | | | | 0.010|
## |- Convencional | 49 (32.9%) | 0 (0.0%) | 49 (30.1%) | |
## |- PB | 100 (67.1%) | 14 (100.0%) | 114 (69.9%) | |
## |Pico_Lactato | | | | 0.010|
## |- Mean (SD) | 4.317 (2.340) | 2.674 (0.942) | 4.176 (2.300) | |
## |- Range | 1.000 - 14.200 | 1.000 - 4.000 | 1.000 - 14.200 | |
## |Nefrotoxicos | | | | 0.006|
## |- No | 128 (85.9%) | 8 (57.1%) | 136 (83.4%) | |
## |- Si | 21 (14.1%) | 6 (42.9%) | 27 (16.6%) | |
## |edad_donante | | | | 0.949|
## |- Mean (SD) | 43.490 (15.602) | 43.214 (12.154) | 43.466 (15.305) | |
## |- Range | 8.000 - 81.000 | 25.000 - 63.000 | 8.000 - 81.000 | |
## |BMI_donante | | | | 0.688|
## |- Mean (SD) | 28.672 (6.174) | 29.343 (2.152) | 28.730 (5.936) | |
## |- Range | 17.500 - 60.600 | 25.000 - 32.000 | 17.500 - 60.600 | |
## |Pobre_funcion | | | | 0.749|
## |- No | 102 (68.5%) | 9 (64.3%) | 111 (68.1%) | |
## |- Si | 47 (31.5%) | 5 (35.7%) | 52 (31.9%) | |
## |Relaparotomia | | | | 0.433|
## |- No | 127 (85.2%) | 13 (92.9%) | 140 (85.9%) | |
## |- Si | 22 (14.8%) | 1 (7.1%) | 23 (14.1%) | |
## |AKI_48hs | | | | 0.531|
## |- Mean (SD) | 0.483 (0.501) | 0.571 (0.514) | 0.491 (0.501) | |
## |- Range | 0.000 - 1.000 | 0.000 - 1.000 | 0.000 - 1.000 | |
base<-base[,-c(32)]
Hay diferencias en algunas de las variables como Globulos rojos y Tecnica quirurgica pero no en la variable resultado
Realizamos una exploración gráfica de los datos faltantes.
En los análisis de los datos faltantes podemos ver que el 0,277% de las celdas totales presentan datos faltantes. En caso que se eliminen los pacientes que contengan al menos un dato faltante, el porcentaje de pérdida de la muestra sería 8,6%
El mecanismo de generación de datos faltantes fue Missing at Random (MAR), ya que la probabilidad de que falte una observación no se relacionó con el valor ausente en si, sino con la variable en que fue observada. Los datos faltantes corresponden al mismo centro de salud que no reportó en sus laboratorios los valores de dicha variable.
Imputo los datos faltantes de la variable Alb (albumina)
Uso el paquete MissRanger que utiliza Random Forest para imputar los datos de albumina que nos faltan. No use la que nos dio Adriana porque despues no permite comparar modelos ni trabajar mucho con el objeto.
La hice asi por una cuestion de comodidad, una vez que elijamos bien las variables, hacemos la otra tecnica de imputacion para obtener bien los coeficientes
library(ggplot2)
library(missRanger)
a<-missRanger( base, formula = Alb ~ .,
num.trees = 1000,
verbose = 2,
seed = 111,
returnOOB = T)
##
## Missing value imputation by random forests
##
## Variables to impute: Alb
## Variables used to impute: Sexo, Edad_Tx, HCC, BMI, DM, HTA, EPS, Ascitis, AKI_previas, RIN, BT, Na, Alb, Cistatina, Creatinina, WIT_min, CIT_hs, Sme_reperfusion, Inotropicos_qx, Globulos_Rojos, Hemoderivados, AST, ALT, tecnica_quirurgica, Pico_Lactato, Nefrotoxicos, edad_donante, BMI_donante, Pobre_funcion, Relaparotomia, AKI_48hs
## Alb
## iter 1: 0.6423
## iter 2: 0.2259
## iter 3: 0.2404
Utilizo una variable continua de la base (Na) para ver graficamente como imputo la tecnica.
ggplot()+ geom_point(data = a, aes(Alb, Cistatina ),
color = "red")+ geom_point(data = base, aes(Alb, Cistatina), color = "dodgerblue3") + theme_bw()
## Warning: Removed 14 rows containing missing values (geom_point).
sum(is.na(a)) # No hay datos faltantes en la base
## [1] 0
Empiezo a buscar metodos de seleccion de variables
modelo_glm <- glm( AKI_48hs~. , data = a, family = binomial())
summary(modelo_glm)
##
## Call:
## glm(formula = AKI_48hs ~ ., family = binomial(), data = a)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6012 -0.9222 -0.3722 0.9357 2.0810
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.7336444 6.3404563 0.431 0.6664
## SexoMasculino 0.8960333 0.4747051 1.888 0.0591 .
## Edad_Tx -0.0019501 0.0209260 -0.093 0.9258
## HCCSi -0.3051944 0.5142896 -0.593 0.5529
## BMI 0.0448953 0.0432455 1.038 0.2992
## DMSi 0.4070473 0.5239952 0.777 0.4373
## HTASi -0.2874923 0.6697792 -0.429 0.6678
## EPSSi 0.2005553 0.4090550 0.490 0.6239
## AscitisSi 0.3104311 0.5089598 0.610 0.5419
## AKI_previasSi -0.4626664 0.5579011 -0.829 0.4069
## RIN -0.1890335 0.2792872 -0.677 0.4985
## BT 0.0156510 0.0310849 0.503 0.6146
## Na -0.0445662 0.0472544 -0.943 0.3456
## Alb -0.2361222 0.4333888 -0.545 0.5859
## Cistatina 1.6548794 0.6485188 2.552 0.0107 *
## Creatinina -0.9908975 0.7093915 -1.397 0.1625
## WIT_min -0.0008902 0.0234276 -0.038 0.9697
## CIT_hs 0.0431571 0.0452704 0.953 0.3404
## Sme_reperfusionSi -0.6695037 0.4945725 -1.354 0.1758
## Inotropicos_qx -0.5729760 0.5778916 -0.991 0.3214
## Globulos_Rojos 0.1110216 0.1298969 0.855 0.3927
## Hemoderivados -0.0188826 0.0234511 -0.805 0.4207
## AST 0.0001922 0.0001723 1.115 0.2648
## ALT -0.0000230 0.0001630 -0.141 0.8878
## tecnica_quirurgicaPB -0.1993657 0.4623109 -0.431 0.6663
## Pico_Lactato 0.1319232 0.1068506 1.235 0.2170
## NefrotoxicosSi 0.3934237 0.5774058 0.681 0.4956
## edad_donante 0.0080201 0.0141800 0.566 0.5717
## BMI_donante -0.0268570 0.0426458 -0.630 0.5288
## Pobre_funcionSi 0.6073226 0.5414260 1.122 0.2620
## RelaparotomiaSi 1.9226244 0.8300788 2.316 0.0205 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 225.91 on 162 degrees of freedom
## Residual deviance: 175.23 on 132 degrees of freedom
## AIC: 237.23
##
## Number of Fisher Scoring iterations: 5
De nuevo las unicas que aparecen son Cistatina y Relaparotomia pero Sexo masculino tiene una pvalor en el borde de la significancia
Utilizamos la libreria bestGLM como primera aproximacion.
library(bestglm)
Xmodelo<-model.matrix(modelo_glm)
Y<-a$AKI_48hs
Xy<-data.frame(Xmodelo[,-1],Y)
res.bestglm <-bestglm(Xy = Xy, IC = "CV",method = "exhaustive")
res.bestglm$Subsets
El metodo BestGLM selecciona un modelo que solo tiene 2 variables: Cistatina y Relaparotomia
Realizo un modelo GLM solo con esas 2 variables
modelo_cista_relap <- glm(AKI_48hs ~ Cistatina + Relaparotomia, data = a, family = binomial())
summary(modelo_cista_relap)
##
## Call:
## glm(formula = AKI_48hs ~ Cistatina + Relaparotomia, family = binomial(),
## data = a)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3659 -1.0028 -0.7541 1.2075 1.7053
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.6898 0.5748 -2.940 0.003284 **
## Cistatina 0.9470 0.3698 2.561 0.010447 *
## RelaparotomiaSi 2.2097 0.6499 3.400 0.000673 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 225.91 on 162 degrees of freedom
## Residual deviance: 201.81 on 160 degrees of freedom
## AIC: 207.81
##
## Number of Fisher Scoring iterations: 4
Utilizamos Random Forest como metodo alternativo de seleccion de variables
library(randomForest)
rf<-randomForest(a$AKI_48hs~., data = a [, - 31] )
library( dplyr)
v<-as.vector(rf$importance[,1])
w<-(as.vector((colnames(a))))
DF<-cbind(w,v)
DF<-as.data.frame(DF)
str(DF)
## 'data.frame': 31 obs. of 2 variables:
## $ w: chr "Sexo" "Edad_Tx" "HCC" "BMI" ...
## $ v: chr "0.436923394170995" "1.65526258801807" "0.203447889121571" "2.25363359402753" ...
DF<-DF %>% mutate(v=as.numeric(v),
w=as.factor(w))
ggplot(DF, aes(x=reorder(w,v), y=v,fill=w))+
geom_bar(stat="identity", position="dodge")+ coord_flip()+
ylab("Variable Importance")+ theme_bw() +
xlab("")+ theme(legend.position = "none")
Lo primero que se puede observar en el random Random Forest es que parecen haber varias variables involucradas en el desarrollo del outcome primario.
Aquellas con mayor importancia parecen ser Pico de lactato, Cistatina y Bilirrubina. Tambien BMI, RIN, ALT, AST y la edad del donante parecen ser relevantes. Llamativamente, Relaparotomia parece tener un valor intermedio con respecto a otras variables
Por ultimo utilizamos Regresion LASSO
library(glmnet)
x_1 <- model.matrix(AKI_48hs ~., a)[,- 32] #aca le sacamos la rta
y_1 <- a$AKI_48hs
dim(x_1)
## [1] 163 31
lasso_m1 <- glmnet(x_1, y_1, alpha=1)
plot(lasso_m1,xvar="lambda",label=TRUE)
plot(lasso_m1,xvar="norm",label=TRUE)
#x_1
set.seed(1)
lasso_m1_cv <- cv.glmnet(x_1, y_1, alpha=1, nfolds=101)
# mean cross validation error para lambda minimo
lasso_m1_cv$lambda.min
## [1] 0.06055254
which(lasso_m1_cv$lambda==lasso_m1_cv$lambda.min)
## [1] 11
lasso_m1_cv$cvm[(which(lasso_m1_cv$lambda==lasso_m1_cv$lambda.min))]
## s10
## 0.2369313
# guardo lambda minimo
min_lambda_lasso_m1_cv <- lasso_m1_cv$lambda.min
plot(lasso_m1_cv); abline(v=log(min_lambda_lasso_m1_cv), col="blue")
#### se calculan los coeficientes para el lambda seleccionado
predict(lasso_m1_cv, type="coefficients", s=min_lambda_lasso_m1_cv)[1:32,]
## (Intercept) (Intercept) SexoMasculino
## 0.287431531 0.000000000 0.000000000
## Edad_Tx HCCSi BMI
## 0.000000000 0.000000000 0.000000000
## DMSi HTASi EPSSi
## 0.000000000 0.000000000 0.000000000
## AscitisSi AKI_previasSi RIN
## 0.000000000 0.000000000 0.000000000
## BT Na Alb
## 0.000000000 0.000000000 -0.008812062
## Cistatina Creatinina WIT_min
## 0.079944455 0.000000000 0.000000000
## CIT_hs Sme_reperfusionSi Inotropicos_qx
## 0.000000000 0.000000000 0.000000000
## Globulos_Rojos Hemoderivados AST
## 0.007073926 0.000000000 0.000000000
## ALT tecnica_quirurgicaPB Pico_Lactato
## 0.000000000 0.000000000 0.013824542
## NefrotoxicosSi edad_donante BMI_donante
## 0.000000000 0.000000000 0.000000000
## Pobre_funcionSi RelaparotomiaSi
## 0.024558519 0.202324474
#coefs_lasso_min <- coef(lasso_m1_cv, s=min_lambda_lasso_m1_cv)
#coefs_lasso_min
Se puede ver que Cistatina, Relaparotomia y Pico de lactato aparecen entre las variables mas relevantes. Tambien aparecen Pobre fucion y Albumina
modelo_selec <- glm(a$AKI_48hs~ Relaparotomia + Cistatina + Pico_Lactato + BMI + BT +
RIN + AST + Globulos_Rojos,
data = a, family = binomial())
summary(modelo_selec)
##
## Call:
## glm(formula = a$AKI_48hs ~ Relaparotomia + Cistatina + Pico_Lactato +
## BMI + BT + RIN + AST + Globulos_Rojos, family = binomial(),
## data = a)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6395 -0.9092 -0.6161 1.0658 1.9934
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.0944284 1.3021632 -3.144 0.00166 **
## RelaparotomiaSi 1.8367198 0.7155745 2.567 0.01026 *
## Cistatina 0.9395025 0.4018956 2.338 0.01940 *
## Pico_Lactato 0.1648267 0.0958320 1.720 0.08544 .
## BMI 0.0468399 0.0362770 1.291 0.19664
## BT 0.0154347 0.0245772 0.628 0.53000
## RIN -0.0061248 0.2262271 -0.027 0.97840
## AST 0.0001365 0.0001158 1.178 0.23870
## Globulos_Rojos 0.0587532 0.0961345 0.611 0.54110
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 225.91 on 162 degrees of freedom
## Residual deviance: 190.86 on 154 degrees of freedom
## AIC: 208.86
##
## Number of Fisher Scoring iterations: 5
No parece que la mayoria de las variables seleccionadas previamente sean significativas en el modelo asi como estan
AIC(modelo_cista_relap, modelo_selec)
Incluso el modelo con las 2 variables elegidas por GLM tiene mejor AIC que este ultimo
Es posible que no todas las variables continuas tengan una relacion lineal con respecto a la variables resultado Utilizo ggpredict para ver graficamente la sospecha
library(ggeffects)
grafico<-ggpredict(modelo_selec)
plot(grafico, add.data = TRUE, grid = FALSE)
## $Relaparotomia
##
## $Cistatina
##
## $Pico_Lactato
##
## $BMI
##
## $BT
##
## $RIN
##
## $AST
##
## $Globulos_Rojos
Utilizo la funcion cutpointr para elegir el mejor punto de corte para la categorizacion de las variables
library(cutpointr)
Cistatina <-cutpointr(a,Cistatina , AKI_48hs, method = maximize_metric, metric = youden)
summary(Cistatina)
plot_metric(Cistatina)
rojos<-cutpointr(a,Globulos_Rojos , AKI_48hs, method = maximize_metric, metric = youden)
summary(rojos)
plot_metric(rojos)
lactato<-cutpointr(a, Pico_Lactato , AKI_48hs, method = maximize_metric, metric = youden)
summary(lactato)
plot_metric(lactato)
AST<-cutpointr(a, AST , AKI_48hs, method = maximize_metric, metric = youden)
summary(AST)
plot_metric(AST)
BMI <-cutpointr(a, BMI , AKI_48hs, method = maximize_metric, metric = youden)
summary(BMI)
plot_metric(BMI)
BT <-cutpointr(a, BT , AKI_48hs, method = maximize_metric, metric = youden)
summary(BT)
plot_metric(BT)
RIN <-cutpointr(a, RIN , AKI_48hs, method = maximize_metric, metric = youden)
summary(RIN)
plot_metric(RIN)
a$globrojcat<-as.factor(ifelse(a$Globulos_Rojos>=5,1,0))
a$cistcat<-as.factor(ifelse(a$Cistatina>=1.5,1,0))
a$bmicat<-as.factor(ifelse(a$BMI>=29,1,0))
a$lactatocat<-as.factor(ifelse(a$Pico_Lactato>=5,1,0))
a$astcat<-as.factor(ifelse(a$AST>=3000,1,0))
a$RINcat <- as.factor(ifelse(a$RIN >=1.45,1,0))
a$BTcat <- as.factor(ifelse(a$BT >=3.5,1,0))
modelo_cat3 <- glm(a$AKI_48hs~ Relaparotomia + cistcat + bmicat + globrojcat +
lactatocat + astcat + bmicat + BTcat + RINcat ,
data = a, family = binomial())
summary(modelo_cat3)
##
## Call:
## glm(formula = a$AKI_48hs ~ Relaparotomia + cistcat + bmicat +
## globrojcat + lactatocat + astcat + bmicat + BTcat + RINcat,
## family = binomial(), data = a)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7164 -0.7985 -0.3649 0.8461 2.0277
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.67589 0.49764 -5.377 7.57e-08 ***
## RelaparotomiaSi 1.74868 0.78880 2.217 0.02663 *
## cistcat1 1.11484 0.41399 2.693 0.00708 **
## bmicat1 0.75696 0.40796 1.855 0.06353 .
## globrojcat1 0.08398 0.63829 0.132 0.89532
## lactatocat1 1.81518 0.48643 3.732 0.00019 ***
## astcat1 1.35524 0.54929 2.467 0.01362 *
## BTcat1 0.74299 0.45171 1.645 0.10000
## RINcat1 0.90429 0.47916 1.887 0.05913 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 225.91 on 162 degrees of freedom
## Residual deviance: 158.31 on 154 degrees of freedom
## AIC: 176.31
##
## Number of Fisher Scoring iterations: 5
Saco las variables RIN y Globulos rojos que no dieron significativas
modelo_cat4 <- glm(a$AKI_48hs~ Relaparotomia + cistcat + bmicat +
lactatocat + astcat + BTcat ,
data = a, family = binomial())
summary(modelo_cat4)
##
## Call:
## glm(formula = a$AKI_48hs ~ Relaparotomia + cistcat + bmicat +
## lactatocat + astcat + BTcat, family = binomial(), data = a)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4878 -0.7616 -0.4482 0.8529 2.1671
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.2477 0.4156 -5.409 6.34e-08 ***
## RelaparotomiaSi 1.6501 0.7772 2.123 0.033742 *
## cistcat1 1.1314 0.3994 2.833 0.004614 **
## bmicat1 0.7691 0.3993 1.926 0.054103 .
## lactatocat1 1.7452 0.4506 3.873 0.000107 ***
## astcat1 1.3396 0.5418 2.472 0.013425 *
## BTcat1 1.1583 0.3952 2.931 0.003383 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 225.91 on 162 degrees of freedom
## Residual deviance: 162.06 on 156 degrees of freedom
## AIC: 176.06
##
## Number of Fisher Scoring iterations: 5
Parece que este seria el modelo definitivo
Compruebo la colinealidad y los supuestos del modelo
library(car)
library(DHARMa)
vif_modelo <- vif(modelo_cat4)
vif_modelo
## Relaparotomia cistcat bmicat lactatocat astcat
## 1.010724 1.056285 1.024002 1.057080 1.053540
## BTcat
## 1.042492
supuestos <-simulateResiduals(modelo_cat4, seed = 123, plot = T)
La variable Bilirrubina (BTcat) es la que esta llevando problemas al modelo
modelo_cat5 <- glm(a$AKI_48hs~ Relaparotomia + cistcat +
lactatocat + astcat + bmicat ,
data = a, family = binomial())
summary(modelo_cat5)
##
## Call:
## glm(formula = a$AKI_48hs ~ Relaparotomia + cistcat + lactatocat +
## astcat + bmicat, family = binomial(), data = a)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7348 -0.7932 -0.5622 0.8961 1.9611
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.7650 0.3498 -5.046 4.52e-07 ***
## RelaparotomiaSi 1.7656 0.7374 2.394 0.016659 *
## cistcat1 1.2768 0.3852 3.315 0.000917 ***
## lactatocat1 1.6683 0.4340 3.844 0.000121 ***
## astcat1 1.2256 0.5298 2.313 0.020703 *
## bmicat1 0.7699 0.3884 1.982 0.047448 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 225.91 on 162 degrees of freedom
## Residual deviance: 170.98 on 157 degrees of freedom
## AIC: 182.98
##
## Number of Fisher Scoring iterations: 5
supuestos <-simulateResiduals(modelo_cat5, seed = 123, plot = T)
Si saco del modelo la variable BTcat se resuelve el problema de los supuestos aunque el AIC es bastante peor
library(gtsummary)
tbl_regression(modelo_cat4, exponentiate = T, label = c(Relaparotomia ~ "Relaparotomia",
cistcat~"Cistatina > 1.5 mg/dl", bmicat~"IMC > 29 Kg/m2",lactatocat~"Lactato > 5 mg/dl", astcat~"AST > 3000 UI/L", BTcat~"BT > 3.5 mg/dl"), show_single_row = c(Relaparotomia,cistcat, bmicat, lactatocat, astcat, BTcat))
| Characteristic | OR1 | 95% CI1 | p-value |
|---|---|---|---|
| Relaparotomia | 5.21 | 1.26, 28.4 | 0.034 |
| Cistatina > 1.5 mg/dl | 3.10 | 1.44, 6.92 | 0.005 |
| IMC > 29 Kg/m2 | 2.16 | 0.99, 4.78 | 0.054 |
| Lactato > 5 mg/dl | 5.73 | 2.44, 14.4 | <0.001 |
| AST > 3000 UI/L | 3.82 | 1.36, 11.6 | 0.013 |
| BT > 3.5 mg/dl | 3.18 | 1.48, 7.04 | 0.003 |
| 1 OR = Odds Ratio, CI = Confidence Interval | |||
Divido la base en entrenamiento y testeo
set.seed(123)
n <- nrow(a)
ntrain <- floor(n*0.7)
itrain <- sample(1:n, ntrain)
datostrain <- a[itrain,] # Armo mi base de entrenamiento con N=114 (70%)
datostest <- a[-itrain, ] # Armo mi base de testeo con N=49 (30%)
Entreno el modelo con la base de entrenamiento basado en el 70% de la muestra
ajuste1<-glm(AKI_48hs~ Relaparotomia + cistcat + bmicat + lactatocat + astcat + BTcat ,
data = datostrain, family = binomial())
# Modelo a testear hecho con la base de entrenamiento
summary(ajuste1)
##
## Call:
## glm(formula = AKI_48hs ~ Relaparotomia + cistcat + bmicat + lactatocat +
## astcat + BTcat, family = binomial(), data = datostrain)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3172 -0.8227 -0.4842 0.7793 2.0985
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.0847 0.4767 -4.374 1.22e-05 ***
## RelaparotomiaSi 1.5548 0.9385 1.657 0.097593 .
## cistcat1 0.8265 0.4806 1.720 0.085516 .
## bmicat1 0.3717 0.4847 0.767 0.443152
## lactatocat1 1.9458 0.5377 3.619 0.000296 ***
## astcat1 1.5091 0.6303 2.394 0.016660 *
## BTcat1 1.1751 0.4707 2.497 0.012537 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 157.72 on 113 degrees of freedom
## Residual deviance: 114.16 on 107 degrees of freedom
## AIC: 128.16
##
## Number of Fisher Scoring iterations: 5
Pongo un tita conservador y armo la tabla de doble entrada con observados y predichos por el modelo.
tita<-0.5 # Conservador como tirar la moneda
ajustados1<-predict(ajuste1, newdata = datostest, type="response") # Probabilidad predicha de AKI 48 hs con el modelo sobre la base datostesteo
predichos1<-as.factor(ifelse(ajustados1>tita, 1, 0)) # Como quedaría categorizada esa observación según tita 0.5
tabla1<-table(datostest$AKI_48hs, predichos1)
tabla1
## predichos1
## 0 1
## 0 21 2
## 1 8 18
tasa_error_te1<-mean(datostest$AKI_48hs!=predichos1)
accuracy1<-1-tasa_error_te1
accuracy1
## [1] 0.7959184
Calculo el AUC y el punto de corte de menor distancia al [0,1]
library(ROCit)
## Warning: package 'ROCit' was built under R version 4.2.1
##
## Attaching package: 'ROCit'
## The following object is masked from 'package:psych':
##
## logit
## The following object is masked from 'package:car':
##
## logit
clase<-datostest$AKI_48hs
ROC <- rocit(score=ajustados1,class=clase)
plot(ROC, col="#1c61b6AA")
ROC$AUC
## [1] 0.8118729
# busco la distancia del (fpr,tpr) al (0,1)
distancia<-sqrt(ROC$FPR^2 +(ROC$TPR-1)^2)
plot(ROC$Cutoff,distancia,pch=20)
minimo<-ROC$Cutoff[which.min(distancia)]
Busco la tasa de error del modelo pero ahora utilizando el mínimo tita óptimo encontrado.
# el minimo es el tita optimo segun la curva roc, calculo el error ahora
tita<-minimo
ajustados1<-predict(ajuste1,newdata = datostest,type="response")
predichos1<-ifelse(ajustados1>tita,1,0)
tabla1<-table(datostest$AKI_48hs, predichos1)
tasa_error_te1<-mean(datostest$AKI_48hs!=predichos1)
tasa_error_te1
## [1] 0.2857143
Utilizo la base de entrenamiento para calcular ROC y AUC
library("pROC")
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:cutpointr':
##
## auc, roc
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# CON LA BASE DE ENTRENAMIENTO
#creo las curvas sobre las que trabajaré.
roc<-roc(datostrain$AKI_48hs,ajuste1$fitted.values)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc1<-roc(datostrain$AKI_48hs,ajuste1$fitted.values,smooth=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc)
plot(roc1)
auc(roc)
## Area under the curve: 0.8363
coords(roc,transpose = TRUE)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## threshold -Inf 0.1316827 0.1870263 0.2541789 0.2894527 0.3258836 0.3643068
## specificity 0 0.3000000 0.4000000 0.4833333 0.5666667 0.6166667 0.6166667
## sensitivity 1 0.9814815 0.9629630 0.9074074 0.8703704 0.8333333 0.7962963
## [,8] [,9] [,10] [,11] [,12] [,13]
## threshold 0.3696023 0.4098636 0.4572613 0.4722779 0.5185818 0.5647900
## specificity 0.6666667 0.6833333 0.7000000 0.7666667 0.8500000 0.8666667
## sensitivity 0.7777778 0.7777778 0.7777778 0.7592593 0.6851852 0.6296296
## [,14] [,15] [,16] [,17] [,18] [,19]
## threshold 0.5726215 0.6095669 0.6481683 0.6533597 0.6606722 0.6998929
## specificity 0.9000000 0.9000000 0.9333333 0.9333333 0.9333333 0.9500000
## sensitivity 0.6111111 0.5925926 0.5740741 0.5370370 0.5185185 0.5185185
## [,20] [,21] [,22] [,23] [,24] [,25]
## threshold 0.7362365 0.7403315 0.7699793 0.8004199 0.8048510 0.8285977
## specificity 0.9500000 0.9500000 0.9500000 0.9500000 0.9500000 0.9500000
## sensitivity 0.5000000 0.4259259 0.4074074 0.3703704 0.3333333 0.3148148
## [,26] [,27] [,28] [,29] [,30] [,31]
## threshold 0.8543941 0.8617352 0.8827779 0.9158543 0.9413090 0.9575859
## specificity 0.9500000 0.9666667 0.9666667 0.9833333 1.0000000 1.0000000
## sensitivity 0.2962963 0.2592593 0.1851852 0.1851852 0.1851852 0.1666667
## [,32] [,33] [,34] [,35] [,36] [,37]
## threshold 0.9652490 0.9664971 0.9675275 0.97264989 0.97746986 0.98534278
## specificity 1.0000000 1.0000000 1.0000000 1.00000000 1.00000000 1.00000000
## sensitivity 0.1481481 0.1296296 0.1111111 0.05555556 0.03703704 0.01851852
## [,38]
## threshold Inf
## specificity 1
## sensitivity 0
coords(roc,transpose = FALSE)
#dibujamos bonita la curva.
#insertar AUC y IC
rocobj <- plot.roc(datostrain$AKI_48hs,ajuste1$fitted.values,
main="Confidence intervals",
percent=TRUE,ci=TRUE,print.auc=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
#creamos objeto para plotear el IC por defecto al 95% CI. Boots Default: 2000.
ciobj <- ci.se(rocobj, specificities=seq(0, 100, 5))
#insertamos el IC
plot(ciobj, type="shape", col="#1c61b6AA") # plot as a blue shape
plot(ci(rocobj, of="thresholds", thresholds="best")) # add one threshold
#le inserto el punto de corte sobre la gráfica con su IC
umbral<-plot.roc(datostrain$AKI_48hs,ajuste1$fitted.values, main="Confidence interval of a threshold", percent=TRUE,ci=TRUE, of="thresholds",thresholds="best",print.thres="best")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
umbral<-plot.roc(datostrain$AKI_48hs,ajuste1$fitted.values, main="Confidence interval of a threshold", percent=TRUE,ci=TRUE,print.auc=TRUE, of="thresholds",thresholds="best",print.thres="best")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(ciobj, type="shape")
#potencia estadística de tu objeto.
power.roc.test(roc)
##
## One ROC curve power calculation
##
## ncases = 54
## ncontrols = 60
## auc = 0.8362654
## sig.level = 0.05
## power = 1
power.roc.test(ncases=54, ncontrols=60, auc=0.8382854, sig.level=0.05)
##
## One ROC curve power calculation
##
## ncases = 54
## ncontrols = 60
## auc = 0.8382854
## sig.level = 0.05
## power = 1
AUC 83.6% con la base de datos de entrenamiento.
Ahora utilizo el modelo para testearlo con la base de datos de testeo.
# Cálculo de la probabilidad predicha por el modelo con los datos de test
prob.modelo <- predict(object = ajuste1, newdata = datostest, type = "response")
# Vector de caracteres “Down”
pred.modelo <- rep(0, length(prob.modelo))
# Sustitución de “Down” por “Up” si la probabilidad a posteriori > 0,5
pred.modelo[prob.modelo > 0.5] <- 1
# Matriz de confusión
table(pred.modelo, datostest$AKI_48hs)
##
## pred.modelo 0 1
## 0 21 8
## 1 2 18
mean(pred.modelo != datostest$AKI_48hs)
## [1] 0.2040816
Utilizo la base de testeo para calcular ROC y AUC
# CON LA BASE DE TESTEO
#insertar AUC y IC
rocobj<-plot.roc(datostest$AKI_48hs,pred.modelo)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
rocobj <- plot.roc(datostest$AKI_48hs,pred.modelo,
main="Confidence intervals",
percent=TRUE,print.auc=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
#creamos objeto para plotear el IC por defecto al 95% CI. Boots Default: 2000.
ciobj <- ci.se(rocobj, specificities=seq(0, 100, 5))
#insertamos el IC
plot(ciobj, type="shape", col="#1c61b6AA") # plot as a blue shape
plot(ci(rocobj, of="thresholds", thresholds="best")) # add one threshold
#le inserto el punto de corte sobre la gráfica con su IC
umbral<-plot.roc(datostest$AKI_48hs,pred.modelo, main="Confidence interval of a threshold", percent=TRUE,ci=TRUE, of="thresholds",thresholds="best",print.thres="best")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
umbral<-plot.roc(datostest$AKI_48hs,pred.modelo, main="Confidence interval of a threshold", percent=TRUE,ci=TRUE,print.auc=TRUE, of="thresholds",thresholds="best",print.thres="best")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(ciobj, type="shape", col="#1c61b6AA")
#potencia estadística de tu objeto.
power.roc.test(roc)
##
## One ROC curve power calculation
##
## ncases = 54
## ncontrols = 60
## auc = 0.8362654
## sig.level = 0.05
## power = 1
library(caTools)
set.seed(123)
split <- sample.split(a$AKI_48hs, SplitRatio = 0.80)
training_set <- subset(a, split == TRUE)
test_set <- subset(a, split == FALSE)
library(caret)
folds <- createFolds(a$AKI_48hs, k = 5)
# Regresion Logistica
cvRegresionLogistica <- lapply(folds, function(x){
training_fold <- training_set[-x, ]
test_fold <- training_set[x, ]
clasificador <- glm(AKI_48hs~Relaparotomia + cistcat + bmicat +
lactatocat + astcat + BTcat , family = binomial, data = training_fold)
y_pred <- predict(clasificador, type = 'response', newdata = test_fold)
y_pred <- ifelse(y_pred > 0.5, 1, 0)
y_pred <- factor(y_pred, levels = c("0", "1"), labels = c("AKI no", "AKI si"))
cm <- table(test_fold$AKI_48hs, y_pred)
precision <- (cm[1,1] + cm[2,2]) / (cm[1,1] + cm[2,2] +cm[1,2] + cm[2,1])
return(precision)
})
precisionRegresionLogistica <- mean(as.numeric(cvRegresionLogistica))
precisionRegresionLogistica
## [1] 0.7122294
cvRegresionLogisticanocat <- lapply(folds, function(x){
training_fold <- training_set[-x, ]
test_fold <- training_set[x, ]
clasificadornocat <- glm(AKI_48hs~Relaparotomia + Cistatina + BMI +
Pico_Lactato + AST+Globulos_Rojos, family = binomial, data = training_fold)
y_pred <- predict(clasificadornocat, type = 'response', newdata = test_fold)
y_pred <- ifelse(y_pred > 0.5, 1, 0)
y_pred <- factor(y_pred, levels = c("0", "1"), labels = c("AKI no", "AKI si"))
cm <- table(test_fold$AKI_48hs, y_pred)
precisionnocat <- (cm[1,1] + cm[2,2]) / (cm[1,1] + cm[2,2] +cm[1,2] + cm[2,1])
return(precisionnocat)
})
precisionRegresionLogistica_nocat <- mean(as.numeric(cvRegresionLogisticanocat))
precisionRegresionLogistica_nocat
## [1] 0.5614719