Este material ha sido desarrollado en el marco del curso Tópicos Avanzados de la Maestría en Bioestadística.
La base de datos ha sido modificada exclusivamente con fines académicos.
El conjunto de datos contiene 1207 niños con DSS, con 25 variables clínicas y demográficas.
Variables:
library(tidyverse)
library(car)
library(rms) #Estrategias de modelamiento
library(ResourceSelection) #Selección de variables
library(pROC) #Discriminación AUC/ROC
library(caret) #Validación interna/externa
library(mfp) #RCS Splines cubicos restringidos
library(skimr) #Exploración inicial
skimr
skim(dataDSS)
Name | dataDSS |
Number of rows | 1207 |
Number of columns | 26 |
_______________________ | |
Column type frequency: | |
character | 11 |
numeric | 15 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
profound | 0 | 1.00 | 2 | 3 | 0 | 2 | 0 |
reshock | 0 | 1.00 | 2 | 3 | 0 | 2 | 0 |
inotrop | 0 | 1.00 | 2 | 3 | 0 | 2 | 0 |
severe_bleed | 0 | 1.00 | 2 | 3 | 0 | 2 | 0 |
organ_failure | 0 | 1.00 | 2 | 3 | 0 | 2 | 0 |
survive | 0 | 1.00 | 4 | 8 | 0 | 2 | 0 |
sex | 0 | 1.00 | 4 | 6 | 0 | 2 | 0 |
bleed_exam | 0 | 1.00 | 4 | 6 | 0 | 3 | 0 |
abtend | 2 | 1.00 | 2 | 3 | 0 | 2 | 0 |
ascite_eff | 3 | 1.00 | 2 | 3 | 0 | 2 | 0 |
PCRserotype | 40 | 0.97 | 5 | 8 | 0 | 6 | 0 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
st_no | 0 | 1.00 | 1441.42 | 497.83 | 102.0 | 1202.50 | 1513.0 | 1818.50 | 2123.0 | ▂▁▆▇▇ |
year | 0 | 1.00 | 2006.91 | 1.83 | 2003.0 | 2006.00 | 2007.0 | 2008.00 | 2009.0 | ▂▂▂▂▇ |
age | 0 | 1.00 | 9.32 | 3.13 | 0.0 | 7.00 | 10.0 | 12.00 | 15.0 | ▁▅▇▇▅ |
wt | 0 | 1.00 | 30.19 | 11.68 | 7.2 | 21.00 | 29.0 | 38.00 | 80.0 | ▅▇▃▁▁ |
day_ill | 0 | 1.00 | 5.08 | 0.87 | 3.0 | 5.00 | 5.0 | 6.00 | 10.0 | ▅▇▅▁▁ |
pp | 0 | 1.00 | 16.56 | 5.80 | 0.0 | 15.00 | 20.0 | 20.00 | 35.0 | ▁▁▇▁▁ |
sys_bp | 0 | 1.00 | 86.20 | 24.67 | 0.0 | 85.00 | 90.0 | 100.00 | 120.0 | ▁▁▁▇▅ |
pulse | 0 | 1.00 | 131.95 | 36.17 | 70.0 | 110.00 | 120.0 | 140.00 | 200.0 | ▁▇▂▁▃ |
hemo | 0 | 1.00 | 1.38 | 0.60 | 1.0 | 1.00 | 1.0 | 2.00 | 3.0 | ▇▁▃▁▁ |
temp | 1 | 1.00 | 37.17 | 0.42 | 36.5 | 37.00 | 37.0 | 37.00 | 39.5 | ▇▁▁▁▁ |
liver | 12 | 0.99 | 1.70 | 0.95 | 0.0 | 1.00 | 2.0 | 2.00 | 5.0 | ▆▇▃▁▁ |
hct | 21 | 0.98 | 50.24 | 5.39 | 34.0 | 46.72 | 49.6 | 53.01 | 65.0 | ▁▃▇▃▁ |
plt | 19 | 0.98 | 46072.96 | 38194.32 | 5400.0 | 26000.00 | 38000.0 | 54225.00 | 648000.0 | ▇▁▁▁▁ |
AST | 297 | 0.75 | 203.97 | 247.96 | 13.0 | 89.00 | 133.0 | 218.00 | 2741.0 | ▇▁▁▁▁ |
y | 0 | 1.00 | 0.18 | 0.39 | 0.0 | 0.00 | 0.0 | 0.00 | 1.0 | ▇▁▁▁▂ |
\[log(\frac{Profound_{DSS}}{1-Profound_{DSS}})=\beta_0 + \beta_1Age+ \beta_2day.ill+\beta_3 pulse+\beta_4temp+\beta_5hct +\beta_6sex + \beta_7 Hemo2 + \beta_8 hemo3\]
# Modelo Logístico propuesto en Tabla 2
datatrain<-datatrain%>%
select(y,age, day_ill, pulse, temp, hct, sex, hemo)
datatrain$hemo<-factor(datatrain$hemo)
# Eliminar faltantes
datatrain<-na.omit(datatrain)
# Casos disponibles para análisis
datatrain%>%
group_by(y)%>%
summarise(n=n(), per=n/nrow(datatrain)*100)
## # A tibble: 2 × 3
## y n per
## <dbl> <int> <dbl>
## 1 0 744 81.0
## 2 1 174 19.0
# Ajuste del modelo logístico completo
model<- glm(y ~., data=datatrain, family = binomial)
# Estimación del Logit
logit <- predict(model, type = "link")
# Seleccionar predictores continuos
mydata <- datatrain %>%
select(-y)%>%
dplyr::select_if(is.numeric)
predictors <- colnames(mydata)
# Visualización de datos
mydata <- mydata %>%
mutate(logit = logit) %>%
gather(key = "predictores", value = "valor", -logit)
ggplot(mydata, aes(logit, valor))+
geom_point(size = 0.5, alpha = 0.5) +
geom_smooth(method = "loess") +
theme_bw() +
facet_wrap(~predictores, scales = "free_y")
## `geom_smooth()` using formula = 'y ~ x'
\[log(\frac{Profound_{DSS}}{1-Profound_{DSS}})=\beta_0 + \beta_1 hct\]
model.hct.lineal <- glm(y ~ hct, data = datatrain, family = "binomial")
summary(model.hct.lineal)
##
## Call:
## glm(formula = y ~ hct, family = "binomial", data = datatrain)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.74154 0.78850 -6.013 1.82e-09 ***
## hct 0.06488 0.01530 4.241 2.23e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 891.49 on 917 degrees of freedom
## Residual deviance: 873.48 on 916 degrees of freedom
## AIC: 877.48
##
## Number of Fisher Scoring iterations: 4
datatrain$hctcat <- cut(datatrain$hct, breaks = c(-Inf, 45, 55, Inf), right = TRUE)
model.hct.cat <- glm(y ~ hctcat, data = datatrain, family = "binomial")
summary(model.hct.cat)
##
## Call:
## glm(formula = y ~ hctcat, family = "binomial", data = datatrain)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.3895 0.1998 -1.949 0.0513 .
## hctcat(45,55] -1.8733 0.2391 -7.833 4.75e-15 ***
## hctcat(55, Inf] 0.4043 0.2638 1.533 0.1253
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 891.49 on 917 degrees of freedom
## Residual deviance: 751.52 on 915 degrees of freedom
## AIC: 757.52
##
## Number of Fisher Scoring iterations: 5
Ejemplo con tres nodos:
\[log(\frac{Profound_{DSS}}{1-Profound_{DSS}})==\beta_0 + \beta_1 sp_1 (hct) + \beta_2 sp_2 (hct)\] Aqui, \(sp_1\) es la variable original hct y \(sp_2\) es una función compleja que genera la forma del RCS.
model.hct.rcs <- glm(y ~ rcs(hct,3), data = datatrain, family = "binomial")
summary(model.hct.rcs)
##
## Call:
## glm(formula = y ~ rcs(hct, 3), family = "binomial", data = datatrain)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 15.38739 1.83243 8.397 <2e-16 ***
## rcs(hct, 3)hct -0.37505 0.03951 -9.492 <2e-16 ***
## rcs(hct, 3)hct' 0.63862 0.05377 11.878 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 891.49 on 917 degrees of freedom
## Residual deviance: 662.94 on 915 degrees of freedom
## AIC: 668.94
##
## Number of Fisher Scoring iterations: 5
# Muestra: División en deciles y % del evento por decil
probs = seq(0, 1, by = 0.1)
decile_breaks <- unique(quantile(datatrain$hct, probs = seq(0, 1, by = 0.1)))
tablehct <- datatrain %>%
mutate(hct_decile = cut(hct, breaks = decile_breaks, include.lowest = TRUE)) %>%
group_by(hct_decile) %>%
summarise(mean_hct = mean(hct), prop = mean(y), logit = log(prop / (1 - prop)), n = n())
#Predicción por cada modelo
datatrain$predicthct.lineal=predict(model.hct.lineal, data=datatrain, type="link")
datatrain$predicthct.cat=predict(model.hct.cat,data=datatrain, type="link")
datatrain$predicthct.rcs=predict(model.hct.rcs, data=datatrain, type="link")
# Unificación de datos en formato largo
df_hct<-datatrain%>%
select(hct, predicthct.lineal, predicthct.cat, predicthct.rcs)%>%
bind_rows(
datatrain %>% mutate(modelo = "lineal", pred = predicthct.lineal),
datatrain %>% mutate(modelo = "cat", pred = predicthct.cat),
datatrain %>% mutate(modelo = "rcs", pred = predicthct.rcs)
)
ggplot() +
# LOESS para lineal y rcs
geom_smooth(
data = df_hct %>% filter(modelo %in% c("lineal", "rcs")),
aes(x = hct, y = pred, color = modelo),
method = "loess", se = FALSE
) +
# Línea directa para cat
geom_line(
data = df_hct %>% filter(modelo == "cat"),
aes(x = hct, y = pred, color = modelo)
) +
# Puntos adicionales de tablehct
geom_point(
data = tablehct, aes(x = mean_hct, y = logit),
inherit.aes = FALSE, shape = 21, size = 2, fill = "black"
) +
labs(x = "Hematocrito (hct)", y = "logit", color = "Modelo") + theme_gray()
## LL/AIC
LL.model<- data.frame(model=c("lineal", "cat", "rcs"),
logLik=c(logLik(model.hct.lineal),
logLik(model.hct.cat),
logLik(model.hct.rcs)),
AIC=c(AIC(model.hct.lineal),
AIC(model.hct.cat),
AIC(model.hct.rcs))
)
knitr::kable(LL.model, digits=3)
model | logLik | AIC |
---|---|---|
lineal | -436.741 | 877.482 |
cat | -375.759 | 757.518 |
rcs | -331.468 | 668.937 |