Librerías

A continuación se presentan las librerías que se utilizaron en la realización de esta práctica

library(tidyverse)
library(caret)
library(e1071)
library(ROSE)

Carga de Datos y Selección Final de Variables

Se seleccionó la variable death como variable objetivo.

# Carga de Datos
support2 <- read_csv("support2.csv")

## Rows: 9105 Columns: 47
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (8): sex, dzgroup, dzclass, income, race, ca, dnr, sfdm2
## dbl (39): age, death, hospdead, slos, d.time, num.co, edu, scoma, charges, t...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Cambiamos el tipo de dato de la variable
# death a factor
support2$death <- as.factor(support2$death)

# Seleccionamos las variables a utilizar
support2 <- support2 |>
  select(death, meanbp, hrt, wblc,
         crea, bun, ph, pafi
         )

Tratamiento de Valores Nulos

A continuación se presenta el manejo de valores nulos para este conjunto de datos.

# Tratamiento de NA's
colSums(is.na(support2))

##  death meanbp    hrt   wblc   crea    bun     ph   pafi 
##      0      1      1    212     67   4352   2284   2325

summary(support2)

##  death        meanbp            hrt              wblc             crea         
##  0:2904   Min.   :  0.00   Min.   :  0.00   Min.   :  0.00   Min.   : 0.09999  
##  1:6201   1st Qu.: 63.00   1st Qu.: 72.00   1st Qu.:  7.00   1st Qu.: 0.89990  
##           Median : 77.00   Median :100.00   Median : 10.60   Median : 1.19995  
##           Mean   : 84.55   Mean   : 97.16   Mean   : 12.35   Mean   : 1.77096  
##           3rd Qu.:107.00   3rd Qu.:120.00   3rd Qu.: 15.30   3rd Qu.: 1.89990  
##           Max.   :195.00   Max.   :300.00   Max.   :200.00   Max.   :21.50000  
##           NA's   :1        NA's   :1        NA's   :212      NA's   :67        
##       bun               ph             pafi      
##  Min.   :  1.00   Min.   :6.829   Min.   : 12.0  
##  1st Qu.: 14.00   1st Qu.:7.380   1st Qu.:155.1  
##  Median : 23.00   Median :7.420   Median :224.0  
##  Mean   : 32.35   Mean   :7.415   Mean   :239.5  
##  3rd Qu.: 42.00   3rd Qu.:7.470   3rd Qu.:304.8  
##  Max.   :300.00   Max.   :7.770   Max.   :890.4  
##  NA's   :4352     NA's   :2284    NA's   :2325

# Las variables con un sólo NA las inputamos usando la mediana
support2$meanbp[is.na(support2$meanbp)] <- median(support2$meanbp, na.rm = T)
support2$hrt[is.na(support2$hrt)] <- median(support2$hrt, na.rm = T)
# Imputamos la variable wblc con la mediana dado que su distribución
# no es simétrica
support2$wblc[is.na(support2$wblc)] <- median(support2$wblc, na.rm = T)
# Imputamos la variable crea con la mediana dado que su distribución
# no es simétrica
support2$crea[is.na(support2$crea)] <- median(support2$crea, na.rm = T)
# Imputamos la variable ph con la media
support2$ph[is.na(support2$ph)] <- mean(support2$ph, na.rm = T)
# Imputamos la variable pafi con la mediana dado que su distribución
# no es simétrica
support2$pafi[is.na(support2$pafi)] <- median(support2$pafi, na.rm = T)
# Eliminamos la variable bun dado que cuenta con un número elevado de NA
# Cercano al 50% del conjunto
round((sum(is.na(support2$bun)) / nrow(support2)) * 100, 2)

## [1] 47.8

support2 <- support2 |>
  select(-bun)

summary(support2)

##  death        meanbp            hrt              wblc             crea         
##  0:2904   Min.   :  0.00   Min.   :  0.00   Min.   :  0.00   Min.   : 0.09999  
##  1:6201   1st Qu.: 63.00   1st Qu.: 72.00   1st Qu.:  7.00   1st Qu.: 0.89990  
##           Median : 77.00   Median :100.00   Median : 10.60   Median : 1.19995  
##           Mean   : 84.55   Mean   : 97.16   Mean   : 12.31   Mean   : 1.76676  
##           3rd Qu.:107.00   3rd Qu.:120.00   3rd Qu.: 15.10   3rd Qu.: 1.89990  
##           Max.   :195.00   Max.   :300.00   Max.   :200.00   Max.   :21.50000  
##        ph             pafi      
##  Min.   :6.829   Min.   : 12.0  
##  1st Qu.:7.399   1st Qu.:180.0  
##  Median :7.415   Median :224.0  
##  Mean   :7.415   Mean   :235.6  
##  3rd Qu.:7.449   3rd Qu.:274.2  
##  Max.   :7.770   Max.   :890.4

Outliers

A continuación se presenta el tratamiento de outliers.

# Outliers
ggplot(support2, aes(y = meanbp)) +
  geom_boxplot(fill = "skyblue") +
  labs(title = "Boxplot de meanbp") +
  theme_minimal()

ggplot(support2, aes(y = hrt)) +
  geom_boxplot(fill = "tomato") +
  labs(title = "Boxplot de hrt") +
  theme_minimal()

ggplot(support2, aes(y = wblc)) +
  geom_boxplot(fill = "lightgreen") +
  labs(title = "Boxplot de wblc") +
  theme_minimal()

ggplot(support2, aes(y = crea)) +
  geom_boxplot(fill = "orchid") +
  labs(title = "Boxplot de crea") +
  theme_minimal()

ggplot(support2, aes(y = ph)) +
  geom_boxplot(fill = "hotpink") +
  labs(title = "Boxplot de ph") +
  theme_minimal()

ggplot(support2, aes(y = pafi)) +
  geom_boxplot(fill = "orange") +
  labs(title = "Boxplot de pafi") +
  theme_minimal()

# Se eliminan los registros con los valores más extremos 
# del conjunto de datos

support2 <- support2 |>
  filter(support2[["wblc"]] <= 150)

support2 <- support2 |>
  filter(support2[["crea"]] <= 15)

Modelado

Para este proyecto se tomó el criterio 70-30 para separar el conjunto de datos en entrenamiento y prueba, respectivamente.

# Fijamos la semilla
set.seed(091243)

# Índicesparadividireldataset
train_indices <- sample(seq_len(nrow(support2)),
                      size=0.7*nrow(support2))
# Conjuntosdeentrenamientoyprueba
train_data <- support2[train_indices,]
test_data <- support2[-train_indices,]

Regresión Logistica

# Regresión logística conjunto desbalanceado (original)
regresionL <- glm(death ~ ., data = train_data, family = "binomial")
prob_logit <- predict(regresionL, newdata = test_data, type = "response")
pred_logit <- ifelse(prob_logit > 0.5, "1", "0") |> factor(levels = c("0", "1"))
test_data$death <- factor(test_data$death, levels = c(0, 1))
confusionMatrix(pred_logit, test_data$death)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0    0    0
##          1  863 1868
##                                           
##                Accuracy : 0.684           
##                  95% CI : (0.6662, 0.7014)
##     No Information Rate : 0.684           
##     P-Value [Acc > NIR] : 0.5092          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.000           
##             Specificity : 1.000           
##          Pos Pred Value :   NaN           
##          Neg Pred Value : 0.684           
##              Prevalence : 0.316           
##          Detection Rate : 0.000           
##    Detection Prevalence : 0.000           
##       Balanced Accuracy : 0.500           
##                                           
##        'Positive' Class : 0               
##

# Regresión logística balanceado (usando la librería ROSE)
train_data_bal <- ROSE(death ~ ., data = train_data, seed = 091243)$data
modelo_logit_rose <- glm(death ~ ., data = train_data_bal, family = "binomial")
prob_logit_rose <- predict(modelo_logit_rose, newdata = test_data, type = "response")
pred_logit_rose <- ifelse(prob_logit_rose > 0.5, "1", "0") |> factor(levels = c("0", "1"))
test_data$death <- factor(test_data$death, levels = c("0", "1"))
confusionMatrix(pred_logit_rose, test_data$death)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0  160  332
##          1  703 1536
##                                           
##                Accuracy : 0.621           
##                  95% CI : (0.6025, 0.6393)
##     No Information Rate : 0.684           
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0087          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.18540         
##             Specificity : 0.82227         
##          Pos Pred Value : 0.32520         
##          Neg Pred Value : 0.68602         
##              Prevalence : 0.31600         
##          Detection Rate : 0.05859         
##    Detection Prevalence : 0.18015         
##       Balanced Accuracy : 0.50383         
##                                           
##        'Positive' Class : 0               
##

Support Vector Machine

# SVM conjunto desbalanceado (original)
modelo_svm <- svm(death ~ ., 
                  data = train_data)
pred_svm <- predict(modelo_svm, newdata = test_data)
svm_pred <- predict(modelo_svm, newdata = test_data)
confusionMatrix(svm_pred, test_data$death)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0    1    2
##          1  862 1866
##                                           
##                Accuracy : 0.6836          
##                  95% CI : (0.6658, 0.7011)
##     No Information Rate : 0.684           
##     P-Value [Acc > NIR] : 0.5256          
##                                           
##                   Kappa : 1e-04           
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.0011587       
##             Specificity : 0.9989293       
##          Pos Pred Value : 0.3333333       
##          Neg Pred Value : 0.6840176       
##              Prevalence : 0.3160015       
##          Detection Rate : 0.0003662       
##    Detection Prevalence : 0.0010985       
##       Balanced Accuracy : 0.5000440       
##                                           
##        'Positive' Class : 0               
##

# SVM conjunto balanceado (se le da peso a la clase menos común)
modelo_svm <- svm(death ~ ., 
                  data = train_data,
                  class.weights = c("0" = 2.1, "1" = 1))
pred_svm <- predict(modelo_svm, newdata = test_data)
svm_pred <- predict(modelo_svm, newdata = test_data)
confusionMatrix(svm_pred, test_data$death)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0  453  833
##          1  410 1035
##                                          
##                Accuracy : 0.5449         
##                  95% CI : (0.526, 0.5637)
##     No Information Rate : 0.684          
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.0698         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.5249         
##             Specificity : 0.5541         
##          Pos Pred Value : 0.3523         
##          Neg Pred Value : 0.7163         
##              Prevalence : 0.3160         
##          Detection Rate : 0.1659         
##    Detection Prevalence : 0.4709         
##       Balanced Accuracy : 0.5395         
##                                          
##        'Positive' Class : 0              
##

Anexo I - Código

Técnicas Estadísticas y Minería de Datos.

Módulo 3 - Minería de datos

Ángel Miranda Razo

05 de agosto, 2025

Librerías

Carga de Datos y Selección Final de Variables

Tratamiento de Valores Nulos

Outliers

Modelado

Regresión Logistica

Support Vector Machine

Anexo I - Código

Técnicas Estadísticas y Minería de Datos. Módulo 3 - Minería de datos

Ángel Miranda Razo

05 de agosto, 2025

Librerías

Carga de Datos y Selección Final de Variables

Tratamiento de Valores Nulos

Outliers

Modelado

Regresión Logistica

Support Vector Machine

Técnicas Estadísticas y Minería de Datos.

Módulo 3 - Minería de datos