Técnicas Estadísticas y Minería de Datos.
Módulo 3 - Minería de datos
A continuación se presentan las librerías que se utilizaron en la realización de esta práctica
library(tidyverse)
library(caret)
library(e1071)
library(ROSE)
Se seleccionó la variable death
como variable
objetivo.
# Carga de Datos
support2 <- read_csv("support2.csv")
## Rows: 9105 Columns: 47
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): sex, dzgroup, dzclass, income, race, ca, dnr, sfdm2
## dbl (39): age, death, hospdead, slos, d.time, num.co, edu, scoma, charges, t...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Cambiamos el tipo de dato de la variable
# death a factor
support2$death <- as.factor(support2$death)
# Seleccionamos las variables a utilizar
support2 <- support2 |>
select(death, meanbp, hrt, wblc,
crea, bun, ph, pafi
)
A continuación se presenta el manejo de valores nulos para este conjunto de datos.
# Tratamiento de NA's
colSums(is.na(support2))
## death meanbp hrt wblc crea bun ph pafi
## 0 1 1 212 67 4352 2284 2325
summary(support2)
## death meanbp hrt wblc crea
## 0:2904 Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.09999
## 1:6201 1st Qu.: 63.00 1st Qu.: 72.00 1st Qu.: 7.00 1st Qu.: 0.89990
## Median : 77.00 Median :100.00 Median : 10.60 Median : 1.19995
## Mean : 84.55 Mean : 97.16 Mean : 12.35 Mean : 1.77096
## 3rd Qu.:107.00 3rd Qu.:120.00 3rd Qu.: 15.30 3rd Qu.: 1.89990
## Max. :195.00 Max. :300.00 Max. :200.00 Max. :21.50000
## NA's :1 NA's :1 NA's :212 NA's :67
## bun ph pafi
## Min. : 1.00 Min. :6.829 Min. : 12.0
## 1st Qu.: 14.00 1st Qu.:7.380 1st Qu.:155.1
## Median : 23.00 Median :7.420 Median :224.0
## Mean : 32.35 Mean :7.415 Mean :239.5
## 3rd Qu.: 42.00 3rd Qu.:7.470 3rd Qu.:304.8
## Max. :300.00 Max. :7.770 Max. :890.4
## NA's :4352 NA's :2284 NA's :2325
# Las variables con un sólo NA las inputamos usando la mediana
support2$meanbp[is.na(support2$meanbp)] <- median(support2$meanbp, na.rm = T)
support2$hrt[is.na(support2$hrt)] <- median(support2$hrt, na.rm = T)
# Imputamos la variable wblc con la mediana dado que su distribución
# no es simétrica
support2$wblc[is.na(support2$wblc)] <- median(support2$wblc, na.rm = T)
# Imputamos la variable crea con la mediana dado que su distribución
# no es simétrica
support2$crea[is.na(support2$crea)] <- median(support2$crea, na.rm = T)
# Imputamos la variable ph con la media
support2$ph[is.na(support2$ph)] <- mean(support2$ph, na.rm = T)
# Imputamos la variable pafi con la mediana dado que su distribución
# no es simétrica
support2$pafi[is.na(support2$pafi)] <- median(support2$pafi, na.rm = T)
# Eliminamos la variable bun dado que cuenta con un número elevado de NA
# Cercano al 50% del conjunto
round((sum(is.na(support2$bun)) / nrow(support2)) * 100, 2)
## [1] 47.8
support2 <- support2 |>
select(-bun)
summary(support2)
## death meanbp hrt wblc crea
## 0:2904 Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.09999
## 1:6201 1st Qu.: 63.00 1st Qu.: 72.00 1st Qu.: 7.00 1st Qu.: 0.89990
## Median : 77.00 Median :100.00 Median : 10.60 Median : 1.19995
## Mean : 84.55 Mean : 97.16 Mean : 12.31 Mean : 1.76676
## 3rd Qu.:107.00 3rd Qu.:120.00 3rd Qu.: 15.10 3rd Qu.: 1.89990
## Max. :195.00 Max. :300.00 Max. :200.00 Max. :21.50000
## ph pafi
## Min. :6.829 Min. : 12.0
## 1st Qu.:7.399 1st Qu.:180.0
## Median :7.415 Median :224.0
## Mean :7.415 Mean :235.6
## 3rd Qu.:7.449 3rd Qu.:274.2
## Max. :7.770 Max. :890.4
A continuación se presenta el tratamiento de outliers.
# Outliers
ggplot(support2, aes(y = meanbp)) +
geom_boxplot(fill = "skyblue") +
labs(title = "Boxplot de meanbp") +
theme_minimal()
ggplot(support2, aes(y = hrt)) +
geom_boxplot(fill = "tomato") +
labs(title = "Boxplot de hrt") +
theme_minimal()
ggplot(support2, aes(y = wblc)) +
geom_boxplot(fill = "lightgreen") +
labs(title = "Boxplot de wblc") +
theme_minimal()
ggplot(support2, aes(y = crea)) +
geom_boxplot(fill = "orchid") +
labs(title = "Boxplot de crea") +
theme_minimal()
ggplot(support2, aes(y = ph)) +
geom_boxplot(fill = "hotpink") +
labs(title = "Boxplot de ph") +
theme_minimal()
ggplot(support2, aes(y = pafi)) +
geom_boxplot(fill = "orange") +
labs(title = "Boxplot de pafi") +
theme_minimal()
# Se eliminan los registros con los valores más extremos
# del conjunto de datos
support2 <- support2 |>
filter(support2[["wblc"]] <= 150)
support2 <- support2 |>
filter(support2[["crea"]] <= 15)
Para este proyecto se tomó el criterio 70-30 para separar el conjunto de datos en entrenamiento y prueba, respectivamente.
# Fijamos la semilla
set.seed(091243)
# Índicesparadividireldataset
train_indices <- sample(seq_len(nrow(support2)),
size=0.7*nrow(support2))
# Conjuntosdeentrenamientoyprueba
train_data <- support2[train_indices,]
test_data <- support2[-train_indices,]
# Regresión logística conjunto desbalanceado (original)
regresionL <- glm(death ~ ., data = train_data, family = "binomial")
prob_logit <- predict(regresionL, newdata = test_data, type = "response")
pred_logit <- ifelse(prob_logit > 0.5, "1", "0") |> factor(levels = c("0", "1"))
test_data$death <- factor(test_data$death, levels = c(0, 1))
confusionMatrix(pred_logit, test_data$death)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 0 0
## 1 863 1868
##
## Accuracy : 0.684
## 95% CI : (0.6662, 0.7014)
## No Information Rate : 0.684
## P-Value [Acc > NIR] : 0.5092
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.000
## Specificity : 1.000
## Pos Pred Value : NaN
## Neg Pred Value : 0.684
## Prevalence : 0.316
## Detection Rate : 0.000
## Detection Prevalence : 0.000
## Balanced Accuracy : 0.500
##
## 'Positive' Class : 0
##
# Regresión logística balanceado (usando la librería ROSE)
train_data_bal <- ROSE(death ~ ., data = train_data, seed = 091243)$data
modelo_logit_rose <- glm(death ~ ., data = train_data_bal, family = "binomial")
prob_logit_rose <- predict(modelo_logit_rose, newdata = test_data, type = "response")
pred_logit_rose <- ifelse(prob_logit_rose > 0.5, "1", "0") |> factor(levels = c("0", "1"))
test_data$death <- factor(test_data$death, levels = c("0", "1"))
confusionMatrix(pred_logit_rose, test_data$death)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 160 332
## 1 703 1536
##
## Accuracy : 0.621
## 95% CI : (0.6025, 0.6393)
## No Information Rate : 0.684
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0087
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.18540
## Specificity : 0.82227
## Pos Pred Value : 0.32520
## Neg Pred Value : 0.68602
## Prevalence : 0.31600
## Detection Rate : 0.05859
## Detection Prevalence : 0.18015
## Balanced Accuracy : 0.50383
##
## 'Positive' Class : 0
##
# SVM conjunto desbalanceado (original)
modelo_svm <- svm(death ~ .,
data = train_data)
pred_svm <- predict(modelo_svm, newdata = test_data)
svm_pred <- predict(modelo_svm, newdata = test_data)
confusionMatrix(svm_pred, test_data$death)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1 2
## 1 862 1866
##
## Accuracy : 0.6836
## 95% CI : (0.6658, 0.7011)
## No Information Rate : 0.684
## P-Value [Acc > NIR] : 0.5256
##
## Kappa : 1e-04
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.0011587
## Specificity : 0.9989293
## Pos Pred Value : 0.3333333
## Neg Pred Value : 0.6840176
## Prevalence : 0.3160015
## Detection Rate : 0.0003662
## Detection Prevalence : 0.0010985
## Balanced Accuracy : 0.5000440
##
## 'Positive' Class : 0
##
# SVM conjunto balanceado (se le da peso a la clase menos común)
modelo_svm <- svm(death ~ .,
data = train_data,
class.weights = c("0" = 2.1, "1" = 1))
pred_svm <- predict(modelo_svm, newdata = test_data)
svm_pred <- predict(modelo_svm, newdata = test_data)
confusionMatrix(svm_pred, test_data$death)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 453 833
## 1 410 1035
##
## Accuracy : 0.5449
## 95% CI : (0.526, 0.5637)
## No Information Rate : 0.684
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0698
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.5249
## Specificity : 0.5541
## Pos Pred Value : 0.3523
## Neg Pred Value : 0.7163
## Prevalence : 0.3160
## Detection Rate : 0.1659
## Detection Prevalence : 0.4709
## Balanced Accuracy : 0.5395
##
## 'Positive' Class : 0
##