Paso 1. Preparación inicial y limpieza de los datos:
heart <- read.csv("TrabajoEnCLase.csv")
library(tidyverse)
library(naniar)
library(caret)
library(kknn)
library(class)
head(heart)
## age anaemia creatinine_phosphokinase diabetes ejection_fraction
## 1 75 0 582 0 20
## 2 55 0 7861 0 38
## 3 65 0 146 0 20
## 4 50 1 111 0 20
## 5 65 1 160 1 20
## 6 90 1 47 0 40
## high_blood_pressure platelets serum_creatinine serum_sodium sex smoking time
## 1 1 265000 1.9 130 1 0 4
## 2 0 263358 1.1 136 1 0 6
## 3 0 162000 1.3 129 1 1 7
## 4 0 210000 1.9 137 1 0 7
## 5 0 327000 2.7 116 0 0 8
## 6 1 204000 2.1 132 1 1 8
## DEATH_EVENT
## 1 1
## 2 1
## 3 1
## 4 1
## 5 1
## 6 1
dim(heart)
## [1] 299 13
summary(heart)
## age anaemia creatinine_phosphokinase diabetes
## Min. :40.00 Min. :0.0000 Min. : 23.0 Min. :0.0000
## 1st Qu.:51.00 1st Qu.:0.0000 1st Qu.: 116.5 1st Qu.:0.0000
## Median :60.00 Median :0.0000 Median : 250.0 Median :0.0000
## Mean :60.83 Mean :0.4314 Mean : 581.8 Mean :0.4181
## 3rd Qu.:70.00 3rd Qu.:1.0000 3rd Qu.: 582.0 3rd Qu.:1.0000
## Max. :95.00 Max. :1.0000 Max. :7861.0 Max. :1.0000
## ejection_fraction high_blood_pressure platelets serum_creatinine
## Min. :14.00 Min. :0.0000 Min. : 25100 Min. :0.500
## 1st Qu.:30.00 1st Qu.:0.0000 1st Qu.:212500 1st Qu.:0.900
## Median :38.00 Median :0.0000 Median :262000 Median :1.100
## Mean :38.08 Mean :0.3512 Mean :263358 Mean :1.394
## 3rd Qu.:45.00 3rd Qu.:1.0000 3rd Qu.:303500 3rd Qu.:1.400
## Max. :80.00 Max. :1.0000 Max. :850000 Max. :9.400
## serum_sodium sex smoking time
## Min. :113.0 Min. :0.0000 Min. :0.0000 Min. : 4.0
## 1st Qu.:134.0 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.: 73.0
## Median :137.0 Median :1.0000 Median :0.0000 Median :115.0
## Mean :136.6 Mean :0.6488 Mean :0.3211 Mean :130.3
## 3rd Qu.:140.0 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:203.0
## Max. :148.0 Max. :1.0000 Max. :1.0000 Max. :285.0
## DEATH_EVENT
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.3211
## 3rd Qu.:1.0000
## Max. :1.0000
heart$DEATH_EVENT <- as.factor(heart$DEATH_EVENT)
colSums(is.na(heart))
## age anaemia creatinine_phosphokinase
## 0 0 0
## diabetes ejection_fraction high_blood_pressure
## 0 0 0
## platelets serum_creatinine serum_sodium
## 0 0 0
## sex smoking time
## 0 0 0
## DEATH_EVENT
## 0
Se exploró el conjunto de datos y no se observaron valores faltantes, por lo cual decidimos continuar con el análisis.
Paso 2: Dividir los datos en conjunto de entrenamiento y prueba:
set.seed(123)
folds <- createFolds(heart$DEATH_EVENT, k = 6)
entrenamiento <- heart[-folds[[6]], ]
prueba <- heart[folds[[6]], ]
dim(entrenamiento)
## [1] 249 13
dim(prueba)
## [1] 50 13
entrenamiento_labels <- heart$DEATH_EVENT[-folds[[6]]]
prueba_labels <- heart$DEATH_EVENT[folds[[6]]]
Los datos se dividieron en conjuntos de entrenamiento y prueba para evaluar el modelo con observaciones no utilizadas durante el ajuste.
Paso 3: Aplicar el algoritmo K-NN:
train.kknn(as.factor(entrenamiento_labels) ~ .,
data = entrenamiento[, names(entrenamiento) != "DEATH_EVENT"],
kmax = 50)
##
## Call:
## train.kknn(formula = as.factor(entrenamiento_labels) ~ ., data = entrenamiento[, names(entrenamiento) != "DEATH_EVENT"], kmax = 50)
##
## Type of response variable: nominal
## Minimal misclassification: 0.253012
## Best kernel: optimal
## Best k: 16
entrenamiento_labels <- as.factor(entrenamiento_labels)
prueba_labels <- as.factor(prueba_labels)
entrenamiento_x <- entrenamiento[, names(entrenamiento) != "DEATH_EVENT"]
prueba_x <- prueba[, names(prueba) != "DEATH_EVENT"]
entrenamiento_x <- as.data.frame(scale(entrenamiento_x))
prueba_x <- as.data.frame(scale(prueba_x))
pred <- knn(train = entrenamiento_x,
test = prueba_x,
cl = entrenamiento_labels,
k = 16)
confusionMatrix(data = pred, reference = prueba_labels)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 34 11
## 1 0 5
##
## Accuracy : 0.78
## 95% CI : (0.6404, 0.8847)
## No Information Rate : 0.68
## P-Value [Acc > NIR] : 0.083185
##
## Kappa : 0.382
##
## Mcnemar's Test P-Value : 0.002569
##
## Sensitivity : 1.0000
## Specificity : 0.3125
## Pos Pred Value : 0.7556
## Neg Pred Value : 1.0000
## Prevalence : 0.6800
## Detection Rate : 0.6800
## Detection Prevalence : 0.9000
## Balanced Accuracy : 0.6562
##
## 'Positive' Class : 0
##
Se aplicó K-NN con variables estandarizadas y k = 16, valor óptimo determinado previamente.
Paso 4: Validar la estabilidad del modelo
exactitud <- numeric(length = 6)
for(i in 1:6){
prueba_i <- heart[folds[[i]], ]
entrenamiento_i <- heart[-folds[[i]], ]
entrenamiento_labels_i <- as.factor(heart$DEATH_EVENT[-folds[[i]]])
prueba_labels_i <- as.factor(heart$DEATH_EVENT[folds[[i]]])
entrenamiento_x_i <- entrenamiento_i[, names(entrenamiento_i) != "DEATH_EVENT"]
prueba_x_i <- prueba_i[, names(prueba_i) != "DEATH_EVENT"]
entrenamiento_x_i <- as.data.frame(scale(entrenamiento_x_i))
prueba_x_i <- as.data.frame(scale(prueba_x_i))
pred_i <- knn(train = entrenamiento_x_i,
test = prueba_x_i,
cl = entrenamiento_labels_i,
k = 16)
cm_i <- confusionMatrix(pred_i, prueba_labels_i)
exactitud[i] <- cm_i$overall["Accuracy"]
cat("Fold", i, "- Exactitud:", exactitud[i], "\n")
}
## Fold 1 - Exactitud: 0.755102
## Fold 2 - Exactitud: 0.72
## Fold 3 - Exactitud: 0.72
## Fold 4 - Exactitud: 0.72
## Fold 5 - Exactitud: 0.78
## Fold 6 - Exactitud: 0.8
Exactitud_promedio <- round(mean(exactitud), 4) * 100
paste("Exactitud promedio:", Exactitud_promedio, "%")
## [1] "Exactitud promedio: 74.92 %"
Se muestra un modelo estable, con una exactitud promedio de 74.92%, manteniendo resultados similares entre los diferentes folds.
Paso 5: Interpretación de los resultados finales:
set.seed(123)
train_control <- trainControl(method = "cv",
number = 10,
savePredictions = TRUE)
heart_norm <- as.data.frame(scale(heart[, names(heart) != "DEATH_EVENT"]))
heart_knn <- cbind(heart_norm, DEATH_EVENT = as.factor(heart$DEATH_EVENT))
knn_cv <- train(DEATH_EVENT ~ .,
data = heart_knn,
method = "knn",
trControl = train_control,
tuneGrid = data.frame(k = 16))
knn_cv
## k-Nearest Neighbors
##
## 299 samples
## 12 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 268, 269, 269, 270, 270, 269, ...
## Resampling results:
##
## Accuracy Kappa
## 0.7490768 0.2883127
##
## Tuning parameter 'k' was held constant at a value of 16
confusionMatrix(data = pred, reference = prueba_labels)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 34 11
## 1 0 5
##
## Accuracy : 0.78
## 95% CI : (0.6404, 0.8847)
## No Information Rate : 0.68
## P-Value [Acc > NIR] : 0.083185
##
## Kappa : 0.382
##
## Mcnemar's Test P-Value : 0.002569
##
## Sensitivity : 1.0000
## Specificity : 0.3125
## Pos Pred Value : 0.7556
## Neg Pred Value : 1.0000
## Prevalence : 0.6800
## Detection Rate : 0.6800
## Detection Prevalence : 0.9000
## Balanced Accuracy : 0.6562
##
## 'Positive' Class : 0
##
Presenta mejor desempeño al predecir la clase 0 que la clase 1, indicando un desbalance en la capacidad de clasificación.
Paso 1. Preparación inicial y limpieza de los datos:
red <- read.csv("winequality-red.csv",sep=";")
white <- read.csv("winequality-white.csv",sep=";")
vino_total <- rbind(red, white)
library(tidyverse)
library(naniar)
library(caret)
library(kknn)
library(class)
head(vino_total)
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1 7.4 0.70 0.00 1.9 0.076
## 2 7.8 0.88 0.00 2.6 0.098
## 3 7.8 0.76 0.04 2.3 0.092
## 4 11.2 0.28 0.56 1.9 0.075
## 5 7.4 0.70 0.00 1.9 0.076
## 6 7.4 0.66 0.00 1.8 0.075
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol
## 1 11 34 0.9978 3.51 0.56 9.4
## 2 25 67 0.9968 3.20 0.68 9.8
## 3 15 54 0.9970 3.26 0.65 9.8
## 4 17 60 0.9980 3.16 0.58 9.8
## 5 11 34 0.9978 3.51 0.56 9.4
## 6 13 40 0.9978 3.51 0.56 9.4
## quality
## 1 5
## 2 5
## 3 5
## 4 6
## 5 5
## 6 5
dim(vino_total)
## [1] 6497 12
summary(vino_total)
## fixed.acidity volatile.acidity citric.acid residual.sugar
## Min. : 3.800 Min. :0.0800 Min. :0.0000 Min. : 0.600
## 1st Qu.: 6.400 1st Qu.:0.2300 1st Qu.:0.2500 1st Qu.: 1.800
## Median : 7.000 Median :0.2900 Median :0.3100 Median : 3.000
## Mean : 7.215 Mean :0.3397 Mean :0.3186 Mean : 5.443
## 3rd Qu.: 7.700 3rd Qu.:0.4000 3rd Qu.:0.3900 3rd Qu.: 8.100
## Max. :15.900 Max. :1.5800 Max. :1.6600 Max. :65.800
## chlorides free.sulfur.dioxide total.sulfur.dioxide density
## Min. :0.00900 Min. : 1.00 Min. : 6.0 Min. :0.9871
## 1st Qu.:0.03800 1st Qu.: 17.00 1st Qu.: 77.0 1st Qu.:0.9923
## Median :0.04700 Median : 29.00 Median :118.0 Median :0.9949
## Mean :0.05603 Mean : 30.53 Mean :115.7 Mean :0.9947
## 3rd Qu.:0.06500 3rd Qu.: 41.00 3rd Qu.:156.0 3rd Qu.:0.9970
## Max. :0.61100 Max. :289.00 Max. :440.0 Max. :1.0390
## pH sulphates alcohol quality
## Min. :2.720 Min. :0.2200 Min. : 8.00 Min. :3.000
## 1st Qu.:3.110 1st Qu.:0.4300 1st Qu.: 9.50 1st Qu.:5.000
## Median :3.210 Median :0.5100 Median :10.30 Median :6.000
## Mean :3.219 Mean :0.5313 Mean :10.49 Mean :5.818
## 3rd Qu.:3.320 3rd Qu.:0.6000 3rd Qu.:11.30 3rd Qu.:6.000
## Max. :4.010 Max. :2.0000 Max. :14.90 Max. :9.000
vino_total$quality <- as.factor(vino_total$quality)
colSums(is.na(vino_total))
## fixed.acidity volatile.acidity citric.acid
## 0 0 0
## residual.sugar chlorides free.sulfur.dioxide
## 0 0 0
## total.sulfur.dioxide density pH
## 0 0 0
## sulphates alcohol quality
## 0 0 0
Se exploró el conjunto de datos, y no existen valores faltantes, por lo cual están listos para el análisis.
Paso 2: Dividir los datos en conjunto de entrenamiento y prueba:
set.seed(123)
folds <- createFolds(vino_total$quality, k = 6)
entrenamiento <- vino_total[-folds[[6]], ]
prueba <- vino_total[folds[[6]], ]
dim(entrenamiento)
## [1] 5413 12
dim(prueba)
## [1] 1084 12
entrenamiento_labels <- vino_total$quality[-folds[[6]]]
prueba_labels <- vino_total$quality[folds[[6]]]
Los datos se dividieron en conjuntos de entrenamiento y prueba para poder evaluar el modelo de forma más confiable.
Paso 3: Aplicar el algoritmo K-NN:
train.kknn(entrenamiento_labels ~ ., data = entrenamiento, kmax = 50)
##
## Call:
## train.kknn(formula = entrenamiento_labels ~ ., data = entrenamiento, kmax = 50)
##
## Type of response variable: nominal
## Minimal misclassification: 0.0182893
## Best kernel: optimal
## Best k: 5
entrenamiento_labels <- as.factor(entrenamiento_labels)
prueba_labels <- as.factor(prueba_labels)
entrenamiento_x <- entrenamiento[, names(entrenamiento) != "quality"]
prueba_x <- prueba[, names(prueba) != "quality"]
entrenamiento_x <- as.data.frame(scale(entrenamiento_x))
prueba_x <- as.data.frame(scale(prueba_x))
pred <- knn(train = entrenamiento_x,
test = prueba_x,
cl = entrenamiento_labels,
k = 5)
confusionMatrix(data = pred, reference = prueba_labels)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 3 4 5 6 7 8 9
## 3 1 1 0 0 0 0 0
## 4 0 3 7 2 0 0 0
## 5 1 15 219 101 11 1 0
## 6 3 15 119 312 75 14 1
## 7 0 2 12 54 87 14 0
## 8 0 0 0 4 7 3 0
## 9 0 0 0 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.5766
## 95% CI : (0.5465, 0.6062)
## No Information Rate : 0.4363
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.3492
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 3 Class: 4 Class: 5 Class: 6 Class: 7 Class: 8
## Sensitivity 0.2000000 0.083333 0.6134 0.6596 0.48333 0.093750
## Specificity 0.9990732 0.991412 0.8226 0.6285 0.90929 0.989544
## Pos Pred Value 0.5000000 0.250000 0.6293 0.5788 0.51479 0.214286
## Neg Pred Value 0.9963031 0.969216 0.8125 0.7046 0.89836 0.972897
## Prevalence 0.0046125 0.033210 0.3293 0.4363 0.16605 0.029520
## Detection Rate 0.0009225 0.002768 0.2020 0.2878 0.08026 0.002768
## Detection Prevalence 0.0018450 0.011070 0.3210 0.4972 0.15590 0.012915
## Balanced Accuracy 0.5995366 0.537373 0.7180 0.6440 0.69631 0.541647
## Class: 9
## Sensitivity 0.0000000
## Specificity 1.0000000
## Pos Pred Value NaN
## Neg Pred Value 0.9990775
## Prevalence 0.0009225
## Detection Rate 0.0000000
## Detection Prevalence 0.0000000
## Balanced Accuracy 0.5000000
data_z <- as.data.frame(scale(vino_total[, names(vino_total) != "quality"]))
entrenamiento_z <- data_z[-folds[[6]],]
prueba_z <- data_z[folds[[6]],]
pred_z <- knn(entrenamiento_z, prueba_z, cl = entrenamiento_labels, k = 5)
confusionMatrix(data = pred_z, reference = prueba_labels)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 3 4 5 6 7 8 9
## 3 0 1 0 0 0 0 0
## 4 1 2 7 1 0 0 0
## 5 1 18 214 100 10 1 0
## 6 3 14 123 310 73 13 0
## 7 0 1 12 56 89 14 1
## 8 0 0 1 6 8 4 0
## 9 0 0 0 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.571
## 95% CI : (0.5409, 0.6007)
## No Information Rate : 0.4363
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.3424
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 3 Class: 4 Class: 5 Class: 6 Class: 7 Class: 8
## Sensitivity 0.0000000 0.055556 0.5994 0.6554 0.4944 0.12500
## Specificity 0.9990732 0.991412 0.8212 0.6301 0.9071 0.98574
## Pos Pred Value 0.0000000 0.181818 0.6221 0.5784 0.5145 0.21053
## Neg Pred Value 0.9953832 0.968313 0.8068 0.7026 0.9001 0.97371
## Prevalence 0.0046125 0.033210 0.3293 0.4363 0.1661 0.02952
## Detection Rate 0.0000000 0.001845 0.1974 0.2860 0.0821 0.00369
## Detection Prevalence 0.0009225 0.010148 0.3173 0.4945 0.1596 0.01753
## Balanced Accuracy 0.4995366 0.523484 0.7103 0.6428 0.7008 0.55537
## Class: 9
## Sensitivity 0.0000000
## Specificity 1.0000000
## Pos Pred Value NaN
## Neg Pred Value 0.9990775
## Prevalence 0.0009225
## Detection Rate 0.0000000
## Detection Prevalence 0.0000000
## Balanced Accuracy 0.5000000
El modelo K-NN con k = 5 presenta un desempeño moderado, con una exactitud cercana al 57%, mostrando mayor capacidad para predecir clases intermedias y dificultades en clases menos frecuentes.
Paso 4: Validar la estabilidad del modelo
data_norm <- as.data.frame(scale(vino_total[, names(vino_total) != "quality"]))
set.seed(123)
folds <- createFolds(vino_total$quality, k = 6)
exactitud <- numeric(length = 6)
for(i in 1:6){
prueba <- data_norm[folds[[i]], ]
entrenamiento <- data_norm[-folds[[i]], ]
entrenamiento_labels <- as.factor(vino_total$quality[-folds[[i]]])
prueba_labels <- as.factor(vino_total$quality[folds[[i]]])
pred_knn <- knn(train = entrenamiento,
test = prueba,
cl = entrenamiento_labels,
k = 5)
pred_knn <- factor(pred_knn, levels = levels(entrenamiento_labels))
prueba_labels <- factor(prueba_labels, levels = levels(entrenamiento_labels))
cm <- confusionMatrix(pred_knn, prueba_labels)
exactitud[i] <- cm$overall["Accuracy"]
cat("Fold", i, "- Exactitud:", round(exactitud[i], 4), "\n")
}
## Fold 1 - Exactitud: 0.5749
## Fold 2 - Exactitud: 0.5577
## Fold 3 - Exactitud: 0.5416
## Fold 4 - Exactitud: 0.5656
## Fold 5 - Exactitud: 0.5231
## Fold 6 - Exactitud: 0.5655
Exactitud_promedio <- round(mean(exactitud), 4) * 100
paste("Exactitud promedio:", Exactitud_promedio, "%")
## [1] "Exactitud promedio: 55.47 %"
Evaluamos la estabilidad del modelo, obteniendo una exactitud promedio del 55.47%.
Paso 5: Interpretación de los resultados finales:
train_control <- trainControl(method = "cv",
number = 10,
savePredictions = TRUE)
vino_knn <- cbind(data_z, quality = as.factor(vino_total$quality))
knn_cv <- train(quality ~ .,
data = vino_knn,
method = "knn",
trControl = train_control,
tuneGrid = data.frame(k = 5))
knn_cv
## k-Nearest Neighbors
##
## 6497 samples
## 11 predictor
## 7 classes: '3', '4', '5', '6', '7', '8', '9'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 5848, 5848, 5846, 5847, 5848, 5848, ...
## Resampling results:
##
## Accuracy Kappa
## 0.5573287 0.3233814
##
## Tuning parameter 'k' was held constant at a value of 5
confusionMatrix(knn_cv$pred$pred, knn_cv$pred$obs)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 3 4 5 6 7 8 9
## 3 1 0 0 0 0 0 0
## 4 4 20 26 18 0 0 0
## 5 12 96 1308 650 68 6 0
## 6 10 89 718 1761 473 67 1
## 7 2 11 77 372 500 89 4
## 8 1 0 9 35 38 31 0
## 9 0 0 0 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.5573
## 95% CI : (0.5452, 0.5695)
## No Information Rate : 0.4365
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.3234
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 3 Class: 4 Class: 5 Class: 6 Class: 7 Class: 8
## Sensitivity 0.0333333 0.092593 0.6118 0.6209 0.46339 0.160622
## Specificity 1.0000000 0.992358 0.8091 0.6291 0.89756 0.986834
## Pos Pred Value 1.0000000 0.294118 0.6112 0.5646 0.47393 0.271930
## Neg Pred Value 0.9955357 0.969513 0.8095 0.6818 0.89361 0.974620
## Prevalence 0.0046175 0.033246 0.3291 0.4365 0.16608 0.029706
## Detection Rate 0.0001539 0.003078 0.2013 0.2710 0.07696 0.004771
## Detection Prevalence 0.0001539 0.010466 0.3294 0.4801 0.16238 0.017547
## Balanced Accuracy 0.5166667 0.542475 0.7105 0.6250 0.68048 0.573728
## Class: 9
## Sensitivity 0.0000000
## Specificity 1.0000000
## Pos Pred Value NaN
## Neg Pred Value 0.9992304
## Prevalence 0.0007696
## Detection Rate 0.0000000
## Detection Prevalence 0.0000000
## Balanced Accuracy 0.5000000
Los resultados muestran que el modelo tiene un desempeño moderado cercano al 56%, consistente con la validación cruzada realizada previamente.