# Paso 2

folds <- createFolds(data$DEATH_EVENT,k=5)
entrenamiento <- data_norm[-folds[[5]],]
prueba        <- data_norm[folds[[5]],]

# Etiquetas
entrenamiento_labels <- data$DEATH_EVENT[-folds[[5]]]
prueba_labels        <- data$DEATH_EVENT[folds[[5]]]

# Paso 3: Aplicar método K-NN

train.kknn(entrenamiento_labels ~., data =entrenamiento, kmax = 50 )

## 
## Call:
## train.kknn(formula = entrenamiento_labels ~ ., data = entrenamiento,     kmax = 50)
## 
## Type of response variable: nominal
## Minimal misclassification: 0.2301255
## Best kernel: optimal
## Best k: 20

prediccion <- knn(entrenamiento,prueba, cl = entrenamiento_labels, k = 47)
confusionMatrix(data = prediccion, reference = prueba_labels)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Vivo Muerto
##     Vivo     40     19
##     Muerto    0      1
##                                           
##                Accuracy : 0.6833          
##                  95% CI : (0.5504, 0.7974)
##     No Information Rate : 0.6667          
##     P-Value [Acc > NIR] : 0.4516          
##                                           
##                   Kappa : 0.0656          
##                                           
##  Mcnemar's Test P-Value : 3.636e-05       
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.0500          
##          Pos Pred Value : 0.6780          
##          Neg Pred Value : 1.0000          
##              Prevalence : 0.6667          
##          Detection Rate : 0.6667          
##    Detection Prevalence : 0.9833          
##       Balanced Accuracy : 0.5250          
##                                           
##        'Positive' Class : Vivo            
##

# Paso 4

exactitud <- numeric(length = 5)

for(i in 1:5){
  # Definir conjuntos entrenamiento y prueba según el fold actual
  prueba        <- data_norm[folds[[i]],]
  entrenamiento <- data_norm[-folds[[i]],]
  
  # Etiquetas
  entrenamiento_labels <- data$DEATH_EVENT[-folds[[i]]]
  prueba_labels        <- data$DEATH_EVENT[folds[[i]]]
  
  pred_knn <- knn(entrenamiento,prueba, cl= entrenamiento_labels, k = 47)
  
  # Evaluar exactitud del modelo en cada fold
  cm <- confusionMatrix(pred_knn, prueba_labels)
  exactitud[i] <- cm$overall["Accuracy"]
  
  # Mostrar resultado del fold
  cat("Fold", i, "- Exactitud:", exactitud[i], "\n")
}

## Fold 1 - Exactitud: 0.6779661 
## Fold 2 - Exactitud: 0.6833333 
## Fold 3 - Exactitud: 0.6833333 
## Fold 4 - Exactitud: 0.6833333 
## Fold 5 - Exactitud: 0.6833333

Exactitud_promedio <- round(mean(exactitud),4)*100
paste("Exactitud_promedio: ",Exactitud_promedio,"%",sep="")

## [1] "Exactitud_promedio: 68.23%"

# Validación cruzada
set.seed(2025)
train_control <- trainControl(method="cv",number=10,savePredictions = TRUE)

knn_cv <- train(DEATH_EVENT ~ ., data=cbind(prueba, DEATH_EVENT=prueba_labels), 
                method = "knn", trControl = train_control, tuneGrid = data.frame(k=47))

knn_cv

## k-Nearest Neighbors 
## 
## 60 samples
## 12 predictors
##  2 classes: 'Vivo', 'Muerto' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 54, 54, 54, 54, 54, 54, ... 
## Resampling results:
## 
##   Accuracy   Kappa
##   0.6666667  0    
## 
## Tuning parameter 'k' was held constant at a value of 47

Wine Quality

library(readr)
 winequality_red <- read_csv("winequality-red.csv")

## Rows: 1599 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): fixed acidity;volatile acidity;citric acid;residual sugar;chlorides...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

winequality_white <- read_csv("winequality-white.csv")

## Rows: 4898 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): fixed acidity;volatile acidity;citric acid;residual sugar;chlorides...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Limpieza de datos

winequality_red <- separate(winequality_red, col = 1, into = c("fixed_acidity","volatile_acidity","citric_acid","residual_sugar",
"chlorides","free_sulfur_dioxide","total_sulfur_dioxide",
"density","pH","sulphates","alcohol","quality"), sep = ";")
winequality_white <- separate(winequality_white, col = 1, into = c("fixed_acidity","volatile_acidity","citric_acid","residual_sugar",
"chlorides","free_sulfur_dioxide","total_sulfur_dioxide",
"density","pH","sulphates","alcohol","quality"), sep = ";")


winequality_red$quality <- as.factor(winequality_red$quality)
winequality_white$quality <- as.factor(winequality_white$quality)
winequality_red <- na.omit(winequality_red)
winequality_white <- na.omit(winequality_white)

# Normalización Min-Max con rescale()
wine_red_norm <- winequality_red %>% select(-quality) %>%
  mutate(across(where(is.numeric), rescale))
wine_white_norm <- winequality_white %>% select(-quality) %>%
  mutate(across(where(is.numeric), rescale))

# Paso 2

folds_wine_red <- createFolds(winequality_red$quality,k=5)
folds_wine_white <- createFolds(winequality_white$quality, k=5)
entrenamiento_w_red <- wine_red_norm[-folds_wine_red[[5]],]
prueba_w_red       <- wine_red_norm[folds_wine_red[[5]],]
entrenamiento_w_white <- wine_white_norm[-folds_wine_white[[5]],]
prueba_w_white <- wine_white_norm[folds_wine_white[[5]],]

# Etiquetas
entrenamiento_labels_w_red <- winequality_red$quality[-folds_wine_red[[5]]]
prueba_labels_w_red <- winequality_red$quality[folds_wine_red[[5]]]
entrenamiento_labels_w_white <- winequality_white$quality[-folds_wine_white[[5]]]
prueba_labels_w_white <- winequality_white$quality[folds_wine_white[[5]]]

# Entrenamiento
##Red wine
train.kknn(entrenamiento_labels_w_red ~., data =entrenamiento_w_red, kmax = 50 )

## 
## Call:
## train.kknn(formula = entrenamiento_labels_w_red ~ ., data = entrenamiento_w_red,     kmax = 50)
## 
## Type of response variable: nominal
## Minimal misclassification: 0.4295775
## Best kernel: optimal
## Best k: 5

prediccion_red_wine <- knn(entrenamiento_w_red,prueba_w_red, cl = entrenamiento_labels_w_red, k = 5)
confusionMatrix(data = prediccion_red_wine, reference = prueba_labels_w_red)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  3  4  5  6  7  8
##          3  0  0  0  0  0  0
##          4  0  0  0  1  1  0
##          5  1  8 81 48  1  0
##          6  1  2 51 67 18  3
##          7  0  1  4 12 20  1
##          8  0  0  0  0  0  0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.5234          
##                  95% CI : (0.4672, 0.5791)
##     No Information Rate : 0.4237          
##     P-Value [Acc > NIR] : 0.0002034       
##                                           
##                   Kappa : 0.2376          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 3 Class: 4 Class: 5 Class: 6 Class: 7 Class: 8
## Sensitivity          0.000000 0.000000   0.5956   0.5234  0.50000  0.00000
## Specificity          1.000000 0.993548   0.6865   0.6114  0.93594  1.00000
## Pos Pred Value            NaN 0.000000   0.5827   0.4718  0.52632      NaN
## Neg Pred Value       0.993769 0.965517   0.6978   0.6592  0.92933  0.98754
## Prevalence           0.006231 0.034268   0.4237   0.3988  0.12461  0.01246
## Detection Rate       0.000000 0.000000   0.2523   0.2087  0.06231  0.00000
## Detection Prevalence 0.000000 0.006231   0.4330   0.4424  0.11838  0.00000
## Balanced Accuracy    0.500000 0.496774   0.6410   0.5674  0.71797  0.50000

##White wine
train.kknn(entrenamiento_labels_w_white ~., data = entrenamiento_w_white, kmax=50)

## 
## Call:
## train.kknn(formula = entrenamiento_labels_w_white ~ ., data = entrenamiento_w_white,     kmax = 50)
## 
## Type of response variable: nominal
## Minimal misclassification: 0.4139867
## Best kernel: optimal
## Best k: 1

prediccion_white_wine <- knn(entrenamiento_w_white,prueba_w_white, cl = entrenamiento_labels_w_white, k = 1)
confusionMatrix(data = prediccion_white_wine, reference = prueba_labels_w_white)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   3   4   5   6   7   8   9
##          3   1   0   2   0   1   0   0
##          4   0   7   4   9   2   0   0
##          5   1  13 170  76  18   0   1
##          6   1  10  90 284  65  13   0
##          7   1   3  22  61  83  10   0
##          8   0   0   4   9   6  12   0
##          9   0   0   0   0   1   0   0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.5684          
##                  95% CI : (0.5367, 0.5996)
##     No Information Rate : 0.448           
##     P-Value [Acc > NIR] : 2.707e-14       
##                                           
##                   Kappa : 0.3545          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 3 Class: 4 Class: 5 Class: 6 Class: 7 Class: 8
## Sensitivity          0.250000 0.212121   0.5822   0.6469  0.47159  0.34286
## Specificity          0.996926 0.984161   0.8416   0.6691  0.87935  0.97989
## Pos Pred Value       0.250000 0.318182   0.6093   0.6134  0.46111  0.38710
## Neg Pred Value       0.996926 0.972860   0.8260   0.7002  0.88375  0.97576
## Prevalence           0.004082 0.033673   0.2980   0.4480  0.17959  0.03571
## Detection Rate       0.001020 0.007143   0.1735   0.2898  0.08469  0.01224
## Detection Prevalence 0.004082 0.022449   0.2847   0.4724  0.18367  0.03163
## Balanced Accuracy    0.623463 0.598141   0.7119   0.6580  0.67547  0.66138
##                      Class: 9
## Sensitivity           0.00000
## Specificity           0.99898
## Pos Pred Value        0.00000
## Neg Pred Value        0.99898
## Prevalence            0.00102
## Detection Rate        0.00000
## Detection Prevalence  0.00102
## Balanced Accuracy     0.49949

# Validación cruzada
##Red wine
set.seed(2025)
train_control_red <- trainControl(method="cv",number=10,savePredictions = TRUE)

knn_cv_red <- train(quality ~ ., data=cbind(prueba_w_red, quality=prueba_labels_w_red), 
                method = "knn", trControl = train_control, tuneGrid = data.frame(k=5))

# Resultados de validación cruzada
knn_cv_red

## k-Nearest Neighbors 
## 
## 321 samples
##  11 predictor
##   6 classes: '3', '4', '5', '6', '7', '8' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 289, 288, 288, 290, 288, 288, ... 
## Resampling results:
## 
##   Accuracy  Kappa    
##   0.505022  0.1752814
## 
## Tuning parameter 'k' was held constant at a value of 5

##White wine
set.seed(2025)
train_control_white <- trainControl(method="cv",number=10,savePredictions = TRUE)

knn_cv_white <- train(quality ~ ., data=cbind(prueba_w_white, quality=prueba_labels_w_white), 
                    method = "knn", trControl = train_control, tuneGrid = data.frame(k=1))

# Resultados de validación cruzada
knn_cv_white

## k-Nearest Neighbors 
## 
## 980 samples
##  11 predictor
##   7 classes: '3', '4', '5', '6', '7', '8', '9' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 882, 884, 880, 884, 882, 882, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.4226225  0.1213336
## 
## Tuning parameter 'k' was held constant at a value of 1

Tarea Heart Failure/Wine Quality

Angel Serrano, Adrián Rosario

2026-03-18

Wine Quality

Limpieza de datos