# Base de datos
records <- read.csv("heart_failure_clinical_records_dataset.csv")
summary(records)
## age anaemia creatinine_phosphokinase diabetes
## Min. :40.00 Min. :0.0000 Min. : 23.0 Min. :0.0000
## 1st Qu.:51.00 1st Qu.:0.0000 1st Qu.: 116.5 1st Qu.:0.0000
## Median :60.00 Median :0.0000 Median : 250.0 Median :0.0000
## Mean :60.83 Mean :0.4314 Mean : 581.8 Mean :0.4181
## 3rd Qu.:70.00 3rd Qu.:1.0000 3rd Qu.: 582.0 3rd Qu.:1.0000
## Max. :95.00 Max. :1.0000 Max. :7861.0 Max. :1.0000
## ejection_fraction high_blood_pressure platelets serum_creatinine
## Min. :14.00 Min. :0.0000 Min. : 25100 Min. :0.500
## 1st Qu.:30.00 1st Qu.:0.0000 1st Qu.:212500 1st Qu.:0.900
## Median :38.00 Median :0.0000 Median :262000 Median :1.100
## Mean :38.08 Mean :0.3512 Mean :263358 Mean :1.394
## 3rd Qu.:45.00 3rd Qu.:1.0000 3rd Qu.:303500 3rd Qu.:1.400
## Max. :80.00 Max. :1.0000 Max. :850000 Max. :9.400
## serum_sodium sex smoking time
## Min. :113.0 Min. :0.0000 Min. :0.0000 Min. : 4.0
## 1st Qu.:134.0 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.: 73.0
## Median :137.0 Median :1.0000 Median :0.0000 Median :115.0
## Mean :136.6 Mean :0.6488 Mean :0.3211 Mean :130.3
## 3rd Qu.:140.0 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:203.0
## Max. :148.0 Max. :1.0000 Max. :1.0000 Max. :285.0
## DEATH_EVENT
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.3211
## 3rd Qu.:1.0000
## Max. :1.0000
# Visión general de toda la base de datos
skimr::skim(records)
| Name | records |
| Number of rows | 299 |
| Number of columns | 13 |
| _______________________ | |
| Column type frequency: | |
| numeric | 13 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| age | 0 | 1 | 60.83 | 11.89 | 40.0 | 51.0 | 60.0 | 70.0 | 95.0 | ▆▇▇▂▁ |
| anaemia | 0 | 1 | 0.43 | 0.50 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | ▇▁▁▁▆ |
| creatinine_phosphokinase | 0 | 1 | 581.84 | 970.29 | 23.0 | 116.5 | 250.0 | 582.0 | 7861.0 | ▇▁▁▁▁ |
| diabetes | 0 | 1 | 0.42 | 0.49 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | ▇▁▁▁▆ |
| ejection_fraction | 0 | 1 | 38.08 | 11.83 | 14.0 | 30.0 | 38.0 | 45.0 | 80.0 | ▃▇▂▂▁ |
| high_blood_pressure | 0 | 1 | 0.35 | 0.48 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | ▇▁▁▁▅ |
| platelets | 0 | 1 | 263358.03 | 97804.24 | 25100.0 | 212500.0 | 262000.0 | 303500.0 | 850000.0 | ▂▇▂▁▁ |
| serum_creatinine | 0 | 1 | 1.39 | 1.03 | 0.5 | 0.9 | 1.1 | 1.4 | 9.4 | ▇▁▁▁▁ |
| serum_sodium | 0 | 1 | 136.63 | 4.41 | 113.0 | 134.0 | 137.0 | 140.0 | 148.0 | ▁▁▃▇▁ |
| sex | 0 | 1 | 0.65 | 0.48 | 0.0 | 0.0 | 1.0 | 1.0 | 1.0 | ▅▁▁▁▇ |
| smoking | 0 | 1 | 0.32 | 0.47 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | ▇▁▁▁▃ |
| time | 0 | 1 | 130.26 | 77.61 | 4.0 | 73.0 | 115.0 | 203.0 | 285.0 | ▆▇▃▆▃ |
| DEATH_EVENT | 0 | 1 | 0.32 | 0.47 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | ▇▁▁▁▃ |
# Características de las variables de la base de datos
glimpse(records)
## Rows: 299
## Columns: 13
## $ age <dbl> 75, 55, 65, 50, 65, 90, 75, 60, 65, 80, 75, 6…
## $ anaemia <int> 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, …
## $ creatinine_phosphokinase <int> 582, 7861, 146, 111, 160, 47, 246, 315, 157, …
## $ diabetes <int> 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ ejection_fraction <int> 20, 38, 20, 20, 20, 40, 15, 60, 65, 35, 38, 2…
## $ high_blood_pressure <int> 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, …
## $ platelets <dbl> 265000, 263358, 162000, 210000, 327000, 20400…
## $ serum_creatinine <dbl> 1.90, 1.10, 1.30, 1.90, 2.70, 2.10, 1.20, 1.1…
## $ serum_sodium <int> 130, 136, 129, 137, 116, 132, 137, 131, 138, …
## $ sex <int> 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, …
## $ smoking <int> 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, …
## $ time <int> 4, 6, 7, 7, 8, 8, 10, 10, 10, 10, 10, 10, 11,…
## $ DEATH_EVENT <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, …
# Información de las primeras filas de la base de datos
head(records)
## age anaemia creatinine_phosphokinase diabetes ejection_fraction
## 1 75 0 582 0 20
## 2 55 0 7861 0 38
## 3 65 0 146 0 20
## 4 50 1 111 0 20
## 5 65 1 160 1 20
## 6 90 1 47 0 40
## high_blood_pressure platelets serum_creatinine serum_sodium sex smoking time
## 1 1 265000 1.9 130 1 0 4
## 2 0 263358 1.1 136 1 0 6
## 3 0 162000 1.3 129 1 1 7
## 4 0 210000 1.9 137 1 0 7
## 5 0 327000 2.7 116 0 0 8
## 6 1 204000 2.1 132 1 1 8
## DEATH_EVENT
## 1 1
## 2 1
## 3 1
## 4 1
## 5 1
## 6 1
# Porcentaje total de valores faltantes
pct_miss(records)
## [1] 0
# Normalización de variable
records_norm <- records %>%
select(-DEATH_EVENT) %>%
mutate(across(everything(), rescale))
library(caret)
## Loading required package: lattice
# Crear folds usando la variable respuesta del dataset original
folds <- createFolds(records$DEATH_EVENT, k = 5)
# Definir conjuntos de entrenamiento y prueba usando la data normalizada
entrenamiento <- records_norm[-folds[[5]], ]
prueba <- records_norm[folds[[5]], ]
# Definir etiquetas usando la data original
entrenamiento_labels <- records$DEATH_EVENT[-folds[[5]]]
prueba_labels <- records$DEATH_EVENT[folds[[5]]]
# Ver tamaño de cada conjunto
dim(entrenamiento)[1]
## [1] 239
dim(prueba)[1]
## [1] 60
library(rpart)
library(rpart.plot)
library(C50)
# Unir predictores y variable respuesta para el conjunto de entrenamiento
entrenamiento_arbol <- cbind(entrenamiento, DEATH_EVENT = as.factor(entrenamiento_labels))
# Construcción del árbol CART
modelo_cart <- rpart(DEATH_EVENT ~ ., data = entrenamiento_arbol, method = "class")
modelo_cart
## n= 239
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 239 78 0 (0.67364017 0.32635983)
## 2) time>=0.2241993 181 27 0 (0.85082873 0.14917127)
## 4) serum_creatinine< 0.1067416 147 11 0 (0.92517007 0.07482993) *
## 5) serum_creatinine>=0.1067416 34 16 0 (0.52941176 0.47058824)
## 10) ejection_fraction>=0.280303 19 5 0 (0.73684211 0.26315789) *
## 11) ejection_fraction< 0.280303 15 4 1 (0.26666667 0.73333333) *
## 3) time< 0.2241993 58 7 1 (0.12068966 0.87931034) *
# Visualización del árbol CART
rpart.plot(modelo_cart)
# Construcción del árbol C5.0
modelo_c50 <- C5.0(DEATH_EVENT ~ ., data = entrenamiento_arbol)
modelo_c50
##
## Call:
## C5.0.formula(formula = DEATH_EVENT ~ ., data = entrenamiento_arbol)
##
## Classification Tree
## Number of samples: 239
## Number of predictors: 12
##
## Tree size: 15
##
## Non-standard options: attempt to group attributes
plot(modelo_c50)
set.seed(2025)
# Control de validación cruzada
train_control <- trainControl(method = "cv", number = 10, savePredictions = TRUE)
# Base para CART
entrenamiento_arbol <- cbind(entrenamiento, DEATH_EVENT = as.factor(entrenamiento_labels))
# Validación cruzada para CART
arbol_cart_cv <- train(DEATH_EVENT ~ .,
data = entrenamiento_arbol,
method = "rpart",
trControl = train_control,
tuneLength = 10)
arbol_cart_cv
## CART
##
## 239 samples
## 12 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 215, 215, 214, 215, 215, 215, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.00000000 0.8579420 0.6722270
## 0.06267806 0.8452464 0.6213762
## 0.12535613 0.8535797 0.6390232
## 0.18803419 0.8535797 0.6390232
## 0.25071225 0.8535797 0.6390232
## 0.31339031 0.8535797 0.6390232
## 0.37606838 0.8535797 0.6390232
## 0.43874644 0.8535797 0.6390232
## 0.50142450 0.8535797 0.6390232
## 0.56410256 0.7404638 0.2545307
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.
# Matriz de confusión para CART
confusionMatrix(arbol_cart_cv$pred$pred, arbol_cart_cv$pred$obs)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1523 291
## 1 87 489
##
## Accuracy : 0.8418
## 95% CI : (0.8266, 0.8563)
## No Information Rate : 0.6736
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6143
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9460
## Specificity : 0.6269
## Pos Pred Value : 0.8396
## Neg Pred Value : 0.8490
## Prevalence : 0.6736
## Detection Rate : 0.6372
## Detection Prevalence : 0.7590
## Balanced Accuracy : 0.7864
##
## 'Positive' Class : 0
##
# Validación cruzada para C5.0
arbol_c50_cv <- train(DEATH_EVENT ~ .,
data = entrenamiento_arbol,
method = "C5.0",
trControl = train_control,
tuneLength = 10)
arbol_c50_cv
## C5.0
##
## 239 samples
## 12 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 215, 216, 215, 215, 215, 215, ...
## Resampling results across tuning parameters:
##
## model winnow trials Accuracy Kappa
## rules FALSE 1 0.8288986 0.6115887
## rules FALSE 10 0.8368696 0.6175685
## rules FALSE 20 0.8200217 0.5795525
## rules FALSE 30 0.8243696 0.5901907
## rules FALSE 40 0.8327029 0.6100697
## rules FALSE 50 0.8240217 0.5923913
## rules FALSE 60 0.8325362 0.6111167
## rules FALSE 70 0.8325362 0.6111167
## rules FALSE 80 0.8325362 0.6111167
## rules FALSE 90 0.8325362 0.6108373
## rules TRUE 1 0.8412174 0.6232359
## rules TRUE 10 0.8370652 0.6187444
## rules TRUE 20 0.8285507 0.6011057
## rules TRUE 30 0.8285507 0.5978733
## rules TRUE 40 0.8242029 0.5863160
## rules TRUE 50 0.8242029 0.5863160
## rules TRUE 60 0.8285507 0.5978733
## rules TRUE 70 0.8285507 0.5978733
## rules TRUE 80 0.8285507 0.5978733
## rules TRUE 90 0.8327174 0.6086687
## tree FALSE 1 0.8162174 0.5845653
## tree FALSE 10 0.8490362 0.6477504
## tree FALSE 20 0.8198551 0.5927386
## tree FALSE 30 0.8156884 0.5817041
## tree FALSE 40 0.8240362 0.5962303
## tree FALSE 50 0.8242029 0.5891145
## tree FALSE 60 0.8242029 0.5891145
## tree FALSE 70 0.8325362 0.6104716
## tree FALSE 80 0.8281884 0.6033529
## tree FALSE 90 0.8281884 0.6033529
## tree TRUE 1 0.8453841 0.6356274
## tree TRUE 10 0.8537319 0.6566300
## tree TRUE 20 0.8408696 0.6243541
## tree TRUE 30 0.8408696 0.6218541
## tree TRUE 40 0.8367029 0.6134670
## tree TRUE 50 0.8408696 0.6218541
## tree TRUE 60 0.8408696 0.6218541
## tree TRUE 70 0.8408696 0.6218541
## tree TRUE 80 0.8325362 0.6056041
## tree TRUE 90 0.8408696 0.6218541
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were trials = 10, model = tree and winnow
## = TRUE.
# Matriz de confusión para C5.0
confusionMatrix(arbol_c50_cv$pred$pred, arbol_c50_cv$pred$obs)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 5782 949
## 1 658 2171
##
## Accuracy : 0.8319
## 95% CI : (0.8243, 0.8394)
## No Information Rate : 0.6736
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6083
##
## Mcnemar's Test P-Value : 4.683e-13
##
## Sensitivity : 0.8978
## Specificity : 0.6958
## Pos Pred Value : 0.8590
## Neg Pred Value : 0.7674
## Prevalence : 0.6736
## Detection Rate : 0.6048
## Detection Prevalence : 0.7041
## Balanced Accuracy : 0.7968
##
## 'Positive' Class : 0
##
# Predicción de CART para los datos de entrenamiento
pred_cart_train <- predict(modelo_cart, entrenamiento_arbol, type = "class")
tt_cart_train <- table(pred_cart_train, entrenamiento_arbol$DEATH_EVENT)
tt_cart_train
##
## pred_cart_train 0 1
## 0 150 16
## 1 11 62
# Tasa de aciertos de CART en entrenamiento
TA_cart_train <- sum(diag(tt_cart_train)) / sum(tt_cart_train)
paste0("Tasa de aciertos de CART con los datos de entrenamiento: ", round(TA_cart_train, 4) * 100, "%")
## [1] "Tasa de aciertos de CART con los datos de entrenamiento: 88.7%"
# Predicción de CART para los datos de prueba
pred_cart_test <- predict(modelo_cart, prueba, type = "class")
tt_cart_test <- table(prueba_labels, pred_cart_test)
tt_cart_test
## pred_cart_test
## prueba_labels 0 1
## 0 35 7
## 1 6 12
# Tasa de aciertos de CART en prueba
TA_cart_test <- sum(diag(tt_cart_test)) / sum(tt_cart_test)
paste0("Tasa de aciertos de CART con los datos de prueba: ", round(TA_cart_test, 4) * 100, "%")
## [1] "Tasa de aciertos de CART con los datos de prueba: 78.33%"
# Predicción de C5.0 para los datos de entrenamiento
pred_c50_train <- predict(modelo_c50, entrenamiento, type = "class")
tt_c50_train <- table(pred_c50_train, entrenamiento_labels)
tt_c50_train
## entrenamiento_labels
## pred_c50_train 0 1
## 0 153 7
## 1 8 71
# Tasa de aciertos de C5.0 en entrenamiento
TA_c50_train <- sum(diag(tt_c50_train)) / sum(tt_c50_train)
paste0("Tasa de aciertos de C5.0 con los datos de entrenamiento: ", round(TA_c50_train, 4) * 100, "%")
## [1] "Tasa de aciertos de C5.0 con los datos de entrenamiento: 93.72%"
# Predicción de C5.0 para los datos de prueba
pred_c50_test <- predict(modelo_c50, prueba, type = "class")
tt_c50_test <- table(prueba_labels, pred_c50_test)
tt_c50_test
## pred_c50_test
## prueba_labels 0 1
## 0 34 8
## 1 6 12
# Tasa de aciertos de C5.0 en prueba
TA_c50_test <- sum(diag(tt_c50_test)) / sum(tt_c50_test)
paste0("Tasa de aciertos de C5.0 con los datos de prueba: ", round(TA_c50_test, 4) * 100, "%")
## [1] "Tasa de aciertos de C5.0 con los datos de prueba: 76.67%"
#Leer base de datos / vino blanco
library(readr)
winequality_white <- read_delim("winequality-white.csv",
delim = ";", escape_double = FALSE, trim_ws = TRUE)
## Rows: 4898 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## dbl (12): fixed acidity, volatile acidity, citric acid, residual sugar, chlo...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Leer base de datos / vino rojo
library(readr)
winequality_red <- read_delim("winequality-red.csv",
delim = ";", escape_double = FALSE, trim_ws = TRUE)
## Rows: 1599 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## dbl (12): fixed acidity, volatile acidity, citric acid, residual sugar, chlo...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#Unir bases de datos
library(dplyr)
# Añadir una variable para identificar el tipo de vino
winequality_red$tipo <- "rojo"
winequality_white$tipo <- "blanco"
# Unir las dos bases de datos
winequality <- bind_rows(winequality_red, winequality_white)
# Convertir la variable respuesta en factor
winequality$tipo <- as.factor(winequality$tipo)
#Preparar y limpiar datos
head(winequality)
## # A tibble: 6 × 13
## `fixed acidity` `volatile acidity` `citric acid` `residual sugar` chlorides
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 7.4 0.7 0 1.9 0.076
## 2 7.8 0.88 0 2.6 0.098
## 3 7.8 0.76 0.04 2.3 0.092
## 4 11.2 0.28 0.56 1.9 0.075
## 5 7.4 0.7 0 1.9 0.076
## 6 7.4 0.66 0 1.8 0.075
## # ℹ 8 more variables: `free sulfur dioxide` <dbl>,
## # `total sulfur dioxide` <dbl>, density <dbl>, pH <dbl>, sulphates <dbl>,
## # alcohol <dbl>, quality <dbl>, tipo <fct>
str(winequality)
## spc_tbl_ [6,497 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ fixed acidity : num [1:6497] 7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
## $ volatile acidity : num [1:6497] 0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
## $ citric acid : num [1:6497] 0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
## $ residual sugar : num [1:6497] 1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
## $ chlorides : num [1:6497] 0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
## $ free sulfur dioxide : num [1:6497] 11 25 15 17 11 13 15 15 9 17 ...
## $ total sulfur dioxide: num [1:6497] 34 67 54 60 34 40 59 21 18 102 ...
## $ density : num [1:6497] 0.998 0.997 0.997 0.998 0.998 ...
## $ pH : num [1:6497] 3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
## $ sulphates : num [1:6497] 0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
## $ alcohol : num [1:6497] 9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
## $ quality : num [1:6497] 5 5 5 6 5 5 5 7 7 5 ...
## $ tipo : Factor w/ 2 levels "blanco","rojo": 2 2 2 2 2 2 2 2 2 2 ...
## - attr(*, "spec")=
## .. cols(
## .. `fixed acidity` = col_double(),
## .. `volatile acidity` = col_double(),
## .. `citric acid` = col_double(),
## .. `residual sugar` = col_double(),
## .. chlorides = col_double(),
## .. `free sulfur dioxide` = col_double(),
## .. `total sulfur dioxide` = col_double(),
## .. density = col_double(),
## .. pH = col_double(),
## .. sulphates = col_double(),
## .. alcohol = col_double(),
## .. quality = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
summary(winequality)
## fixed acidity volatile acidity citric acid residual sugar
## Min. : 3.800 Min. :0.0800 Min. :0.0000 Min. : 0.600
## 1st Qu.: 6.400 1st Qu.:0.2300 1st Qu.:0.2500 1st Qu.: 1.800
## Median : 7.000 Median :0.2900 Median :0.3100 Median : 3.000
## Mean : 7.215 Mean :0.3397 Mean :0.3186 Mean : 5.443
## 3rd Qu.: 7.700 3rd Qu.:0.4000 3rd Qu.:0.3900 3rd Qu.: 8.100
## Max. :15.900 Max. :1.5800 Max. :1.6600 Max. :65.800
## chlorides free sulfur dioxide total sulfur dioxide density
## Min. :0.00900 Min. : 1.00 Min. : 6.0 Min. :0.9871
## 1st Qu.:0.03800 1st Qu.: 17.00 1st Qu.: 77.0 1st Qu.:0.9923
## Median :0.04700 Median : 29.00 Median :118.0 Median :0.9949
## Mean :0.05603 Mean : 30.53 Mean :115.7 Mean :0.9947
## 3rd Qu.:0.06500 3rd Qu.: 41.00 3rd Qu.:156.0 3rd Qu.:0.9970
## Max. :0.61100 Max. :289.00 Max. :440.0 Max. :1.0390
## pH sulphates alcohol quality tipo
## Min. :2.720 Min. :0.2200 Min. : 8.00 Min. :3.000 blanco:4898
## 1st Qu.:3.110 1st Qu.:0.4300 1st Qu.: 9.50 1st Qu.:5.000 rojo :1599
## Median :3.210 Median :0.5100 Median :10.30 Median :6.000
## Mean :3.219 Mean :0.5313 Mean :10.49 Mean :5.818
## 3rd Qu.:3.320 3rd Qu.:0.6000 3rd Qu.:11.30 3rd Qu.:6.000
## Max. :4.010 Max. :2.0000 Max. :14.90 Max. :9.000
# Verificar cantidad por tipo de vino
table(winequality$tipo)
##
## blanco rojo
## 4898 1599
round(prop.table(table(winequality$tipo)),2)
##
## blanco rojo
## 0.75 0.25
# Verificar valores faltantes
colSums(is.na(winequality))
## fixed acidity volatile acidity citric acid
## 0 0 0
## residual sugar chlorides free sulfur dioxide
## 0 0 0
## total sulfur dioxide density pH
## 0 0 0
## sulphates alcohol quality
## 0 0 0
## tipo
## 0
#Dividir los datos en conjunto de entrenamiento y prueba
winequality_norm <- as.data.frame(lapply(winequality[, -ncol(winequality)], rescale))
winequality_norm$tipo <- winequality$tipo
set.seed(2025)
folds <- createFolds(winequality_norm$tipo, k = 5)
entrenamiento <- winequality_norm[c(folds$Fold1, folds$Fold2, folds$Fold3, folds$Fold4), ]
prueba <- winequality_norm[folds$Fold5, ]
dim(entrenamiento)[1]
## [1] 5198
dim(prueba)[1]
## [1] 1299