# Base de datos
records <- read.csv("~/Downloads/heart_failure_clinical_records_data.csv")
summary(records)
## age anaemia creatinine_phosphokinase diabetes
## Min. :40.00 Min. :0.0000 Min. : 23.0 Min. :0.0000
## 1st Qu.:51.00 1st Qu.:0.0000 1st Qu.: 116.5 1st Qu.:0.0000
## Median :60.00 Median :0.0000 Median : 250.0 Median :0.0000
## Mean :60.83 Mean :0.4314 Mean : 581.8 Mean :0.4181
## 3rd Qu.:70.00 3rd Qu.:1.0000 3rd Qu.: 582.0 3rd Qu.:1.0000
## Max. :95.00 Max. :1.0000 Max. :7861.0 Max. :1.0000
## ejection_fraction high_blood_pressure platelets serum_creatinine
## Min. :14.00 Min. :0.0000 Min. : 25100 Min. :0.500
## 1st Qu.:30.00 1st Qu.:0.0000 1st Qu.:212500 1st Qu.:0.900
## Median :38.00 Median :0.0000 Median :262000 Median :1.100
## Mean :38.08 Mean :0.3512 Mean :263358 Mean :1.394
## 3rd Qu.:45.00 3rd Qu.:1.0000 3rd Qu.:303500 3rd Qu.:1.400
## Max. :80.00 Max. :1.0000 Max. :850000 Max. :9.400
## serum_sodium sex smoking time
## Min. :113.0 Min. :0.0000 Min. :0.0000 Min. : 4.0
## 1st Qu.:134.0 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.: 73.0
## Median :137.0 Median :1.0000 Median :0.0000 Median :115.0
## Mean :136.6 Mean :0.6488 Mean :0.3211 Mean :130.3
## 3rd Qu.:140.0 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:203.0
## Max. :148.0 Max. :1.0000 Max. :1.0000 Max. :285.0
## DEATH_EVENT
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.3211
## 3rd Qu.:1.0000
## Max. :1.0000
# Visión general de toda la base de datos
skimr::skim(records)
| Name | records |
| Number of rows | 299 |
| Number of columns | 13 |
| _______________________ | |
| Column type frequency: | |
| numeric | 13 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| age | 0 | 1 | 60.83 | 11.89 | 40.0 | 51.0 | 60.0 | 70.0 | 95.0 | ▆▇▇▂▁ |
| anaemia | 0 | 1 | 0.43 | 0.50 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | ▇▁▁▁▆ |
| creatinine_phosphokinase | 0 | 1 | 581.84 | 970.29 | 23.0 | 116.5 | 250.0 | 582.0 | 7861.0 | ▇▁▁▁▁ |
| diabetes | 0 | 1 | 0.42 | 0.49 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | ▇▁▁▁▆ |
| ejection_fraction | 0 | 1 | 38.08 | 11.83 | 14.0 | 30.0 | 38.0 | 45.0 | 80.0 | ▃▇▂▂▁ |
| high_blood_pressure | 0 | 1 | 0.35 | 0.48 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | ▇▁▁▁▅ |
| platelets | 0 | 1 | 263358.03 | 97804.24 | 25100.0 | 212500.0 | 262000.0 | 303500.0 | 850000.0 | ▂▇▂▁▁ |
| serum_creatinine | 0 | 1 | 1.39 | 1.03 | 0.5 | 0.9 | 1.1 | 1.4 | 9.4 | ▇▁▁▁▁ |
| serum_sodium | 0 | 1 | 136.63 | 4.41 | 113.0 | 134.0 | 137.0 | 140.0 | 148.0 | ▁▁▃▇▁ |
| sex | 0 | 1 | 0.65 | 0.48 | 0.0 | 0.0 | 1.0 | 1.0 | 1.0 | ▅▁▁▁▇ |
| smoking | 0 | 1 | 0.32 | 0.47 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | ▇▁▁▁▃ |
| time | 0 | 1 | 130.26 | 77.61 | 4.0 | 73.0 | 115.0 | 203.0 | 285.0 | ▆▇▃▆▃ |
| DEATH_EVENT | 0 | 1 | 0.32 | 0.47 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | ▇▁▁▁▃ |
# Características de las variables de la base de datos
glimpse(records)
## Rows: 299
## Columns: 13
## $ age <dbl> 75, 55, 65, 50, 65, 90, 75, 60, 65, 80, 75, 6…
## $ anaemia <int> 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, …
## $ creatinine_phosphokinase <int> 582, 7861, 146, 111, 160, 47, 246, 315, 157, …
## $ diabetes <int> 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ ejection_fraction <int> 20, 38, 20, 20, 20, 40, 15, 60, 65, 35, 38, 2…
## $ high_blood_pressure <int> 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, …
## $ platelets <dbl> 265000, 263358, 162000, 210000, 327000, 20400…
## $ serum_creatinine <dbl> 1.90, 1.10, 1.30, 1.90, 2.70, 2.10, 1.20, 1.1…
## $ serum_sodium <int> 130, 136, 129, 137, 116, 132, 137, 131, 138, …
## $ sex <int> 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, …
## $ smoking <int> 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, …
## $ time <int> 4, 6, 7, 7, 8, 8, 10, 10, 10, 10, 10, 10, 11,…
## $ DEATH_EVENT <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, …
# Información de las primeras filas de la base de datos
head(records)
## age anaemia creatinine_phosphokinase diabetes ejection_fraction
## 1 75 0 582 0 20
## 2 55 0 7861 0 38
## 3 65 0 146 0 20
## 4 50 1 111 0 20
## 5 65 1 160 1 20
## 6 90 1 47 0 40
## high_blood_pressure platelets serum_creatinine serum_sodium sex smoking time
## 1 1 265000 1.9 130 1 0 4
## 2 0 263358 1.1 136 1 0 6
## 3 0 162000 1.3 129 1 1 7
## 4 0 210000 1.9 137 1 0 7
## 5 0 327000 2.7 116 0 0 8
## 6 1 204000 2.1 132 1 1 8
## DEATH_EVENT
## 1 1
## 2 1
## 3 1
## 4 1
## 5 1
## 6 1
# Porcentaje total de valores faltantes
pct_miss(records)
## [1] 0
# Cambio a factor
records <- records %>%
mutate(
DEATH_EVENT = as.character(DEATH_EVENT),
DEATH_EVENT = recode(DEATH_EVENT, "0" = "alive", "1" = "death"),
DEATH_EVENT = factor(DEATH_EVENT, levels = c("alive", "death")),
across(c(anaemia, diabetes, high_blood_pressure, sex, smoking), as.factor)
)
# Verificación de cambio
str(records)
## 'data.frame': 299 obs. of 13 variables:
## $ age : num 75 55 65 50 65 90 75 60 65 80 ...
## $ anaemia : Factor w/ 2 levels "0","1": 1 1 1 2 2 2 2 2 1 2 ...
## $ creatinine_phosphokinase: int 582 7861 146 111 160 47 246 315 157 123 ...
## $ diabetes : Factor w/ 2 levels "0","1": 1 1 1 1 2 1 1 2 1 1 ...
## $ ejection_fraction : int 20 38 20 20 20 40 15 60 65 35 ...
## $ high_blood_pressure : Factor w/ 2 levels "0","1": 2 1 1 1 1 2 1 1 1 2 ...
## $ platelets : num 265000 263358 162000 210000 327000 ...
## $ serum_creatinine : num 1.9 1.1 1.3 1.9 2.7 2.1 1.2 1.1 1.5 9.4 ...
## $ serum_sodium : int 130 136 129 137 116 132 137 131 138 133 ...
## $ sex : Factor w/ 2 levels "0","1": 2 2 2 2 1 2 2 2 1 2 ...
## $ smoking : Factor w/ 2 levels "0","1": 1 1 2 1 1 2 1 2 1 2 ...
## $ time : int 4 6 7 7 8 8 10 10 10 10 ...
## $ DEATH_EVENT : Factor w/ 2 levels "alive","death": 2 2 2 2 2 2 2 2 2 2 ...
table(records$DEATH_EVENT)
##
## alive death
## 203 96
library(caret)
set.seed(2025)
# Crear folds usando la variable respuesta original
folds <- createFolds(records$DEATH_EVENT, k = 5)
# Usar la data original, no normalizada
entrenamiento <- records[-folds[[5]], ]
prueba <- records[folds[[5]], ]
# Etiquetas
entrenamiento_labels <- records$DEATH_EVENT[-folds[[5]]]
prueba_labels <- records$DEATH_EVENT[folds[[5]]]
# Ver tamaños
dim(entrenamiento)[1]
## [1] 239
dim(prueba)[1]
## [1] 60
library(rpart)
library(rpart.plot)
library(C50)
# Verificar estructura de la variable respuesta
str(entrenamiento$DEATH_EVENT)
## Factor w/ 2 levels "alive","death": 2 2 2 2 2 2 2 2 2 2 ...
# Árbol CART
modelo_cart <- rpart(DEATH_EVENT ~ ., data = entrenamiento, method = "class")
modelo_cart
## n= 239
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 239 77 alive (0.67782427 0.32217573)
## 2) time>=73.5 179 29 alive (0.83798883 0.16201117)
## 4) serum_creatinine< 1.45 143 13 alive (0.90909091 0.09090909) *
## 5) serum_creatinine>=1.45 36 16 alive (0.55555556 0.44444444)
## 10) platelets>=236000 19 5 alive (0.73684211 0.26315789) *
## 11) platelets< 236000 17 6 death (0.35294118 0.64705882) *
## 3) time< 73.5 60 12 death (0.20000000 0.80000000)
## 6) serum_sodium>=136.5 29 10 death (0.34482759 0.65517241)
## 12) time>=48.5 10 3 alive (0.70000000 0.30000000) *
## 13) time< 48.5 19 3 death (0.15789474 0.84210526) *
## 7) serum_sodium< 136.5 31 2 death (0.06451613 0.93548387) *
rpart.plot(modelo_cart)
# Árbol C5.0
modelo_c50 <- C5.0(DEATH_EVENT ~ ., data = entrenamiento)
modelo_c50
##
## Call:
## C5.0.formula(formula = DEATH_EVENT ~ ., data = entrenamiento)
##
## Classification Tree
## Number of samples: 239
## Number of predictors: 12
##
## Tree size: 19
##
## Non-standard options: attempt to group attributes
plot(modelo_c50)
library(caret)
set.seed(2025)
# Definir el esquema de validación cruzada
train_control <- trainControl(method = "cv", number = 10, savePredictions = TRUE)
# Entrenar el modelo C5.0 usando validación cruzada
arbol_cv <- train(DEATH_EVENT ~ .,
data = prueba,
method = "C5.0",
trControl = train_control,
tuneLength = 3)
# Generar matriz de confusión con las predicciones
confusionMatrix(arbol_cv$pred$pred, arbol_cv$pred$obs)
## Confusion Matrix and Statistics
##
## Reference
## Prediction alive death
## alive 470 67
## death 22 161
##
## Accuracy : 0.8764
## 95% CI : (0.8501, 0.8995)
## No Information Rate : 0.6833
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6984
##
## Mcnemar's Test P-Value : 3.101e-06
##
## Sensitivity : 0.9553
## Specificity : 0.7061
## Pos Pred Value : 0.8752
## Neg Pred Value : 0.8798
## Prevalence : 0.6833
## Detection Rate : 0.6528
## Detection Prevalence : 0.7458
## Balanced Accuracy : 0.8307
##
## 'Positive' Class : alive
##
# Entrenar el modelo CART usando validación cruzada
arbol_cart <- train(
DEATH_EVENT ~ .,
data = entrenamiento,
method = "rpart",
trControl = train_control,
tuneLength = 3
)
# Entrenar el modelo C5.0 usando validación cruzada
arbol_c50 <- train(
DEATH_EVENT ~ .,
data = entrenamiento,
method = "C5.0",
trControl = train_control,
tuneLength = 3
)
# Comparar el desempeño de ambos modelos
comparacion <- resamples(list(CART = arbol_cart, C5.0 = arbol_c50))
# Mostrar el resumen de Accuracy y Kappa
summary(comparacion)
##
## Call:
## summary.resamples(object = comparacion)
##
## Models: CART, C5.0
## Number of resamples: 10
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## CART 0.6956522 0.7275000 0.7663043 0.7735870 0.8206522 0.880 0
## C5.0 0.7391304 0.7848732 0.8333333 0.8154638 0.8400000 0.875 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## CART 0.3346008 0.3598334 0.4325581 0.4725329 0.5766094 0.7148289 0
## C5.0 0.3300971 0.4819734 0.5857143 0.5609610 0.6494675 0.6896552 0
# Graficar la comparación entre modelos
dotplot(comparacion)
# Leer base de datos / vino blanco
winequality_white <- read_delim("~/Downloads/wine+quality/winequality-white.csv",
delim = ";", escape_double = FALSE, trim_ws = TRUE)
## Rows: 4898 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## dbl (12): fixed acidity, volatile acidity, citric acid, residual sugar, chlo...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Leer base de datos / vino rojo
winequality_red <- read_delim("~/Downloads/wine+quality/winequality-red.csv",
delim = ";", escape_double = FALSE, trim_ws = TRUE)
## Rows: 1599 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## dbl (12): fixed acidity, volatile acidity, citric acid, residual sugar, chlo...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Añadir una variable para identificar el tipo de vino
winequality_red$tipo <- "rojo"
winequality_white$tipo <- "blanco"
# Unir las dos bases de datos
winequality <- bind_rows(winequality_red, winequality_white)
# Arreglar nombres de columnas
colnames(winequality) <- make.names(colnames(winequality))
# Convertir variables en factor
winequality$quality <- as.factor(winequality$quality)
winequality$tipo <- as.factor(winequality$tipo)
# Preparar y limpiar datos
head(winequality)
## # A tibble: 6 × 13
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 7.4 0.7 0 1.9 0.076
## 2 7.8 0.88 0 2.6 0.098
## 3 7.8 0.76 0.04 2.3 0.092
## 4 11.2 0.28 0.56 1.9 0.075
## 5 7.4 0.7 0 1.9 0.076
## 6 7.4 0.66 0 1.8 0.075
## # ℹ 8 more variables: free.sulfur.dioxide <dbl>, total.sulfur.dioxide <dbl>,
## # density <dbl>, pH <dbl>, sulphates <dbl>, alcohol <dbl>, quality <fct>,
## # tipo <fct>
str(winequality)
## spc_tbl_ [6,497 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ fixed.acidity : num [1:6497] 7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
## $ volatile.acidity : num [1:6497] 0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
## $ citric.acid : num [1:6497] 0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
## $ residual.sugar : num [1:6497] 1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
## $ chlorides : num [1:6497] 0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
## $ free.sulfur.dioxide : num [1:6497] 11 25 15 17 11 13 15 15 9 17 ...
## $ total.sulfur.dioxide: num [1:6497] 34 67 54 60 34 40 59 21 18 102 ...
## $ density : num [1:6497] 0.998 0.997 0.997 0.998 0.998 ...
## $ pH : num [1:6497] 3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
## $ sulphates : num [1:6497] 0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
## $ alcohol : num [1:6497] 9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
## $ quality : Factor w/ 7 levels "3","4","5","6",..: 3 3 3 4 3 3 3 5 5 3 ...
## $ tipo : Factor w/ 2 levels "blanco","rojo": 2 2 2 2 2 2 2 2 2 2 ...
## - attr(*, "spec")=
## .. cols(
## .. `fixed acidity` = col_double(),
## .. `volatile acidity` = col_double(),
## .. `citric acid` = col_double(),
## .. `residual sugar` = col_double(),
## .. chlorides = col_double(),
## .. `free sulfur dioxide` = col_double(),
## .. `total sulfur dioxide` = col_double(),
## .. density = col_double(),
## .. pH = col_double(),
## .. sulphates = col_double(),
## .. alcohol = col_double(),
## .. quality = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
summary(winequality)
## fixed.acidity volatile.acidity citric.acid residual.sugar
## Min. : 3.800 Min. :0.0800 Min. :0.0000 Min. : 0.600
## 1st Qu.: 6.400 1st Qu.:0.2300 1st Qu.:0.2500 1st Qu.: 1.800
## Median : 7.000 Median :0.2900 Median :0.3100 Median : 3.000
## Mean : 7.215 Mean :0.3397 Mean :0.3186 Mean : 5.443
## 3rd Qu.: 7.700 3rd Qu.:0.4000 3rd Qu.:0.3900 3rd Qu.: 8.100
## Max. :15.900 Max. :1.5800 Max. :1.6600 Max. :65.800
##
## chlorides free.sulfur.dioxide total.sulfur.dioxide density
## Min. :0.00900 Min. : 1.00 Min. : 6.0 Min. :0.9871
## 1st Qu.:0.03800 1st Qu.: 17.00 1st Qu.: 77.0 1st Qu.:0.9923
## Median :0.04700 Median : 29.00 Median :118.0 Median :0.9949
## Mean :0.05603 Mean : 30.53 Mean :115.7 Mean :0.9947
## 3rd Qu.:0.06500 3rd Qu.: 41.00 3rd Qu.:156.0 3rd Qu.:0.9970
## Max. :0.61100 Max. :289.00 Max. :440.0 Max. :1.0390
##
## pH sulphates alcohol quality tipo
## Min. :2.720 Min. :0.2200 Min. : 8.00 3: 30 blanco:4898
## 1st Qu.:3.110 1st Qu.:0.4300 1st Qu.: 9.50 4: 216 rojo :1599
## Median :3.210 Median :0.5100 Median :10.30 5:2138
## Mean :3.219 Mean :0.5313 Mean :10.49 6:2836
## 3rd Qu.:3.320 3rd Qu.:0.6000 3rd Qu.:11.30 7:1079
## Max. :4.010 Max. :2.0000 Max. :14.90 8: 193
## 9: 5
# Verificar valores faltantes
colSums(is.na(winequality))
## fixed.acidity volatile.acidity citric.acid
## 0 0 0
## residual.sugar chlorides free.sulfur.dioxide
## 0 0 0
## total.sulfur.dioxide density pH
## 0 0 0
## sulphates alcohol quality
## 0 0 0
## tipo
## 0
set.seed(2025)
folds <- createFolds(winequality$quality, k = 5)
entrenamiento <- winequality[-folds[[5]], ]
prueba <- winequality[folds[[5]], ]
entrenamiento_labels <- winequality$quality[-folds[[5]]]
prueba_labels <- winequality$quality[folds[[5]]]
arbol_1 <- rpart(quality ~ ., data = entrenamiento)
rpart.plot(arbol_1)
set.seed(2025)
train_control <- trainControl(method = "cv", number = 10, savePredictions = TRUE)
cart_entrena <- train(quality ~ .,
data = entrenamiento,
method = "rpart",
trControl = train_control,
tuneLength = 10)
confusionMatrix(cart_entrena$pred$pred, cart_entrena$pred$obs)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 3 4 5 6 7 8 9
## 3 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0
## 5 87 949 10386 5822 473 46 0
## 6 140 736 6535 15516 6592 1185 32
## 7 13 45 189 1352 1565 309 8
## 8 0 0 0 0 0 0 0
## 9 0 0 0 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.5284
## 95% CI : (0.5241, 0.5327)
## No Information Rate : 0.4365
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.2373
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 3 Class: 4 Class: 5 Class: 6 Class: 7 Class: 8
## Sensitivity 0.000000 0.00000 0.6070 0.6838 0.18134 0.00000
## Specificity 1.000000 1.00000 0.7884 0.4804 0.95580 1.00000
## Pos Pred Value NaN NaN 0.5847 0.5048 0.44958 NaN
## Neg Pred Value 0.995383 0.96672 0.8035 0.6623 0.85433 0.97037
## Prevalence 0.004617 0.03328 0.3292 0.4365 0.16603 0.02963
## Detection Rate 0.000000 0.00000 0.1998 0.2985 0.03011 0.00000
## Detection Prevalence 0.000000 0.00000 0.3417 0.5913 0.06697 0.00000
## Balanced Accuracy 0.500000 0.50000 0.6977 0.5821 0.56857 0.50000
## Class: 9
## Sensitivity 0.0000000
## Specificity 1.0000000
## Pos Pred Value NaN
## Neg Pred Value 0.9992305
## Prevalence 0.0007695
## Detection Rate 0.0000000
## Detection Prevalence 0.0000000
## Balanced Accuracy 0.5000000
cart_prueba <- train(quality ~ .,
data = prueba,
method = "rpart",
trControl = train_control,
tuneLength = 10)
confusionMatrix(cart_prueba$pred$pred, cart_prueba$pred$obs)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 3 4 5 6 7 8 9
## 3 0 0 0 0 0 0 0
## 4 0 5 6 13 0 0 0
## 5 30 157 2171 1301 118 17 0
## 6 30 253 2051 3946 1780 241 2
## 7 0 15 42 407 259 132 8
## 8 0 0 0 3 3 0 0
## 9 0 0 0 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.4912
## 95% CI : (0.4826, 0.4999)
## No Information Rate : 0.4365
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.1712
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 3 Class: 4 Class: 5 Class: 6 Class: 7 Class: 8
## Sensitivity 0.000000 0.0116279 0.5084 0.6959 0.11991 0.0000000
## Specificity 1.000000 0.9984873 0.8139 0.4048 0.94423 0.9995238
## Pos Pred Value NaN 0.2083333 0.5722 0.4752 0.30012 0.0000000
## Neg Pred Value 0.995381 0.9672220 0.7717 0.6322 0.84324 0.9699630
## Prevalence 0.004619 0.0331024 0.3287 0.4365 0.16628 0.0300231
## Detection Rate 0.000000 0.0003849 0.1671 0.3038 0.01994 0.0000000
## Detection Prevalence 0.000000 0.0018476 0.2921 0.6392 0.06644 0.0004619
## Balanced Accuracy 0.500000 0.5050576 0.6612 0.5504 0.53207 0.4997619
## Class: 9
## Sensitivity 0.0000000
## Specificity 1.0000000
## Pos Pred Value NaN
## Neg Pred Value 0.9992302
## Prevalence 0.0007698
## Detection Rate 0.0000000
## Detection Prevalence 0.0000000
## Balanced Accuracy 0.5000000
c5_entrena <- train(quality ~ ., data= entrenamiento,
method = "C5.0", trControl = train_control,
tuneLength = 10)
confusionMatrix(c5_entrena$pred$pred, c5_entrena$pred$obs)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 3 4 5 6 7 8 9
## 3 2 100 24 12 2 0 0
## 4 74 1315 548 443 64 13 0
## 5 564 3874 50211 19625 1772 173 16
## 6 266 1526 16254 59482 12432 1676 44
## 7 50 96 1348 10772 19809 2072 92
## 8 4 9 55 426 439 2222 8
## 9 0 0 0 0 2 4 0
##
## Overall Statistics
##
## Accuracy : 0.6399
## 95% CI : (0.6378, 0.6419)
## No Information Rate : 0.4365
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4532
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 3 Class: 4 Class: 5 Class: 6 Class: 7 Class: 8
## Sensitivity 2.083e-03 0.190029 0.7336 0.6554 0.57384 0.36071
## Specificity 9.993e-01 0.994318 0.8134 0.7252 0.91678 0.99534
## Pos Pred Value 1.429e-02 0.535206 0.6586 0.6488 0.57855 0.70250
## Neg Pred Value 9.954e-01 0.972720 0.8616 0.7309 0.91530 0.98077
## Prevalence 4.617e-03 0.033282 0.3292 0.4365 0.16603 0.02963
## Detection Rate 9.619e-06 0.006325 0.2415 0.2861 0.09527 0.01069
## Detection Prevalence 6.733e-04 0.011817 0.3667 0.4409 0.16467 0.01521
## Balanced Accuracy 5.007e-01 0.592174 0.7735 0.6903 0.74531 0.67803
## Class: 9
## Sensitivity 0.000e+00
## Specificity 1.000e+00
## Pos Pred Value 0.000e+00
## Neg Pred Value 9.992e-01
## Prevalence 7.695e-04
## Detection Rate 0.000e+00
## Detection Prevalence 2.886e-05
## Balanced Accuracy 5.000e-01
c5_prueba <- train(quality ~ ., data=prueba,
method = "C5.0", trControl = train_control,
tuneLength = 10)
confusionMatrix(c5_prueba$pred$pred, c5_prueba$pred$obs)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 3 4 5 6 7 8 9
## 3 0 1 55 0 4 26 0
## 4 0 158 266 194 7 18 0
## 5 182 1011 10956 6380 643 67 0
## 6 58 448 5086 12128 4127 634 18
## 7 0 100 692 3733 3730 605 4
## 8 0 2 25 243 129 210 18
## 9 0 0 0 2 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.5231
## 95% CI : (0.5188, 0.5274)
## No Information Rate : 0.4365
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.2776
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 3 Class: 4 Class: 5 Class: 6 Class: 7 Class: 8
## Sensitivity 0.000000 0.091860 0.6415 0.5347 0.43171 0.134615
## Specificity 0.998337 0.990346 0.7625 0.6458 0.88149 0.991726
## Pos Pred Value 0.000000 0.245723 0.5695 0.5390 0.42080 0.334928
## Neg Pred Value 0.995373 0.969562 0.8128 0.6418 0.88607 0.973701
## Prevalence 0.004619 0.033102 0.3287 0.4365 0.16628 0.030023
## Detection Rate 0.000000 0.003041 0.2109 0.2334 0.07179 0.004042
## Detection Prevalence 0.001655 0.012375 0.3703 0.4330 0.17059 0.012067
## Balanced Accuracy 0.499169 0.541103 0.7020 0.5903 0.65660 0.563171
## Class: 9
## Sensitivity 0.000e+00
## Specificity 1.000e+00
## Pos Pred Value 0.000e+00
## Neg Pred Value 9.992e-01
## Prevalence 7.698e-04
## Detection Rate 0.000e+00
## Detection Prevalence 3.849e-05
## Balanced Accuracy 5.000e-01
comparacion <- resamples(list(CART = cart_prueba, C5.0 = c5_prueba))
summary(comparacion)
##
## Call:
## summary.resamples(object = comparacion)
##
## Models: CART, C5.0
## Number of resamples: 10
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## CART 0.4615385 0.4826923 0.5135377 0.5134592 0.5442308 0.5615385 0
## C5.0 0.4661654 0.5211293 0.5440693 0.5413112 0.5536125 0.6153846 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## CART 0.09578696 0.1486661 0.1824752 0.1908844 0.2506781 0.2559734 0
## C5.0 0.18025994 0.2576485 0.2903452 0.2908894 0.3062175 0.4092520 0