Librerias

library(ggplot2)
library(lattice)
library(caret)
library(datasets)
library(DataExplorer)
library(kernlab)

## 
## Adjuntando el paquete: 'kernlab'

## The following object is masked from 'package:ggplot2':
## 
##     alpha

library(randomForest)

## randomForest 4.7-1.1

## Type rfNews() to see new features/changes/bug fixes.

## 
## Adjuntando el paquete: 'randomForest'

## The following object is masked from 'package:ggplot2':
## 
##     margin

library(e1071)
library(rpart)

Crear Base de datos‹/span>

df <- read.csv("\\Users\\karee\\Downloads\\heart.csv")

Analisis exploratorio ‹/span>

summary(df)

##       age             sex               cp            trestbps    
##  Min.   :29.00   Min.   :0.0000   Min.   :0.0000   Min.   : 94.0  
##  1st Qu.:48.00   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:120.0  
##  Median :56.00   Median :1.0000   Median :1.0000   Median :130.0  
##  Mean   :54.43   Mean   :0.6956   Mean   :0.9424   Mean   :131.6  
##  3rd Qu.:61.00   3rd Qu.:1.0000   3rd Qu.:2.0000   3rd Qu.:140.0  
##  Max.   :77.00   Max.   :1.0000   Max.   :3.0000   Max.   :200.0  
##       chol          fbs            restecg          thalach     
##  Min.   :126   Min.   :0.0000   Min.   :0.0000   Min.   : 71.0  
##  1st Qu.:211   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:132.0  
##  Median :240   Median :0.0000   Median :1.0000   Median :152.0  
##  Mean   :246   Mean   :0.1493   Mean   :0.5298   Mean   :149.1  
##  3rd Qu.:275   3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:166.0  
##  Max.   :564   Max.   :1.0000   Max.   :2.0000   Max.   :202.0  
##      exang           oldpeak          slope             ca        
##  Min.   :0.0000   Min.   :0.000   Min.   :0.000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.000   1st Qu.:1.000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.800   Median :1.000   Median :0.0000  
##  Mean   :0.3366   Mean   :1.072   Mean   :1.385   Mean   :0.7541  
##  3rd Qu.:1.0000   3rd Qu.:1.800   3rd Qu.:2.000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :6.200   Max.   :2.000   Max.   :4.0000  
##       thal           target      
##  Min.   :0.000   Min.   :0.0000  
##  1st Qu.:2.000   1st Qu.:0.0000  
##  Median :2.000   Median :1.0000  
##  Mean   :2.324   Mean   :0.5132  
##  3rd Qu.:3.000   3rd Qu.:1.0000  
##  Max.   :3.000   Max.   :1.0000

str(df)

## 'data.frame':    1025 obs. of  14 variables:
##  $ age     : int  52 53 70 61 62 58 58 55 46 54 ...
##  $ sex     : int  1 1 1 1 0 0 1 1 1 1 ...
##  $ cp      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ trestbps: int  125 140 145 148 138 100 114 160 120 122 ...
##  $ chol    : int  212 203 174 203 294 248 318 289 249 286 ...
##  $ fbs     : int  0 1 0 0 1 0 0 0 0 0 ...
##  $ restecg : int  1 0 1 1 1 0 2 0 0 0 ...
##  $ thalach : int  168 155 125 161 106 122 140 145 144 116 ...
##  $ exang   : int  0 1 1 0 0 0 0 1 0 1 ...
##  $ oldpeak : num  1 3.1 2.6 0 1.9 1 4.4 0.8 0.8 3.2 ...
##  $ slope   : int  2 0 0 2 1 1 0 1 2 1 ...
##  $ ca      : int  2 0 0 1 3 0 3 1 0 2 ...
##  $ thal    : int  3 3 3 3 2 2 1 3 3 2 ...
##  $ target  : int  0 0 0 0 0 1 0 0 0 0 ...

#plot_missing(df)

Partir datos en 80/20 ‹/span>

# Definir la proporción de entrenamiento (80%)
set.seed(123) 
train_indices <- sample(1:nrow(df), size = 0.8 * nrow(df))

# Crear el conjunto de entrenamiento y prueba
train_set <- df[train_indices, ]
test_set <- df[-train_indices, ]

test_set$exang <- as.factor(test_set$target)
train_set$exang <- as.factor(train_set$target)
test_set$restecg <- as.factor(test_set$target)
train_set$restecg <- as.factor(train_set$target)
test_set$ca <- as.factor(test_set$target)
train_set$ca <- as.factor(train_set$target)
test_set$slope <- as.factor(test_set$target)
train_set$slope <- as.factor(train_set$target)
test_set$age <- as.factor(test_set$target)
train_set$age <- as.factor(train_set$target)
test_set$cp <- as.factor(test_set$target)
train_set$cp <- as.factor(train_set$target)
test_set$fbs <- as.factor(test_set$target)
train_set$fbs <- as.factor(train_set$target)
train_set$sex <- as.factor(train_set$target)
test_set$sex <- as.factor(test_set$target)
train_set$target <- as.factor(train_set$target)
test_set$target <- as.factor(test_set$target)

names(df)

##  [1] "age"      "sex"      "cp"       "trestbps" "chol"     "fbs"     
##  [7] "restecg"  "thalach"  "exang"    "oldpeak"  "slope"    "ca"      
## [13] "thal"     "target"

<span style=“color: violet;”›Métodos para Modelar

Los métodos más utilizados para modelar aprendizaje automático son: SVM: Support Vector Machine o Máquina de Vectores de Soporte. Hay varios subtipos: Lineal (symLinear), Radial (svmRadial), Polinómico (symPoly), etc. * Árbol de Decisión: rpart * Redes Neuronales: nnet * Random Forest o Bosques Aleatorios: rf

<span style=“color: violet;”›Modelo con el metodo svmRadiaear

# Definir la fórmula del modelo
modelo1 <- target ~ .

# Configurar el método de entrenamiento (SVM con kernel lineal)
train_control <- trainControl(method = "none")

# Entrenar el modelo SVM con kernel lineal
set.seed(123)  # Para reproducibilidad
svm_model <- train(
  modelo1,
  data = train_set,
  method = "svmLinear",
  trControl = train_control
)

# Resumen del modelo entrenado
print(svm_model)

## Support Vector Machines with Linear Kernel 
## 
## 820 samples
##  13 predictor
##   2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: None

# Hacer predicciones en el conjunto de prueba
predicciones <- predict(svm_model, newdata = test_set)

# Ver los resultados de las predicciones
print(predicciones)

##   [1] 0 0 0 0 0 0 1 1 1 0 1 1 0 1 0 0 0 0 0 0 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1 1
##  [38] 1 1 0 1 0 1 1 1 1 1 1 0 1 1 0 0 1 0 1 1 0 1 1 0 1 1 1 0 0 1 1 1 1 1 1 0 1
##  [75] 1 1 1 0 1 0 1 1 0 0 0 0 1 1 0 1 0 1 1 1 1 0 1 1 0 1 1 0 0 1 1 0 1 1 0 0 1
## [112] 0 0 0 1 1 1 1 0 1 0 0 1 1 0 1 0 1 1 0 0 1 0 0 1 0 0 0 0 1 0 1 1 0 0 1 1 0
## [149] 0 0 1 1 0 1 1 1 1 1 0 1 0 0 1 0 0 0 0 0 0 1 1 0 0 0 0 0 1 1 0 1 0 0 0 0 1
## [186] 0 0 1 0 1 0 1 0 1 0 1 1 1 0 1 1 0 0 0 0
## Levels: 0 1

# Evaluar el rendimiento del modelo en el conjunto de prueba
matriz_confusion <- confusionMatrix(predicciones, test_set$target)
print(matriz_confusion)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0  95   0
##          1   0 110
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9822, 1)
##     No Information Rate : 0.5366     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.0000     
##             Specificity : 1.0000     
##          Pos Pred Value : 1.0000     
##          Neg Pred Value : 1.0000     
##              Prevalence : 0.4634     
##          Detection Rate : 0.4634     
##    Detection Prevalence : 0.4634     
##       Balanced Accuracy : 1.0000     
##                                      
##        'Positive' Class : 0          
##

2. Modelo con Método svmRadial

modelo5<- train(target ~ ., data=train_set,
                method = "svmRadial", #Cambiar
                preProcess=c("scale","center"),
                trControl = trainControl(method="cv", number=10),
                tuneGrid = data.frame(sigma=1,C=1) #Cambiar
                )
resultado_entrenamiento2 <- predict(modelo5,train_set)
resultado_prueba2 <- predict(modelo5,test_set)

# Matriz de Confusión del Resultado de Entrenamiento
mcre2 <- confusionMatrix(resultado_entrenamiento2, train_set$target)

# Matriz de Confusión del Resultado de Prueba
mcrp2 <- confusionMatrix(resultado_prueba2, test_set$target)

3. Modelo con Método svmPoly

modelo6<- train(target ~ ., data=train_set,
                method = "svmPoly", #Cambiar
                preProcess=c("scale","center"),
                trControl = trainControl(method="cv", number=10),
                tuneGrid = data.frame(degree=1, scale=1, C=1) #Cambiar
                )
resultado_entrenamiento3 <- predict(modelo6,train_set)
resultado_prueba3 <- predict(modelo6,test_set)

# Matriz de Confusión del Resultado de Entrenamiento
mcre3 <- confusionMatrix(resultado_entrenamiento3, train_set$target)

# Matriz de Confusión del Resultado de Prueba
mcrp3 <- confusionMatrix(resultado_prueba3, test_set$target)

<span style=“color: violet;”›Modelo con el metodo Arboles de Decisión

# Definir la fórmula del modelo
metodo2 <- target ~ .

# Entrenar el modelo de árbol de decisión usando rpart
set.seed(123)  # Para reproducibilidad
arbol_model <- rpart(metodo2, data = train_set, method = "class")

# Resumen del modelo entrenado
print(arbol_model)

## n= 820 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 820 404 1 (0.4926829 0.5073171)  
##   2) age=0 404   0 0 (1.0000000 0.0000000) *
##   3) age=1 416   0 1 (0.0000000 1.0000000) *

# Visualizar el árbol de decisión
plot(arbol_model)
text(arbol_model, use.n = TRUE)

# Hacer predicciones en el conjunto de prueba
predicciones <- predict(arbol_model, newdata = test_set, type = "class")

# Evaluar el rendimiento del modelo en el conjunto de prueba
matriz_confusion <- confusionMatrix(predicciones, test_set$target)
print(matriz_confusion)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0  95   0
##          1   0 110
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9822, 1)
##     No Information Rate : 0.5366     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.0000     
##             Specificity : 1.0000     
##          Pos Pred Value : 1.0000     
##          Neg Pred Value : 1.0000     
##              Prevalence : 0.4634     
##          Detection Rate : 0.4634     
##    Detection Prevalence : 0.4634     
##       Balanced Accuracy : 1.0000     
##                                      
##        'Positive' Class : 0          
##

<span style=“color: violet;”›Modelo con el metodo Redes Neuronales

# Definir la fórmula del modelo
modelo3 <- target ~ .

# Entrenar el modelo de redes neuronales usando nnet
set.seed(123)  # Para reproducibilidad
nnet_model <- train(
  modelo3,
  data = train_set,
  method = "nnet",
  trControl = trainControl(method = "none"),
  trace = FALSE,
  linout = FALSE,
  maxit = 200  # Número máximo de iteraciones
)

# Resumen del modelo entrenado
print(nnet_model)

## Neural Network 
## 
## 820 samples
##  13 predictor
##   2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: None

# Hacer predicciones en el conjunto de prueba
predicciones <- predict(nnet_model, newdata = test_set)

# Evaluar el rendimiento del modelo en el conjunto de prueba
matriz_confusion <- confusionMatrix(predicciones, test_set$target)
print(matriz_confusion)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0   0   0
##          1  95 110
##                                           
##                Accuracy : 0.5366          
##                  95% CI : (0.4658, 0.6063)
##     No Information Rate : 0.5366          
##     P-Value [Acc > NIR] : 0.5286          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.0000          
##             Specificity : 1.0000          
##          Pos Pred Value :    NaN          
##          Neg Pred Value : 0.5366          
##              Prevalence : 0.4634          
##          Detection Rate : 0.0000          
##    Detection Prevalence : 0.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : 0               
##

<span style=“color: violet;”›Modelo con el metodo Random Forest

# Definir la fórmula del modelo
modelo4 <- target ~ .

# Entrenar el modelo Random Forest
set.seed(123)  # Para reproducibilidad
rf_model <- randomForest(modelo4, data = train_set, ntree = 500, mtry = 3)

# Resumen del modelo entrenado
print(rf_model)

## 
## Call:
##  randomForest(formula = modelo4, data = train_set, ntree = 500,      mtry = 3) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 0%
## Confusion matrix:
##     0   1 class.error
## 0 404   0           0
## 1   0 416           0

# Hacer predicciones en el conjunto de prueba
predicciones <- predict(rf_model, newdata = test_set)

# Evaluar el rendimiento del modelo en el conjunto de prueba
matriz_confusion <- confusionMatrix(predicciones, test_set$target)
print(matriz_confusion)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0  95   0
##          1   0 110
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9822, 1)
##     No Information Rate : 0.5366     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.0000     
##             Specificity : 1.0000     
##          Pos Pred Value : 1.0000     
##          Neg Pred Value : 1.0000     
##              Prevalence : 0.4634     
##          Detection Rate : 0.4634     
##    Detection Prevalence : 0.4634     
##       Balanced Accuracy : 1.0000     
##                                      
##        'Positive' Class : 0          
##

<span style=“color: violet;”›Comparacion de matrices

# Supongamos que ya tienes los modelos entrenados y sus predicciones:
# svm_model, arbol_model, nnet_model, rf_model
# Y sus predicciones:
# pred_svm, pred_arbol, pred_nnet, pred_rf

# Crear una lista para almacenar las matrices de confusión
confusion_matrices <- list()

# Calcular la matriz de confusión para cada modelo
confusion_matrices$SVM <- confusionMatrix(predict(svm_model, newdata = test_set), test_set$target)
confusion_matrices$Decision_Tree <- confusionMatrix(predict(arbol_model, newdata = test_set, type = "class"), test_set$target)
confusion_matrices$Neural_Network <- confusionMatrix(predict(nnet_model, newdata = test_set), test_set$target)
confusion_matrices$Random_Forest <- confusionMatrix(predict(rf_model, newdata = test_set), test_set$target)

# Mostrar las matrices de confusión
print(confusion_matrices)

## $SVM
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0  95   0
##          1   0 110
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9822, 1)
##     No Information Rate : 0.5366     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.0000     
##             Specificity : 1.0000     
##          Pos Pred Value : 1.0000     
##          Neg Pred Value : 1.0000     
##              Prevalence : 0.4634     
##          Detection Rate : 0.4634     
##    Detection Prevalence : 0.4634     
##       Balanced Accuracy : 1.0000     
##                                      
##        'Positive' Class : 0          
##                                      
## 
## $Decision_Tree
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0  95   0
##          1   0 110
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9822, 1)
##     No Information Rate : 0.5366     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.0000     
##             Specificity : 1.0000     
##          Pos Pred Value : 1.0000     
##          Neg Pred Value : 1.0000     
##              Prevalence : 0.4634     
##          Detection Rate : 0.4634     
##    Detection Prevalence : 0.4634     
##       Balanced Accuracy : 1.0000     
##                                      
##        'Positive' Class : 0          
##                                      
## 
## $Neural_Network
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0   0   0
##          1  95 110
##                                           
##                Accuracy : 0.5366          
##                  95% CI : (0.4658, 0.6063)
##     No Information Rate : 0.5366          
##     P-Value [Acc > NIR] : 0.5286          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.0000          
##             Specificity : 1.0000          
##          Pos Pred Value :    NaN          
##          Neg Pred Value : 0.5366          
##              Prevalence : 0.4634          
##          Detection Rate : 0.0000          
##    Detection Prevalence : 0.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : 0               
##                                           
## 
## $Random_Forest
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0  95   0
##          1   0 110
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9822, 1)
##     No Information Rate : 0.5366     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.0000     
##             Specificity : 1.0000     
##          Pos Pred Value : 1.0000     
##          Neg Pred Value : 1.0000     
##              Prevalence : 0.4634     
##          Detection Rate : 0.4634     
##    Detection Prevalence : 0.4634     
##       Balanced Accuracy : 1.0000     
##                                      
##        'Positive' Class : 0          
##

# Comparar las métricas de los modelos
comparison <- data.frame(
  Modelo = c("SVM", "Decision Tree", "Neural Network", "Random Forest"),
  Accuracy = c(confusion_matrices$SVM$overall['Accuracy'],
               confusion_matrices$Decision_Tree$overall['Accuracy'],
               confusion_matrices$Neural_Network$overall['Accuracy'],
               confusion_matrices$Random_Forest$overall['Accuracy']),
  Kappa = c(confusion_matrices$SVM$overall['Kappa'],
            confusion_matrices$Decision_Tree$overall['Kappa'],
            confusion_matrices$Neural_Network$overall['Kappa'],
            confusion_matrices$Random_Forest$overall['Kappa'])
)

# Mostrar la comparación de métricas
print(comparison)

##           Modelo  Accuracy Kappa
## 1            SVM 1.0000000     1
## 2  Decision Tree 1.0000000     1
## 3 Neural Network 0.5365854     0
## 4  Random Forest 1.0000000     1

<span style=“color: violet;”›Conclusiones

Aunque tres de los modelos parecen dar buenos resultados, es posible que estén ocultando problemas como sobreajuste o que el conjunto de prueba no sea representativo. En cuanto a la red neuronal, su desempeño es claramente insuficiente y probablemente necesite más ajustes.

CARET HEART

Karen Maldonado

2024-08-21