Teoría

El paquete CARET (Classification And Regression Training) es un paquete integral con una amplia variedad de algoritmos para el aprendizaje automatico

Instalar paquetes y llamar librerias

#install.packages("ggplot2")
library(ggplot2)
#install.packages("lattice")
library(lattice)
#install.packages("caret")
library(caret)
#install.packages("datasets")
library(datasets)
#install.packages("DataExplorer")
library(DataExplorer)

Crear la base de datos

df <- data.frame(iris)

Entender la base de datos

summary(df)
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
## 
str(df)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
#create_report(df)
plot_missing(df)

plot_histogram(df)

plot_correlation(df)

# Partir la base de datos

# Nromalmente 80-20
set.seed(123)
renglones_entrenamiento <- createDataPartition(df$Species, p=0.8, list=FALSE)
entrenamiento <- iris[renglones_entrenamiento, ]
prueba <- iris[-renglones_entrenamiento, ]

Distintos tipos de Métodos para modelar

los metodos mas utilizados para modelar aprendizaje automatico son:

  • SVM: Support Vector Machine o Máquina de Vectores de Soporte. Hay varios subtipos: Lineal (svmLinear), Radial(svmRadial), Polinómico (svmPoly), etc.
  • Árbol de Decisión: rpart
  • Redes neuronales nnet
  • Random Forest o Bosques Aleatorios: rf

Modelo 1. SVM Lineal

# Paquetes
library(caret)
library(e1071)  # para métricas en confusionMatrix

set.seed(123)

# Control de validación cruzada
ctrl <- trainControl(method = "cv", number = 10)

# Rejilla de hiperparámetros (C)
grid <- data.frame(C = 1)

# Entrenamiento SVM lineal
modelo1 <- train(
  Species ~ .,
  data = entrenamiento,
  method = "svmLinear",
  preProcess = c("center", "scale"),
  trControl = ctrl,
  tuneGrid = grid
)

# Predicciones
resultado_entrenamiento1 <- predict(modelo1, entrenamiento)
resultado_prueba1 <- predict(modelo1, prueba)

# Matriz de confusión - entrenamiento
mcre1 <- confusionMatrix(resultado_entrenamiento1, entrenamiento$Species)
mcre1
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   setosa versicolor virginica
##   setosa         40          0         0
##   versicolor      0         39         0
##   virginica       0          1        40
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9917          
##                  95% CI : (0.9544, 0.9998)
##     No Information Rate : 0.3333          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9875          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            0.9750           1.0000
## Specificity                 1.0000            1.0000           0.9875
## Pos Pred Value              1.0000            1.0000           0.9756
## Neg Pred Value              1.0000            0.9877           1.0000
## Prevalence                  0.3333            0.3333           0.3333
## Detection Rate              0.3333            0.3250           0.3333
## Detection Prevalence        0.3333            0.3250           0.3417
## Balanced Accuracy           1.0000            0.9875           0.9938
# Matriz de confusión - prueba
mcrp1 <- confusionMatrix(resultado_prueba1, prueba$Species)
mcrp1
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   setosa versicolor virginica
##   setosa         10          0         0
##   versicolor      0         10         1
##   virginica       0          0         9
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9667          
##                  95% CI : (0.8278, 0.9992)
##     No Information Rate : 0.3333          
##     P-Value [Acc > NIR] : 2.963e-13       
##                                           
##                   Kappa : 0.95            
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            1.0000           0.9000
## Specificity                 1.0000            0.9500           1.0000
## Pos Pred Value              1.0000            0.9091           1.0000
## Neg Pred Value              1.0000            1.0000           0.9524
## Prevalence                  0.3333            0.3333           0.3333
## Detection Rate              0.3333            0.3333           0.3000
## Detection Prevalence        0.3333            0.3667           0.3000
## Balanced Accuracy           1.0000            0.9750           0.9500

Modelo 2. SVM Radial

# Paquetes
library(caret)
library(e1071)  # para métricas en confusionMatrix

set.seed(123)

# Control de validación cruzada
ctrl <- trainControl(method = "cv", number = 10)

# Rejilla de hiperparámetros (C)
grid <- data.frame(sigma=1, C = 1) #Cambiar

# Entrenamiento SVM lineal
modelo2 <- train(
  Species ~ .,
  data = entrenamiento,
  method = "svmRadial",
  preProcess = c("center", "scale"),
  trControl = ctrl,
  tuneGrid = grid
)

# Predicciones
resultado_entrenamiento2 <- predict(modelo2, entrenamiento)
resultado_prueba2 <- predict(modelo2, prueba)

# Matriz de confusión - entrenamiento
mcre2 <- confusionMatrix(resultado_entrenamiento2, entrenamiento$Species)
mcre2
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   setosa versicolor virginica
##   setosa         40          0         0
##   versicolor      0         39         0
##   virginica       0          1        40
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9917          
##                  95% CI : (0.9544, 0.9998)
##     No Information Rate : 0.3333          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9875          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            0.9750           1.0000
## Specificity                 1.0000            1.0000           0.9875
## Pos Pred Value              1.0000            1.0000           0.9756
## Neg Pred Value              1.0000            0.9877           1.0000
## Prevalence                  0.3333            0.3333           0.3333
## Detection Rate              0.3333            0.3250           0.3333
## Detection Prevalence        0.3333            0.3250           0.3417
## Balanced Accuracy           1.0000            0.9875           0.9938
# Matriz de confusión - prueba
mcrp2 <- confusionMatrix(resultado_prueba2, prueba$Species)
mcrp2
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   setosa versicolor virginica
##   setosa         10          0         0
##   versicolor      0         10         2
##   virginica       0          0         8
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9333          
##                  95% CI : (0.7793, 0.9918)
##     No Information Rate : 0.3333          
##     P-Value [Acc > NIR] : 8.747e-12       
##                                           
##                   Kappa : 0.9             
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            1.0000           0.8000
## Specificity                 1.0000            0.9000           1.0000
## Pos Pred Value              1.0000            0.8333           1.0000
## Neg Pred Value              1.0000            1.0000           0.9091
## Prevalence                  0.3333            0.3333           0.3333
## Detection Rate              0.3333            0.3333           0.2667
## Detection Prevalence        0.3333            0.4000           0.2667
## Balanced Accuracy           1.0000            0.9500           0.9000

Modelo 3. SVM Polinómico

# Paquetes
library(caret)
library(e1071)  # para métricas en confusionMatrix

set.seed(123)

# Control de validación cruzada
ctrl <- trainControl(method = "cv", number = 10)

# Rejilla de hiperparámetros (C)
grid <- data.frame(degree=1, scale=1, C = 1) #Cambiar

# Entrenamiento SVM lineal
modelo3 <- train(
  Species ~ .,
  data = entrenamiento,
  method = "svmPoly",
  preProcess = c("center", "scale"),
  trControl = ctrl,
  tuneGrid = grid
)

# Predicciones
resultado_entrenamiento3 <- predict(modelo3, entrenamiento)
resultado_prueba3 <- predict(modelo3, prueba)

# Matriz de confusión - entrenamiento
mcre3 <- confusionMatrix(resultado_entrenamiento3, entrenamiento$Species)
mcre3
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   setosa versicolor virginica
##   setosa         40          0         0
##   versicolor      0         39         0
##   virginica       0          1        40
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9917          
##                  95% CI : (0.9544, 0.9998)
##     No Information Rate : 0.3333          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9875          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            0.9750           1.0000
## Specificity                 1.0000            1.0000           0.9875
## Pos Pred Value              1.0000            1.0000           0.9756
## Neg Pred Value              1.0000            0.9877           1.0000
## Prevalence                  0.3333            0.3333           0.3333
## Detection Rate              0.3333            0.3250           0.3333
## Detection Prevalence        0.3333            0.3250           0.3417
## Balanced Accuracy           1.0000            0.9875           0.9938
# Matriz de confusión - prueba
mcrp3 <- confusionMatrix(resultado_prueba3, prueba$Species)
mcrp3
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   setosa versicolor virginica
##   setosa         10          0         0
##   versicolor      0         10         1
##   virginica       0          0         9
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9667          
##                  95% CI : (0.8278, 0.9992)
##     No Information Rate : 0.3333          
##     P-Value [Acc > NIR] : 2.963e-13       
##                                           
##                   Kappa : 0.95            
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            1.0000           0.9000
## Specificity                 1.0000            0.9500           1.0000
## Pos Pred Value              1.0000            0.9091           1.0000
## Neg Pred Value              1.0000            1.0000           0.9524
## Prevalence                  0.3333            0.3333           0.3333
## Detection Rate              0.3333            0.3333           0.3000
## Detection Prevalence        0.3333            0.3667           0.3000
## Balanced Accuracy           1.0000            0.9750           0.9500

Modelo 4. Arbol de decision

# Paquetes
library(caret)
library(e1071)  # para métricas en confusionMatrix

set.seed(123)

# Control de validación cruzada
ctrl <- trainControl(method = "cv", number = 10)

# Rejilla de hiperparámetros (C)
grid <- data.frame(degree=1, scale=1, C = 1) #Cambiar

# Entrenamiento SVM lineal
modelo4 <- train(
  Species ~ .,
  data = entrenamiento,
  method = "svmPoly",
  preProcess = c("center", "scale"),
  trControl = ctrl,
  tuneGrid = grid
)

# Predicciones
resultado_entrenamiento4 <- predict(modelo4, entrenamiento)
resultado_prueba4 <- predict(modelo4, prueba)

# Matriz de confusión - entrenamiento
mcre4 <- confusionMatrix(resultado_entrenamiento4, entrenamiento$Species)
mcre4
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   setosa versicolor virginica
##   setosa         40          0         0
##   versicolor      0         39         0
##   virginica       0          1        40
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9917          
##                  95% CI : (0.9544, 0.9998)
##     No Information Rate : 0.3333          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9875          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            0.9750           1.0000
## Specificity                 1.0000            1.0000           0.9875
## Pos Pred Value              1.0000            1.0000           0.9756
## Neg Pred Value              1.0000            0.9877           1.0000
## Prevalence                  0.3333            0.3333           0.3333
## Detection Rate              0.3333            0.3250           0.3333
## Detection Prevalence        0.3333            0.3250           0.3417
## Balanced Accuracy           1.0000            0.9875           0.9938
# Matriz de confusión - prueba
mcrp4 <- confusionMatrix(resultado_prueba4, prueba$Species)
mcrp4
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   setosa versicolor virginica
##   setosa         10          0         0
##   versicolor      0         10         1
##   virginica       0          0         9
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9667          
##                  95% CI : (0.8278, 0.9992)
##     No Information Rate : 0.3333          
##     P-Value [Acc > NIR] : 2.963e-13       
##                                           
##                   Kappa : 0.95            
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            1.0000           0.9000
## Specificity                 1.0000            0.9500           1.0000
## Pos Pred Value              1.0000            0.9091           1.0000
## Neg Pred Value              1.0000            1.0000           0.9524
## Prevalence                  0.3333            0.3333           0.3333
## Detection Rate              0.3333            0.3333           0.3000
## Detection Prevalence        0.3333            0.3667           0.3000
## Balanced Accuracy           1.0000            0.9750           0.9500

Modelo 5. Redes neuronales

# Paquetes
library(caret)
library(e1071)  # para métricas en confusionMatrix

set.seed(123)

# Control de validación cruzada
ctrl <- trainControl(method = "cv", number = 10)

# Rejilla de hiperparámetros (C)
grid <- data.frame(degree=1, scale=1, C = 1) #Cambiar

# Entrenamiento SVM lineal
modelo5 <- train(
  Species ~ .,
  data = entrenamiento,
  method = "svmPoly",
  preProcess = c("center", "scale"),
  trControl = ctrl,
  tuneGrid = grid
)

# Predicciones
resultado_entrenamiento5 <- predict(modelo5, entrenamiento)
resultado_prueba5 <- predict(modelo5, prueba)

# Matriz de confusión - entrenamiento
mcre5 <- confusionMatrix(resultado_entrenamiento5, entrenamiento$Species)
mcre5
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   setosa versicolor virginica
##   setosa         40          0         0
##   versicolor      0         39         0
##   virginica       0          1        40
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9917          
##                  95% CI : (0.9544, 0.9998)
##     No Information Rate : 0.3333          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9875          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            0.9750           1.0000
## Specificity                 1.0000            1.0000           0.9875
## Pos Pred Value              1.0000            1.0000           0.9756
## Neg Pred Value              1.0000            0.9877           1.0000
## Prevalence                  0.3333            0.3333           0.3333
## Detection Rate              0.3333            0.3250           0.3333
## Detection Prevalence        0.3333            0.3250           0.3417
## Balanced Accuracy           1.0000            0.9875           0.9938
# Matriz de confusión - prueba
mcrp5 <- confusionMatrix(resultado_prueba5, prueba$Species)
mcrp5
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   setosa versicolor virginica
##   setosa         10          0         0
##   versicolor      0         10         1
##   virginica       0          0         9
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9667          
##                  95% CI : (0.8278, 0.9992)
##     No Information Rate : 0.3333          
##     P-Value [Acc > NIR] : 2.963e-13       
##                                           
##                   Kappa : 0.95            
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            1.0000           0.9000
## Specificity                 1.0000            0.9500           1.0000
## Pos Pred Value              1.0000            0.9091           1.0000
## Neg Pred Value              1.0000            1.0000           0.9524
## Prevalence                  0.3333            0.3333           0.3333
## Detection Rate              0.3333            0.3333           0.3000
## Detection Prevalence        0.3333            0.3667           0.3000
## Balanced Accuracy           1.0000            0.9750           0.9500

Modelo 6. bosques aleatorios

# Paquetes
library(caret)
library(e1071)

set.seed(123)

# Control CV
ctrl <- trainControl(method = "cv", number = 10)

# Rejilla de hiperparámetros (mtry = número de variables por split)
grid <- expand.grid(mtry = c(2, 3, 4))

# Entrenamiento Random Forest
modelo6 <- train(
  Species ~ .,
  data = entrenamiento,
  method = "rf",
  trControl = ctrl,
  tuneGrid = grid
)

# Predicciones
resultado_entrenamiento6 <- predict(modelo6, entrenamiento)
resultado_prueba6 <- predict(modelo6, prueba)

# Matrices de confusión
mcre6 <- confusionMatrix(resultado_entrenamiento6, entrenamiento$Species)
mcre6
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   setosa versicolor virginica
##   setosa         40          0         0
##   versicolor      0         40         0
##   virginica       0          0        40
## 
## Overall Statistics
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9697, 1)
##     No Information Rate : 0.3333     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            1.0000           1.0000
## Specificity                 1.0000            1.0000           1.0000
## Pos Pred Value              1.0000            1.0000           1.0000
## Neg Pred Value              1.0000            1.0000           1.0000
## Prevalence                  0.3333            0.3333           0.3333
## Detection Rate              0.3333            0.3333           0.3333
## Detection Prevalence        0.3333            0.3333           0.3333
## Balanced Accuracy           1.0000            1.0000           1.0000
mcrp6 <- confusionMatrix(resultado_prueba6, prueba$Species)
mcrp6
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   setosa versicolor virginica
##   setosa         10          0         0
##   versicolor      0         10         2
##   virginica       0          0         8
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9333          
##                  95% CI : (0.7793, 0.9918)
##     No Information Rate : 0.3333          
##     P-Value [Acc > NIR] : 8.747e-12       
##                                           
##                   Kappa : 0.9             
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            1.0000           0.8000
## Specificity                 1.0000            0.9000           1.0000
## Pos Pred Value              1.0000            0.8333           1.0000
## Neg Pred Value              1.0000            1.0000           0.9091
## Prevalence                  0.3333            0.3333           0.3333
## Detection Rate              0.3333            0.3333           0.2667
## Detection Prevalence        0.3333            0.4000           0.2667
## Balanced Accuracy           1.0000            0.9500           0.9000

Tabla de resultados

resultados <- data.frame(
  svmLinear = c(mcre1$overall["Accuracy"], mcrp1$overall["Accuracy"]),
  svmRadial = c(mcre2$overall["Accuracy"], mcrp2$overall["Accuracy"]),
  svmPoly   = c(mcre3$overall["Accuracy"], mcrp3$overall["Accuracy"]),
  rpart     = c(mcre4$overall["Accuracy"], mcrp4$overall["Accuracy"]),
  nnet      = c(mcre5$overall["Accuracy"], mcrp5$overall["Accuracy"]),
  rf        = c(mcre6$overall["Accuracy"], mcrp6$overall["Accuracy"])
)

rownames(resultados) <- c("Precision de entrenamiento","Precision de Prueba")
resultados
##                            svmLinear svmRadial   svmPoly     rpart      nnet
## Precision de entrenamiento 0.9916667 0.9916667 0.9916667 0.9916667 0.9916667
## Precision de Prueba        0.9666667 0.9333333 0.9666667 0.9666667 0.9666667
##                                   rf
## Precision de entrenamiento 1.0000000
## Precision de Prueba        0.9333333