El paquete CARET (Classification And Regression Training) es un paquete integral con una amplia variedad de algoritmos para el aprendizaje automatico
#install.packages("ggplot2")
library(ggplot2)
#install.packages("lattice")
library(lattice)
#install.packages("caret")
library(caret)
#install.packages("datasets")
library(datasets)
#install.packages("DataExplorer")
library(DataExplorer)
df <- data.frame(iris)
summary(df)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
str(df)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
#create_report(df)
plot_missing(df)
plot_histogram(df)
plot_correlation(df)
# Partir la base de datos
# Nromalmente 80-20
set.seed(123)
renglones_entrenamiento <- createDataPartition(df$Species, p=0.8, list=FALSE)
entrenamiento <- iris[renglones_entrenamiento, ]
prueba <- iris[-renglones_entrenamiento, ]
los metodos mas utilizados para modelar aprendizaje automatico son:
# Paquetes
library(caret)
library(e1071) # para métricas en confusionMatrix
set.seed(123)
# Control de validación cruzada
ctrl <- trainControl(method = "cv", number = 10)
# Rejilla de hiperparámetros (C)
grid <- data.frame(C = 1)
# Entrenamiento SVM lineal
modelo1 <- train(
Species ~ .,
data = entrenamiento,
method = "svmLinear",
preProcess = c("center", "scale"),
trControl = ctrl,
tuneGrid = grid
)
# Predicciones
resultado_entrenamiento1 <- predict(modelo1, entrenamiento)
resultado_prueba1 <- predict(modelo1, prueba)
# Matriz de confusión - entrenamiento
mcre1 <- confusionMatrix(resultado_entrenamiento1, entrenamiento$Species)
mcre1
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 40 0 0
## versicolor 0 39 0
## virginica 0 1 40
##
## Overall Statistics
##
## Accuracy : 0.9917
## 95% CI : (0.9544, 0.9998)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9875
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9750 1.0000
## Specificity 1.0000 1.0000 0.9875
## Pos Pred Value 1.0000 1.0000 0.9756
## Neg Pred Value 1.0000 0.9877 1.0000
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3250 0.3333
## Detection Prevalence 0.3333 0.3250 0.3417
## Balanced Accuracy 1.0000 0.9875 0.9938
# Matriz de confusión - prueba
mcrp1 <- confusionMatrix(resultado_prueba1, prueba$Species)
mcrp1
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 10 0 0
## versicolor 0 10 1
## virginica 0 0 9
##
## Overall Statistics
##
## Accuracy : 0.9667
## 95% CI : (0.8278, 0.9992)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 2.963e-13
##
## Kappa : 0.95
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 1.0000 0.9000
## Specificity 1.0000 0.9500 1.0000
## Pos Pred Value 1.0000 0.9091 1.0000
## Neg Pred Value 1.0000 1.0000 0.9524
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3333 0.3000
## Detection Prevalence 0.3333 0.3667 0.3000
## Balanced Accuracy 1.0000 0.9750 0.9500
# Paquetes
library(caret)
library(e1071) # para métricas en confusionMatrix
set.seed(123)
# Control de validación cruzada
ctrl <- trainControl(method = "cv", number = 10)
# Rejilla de hiperparámetros (C)
grid <- data.frame(sigma=1, C = 1) #Cambiar
# Entrenamiento SVM lineal
modelo2 <- train(
Species ~ .,
data = entrenamiento,
method = "svmRadial",
preProcess = c("center", "scale"),
trControl = ctrl,
tuneGrid = grid
)
# Predicciones
resultado_entrenamiento2 <- predict(modelo2, entrenamiento)
resultado_prueba2 <- predict(modelo2, prueba)
# Matriz de confusión - entrenamiento
mcre2 <- confusionMatrix(resultado_entrenamiento2, entrenamiento$Species)
mcre2
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 40 0 0
## versicolor 0 39 0
## virginica 0 1 40
##
## Overall Statistics
##
## Accuracy : 0.9917
## 95% CI : (0.9544, 0.9998)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9875
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9750 1.0000
## Specificity 1.0000 1.0000 0.9875
## Pos Pred Value 1.0000 1.0000 0.9756
## Neg Pred Value 1.0000 0.9877 1.0000
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3250 0.3333
## Detection Prevalence 0.3333 0.3250 0.3417
## Balanced Accuracy 1.0000 0.9875 0.9938
# Matriz de confusión - prueba
mcrp2 <- confusionMatrix(resultado_prueba2, prueba$Species)
mcrp2
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 10 0 0
## versicolor 0 10 2
## virginica 0 0 8
##
## Overall Statistics
##
## Accuracy : 0.9333
## 95% CI : (0.7793, 0.9918)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 8.747e-12
##
## Kappa : 0.9
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 1.0000 0.8000
## Specificity 1.0000 0.9000 1.0000
## Pos Pred Value 1.0000 0.8333 1.0000
## Neg Pred Value 1.0000 1.0000 0.9091
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3333 0.2667
## Detection Prevalence 0.3333 0.4000 0.2667
## Balanced Accuracy 1.0000 0.9500 0.9000
# Paquetes
library(caret)
library(e1071) # para métricas en confusionMatrix
set.seed(123)
# Control de validación cruzada
ctrl <- trainControl(method = "cv", number = 10)
# Rejilla de hiperparámetros (C)
grid <- data.frame(degree=1, scale=1, C = 1) #Cambiar
# Entrenamiento SVM lineal
modelo3 <- train(
Species ~ .,
data = entrenamiento,
method = "svmPoly",
preProcess = c("center", "scale"),
trControl = ctrl,
tuneGrid = grid
)
# Predicciones
resultado_entrenamiento3 <- predict(modelo3, entrenamiento)
resultado_prueba3 <- predict(modelo3, prueba)
# Matriz de confusión - entrenamiento
mcre3 <- confusionMatrix(resultado_entrenamiento3, entrenamiento$Species)
mcre3
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 40 0 0
## versicolor 0 39 0
## virginica 0 1 40
##
## Overall Statistics
##
## Accuracy : 0.9917
## 95% CI : (0.9544, 0.9998)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9875
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9750 1.0000
## Specificity 1.0000 1.0000 0.9875
## Pos Pred Value 1.0000 1.0000 0.9756
## Neg Pred Value 1.0000 0.9877 1.0000
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3250 0.3333
## Detection Prevalence 0.3333 0.3250 0.3417
## Balanced Accuracy 1.0000 0.9875 0.9938
# Matriz de confusión - prueba
mcrp3 <- confusionMatrix(resultado_prueba3, prueba$Species)
mcrp3
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 10 0 0
## versicolor 0 10 1
## virginica 0 0 9
##
## Overall Statistics
##
## Accuracy : 0.9667
## 95% CI : (0.8278, 0.9992)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 2.963e-13
##
## Kappa : 0.95
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 1.0000 0.9000
## Specificity 1.0000 0.9500 1.0000
## Pos Pred Value 1.0000 0.9091 1.0000
## Neg Pred Value 1.0000 1.0000 0.9524
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3333 0.3000
## Detection Prevalence 0.3333 0.3667 0.3000
## Balanced Accuracy 1.0000 0.9750 0.9500
# Paquetes
library(caret)
library(e1071) # para métricas en confusionMatrix
set.seed(123)
# Control de validación cruzada
ctrl <- trainControl(method = "cv", number = 10)
# Rejilla de hiperparámetros (C)
grid <- data.frame(degree=1, scale=1, C = 1) #Cambiar
# Entrenamiento SVM lineal
modelo4 <- train(
Species ~ .,
data = entrenamiento,
method = "svmPoly",
preProcess = c("center", "scale"),
trControl = ctrl,
tuneGrid = grid
)
# Predicciones
resultado_entrenamiento4 <- predict(modelo4, entrenamiento)
resultado_prueba4 <- predict(modelo4, prueba)
# Matriz de confusión - entrenamiento
mcre4 <- confusionMatrix(resultado_entrenamiento4, entrenamiento$Species)
mcre4
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 40 0 0
## versicolor 0 39 0
## virginica 0 1 40
##
## Overall Statistics
##
## Accuracy : 0.9917
## 95% CI : (0.9544, 0.9998)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9875
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9750 1.0000
## Specificity 1.0000 1.0000 0.9875
## Pos Pred Value 1.0000 1.0000 0.9756
## Neg Pred Value 1.0000 0.9877 1.0000
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3250 0.3333
## Detection Prevalence 0.3333 0.3250 0.3417
## Balanced Accuracy 1.0000 0.9875 0.9938
# Matriz de confusión - prueba
mcrp4 <- confusionMatrix(resultado_prueba4, prueba$Species)
mcrp4
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 10 0 0
## versicolor 0 10 1
## virginica 0 0 9
##
## Overall Statistics
##
## Accuracy : 0.9667
## 95% CI : (0.8278, 0.9992)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 2.963e-13
##
## Kappa : 0.95
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 1.0000 0.9000
## Specificity 1.0000 0.9500 1.0000
## Pos Pred Value 1.0000 0.9091 1.0000
## Neg Pred Value 1.0000 1.0000 0.9524
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3333 0.3000
## Detection Prevalence 0.3333 0.3667 0.3000
## Balanced Accuracy 1.0000 0.9750 0.9500
# Paquetes
library(caret)
library(e1071) # para métricas en confusionMatrix
set.seed(123)
# Control de validación cruzada
ctrl <- trainControl(method = "cv", number = 10)
# Rejilla de hiperparámetros (C)
grid <- data.frame(degree=1, scale=1, C = 1) #Cambiar
# Entrenamiento SVM lineal
modelo5 <- train(
Species ~ .,
data = entrenamiento,
method = "svmPoly",
preProcess = c("center", "scale"),
trControl = ctrl,
tuneGrid = grid
)
# Predicciones
resultado_entrenamiento5 <- predict(modelo5, entrenamiento)
resultado_prueba5 <- predict(modelo5, prueba)
# Matriz de confusión - entrenamiento
mcre5 <- confusionMatrix(resultado_entrenamiento5, entrenamiento$Species)
mcre5
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 40 0 0
## versicolor 0 39 0
## virginica 0 1 40
##
## Overall Statistics
##
## Accuracy : 0.9917
## 95% CI : (0.9544, 0.9998)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9875
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9750 1.0000
## Specificity 1.0000 1.0000 0.9875
## Pos Pred Value 1.0000 1.0000 0.9756
## Neg Pred Value 1.0000 0.9877 1.0000
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3250 0.3333
## Detection Prevalence 0.3333 0.3250 0.3417
## Balanced Accuracy 1.0000 0.9875 0.9938
# Matriz de confusión - prueba
mcrp5 <- confusionMatrix(resultado_prueba5, prueba$Species)
mcrp5
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 10 0 0
## versicolor 0 10 1
## virginica 0 0 9
##
## Overall Statistics
##
## Accuracy : 0.9667
## 95% CI : (0.8278, 0.9992)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 2.963e-13
##
## Kappa : 0.95
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 1.0000 0.9000
## Specificity 1.0000 0.9500 1.0000
## Pos Pred Value 1.0000 0.9091 1.0000
## Neg Pred Value 1.0000 1.0000 0.9524
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3333 0.3000
## Detection Prevalence 0.3333 0.3667 0.3000
## Balanced Accuracy 1.0000 0.9750 0.9500
# Paquetes
library(caret)
library(e1071)
set.seed(123)
# Control CV
ctrl <- trainControl(method = "cv", number = 10)
# Rejilla de hiperparámetros (mtry = número de variables por split)
grid <- expand.grid(mtry = c(2, 3, 4))
# Entrenamiento Random Forest
modelo6 <- train(
Species ~ .,
data = entrenamiento,
method = "rf",
trControl = ctrl,
tuneGrid = grid
)
# Predicciones
resultado_entrenamiento6 <- predict(modelo6, entrenamiento)
resultado_prueba6 <- predict(modelo6, prueba)
# Matrices de confusión
mcre6 <- confusionMatrix(resultado_entrenamiento6, entrenamiento$Species)
mcre6
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 40 0 0
## versicolor 0 40 0
## virginica 0 0 40
##
## Overall Statistics
##
## Accuracy : 1
## 95% CI : (0.9697, 1)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 1.0000 1.0000
## Specificity 1.0000 1.0000 1.0000
## Pos Pred Value 1.0000 1.0000 1.0000
## Neg Pred Value 1.0000 1.0000 1.0000
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3333 0.3333
## Detection Prevalence 0.3333 0.3333 0.3333
## Balanced Accuracy 1.0000 1.0000 1.0000
mcrp6 <- confusionMatrix(resultado_prueba6, prueba$Species)
mcrp6
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 10 0 0
## versicolor 0 10 2
## virginica 0 0 8
##
## Overall Statistics
##
## Accuracy : 0.9333
## 95% CI : (0.7793, 0.9918)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 8.747e-12
##
## Kappa : 0.9
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 1.0000 0.8000
## Specificity 1.0000 0.9000 1.0000
## Pos Pred Value 1.0000 0.8333 1.0000
## Neg Pred Value 1.0000 1.0000 0.9091
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3333 0.2667
## Detection Prevalence 0.3333 0.4000 0.2667
## Balanced Accuracy 1.0000 0.9500 0.9000
resultados <- data.frame(
svmLinear = c(mcre1$overall["Accuracy"], mcrp1$overall["Accuracy"]),
svmRadial = c(mcre2$overall["Accuracy"], mcrp2$overall["Accuracy"]),
svmPoly = c(mcre3$overall["Accuracy"], mcrp3$overall["Accuracy"]),
rpart = c(mcre4$overall["Accuracy"], mcrp4$overall["Accuracy"]),
nnet = c(mcre5$overall["Accuracy"], mcrp5$overall["Accuracy"]),
rf = c(mcre6$overall["Accuracy"], mcrp6$overall["Accuracy"])
)
rownames(resultados) <- c("Precision de entrenamiento","Precision de Prueba")
resultados
## svmLinear svmRadial svmPoly rpart nnet
## Precision de entrenamiento 0.9916667 0.9916667 0.9916667 0.9916667 0.9916667
## Precision de Prueba 0.9666667 0.9333333 0.9666667 0.9666667 0.9666667
## rf
## Precision de entrenamiento 1.0000000
## Precision de Prueba 0.9333333