El paquete caret(Classification and regression training) es una herramienta poderosa para la implementación de modelos de Machine Learning.
## Cargando paquete requerido: ggplot2
## Cargando paquete requerido: lattice
# install.packages("datasets")# Para usar la base de datos "iris"
library(datasets)
# install.packages("ggplot2")# Graficos con mejor diseño
library(ggplot2)
# install.packages("lattice")# Crear graficos
library(lattice)
# install.packages("DataExplorer")# Analisis Descriptivo
library(DataExplorer)
#install.packages("kernlab")
library(kernlab)##
## Adjuntando el paquete: 'kernlab'
## The following object is masked from 'package:ggplot2':
##
## alpha
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Adjuntando el paquete: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
** NOTA: La variable que queremos predecir debe tener formato de
FACTOR.**
Los métodos más utilizados para modelar aprendizaje automático son:
La Validación cruzada (cross validation, cv) es una técnica para evaluar el rendimiento de un modelo, dividiendo los datos en múltiples subconjuntos, permitiendo medir su capacidad de generalización y evitar sobreajuste (overfitting).
La matriz de confusión permite analizar que tan bien funciona un modelo y que tipos de errores comete. Lo que hace es comparar las predicciones del con los valores reales de la variable objetivo.
Si la precision es es muy alta en entrenamiento (95-100%), pero baja en prueba (60-70%) es una señal de sobreajuste.
modelo1 <- train(Species ~ ., data = entrenamiento,
method = "svmLinear", # Cambiar
preProcess=c("scale","center"),
trControl = trainControl(method="cv", number = 10),
tuneGrid = data.frame(C=1) #Cambiar hiperparametros
)
resultado_Entrenamiento1 <- predict(modelo1,entrenamiento)
resultado_prueba1 <- predict(modelo1, prueba)
# Matriz de Confusión del Resultado del Entrenamiento
mcre1 <- confusionMatrix(resultado_Entrenamiento1, entrenamiento$Species)
mcre1## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 40 0 0
## versicolor 0 39 0
## virginica 0 1 40
##
## Overall Statistics
##
## Accuracy : 0.9917
## 95% CI : (0.9544, 0.9998)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9875
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9750 1.0000
## Specificity 1.0000 1.0000 0.9875
## Pos Pred Value 1.0000 1.0000 0.9756
## Neg Pred Value 1.0000 0.9877 1.0000
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3250 0.3333
## Detection Prevalence 0.3333 0.3250 0.3417
## Balanced Accuracy 1.0000 0.9875 0.9938
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 10 0 0
## versicolor 0 10 1
## virginica 0 0 9
##
## Overall Statistics
##
## Accuracy : 0.9667
## 95% CI : (0.8278, 0.9992)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 2.963e-13
##
## Kappa : 0.95
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 1.0000 0.9000
## Specificity 1.0000 0.9500 1.0000
## Pos Pred Value 1.0000 0.9091 1.0000
## Neg Pred Value 1.0000 1.0000 0.9524
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3333 0.3000
## Detection Prevalence 0.3333 0.3667 0.3000
## Balanced Accuracy 1.0000 0.9750 0.9500
modelo2 <- train(Species ~ ., data = entrenamiento,
method = "svmRadial", # Cambiar
preProcess=c("scale","center"),
trControl = trainControl(method="cv", number = 10),
tuneGrid = data.frame(sigma=1 ,C=1) #Cambiar hiperparametros
)
resultado_Entrenamiento2 <- predict(modelo2,entrenamiento)
resultado_prueba2 <- predict(modelo2, prueba)
# Matriz de Confusión del Resultado del Entrenamiento
mcre2 <- confusionMatrix(resultado_Entrenamiento1, entrenamiento$Species)
mcre2## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 40 0 0
## versicolor 0 39 0
## virginica 0 1 40
##
## Overall Statistics
##
## Accuracy : 0.9917
## 95% CI : (0.9544, 0.9998)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9875
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9750 1.0000
## Specificity 1.0000 1.0000 0.9875
## Pos Pred Value 1.0000 1.0000 0.9756
## Neg Pred Value 1.0000 0.9877 1.0000
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3250 0.3333
## Detection Prevalence 0.3333 0.3250 0.3417
## Balanced Accuracy 1.0000 0.9875 0.9938
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 10 0 0
## versicolor 0 10 2
## virginica 0 0 8
##
## Overall Statistics
##
## Accuracy : 0.9333
## 95% CI : (0.7793, 0.9918)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 8.747e-12
##
## Kappa : 0.9
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 1.0000 0.8000
## Specificity 1.0000 0.9000 1.0000
## Pos Pred Value 1.0000 0.8333 1.0000
## Neg Pred Value 1.0000 1.0000 0.9091
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3333 0.2667
## Detection Prevalence 0.3333 0.4000 0.2667
## Balanced Accuracy 1.0000 0.9500 0.9000
modelo3 <- train(Species ~ ., data = entrenamiento,
method = "svmPoly", # Cambiar
preProcess=c("scale","center"),
trControl = trainControl(method="cv", number = 10),
tuneGrid = data.frame(degree =1, scale = 1, C=1) #Cambiar hiperparametros
)
resultado_Entrenamiento3 <- predict(modelo3,entrenamiento)
resultado_prueba3 <- predict(modelo3, prueba)
# Matriz de Confusión del Resultado del Entrenamiento
mcre3 <- confusionMatrix(resultado_Entrenamiento3, entrenamiento$Species)
mcre3## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 40 0 0
## versicolor 0 39 0
## virginica 0 1 40
##
## Overall Statistics
##
## Accuracy : 0.9917
## 95% CI : (0.9544, 0.9998)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9875
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9750 1.0000
## Specificity 1.0000 1.0000 0.9875
## Pos Pred Value 1.0000 1.0000 0.9756
## Neg Pred Value 1.0000 0.9877 1.0000
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3250 0.3333
## Detection Prevalence 0.3333 0.3250 0.3417
## Balanced Accuracy 1.0000 0.9875 0.9938
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 10 0 0
## versicolor 0 10 1
## virginica 0 0 9
##
## Overall Statistics
##
## Accuracy : 0.9667
## 95% CI : (0.8278, 0.9992)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 2.963e-13
##
## Kappa : 0.95
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 1.0000 0.9000
## Specificity 1.0000 0.9500 1.0000
## Pos Pred Value 1.0000 0.9091 1.0000
## Neg Pred Value 1.0000 1.0000 0.9524
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3333 0.3000
## Detection Prevalence 0.3333 0.3667 0.3000
## Balanced Accuracy 1.0000 0.9750 0.9500
modelo4 <- train(Species ~ ., data = entrenamiento,
method = "rpart", # Cambiar
preProcess=c("scale","center"),
trControl = trainControl(method="cv", number = 10),
tuneLength = 10 #Cambiar hiperparametros
)
resultado_Entrenamiento4 <- predict(modelo4,entrenamiento)
resultado_prueba4 <- predict(modelo4, prueba)
# Matriz de Confusión del Resultado del Entrenamiento
mcre4 <- confusionMatrix(resultado_Entrenamiento4, entrenamiento$Species)
mcre4## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 40 0 0
## versicolor 0 39 3
## virginica 0 1 37
##
## Overall Statistics
##
## Accuracy : 0.9667
## 95% CI : (0.9169, 0.9908)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.95
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9750 0.9250
## Specificity 1.0000 0.9625 0.9875
## Pos Pred Value 1.0000 0.9286 0.9737
## Neg Pred Value 1.0000 0.9872 0.9634
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3250 0.3083
## Detection Prevalence 0.3333 0.3500 0.3167
## Balanced Accuracy 1.0000 0.9688 0.9563
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 10 0 0
## versicolor 0 10 2
## virginica 0 0 8
##
## Overall Statistics
##
## Accuracy : 0.9333
## 95% CI : (0.7793, 0.9918)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 8.747e-12
##
## Kappa : 0.9
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 1.0000 0.8000
## Specificity 1.0000 0.9000 1.0000
## Pos Pred Value 1.0000 0.8333 1.0000
## Neg Pred Value 1.0000 1.0000 0.9091
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3333 0.2667
## Detection Prevalence 0.3333 0.4000 0.2667
## Balanced Accuracy 1.0000 0.9500 0.9000
modelo5 <- train(Species ~ ., data = entrenamiento,
method = "nnet", # Cambiar
preProcess=c("scale","center"),
trControl = trainControl(method="cv", number = 10),
trace = FALSE #Cambiar hiperparametros
)
resultado_Entrenamiento5 <- predict(modelo5,entrenamiento)
resultado_prueba5 <- predict(modelo5, prueba)
# Matriz de Confusión del Resultado del Entrenamiento
mcre5 <- confusionMatrix(resultado_Entrenamiento5, entrenamiento$Species)
mcre5## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 40 0 0
## versicolor 0 36 0
## virginica 0 4 40
##
## Overall Statistics
##
## Accuracy : 0.9667
## 95% CI : (0.9169, 0.9908)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.95
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9000 1.0000
## Specificity 1.0000 1.0000 0.9500
## Pos Pred Value 1.0000 1.0000 0.9091
## Neg Pred Value 1.0000 0.9524 1.0000
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3000 0.3333
## Detection Prevalence 0.3333 0.3000 0.3667
## Balanced Accuracy 1.0000 0.9500 0.9750
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 10 0 0
## versicolor 0 9 0
## virginica 0 1 10
##
## Overall Statistics
##
## Accuracy : 0.9667
## 95% CI : (0.8278, 0.9992)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 2.963e-13
##
## Kappa : 0.95
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9000 1.0000
## Specificity 1.0000 1.0000 0.9500
## Pos Pred Value 1.0000 1.0000 0.9091
## Neg Pred Value 1.0000 0.9524 1.0000
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3000 0.3333
## Detection Prevalence 0.3333 0.3000 0.3667
## Balanced Accuracy 1.0000 0.9500 0.9750
modelo6 <- train(Species ~ ., data = entrenamiento,
method = "rf", # Cambiar
preProcess=c("scale","center"),
trControl = trainControl(method="cv", number = 10),
tuneGrid = expand.grid(mtry = c(2,4,6)) #Cambiar hiperparametros
)## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
resultado_Entrenamiento6 <- predict(modelo6,entrenamiento)
resultado_prueba6 <- predict(modelo5, prueba)
# Matriz de Confusión del Resultado del Entrenamiento
mcre6 <- confusionMatrix(resultado_Entrenamiento6, entrenamiento$Species)
mcre6## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 40 0 0
## versicolor 0 40 0
## virginica 0 0 40
##
## Overall Statistics
##
## Accuracy : 1
## 95% CI : (0.9697, 1)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 1.0000 1.0000
## Specificity 1.0000 1.0000 1.0000
## Pos Pred Value 1.0000 1.0000 1.0000
## Neg Pred Value 1.0000 1.0000 1.0000
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3333 0.3333
## Detection Prevalence 0.3333 0.3333 0.3333
## Balanced Accuracy 1.0000 1.0000 1.0000
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 10 0 0
## versicolor 0 9 0
## virginica 0 1 10
##
## Overall Statistics
##
## Accuracy : 0.9667
## 95% CI : (0.8278, 0.9992)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 2.963e-13
##
## Kappa : 0.95
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9000 1.0000
## Specificity 1.0000 1.0000 0.9500
## Pos Pred Value 1.0000 1.0000 0.9091
## Neg Pred Value 1.0000 0.9524 1.0000
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3000 0.3333
## Detection Prevalence 0.3333 0.3000 0.3667
## Balanced Accuracy 1.0000 0.9500 0.9750
resultados <- data.frame(
"SVM Lineal" = c(mcre1$overall["Accuracy"], mcrp1$overall["Accuracy"]),
"SVM Radial" = c(mcre2$overall["Accuracy"], mcrp2$overall["Accuracy"]),
"SVM Polinómico" = c(mcre3$overall["Accuracy"], mcrp3$overall["Accuracy"]),
"Árbol de Decisión" = c(mcre4$overall["Accuracy"], mcrp4$overall["Accuracy"]),
"Redes Neuronales" = c(mcre5$overall["Accuracy"], mcrp5$overall["Accuracy"]),
"Bosques Aleatorios" = c(mcre6$overall["Accuracy"], mcrp6$overall["Accuracy"])
)
rownames(resultados) <- c("Precisión de Entrenamiento", "Precisión de Prueba")
resultados## SVM.Lineal SVM.Radial SVM.Polinómico
## Precisión de Entrenamiento 0.9916667 0.9916667 0.9916667
## Precisión de Prueba 0.9666667 0.9333333 0.9666667
## Árbol.de.Decisión Redes.Neuronales
## Precisión de Entrenamiento 0.9666667 0.9666667
## Precisión de Prueba 0.9333333 0.9666667
## Bosques.Aleatorios
## Precisión de Entrenamiento 1.0000000
## Precisión de Prueba 0.9666667