El paquete caret(Classification And REgression Training) es una herramienta poderosa para la implementacion de modelos de Machine Learning.
#install.packages("caret") #Algoritmos de aprendizajes automatico
library(caret)
#install.packages("datasets") #Para usar la base de datos de "IRIS"
library(datasets)
#install.packages("ggplot2") #Graficas con mejor diseño
library(ggplot2)
#install.packages("lattice") # Crear graficos
library(lattice)
#install.packages("DataExplorer") #Analisis Descriptivo
library(DataExplorer)
#install.packages("kernlab") #Metodo de Aprendizaje Automatico
library(kernlab)** NOTA: La variable que queremos predecir debe tener formato de FACTOR. **
Los metodos mas utilizados para modelar aprendizaje automatico son:
La validacion cruzada (Cross validation, CV) es una tecnica para evaluar el rendimiento de un modelo, dividiendo los datos en mutliples subconjuntos, permitiendo medir su capacidad de generalizacion y evitar sobreajuste (overfitting).
La mmatriz de confusion(Confusion matrix) permite analizar que tan bien funciona un modelo y que tipos de errores comete. Lo que haces es comparar las predicciones del modelo con los valres reales de la variable objetivo
Si la precision es muy alta en entrenamiento (95-100%), pero baja en prueba (60-70%), es una señal de sobreajuste
modelo1 <- train(Species ~ ., data=entrenamiento,
method = "svmLinear", # Cambiar
preProcess=c("scale", "center"),
trControl = trainControl(method="cv", number=10),
tuneGrid = data.frame(C=1) # Cambiar hiperparámetros
)
resultado_entrenamiento1 <- predict(modelo1, entrenamiento)
resultado_prueba1 <- predict(modelo1, prueba)
#Matriz de Confusion del Resultado del Entrenamiento
mcre1 <- confusionMatrix(resultado_entrenamiento1, entrenamiento$Species)
mcre1## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 40 0 0
## versicolor 0 39 0
## virginica 0 1 40
##
## Overall Statistics
##
## Accuracy : 0.9917
## 95% CI : (0.9544, 0.9998)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9875
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9750 1.0000
## Specificity 1.0000 1.0000 0.9875
## Pos Pred Value 1.0000 1.0000 0.9756
## Neg Pred Value 1.0000 0.9877 1.0000
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3250 0.3333
## Detection Prevalence 0.3333 0.3250 0.3417
## Balanced Accuracy 1.0000 0.9875 0.9938
#Matriz de Confusion del Resultado de la Prueba
mcrp1 <- confusionMatrix(resultado_prueba1, prueba$Species)
mcrp1## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 10 0 0
## versicolor 0 10 1
## virginica 0 0 9
##
## Overall Statistics
##
## Accuracy : 0.9667
## 95% CI : (0.8278, 0.9992)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 2.963e-13
##
## Kappa : 0.95
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 1.0000 0.9000
## Specificity 1.0000 0.9500 1.0000
## Pos Pred Value 1.0000 0.9091 1.0000
## Neg Pred Value 1.0000 1.0000 0.9524
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3333 0.3000
## Detection Prevalence 0.3333 0.3667 0.3000
## Balanced Accuracy 1.0000 0.9750 0.9500
modelo2 <- train(Species ~ ., data=entrenamiento,
method = "svmRadial", # Cambiar
preProcess=c("scale", "center"),
trControl = trainControl(method="cv", number=10),
tuneGrid = data.frame(sigma=1, C=1) # Cambiar hiperparámetros
)
resultado_entrenamiento2 <- predict(modelo2, entrenamiento)
resultado_prueba2 <- predict(modelo2, prueba)
#Matriz de Confusion del Resultado del Entrenamiento
mcre2 <- confusionMatrix(resultado_entrenamiento2, entrenamiento$Species)
mcre2## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 40 0 0
## versicolor 0 39 0
## virginica 0 1 40
##
## Overall Statistics
##
## Accuracy : 0.9917
## 95% CI : (0.9544, 0.9998)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9875
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9750 1.0000
## Specificity 1.0000 1.0000 0.9875
## Pos Pred Value 1.0000 1.0000 0.9756
## Neg Pred Value 1.0000 0.9877 1.0000
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3250 0.3333
## Detection Prevalence 0.3333 0.3250 0.3417
## Balanced Accuracy 1.0000 0.9875 0.9938
#Matriz de Confusion del Resultado de la Prueba
mcrp2 <- confusionMatrix(resultado_prueba2, prueba$Species)
mcrp2## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 10 0 0
## versicolor 0 10 2
## virginica 0 0 8
##
## Overall Statistics
##
## Accuracy : 0.9333
## 95% CI : (0.7793, 0.9918)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 8.747e-12
##
## Kappa : 0.9
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 1.0000 0.8000
## Specificity 1.0000 0.9000 1.0000
## Pos Pred Value 1.0000 0.8333 1.0000
## Neg Pred Value 1.0000 1.0000 0.9091
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3333 0.2667
## Detection Prevalence 0.3333 0.4000 0.2667
## Balanced Accuracy 1.0000 0.9500 0.9000
modelo3 <- train(Species ~ ., data=entrenamiento,
method = "svmPoly", # Cambiar
preProcess=c("scale", "center"),
trControl = trainControl(method="cv", number=10),
tuneGrid = data.frame(degree=1, scale=1, C=1) # Cambiar hiperparámetros
)
resultado_entrenamiento3 <- predict(modelo3, entrenamiento)
resultado_prueba3 <- predict(modelo3, prueba)
#Matriz de Confusion del Resultado del Entrenamiento
mcre3 <- confusionMatrix(resultado_entrenamiento3, entrenamiento$Species)
mcre3## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 40 0 0
## versicolor 0 39 0
## virginica 0 1 40
##
## Overall Statistics
##
## Accuracy : 0.9917
## 95% CI : (0.9544, 0.9998)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9875
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9750 1.0000
## Specificity 1.0000 1.0000 0.9875
## Pos Pred Value 1.0000 1.0000 0.9756
## Neg Pred Value 1.0000 0.9877 1.0000
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3250 0.3333
## Detection Prevalence 0.3333 0.3250 0.3417
## Balanced Accuracy 1.0000 0.9875 0.9938
#Matriz de Confusion del Resultado de la Prueba
mcrp3 <- confusionMatrix(resultado_prueba3, prueba$Species)
mcrp3## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 10 0 0
## versicolor 0 10 1
## virginica 0 0 9
##
## Overall Statistics
##
## Accuracy : 0.9667
## 95% CI : (0.8278, 0.9992)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 2.963e-13
##
## Kappa : 0.95
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 1.0000 0.9000
## Specificity 1.0000 0.9500 1.0000
## Pos Pred Value 1.0000 0.9091 1.0000
## Neg Pred Value 1.0000 1.0000 0.9524
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3333 0.3000
## Detection Prevalence 0.3333 0.3667 0.3000
## Balanced Accuracy 1.0000 0.9750 0.9500
modelo4 <- train(Species ~ ., data=entrenamiento,
method = "rpart", # Cambiar
preProcess=c("scale", "center"),
trControl = trainControl(method="cv", number=10),
tuneLength = 10 # Cambiar hiperparámetros
)
resultado_entrenamiento4 <- predict(modelo4, entrenamiento)
resultado_prueba4 <- predict(modelo4, prueba)
#Matriz de Confusion del Resultado del Entrenamiento
mcre4 <- confusionMatrix(resultado_entrenamiento4, entrenamiento$Species)
mcre4## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 40 0 0
## versicolor 0 39 3
## virginica 0 1 37
##
## Overall Statistics
##
## Accuracy : 0.9667
## 95% CI : (0.9169, 0.9908)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.95
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9750 0.9250
## Specificity 1.0000 0.9625 0.9875
## Pos Pred Value 1.0000 0.9286 0.9737
## Neg Pred Value 1.0000 0.9872 0.9634
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3250 0.3083
## Detection Prevalence 0.3333 0.3500 0.3167
## Balanced Accuracy 1.0000 0.9688 0.9563
#Matriz de Confusion del Resultado de la Prueba
mcrp4 <- confusionMatrix(resultado_prueba4, prueba$Species)
mcrp4## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 10 0 0
## versicolor 0 10 2
## virginica 0 0 8
##
## Overall Statistics
##
## Accuracy : 0.9333
## 95% CI : (0.7793, 0.9918)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 8.747e-12
##
## Kappa : 0.9
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 1.0000 0.8000
## Specificity 1.0000 0.9000 1.0000
## Pos Pred Value 1.0000 0.8333 1.0000
## Neg Pred Value 1.0000 1.0000 0.9091
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3333 0.2667
## Detection Prevalence 0.3333 0.4000 0.2667
## Balanced Accuracy 1.0000 0.9500 0.9000
modelo5 <- train(Species ~ ., data=entrenamiento,
method = "nnet", # Cambiar
preProcess=c("scale", "center"),
trControl = trainControl(method="cv", number=10),
# Cambiar hiperparámetros
trace=FALSE # Para ocultar resultados
)
resultado_entrenamiento5 <- predict(modelo5, entrenamiento)
resultado_prueba5 <- predict(modelo5, prueba)
#Matriz de Confusion del Resultado del Entrenamiento
mcre5 <- confusionMatrix(resultado_entrenamiento5, entrenamiento$Species)
mcre5## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 40 0 0
## versicolor 0 36 0
## virginica 0 4 40
##
## Overall Statistics
##
## Accuracy : 0.9667
## 95% CI : (0.9169, 0.9908)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.95
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9000 1.0000
## Specificity 1.0000 1.0000 0.9500
## Pos Pred Value 1.0000 1.0000 0.9091
## Neg Pred Value 1.0000 0.9524 1.0000
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3000 0.3333
## Detection Prevalence 0.3333 0.3000 0.3667
## Balanced Accuracy 1.0000 0.9500 0.9750
#Matriz de Confusion del Resultado de la Prueba
mcrp5 <- confusionMatrix(resultado_prueba5, prueba$Species)
mcrp5## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 10 0 0
## versicolor 0 9 0
## virginica 0 1 10
##
## Overall Statistics
##
## Accuracy : 0.9667
## 95% CI : (0.8278, 0.9992)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 2.963e-13
##
## Kappa : 0.95
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9000 1.0000
## Specificity 1.0000 1.0000 0.9500
## Pos Pred Value 1.0000 1.0000 0.9091
## Neg Pred Value 1.0000 0.9524 1.0000
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3000 0.3333
## Detection Prevalence 0.3333 0.3000 0.3667
## Balanced Accuracy 1.0000 0.9500 0.9750
modelo6 <- train(Species ~ ., data=entrenamiento,
method = "rf", # Cambiar
preProcess=c("scale", "center"),
trControl = trainControl(method="cv", number=10),
tuneGrid = expand.grid(mtry= c(2,4,6))# Cambiar hiperparámetros
)## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
resultado_entrenamiento6 <- predict(modelo6, entrenamiento)
resultado_prueba6 <- predict(modelo6, prueba)
# Matriz de Confusión del Entrenamiento
mcre6 <- confusionMatrix(resultado_entrenamiento6,
entrenamiento$Species)
mcre6## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 40 0 0
## versicolor 0 40 0
## virginica 0 0 40
##
## Overall Statistics
##
## Accuracy : 1
## 95% CI : (0.9697, 1)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 1.0000 1.0000
## Specificity 1.0000 1.0000 1.0000
## Pos Pred Value 1.0000 1.0000 1.0000
## Neg Pred Value 1.0000 1.0000 1.0000
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3333 0.3333
## Detection Prevalence 0.3333 0.3333 0.3333
## Balanced Accuracy 1.0000 1.0000 1.0000
# Matriz de Confusión del Resultado de la Prueba
mcrp6 <- confusionMatrix(resultado_prueba6, prueba$Species)
mcrp6## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 10 0 0
## versicolor 0 10 2
## virginica 0 0 8
##
## Overall Statistics
##
## Accuracy : 0.9333
## 95% CI : (0.7793, 0.9918)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 8.747e-12
##
## Kappa : 0.9
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 1.0000 0.8000
## Specificity 1.0000 0.9000 1.0000
## Pos Pred Value 1.0000 0.8333 1.0000
## Neg Pred Value 1.0000 1.0000 0.9091
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3333 0.2667
## Detection Prevalence 0.3333 0.4000 0.2667
## Balanced Accuracy 1.0000 0.9500 0.9000
resultados <- data.frame(
"SVM Lineal" = c(mcre1$overall["Accuracy"], mcrp1$overall["Accuracy"]),
"SVM Radial" = c(mcre2$overall["Accuracy"], mcrp2$overall["Accuracy"]),
"SVM Polinomico" = c(mcre3$overall["Accuracy"], mcrp3$overall["Accuracy"]),
"Arbol de Decision" = c(mcre4$overall["Accuracy"], mcrp4$overall["Accuracy"]),
"Redes Neuronales" = c(mcre5$overall["Accuracy"], mcrp5$overall["Accuracy"]),
"Bosques Aleatorios" = c(mcre6$overall["Accuracy"], mcrp6$overall["Accuracy"])
)
rownames(resultados) <- c("Precision de Entrenamiento", "Precision de Prueba")
resultados## SVM.Lineal SVM.Radial SVM.Polinomico
## Precision de Entrenamiento 0.9916667 0.9916667 0.9916667
## Precision de Prueba 0.9666667 0.9333333 0.9666667
## Arbol.de.Decision Redes.Neuronales
## Precision de Entrenamiento 0.9666667 0.9666667
## Precision de Prueba 0.9333333 0.9666667
## Bosques.Aleatorios
## Precision de Entrenamiento 1.0000000
## Precision de Prueba 0.9333333