El paquete caret (Classification And Regression Training) es una herramienta poderosa para la implementación de modelos de Machine Learning.
#install.packages("caret") #Algoritmos de aprendizaje automático
library(caret)
#install.packages("datasets") #Para usar la base de datos "Iris"
library(datasets)
library(ggplot2) #Gráficascon mejor diseñ
#install.packages("lattice") #Crear gráficos
library(lattice)
#install.packages("DataExplorer") #Análisis descriptivo
library(DataExplorer)
#install.packages("kernlab")
library(kernlab)
#install.packages("rpart")
library(rpart)
df <- data.frame(iris)
create_report(df)
##
##
## processing file: report.rmd
## | | | 0% | |. | 2% | |.. | 5% [global_options] | |... | 7% | |.... | 10% [introduce] | |.... | 12% | |..... | 14% [plot_intro]
## | |...... | 17% | |....... | 19% [data_structure] | |........ | 21% | |......... | 24% [missing_profile]
## | |.......... | 26% | |........... | 29% [univariate_distribution_header] | |........... | 31% | |............ | 33% [plot_histogram]
## | |............. | 36% | |.............. | 38% [plot_density] | |............... | 40% | |................ | 43% [plot_frequency_bar]
## | |................. | 45% | |.................. | 48% [plot_response_bar] | |.................. | 50% | |................... | 52% [plot_with_bar] | |.................... | 55% | |..................... | 57% [plot_normal_qq]
## | |...................... | 60% | |....................... | 62% [plot_response_qq] | |........................ | 64% | |......................... | 67% [plot_by_qq] | |.......................... | 69% | |.......................... | 71% [correlation_analysis]
## | |........................... | 74% | |............................ | 76% [principal_component_analysis]
## | |............................. | 79% | |.............................. | 81% [bivariate_distribution_header] | |............................... | 83% | |................................ | 86% [plot_response_boxplot] | |................................. | 88% | |................................. | 90% [plot_by_boxplot] | |.................................. | 93% | |................................... | 95% [plot_response_scatterplot] | |.................................... | 98% | |.....................................| 100% [plot_by_scatterplot]
## output file: /Users/anapaualvear/Desktop/modulo 2 R/report.knit.md
## /private/var/folders/gf/rh6n8hhj40l17_3n_7zgjzd80000gn/T/AppTranslocation/A1E2447B-2BD3-4050-877B-A0415BBE092C/d/RStudio.app/Contents/Resources/app/quarto/bin/tools/aarch64/pandoc +RTS -K512m -RTS '/Users/anapaualvear/Desktop/modulo 2 R/report.knit.md' --to html4 --from markdown+autolink_bare_uris+tex_math_single_backslash --output pandoc65a56c8ede3f.html --lua-filter /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/library/rmarkdown/rmarkdown/lua/pagebreak.lua --lua-filter /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/library/rmarkdown/rmarkdown/lua/latex-div.lua --embed-resources --standalone --variable bs3=TRUE --section-divs --table-of-contents --toc-depth 6 --template /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/library/rmarkdown/rmd/h/default.html --no-highlight --variable highlightjs=1 --variable theme=yeti --mathjax --variable 'mathjax-url=https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML' --include-in-header /var/folders/gf/rh6n8hhj40l17_3n_7zgjzd80000gn/T//RtmpHh7Hld/rmarkdown-str65a566287e96.html
##
## Output created: report.html
plot_missing(df)
plot_histogram(df)
plot_correlation(df)
NOTA: La variable que queremos predecir debe tener formato de FACTOR.
set.seed(123)
renglones_entrenamiento <- createDataPartition(df$Species, p=0.8, list=FALSE)
entrenamiento <- iris[renglones_entrenamiento, ]
prueba <- iris[-renglones_entrenamiento, ]
Los métodos más utilizados para modelar aprnedizaje automático
son:
* SVM: Support Vector Machine o Máquina de
Vectores de Soporte. hay varios subtipos: Linear (svmLinear), radial
(svmRadial), polinómico (svmPoly), etc.
* Árbol de Decisión: rpart
* Redes Neuronales: nnet
* Random Forest o Bosques Aleatorios: rf
La validación cruzada (Cross Validation, CV) es una técnica para evaluar el rendimiento de un modelo, dividiendo los datos en múltiples subconjuntos, prermitiendo medir su capacidad de generlaización y evitar sobreajuste (Overfitting).
La Matriz de Confusión¨ permite analizar qué tan bien funciona un modelo y qué tipos de errores comete. Lo que hace es comparar las predicciones del modelo con los valores reales de la variable objetivo.
Si la precisión es muy alta en entrenamiento (95-100%), pero baja en prueba (60-70%), es una señal de sobreajuste (overfitting).
modelo1 <- train(Species ~ ., data = entrenamiento,
method = "svmLinear", #Cambiar
preProcess=c("scale","center"),
trControl = trainControl(method="cv", number=10),
tuneGrid = data.frame(C=1) #Cambiar hiperparámetro
)
resultado_entrenamiento1 <- predict(modelo1,entrenamiento)
resultados_prueba1 <- predict(modelo1,prueba)
# Matriz de Confusión del Resultado del Entrenamiento
mcre1 <- confusionMatrix(resultado_entrenamiento1, entrenamiento$Species)
mcre1
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 40 0 0
## versicolor 0 39 0
## virginica 0 1 40
##
## Overall Statistics
##
## Accuracy : 0.9917
## 95% CI : (0.9544, 0.9998)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9875
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9750 1.0000
## Specificity 1.0000 1.0000 0.9875
## Pos Pred Value 1.0000 1.0000 0.9756
## Neg Pred Value 1.0000 0.9877 1.0000
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3250 0.3333
## Detection Prevalence 0.3333 0.3250 0.3417
## Balanced Accuracy 1.0000 0.9875 0.9938
# Matriz de Confusión del Resultado de la Prueba
mcrp1 <- confusionMatrix(resultados_prueba1, prueba$Species)
mcrp1
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 10 0 0
## versicolor 0 10 1
## virginica 0 0 9
##
## Overall Statistics
##
## Accuracy : 0.9667
## 95% CI : (0.8278, 0.9992)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 2.963e-13
##
## Kappa : 0.95
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 1.0000 0.9000
## Specificity 1.0000 0.9500 1.0000
## Pos Pred Value 1.0000 0.9091 1.0000
## Neg Pred Value 1.0000 1.0000 0.9524
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3333 0.3000
## Detection Prevalence 0.3333 0.3667 0.3000
## Balanced Accuracy 1.0000 0.9750 0.9500
modelo2 <- train(Species ~ ., data = entrenamiento,
method = "svmRadial", #Cambiar
preProcess=c("scale","center"),
trControl = trainControl(method="cv", number=10),
tuneGrid = data.frame(sigma=1, C=1) #Cambiar hiperparámetro
)
resultado_entrenamiento2 <- predict(modelo2,entrenamiento)
resultados_prueba2 <- predict(modelo2,prueba)
# Matriz de Confusión del Resultado del Entrenamiento
mcre2 <- confusionMatrix(resultado_entrenamiento2, entrenamiento$Species)
mcre2
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 40 0 0
## versicolor 0 39 0
## virginica 0 1 40
##
## Overall Statistics
##
## Accuracy : 0.9917
## 95% CI : (0.9544, 0.9998)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9875
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9750 1.0000
## Specificity 1.0000 1.0000 0.9875
## Pos Pred Value 1.0000 1.0000 0.9756
## Neg Pred Value 1.0000 0.9877 1.0000
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3250 0.3333
## Detection Prevalence 0.3333 0.3250 0.3417
## Balanced Accuracy 1.0000 0.9875 0.9938
# Matriz de Confusión del Resultado de la Prueba
mcrp2 <- confusionMatrix(resultados_prueba2, prueba$Species)
mcrp2
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 10 0 0
## versicolor 0 10 2
## virginica 0 0 8
##
## Overall Statistics
##
## Accuracy : 0.9333
## 95% CI : (0.7793, 0.9918)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 8.747e-12
##
## Kappa : 0.9
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 1.0000 0.8000
## Specificity 1.0000 0.9000 1.0000
## Pos Pred Value 1.0000 0.8333 1.0000
## Neg Pred Value 1.0000 1.0000 0.9091
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3333 0.2667
## Detection Prevalence 0.3333 0.4000 0.2667
## Balanced Accuracy 1.0000 0.9500 0.9000
modelo3 <- train(Species ~ ., data = entrenamiento,
method = "svmPoly", #Cambiar
preProcess=c("scale","center"),
trControl = trainControl(method="cv", number=10),
tuneGrid = data.frame(degree=1, scale=1, C=1) #Cambiar hiperparámetro
)
resultado_entrenamiento3 <- predict(modelo3,entrenamiento)
resultados_prueba3 <- predict(modelo3,prueba)
# Matriz de Confusión del Resultado del Entrenamiento
mcre3 <- confusionMatrix(resultado_entrenamiento3, entrenamiento$Species)
mcre3
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 40 0 0
## versicolor 0 39 0
## virginica 0 1 40
##
## Overall Statistics
##
## Accuracy : 0.9917
## 95% CI : (0.9544, 0.9998)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9875
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9750 1.0000
## Specificity 1.0000 1.0000 0.9875
## Pos Pred Value 1.0000 1.0000 0.9756
## Neg Pred Value 1.0000 0.9877 1.0000
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3250 0.3333
## Detection Prevalence 0.3333 0.3250 0.3417
## Balanced Accuracy 1.0000 0.9875 0.9938
# Matriz de Confusión del Resultado de la Prueba
mcrp3 <- confusionMatrix(resultados_prueba3, prueba$Species)
mcrp3
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 10 0 0
## versicolor 0 10 1
## virginica 0 0 9
##
## Overall Statistics
##
## Accuracy : 0.9667
## 95% CI : (0.8278, 0.9992)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 2.963e-13
##
## Kappa : 0.95
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 1.0000 0.9000
## Specificity 1.0000 0.9500 1.0000
## Pos Pred Value 1.0000 0.9091 1.0000
## Neg Pred Value 1.0000 1.0000 0.9524
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3333 0.3000
## Detection Prevalence 0.3333 0.3667 0.3000
## Balanced Accuracy 1.0000 0.9750 0.9500
modelo4 <- train(Species ~ ., data = entrenamiento,
method = "rpart", #Cambiar
preProcess=c("scale","center"),
trControl = trainControl(method="cv", number=10),
tuneLength = 10 #Cambiar hiperparámetros
)
resultado_entrenamiento4 <- predict(modelo4,entrenamiento)
resultados_prueba4 <- predict(modelo4,prueba)
# Matriz de Confusión del Resultado del Entrenamiento
mcre4 <- confusionMatrix(resultado_entrenamiento4, entrenamiento$Species)
mcre4
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 40 0 0
## versicolor 0 39 3
## virginica 0 1 37
##
## Overall Statistics
##
## Accuracy : 0.9667
## 95% CI : (0.9169, 0.9908)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.95
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9750 0.9250
## Specificity 1.0000 0.9625 0.9875
## Pos Pred Value 1.0000 0.9286 0.9737
## Neg Pred Value 1.0000 0.9872 0.9634
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3250 0.3083
## Detection Prevalence 0.3333 0.3500 0.3167
## Balanced Accuracy 1.0000 0.9688 0.9563
# Matriz de Confusión del Resultado de la Prueba
mcrp4 <- confusionMatrix(resultados_prueba4, prueba$Species)
mcrp4
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 10 0 0
## versicolor 0 10 2
## virginica 0 0 8
##
## Overall Statistics
##
## Accuracy : 0.9333
## 95% CI : (0.7793, 0.9918)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 8.747e-12
##
## Kappa : 0.9
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 1.0000 0.8000
## Specificity 1.0000 0.9000 1.0000
## Pos Pred Value 1.0000 0.8333 1.0000
## Neg Pred Value 1.0000 1.0000 0.9091
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3333 0.2667
## Detection Prevalence 0.3333 0.4000 0.2667
## Balanced Accuracy 1.0000 0.9500 0.9000
modelo5 <- train(Species ~ ., data = entrenamiento,
method = "nnet", #Cambiar
preProcess=c("scale","center"),
trControl = trainControl(method="cv", number=10),
trace=FALSE
)
resultado_entrenamiento5 <- predict(modelo5,entrenamiento)
resultados_prueba5 <- predict(modelo5,prueba)
# Matriz de Confusión del Resultado del Entrenamiento
mcre5 <- confusionMatrix(resultado_entrenamiento5, entrenamiento$Species)
mcre5
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 40 0 0
## versicolor 0 36 0
## virginica 0 4 40
##
## Overall Statistics
##
## Accuracy : 0.9667
## 95% CI : (0.9169, 0.9908)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.95
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9000 1.0000
## Specificity 1.0000 1.0000 0.9500
## Pos Pred Value 1.0000 1.0000 0.9091
## Neg Pred Value 1.0000 0.9524 1.0000
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3000 0.3333
## Detection Prevalence 0.3333 0.3000 0.3667
## Balanced Accuracy 1.0000 0.9500 0.9750
# Matriz de Confusión del Resultado de la Prueba
mcrp5 <- confusionMatrix(resultados_prueba5, prueba$Species)
mcrp5
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 10 0 0
## versicolor 0 9 0
## virginica 0 1 10
##
## Overall Statistics
##
## Accuracy : 0.9667
## 95% CI : (0.8278, 0.9992)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 2.963e-13
##
## Kappa : 0.95
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9000 1.0000
## Specificity 1.0000 1.0000 0.9500
## Pos Pred Value 1.0000 1.0000 0.9091
## Neg Pred Value 1.0000 0.9524 1.0000
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3000 0.3333
## Detection Prevalence 0.3333 0.3000 0.3667
## Balanced Accuracy 1.0000 0.9500 0.9750
modelo6 <- train(Species ~ ., data = entrenamiento,
method = "rf", #Cambiar
preProcess=c("scale","center"),
trControl = trainControl(method="cv", number=10),
tuneGrid = expand.grid(mtry = c(2,4,6))
)
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
resultado_entrenamiento6 <- predict(modelo6,entrenamiento)
resultados_prueba6 <- predict(modelo6,prueba)
# Matriz de Confusión del Resultado del Entrenamiento
mcre6 <- confusionMatrix(resultado_entrenamiento6, entrenamiento$Species)
mcre6
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 40 0 0
## versicolor 0 40 0
## virginica 0 0 40
##
## Overall Statistics
##
## Accuracy : 1
## 95% CI : (0.9697, 1)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 1.0000 1.0000
## Specificity 1.0000 1.0000 1.0000
## Pos Pred Value 1.0000 1.0000 1.0000
## Neg Pred Value 1.0000 1.0000 1.0000
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3333 0.3333
## Detection Prevalence 0.3333 0.3333 0.3333
## Balanced Accuracy 1.0000 1.0000 1.0000
# Matriz de Confusión del Resultado de la Prueba
mcrp6 <- confusionMatrix(resultados_prueba6, prueba$Species)
mcrp6
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 10 0 0
## versicolor 0 10 2
## virginica 0 0 8
##
## Overall Statistics
##
## Accuracy : 0.9333
## 95% CI : (0.7793, 0.9918)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 8.747e-12
##
## Kappa : 0.9
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 1.0000 0.8000
## Specificity 1.0000 0.9000 1.0000
## Pos Pred Value 1.0000 0.8333 1.0000
## Neg Pred Value 1.0000 1.0000 0.9091
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3333 0.2667
## Detection Prevalence 0.3333 0.4000 0.2667
## Balanced Accuracy 1.0000 0.9500 0.9000
resultados <- data.frame(
"SVM Lineal" = c(mcre1$overall["Accuracy"], mcrp1$overall["Accuracy"]),
"SVM Radial" = c(mcre2$overall["Accuracy"], mcrp2$overall["Accuracy"]),
"SVM Polinómico" = c(mcre3$overall["Accuracy"], mcrp3$overall["Accuracy"]),
"Árbol de Decisión" = c(mcre4$overall["Accuracy"], mcrp4$overall["Accuracy"]),
"Redes Neuronales" = c(mcre5$overall["Accuracy"], mcrp5$overall["Accuracy"]),
"Bosques Aleatorios" = c(mcre6$overall["Accuracy"], mcrp6$overall["Accuracy"])
)
rownames(resultados) <- c("Precisión de Entrenamiento", "Precisión de Prueba")
resultados
## SVM.Lineal SVM.Radial SVM.Polinómico
## Precisión de Entrenamiento 0.9916667 0.9916667 0.9916667
## Precisión de Prueba 0.9666667 0.9333333 0.9666667
## Árbol.de.Decisión Redes.Neuronales
## Precisión de Entrenamiento 0.9666667 0.9666667
## Precisión de Prueba 0.9333333 0.9666667
## Bosques.Aleatorios
## Precisión de Entrenamiento 1.0000000
## Precisión de Prueba 0.9333333