El paquete caret(Classification And Regression Training) es una herramienta poderosa para la implementación de modelos de Machine Learning.
#install.packages(“caret”) #Algoritmmos de aprendizaje automatico
library(caret)
#install.packages("datasets") #Para usar la base de datos "iris"
library(datasets)
#install.packages("ggplot2") #Graficas con mejor diseño
library(ggplot2)
#install.packages("lattice") #Crear graficos
library(lattice)
#install.packages("DataExplorer") #Análisis descriptivo
library(DataExplorer)
library(kernlab)
library(randomForest)
df <- data.frame(iris)
create_report(df)
##
##
## processing file: report.rmd
## | | | 0% | |. | 2% | |.. | 5% [global_options] | |... | 7% | |.... | 10% [introduce] | |.... | 12% | |..... | 14% [plot_intro]
## | |...... | 17% | |....... | 19% [data_structure] | |........ | 21% | |......... | 24% [missing_profile]
## | |.......... | 26% | |........... | 29% [univariate_distribution_header] | |........... | 31% | |............ | 33% [plot_histogram]
## | |............. | 36% | |.............. | 38% [plot_density] | |............... | 40% | |................ | 43% [plot_frequency_bar]
## | |................. | 45% | |.................. | 48% [plot_response_bar] | |.................. | 50% | |................... | 52% [plot_with_bar] | |.................... | 55% | |..................... | 57% [plot_normal_qq]
## | |...................... | 60% | |....................... | 62% [plot_response_qq] | |........................ | 64% | |......................... | 67% [plot_by_qq] | |.......................... | 69% | |.......................... | 71% [correlation_analysis]
## | |........................... | 74% | |............................ | 76% [principal_component_analysis]
## | |............................. | 79% | |.............................. | 81% [bivariate_distribution_header] | |............................... | 83% | |................................ | 86% [plot_response_boxplot] | |................................. | 88% | |................................. | 90% [plot_by_boxplot] | |.................................. | 93% | |................................... | 95% [plot_response_scatterplot] | |.................................... | 98% | |.....................................| 100% [plot_by_scatterplot]
## output file: C:/Users/OscBa/OneDrive/Documentos/report.knit.md
## "C:/Program Files/RStudio/resources/app/bin/quarto/bin/tools/pandoc" +RTS -K512m -RTS "C:\Users\OscBa\OneDrive\Documentos\report.knit.md" --to html4 --from markdown+autolink_bare_uris+tex_math_single_backslash --output pandoc2ee4348e5526.html --lua-filter "C:\Users\OscBa\AppData\Local\R\win-library\4.4\rmarkdown\rmarkdown\lua\pagebreak.lua" --lua-filter "C:\Users\OscBa\AppData\Local\R\win-library\4.4\rmarkdown\rmarkdown\lua\latex-div.lua" --embed-resources --standalone --variable bs3=TRUE --section-divs --table-of-contents --toc-depth 6 --template "C:\Users\OscBa\AppData\Local\R\win-library\4.4\rmarkdown\rmd\h\default.html" --no-highlight --variable highlightjs=1 --variable theme=yeti --mathjax --variable "mathjax-url=https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML" --include-in-header "C:\Users\OscBa\AppData\Local\Temp\RtmpWw4OW0\rmarkdown-str2ee430dc4300.html"
##
## Output created: report.html
plot_missing(df)
plot_histogram(df)
plot_histogram(df)
NOTA: La variable que queremos predecir debe tener un formato de FACTOR.
set.seed(123)
renglones_entrenamiento <- createDataPartition(df$Species, p=0.8, list= FALSE)
entrenamiento <-iris[renglones_entrenamiento, ]
prueba <- iris[-renglones_entrenamiento, ]
Los métodos mas utilizados para modelar aprendizaje automatico son: * SVM: Suport Verctor Machine o Máquina de vectores de soporte. Hay varios subtipos: Lineal (svmLinear); Radial (svmRadual), Polinomico (svmPoly), etc. * Arbol de decisión: rpart * Redes Neuronales: nnet * Random Forest: Bosques aleatorios: rf
La validación cruzada: (cross validation, cv) es una tecnica para evaluar el rendimiento de un modelo, dividiendo los datos en multiples subconjuntos, permitiendo medir su capacidad de generalización y evitar sobreajuste (overfitting). * La matriz de confusion: permite analizar que tan bien funciona un modelo y que tipos de errores comete. Lo que hace es comparar las predicciones del modelo con los valores reales de la variable objetivo. Si la precision es muy alta en entrenamiento (<95%), pero baja (60-70%), es una señal de sobreajuste.
modelo1 <- train(Species~ ., data= entrenamiento,
method = "svmLinear", #Cambiar, para hacer un nuevo modelo
preProcess=c("scale","center"),
trControl = trainControl(method = "cv", number = 10),
tuneGrid = data.frame(C=1) #Cambiar hiperparametros
)
resultado_entrenamiento1 <- predict(modelo1, entrenamiento)
resultado_prueba1 <- predict(modelo1, prueba)
#Matriz de confusion del entrenamiento
mcre1 <- confusionMatrix(resultado_entrenamiento1,
entrenamiento$Species)
mcre1
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 40 0 0
## versicolor 0 39 0
## virginica 0 1 40
##
## Overall Statistics
##
## Accuracy : 0.9917
## 95% CI : (0.9544, 0.9998)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9875
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9750 1.0000
## Specificity 1.0000 1.0000 0.9875
## Pos Pred Value 1.0000 1.0000 0.9756
## Neg Pred Value 1.0000 0.9877 1.0000
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3250 0.3333
## Detection Prevalence 0.3333 0.3250 0.3417
## Balanced Accuracy 1.0000 0.9875 0.9938
#Matriz de confusion del resultado de la prueba
mcrp1 <- confusionMatrix(resultado_prueba1,
prueba$Species)
mcrp1
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 10 0 0
## versicolor 0 10 1
## virginica 0 0 9
##
## Overall Statistics
##
## Accuracy : 0.9667
## 95% CI : (0.8278, 0.9992)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 2.963e-13
##
## Kappa : 0.95
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 1.0000 0.9000
## Specificity 1.0000 0.9500 1.0000
## Pos Pred Value 1.0000 0.9091 1.0000
## Neg Pred Value 1.0000 1.0000 0.9524
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3333 0.3000
## Detection Prevalence 0.3333 0.3667 0.3000
## Balanced Accuracy 1.0000 0.9750 0.9500
modelo2 <- train(Species~ ., data= entrenamiento,
method = "svmRadial", #Cambiar, para hacer un nuevo modelo
preProcess=c("scale","center"),
trControl = trainControl(method = "cv", number = 10),
tuneGrid = data.frame(sigma = 1, C=1) #Cambiar hiperparametros
)
resultado_entrenamiento2 <- predict(modelo2, entrenamiento)
resultado_prueba2 <- predict(modelo2, prueba)
#Matriz de confusion del entrenamiento
mcre2 <- confusionMatrix(resultado_entrenamiento2,
entrenamiento$Species)
mcre2
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 40 0 0
## versicolor 0 39 0
## virginica 0 1 40
##
## Overall Statistics
##
## Accuracy : 0.9917
## 95% CI : (0.9544, 0.9998)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9875
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9750 1.0000
## Specificity 1.0000 1.0000 0.9875
## Pos Pred Value 1.0000 1.0000 0.9756
## Neg Pred Value 1.0000 0.9877 1.0000
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3250 0.3333
## Detection Prevalence 0.3333 0.3250 0.3417
## Balanced Accuracy 1.0000 0.9875 0.9938
#Matriz de confusion del resultado de la prueba
mcrp2 <- confusionMatrix(resultado_prueba2,
prueba$Species)
mcrp2
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 10 0 0
## versicolor 0 10 2
## virginica 0 0 8
##
## Overall Statistics
##
## Accuracy : 0.9333
## 95% CI : (0.7793, 0.9918)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 8.747e-12
##
## Kappa : 0.9
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 1.0000 0.8000
## Specificity 1.0000 0.9000 1.0000
## Pos Pred Value 1.0000 0.8333 1.0000
## Neg Pred Value 1.0000 1.0000 0.9091
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3333 0.2667
## Detection Prevalence 0.3333 0.4000 0.2667
## Balanced Accuracy 1.0000 0.9500 0.9000
modelo3 <- train(Species~ ., data= entrenamiento,
method = "svmPoly", #Cambiar, para hacer un nuevo modelo
preProcess=c("scale","center"),
trControl = trainControl(method = "cv", number = 10),
tuneGrid = data.frame(degree = 1, scale = 1, C=1) #Cambiar hiperparametros
)
resultado_entrenamiento3 <- predict(modelo3, entrenamiento)
resultado_prueba3 <- predict(modelo3, prueba)
#Matriz de confusion del entrenamiento
mcre3 <- confusionMatrix(resultado_entrenamiento3,
entrenamiento$Species)
mcre3
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 40 0 0
## versicolor 0 39 0
## virginica 0 1 40
##
## Overall Statistics
##
## Accuracy : 0.9917
## 95% CI : (0.9544, 0.9998)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9875
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9750 1.0000
## Specificity 1.0000 1.0000 0.9875
## Pos Pred Value 1.0000 1.0000 0.9756
## Neg Pred Value 1.0000 0.9877 1.0000
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3250 0.3333
## Detection Prevalence 0.3333 0.3250 0.3417
## Balanced Accuracy 1.0000 0.9875 0.9938
#Matriz de confusion del resultado de la prueba
mcrp3 <- confusionMatrix(resultado_prueba3,
prueba$Species)
mcrp3
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 10 0 0
## versicolor 0 10 1
## virginica 0 0 9
##
## Overall Statistics
##
## Accuracy : 0.9667
## 95% CI : (0.8278, 0.9992)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 2.963e-13
##
## Kappa : 0.95
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 1.0000 0.9000
## Specificity 1.0000 0.9500 1.0000
## Pos Pred Value 1.0000 0.9091 1.0000
## Neg Pred Value 1.0000 1.0000 0.9524
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3333 0.3000
## Detection Prevalence 0.3333 0.3667 0.3000
## Balanced Accuracy 1.0000 0.9750 0.9500
modelo4 <- train(Species~ ., data= entrenamiento,
method = "rpart", #Cambiar, para hacer un nuevo modelo
preProcess=c("scale","center"),
trControl = trainControl(method = "cv", number = 10),
tuneLength = 10
)
resultado_entrenamiento4 <- predict(modelo4, entrenamiento)
resultado_prueba4 <- predict(modelo4, prueba)
#Matriz de confusion del entrenamiento
mcre4 <- confusionMatrix(resultado_entrenamiento4,
entrenamiento$Species)
mcre4
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 40 0 0
## versicolor 0 39 3
## virginica 0 1 37
##
## Overall Statistics
##
## Accuracy : 0.9667
## 95% CI : (0.9169, 0.9908)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.95
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9750 0.9250
## Specificity 1.0000 0.9625 0.9875
## Pos Pred Value 1.0000 0.9286 0.9737
## Neg Pred Value 1.0000 0.9872 0.9634
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3250 0.3083
## Detection Prevalence 0.3333 0.3500 0.3167
## Balanced Accuracy 1.0000 0.9688 0.9563
#Matriz de confusion del resultado de la prueba
mcrp4 <- confusionMatrix(resultado_prueba4,
prueba$Species)
mcrp4
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 10 0 0
## versicolor 0 10 2
## virginica 0 0 8
##
## Overall Statistics
##
## Accuracy : 0.9333
## 95% CI : (0.7793, 0.9918)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 8.747e-12
##
## Kappa : 0.9
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 1.0000 0.8000
## Specificity 1.0000 0.9000 1.0000
## Pos Pred Value 1.0000 0.8333 1.0000
## Neg Pred Value 1.0000 1.0000 0.9091
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3333 0.2667
## Detection Prevalence 0.3333 0.4000 0.2667
## Balanced Accuracy 1.0000 0.9500 0.9000
modelo5 <- train(Species~ ., data= entrenamiento,
method = "nnet", #Cambiar, para hacer un nuevo modelo
preProcess=c("scale","center"),
trControl = trainControl(method = "cv", number = 10),
trace = FALSE
)
resultado_entrenamiento5 <- predict(modelo5, entrenamiento)
resultado_prueba5 <- predict(modelo5, prueba)
#Matriz de confusion del entrenamiento
mcre5 <- confusionMatrix(resultado_entrenamiento5,
entrenamiento$Species)
mcre5
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 40 0 0
## versicolor 0 36 0
## virginica 0 4 40
##
## Overall Statistics
##
## Accuracy : 0.9667
## 95% CI : (0.9169, 0.9908)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.95
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9000 1.0000
## Specificity 1.0000 1.0000 0.9500
## Pos Pred Value 1.0000 1.0000 0.9091
## Neg Pred Value 1.0000 0.9524 1.0000
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3000 0.3333
## Detection Prevalence 0.3333 0.3000 0.3667
## Balanced Accuracy 1.0000 0.9500 0.9750
#Matriz de confusion del resultado de la prueba
mcrp5 <- confusionMatrix(resultado_prueba5,
prueba$Species)
mcrp5
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 10 0 0
## versicolor 0 9 0
## virginica 0 1 10
##
## Overall Statistics
##
## Accuracy : 0.9667
## 95% CI : (0.8278, 0.9992)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 2.963e-13
##
## Kappa : 0.95
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9000 1.0000
## Specificity 1.0000 1.0000 0.9500
## Pos Pred Value 1.0000 1.0000 0.9091
## Neg Pred Value 1.0000 0.9524 1.0000
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3000 0.3333
## Detection Prevalence 0.3333 0.3000 0.3667
## Balanced Accuracy 1.0000 0.9500 0.9750
modelo6 <- train(Species~ ., data= entrenamiento,
method = "rf", #Cambiar, para hacer un nuevo modelo
preProcess=c("scale","center"),
trControl = trainControl(method = "cv", number = 10),
tuneGrid = expand.grid(mtry = c(2,4,6))
)
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
resultado_entrenamiento6 <- predict(modelo6, entrenamiento)
resultado_prueba6 <- predict(modelo6, prueba)
#Matriz de confusion del entrenamiento
mcre6 <- confusionMatrix(resultado_entrenamiento6,
entrenamiento$Species)
mcre6
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 40 0 0
## versicolor 0 40 0
## virginica 0 0 40
##
## Overall Statistics
##
## Accuracy : 1
## 95% CI : (0.9697, 1)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 1.0000 1.0000
## Specificity 1.0000 1.0000 1.0000
## Pos Pred Value 1.0000 1.0000 1.0000
## Neg Pred Value 1.0000 1.0000 1.0000
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3333 0.3333
## Detection Prevalence 0.3333 0.3333 0.3333
## Balanced Accuracy 1.0000 1.0000 1.0000
#Matriz de confusion del resultado de la prueba
mcrp6 <- confusionMatrix(resultado_prueba6,
prueba$Species)
mcrp6
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 10 0 0
## versicolor 0 10 2
## virginica 0 0 8
##
## Overall Statistics
##
## Accuracy : 0.9333
## 95% CI : (0.7793, 0.9918)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 8.747e-12
##
## Kappa : 0.9
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 1.0000 0.8000
## Specificity 1.0000 0.9000 1.0000
## Pos Pred Value 1.0000 0.8333 1.0000
## Neg Pred Value 1.0000 1.0000 0.9091
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3333 0.2667
## Detection Prevalence 0.3333 0.4000 0.2667
## Balanced Accuracy 1.0000 0.9500 0.9000
resultados <- data.frame(
"SVM Lineal" = c(mcre1$overall["Accuracy"], mcrp1$overall["Accuracy"]),
"SVM Radial" = c(mcre2$overall["Accuracy"], mcrp2$overall["Accuracy"]),
"SVM Polinomico" = c(mcre3$overall["Accuracy"], mcrp3$overall["Accuracy"]),
"Arbol de decision" = c(mcre4$overall["Accuracy"], mcrp4$overall["Accuracy"]),
"Redes neuronales" = c(mcre5$overall["Accuracy"], mcrp5$overall["Accuracy"]),
"Bosques aleatorios" = c(mcre6$overall["Accuracy"], mcrp6$overall["Accuracy"])
)
rownames(resultados) <- c("Precision de Entrenamiento", "Precision de Prueba")
resultados
## SVM.Lineal SVM.Radial SVM.Polinomico
## Precision de Entrenamiento 0.9916667 0.9916667 0.9916667
## Precision de Prueba 0.9666667 0.9333333 0.9666667
## Arbol.de.decision Redes.neuronales
## Precision de Entrenamiento 0.9666667 0.9666667
## Precision de Prueba 0.9333333 0.9666667
## Bosques.aleatorios
## Precision de Entrenamiento 1.0000000
## Precision de Prueba 0.9333333