1. LibrerĂas
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.1 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.3 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(xgboost)
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(corrplot)
## corrplot 0.95 loaded
library(DataExplorer)
2. Carga de datos
data <- iris
4. EDA
summary(data)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## target
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.3333
## 3rd Qu.:1.0000
## Max. :1.0000
plot_histogram(data)

corrplot(cor(data), method = "color")

5. Feature Engineering
data <- data %>%
mutate(
petal_ratio = Petal.Length / Petal.Width,
sepal_ratio = Sepal.Length / Sepal.Width
)
6. Split train/test
set.seed(123)
trainIndex <- createDataPartition(data$target, p = 0.7, list = FALSE)
train <- data[trainIndex,]
test <- data[-trainIndex,]
7. PreparaciĂ³n matrices
train_matrix <- xgb.DMatrix(
data = as.matrix(train %>% select(-target)),
label = train$target
)
test_matrix <- xgb.DMatrix(
data = as.matrix(test %>% select(-target)),
label = test$target
)
8. ParĂ¡metros base
params <- list(
objective = "binary:logistic",
eval_metric = "auc",
max_depth = 4,
eta = 0.1,
subsample = 0.8,
colsample_bytree = 0.8
)
9. Entrenamiento inicial
model_xgb <- xgb.train(
params = params,
data = train_matrix,
nrounds = 100,
watchlist = list(train = train_matrix, test = test_matrix),
early_stopping_rounds = 10,
verbose = 0
)
## Warning in throw_err_or_depr_msg("Parameter '", match_old, "' has been renamed
## to '", : Parameter 'watchlist' has been renamed to 'evals'. This warning will
## become an error in a future version.
10. Predicciones
pred_prob <- predict(model_xgb, test_matrix)
pred_class <- ifelse(pred_prob > 0.5, 1, 0)
11. EvaluaciĂ³n
confusionMatrix(as.factor(pred_class), as.factor(test$target))
## Warning in confusionMatrix.default(as.factor(pred_class),
## as.factor(test$target)): Levels are not in the same order for reference and
## data. Refactoring data to match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 31 14
## 1 0 0
##
## Accuracy : 0.6889
## 95% CI : (0.5335, 0.8183)
## No Information Rate : 0.6889
## P-Value [Acc > NIR] : 0.571667
##
## Kappa : 0
##
## Mcnemar's Test P-Value : 0.000512
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.6889
## Neg Pred Value : NaN
## Prevalence : 0.6889
## Detection Rate : 0.6889
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : 0
##
12. ROC - AUC
roc_obj <- roc(test$target, pred_prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_obj)

auc(roc_obj)
## Area under the curve: 1
13. Importancia de variables
importance <- xgb.importance(model = model_xgb)
xgb.plot.importance(importance)

14. Grid Search manual
grid <- expand.grid(
max_depth = c(3,4,5),
eta = c(0.01, 0.1),
subsample = c(0.7, 0.9)
)
results <- data.frame()
for(i in 1:nrow(grid)){
params_tmp <- list(
objective = "binary:logistic",
eval_metric = "auc",
max_depth = grid$max_depth[i],
eta = grid$eta[i],
subsample = grid$subsample[i]
)
model_tmp <- xgb.train(
params = params_tmp,
data = train_matrix,
nrounds = 50,
verbose = 0
)
pred_tmp <- predict(model_tmp, test_matrix)
auc_tmp <- auc(test$target, pred_tmp)
results <- rbind(results, cbind(grid[i,], auc = auc_tmp))
}
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
15. Resultados tuning
results %>% arrange(desc(auc))
## max_depth eta subsample auc
## 1 3 0.01 0.7 1
## 2 4 0.01 0.7 1
## 3 5 0.01 0.7 1
## 4 3 0.10 0.7 1
## 5 4 0.10 0.7 1
## 6 5 0.10 0.7 1
## 7 3 0.01 0.9 1
## 8 4 0.01 0.9 1
## 9 5 0.01 0.9 1
## 10 3 0.10 0.9 1
## 11 4 0.10 0.9 1
## 12 5 0.10 0.9 1
16. AnĂ¡lisis de errores
errors <- test %>%
mutate(pred = pred_class, prob = pred_prob) %>%
filter(pred != target)
head(errors)
## Sepal.Length Sepal.Width Petal.Length Petal.Width target petal_ratio
## 1 5.1 3.5 1.4 0.2 1 7.000000
## 2 4.9 3.0 1.4 0.2 1 7.000000
## 3 4.7 3.2 1.3 0.2 1 6.500000
## 4 5.0 3.6 1.4 0.2 1 7.000000
## 5 5.4 3.7 1.5 0.2 1 7.500000
## 6 5.1 3.5 1.4 0.3 1 4.666667
## sepal_ratio pred prob
## 1 1.457143 0 0.4015939
## 2 1.633333 0 0.4015939
## 3 1.468750 0 0.4015939
## 4 1.388889 0 0.4015939
## 5 1.459459 0 0.4015939
## 6 1.457143 0 0.4015939
17. Threshold tuning
thresholds <- seq(0.1,0.9,0.1)
perf <- data.frame()
for(t in thresholds){
pred_t <- ifelse(pred_prob > t,1,0)
acc <- mean(pred_t == test$target)
perf <- rbind(perf, data.frame(threshold=t,accuracy=acc))
}
perf
## threshold accuracy
## 1 0.1 0.3111111
## 2 0.2 0.3111111
## 3 0.3 0.3111111
## 4 0.4 1.0000000
## 5 0.5 0.6888889
## 6 0.6 0.6888889
## 7 0.7 0.6888889
## 8 0.8 0.6888889
## 9 0.9 0.6888889
18. Insights de negocio
cat("XGBoost captura relaciones no lineales y mejora performance frente a modelos lineales.")
## XGBoost captura relaciones no lineales y mejora performance frente a modelos lineales.
19. Recomendaciones
cat("Optimizar con Bayesian tuning y usar SHAP para interpretabilidad.")
## Optimizar con Bayesian tuning y usar SHAP para interpretabilidad.