1. LibrerĂ­as

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.1     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.3     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(xgboost)
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## 
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(corrplot)
## corrplot 0.95 loaded
library(DataExplorer)

2. Carga de datos

data <- iris

3. TransformaciĂ³n a problema binario

data <- data %>%
  mutate(target = ifelse(Species == "setosa", 1, 0)) %>%
  select(-Species)

4. EDA

summary(data)
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##      target      
##  Min.   :0.0000  
##  1st Qu.:0.0000  
##  Median :0.0000  
##  Mean   :0.3333  
##  3rd Qu.:1.0000  
##  Max.   :1.0000
plot_histogram(data)

corrplot(cor(data), method = "color")

5. Feature Engineering

data <- data %>%
  mutate(
    petal_ratio = Petal.Length / Petal.Width,
    sepal_ratio = Sepal.Length / Sepal.Width
  )

6. Split train/test

set.seed(123)
trainIndex <- createDataPartition(data$target, p = 0.7, list = FALSE)

train <- data[trainIndex,]
test  <- data[-trainIndex,]

7. PreparaciĂ³n matrices

train_matrix <- xgb.DMatrix(
  data = as.matrix(train %>% select(-target)),
  label = train$target
)

test_matrix <- xgb.DMatrix(
  data = as.matrix(test %>% select(-target)),
  label = test$target
)

8. ParĂ¡metros base

params <- list(
  objective = "binary:logistic",
  eval_metric = "auc",
  max_depth = 4,
  eta = 0.1,
  subsample = 0.8,
  colsample_bytree = 0.8
)

9. Entrenamiento inicial

model_xgb <- xgb.train(
  params = params,
  data = train_matrix,
  nrounds = 100,
  watchlist = list(train = train_matrix, test = test_matrix),
  early_stopping_rounds = 10,
  verbose = 0
)
## Warning in throw_err_or_depr_msg("Parameter '", match_old, "' has been renamed
## to '", : Parameter 'watchlist' has been renamed to 'evals'. This warning will
## become an error in a future version.

10. Predicciones

pred_prob <- predict(model_xgb, test_matrix)
pred_class <- ifelse(pred_prob > 0.5, 1, 0)

11. EvaluaciĂ³n

confusionMatrix(as.factor(pred_class), as.factor(test$target))
## Warning in confusionMatrix.default(as.factor(pred_class),
## as.factor(test$target)): Levels are not in the same order for reference and
## data. Refactoring data to match.
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 31 14
##          1  0  0
##                                           
##                Accuracy : 0.6889          
##                  95% CI : (0.5335, 0.8183)
##     No Information Rate : 0.6889          
##     P-Value [Acc > NIR] : 0.571667        
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : 0.000512        
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.0000          
##          Pos Pred Value : 0.6889          
##          Neg Pred Value :    NaN          
##              Prevalence : 0.6889          
##          Detection Rate : 0.6889          
##    Detection Prevalence : 1.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : 0               
## 

12. ROC - AUC

roc_obj <- roc(test$target, pred_prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_obj)

auc(roc_obj)
## Area under the curve: 1

13. Importancia de variables

importance <- xgb.importance(model = model_xgb)
xgb.plot.importance(importance)

14. Grid Search manual

grid <- expand.grid(
  max_depth = c(3,4,5),
  eta = c(0.01, 0.1),
  subsample = c(0.7, 0.9)
)

results <- data.frame()

for(i in 1:nrow(grid)){
  params_tmp <- list(
    objective = "binary:logistic",
    eval_metric = "auc",
    max_depth = grid$max_depth[i],
    eta = grid$eta[i],
    subsample = grid$subsample[i]
  )

  model_tmp <- xgb.train(
    params = params_tmp,
    data = train_matrix,
    nrounds = 50,
    verbose = 0
  )

  pred_tmp <- predict(model_tmp, test_matrix)
  auc_tmp <- auc(test$target, pred_tmp)

  results <- rbind(results, cbind(grid[i,], auc = auc_tmp))
}
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

15. Resultados tuning

results %>% arrange(desc(auc))
##    max_depth  eta subsample auc
## 1          3 0.01       0.7   1
## 2          4 0.01       0.7   1
## 3          5 0.01       0.7   1
## 4          3 0.10       0.7   1
## 5          4 0.10       0.7   1
## 6          5 0.10       0.7   1
## 7          3 0.01       0.9   1
## 8          4 0.01       0.9   1
## 9          5 0.01       0.9   1
## 10         3 0.10       0.9   1
## 11         4 0.10       0.9   1
## 12         5 0.10       0.9   1

16. AnĂ¡lisis de errores

errors <- test %>%
  mutate(pred = pred_class, prob = pred_prob) %>%
  filter(pred != target)

head(errors)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width target petal_ratio
## 1          5.1         3.5          1.4         0.2      1    7.000000
## 2          4.9         3.0          1.4         0.2      1    7.000000
## 3          4.7         3.2          1.3         0.2      1    6.500000
## 4          5.0         3.6          1.4         0.2      1    7.000000
## 5          5.4         3.7          1.5         0.2      1    7.500000
## 6          5.1         3.5          1.4         0.3      1    4.666667
##   sepal_ratio pred      prob
## 1    1.457143    0 0.4015939
## 2    1.633333    0 0.4015939
## 3    1.468750    0 0.4015939
## 4    1.388889    0 0.4015939
## 5    1.459459    0 0.4015939
## 6    1.457143    0 0.4015939

17. Threshold tuning

thresholds <- seq(0.1,0.9,0.1)

perf <- data.frame()

for(t in thresholds){
  pred_t <- ifelse(pred_prob > t,1,0)
  acc <- mean(pred_t == test$target)
  perf <- rbind(perf, data.frame(threshold=t,accuracy=acc))
}

perf
##   threshold  accuracy
## 1       0.1 0.3111111
## 2       0.2 0.3111111
## 3       0.3 0.3111111
## 4       0.4 1.0000000
## 5       0.5 0.6888889
## 6       0.6 0.6888889
## 7       0.7 0.6888889
## 8       0.8 0.6888889
## 9       0.9 0.6888889

18. Insights de negocio

cat("XGBoost captura relaciones no lineales y mejora performance frente a modelos lineales.")
## XGBoost captura relaciones no lineales y mejora performance frente a modelos lineales.

19. Recomendaciones

cat("Optimizar con Bayesian tuning y usar SHAP para interpretabilidad.")
## Optimizar con Bayesian tuning y usar SHAP para interpretabilidad.