1. Librerías

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.1     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.3     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(xgboost)
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## 
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(corrplot)
## corrplot 0.95 loaded

2. Simulación dataset clínico

# Simulate a synthetic clinical dataset with `n` rows. Every column is drawn
# independently of the others -- including `target` -- so by construction no
# predictor carries information about the outcome.
set.seed(123)
n <- 1200

data <- tibble(
  age         = rnorm(n, mean = 50, sd = 12),
  bmi         = rnorm(n, mean = 27, sd = 6),
  glucose     = rnorm(n, mean = 110, sd = 25),
  cholesterol = rnorm(n, mean = 210, sd = 35),
  smoker      = sample(c(0, 1), size = n, replace = TRUE), # 0/1 flag
  activity    = rnorm(n, mean = 5, sd = 2),
  target      = sample(c(0, 1), size = n, replace = TRUE)  # 0/1 outcome
)

3. EDA

# Per-column summary statistics of the simulated dataset.
data %>% summary()
##       age             bmi            glucose        cholesterol   
##  Min.   :16.28   Min.   : 8.713   Min.   : 31.77   Min.   :103.5  
##  1st Qu.:42.30   1st Qu.:23.082   1st Qu.: 93.90   1st Qu.:186.3  
##  Median :50.11   Median :26.971   Median :109.38   Median :210.6  
##  Mean   :50.26   Mean   :27.021   Mean   :110.07   Mean   :209.3  
##  3rd Qu.:57.92   3rd Qu.:31.119   3rd Qu.:126.95   3rd Qu.:232.4  
##  Max.   :88.89   Max.   :47.342   Max.   :195.53   Max.   :330.6  
##      smoker          activity          target      
##  Min.   :0.0000   Min.   :-1.579   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.: 3.657   1st Qu.:0.0000  
##  Median :0.0000   Median : 5.049   Median :1.0000  
##  Mean   :0.4925   Mean   : 5.058   Mean   :0.5142  
##  3rd Qu.:1.0000   3rd Qu.: 6.551   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :12.431   Max.   :1.0000
# Correlation matrix of all columns (including the 0/1 columns), drawn as a
# correlation plot.
corrplot(corr = cor(data))

4. Split

# Train/test split: 70% of the rows go to `train`, the rest to `test`.
# NOTE(review): `target` is numeric (0/1) here. Per the caret documentation,
# class-wise stratified sampling applies to factor outcomes, while numeric
# outcomes are grouped by percentiles -- consider passing
# `factor(data$target)`. Left unchanged so the recorded results below stay
# reproducible.
set.seed(123)
trainIndex <- createDataPartition(y = data$target, p = 0.7, list = FALSE)
train <- data[trainIndex, ]
test <- data[-trainIndex, ]

5. Random Forest baseline

# Random forest classifier on all predictors; the outcome is converted to a
# factor so that randomForest fits a classification (not regression) forest.
# The fitted model is not evaluated anywhere in the visible part of this
# document.
rf_model <- randomForest(
  as.factor(target) ~ .,
  data = train
)

6. Preparación XGBoost

# Build the xgboost input objects: predictors as a numeric matrix (every
# column except `target`), with `target` supplied separately as the label.
train_matrix <- xgb.DMatrix(
  data = train %>%
    select(-target) %>%
    as.matrix(),
  label = train$target
)

test_matrix <- xgb.DMatrix(
  data = test %>%
    select(-target) %>%
    as.matrix(),
  label = test$target
)

7. Modelo XGBoost

# Gradient-boosted binary classifier: logistic objective with AUC as the
# evaluation metric, trained for 120 boosting rounds; all other
# hyperparameters are left at the package defaults.
params <- list(
  objective = "binary:logistic",
  eval_metric = "auc"
)

model <- xgb.train(
  params = params,
  data = train_matrix,
  nrounds = 120
)

8. Predicción

# Score the held-out rows with the boosted model. With the binary:logistic
# objective the predictions are probabilities (thresholded at 0.5 later on).
pred <- predict(model, newdata = test_matrix)

9. Evaluación

# ROC analysis of the held-out predictions.
# `levels` and `direction` are set explicitly (0 = control, 1 = case, controls
# expected to score lower than cases) instead of being auto-detected. pROC's
# automatic direction picks whichever orientation gives the higher AUC, which
# biases the estimate upward on randomized / near-chance data such as this
# simulation. These values match what auto-detection selected in the recorded
# run, so the reported AUC is unchanged; the "Setting levels/direction"
# messages are simply no longer emitted.
roc_obj <- roc(
  response = test$target,
  predictor = pred,
  levels = c(0, 1),
  direction = "<"
)
plot(roc_obj)

auc(roc_obj)
## Area under the curve: 0.5277

10. Importancia

# Per-feature importance table extracted from the boosted model, then plotted.
importance <- xgb.importance(model = model)
xgb.plot.importance(importance_matrix = importance)

11. Análisis de errores

# Misclassified test rows. `test` has no `pred` column, so the `pred` on the
# right-hand side of mutate() resolves to the global vector of predicted
# probabilities; it is thresholded at 0.5 into a 0/1 class label and compared
# with the true outcome.
errors <- test %>%
  mutate(pred = as.numeric(pred > 0.5)) %>%
  filter(target != pred)

errors %>% head()
## # A tibble: 6 × 8
##     age   bmi glucose cholesterol smoker activity target  pred
##   <dbl> <dbl>   <dbl>       <dbl>  <dbl>    <dbl>  <dbl> <dbl>
## 1  50.8  22.5    99.7        192.      0     4.92      1     0
## 2  55.5  21.1    63.6        207.      0     4.76      1     0
## 3  54.3  24.4    89.6        267.      0     4.12      1     0
## 4  60.1  20.4   112.         221.      0     1.57      0     1
## 5  45.2  28.1    92.6        233.      1     9.62      1     0
## 6  49.0  29.3   110.         201.      0     5.95      0     1

12. Conclusión

# The outcome in this document is simulated independently of every predictor
# and the test AUC is ~0.53 (chance level), so the results do not support
# singling out any variable group as predictive. The printed conclusion states
# that explicitly instead of claiming a finding.
cat(
  "Con un objetivo simulado de forma independiente de las variables, ",
  "el modelo no supera el azar (AUC ≈ 0.53): ",
  "ninguna variable resulta clave en la predicción.\n",
  sep = ""
)
## Con un objetivo simulado de forma independiente de las variables, el modelo no supera el azar (AUC ≈ 0.53): ninguna variable resulta clave en la predicción.