1. Librerías
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.1 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.3 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
library(xgboost)
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(corrplot)
## corrplot 0.95 loaded
2. Simulación dataset clínico
# Simulate a synthetic clinical cohort of `n` patients.
# Every column is drawn independently of the others, so `target` has no
# built-in relationship with any predictor.
set.seed(123)
n <- 1200
data <- tibble(
  age = rnorm(n, mean = 50, sd = 12),
  bmi = rnorm(n, mean = 27, sd = 6),
  glucose = rnorm(n, mean = 110, sd = 25),
  cholesterol = rnorm(n, mean = 210, sd = 35),
  smoker = sample(c(0, 1), size = n, replace = TRUE),
  activity = rnorm(n, mean = 5, sd = 2),
  target = sample(c(0, 1), size = n, replace = TRUE)
)
3. EDA
# Per-column descriptive statistics (min, quartiles, mean, max).
data %>%
  summary()
## age bmi glucose cholesterol
## Min. :16.28 Min. : 8.713 Min. : 31.77 Min. :103.5
## 1st Qu.:42.30 1st Qu.:23.082 1st Qu.: 93.90 1st Qu.:186.3
## Median :50.11 Median :26.971 Median :109.38 Median :210.6
## Mean :50.26 Mean :27.021 Mean :110.07 Mean :209.3
## 3rd Qu.:57.92 3rd Qu.:31.119 3rd Qu.:126.95 3rd Qu.:232.4
## Max. :88.89 Max. :47.342 Max. :195.53 Max. :330.6
## smoker activity target
## Min. :0.0000 Min. :-1.579 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.: 3.657 1st Qu.:0.0000
## Median :0.0000 Median : 5.049 Median :1.0000
## Mean :0.4925 Mean : 5.058 Mean :0.5142
## 3rd Qu.:1.0000 3rd Qu.: 6.551 3rd Qu.:1.0000
## Max. :1.0000 Max. :12.431 Max. :1.0000
# Plot the pairwise correlation matrix of all columns, outcome included.
data %>%
  cor() %>%
  corrplot()

4. Split
# Hold out 30% of the rows for testing, stratified by outcome class.
# createDataPartition() samples within the levels of `y` only when `y` is a
# factor; a numeric `y` is binned by percentiles instead, which for a 0/1
# vector does not separate the two classes. Passing a factor gives a split
# that preserves the class proportions.
# NOTE: this changes which rows land in each split compared with the numeric
# call, so metrics rendered from an earlier run will differ after re-running.
set.seed(123)
trainIndex <- createDataPartition(factor(data$target), p = 0.7, list = FALSE)
# With `list = FALSE` the result is a one-column matrix; take that column as
# a plain integer vector before using it as a row index.
train_rows <- trainIndex[, 1]
train <- data[train_rows, ]
test <- data[-train_rows, ]
5. Random Forest baseline
# Baseline model: random forest on all predictors with package defaults.
# The outcome is wrapped in as.factor() so the forest is fit as a classifier.
rf_model <- randomForest(
  as.factor(target) ~ .,
  data = train
)
6. Preparación XGBoost
# Convert a data frame into an xgb.DMatrix: every column except `target`
# becomes the feature matrix, and `target` becomes the label.
to_dmatrix <- function(df) {
  features <- df %>% select(-target)
  xgb.DMatrix(data = as.matrix(features), label = df$target)
}

train_matrix <- to_dmatrix(train)
test_matrix <- to_dmatrix(test)
7. Modelo XGBoost
# Gradient-boosted trees for a binary outcome; AUC is the evaluation metric.
params <- list(
  objective = "binary:logistic",
  eval_metric = "auc"
)

# Train for a fixed 120 boosting rounds on the training matrix.
model <- xgb.train(params = params, data = train_matrix, nrounds = 120)
8. Predicción
# Score the held-out rows with the trained booster.
pred <- predict(model, newdata = test_matrix)
9. Evaluación
# ROC analysis of the test-set scores.
# `levels` and `direction` are fixed explicitly (0 = control, 1 = case, higher
# score means case) instead of letting roc() pick them from the data.
# Auto-detected direction can flip the comparison and report a below-chance
# model as above chance, which biases the AUC upward.
roc_obj <- roc(
  response = test$target,
  predictor = pred,
  levels = c(0, 1),
  direction = "<"
)
plot(roc_obj)

auc(roc_obj)
## Area under the curve: 0.5277
10. Importancia
# Extract the per-feature importance table from the booster and plot it.
importance <- xgb.importance(model = model)
xgb.plot.importance(importance_matrix = importance)

11. Análisis de errores
# Collect the misclassified test rows.
# Scores above 0.5 are labelled 1, otherwise 0. `.env$pred` refers explicitly
# to the score vector in the calling environment, since `test` has no `pred`
# column of its own at this point.
errors <- test %>%
  mutate(pred = as.numeric(.env$pred > 0.5)) %>%
  filter(pred != target)

head(errors)
## # A tibble: 6 × 8
## age bmi glucose cholesterol smoker activity target pred
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 50.8 22.5 99.7 192. 0 4.92 1 0
## 2 55.5 21.1 63.6 207. 0 4.76 1 0
## 3 54.3 24.4 89.6 267. 0 4.12 1 0
## 4 60.1 20.4 112. 221. 0 1.57 0 1
## 5 45.2 28.1 92.6 233. 1 9.62 1 0
## 6 49.0 29.3 110. 201. 0 5.95 0 1
12. Conclusión
# Conclusion text. The simulated `target` is sampled independently of every
# predictor, so the message reports the absence of signal rather than
# attributing predictive value to any group of variables.
cat(
  "El target simulado es independiente de los predictores: con un AUC ",
  "cercano a 0.5 el modelo no supera al azar y la importancia de variables ",
  "no es interpretable.",
  sep = ""
)
## El target simulado es independiente de los predictores: con un AUC cercano a 0.5 el modelo no supera al azar y la importancia de variables no es interpretable.