1. Librerías

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.1     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.3     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(caret)

## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift

library(randomForest)

## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin

library(xgboost)
library(pROC)

## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## 
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

library(corrplot)

## corrplot 0.95 loaded

2. Simulación dataset clínico

# Simulate a synthetic clinical cohort. The seed fixes the random draws; the
# columns are generated in the order written, which is also the order in which
# random numbers are consumed, so the columns must not be rearranged.
set.seed(123)
n <- 1200

data <- tibble(
  age         = rnorm(n, mean = 50, sd = 12),
  bmi         = rnorm(n, mean = 27, sd = 6),
  glucose     = rnorm(n, mean = 110, sd = 25),
  cholesterol = rnorm(n, mean = 210, sd = 35),
  smoker      = sample(c(0, 1), size = n, replace = TRUE),
  activity    = rnorm(n, mean = 5, sd = 2),
  # The outcome is sampled on its own, without reference to any predictor.
  target      = sample(c(0, 1), size = n, replace = TRUE)
)

3. EDA

# Per-column descriptive statistics of the simulated data.
data %>% summary()

##       age             bmi            glucose        cholesterol   
##  Min.   :16.28   Min.   : 8.713   Min.   : 31.77   Min.   :103.5  
##  1st Qu.:42.30   1st Qu.:23.082   1st Qu.: 93.90   1st Qu.:186.3  
##  Median :50.11   Median :26.971   Median :109.38   Median :210.6  
##  Mean   :50.26   Mean   :27.021   Mean   :110.07   Mean   :209.3  
##  3rd Qu.:57.92   3rd Qu.:31.119   3rd Qu.:126.95   3rd Qu.:232.4  
##  Max.   :88.89   Max.   :47.342   Max.   :195.53   Max.   :330.6  
##      smoker          activity          target      
##  Min.   :0.0000   Min.   :-1.579   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.: 3.657   1st Qu.:0.0000  
##  Median :0.0000   Median : 5.049   Median :1.0000  
##  Mean   :0.4925   Mean   : 5.058   Mean   :0.5142  
##  3rd Qu.:1.0000   3rd Qu.: 6.551   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :12.431   Max.   :1.0000

# Pairwise correlations between all columns, drawn as a correlogram.
cor_matrix <- cor(data)
corrplot(cor_matrix)

4. Split

# Partition the rows with p = 0.7 for training; the remaining rows form the
# test set. The seed makes the partition repeatable.
set.seed(123)
trainIndex <- createDataPartition(y = data$target, p = 0.7, list = FALSE)
# NOTE(review): target is numeric here; caret documents class-wise sampling
# for factor outcomes only -- confirm the split is stratified as intended.
train_rows <- as.vector(trainIndex)
train <- data[train_rows, ]
test <- data[-train_rows, ]

5. Random Forest baseline

# Baseline model: target is cast to a factor in the formula and every other
# column of train is used as a predictor.
rf_formula <- as.factor(target) ~ .
rf_model <- randomForest(rf_formula, data = train)

6. Preparación XGBoost

# Build an xgb.DMatrix from a data frame: every column except target becomes
# a feature, and target supplies the labels.
to_dmatrix <- function(df) {
  features <- df[setdiff(names(df), "target")]
  xgb.DMatrix(data = as.matrix(features), label = df$target)
}

train_matrix <- to_dmatrix(train)
test_matrix <- to_dmatrix(test)

7. Modelo XGBoost

# Booster settings: binary logistic objective with AUC as the evaluation
# metric.
params <- list(
  objective = "binary:logistic",
  eval_metric = "auc"
)

# Fit 120 boosting rounds on the training matrix. No evaluation set is passed
# to this call.
model <- xgb.train(
  data = train_matrix,
  params = params,
  nrounds = 120
)

8. Predicción

# Score the held-out rows with the fitted booster.
pred <- predict(object = model, newdata = test_matrix)

9. Evaluación

# Build the ROC curve for the held-out predictions. The class coding and the
# comparison direction are stated explicitly instead of being auto-detected:
# pROC otherwise chooses the direction from the data, which its documentation
# warns biases the AUC upward. The values below are the ones the
# auto-detection reported for this run (control = 0, case = 1,
# controls < cases), so the curve itself is unchanged.
roc_obj <- roc(
  response = test$target,
  predictor = pred,
  levels = c(0, 1),
  direction = "<"
)

# Draw the ROC curve, then report the area under it.
plot(x = roc_obj)

roc_auc <- auc(roc_obj)
print(roc_auc)

## Area under the curve: 0.5277

10. Importancia

# Feature-importance table computed from the fitted booster, then plotted.
importance <- xgb.importance(model = model)
xgb.plot.importance(importance_matrix = importance)

11. Análisis de errores

# Collect the test rows that the model misclassifies at a 0.5 cutoff.
#
# `.env$pred` pins the lookup to the prediction vector in the calling
# environment. `test` has no `pred` column, so the bare name only resolved by
# falling through the data mask, and it would silently change meaning if such
# a column were ever added. The length check guards against `pred` having been
# computed on a different set of rows than `test`.
stopifnot(length(pred) == nrow(test))

errors <- test %>%
  mutate(pred = if_else(.env$pred > 0.5, 1, 0)) %>%
  filter(pred != target)

head(errors)

## # A tibble: 6 × 8
##     age   bmi glucose cholesterol smoker activity target  pred
##   <dbl> <dbl>   <dbl>       <dbl>  <dbl>    <dbl>  <dbl> <dbl>
## 1  50.8  22.5    99.7        192.      0     4.92      1     0
## 2  55.5  21.1    63.6        207.      0     4.76      1     0
## 3  54.3  24.4    89.6        267.      0     4.12      1     0
## 4  60.1  20.4   112.         221.      0     1.57      0     1
## 5  45.2  28.1    92.6        233.      1     9.62      1     0
## 6  49.0  29.3   110.         201.      0     5.95      0     1

12. Conclusión

# State only what the evaluation supports. The outcome was simulated
# independently of the predictors and the test AUC is close to 0.5, so this
# run gives no evidence that any variable drives the risk.
cat(sprintf(
  "Con un objetivo simulado al azar, el modelo alcanza un AUC de %.2f en test: no discrimina mejor que el azar y no permite identificar variables clave.",
  as.numeric(auc(roc_obj))
))

## Con un objetivo simulado al azar, el modelo alcanza un AUC de 0.53 en test: no discrimina mejor que el azar y no permite identificar variables clave.

AI Healthcare Risk Prediction

Marcelo Callao Pimentel

2026-04-27

1. Librerías

2. Simulación dataset clínico

3. EDA

4. Split

5. Random Forest baseline

6. Preparación XGBoost

7. Modelo XGBoost

8. Predicción

9. Evaluación

10. Importancia

11. Análisis de errores

12. Conclusión