1. Librerías

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.1     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.3     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(xgboost)
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## 
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(zoo)
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(corrplot)
## corrplot 0.95 loaded

2. Generación de datos simulados tipo investigación

set.seed(123)
n <- 600

data <- tibble(
  date = seq.Date(as.Date("2020-01-01"), by="week", length.out=n),
  temperature = rnorm(n, 25, 4),
  humidity = runif(n, 60, 95),
  rainfall = runif(n, 0, 250),
  pollution = rnorm(n, 50, 10),
  dengue_cases = rpois(n, lambda = 20)
)

3. Visualización temporal

ggplot(data, aes(date, dengue_cases)) +
  geom_line(color="steelblue") +
  theme_minimal()

4. Correlaciones

corrplot(cor(data %>% select(-date)))

5. Target binario (riesgo epidemiológico)

data <- data %>%
  mutate(target = ifelse(dengue_cases > quantile(dengue_cases,0.75),1,0))

6. Split train/test

set.seed(123)
trainIndex <- createDataPartition(data$target, p=0.7, list=FALSE)
train <- data[trainIndex,]
test  <- data[-trainIndex,]

7. Matrices XGBoost

features <- c("temperature", "humidity", "rainfall", "pollution")

train_matrix <- xgb.DMatrix(
  data = as.matrix(train %>% select(all_of(features))),
  label = train$target
)

test_matrix <- xgb.DMatrix(
  data = as.matrix(test %>% select(all_of(features))),
  label = test$target
)

8. Entrenamiento modelo

params <- list(objective="binary:logistic", eval_metric="auc")

model <- xgb.train(
  params = params,
  data = train_matrix,
  nrounds = 150,
  evals = list(train = train_matrix, test = test_matrix),
  early_stopping_rounds = 15,
  maximize = TRUE,
  verbose = 1
)
## Multiple eval metrics are present. Will use test_auc for early stopping.
## Will train until test_auc hasn't improved in 15 rounds.
## 
## [1]  train-auc:0.801542  test-auc:0.429794 
## [2]  train-auc:0.819546  test-auc:0.487901 
## [3]  train-auc:0.874664  test-auc:0.506749 
## [4]  train-auc:0.897491  test-auc:0.511523 
## [5]  train-auc:0.917899  test-auc:0.479177 
## [6]  train-auc:0.926667  test-auc:0.464362 
## [7]  train-auc:0.952290  test-auc:0.470288 
## [8]  train-auc:0.957868  test-auc:0.472099 
## [9]  train-auc:0.961104  test-auc:0.466996 
## [10] train-auc:0.967241  test-auc:0.469465 
## [11] train-auc:0.970401  test-auc:0.477037 
## [12] train-auc:0.972698  test-auc:0.472263 
## [13] train-auc:0.976508  test-auc:0.468148 
## [14] train-auc:0.977989  test-auc:0.478519 
## [15] train-auc:0.982252  test-auc:0.485926 
## [16] train-auc:0.984324  test-auc:0.496132 
## [17] train-auc:0.987952  test-auc:0.486420 
## [18] train-auc:0.990385  test-auc:0.482305 
## Stopping. Best iteration:
## [19] train-auc:0.990385  test-auc:0.484280
## 
## [19] train-auc:0.990385  test-auc:0.484280

9. Predicciones

pred <- predict(model, test_matrix)

10. Evaluación ROC

roc_obj <- roc(test$target, pred)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_obj)

auc(roc_obj)
## Area under the curve: 0.5115

11. Importancia variables

importance <- xgb.importance(model=model)
xgb.plot.importance(importance)

12. Insights

cat("Factores climáticos muestran relación con riesgo epidemiológico.")
## Factores climáticos muestran relación con riesgo epidemiológico.