# Cargar librerías
library(xgboost)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(data.table)
# Load the transactions from the CSV file.
df <- read.csv(file = "~/Desktop/E_Fraud.csv", header = TRUE, sep = ",")

# Drop the columns that are not needed for modelling.
unused_columns <- c("nameOrig", "nameDest")
df <- df[, !(colnames(df) %in% unused_columns)]

# Make sure the target is a factor (no-op if it already is one).
df$isFraud <- as.factor(df$isFraud)

# Replace the categorical 'type' column with the integer code of its level.
type_factor <- as.factor(df$type)
df$type <- as.numeric(type_factor)

# 80% / 20% train/test split on the target, seeded for reproducibility.
set.seed(123)
trainIndex <- createDataPartition(y = df$isFraud, p = 0.8, list = FALSE)
trainData <- df[trainIndex, ]
testData <- df[-trainIndex, ]
# Convert them to factors and then to integer codes (label encoding, not one-hot)
#if (length(categorical_columns) > 0) {
# df[, (categorical_columns) := lapply(.SD, function(x) as.integer(as.factor(x))), .SDcols = #categorical_columns]
#}
# Every column except the target is a predictor.
feature_cols <- setdiff(colnames(trainData), "isFraud")

# Numeric predictor matrices for the two partitions.
train_x <- as.matrix(trainData[, feature_cols])
test_x <- as.matrix(testData[, feature_cols])

# xgboost expects 0/1 labels; the factor codes are 1/2, so shift down by one.
train_y <- as.numeric(trainData$isFraud) - 1
test_y <- as.numeric(testData$isFraud) - 1

# Inspect the structure of the preprocessed data frame.
str(df)
## 'data.frame': 400 obs. of 7 variables:
## $ type : num 4 4 5 4 2 2 2 2 4 4 ...
## $ amount : num 6275 21109 747530 12773 90315 ...
## $ oldbalanceOrg : num 805 0 599938 39327 0 ...
## $ newbalanceOrig: num 0 0 0 26554 0 ...
## $ oldbalanceDest: num 0 0 213814 0 626143 ...
## $ newbalanceDest: num 0 0 993586 0 1151347 ...
## $ isFraud : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
The following code weights the classes, trains the XGBoost model, and evaluates it; the plots come at the end:
# Class-imbalance weight: number of negative (0) training labels divided by
# the number of positive (1) training labels.
n_negative <- sum(train_y == 0)
n_positive <- sum(train_y == 1)
scale_pos_weight_value <- n_negative / n_positive

# Hyperparameters for the boosted-tree model.
params <- list(
  booster = "gbtree",
  objective = "binary:logistic",              # binary classification
  eval_metric = "auc",                        # report AUC, not accuracy
  scale_pos_weight = scale_pos_weight_value,  # weight for the minority class
  eta = 0.1,                                  # learning rate
  max_depth = 6,                              # tree depth
  subsample = 0.8,                            # row sampling, limits overfitting
  colsample_bytree = 0.8                      # column sampling per tree
)
# Wrap the feature matrices and labels in xgboost's DMatrix structure.
dtrain <- xgb.DMatrix(data = train_x, label = train_y)
dtest <- xgb.DMatrix(data = test_x, label = test_y)

# Data sets whose AUC is reported after every boosting round.
eval_sets <- list(train = dtrain, test = dtest)

# Train the model for a fixed 100 boosting rounds (trees).
# NOTE(review): the recorded log shows test AUC peaking at round 13 (0.982)
# and ending at 0.921 while train AUC reaches 1.0, i.e. the model overfits;
# consider choosing nrounds by cross-validation on the training data.
xgb_model <- xgb.train(
  params = params,
  data = dtrain,
  nrounds = 100,
  watchlist = eval_sets,
  verbose = 1
)
## [1] train-auc:0.967946 test-auc:0.952857
## [2] train-auc:0.990714 test-auc:0.936429
## [3] train-auc:0.990357 test-auc:0.943571
## [4] train-auc:0.994821 test-auc:0.974286
## [5] train-auc:0.995223 test-auc:0.970000
## [6] train-auc:0.995179 test-auc:0.967143
## [7] train-auc:0.996071 test-auc:0.971429
## [8] train-auc:0.997500 test-auc:0.977143
## [9] train-auc:0.997500 test-auc:0.975714
## [10] train-auc:0.998571 test-auc:0.975714
## [11] train-auc:0.998571 test-auc:0.980000
## [12] train-auc:0.998750 test-auc:0.979286
## [13] train-auc:0.998929 test-auc:0.982143
## [14] train-auc:0.999018 test-auc:0.980714
## [15] train-auc:0.999018 test-auc:0.978571
## [16] train-auc:0.998929 test-auc:0.978571
## [17] train-auc:0.998839 test-auc:0.977143
## [18] train-auc:0.998839 test-auc:0.977143
## [19] train-auc:0.998839 test-auc:0.977143
## [20] train-auc:0.999375 test-auc:0.977143
## [21] train-auc:0.999375 test-auc:0.975714
## [22] train-auc:0.999375 test-auc:0.975714
## [23] train-auc:0.999375 test-auc:0.974286
## [24] train-auc:0.999286 test-auc:0.971429
## [25] train-auc:0.999286 test-auc:0.972857
## [26] train-auc:0.999286 test-auc:0.974286
## [27] train-auc:0.999554 test-auc:0.972857
## [28] train-auc:0.999554 test-auc:0.971429
## [29] train-auc:0.999821 test-auc:0.965714
## [30] train-auc:0.999821 test-auc:0.968571
## [31] train-auc:0.999911 test-auc:0.970000
## [32] train-auc:1.000000 test-auc:0.970000
## [33] train-auc:0.999911 test-auc:0.970000
## [34] train-auc:1.000000 test-auc:0.968571
## [35] train-auc:1.000000 test-auc:0.964286
## [36] train-auc:0.999911 test-auc:0.961429
## [37] train-auc:1.000000 test-auc:0.964286
## [38] train-auc:1.000000 test-auc:0.961429
## [39] train-auc:1.000000 test-auc:0.962857
## [40] train-auc:1.000000 test-auc:0.964286
## [41] train-auc:1.000000 test-auc:0.965714
## [42] train-auc:1.000000 test-auc:0.964286
## [43] train-auc:1.000000 test-auc:0.962857
## [44] train-auc:1.000000 test-auc:0.961429
## [45] train-auc:1.000000 test-auc:0.961429
## [46] train-auc:1.000000 test-auc:0.958571
## [47] train-auc:1.000000 test-auc:0.958571
## [48] train-auc:1.000000 test-auc:0.957143
## [49] train-auc:1.000000 test-auc:0.957143
## [50] train-auc:1.000000 test-auc:0.957143
## [51] train-auc:1.000000 test-auc:0.957143
## [52] train-auc:1.000000 test-auc:0.957143
## [53] train-auc:1.000000 test-auc:0.957143
## [54] train-auc:1.000000 test-auc:0.957143
## [55] train-auc:1.000000 test-auc:0.957143
## [56] train-auc:1.000000 test-auc:0.951429
## [57] train-auc:1.000000 test-auc:0.951429
## [58] train-auc:1.000000 test-auc:0.952857
## [59] train-auc:1.000000 test-auc:0.957143
## [60] train-auc:1.000000 test-auc:0.954286
## [61] train-auc:1.000000 test-auc:0.950000
## [62] train-auc:1.000000 test-auc:0.947143
## [63] train-auc:1.000000 test-auc:0.947143
## [64] train-auc:1.000000 test-auc:0.944286
## [65] train-auc:1.000000 test-auc:0.942857
## [66] train-auc:1.000000 test-auc:0.934286
## [67] train-auc:1.000000 test-auc:0.934286
## [68] train-auc:1.000000 test-auc:0.930000
## [69] train-auc:1.000000 test-auc:0.922857
## [70] train-auc:1.000000 test-auc:0.921429
## [71] train-auc:1.000000 test-auc:0.921429
## [72] train-auc:1.000000 test-auc:0.924286
## [73] train-auc:1.000000 test-auc:0.922857
## [74] train-auc:1.000000 test-auc:0.921429
## [75] train-auc:1.000000 test-auc:0.921429
## [76] train-auc:1.000000 test-auc:0.921429
## [77] train-auc:1.000000 test-auc:0.921429
## [78] train-auc:1.000000 test-auc:0.921429
## [79] train-auc:1.000000 test-auc:0.921429
## [80] train-auc:1.000000 test-auc:0.921429
## [81] train-auc:1.000000 test-auc:0.921429
## [82] train-auc:1.000000 test-auc:0.922857
## [83] train-auc:1.000000 test-auc:0.922857
## [84] train-auc:1.000000 test-auc:0.922857
## [85] train-auc:1.000000 test-auc:0.922857
## [86] train-auc:1.000000 test-auc:0.922857
## [87] train-auc:1.000000 test-auc:0.922857
## [88] train-auc:1.000000 test-auc:0.922857
## [89] train-auc:1.000000 test-auc:0.922857
## [90] train-auc:1.000000 test-auc:0.922857
## [91] train-auc:1.000000 test-auc:0.921429
## [92] train-auc:1.000000 test-auc:0.921429
## [93] train-auc:1.000000 test-auc:0.921429
## [94] train-auc:1.000000 test-auc:0.921429
## [95] train-auc:1.000000 test-auc:0.921429
## [96] train-auc:1.000000 test-auc:0.921429
## [97] train-auc:1.000000 test-auc:0.921429
## [98] train-auc:1.000000 test-auc:0.921429
## [99] train-auc:1.000000 test-auc:0.921429
## [100] train-auc:1.000000 test-auc:0.921429
# Predict fraud probabilities for the test set and threshold them at 0.5.
preds <- predict(xgb_model, dtest)
pred_labels <- as.numeric(preds > 0.5)

# Build the confusion matrix with fraud ("1") as the positive class, so that
# sensitivity and positive predictive value describe fraud detection. Without
# `positive`, caret uses the first factor level ("0"), which reports those
# statistics for the non-fraud class instead.
# Both factors get the same explicit levels so they stay aligned even when
# the model predicts only one class.
class_levels <- c(0, 1)
conf_matrix <- confusionMatrix(
  data = factor(pred_labels, levels = class_levels),
  reference = factor(test_y, levels = class_levels),
  positive = "1"
)
print(conf_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 66 1
## 1 4 9
##
## Accuracy : 0.9375
## 95% CI : (0.8601, 0.9794)
## No Information Rate : 0.875
## P-Value [Acc > NIR] : 0.05519
##
## Kappa : 0.7468
##
## Mcnemar's Test P-Value : 0.37109
##
## Sensitivity : 0.9000
## Specificity : 0.9429
## Pos Pred Value : 0.6923
## Neg Pred Value : 0.9851
## Prevalence : 0.1250
## Detection Rate : 0.1125
## Detection Prevalence : 0.1625
## Balanced Accuracy : 0.9214
##
## 'Positive' Class : 1
##
# Compute the per-feature importance of the fitted model and plot it.
importance <- xgb.importance(
  model = xgb_model,
  feature_names = colnames(train_x)
)
xgb.plot.importance(importance_matrix = importance)
# Install pROC / ggplot2 only when they are missing, then attach them.
# requireNamespace() checks availability without attaching anything, and
# library() stops with an error if a package still cannot be loaded, whereas
# require() only returns FALSE and lets the script fail later.
for (pkg in c("pROC", "ggplot2")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg, dependencies = TRUE)
  }
}
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(ggplot2)
# Build the ROC curve from the true labels and the predicted probabilities.
# levels and direction are stated explicitly (0 = control, 1 = case, cases are
# expected to score higher) instead of being auto-detected by pROC, so the
# curve cannot be silently flipped if the scores turn out to be inverted.
roc_curve <- roc(
  response = test_y,
  predictor = preds,
  levels = c(0, 1),
  direction = "<"
)

# Area under the ROC curve.
auc_value <- auc(roc_curve)
print(paste("AUC:", auc_value))
## [1] "AUC: 0.921428571428571"

# Plot the ROC curve; legacy.axes = TRUE puts 1 - specificity on the x axis.
ggroc(roc_curve, legacy.axes = TRUE) +
  ggtitle(paste("Curva ROC - AUC:", round(auc_value, 4))) +
  xlab("Tasa de Falsos Positivos (FPR)") +
  ylab("Tasa de Verdaderos Positivos (TPR)") +
  theme_minimal()
Note that the R code that generated the plot is shown above; adding the
echo = FALSE parameter to the code chunk would prevent that code from
being printed.