library(xgboost)
library(caret)
library(dplyr)
# library(purrr)
library(tibble)
library(data.table)
Let's simulate the data:
set.seed(2)
tr_data <- caret::twoClassSim(1000) %>%
  dplyr::mutate(Class = if_else(Class == "Class1", 1, 0))

dtrain <- xgboost::xgb.DMatrix(
  data = as.matrix(tr_data[, -ncol(tr_data)]),
  label = tr_data$Class
)
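As an optional sanity check before tuning (nothing beyond base R and xgboost's dim method for DMatrix assumed), the matrix should have 1000 rows and the 0/1 label should be roughly balanced:
dim(dtrain)           # rows and predictor columns of the DMatrix
table(tr_data$Class)  # rough class balance of the 0/1 label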
Let's build a tuning grid for cross-validation; besides the hyperparameters, each row gets a model_id and its own random seed_number so every CV run is reproducible:
set.seed(2212)
xgb_expand_grid <- expand.grid(
  # nrounds = 200,
  eta = 0.1,
  max_depth = c(3, 5),
  gamma = 0,
  colsample_bytree = c(0.75, 1),
  min_child_weight = 1,
  subsample = c(0.75, 1)
) %>%
  tibble::as_tibble() %>%
  dplyr::mutate(model_id = 1:nrow(.)) %>%
  dplyr::mutate(seed_number = sample.int(n = 1000000, size = nrow(.), replace = FALSE))
glimpse(xgb_expand_grid)
## Observations: 8
## Variables: 8
## $ eta <dbl> 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1
## $ max_depth <dbl> 3, 5, 3, 5, 3, 5, 3, 5
## $ gamma <dbl> 0, 0, 0, 0, 0, 0, 0, 0
## $ colsample_bytree <dbl> 0.75, 0.75, 1.00, 1.00, 0.75, 0.75, 1.00, 1.00
## $ min_child_weight <dbl> 1, 1, 1, 1, 1, 1, 1, 1
## $ subsample <dbl> 0.75, 0.75, 0.75, 0.75, 1.00, 1.00, 1.00, 1.00
## $ model_id <int> 1, 2, 3, 4, 5, 6, 7, 8
## $ seed_number <int> 141994, 941632, 18196, 183806, 849788, 97541,...
A function that runs cross-validation for a single set of parameters:
xgb_cv_grid <- function(data,
                        list_of_params,
                        seed,
                        n_rounds = 50, # number of boosting iterations
                        n_fold = 3,    # number of cross-validation folds
                        metric_name = c("auc", "logloss")) {
  set.seed(seed)
  xgb_model_cv <- xgboost::xgb.cv(
    params = list_of_params,
    data = data,
    nrounds = n_rounds,
    nfold = n_fold,
    metrics = metric_name,
    stratified = TRUE,
    verbose = FALSE
    # print_every_n = 20
  )
  return(xgb_model_cv)
}
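Before mapping over the whole grid, the function can be smoke-tested on a single hand-written parameter list. The values below simply mirror the first grid row; the seed and n_rounds here are arbitrary choices for a quick check:
single_cv <- xgb_cv_grid(
  data = dtrain,
  list_of_params = list(
    eta = 0.1, max_depth = 3, gamma = 0,
    colsample_bytree = 0.75, min_child_weight = 1, subsample = 0.75
  ),
  seed = 42,     # arbitrary seed for this one-off check
  n_rounds = 10  # keep the check fast
)
tail(single_cv$evaluation_log, 3)  # last CV rounds: train/test auc and logloss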
Run cross-validation over the whole grid:
start <- Sys.time()
xgb_expand_grid_result <- xgb_expand_grid %>%
  dplyr::group_by(model_id, seed_number) %>%
  tidyr::nest(.key = params_df) %>%
  dplyr::mutate(params_list = purrr::map(params_df, purrr::flatten)) %>%
  dplyr::mutate(
    xgb_cv_model = purrr::map2(
      .x = params_list,
      .y = seed_number,
      .f = ~xgb_cv_grid(
        data = dtrain,
        list_of_params = .x,
        seed = .y
      )
    )
  )
finish <- Sys.time()
time_work <- finish - start
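If the nest/flatten step looks opaque: each element of params_list is a plain named list, which is exactly the shape xgb.cv expects in its params argument. A quick way to confirm, assuming the pipeline above has run:
str(xgb_expand_grid_result$params_list[[1]])  # named list: eta, max_depth, gamma, ...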
Select the best iteration for each model:
xgb_expand_grid_result_best <- xgb_expand_grid_result %>%
  dplyr::mutate(
    xgb_cv_model_best_result = purrr::map(
      .x = xgb_cv_model,
      .f = ~ {
        .x$evaluation_log %>%
          dplyr::filter_at(
            .vars = vars("test_auc_mean"),
            .vars_predicate = any_vars(. == max(.))
          ) %>%
          dplyr::select_at(.vars = vars("iter", "train_auc_mean", "test_auc_mean"))
      }
    )
  ) %>%
  dplyr::select(model_id, seed_number, params_df, xgb_cv_model_best_result) %>%
  tidyr::unnest()
glimpse(xgb_expand_grid_result_best)
## Observations: 8
## Variables: 11
## $ model_id <int> 1, 2, 3, 4, 5, 6, 7, 8
## $ seed_number <int> 141994, 941632, 18196, 183806, 849788, 97541,...
## $ eta <dbl> 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1
## $ max_depth <dbl> 3, 5, 3, 5, 3, 5, 3, 5
## $ gamma <dbl> 0, 0, 0, 0, 0, 0, 0, 0
## $ colsample_bytree <dbl> 0.75, 0.75, 1.00, 1.00, 0.75, 0.75, 1.00, 1.00
## $ min_child_weight <dbl> 1, 1, 1, 1, 1, 1, 1, 1
## $ subsample <dbl> 0.75, 0.75, 0.75, 0.75, 1.00, 1.00, 1.00, 1.00
## $ iter <dbl> 50, 50, 50, 50, 50, 49, 49, 50
## $ train_auc_mean <dbl> 0.9955613, 1.0000000, 0.9965567, 1.0000000, 0...
## $ test_auc_mean <dbl> 0.9301720, 0.9261570, 0.9308273, 0.9235660, 0...
xgb_expand_grid_result_best %>%
  dplyr::select(model_id, test_auc_mean) %>%
  dplyr::arrange(desc(test_auc_mean))
## # A tibble: 8 x 2
## model_id test_auc_mean
## <int> <dbl>
## 1 3 0.9308273
## 2 1 0.9301720
## 3 2 0.9261570
## 4 4 0.9235660
## 5 5 0.9205733
## 6 7 0.9193250
## 7 8 0.9146050
## 8 6 0.9139230
xgb_expand_grid_result_best %>%
  dplyr::filter(model_id == 8)
## # A tibble: 1 x 11
## model_id seed_number eta max_depth gamma colsample_bytree
## <int> <int> <dbl> <dbl> <dbl> <dbl>
## 1 8 764758 0.1 5 0 1
## # ... with 5 more variables: min_child_weight <dbl>, subsample <dbl>,
## # iter <dbl>, train_auc_mean <dbl>, test_auc_mean <dbl>
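The grid search stops at CV scores; to actually use the winner (model_id 3 above), one would refit on the full training matrix. A minimal sketch, with the objective set explicitly to binary:logistic (an assumption on my part; the code above left it at xgboost's default):
best_params <- list(
  eta = 0.1, max_depth = 3, gamma = 0,
  colsample_bytree = 1, min_child_weight = 1, subsample = 0.75,
  objective = "binary:logistic"  # assumption: made explicit here
)
# nrounds = 50 mirrors the best CV iteration found for model_id 3
final_model <- xgboost::xgb.train(
  params = best_params,
  data = dtrain,
  nrounds = 50
)
head(predict(final_model, dtrain))  # predicted probabilities for Class == 1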