# функція - отримати roc auc
get_auc <- function(pred, real, model_name, train_status){
pred_value <- ROCR::prediction(
predictions = pred, labels = real
)
auc_value <- ROCR::performance(
prediction.obj = pred_value,
measure = "auc"
)
auc_value_df = data_frame(
model_name = model_name,
train_status = train_status,
auc = auc_value@y.values[[1]]
)
return(auc_value_df)
}
# функція - отримати roc криву
get_roc <- function(pred, real, model_name, train_status){
pred_value <- ROCR::prediction(
predictions = pred, labels = real
)
roc_value <- ROCR::performance(
prediction.obj = pred_value,
measure = "tpr",
x.measure = "fpr"
)
roc_value_df <- data_frame(
model_name = model_name,
train_status = train_status,
fpr = roc_value@x.values[[1]],
tpr = roc_value@y.values[[1]]
)
return(roc_value_df)
}
data("GermanCredit")
df_all <- GermanCredit
levels(df_all$Class)
## [1] "Bad" "Good"
df_all <- df_all %>%
dplyr::mutate(Class_2 = if_else(Class == "Bad", 1, 0))
# змінні, які будемо кодувати середнім
var_cat_name <- df_all %>%
dplyr::select(
NumberExistingCredits:ForeignWorker,
CheckingAccountStatus.lt.0:Job.Management.SelfEmp.HighlyQualified
) %>%
colnames()
# поділимо на трейн і тест
set.seed(123)
train_index <- caret::createDataPartition(
y = df_all$Class,
p = 0.7,
times = 1,
list = FALSE
)
df_train <- dplyr::slice(df_all, train_index)
df_test <- dplyr::slice(df_all, -train_index)
К-сть фолдів для крос-валідації буде 3. Додамо номер фолда в табличку для трейна.
folds_count <- 3
folds_count_seq <- seq_len(folds_count)
set.seed(9)
cv_folds_number <- caret::createFolds(
y = df_train$Class,
k = folds_count,
list = FALSE
)
df_train$cv_folds_number <- cv_folds_number
Функція для кодування середнім категорійної змінної та їх комбінацій на крос-валідації.
create_mean_target_cv <- function(var_name) {
purrr::map_df(
.x = folds_count_seq,
.f = ~{
new_column_name <- paste0(var_name, collapse = "_")
new_column_name <- paste0(new_column_name, "_mt")
df_train %>%
dplyr::filter(cv_folds_number %in% folds_count_seq[-.x]) %>%
dplyr::group_by_at(vars(var_name)) %>%
dplyr::summarise(mean_target = mean(Class_2)) %>%
dplyr::mutate(cv_folds_number = .x) %>%
dplyr::ungroup() %>%
dplyr::rename_at(vars("mean_target"), funs(c(new_column_name)))
}
) %>%
return()
}
# без комбінацій
var_name_list <- var_cat_name
# попарні комбінації
var_name_list_vomb_2 <- combn(var_name_list, 2, simplify = F)
Кодування середнім - по одній змінній:
result_mean_1 <- purrr::map(
.x = var_name_list,
.f = ~create_mean_target_cv(var_name = .x)
)
result_mean_1[[length(result_mean_1) + 1]] <- df_train
df_train_1 <- purrr::reduce_right(
.x = result_mean_1,
.f = dplyr::left_join
)
Кодування середнім - попарні змінні:
result_mean_2 <- purrr::map(
.x = var_name_list_vomb_2,
.f = ~create_mean_target_cv(var_name = .x)
)
result_mean_2[[length(result_mean_2) + 1]] <- df_train_1
df_train_2 <- purrr::reduce_right(
.x = result_mean_2,
.f = dplyr::left_join
)
# прибрали лишні змінні
df_train_2 <- df_train_2[, -which(colnames(df_train_2) %in% var_name_list)]
df_train_2 <- df_train_2[-c(7, 8)]
rm(list = c("df_train_1", "result_mean_1", "result_mean_2"))
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 2186741 58.4 3886542 103.8 3886542 103.8
## Vcells 3409035 26.1 8415273 64.3 8415270 64.3
Для тестової вибірки робимо середнє не на крос-валідації, а по всьому трейні:
result_mean_all_1 <-purrr::map(
.x = var_name_list,
.f = ~{
new_column_name <- paste0(.x, collapse = "_")
new_column_name <- paste0(new_column_name, "_mt")
df_train %>%
dplyr::group_by_at(vars(.x)) %>%
dplyr::summarise(mean_target = mean(Class_2)) %>%
dplyr::ungroup() %>%
dplyr::rename_at(vars("mean_target"), funs(c(new_column_name)))
}
)
result_mean_all_1[[length(result_mean_all_1) + 1]] <- df_test
df_test_1 <- purrr::reduce_right(
.x = result_mean_all_1,
.f = dplyr::left_join
)
# glimpse(df_test_1)
result_mean_all_2 <-purrr::map(
.x = var_name_list_vomb_2,
.f = ~{
new_column_name <- paste0(.x, collapse = "_")
new_column_name <- paste0(new_column_name, "_mt")
df_train %>%
dplyr::group_by_at(vars(.x)) %>%
dplyr::summarise(mean_target = mean(Class_2)) %>%
dplyr::ungroup() %>%
dplyr::rename_at(vars("mean_target"), funs(c(new_column_name)))
}
)
result_mean_all_2[[length(result_mean_all_2) + 1]] <- df_test_1
df_test_2 <- purrr::reduce_right(
.x = result_mean_all_2,
.f = dplyr::left_join
)
df_test_2 <- df_test_2[, -which(colnames(df_test_2) %in% var_name_list)]
rm(list = c("df_test_1", "result_mean_all_1", "result_mean_all_2"))
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 2189118 58.5 3886542 103.8 3886542 103.8
## Vcells 3893266 29.8 8415273 64.3 8415270 64.3
Навчаємо моделі спочатку на початкових даних:
df_train <- df_train[, -which(colnames(df_train) %in% c("Class_2", "cv_folds_number"))]
df_train_2 <- randomForest::na.roughfix(df_train_2)
df_test_2 <- randomForest::na.roughfix(df_test_2)
set.seed(234)
train_control <- caret::trainControl(
method = "repeatedcv",
number = 3,
repeats = 3,
verboseIter = F,
returnResamp = "final",
classProbs = TRUE,
summaryFunction = twoClassSummary,
search = "random"
)
set.seed(234)
train_model_glmnet <- caret::train(
Class ~ .,
data = df_train,
method = "glmnet",
trControl = train_control,
preProcess = c("center", "scale", "nzv"),
metric = "ROC",
maximize = TRUE,
tuneLength = 20
)
train_model_glmnet
## glmnet
##
## 700 samples
## 61 predictor
## 2 classes: 'Bad', 'Good'
##
## Pre-processing: centered (46), scaled (46), remove (15)
## Resampling: Cross-Validated (3 fold, repeated 3 times)
## Summary of sample sizes: 467, 467, 466, 467, 467, 466, ...
## Resampling results across tuning parameters:
##
## alpha lambda ROC Sens Spec
## 0.001198341 0.119517327 0.7955079 0.3793651 0.9183791
## 0.020037114 2.374420617 0.7696146 0.0000000 1.0000000
## 0.066910093 0.080613124 0.7964100 0.3857143 0.9143058
## 0.138326844 0.093576831 0.7953103 0.3301587 0.9292691
## 0.284230120 2.775250918 0.5000000 0.0000000 1.0000000
## 0.313152501 0.539893558 0.5160434 0.0000000 1.0000000
## 0.441117854 0.481537354 0.5000000 0.0000000 1.0000000
## 0.523069807 0.183973238 0.7162168 0.0000000 1.0000000
## 0.547701653 0.009486353 0.7930859 0.4555556 0.8837058
## 0.555724930 0.258810173 0.6342508 0.0000000 1.0000000
## 0.582847855 0.004776498 0.7914843 0.4698413 0.8728199
## 0.582989913 0.559718404 0.5000000 0.0000000 1.0000000
## 0.644795124 0.028604757 0.7922184 0.3730159 0.9122733
## 0.717642189 0.005494725 0.7920957 0.4666667 0.8789383
## 0.740014466 0.003893124 0.7912412 0.4714286 0.8707791
## 0.776085387 0.254634475 0.5000000 0.0000000 1.0000000
## 0.781712425 0.180283563 0.6578948 0.0000000 1.0000000
## 0.871777777 0.023870227 0.7903697 0.3634921 0.9115916
## 0.927736510 1.873905579 0.5000000 0.0000000 1.0000000
## 0.929385959 0.520044735 0.5000000 0.0000000 1.0000000
##
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were alpha = 0.06691009 and lambda
## = 0.08061312.
set.seed(234)
train_model_glmnet_new <- caret::train(
Class ~ .,
data = df_train_2,
method = "glmnet",
trControl = train_control,
preProcess = c("center", "scale"),
metric = "ROC",
maximize = TRUE,
tuneLength = 20
)
train_model_glmnet_new
## glmnet
##
## 700 samples
## 1601 predictors
## 2 classes: 'Bad', 'Good'
##
## Pre-processing: centered (1601), scaled (1601)
## Resampling: Cross-Validated (3 fold, repeated 3 times)
## Summary of sample sizes: 467, 467, 466, 467, 467, 466, ...
## Resampling results across tuning parameters:
##
## alpha lambda ROC Sens Spec
## 0.001198341 0.119517327 0.7901957 0.40952381 0.8986691
## 0.020037114 2.374420617 0.7703800 0.06984127 0.9857308
## 0.066910093 0.080613124 0.8187096 0.51269841 0.8734808
## 0.138326844 0.093576831 0.8114550 0.46507937 0.8979666
## 0.284230120 2.775250918 0.5000000 0.00000000 1.0000000
## 0.313152501 0.539893558 0.6140296 0.00000000 1.0000000
## 0.441117854 0.481537354 0.5000000 0.00000000 1.0000000
## 0.523069807 0.183973238 0.7424953 0.00000000 0.9993183
## 0.547701653 0.009486353 0.8144099 0.55555556 0.8571625
## 0.555724930 0.258810173 0.7301364 0.00000000 1.0000000
## 0.582847855 0.004776498 0.8100032 0.56984127 0.8510233
## 0.582989913 0.559718404 0.5000000 0.00000000 1.0000000
## 0.644795124 0.028604757 0.8027212 0.44761905 0.8979708
## 0.717642189 0.005494725 0.8096204 0.56825397 0.8510275
## 0.740014466 0.003893124 0.8074760 0.57301587 0.8455866
## 0.776085387 0.254634475 0.5000000 0.00000000 1.0000000
## 0.781712425 0.180283563 0.7271701 0.00000000 1.0000000
## 0.871777777 0.023870227 0.7976884 0.44761905 0.8993300
## 0.927736510 1.873905579 0.5000000 0.00000000 1.0000000
## 0.929385959 0.520044735 0.5000000 0.00000000 1.0000000
##
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were alpha = 0.06691009 and lambda
## = 0.08061312.
Результати:
auc_value <- dplyr::bind_rows(
get_auc(
pred = predict(train_model_glmnet, df_test, type = "prob")[, "Good"],
real = df_test_2$Class,
model_name = "glmnet",
train_status = "test"
),
get_auc(
pred = predict(train_model_glmnet_new, df_test_2, type = "prob")[, "Good"],
real = df_test_2$Class,
model_name = "glmnet_new",
train_status = "test"
)
)
auc_value
## # A tibble: 2 x 3
## model_name train_status auc
## <chr> <chr> <dbl>
## 1 glmnet test 0.7534392
## 2 glmnet_new test 0.7659259
dplyr::bind_rows(
get_roc(
pred = predict(train_model_glmnet, df_test, type = "prob")[, "Good"],
real = df_test_2$Class,
model_name = "glmnet",
train_status = "test"
),
get_roc(
pred = predict(train_model_glmnet_new, df_test_2, type = "prob")[, "Good"],
real = df_test_2$Class,
model_name = "glmnet_new",
train_status = "test"
)
) %>%
ggplot(aes(x = fpr, y = tpr, col = model_name)) +
geom_line(size = 1.5) +
scale_color_manual(values = c("#8B0000", "#00468B")) +
theme(legend.position = "bottom")