# Load some packages:
library(tidyverse)
library(caret)
library(pROC)
# Load data:
data("GermanCredit")
df <- GermanCredit
# Set predictors and response:
response <- "Class"
predictors <- names(df %>% select(-response))
#===============================================================
# Scenario 1: ROC/AUC as a criterion for feature engineering
#===============================================================
# Split our data:
set.seed(1)
id <- createDataPartition(y = df %>% pull(response), p = 0.7, list = FALSE)
train <- df[id, ]
test <- df[-id, ]
# Function extracts ROC/AUC for a predictor selected:
returnROC_AUC <- function(predictor_selected) {
f <- as.formula(paste0(response, " ~ ", predictor_selected))
logit <- glm(f, family = "binomial", data = train)
prob_pred <- predict(logit, test, type = "response")
my_auc <- roc(test %>% pull(response), prob_pred)$auc %>% as.numeric()
return(tibble(predictor = predictor_selected, auc = my_auc))
}
# ROC/AUC by a given predictor:
do.call("bind_rows", lapply(predictors, returnROC_AUC)) %>%
arrange(-auc) -> df_auc
# Function extracts ROC/AUC on test data:
returnROC_AUCTestData <- function(predictor_selected) {
f <- as.formula(paste0(response, " ~ ", paste(predictor_selected, collapse = " + ")))
logit <- glm(f, family = "binomial", data = train)
prob_pred <- predict(logit, test, type = "response")
my_auc <- roc(test$Class, prob_pred)$auc %>% as.numeric()
return(my_auc)
}
# Calculate our AUC:
AUC_All <- returnROC_AUCTestData(predictor_selected = predictors)
AUC_Variables55 <- returnROC_AUCTestData(predictor_selected = df_auc %>% filter(auc > 0.55) %>% pull(predictor))
#==============================================================================
# Scenario 2: Information Value (IV) as a criterion for feature engineering
#==============================================================================
# Before conducting IV procedure we must convert response variable to numeric:
train %>%
mutate(Class = case_when(Class == "Bad" ~ 1, TRUE ~ 0)) -> train_converted
test %>%
mutate(Class = case_when(Class == "Bad" ~ 1, TRUE ~ 0)) -> test_converted
# Load scorecard package:
library(scorecard)
# Calculate Information Values (IV):
info_values <- iv(train_converted, y = "Class", positive = "Class|1")
# Variables with IV >= 0.1:
info_values %>%
filter(info_value >= 0.1) %>%
arrange(info_value) %>%
mutate(info_value = round(info_value, 3), variable = factor(variable, levels = variable)) -> df_iv_over1
# Plot:
df_iv_over1 %>%
ggplot(aes(variable, info_value)) +
geom_col(fill = "#377eb8") +
coord_flip() +
geom_text(aes(label = info_value), hjust = 1.1, size = 5, color = "white") +
labs(title = "Figure 1: Variables with Information Value (IV) >= 0.1",
x = NULL, y = "Information Value (IV)") +
scale_y_continuous(expand = c(0, 0), limits = c(0, 0.5)) +
theme_minimal() +
theme(panel.grid.major.y = element_blank()) +
theme(plot.margin = unit(c(1, 1, 1, 1), "cm"))

## [INFO] creating woe binning ...
## [INFO] converting into woe values ...
## [INFO] converting into woe values ...
## Area under the curve: 0.6516
## [INFO] creating woe binning ...
## [INFO] converting into woe values ...
## [INFO] converting into woe values ...
# Train Logistic Regression with WoE-transformed predictors:
logitWoE_All <- glm(Class ~ ., family = "binomial", data = df_train_woeAll)
# Calculate ROC/AUC:
prob_predWoE_All <- predict(logitWoE_All, df_test_woeAll, type = "response")
AUC_WoETran <- roc(df_test_woeAll$Class, prob_predWoE_All)$auc %>% as.numeric()
# Compare AUC results:
df_results <- tibble(Method = c("All Variables", "AUC 0.55", "WoE Tran"),
AUC = c(AUC_All, AUC_Variables55, AUC_WoETran))
knitr::kable(df_results %>% mutate(AUC = round(AUC, 3)) %>% arrange(-AUC))
| AUC 0.55 |
0.799 |
| WoE Tran |
0.777 |
| All Variables |
0.750 |
LS0tCnRpdGxlOiAiQVVDIG9yIElWIENyaXRlcmlvbiAtIFdoaWNoIE1ldGhvZCBpcyBCZXR0ZXIgZm9yIEZlYXR1cmUgRW5naW5lZXJpbmc/IgpzdWJ0aXRsZTogIlIgZm9yIEZ1biIKYXV0aG9yOiAiTmd1eWVuIENoaSBEdW5nIgpvdXRwdXQ6CiAgaHRtbF9kb2N1bWVudDoKICAgIGNvZGVfZG93bmxvYWQ6IHllcwogICAgIyBjb2RlX2ZvbGRpbmc6IGhpZGUKICAgIGhpZ2hsaWdodDogemVuYnVybgogICAgdGhlbWU6IGZsYXRseQogICAgdG9jOiB5ZXMKICAgIHRvY19mbG9hdDogeWVzCiAgd29yZF9kb2N1bWVudDoKICAgIHRvYzogeWVzCi0tLQoKYGBge3Igc2V0dXAsaW5jbHVkZT1GQUxTRX0Ka25pdHI6Om9wdHNfY2h1bmskc2V0KGVjaG8gPSBUUlVFLCB3YXJuaW5nID0gRkFMU0UsIG1lc3NhZ2UgPSBGQUxTRSwgZmlnLnJldGluYT0yKQpgYGAKCgpgYGB7ciwgd2FybmluZyA9IEZBTFNFLCBtZXNzYWdlID0gRkFMU0V9CiMgTG9hZCBzb21lIHBhY2thZ2VzOiAKbGlicmFyeSh0aWR5dmVyc2UpCmxpYnJhcnkoY2FyZXQpCmxpYnJhcnkocFJPQykKCiMgTG9hZCBkYXRhOiAKZGF0YSgiR2VybWFuQ3JlZGl0IikKZGYgPC0gR2VybWFuQ3JlZGl0CgojIFNldCBwcmVkaWN0b3JzIGFuZCByZXNwb25zZTogCnJlc3BvbnNlIDwtICJDbGFzcyIKcHJlZGljdG9ycyA8LSBuYW1lcyhkZiAlPiUgc2VsZWN0KC1yZXNwb25zZSkpCgojPT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09CiMgIFNjZW5hcmlvIDE6IFJPQy9BVUMgYXMgYSBjcml0ZXJpb24gZm9yIGZlYXR1cmUgZW5naW5lZXJpbmcKIz09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PQoKIyBTcGxpdCBvdXIgZGF0YTogCgpzZXQuc2VlZCgxKQppZCA8LSBjcmVhdGVEYXRhUGFydGl0aW9uKHkgPSBkZiAlPiUgcHVsbChyZXNwb25zZSksIHAgPSAwLjcsIGxpc3QgPSBGQUxTRSkKdHJhaW4gPC0gZGZbaWQsIF0KdGVzdCA8LSBkZlstaWQsIF0KCiMgRnVuY3Rpb24gZXh0cmFjdHMgUk9DL0FVQyBmb3IgYSBwcmVkaWN0b3Igc2VsZWN0ZWQ6IAoKcmV0dXJuUk9DX0FVQyA8LSBmdW5jdGlvbihwcmVkaWN0b3Jfc2VsZWN0ZWQpIHsKICAKICBmIDwtIGFzLmZvcm11bGEocGFzdGUwKHJlc3BvbnNlLCAiIH4gIiwgcHJlZGljdG9yX3NlbGVjdGVkKSkKICBsb2dpdCA8LSBnbG0oZiwgZmFtaWx5ID0gImJpbm9taWFsIiwgZGF0YSA9IHRyYWluKQogIHByb2JfcHJlZCA8LSBwcmVkaWN0KGxvZ2l0LCB0ZXN0LCB0eXBlID0gInJlc3BvbnNlIikKICBteV9hdWMgPC0gcm9jKHRlc3QgJT4lIHB1bGwocmVzcG9uc2UpLCBwcm9iX3ByZWQpJGF1YyAlPiUgYXMubnVtZXJpYygpCiAgcmV0dXJuKHRpYmJsZShwcmVkaWN0b3IgPSBwcmVkaWN0b3Jfc2VsZWN0ZWQsIGF1YyA9IG15X2F1YykpCiAgCn0KCgojIFJPQy9BVUMgYnkgYSBnaXZlbiBwcmVkaWN0b3I6IApkby5jYWxsKCJiaW5kX3Jvd3MiLCBsYXBwbHkocHJlZGljdG9ycywgcmV0dXJuUk9DX0FVQykpICU+JSAKICBhcnJhbmdlKC1hdWMpIC0+IGRmX2F1YwoKIyBGdW5jdGlvbiBleHRyYWN0cyBST0MvQVVDIG9uIHRlc3QgZGF0YTogCgpyZXR1cm5ST0NfQVVDVGVzdERhdGEgPC0gZnVuY3Rpb24ocHJlZGljdG9yX3NlbGVjdGVkKSB7CiAgZiA8LSBhcy5mb3JtdWxhKHBhc3RlMChyZXNwb25zZSwgIiB+ICIsIHBhc3RlKHByZWRpY3Rvcl9zZWxlY3RlZCwgY29sbGFwc2UgPSAiICsgIikpKQogIGxvZ2l0IDwtIGdsbShmLCBmYW1pbHkgPSAiYmlub21pYWwiLCBkYXRhID0gdHJhaW4pCiAgcHJvYl9wcmVkIDwtIHByZWRpY3QobG9naXQsIHRlc3QsIHR5cGUgPSAicmVzcG9uc2UiKQogIG15X2F1YyA8LSByb2ModGVzdCRDbGFzcywgcHJvYl9wcmVkKSRhdWMgJT4lIGFzLm51bWVyaWMoKQogIHJldHVybihteV9hdWMpCiAgCn0KCgojIENhbGN1bGF0ZSBvdXIgQVVDOiAKQVVDX0FsbCA8LSByZXR1cm5ST0NfQVVDVGVzdERhdGEocHJlZGljdG9yX3NlbGVjdGVkID0gcHJlZGljdG9ycykKQVVDX1ZhcmlhYmxlczU1IDwtIHJldHVyblJPQ19BVUNUZXN0RGF0YShwcmVkaWN0b3Jfc2VsZWN0ZWQgPSBkZl9hdWMgJT4lIGZpbHRlcihhdWMgPiAwLjU1KSAlPiUgcHVsbChwcmVkaWN0b3IpKQoKIz09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PQojICBTY2VuYXJpbyAyOiBJbmZvcm1hdGlvbiBWYWx1ZSAoSVYpIGFzIGEgY3JpdGVyaW9uIGZvciBmZWF0dXJlIGVuZ2luZWVyaW5nCiM9PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT0KCiMgQmVmb3JlIGNvbmR1Y3RpbmcgSVYgcHJvY2VkdXJlIHdlIG11c3QgY29udmVydCByZXNwb25zZSB2YXJpYWJsZSB0byBudW1lcmljOiAKCnRyYWluICU+JSAKICBtdXRhdGUoQ2xhc3MgPSBjYXNlX3doZW4oQ2xhc3MgPT0gIkJhZCIgfiAxLCBUUlVFIH4gMCkpIC0+IHRyYWluX2NvbnZlcnRlZAoKdGVzdCAlPiUgCiAgbXV0YXRlKENsYXNzID0gY2FzZV93aGVuKENsYXNzID09ICJCYWQiIH4gMSwgVFJVRSB+IDApKSAtPiB0ZXN0X2NvbnZlcnRlZAoKCiMgTG9hZCBzY29yZWNhcmQgcGFja2FnZTogCmxpYnJhcnkoc2NvcmVjYXJkKQoKIyBDYWxjdWxhdGUgSW5mb3JtYXRpb24gVmFsdWVzIChJVik6IAppbmZvX3ZhbHVlcyA8LSBpdih0cmFpbl9jb252ZXJ0ZWQsIHkgPSAiQ2xhc3MiLCBwb3NpdGl2ZSA9ICJDbGFzc3wxIikKCiMgVmFyaWFibGVzIHdpdGggSVYgPj0gMC4xOiAKaW5mb192YWx1ZXMgJT4lIAogIGZpbHRlcihpbmZvX3ZhbHVlID49IDAuMSkgJT4lIAogIGFycmFuZ2UoaW5mb192YWx1ZSkgJT4lIAogIG11dGF0ZShpbmZvX3ZhbHVlID0gcm91bmQoaW5mb192YWx1ZSwgMyksIHZhcmlhYmxlID0gZmFjdG9yKHZhcmlhYmxlLCBsZXZlbHMgPSB2YXJpYWJsZSkpIC0+IGRmX2l2X292ZXIxCgoKIyBQbG90OiAKZGZfaXZfb3ZlcjEgJT4lIAogIGdncGxvdChhZXModmFyaWFibGUsIGluZm9fdmFsdWUpKSArIAogIGdlb21fY29sKGZpbGwgPSAiIzM3N2ViOCIpICsgCiAgY29vcmRfZmxpcCgpICsgCiAgZ2VvbV90ZXh0KGFlcyhsYWJlbCA9IGluZm9fdmFsdWUpLCBoanVzdCA9IDEuMSwgc2l6ZSA9IDUsIGNvbG9yID0gIndoaXRlIikgKyAKICBsYWJzKHRpdGxlID0gIkZpZ3VyZSAxOiBWYXJpYWJsZXMgd2l0aCBJbmZvcm1hdGlvbiBWYWx1ZSAoSVYpID49IDAuMSIsIAogICAgICAgeCA9IE5VTEwsIHkgPSAiSW5mb3JtYXRpb24gVmFsdWUgKElWKSIpICsgCiAgc2NhbGVfeV9jb250aW51b3VzKGV4cGFuZCA9IGMoMCwgMCksIGxpbWl0cyA9IGMoMCwgMC41KSkgKyAKICB0aGVtZV9taW5pbWFsKCkgKyAKICB0aGVtZShwYW5lbC5ncmlkLm1ham9yLnkgPSBlbGVtZW50X2JsYW5rKCkpICsgCiAgdGhlbWUocGxvdC5tYXJnaW4gPSB1bml0KGMoMSwgMSwgMSwgMSksICJjbSIpKSAKCiMgQmlubmluZyB2YXJpYWJsZXMgd2l0aCBJViA+PSAwLjEgIDogCgpiaW5zIDwtIHdvZWJpbih0cmFpbl9jb252ZXJ0ZWQgJT4lIHNlbGVjdChkZl9pdl9vdmVyMSR2YXJpYWJsZSwgcmVzcG9uc2UpLCB5ID0gIkNsYXNzIiwgIHBvc2l0aXZlID0gIkNsYXNzfDEiKQoKIyBDb25kdWN0IFdvRSB0cmFuc2Zvcm1hdGlvbjogCgpkZl90cmFpbl93b2UgPC0gd29lYmluX3BseSh0cmFpbl9jb252ZXJ0ZWQgJT4lIHNlbGVjdChkZl9pdl9vdmVyMSR2YXJpYWJsZSwgcmVzcG9uc2UpLCBiaW5zID0gYmlucykgJT4lIAogIG11dGF0ZShDbGFzcyA9IGNhc2Vfd2hlbihDbGFzcyA9PSAxIH4gIkJhZCIsIFRSVUUgfiAiR29vZCIpICU+JSBhcy5mYWN0b3IoKSkKCmRmX3Rlc3Rfd29lIDwtIHdvZWJpbl9wbHkodGVzdF9jb252ZXJ0ZWQgJT4lIHNlbGVjdChkZl9pdl9vdmVyMSR2YXJpYWJsZSwgcmVzcG9uc2UpLCBiaW5zID0gYmlucykgJT4lIAogIG11dGF0ZShDbGFzcyA9IGNhc2Vfd2hlbihDbGFzcyA9PSAxIH4gIkJhZCIsIFRSVUUgfiAiR29vZCIpICU+JSBhcy5mYWN0b3IoKSkKCgojIFRyYWluIExvZ2lzdGljIFJlZ3Jlc3Npb24gd2l0aCBXb0UtdHJhbnNmb3JlZCBwcmVkaWN0b3JzOiAKCmxvZ2l0V29FIDwtIGdsbShDbGFzcyB+IC4sIGZhbWlseSA9ICJiaW5vbWlhbCIsIGRhdGEgPSBkZl90cmFpbl93b2UpCgojIENhbGN1bGF0ZSBST0MvQVVDOiAKCnByb2JfcHJlZFdvRSA8LSBwcmVkaWN0KGxvZ2l0V29FLCBkZl90ZXN0X3dvZSwgdHlwZSA9ICJyZXNwb25zZSIpCnJvYyhkZl90ZXN0X3dvZSRDbGFzcywgcHJvYl9wcmVkV29FKSRhdWMgCgojPT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT0KIyAgIFNjZW5hcmlvIDM6IFVzZWwgYWxsIFdvRS10cmFuc2Zvcm1lZCBwcmVkaWN0b3JzCiM9PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PQoKCiMgQmlubmluZyBhbGwgdmFyaWFibGVzOiAKCmJpbnNfYWxsIDwtIHdvZWJpbih0cmFpbl9jb252ZXJ0ZWQsIHkgPSAiQ2xhc3MiLCAgcG9zaXRpdmUgPSAiQ2xhc3N8MSIpCgojIENvbmR1Y3QgV29FIHRyYW5zZm9ybWF0aW9uOiAKCmRmX3RyYWluX3dvZUFsbCA8LSB3b2ViaW5fcGx5KHRyYWluX2NvbnZlcnRlZCwgYmlucyA9IGJpbnNfYWxsKSAlPiUgCiAgbXV0YXRlKENsYXNzID0gY2FzZV93aGVuKENsYXNzID09IDEgfiAiQmFkIiwgVFJVRSB+ICJHb29kIikgJT4lIGFzLmZhY3RvcigpKQoKZGZfdGVzdF93b2VBbGwgPC0gd29lYmluX3BseSh0ZXN0X2NvbnZlcnRlZCwgYmlucyA9IGJpbnNfYWxsKSAlPiUgCiAgbXV0YXRlKENsYXNzID0gY2FzZV93aGVuKENsYXNzID09IDEgfiAiQmFkIiwgVFJVRSB+ICJHb29kIikgJT4lIGFzLmZhY3RvcigpKQoKCiMgVHJhaW4gTG9naXN0aWMgUmVncmVzc2lvbiB3aXRoIFdvRS10cmFuc2Zvcm1lZCBwcmVkaWN0b3JzOiAKCmxvZ2l0V29FX0FsbCA8LSBnbG0oQ2xhc3MgfiAuLCBmYW1pbHkgPSAiYmlub21pYWwiLCBkYXRhID0gZGZfdHJhaW5fd29lQWxsKQoKIyBDYWxjdWxhdGUgUk9DL0FVQzogCgpwcm9iX3ByZWRXb0VfQWxsIDwtIHByZWRpY3QobG9naXRXb0VfQWxsLCBkZl90ZXN0X3dvZUFsbCwgdHlwZSA9ICJyZXNwb25zZSIpCkFVQ19Xb0VUcmFuIDwtIHJvYyhkZl90ZXN0X3dvZUFsbCRDbGFzcywgcHJvYl9wcmVkV29FX0FsbCkkYXVjICU+JSBhcy5udW1lcmljKCkKCiMgQ29tcGFyZSBBVUMgcmVzdWx0czoKZGZfcmVzdWx0cyA8LSB0aWJibGUoTWV0aG9kID0gYygiQWxsIFZhcmlhYmxlcyIsICJBVUMgMC41NSIsICJXb0UgVHJhbiIpLCAKICAgICAgICAgICAgICAgICAgICAgQVVDID0gYyhBVUNfQWxsLCBBVUNfVmFyaWFibGVzNTUsIEFVQ19Xb0VUcmFuKSkKCmtuaXRyOjprYWJsZShkZl9yZXN1bHRzICU+JSBtdXRhdGUoQVVDID0gcm91bmQoQVVDLCAzKSkgJT4lIGFycmFuZ2UoLUFVQykpCgoKCmBgYAoK