# Load some packages: 
library(tidyverse)
library(caret)
library(pROC)

# Load data: 
data("GermanCredit")
df <- GermanCredit

# Set predictors and response: 
response <- "Class"
predictors <- names(df %>% select(-response))

#===============================================================
#  Scenario 1: ROC/AUC as a criterion for feature engineering
#===============================================================

# Split our data: 

set.seed(1)
id <- createDataPartition(y = df %>% pull(response), p = 0.7, list = FALSE)
train <- df[id, ]
test <- df[-id, ]

# Function extracts ROC/AUC for a predictor selected: 

returnROC_AUC <- function(predictor_selected) {
  
  f <- as.formula(paste0(response, " ~ ", predictor_selected))
  logit <- glm(f, family = "binomial", data = train)
  prob_pred <- predict(logit, test, type = "response")
  my_auc <- roc(test %>% pull(response), prob_pred)$auc %>% as.numeric()
  return(tibble(predictor = predictor_selected, auc = my_auc))
  
}


# ROC/AUC by a given predictor: 
do.call("bind_rows", lapply(predictors, returnROC_AUC)) %>% 
  arrange(-auc) -> df_auc

# Function extracts ROC/AUC on test data: 

returnROC_AUCTestData <- function(predictor_selected) {
  f <- as.formula(paste0(response, " ~ ", paste(predictor_selected, collapse = " + ")))
  logit <- glm(f, family = "binomial", data = train)
  prob_pred <- predict(logit, test, type = "response")
  my_auc <- roc(test$Class, prob_pred)$auc %>% as.numeric()
  return(my_auc)
  
}


# Calculate our AUC: 
AUC_All <- returnROC_AUCTestData(predictor_selected = predictors)
AUC_Variables55 <- returnROC_AUCTestData(predictor_selected = df_auc %>% filter(auc > 0.55) %>% pull(predictor))

#==============================================================================
#  Scenario 2: Information Value (IV) as a criterion for feature engineering
#==============================================================================

# Before conducting IV procedure we must convert response variable to numeric: 

train %>% 
  mutate(Class = case_when(Class == "Bad" ~ 1, TRUE ~ 0)) -> train_converted

test %>% 
  mutate(Class = case_when(Class == "Bad" ~ 1, TRUE ~ 0)) -> test_converted


# Load scorecard package: 
library(scorecard)

# Calculate Information Values (IV): 
info_values <- iv(train_converted, y = "Class", positive = "Class|1")

# Variables with IV >= 0.1: 
info_values %>% 
  filter(info_value >= 0.1) %>% 
  arrange(info_value) %>% 
  mutate(info_value = round(info_value, 3), variable = factor(variable, levels = variable)) -> df_iv_over1


# Plot: 
df_iv_over1 %>% 
  ggplot(aes(variable, info_value)) + 
  geom_col(fill = "#377eb8") + 
  coord_flip() + 
  geom_text(aes(label = info_value), hjust = 1.1, size = 5, color = "white") + 
  labs(title = "Figure 1: Variables with Information Value (IV) >= 0.1", 
       x = NULL, y = "Information Value (IV)") + 
  scale_y_continuous(expand = c(0, 0), limits = c(0, 0.5)) + 
  theme_minimal() + 
  theme(panel.grid.major.y = element_blank()) + 
  theme(plot.margin = unit(c(1, 1, 1, 1), "cm")) 

## [INFO] creating woe binning ...
## [INFO] converting into woe values ...
## [INFO] converting into woe values ...
## Area under the curve: 0.6516
## [INFO] creating woe binning ...
## [INFO] converting into woe values ...
## [INFO] converting into woe values ...
Method AUC
AUC 0.55 0.799
WoE Tran 0.777
All Variables 0.750
LS0tCnRpdGxlOiAiQVVDIG9yIElWIENyaXRlcmlvbiAtIFdoaWNoIE1ldGhvZCBpcyBCZXR0ZXIgZm9yIEZlYXR1cmUgRW5naW5lZXJpbmc/IgpzdWJ0aXRsZTogIlIgZm9yIEZ1biIKYXV0aG9yOiAiTmd1eWVuIENoaSBEdW5nIgpvdXRwdXQ6CiAgaHRtbF9kb2N1bWVudDoKICAgIGNvZGVfZG93bmxvYWQ6IHllcwogICAgIyBjb2RlX2ZvbGRpbmc6IGhpZGUKICAgIGhpZ2hsaWdodDogemVuYnVybgogICAgdGhlbWU6IGZsYXRseQogICAgdG9jOiB5ZXMKICAgIHRvY19mbG9hdDogeWVzCiAgd29yZF9kb2N1bWVudDoKICAgIHRvYzogeWVzCi0tLQoKYGBge3Igc2V0dXAsaW5jbHVkZT1GQUxTRX0Ka25pdHI6Om9wdHNfY2h1bmskc2V0KGVjaG8gPSBUUlVFLCB3YXJuaW5nID0gRkFMU0UsIG1lc3NhZ2UgPSBGQUxTRSwgZmlnLnJldGluYT0yKQpgYGAKCgpgYGB7ciwgd2FybmluZyA9IEZBTFNFLCBtZXNzYWdlID0gRkFMU0V9CiMgTG9hZCBzb21lIHBhY2thZ2VzOiAKbGlicmFyeSh0aWR5dmVyc2UpCmxpYnJhcnkoY2FyZXQpCmxpYnJhcnkocFJPQykKCiMgTG9hZCBkYXRhOiAKZGF0YSgiR2VybWFuQ3JlZGl0IikKZGYgPC0gR2VybWFuQ3JlZGl0CgojIFNldCBwcmVkaWN0b3JzIGFuZCByZXNwb25zZTogCnJlc3BvbnNlIDwtICJDbGFzcyIKcHJlZGljdG9ycyA8LSBuYW1lcyhkZiAlPiUgc2VsZWN0KC1yZXNwb25zZSkpCgojPT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09CiMgIFNjZW5hcmlvIDE6IFJPQy9BVUMgYXMgYSBjcml0ZXJpb24gZm9yIGZlYXR1cmUgZW5naW5lZXJpbmcKIz09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PQoKIyBTcGxpdCBvdXIgZGF0YTogCgpzZXQuc2VlZCgxKQppZCA8LSBjcmVhdGVEYXRhUGFydGl0aW9uKHkgPSBkZiAlPiUgcHVsbChyZXNwb25zZSksIHAgPSAwLjcsIGxpc3QgPSBGQUxTRSkKdHJhaW4gPC0gZGZbaWQsIF0KdGVzdCA8LSBkZlstaWQsIF0KCiMgRnVuY3Rpb24gZXh0cmFjdHMgUk9DL0FVQyBmb3IgYSBwcmVkaWN0b3Igc2VsZWN0ZWQ6IAoKcmV0dXJuUk9DX0FVQyA8LSBmdW5jdGlvbihwcmVkaWN0b3Jfc2VsZWN0ZWQpIHsKICAKICBmIDwtIGFzLmZvcm11bGEocGFzdGUwKHJlc3BvbnNlLCAiIH4gIiwgcHJlZGljdG9yX3NlbGVjdGVkKSkKICBsb2dpdCA8LSBnbG0oZiwgZmFtaWx5ID0gImJpbm9taWFsIiwgZGF0YSA9IHRyYWluKQogIHByb2JfcHJlZCA8LSBwcmVkaWN0KGxvZ2l0LCB0ZXN0LCB0eXBlID0gInJlc3BvbnNlIikKICBteV9hdWMgPC0gcm9jKHRlc3QgJT4lIHB1bGwocmVzcG9uc2UpLCBwcm9iX3ByZWQpJGF1YyAlPiUgYXMubnVtZXJpYygpCiAgcmV0dXJuKHRpYmJsZShwcmVkaWN0b3IgPSBwcmVkaWN0b3Jfc2VsZWN0ZWQsIGF1YyA9IG15X2F1YykpCiAgCn0KCgojIFJPQy9BVUMgYnkgYSBnaXZlbiBwcmVkaWN0b3I6IApkby5jYWxsKCJiaW5kX3Jvd3MiLCBsYXBwbHkocHJlZGljdG9ycywgcmV0dXJuUk9DX0FVQykpICU+JSAKICBhcnJhbmdlKC1hdWMpIC0+IGRmX2F1YwoKIyBGdW5jdGlvbiBleHRyYWN0cyBST0MvQVVDIG9uIHRlc3QgZGF0YTogCgpyZXR1cm5ST0NfQVVDVGVzdERhdGEgPC0gZnVuY3Rpb24ocHJlZGljdG9yX3NlbGVjdGVkKSB7CiAgZiA8LSBhcy5mb3JtdWxhKHBhc3RlMChyZXNwb25zZSwgIiB+ICIsIHBhc3RlKHByZWRpY3Rvcl9zZWxlY3RlZCwgY29sbGFwc2UgPSAiICsgIikpKQogIGxvZ2l0IDwtIGdsbShmLCBmYW1pbHkgPSAiYmlub21pYWwiLCBkYXRhID0gdHJhaW4pCiAgcHJvYl9wcmVkIDwtIHByZWRpY3QobG9naXQsIHRlc3QsIHR5cGUgPSAicmVzcG9uc2UiKQogIG15X2F1YyA8LSByb2ModGVzdCRDbGFzcywgcHJvYl9wcmVkKSRhdWMgJT4lIGFzLm51bWVyaWMoKQogIHJldHVybihteV9hdWMpCiAgCn0KCgojIENhbGN1bGF0ZSBvdXIgQVVDOiAKQVVDX0FsbCA8LSByZXR1cm5ST0NfQVVDVGVzdERhdGEocHJlZGljdG9yX3NlbGVjdGVkID0gcHJlZGljdG9ycykKQVVDX1ZhcmlhYmxlczU1IDwtIHJldHVyblJPQ19BVUNUZXN0RGF0YShwcmVkaWN0b3Jfc2VsZWN0ZWQgPSBkZl9hdWMgJT4lIGZpbHRlcihhdWMgPiAwLjU1KSAlPiUgcHVsbChwcmVkaWN0b3IpKQoKIz09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PQojICBTY2VuYXJpbyAyOiBJbmZvcm1hdGlvbiBWYWx1ZSAoSVYpIGFzIGEgY3JpdGVyaW9uIGZvciBmZWF0dXJlIGVuZ2luZWVyaW5nCiM9PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT0KCiMgQmVmb3JlIGNvbmR1Y3RpbmcgSVYgcHJvY2VkdXJlIHdlIG11c3QgY29udmVydCByZXNwb25zZSB2YXJpYWJsZSB0byBudW1lcmljOiAKCnRyYWluICU+JSAKICBtdXRhdGUoQ2xhc3MgPSBjYXNlX3doZW4oQ2xhc3MgPT0gIkJhZCIgfiAxLCBUUlVFIH4gMCkpIC0+IHRyYWluX2NvbnZlcnRlZAoKdGVzdCAlPiUgCiAgbXV0YXRlKENsYXNzID0gY2FzZV93aGVuKENsYXNzID09ICJCYWQiIH4gMSwgVFJVRSB+IDApKSAtPiB0ZXN0X2NvbnZlcnRlZAoKCiMgTG9hZCBzY29yZWNhcmQgcGFja2FnZTogCmxpYnJhcnkoc2NvcmVjYXJkKQoKIyBDYWxjdWxhdGUgSW5mb3JtYXRpb24gVmFsdWVzIChJVik6IAppbmZvX3ZhbHVlcyA8LSBpdih0cmFpbl9jb252ZXJ0ZWQsIHkgPSAiQ2xhc3MiLCBwb3NpdGl2ZSA9ICJDbGFzc3wxIikKCiMgVmFyaWFibGVzIHdpdGggSVYgPj0gMC4xOiAKaW5mb192YWx1ZXMgJT4lIAogIGZpbHRlcihpbmZvX3ZhbHVlID49IDAuMSkgJT4lIAogIGFycmFuZ2UoaW5mb192YWx1ZSkgJT4lIAogIG11dGF0ZShpbmZvX3ZhbHVlID0gcm91bmQoaW5mb192YWx1ZSwgMyksIHZhcmlhYmxlID0gZmFjdG9yKHZhcmlhYmxlLCBsZXZlbHMgPSB2YXJpYWJsZSkpIC0+IGRmX2l2X292ZXIxCgoKIyBQbG90OiAKZGZfaXZfb3ZlcjEgJT4lIAogIGdncGxvdChhZXModmFyaWFibGUsIGluZm9fdmFsdWUpKSArIAogIGdlb21fY29sKGZpbGwgPSAiIzM3N2ViOCIpICsgCiAgY29vcmRfZmxpcCgpICsgCiAgZ2VvbV90ZXh0KGFlcyhsYWJlbCA9IGluZm9fdmFsdWUpLCBoanVzdCA9IDEuMSwgc2l6ZSA9IDUsIGNvbG9yID0gIndoaXRlIikgKyAKICBsYWJzKHRpdGxlID0gIkZpZ3VyZSAxOiBWYXJpYWJsZXMgd2l0aCBJbmZvcm1hdGlvbiBWYWx1ZSAoSVYpID49IDAuMSIsIAogICAgICAgeCA9IE5VTEwsIHkgPSAiSW5mb3JtYXRpb24gVmFsdWUgKElWKSIpICsgCiAgc2NhbGVfeV9jb250aW51b3VzKGV4cGFuZCA9IGMoMCwgMCksIGxpbWl0cyA9IGMoMCwgMC41KSkgKyAKICB0aGVtZV9taW5pbWFsKCkgKyAKICB0aGVtZShwYW5lbC5ncmlkLm1ham9yLnkgPSBlbGVtZW50X2JsYW5rKCkpICsgCiAgdGhlbWUocGxvdC5tYXJnaW4gPSB1bml0KGMoMSwgMSwgMSwgMSksICJjbSIpKSAKCiMgQmlubmluZyB2YXJpYWJsZXMgd2l0aCBJViA+PSAwLjEgIDogCgpiaW5zIDwtIHdvZWJpbih0cmFpbl9jb252ZXJ0ZWQgJT4lIHNlbGVjdChkZl9pdl9vdmVyMSR2YXJpYWJsZSwgcmVzcG9uc2UpLCB5ID0gIkNsYXNzIiwgIHBvc2l0aXZlID0gIkNsYXNzfDEiKQoKIyBDb25kdWN0IFdvRSB0cmFuc2Zvcm1hdGlvbjogCgpkZl90cmFpbl93b2UgPC0gd29lYmluX3BseSh0cmFpbl9jb252ZXJ0ZWQgJT4lIHNlbGVjdChkZl9pdl9vdmVyMSR2YXJpYWJsZSwgcmVzcG9uc2UpLCBiaW5zID0gYmlucykgJT4lIAogIG11dGF0ZShDbGFzcyA9IGNhc2Vfd2hlbihDbGFzcyA9PSAxIH4gIkJhZCIsIFRSVUUgfiAiR29vZCIpICU+JSBhcy5mYWN0b3IoKSkKCmRmX3Rlc3Rfd29lIDwtIHdvZWJpbl9wbHkodGVzdF9jb252ZXJ0ZWQgJT4lIHNlbGVjdChkZl9pdl9vdmVyMSR2YXJpYWJsZSwgcmVzcG9uc2UpLCBiaW5zID0gYmlucykgJT4lIAogIG11dGF0ZShDbGFzcyA9IGNhc2Vfd2hlbihDbGFzcyA9PSAxIH4gIkJhZCIsIFRSVUUgfiAiR29vZCIpICU+JSBhcy5mYWN0b3IoKSkKCgojIFRyYWluIExvZ2lzdGljIFJlZ3Jlc3Npb24gd2l0aCBXb0UtdHJhbnNmb3JlZCBwcmVkaWN0b3JzOiAKCmxvZ2l0V29FIDwtIGdsbShDbGFzcyB+IC4sIGZhbWlseSA9ICJiaW5vbWlhbCIsIGRhdGEgPSBkZl90cmFpbl93b2UpCgojIENhbGN1bGF0ZSBST0MvQVVDOiAKCnByb2JfcHJlZFdvRSA8LSBwcmVkaWN0KGxvZ2l0V29FLCBkZl90ZXN0X3dvZSwgdHlwZSA9ICJyZXNwb25zZSIpCnJvYyhkZl90ZXN0X3dvZSRDbGFzcywgcHJvYl9wcmVkV29FKSRhdWMgCgojPT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT0KIyAgIFNjZW5hcmlvIDM6IFVzZWwgYWxsIFdvRS10cmFuc2Zvcm1lZCBwcmVkaWN0b3JzCiM9PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PQoKCiMgQmlubmluZyBhbGwgdmFyaWFibGVzOiAKCmJpbnNfYWxsIDwtIHdvZWJpbih0cmFpbl9jb252ZXJ0ZWQsIHkgPSAiQ2xhc3MiLCAgcG9zaXRpdmUgPSAiQ2xhc3N8MSIpCgojIENvbmR1Y3QgV29FIHRyYW5zZm9ybWF0aW9uOiAKCmRmX3RyYWluX3dvZUFsbCA8LSB3b2ViaW5fcGx5KHRyYWluX2NvbnZlcnRlZCwgYmlucyA9IGJpbnNfYWxsKSAlPiUgCiAgbXV0YXRlKENsYXNzID0gY2FzZV93aGVuKENsYXNzID09IDEgfiAiQmFkIiwgVFJVRSB+ICJHb29kIikgJT4lIGFzLmZhY3RvcigpKQoKZGZfdGVzdF93b2VBbGwgPC0gd29lYmluX3BseSh0ZXN0X2NvbnZlcnRlZCwgYmlucyA9IGJpbnNfYWxsKSAlPiUgCiAgbXV0YXRlKENsYXNzID0gY2FzZV93aGVuKENsYXNzID09IDEgfiAiQmFkIiwgVFJVRSB+ICJHb29kIikgJT4lIGFzLmZhY3RvcigpKQoKCiMgVHJhaW4gTG9naXN0aWMgUmVncmVzc2lvbiB3aXRoIFdvRS10cmFuc2Zvcm1lZCBwcmVkaWN0b3JzOiAKCmxvZ2l0V29FX0FsbCA8LSBnbG0oQ2xhc3MgfiAuLCBmYW1pbHkgPSAiYmlub21pYWwiLCBkYXRhID0gZGZfdHJhaW5fd29lQWxsKQoKIyBDYWxjdWxhdGUgUk9DL0FVQzogCgpwcm9iX3ByZWRXb0VfQWxsIDwtIHByZWRpY3QobG9naXRXb0VfQWxsLCBkZl90ZXN0X3dvZUFsbCwgdHlwZSA9ICJyZXNwb25zZSIpCkFVQ19Xb0VUcmFuIDwtIHJvYyhkZl90ZXN0X3dvZUFsbCRDbGFzcywgcHJvYl9wcmVkV29FX0FsbCkkYXVjICU+JSBhcy5udW1lcmljKCkKCiMgQ29tcGFyZSBBVUMgcmVzdWx0czoKZGZfcmVzdWx0cyA8LSB0aWJibGUoTWV0aG9kID0gYygiQWxsIFZhcmlhYmxlcyIsICJBVUMgMC41NSIsICJXb0UgVHJhbiIpLCAKICAgICAgICAgICAgICAgICAgICAgQVVDID0gYyhBVUNfQWxsLCBBVUNfVmFyaWFibGVzNTUsIEFVQ19Xb0VUcmFuKSkKCmtuaXRyOjprYWJsZShkZl9yZXN1bHRzICU+JSBtdXRhdGUoQVVDID0gcm91bmQoQVVDLCAzKSkgJT4lIGFycmFuZ2UoLUFVQykpCgoKCmBgYAoK