#------------------------------------
# Perform some data pre-processing
#------------------------------------
rm(list = ls())

# Load some packages: 
library(tidyverse)
library(magrittr)

# Import data: 
hmeq <- read.csv("http://www.creditriskanalytics.net/uploads/1/9/5/1/19511601/hmeq.csv")

# Function replaces NA by mean: 

replace_by_mean <- function(x) {
  x[is.na(x)] <- mean(x, na.rm = TRUE)
  return(x)
}

# A function imputes NA observations for categorical variables: 

replace_na_categorical <- function(x) {
  x %>% 
    table() %>% 
    as.data.frame() %>% 
    arrange(-Freq) ->> my_df
  
  n_obs <- sum(my_df$Freq)
  pop <- my_df$. %>% as.character()
  set.seed(29)
  x[is.na(x)] <- sample(pop, sum(is.na(x)), replace = TRUE, prob = my_df$Freq)
  return(x)
}

# Use the two functions: 
df <- hmeq %>% 
  mutate_if(is.factor, as.character) %>% 
  mutate(REASON = case_when(REASON == "" ~ NA_character_, TRUE ~ REASON), 
         JOB = case_when(JOB == "" ~ NA_character_, TRUE ~ JOB)) %>%
  mutate_if(is_character, as.factor) %>% 
  mutate_if(is.numeric, replace_by_mean) %>% 
  mutate_if(is.factor, replace_na_categorical)


# Convert BAD to factor and scale 0 -1 data set: 
df_for_ml <- df %>% 
  mutate(BAD = case_when(BAD == 1 ~ "Bad", TRUE ~ "Good") %>% as.factor()) %>% 
  mutate_if(is.numeric, function(x) {(x - min(x)) / (max(x) - min(x))})


#-----------------------------------
#  Simultaneously Train 10 Models
#-----------------------------------

# Split data: 
library(caret)
set.seed(1)
id <- createDataPartition(y = df_for_ml$BAD, p = 0.7, list = FALSE)
df_train_ml <- df_for_ml[id, ]
df_test_ml <- df_for_ml[-id, ]


# Set conditions for training model and cross-validation: 

set.seed(1)
number <- 3
repeats <- 5
control <- trainControl(method = "repeatedcv", 
                        number = number , 
                        repeats = repeats, 
                        classProbs = TRUE, 
                        savePredictions = "final", 
                        index = createResample(df_train_ml$BAD, repeats*number), 
                        summaryFunction = twoClassSummary, 
                        allowParallel = TRUE)


# Train Logistic Regression: 
my_logit <- train(BAD ~., method = "glm", data = df_train_ml,
                  trControl = control, metric = "ROC")

# Predict PD: 
pred.prob <- predict(my_logit, df_test_ml, type = "prob") %>% pull(Bad)

# Actual PD: 
test.y <- case_when(df_test_ml$BAD == "Bad" ~ 1, TRUE ~ 0)

# Accuracy by Threshold: 

library(classifierplots)

accuracy_plot(test.y, pred.prob, granularity = 0.01) + 
  hrbrthemes::theme_ft_rc() + 
  theme(panel.grid.minor = element_blank()) + 
  labs(title = "Figure 1: Accuracy by Threshold", 
       caption = "Data Source: http://www.creditriskanalytics.net")

LS0tDQp0aXRsZTogIlZpc3VhbGl6YXRpb24gb2YgQ2xhc3NpZmllciBQZXJmb3JtYW5jZSINCmF1dGhvcjogIk5ndXllbiBDaGkgRHVuZyINCnN1YnRpdGxlOiAiRGFpbHkgR3JhcGggU2VyaWVzIg0Kb3V0cHV0Og0KICBodG1sX2RvY3VtZW50Og0KICAgIGNvZGVfZG93bmxvYWQ6IHllcw0KICAgIGNvZGVfZm9sZGluZzogaGlkZQ0KICAgIGhpZ2hsaWdodDogemVuYnVybg0KICAgIHRoZW1lOiBmbGF0bHkNCiAgICB0b2M6IHllcw0KICAgIHRvY19mbG9hdDogeWVzDQogIHdvcmRfZG9jdW1lbnQ6DQogICAgdG9jOiB5ZXMNCi0tLQ0KDQpgYGB7ciBzZXR1cCxpbmNsdWRlPUZBTFNFfQ0Ka25pdHI6Om9wdHNfY2h1bmskc2V0KGVjaG8gPSBUUlVFLCB3YXJuaW5nID0gRkFMU0UsIG1lc3NhZ2UgPSBGQUxTRSwgZmlnLnJldGluYT0yKQ0KYGBgDQoNCg0KYGBge3J9DQoNCg0KDQoNCiMtLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0NCiMgUGVyZm9ybSBzb21lIGRhdGEgcHJlLXByb2Nlc3NpbmcNCiMtLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0NCnJtKGxpc3QgPSBscygpKQ0KDQojIExvYWQgc29tZSBwYWNrYWdlczogDQpsaWJyYXJ5KHRpZHl2ZXJzZSkNCmxpYnJhcnkobWFncml0dHIpDQoNCiMgSW1wb3J0IGRhdGE6IA0KaG1lcSA8LSByZWFkLmNzdigiaHR0cDovL3d3dy5jcmVkaXRyaXNrYW5hbHl0aWNzLm5ldC91cGxvYWRzLzEvOS81LzEvMTk1MTE2MDEvaG1lcS5jc3YiKQ0KDQojIEZ1bmN0aW9uIHJlcGxhY2VzIE5BIGJ5IG1lYW46IA0KDQpyZXBsYWNlX2J5X21lYW4gPC0gZnVuY3Rpb24oeCkgew0KICB4W2lzLm5hKHgpXSA8LSBtZWFuKHgsIG5hLnJtID0gVFJVRSkNCiAgcmV0dXJuKHgpDQp9DQoNCiMgQSBmdW5jdGlvbiBpbXB1dGVzIE5BIG9ic2VydmF0aW9ucyBmb3IgY2F0ZWdvcmljYWwgdmFyaWFibGVzOiANCg0KcmVwbGFjZV9uYV9jYXRlZ29yaWNhbCA8LSBmdW5jdGlvbih4KSB7DQogIHggJT4lIA0KICAgIHRhYmxlKCkgJT4lIA0KICAgIGFzLmRhdGEuZnJhbWUoKSAlPiUgDQogICAgYXJyYW5nZSgtRnJlcSkgLT4+IG15X2RmDQogIA0KICBuX29icyA8LSBzdW0obXlfZGYkRnJlcSkNCiAgcG9wIDwtIG15X2RmJC4gJT4lIGFzLmNoYXJhY3RlcigpDQogIHNldC5zZWVkKDI5KQ0KICB4W2lzLm5hKHgpXSA8LSBzYW1wbGUocG9wLCBzdW0oaXMubmEoeCkpLCByZXBsYWNlID0gVFJVRSwgcHJvYiA9IG15X2RmJEZyZXEpDQogIHJldHVybih4KQ0KfQ0KDQojIFVzZSB0aGUgdHdvIGZ1bmN0aW9uczogDQpkZiA8LSBobWVxICU+JSANCiAgbXV0YXRlX2lmKGlzLmZhY3RvciwgYXMuY2hhcmFjdGVyKSAlPiUgDQogIG11dGF0ZShSRUFTT04gPSBjYXNlX3doZW4oUkVBU09OID09ICIiIH4gTkFfY2hhcmFjdGVyXywgVFJVRSB+IFJFQVNPTiksIA0KICAgICAgICAgSk9CID0gY2FzZV93aGVuKEpPQiA9PSAiIiB+IE5BX2NoYXJhY3Rlcl8sIFRSVUUgfiBKT0IpKSAlPiUNCiAgbXV0YXRlX2lmKGlzX2NoYXJhY3RlciwgYXMuZmFjdG9yKSAlPiUgDQogIG11dGF0ZV9pZihpcy5udW1lcmljLCByZXBsYWNlX2J5X21lYW4pICU+JSANCiAgbXV0YXRlX2lmKGlzLmZhY3RvciwgcmVwbGFjZV9uYV9jYXRlZ29yaWNhbCkNCg0KDQojIENvbnZlcnQgQkFEIHRvIGZhY3RvciBhbmQgc2NhbGUgMCAtMSBkYXRhIHNldDogDQpkZl9mb3JfbWwgPC0gZGYgJT4lIA0KICBtdXRhdGUoQkFEID0gY2FzZV93aGVuKEJBRCA9PSAxIH4gIkJhZCIsIFRSVUUgfiAiR29vZCIpICU+JSBhcy5mYWN0b3IoKSkgJT4lIA0KICBtdXRhdGVfaWYoaXMubnVtZXJpYywgZnVuY3Rpb24oeCkgeyh4IC0gbWluKHgpKSAvIChtYXgoeCkgLSBtaW4oeCkpfSkNCg0KDQojLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0NCiMgIFNpbXVsdGFuZW91c2x5IFRyYWluIDEwIE1vZGVscw0KIy0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tDQoNCiMgU3BsaXQgZGF0YTogDQpsaWJyYXJ5KGNhcmV0KQ0Kc2V0LnNlZWQoMSkNCmlkIDwtIGNyZWF0ZURhdGFQYXJ0aXRpb24oeSA9IGRmX2Zvcl9tbCRCQUQsIHAgPSAwLjcsIGxpc3QgPSBGQUxTRSkNCmRmX3RyYWluX21sIDwtIGRmX2Zvcl9tbFtpZCwgXQ0KZGZfdGVzdF9tbCA8LSBkZl9mb3JfbWxbLWlkLCBdDQoNCg0KIyBTZXQgY29uZGl0aW9ucyBmb3IgdHJhaW5pbmcgbW9kZWwgYW5kIGNyb3NzLXZhbGlkYXRpb246IA0KDQpzZXQuc2VlZCgxKQ0KbnVtYmVyIDwtIDMNCnJlcGVhdHMgPC0gNQ0KY29udHJvbCA8LSB0cmFpbkNvbnRyb2wobWV0aG9kID0gInJlcGVhdGVkY3YiLCANCiAgICAgICAgICAgICAgICAgICAgICAgIG51bWJlciA9IG51bWJlciAsIA0KICAgICAgICAgICAgICAgICAgICAgICAgcmVwZWF0cyA9IHJlcGVhdHMsIA0KICAgICAgICAgICAgICAgICAgICAgICAgY2xhc3NQcm9icyA9IFRSVUUsIA0KICAgICAgICAgICAgICAgICAgICAgICAgc2F2ZVByZWRpY3Rpb25zID0gImZpbmFsIiwgDQogICAgICAgICAgICAgICAgICAgICAgICBpbmRleCA9IGNyZWF0ZVJlc2FtcGxlKGRmX3RyYWluX21sJEJBRCwgcmVwZWF0cypudW1iZXIpLCANCiAgICAgICAgICAgICAgICAgICAgICAgIHN1bW1hcnlGdW5jdGlvbiA9IHR3b0NsYXNzU3VtbWFyeSwgDQogICAgICAgICAgICAgICAgICAgICAgICBhbGxvd1BhcmFsbGVsID0gVFJVRSkNCg0KDQojIFRyYWluIExvZ2lzdGljIFJlZ3Jlc3Npb246IA0KbXlfbG9naXQgPC0gdHJhaW4oQkFEIH4uLCBtZXRob2QgPSAiZ2xtIiwgZGF0YSA9IGRmX3RyYWluX21sLA0KICAgICAgICAgICAgICAgICAgdHJDb250cm9sID0gY29udHJvbCwgbWV0cmljID0gIlJPQyIpDQoNCiMgUHJlZGljdCBQRDogDQpwcmVkLnByb2IgPC0gcHJlZGljdChteV9sb2dpdCwgZGZfdGVzdF9tbCwgdHlwZSA9ICJwcm9iIikgJT4lIHB1bGwoQmFkKQ0KDQojIEFjdHVhbCBQRDogDQp0ZXN0LnkgPC0gY2FzZV93aGVuKGRmX3Rlc3RfbWwkQkFEID09ICJCYWQiIH4gMSwgVFJVRSB+IDApDQoNCiMgQWNjdXJhY3kgYnkgVGhyZXNob2xkOiANCg0KbGlicmFyeShjbGFzc2lmaWVycGxvdHMpDQoNCmFjY3VyYWN5X3Bsb3QodGVzdC55LCBwcmVkLnByb2IsIGdyYW51bGFyaXR5ID0gMC4wMSkgKyANCiAgaHJicnRoZW1lczo6dGhlbWVfZnRfcmMoKSArIA0KICB0aGVtZShwYW5lbC5ncmlkLm1pbm9yID0gZWxlbWVudF9ibGFuaygpKSArIA0KICBsYWJzKHRpdGxlID0gIkZpZ3VyZSAxOiBBY2N1cmFjeSBieSBUaHJlc2hvbGQiLCANCiAgICAgICBjYXB0aW9uID0gIkRhdGEgU291cmNlOiBodHRwOi8vd3d3LmNyZWRpdHJpc2thbmFseXRpY3MubmV0IikNCg0KDQoNCg0KYGBgDQoNCg==