About the Contest
Information about Kalapa's Credit Scoring Challenge is available here (https://challenge.kalapa.vn/regulations.html). After registering for the contest, you can download the data here (https://www.aivivn.com/).
R Codes
R code for a naive approach that reached rank 19 on the leaderboard:
#=========================================
# A Naive Approach for Data Processing
#=========================================
# NOTE: the workspace is deliberately NOT wiped with rm(list = ls()). That call
# deletes the caller's objects while leaving loaded packages and options
# untouched, so it does not give a clean state anyway. Run this script in a
# fresh R session instead.
# Load data:
library(tidyverse)
# Directory holding the contest files; edit this single path to relocate them.
data_dir <- "/home/khanhan/Desktop/kalapa"
# Read the training set and upper-case every character column.
df_train <- read_csv(file.path(data_dir, "train.csv")) %>%
  mutate_if(is.character, str_to_upper)
# Read the test set, apply the same upper-casing, and coerce FIELD_36 to
# logical (presumably so its type matches the training column -- confirm).
df_test <- read_csv(file.path(data_dir, "test.csv")) %>%
  mutate_if(is.character, str_to_upper) %>%
  mutate(FIELD_36 = as.logical(FIELD_36))
# Bin the features with the scorecard package:
library(scorecard)
# Optimal binning of every feature (id excluded) against the target "label":
bins_var <- woebin(
  select(df_train, -id),
  y = "label",
  no_cores = 8,
  positive = "label|1"
)
# Stack the per-feature bin tables, keep the first row of each feature and
# expose its total IV as iv_var, ordered from the lowest to the highest IV.
# variable becomes a factor whose levels follow that ordering.
iv_values <- as.data.frame(do.call("rbind", bins_var)) %>%
  distinct(variable, .keep_all = TRUE) %>%
  rename(iv_var = total_iv) %>%
  arrange(iv_var) %>%
  mutate(variable = factor(variable, levels = variable))
# Names (as character) of the features whose IV is >= 0:
var_IV_10 <- as.character(pull(filter(iv_values, iv_var >= 0), variable))
# Replace the raw training features by their WoE values using the fitted bins,
# then keep the label plus the "<feature>_woe" column of every feature listed
# in var_IV_10:
train_woe <- as.data.frame(woebin_ply(select(df_train, -id), bins_var)) %>%
  select(c("label", paste0(var_IV_10, "_", "woe")))
# Same WoE transformation for the actual test data (which has no label column):
test_woe <- as.data.frame(woebin_ply(select(df_test, -id), bins_var)) %>%
  select(paste0(var_IV_10, "_", "woe"))
# Impute the NA entries of `x` by sampling from the values observed in `y`.
#
# Each NA in `x` is replaced by one of the distinct non-missing values of `y`,
# drawn with replacement and with probability proportional to how often that
# value occurs in `y`.
#
# Arguments:
#   x  Vector whose NA entries are to be filled.
#   y  Reference vector supplying the candidate values and their frequencies.
#      Non-numeric input is converted with as.numeric(as.character(y)).
#
# Returns `x` with every NA replaced; `x` is returned unchanged when it has no
# NA. Stops with an error if `x` has NAs but `y` has no non-missing value.
#
# Side effect: the global RNG is re-seeded with set.seed(29) on every call, so
# repeated calls are deterministic and later random draws in the session start
# from that seeded state.
replace_na_categorical <- function(x, y) {
  set.seed(29)

  missing_idx <- is.na(x)
  n_missing <- sum(missing_idx)
  if (n_missing == 0) {
    return(x)
  }

  if (!is.numeric(y)) {
    y <- as.numeric(as.character(y))
  }

  # Take the candidates straight from y so they keep full numeric precision
  # (no round trip through character labels).
  pop <- sort(unique(y[!is.na(y)]))
  if (length(pop) == 0) {
    stop("`y` has no non-missing values to sample from.", call. = FALSE)
  }
  freq <- tabulate(match(y, pop), nbins = length(pop))

  # Most frequent value first; order() keeps ties in their sorted order.
  by_freq <- order(-freq)
  pop <- pop[by_freq]
  freq <- freq[by_freq]

  # Sample positions, not values: sample(pop, ...) would silently draw from
  # 1:pop when pop is a single number >= 1.
  picks <- sample.int(length(pop), n_missing, replace = TRUE, prob = freq)
  x[missing_idx] <- pop[picks]
  x
}
# Fill the NAs of the listed test WoE columns, each one sampled from the
# distribution of the same column in the training data:
test_woe_imputed <- mutate(
  test_woe,
  FIELD_13_woe = replace_na_categorical(FIELD_13_woe, train_woe$FIELD_13_woe),
  maCv_woe = replace_na_categorical(maCv_woe, train_woe$maCv_woe),
  district_woe = replace_na_categorical(district_woe, train_woe$district_woe),
  FIELD_7_woe = replace_na_categorical(FIELD_7_woe, train_woe$FIELD_7_woe),
  FIELD_41_woe = replace_na_categorical(FIELD_41_woe, train_woe$FIELD_41_woe),
  FIELD_10_woe = replace_na_categorical(FIELD_10_woe, train_woe$FIELD_10_woe),
  FIELD_39_woe = replace_na_categorical(FIELD_39_woe, train_woe$FIELD_39_woe),
  FIELD_11_woe = replace_na_categorical(FIELD_11_woe, train_woe$FIELD_11_woe),
  FIELD_9_woe = replace_na_categorical(FIELD_9_woe, train_woe$FIELD_9_woe),
  FIELD_12_woe = replace_na_categorical(FIELD_12_woe, train_woe$FIELD_12_woe)
)
#======================================================
# Attempt 4: Default Random Forest with Scaled Data
#======================================================
# For convenience, convert the binary target variable to a factor
# (1 -> "Bad", anything else -> "Good"):
df_forGBM <- train_woe %>%
  mutate(label = as.factor(case_when(label == 1 ~ "Bad", TRUE ~ "Good")))

# Min-max scale the numeric columns. The minimum and maximum are taken from
# the TRAINING data only and reused for the test data, so that one WoE value
# maps to the same scaled value in both sets.
scale_cols <- names(df_forGBM)[vapply(df_forGBM, is.numeric, logical(1))]
train_min <- vapply(df_forGBM[scale_cols], function(v) as.numeric(min(v)), numeric(1))
train_max <- vapply(df_forGBM[scale_cols], function(v) as.numeric(max(v)), numeric(1))

# Rescale the columns of `df` that were numeric in the training data, using
# the training minimum and range.
scale_with_train_range <- function(df) {
  for (col in intersect(scale_cols, names(df))) {
    span <- train_max[[col]] - train_min[[col]]
    if (is.na(span) || span > 0) {
      df[[col]] <- (df[[col]] - train_min[[col]]) / span
    } else {
      # Constant training column: 0 / 0 would give NaN, so map it to 0.
      df[[col]] <- 0
    }
  }
  df
}

df_forGBM_Scaled <- scale_with_train_range(df_forGBM)
df_test_Scaled <- scale_with_train_range(test_woe_imputed)

# Train Random Forest (probability forest, default hyper-parameters):
library(ranger)
RF_default <- ranger(label ~ ., data = df_forGBM_Scaled, probability = TRUE)

# Use the RF classifier for predicting PD (Probability of Default), i.e. the
# predicted probability of class "Bad":
pd_sub_RF <- predict(RF_default, df_test_Scaled, type = "response")$predictions[, "Bad"]

# Save results for submission. The ids are taken from the test file itself
# rather than from a hard-coded range, so they always line up with the rows.
df_sub <- data.frame(id = df_test$id, label = pd_sub_RF)
write_csv(df_sub, "submission_RandomForest_ScaledData.csv")
LS0tCnRpdGxlOiAiS2FsYXBhJ3MgQ3JlZGl0IFNjb3JpbmcgQ2hhbGxlbmdlIC0gQSBOYWl2ZSBBcHByb2FjaCB3aXRoIFJhbmtpbmcgb2YgMTkiCmF1dGhvcjogIk5ndXllbiBDaGkgRHVuZyIKc3VidGl0bGU6IFIgRm9yIEZ1bgpvdXRwdXQ6CiAgaHRtbF9kb2N1bWVudDoKICAgIGNvZGVfZG93bmxvYWQ6IHllcwogICAgaGlnaGxpZ2h0OiB6ZW5idXJuCiAgICB0aGVtZTogZmxhdGx5CiAgICB0b2M6IHllcwogICAgdG9jX2Zsb2F0OiB5ZXMKICBwZGZfZG9jdW1lbnQ6CiAgICB0b2M6IHllcwotLS0KCmBgYHtyIHNldHVwLGluY2x1ZGU9RkFMU0V9CmtuaXRyOjpvcHRzX2NodW5rJHNldChlY2hvID0gVFJVRSwgd2FybmluZyA9IEZBTFNFLCBtZXNzYWdlID0gRkFMU0UsIGZpZy53aWR0aCA9IDEwLCBmaWcuaGVpZ2h0ID0gNikKYGBgCgojIEFib3V0IHRoZSBDb250ZXN0CgpJbmZvcm1hdGlvbiBhYm91dCB0aGUgS2FsYXBhJ3MgQ3JlZGl0IFNjb3JpbmcgQ2hhbGxlbmdlIGlzIFtoZXJlXShodHRwczovL2NoYWxsZW5nZS5rYWxhcGEudm4vcmVndWxhdGlvbnMuaHRtbCkuIEFmdGVyIHJlZ2lzdGVyaW5nIGZvciB0aGUgY29udGVzdCB5b3UgY2FuIGRvd25sb2FkIGRhdGEgW2hlcmVdKGh0dHBzOi8vd3d3LmFpdml2bi5jb20vKS4gCgojIFIgQ29kZXMgCgpSIGNvZGVzIGZvciBhIG5haXZlIGFwcHJvYWNoIHdpdGggcmFua2luZyBvZiAxOTogCgpgYGB7ciwgZXZhbD1GQUxTRX0KIz09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09CiMgIEEgTmFpdmUgQXBwcm9hY2ggZm9yIERhdGEgUHJvY2Vzc2luZwojPT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT0KCiMgQ2xlYXIgd29yayBzcGFjZTogCnJtKGxpc3QgPSBscygpKQoKIyBMb2FkIGRhdGE6IApsaWJyYXJ5KHRpZHl2ZXJzZSkKCmRmX3RyYWluIDwtIHJlYWRfY3N2KCIvaG9tZS9raGFuaGFuL0Rlc2t0b3Ava2FsYXBhL3RyYWluLmNzdiIpCgpkZl90cmFpbiAlPiUgbXV0YXRlX2lmKGlzLmNoYXJhY3RlciwgZnVuY3Rpb24oeCkge3N0cl90b191cHBlcih4KX0pIC0+IGRmX3RyYWluCgpkZl90ZXN0IDwtIHJlYWRfY3N2KCIvaG9tZS9raGFuaGFuL0Rlc2t0b3Ava2FsYXBhL3Rlc3QuY3N2IikKCmRmX3Rlc3QgJT4lIAogIG11dGF0ZV9pZihpcy5jaGFyYWN0ZXIsIGZ1bmN0aW9uKHgpIHtzdHJfdG9fdXBwZXIoeCl9KSAlPiUgCiAgbXV0YXRlKEZJRUxEXzM2ID0gYXMubG9naWNhbChGSUVMRF8zNikpIC0+IGRmX3Rlc3QKCiMgQ29uZHVjdCBiaW5uaW5nIHZhcmlhYmxlczogCmxpYnJhcnkoc2NvcmVjYXJkKQoKIyBHZW5lcmF0ZXMgb3B0aW1hbCBiaW5uaW5nIGZvciBhbGwgdmFyaWFibGVzL2ZlYXR1cmVzOiAKYmluc192YXIgPC0gd29lYmluKGRmX3RyYWluICU+JSBzZWxlY3QoLWlkKSwgeSA9ICJsYWJlbCIsIG5vX2NvcmVzID0gOCwgcG9zaXRpdmUgPSAibGFiZWx8MSIpCgojIElWIGZvciB2YXJpYWJsZXMvZmVhdHVyZXM6IAoKZG8uY2Fs
bCgicmJpbmQiLCBiaW5zX3ZhcikgJT4lIAogIGFzLmRhdGEuZnJhbWUoKSAlPiUgCiAgZmlsdGVyKCFkdXBsaWNhdGVkKHZhcmlhYmxlKSkgJT4lIAogIHJlbmFtZShpdl92YXIgPSB0b3RhbF9pdikgJT4lIAogIGFycmFuZ2UoaXZfdmFyKSAlPiUgCiAgbXV0YXRlKHZhcmlhYmxlID0gZmFjdG9yKHZhcmlhYmxlLCBsZXZlbHMgPSB2YXJpYWJsZSkpIC0+IGl2X3ZhbHVlcwoKIyBGZWF0dXJlcyBoYXZlIElWID49IDA6IAoKaXZfdmFsdWVzICU+JSAKICBmaWx0ZXIoaXZfdmFyID49IDApICU+JSAKICBwdWxsKHZhcmlhYmxlKSAlPiUgCiAgYXMuY2hhcmFjdGVyKCkgLT4gdmFyX0lWXzEwCgoKIyBDb25kdWN0IGRhdGEgdHJhbnNmb3JtYXRpb24gYmFzZWQgb24gSVYvV29FIGFuZCBmaWx0ZXIgZmVhdHVyZXMgd2l0aCBJViA+IDAuMTogCgp0cmFpbl93b2UgPC0gd29lYmluX3BseShkZl90cmFpbiAlPiUgc2VsZWN0KC1pZCksIGJpbnNfdmFyKSAlPiUgCiAgYXMuZGF0YS5mcmFtZSgpICU+JSAKICBzZWxlY3QoYygibGFiZWwiLCBwYXN0ZTAodmFyX0lWXzEwLCAiXyIsICJ3b2UiKSkpCgoKIyBEYXRhIHRyYW5zZm9ybWF0aW9uIGZvciBhY3R1YWwgdGVzdCBkYXRhOiAKCnRlc3Rfd29lIDwtIHdvZWJpbl9wbHkoZGZfdGVzdCAlPiUgc2VsZWN0KC1pZCksIGJpbnNfdmFyKSAlPiUgCiAgYXMuZGF0YS5mcmFtZSgpICU+JSAKICBzZWxlY3QocGFzdGUwKHZhcl9JVl8xMCwgIl8iLCAid29lIikpCgojIEEgZnVuY3Rpb24gaW1wdXRlcyBOQSBvYnNlcnZhdGlvbnM6IAoKcmVwbGFjZV9uYV9jYXRlZ29yaWNhbCA8LSBmdW5jdGlvbih4LCB5KSB7CiAgCiAgeSAlPiUgCiAgICB0YWJsZSgpICU+JSAKICAgIGFzLmRhdGEuZnJhbWUoKSAlPiUgCiAgICBhcnJhbmdlKC1GcmVxKSAtPiBteV9kZgogIAogIG5fb2JzIDwtIHN1bShteV9kZiRGcmVxKQogIAogIHBvcCA8LSBteV9kZiQuICU+JSBhcy5jaGFyYWN0ZXIoKSAlPiUgYXMubnVtZXJpYygpCiAgCiAgc2V0LnNlZWQoMjkpCiAgCiAgeFtpcy5uYSh4KV0gPC0gc2FtcGxlKHBvcCwgc3VtKGlzLm5hKHgpKSwgcmVwbGFjZSA9IFRSVUUsIHByb2IgPSBteV9kZiRGcmVxKQogIAogIHJldHVybih4KQp9CgojIFVzZSB0aGUgZnVuY3Rpb246IAoKdGVzdF93b2UgJT4lIAogIG11dGF0ZShGSUVMRF8xM193b2UgPSByZXBsYWNlX25hX2NhdGVnb3JpY2FsKEZJRUxEXzEzX3dvZSwgdHJhaW5fd29lJEZJRUxEXzEzX3dvZSkpICU+JSAKICBtdXRhdGUobWFDdl93b2UgPSByZXBsYWNlX25hX2NhdGVnb3JpY2FsKG1hQ3Zfd29lLCB0cmFpbl93b2UkbWFDdl93b2UpKSAlPiUgCiAgbXV0YXRlKGRpc3RyaWN0X3dvZSA9IHJlcGxhY2VfbmFfY2F0ZWdvcmljYWwoZGlzdHJpY3Rfd29lLCB0cmFpbl93b2UkZGlzdHJpY3Rfd29lKSkgJT4lIAogIG11dGF0ZShGSUVMRF83X3dvZSA9IHJlcGxhY2VfbmFfY2F0ZWdvcmljYWwoRklFTERfN193b2UsIHRyYWluX3dvZSRGSUVMRF83X3dvZSkpICU+JSAKICBtdXRhdGUoRklF
TERfNDFfd29lID0gcmVwbGFjZV9uYV9jYXRlZ29yaWNhbChGSUVMRF80MV93b2UsIHRyYWluX3dvZSRGSUVMRF80MV93b2UpKSAlPiUgCiAgbXV0YXRlKEZJRUxEXzEwX3dvZSA9IHJlcGxhY2VfbmFfY2F0ZWdvcmljYWwoRklFTERfMTBfd29lLCB0cmFpbl93b2UkRklFTERfMTBfd29lKSkgJT4lIAogIG11dGF0ZShGSUVMRF8zOV93b2UgPSByZXBsYWNlX25hX2NhdGVnb3JpY2FsKEZJRUxEXzM5X3dvZSwgdHJhaW5fd29lJEZJRUxEXzM5X3dvZSkpICU+JSAKICBtdXRhdGUoRklFTERfMTFfd29lID0gcmVwbGFjZV9uYV9jYXRlZ29yaWNhbChGSUVMRF8xMV93b2UsIHRyYWluX3dvZSRGSUVMRF8xMV93b2UpKSAlPiUgCiAgbXV0YXRlKEZJRUxEXzlfd29lID0gcmVwbGFjZV9uYV9jYXRlZ29yaWNhbChGSUVMRF85X3dvZSwgdHJhaW5fd29lJEZJRUxEXzlfd29lKSkgJT4lIAogIG11dGF0ZShGSUVMRF8xMl93b2UgPSByZXBsYWNlX25hX2NhdGVnb3JpY2FsKEZJRUxEXzEyX3dvZSwgdHJhaW5fd29lJEZJRUxEXzEyX3dvZSkpIC0+IHRlc3Rfd29lX2ltcHV0ZWQKCgojPT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09CiMgQXR0ZW1wdCA0OiBEZWZhdWx0IFJhbmRvbSBGb3Jlc3Qgd2l0aCBTY2FsZWQgRGF0YQojPT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09CgojIEZvciBjb252aW5pZW5jZSwgY29udmVydCBiaW5hcnkgdGFyZ2V0IHZhcmlhYmxlIHRvIGZhY3RvcjogCgp0cmFpbl93b2UgJT4lIAogIG11dGF0ZShsYWJlbCA9IGNhc2Vfd2hlbihsYWJlbCA9PSAxIH4gIkJhZCIsIFRSVUUgfiAiR29vZCIpKSAlPiUgCiAgbXV0YXRlKGxhYmVsID0gYXMuZmFjdG9yKGxhYmVsKSkgLT4gZGZfZm9yR0JNIAoKIyBTY2FsZSBvdXIgZGF0YTogCgpkZl9mb3JHQk0gJT4lIAogIG11dGF0ZV9pZihpcy5udW1lcmljLCBmdW5jdGlvbih4KSB7KHggLSBtaW4oeCkpIC8gKG1heCh4KSAtIG1pbih4KSl9KSAtPiBkZl9mb3JHQk1fU2NhbGVkCgp0ZXN0X3dvZV9pbXB1dGVkICU+JSAKICBtdXRhdGVfaWYoaXMubnVtZXJpYywgZnVuY3Rpb24oeCkgeyh4IC0gbWluKHgpKSAvIChtYXgoeCkgLSBtaW4oeCkpfSkgLT4gZGZfdGVzdF9TY2FsZWQKCiMgVHJhaW4gUmFuZG9tIEZvcmVzdDogCmxpYnJhcnkocmFuZ2VyKQpSRl9kZWZhdWx0IDwtIHJhbmdlcihsYWJlbCB+IC4sIGRhdGEgPSBkZl9mb3JHQk1fU2NhbGVkLCBwcm9iYWJpbGl0eSA9IFRSVUUpCgojIFVzZSB0aGUgUkYgQ2xhc3NpZmllciBmb3IgcHJlZGljdGluZyBQRCAoUHJvYmFiaWxpdHkgb2YgRGVmYXVsdCk6IApwZF9zdWJfUkYgPC0gcHJlZGljdChSRl9kZWZhdWx0LCBkZl90ZXN0X1NjYWxlZCwgdHlwZSA9ICJyZXNwb25zZSIpCgojIFNhdmUgcmVzdWx0cyBmb3Igc3VibWlzc2lvbjogCnBkX3N1Yl9SRiRwcmVkaWN0aW9ucyAlPiUgYXMuZGF0YS5mcmFtZSgpICU+JSBwdWxsKEJhZCkgLT4gcGRf
c3ViX1JGCmRmX3N1YiA8LSBkYXRhLmZyYW1lKGlkID0gMzAwMDA6NDk5OTksIGxhYmVsID0gcGRfc3ViX1JGKQp3cml0ZV9jc3YoZGZfc3ViLCAic3VibWlzc2lvbl9SYW5kb21Gb3Jlc3RfU2NhbGVkRGF0YS5jc3YiKQogICAgCmBgYAoKCgo=