Load all the required R packages.

# install the required packages first
library(readxl)    # load excel files
library(cdata)     # data wrangling
## Loading required package: wrapr
library(dplyr)     # tidy up the data (data wrangling)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:wrapr':
## 
##     coalesce
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(rsample)   # stratified sampling
library(caret)     # preprocessing (min-max normalization)
## Loading required package: lattice
## Loading required package: ggplot2
library(sigr)      # for AUC calculation
library(knitr)     # for table presentation
library(mgcv)      # GAM model
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
## 
##     collapse
## This is mgcv 1.8-31. For overview type 'help("mgcv-package")'.
## 
## Attaching package: 'mgcv'
## The following object is masked from 'package:wrapr':
## 
##     %.%
library(glue)      # for formula construction
## 
## Attaching package: 'glue'
## The following object is masked from 'package:nlme':
## 
##     collapse
## The following object is masked from 'package:dplyr':
## 
##     collapse
library(WVPlots)   # double density plots and ROC curves
library(car)       # VIF calculation
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:wrapr':
## 
##     bc
library(glmnet)    # Lasso model
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:wrapr':
## 
##     pack, unpack
## Loaded glmnet 4.1-2
library(glmnetUtils)  # Lasso model
## 
## Attaching package: 'glmnetUtils'
## The following objects are masked from 'package:glmnet':
## 
##     cv.glmnet, glmnet
library(class)     # kNN 

Data acquisition and preprocessing

The coding is the same as this Rpubs document 7 and will not be shown here.

Data partitioning

The coding is the same as this Rpubs document 7 and will not be shown here.

## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(response)` instead of `response` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.

Data transformation

Continuous features

Min-max normalization to standardize scales across all continuous predictors.

# Select the continuous features
features_cont = colnames(train)[c(1, 5, c(12:23))]
train_cont = train[,features_cont]
# Min-max normalization on train data (continuous features)
preprocess_min_max = preProcess(train_cont, method = c("range"))
scaled_train_cont = predict(preprocess_min_max, train_cont)

train_scaled = cbind(scaled_train_cont, train[, features_cat], default = train$default)

Categorical features

One-hot encoding and ordinal encoding.

m_train = dim(train_scaled)[1]
my_cat_encode = function(predictor, ncol, col_names, m){
  # data is the predictor (data frame); ncol is the number of columns
  # m = dim(predictor)[1]; col_names is the names of column header
  mat = matrix(, nrow = m, ncol = ncol)
  
  for (i in (1:m)){
    data = predictor[i]
    if (data == -2){
      vec = c(1,0,0,0)
    } else if (data == -1){
      vec = c(0,1,0,0)
    } else if (data == 0){
      vec = c(0,0,1,0)
    } else {
      vec = c(0,0,0,data)
    }
    mat[i,] = vec
  }
  df_ps0 = data.frame(mat)
  # colnames(df_ps0) = c("neg2_pay0", "neg1_pay0", "zero_pay0","delay_months")
  colnames(df_ps0) = col_names
  df_ps0
}

df = data.frame(matrix(,nrow = m_train, ncol = 0))
f_pay = c("PAY_0", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6")

for (j in f_pay){
  predictor = train_scaled[,j]
  col_names = sapply(c("neg2_{j}", "duly_{j}", "zero_{j}", "delay_months_{j}"), glue)
  names(col_names) = NULL
  df1 = my_cat_encode(predictor, ncol = 4, col_names = col_names, m = m_train)
  df = cbind(df, df1)
}

# concatenate original dataframe
train_new = cbind(train_scaled, df)
train_new = train_new[, !(colnames(train_new) %in% f_pay)]

The above preprocessing pipeline is applied on test data.

# For test data
scaled_test_cont = predict(preprocess_min_max, test[,features_cont])
test_scaled = cbind(scaled_test_cont, test[, features_cat], default = test$default)

m_test = dim(test_scaled)[1]
# Repeat same pipeline as training data
df = data.frame(matrix(,nrow = m_test, ncol = 0))
f_pay = c("PAY_0", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6")

for (j in f_pay){
  predictor = test_scaled[,j]
  col_names = sapply(c("neg2_{j}", "duly_{j}", "zero_{j}", "delay_months_{j}"), glue)
  names(col_names) = NULL
  df1 = my_cat_encode(predictor, ncol = 4, col_names = col_names, m = m_test)
  df = cbind(df, df1)
}

test_new = cbind(test_scaled, df)
test_new = test_new[, !(colnames(test_new) %in% f_pay)]

cat('Training data no. of samples: ', dim(train_new)[1], 
    ' ; no. of features: ', dim(train_new)[2])
## Training data no. of samples:  23999  ; no. of features:  42
cat('Test data no. of samples: ', dim(test_new)[1], 
    ' ; no. of features: ', dim(test_new)[2])
## Test data no. of samples:  6001  ; no. of features:  42

We end up with 42 predictors and this transformed trained data will be used to train logistic regression (LR), regularized logistic regression (Lasso), generalized additive model (GAM).

Models training and their performance evaluation

First, I write a function to calculate accuracy, precision, recall, f-score and AUC. AUC refers to area under the receiver operating characteristic (ROC) curve.

performance = function(truth, pred, threshold=0.5){
  # Expect pred to be posterior probability (double precision)
  if (typeof(pred) == 'double'){
    tab = table(truth, pred>=threshold)
  } else {
    stop('The model prediction must be either scores or probability estimates')
  }
  acc = (sum(diag(tab)))/sum(tab)
  recall = tab[2,2]/sum(tab[2,])
  precision = tab[2,2]/sum(tab[,2])
  f1_score = (2*recall*precision)/(recall+precision)
  # Calculate AUC, change the response variable to boolean variable
  AUC = calcAUC(pred, as.numeric(truth)-1)
  round(c(accuracy = acc, recall = recall, precision = precision,
          f1_score = f1_score, AUC = AUC),4)
}

Logistic regression (LR)

Model training

Logarithmic transform positively skewed predictors as shown in the code chunks below.

# Construct formula with logarithmic transformation on continuous predictors
variables = glue('log({features_cont}+1)')
response = 'default'
cat_var = setdiff(colnames(train_new), c(features_cont, response))
all_vars = c(variables, cat_var)
f = as.formula(
  paste(response, paste(all_vars, collapse = " + "), sep = " ~ ")
)
print(f)
## default ~ log(LIMIT_BAL + 1) + log(AGE + 1) + log(BILL_AMT1 + 
##     1) + log(BILL_AMT2 + 1) + log(BILL_AMT3 + 1) + log(BILL_AMT4 + 
##     1) + log(BILL_AMT5 + 1) + log(BILL_AMT6 + 1) + log(PAY_AMT1 + 
##     1) + log(PAY_AMT2 + 1) + log(PAY_AMT3 + 1) + log(PAY_AMT4 + 
##     1) + log(PAY_AMT5 + 1) + log(PAY_AMT6 + 1) + SEX + EDUCATION + 
##     MARRIAGE + neg2_PAY_0 + duly_PAY_0 + zero_PAY_0 + delay_months_PAY_0 + 
##     neg2_PAY_2 + duly_PAY_2 + zero_PAY_2 + delay_months_PAY_2 + 
##     neg2_PAY_3 + duly_PAY_3 + zero_PAY_3 + delay_months_PAY_3 + 
##     neg2_PAY_4 + duly_PAY_4 + zero_PAY_4 + delay_months_PAY_4 + 
##     neg2_PAY_5 + duly_PAY_5 + zero_PAY_5 + delay_months_PAY_5 + 
##     neg2_PAY_6 + duly_PAY_6 + zero_PAY_6 + delay_months_PAY_6
model_logreg = glm(f, data = train_new, family = binomial(link = "logit"))
summary(model_logreg)
## 
## Call:
## glm(formula = f, family = binomial(link = "logit"), data = train_new)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2834  -0.5932  -0.5021  -0.3033   3.1770  
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)            0.623290   0.537022   1.161 0.245788    
## log(LIMIT_BAL + 1)    -2.370658   0.233152 -10.168  < 2e-16 ***
## log(AGE + 1)           0.299972   0.156123   1.921 0.054684 .  
## log(BILL_AMT1 + 1)    -1.131413   1.744292  -0.649 0.516573    
## log(BILL_AMT2 + 1)     1.729930   2.032756   0.851 0.394754    
## log(BILL_AMT3 + 1)     6.203739   3.046040   2.037 0.041684 *  
## log(BILL_AMT4 + 1)    -0.174320   1.736196  -0.100 0.920024    
## log(BILL_AMT5 + 1)    -1.630793   2.082356  -0.783 0.433540    
## log(BILL_AMT6 + 1)     1.586931   2.415310   0.657 0.511162    
## log(PAY_AMT1 + 1)    -11.022535   2.415819  -4.563 5.05e-06 ***
## log(PAY_AMT2 + 1)    -16.484680   4.092820  -4.028 5.63e-05 ***
## log(PAY_AMT3 + 1)     -1.495719   1.954620  -0.765 0.444139    
## log(PAY_AMT4 + 1)     -1.042464   1.388120  -0.751 0.452659    
## log(PAY_AMT5 + 1)     -1.908810   0.994503  -1.919 0.054939 .  
## log(PAY_AMT6 + 1)     -1.953664   0.905576  -2.157 0.030977 *  
## SEXM                   0.146167   0.036082   4.051 5.10e-05 ***
## EDUCATIONhigh_school  -0.020145   0.055910  -0.360 0.718616    
## EDUCATIONothers       -0.873374   0.403451  -2.165 0.030406 *  
## EDUCATIONuniversity    0.002154   0.041825   0.051 0.958932    
## EDUCATIONunknown      -1.262168   0.255757  -4.935 8.01e-07 ***
## MARRIAGEothers        -0.140138   0.157815  -0.888 0.374546    
## MARRIAGEsingle        -0.158288   0.040954  -3.865 0.000111 ***
## neg2_PAY_0            -0.216275   0.109050  -1.983 0.047337 *  
## duly_PAY_0             0.038243   0.106513   0.359 0.719559    
## zero_PAY_0            -0.938408   0.118341  -7.930 2.20e-15 ***
## delay_months_PAY_0     0.636710   0.051449  12.376  < 2e-16 ***
## neg2_PAY_2            -1.324896   0.212811  -6.226 4.79e-10 ***
## duly_PAY_2            -1.238668   0.204339  -6.062 1.35e-09 ***
## zero_PAY_2            -0.784737   0.209117  -3.753 0.000175 ***
## delay_months_PAY_2    -0.533255   0.089892  -5.932 2.99e-09 ***
## neg2_PAY_3            -0.438103   0.244374  -1.793 0.073012 .  
## duly_PAY_3            -0.545635   0.221681  -2.461 0.013841 *  
## zero_PAY_3            -0.491314   0.212467  -2.312 0.020754 *  
## delay_months_PAY_3    -0.059587   0.100393  -0.594 0.552819    
## neg2_PAY_4            -0.902047   0.315394  -2.860 0.004236 ** 
## duly_PAY_4            -0.861328   0.297962  -2.891 0.003843 ** 
## zero_PAY_4            -0.875994   0.290268  -3.018 0.002546 ** 
## delay_months_PAY_4    -0.282034   0.139951  -2.015 0.043879 *  
## neg2_PAY_5            -0.395381   0.354870  -1.114 0.265212    
## duly_PAY_5            -0.564350   0.340586  -1.657 0.097520 .  
## zero_PAY_5            -0.415625   0.332329  -1.251 0.211064    
## delay_months_PAY_5    -0.093923   0.159758  -0.588 0.556593    
## neg2_PAY_6             0.675856   0.289830   2.332 0.019706 *  
## duly_PAY_6             0.540918   0.282701   1.913 0.055697 .  
## zero_PAY_6             0.345902   0.274818   1.259 0.208155    
## delay_months_PAY_6     0.391511   0.130558   2.999 0.002711 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 25362  on 23998  degrees of freedom
## Residual deviance: 20803  on 23953  degrees of freedom
## AIC: 20895
## 
## Number of Fisher Scoring iterations: 5

Variable inflation factor (VIF)

VIF of more than 5 indicates strong multicollinearity (highly collinear with other variables) (see this link). Multicollinearity makes it difficult to establish relationship between predictors and target variable as it is difficult to separate out the individual effect of the collinear variables on the response. Regularization is a good alternative to address multicollinearity issue by reducing the magnitudes of coefficients.

kable(data.frame(vif(model_logreg)), caption = "VIF for predictors")
VIF for predictors
GVIF Df GVIF..1..2.Df..
log(LIMIT_BAL + 1) 1.755492 1 1.324950
log(AGE + 1) 1.419511 1 1.191432
log(BILL_AMT1 + 1) 23.888635 1 4.887600
log(BILL_AMT2 + 1) 39.971787 1 6.322324
log(BILL_AMT3 + 1) 29.159909 1 5.399991
log(BILL_AMT4 + 1) 26.650667 1 5.162428
log(BILL_AMT5 + 1) 32.676032 1 5.716295
log(BILL_AMT6 + 1) 19.694999 1 4.437905
log(PAY_AMT1 + 1) 1.522892 1 1.234055
log(PAY_AMT2 + 1) 1.511449 1 1.229410
log(PAY_AMT3 + 1) 1.549805 1 1.244912
log(PAY_AMT4 + 1) 1.536855 1 1.239699
log(PAY_AMT5 + 1) 1.605836 1 1.267216
log(PAY_AMT6 + 1) 1.144700 1 1.069906
SEX 1.029312 1 1.014550
EDUCATION 1.250969 4 1.028385
MARRIAGE 1.364017 2 1.080699
neg2_PAY_0 2.689367 1 1.639929
duly_PAY_0 5.573788 1 2.360887
zero_PAY_0 10.819927 1 3.289366
delay_months_PAY_0 6.098382 1 2.469490
neg2_PAY_2 16.582334 1 4.072141
duly_PAY_2 20.570947 1 4.535521
zero_PAY_2 35.267327 1 5.938630
delay_months_PAY_2 22.769923 1 4.771784
neg2_PAY_3 23.566585 1 4.854543
duly_PAY_3 23.487948 1 4.846437
zero_PAY_3 36.628169 1 6.052121
delay_months_PAY_3 26.868115 1 5.183446
neg2_PAY_4 41.488296 1 6.441141
duly_PAY_4 41.912096 1 6.473955
zero_PAY_4 68.433990 1 8.272484
delay_months_PAY_4 46.683270 1 6.832516
neg2_PAY_5 55.485950 1 7.448889
duly_PAY_5 52.974433 1 7.278354
zero_PAY_5 89.329811 1 9.451445
delay_months_PAY_5 53.776321 1 7.333234
neg2_PAY_6 39.470716 1 6.282572
duly_PAY_6 38.232810 1 6.183269
zero_PAY_6 61.363627 1 7.833494
delay_months_PAY_6 35.558670 1 5.963109

LR model performance evaluation using training and test data.

pred = predict(model_logreg, newdata = train_new, type = "response")
pred_test = predict(model_logreg, newdata = test_new, type = "response")

test$pred_logreg = pred_test
test$binary_default = (test$default==1)

(perf_train_logreg = performance(train_new$default, pred = pred))
##  accuracy    recall precision  f1_score       AUC 
##    0.8210    0.3661    0.6761    0.4749    0.7743
(perf_test_logreg = performance(test_new$default, pred = pred_test))
##  accuracy    recall precision  f1_score       AUC 
##    0.8140    0.3456    0.6501    0.4513    0.7481

Double density plot

When it comes to probabilistic model, it’s useful to construct double density plot. Refer to this online e-book for more info.

DoubleDensityPlot(test, xvar = "pred_logreg", truthVar = "default", 
                  title = "Distribution of scores of LR model in test data") +
  geom_vline(xintercept = 0.5, color = 'red', linetype = 2)

Enrichment-recall plot

Visualize the model’s precision-recall trade-off.

train$pred = pred
train$binary_default = train$default == 1
PRTPlot(train, 'pred', 'binary_default', TRUE, 
        plotvars = c('enrichment', 'recall'), thresholdrange = c(0,1),
        title = 'Enrichment/Recall vs thresholds for LR model')

Double density plot and enrichment-recall plot are helpful to pick the classifier’s threshold.

Lasso

Model Training and its coeffient estimates.

model_lasso = cv.glmnet(f, data = train_new, alpha = 0, 
                        family = "binomial", standardize = FALSE)
# summary(model_lasso)
coeff = coef(model_lasso)
coef_frame <- data.frame(coef = rownames(coeff)[-1],
                         value = coeff[-1,1])
ggplot(coef_frame, aes(x = coef, y = value)) +
  geom_pointrange(aes(ymin = 0, ymax = value)) +
  ggtitle("Coefficients of lasso model") +
  coord_flip() +
  theme(axis.text.y = element_text(size = 6))

Model evaluation

Lasso performance evaluated using training set (above) and test set (below)

##  accuracy    recall precision  f1_score       AUC 
##    0.8112    0.2611    0.6947    0.3796    0.7588
##  accuracy    recall precision  f1_score       AUC 
##    0.8022    0.2304    0.6497    0.3402    0.7385

The code outputs are training performance (above) and test performance (below).

GAM

Model training

# Construct the formula
variables = glue('s(log({features_cont}+1))')
f_add = train_new %>% select(tail(names(.),24)) %>% colnames()
f_cat = c("SEX", "EDUCATION", "MARRIAGE")
all_vars = c(variables, c(f_add,f_cat))
# Change the response to binary variable (Boolean)
train_new$binary_default = (train_new$default==1)

f = as.formula(
  paste("binary_default", paste(all_vars, collapse = " + "), sep = " ~ ")
)
print(f)
## binary_default ~ s(log(LIMIT_BAL + 1)) + s(log(AGE + 1)) + s(log(BILL_AMT1 + 
##     1)) + s(log(BILL_AMT2 + 1)) + s(log(BILL_AMT3 + 1)) + s(log(BILL_AMT4 + 
##     1)) + s(log(BILL_AMT5 + 1)) + s(log(BILL_AMT6 + 1)) + s(log(PAY_AMT1 + 
##     1)) + s(log(PAY_AMT2 + 1)) + s(log(PAY_AMT3 + 1)) + s(log(PAY_AMT4 + 
##     1)) + s(log(PAY_AMT5 + 1)) + s(log(PAY_AMT6 + 1)) + neg2_PAY_0 + 
##     duly_PAY_0 + zero_PAY_0 + delay_months_PAY_0 + neg2_PAY_2 + 
##     duly_PAY_2 + zero_PAY_2 + delay_months_PAY_2 + neg2_PAY_3 + 
##     duly_PAY_3 + zero_PAY_3 + delay_months_PAY_3 + neg2_PAY_4 + 
##     duly_PAY_4 + zero_PAY_4 + delay_months_PAY_4 + neg2_PAY_5 + 
##     duly_PAY_5 + zero_PAY_5 + delay_months_PAY_5 + neg2_PAY_6 + 
##     duly_PAY_6 + zero_PAY_6 + delay_months_PAY_6 + SEX + EDUCATION + 
##     MARRIAGE
gam_model = gam(f, data = train_new, family = binomial(link = "logit"), 
                standardize = FALSE)
gam_model$converged
## [1] TRUE
# To Visualize some "interesting" variables
(sum_gam = summary(gam_model))
## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## binary_default ~ s(log(LIMIT_BAL + 1)) + s(log(AGE + 1)) + s(log(BILL_AMT1 + 
##     1)) + s(log(BILL_AMT2 + 1)) + s(log(BILL_AMT3 + 1)) + s(log(BILL_AMT4 + 
##     1)) + s(log(BILL_AMT5 + 1)) + s(log(BILL_AMT6 + 1)) + s(log(PAY_AMT1 + 
##     1)) + s(log(PAY_AMT2 + 1)) + s(log(PAY_AMT3 + 1)) + s(log(PAY_AMT4 + 
##     1)) + s(log(PAY_AMT5 + 1)) + s(log(PAY_AMT6 + 1)) + neg2_PAY_0 + 
##     duly_PAY_0 + zero_PAY_0 + delay_months_PAY_0 + neg2_PAY_2 + 
##     duly_PAY_2 + zero_PAY_2 + delay_months_PAY_2 + neg2_PAY_3 + 
##     duly_PAY_3 + zero_PAY_3 + delay_months_PAY_3 + neg2_PAY_4 + 
##     duly_PAY_4 + zero_PAY_4 + delay_months_PAY_4 + neg2_PAY_5 + 
##     duly_PAY_5 + zero_PAY_5 + delay_months_PAY_5 + neg2_PAY_6 + 
##     duly_PAY_6 + zero_PAY_6 + delay_months_PAY_6 + SEX + EDUCATION + 
##     MARRIAGE
## 
## Parametric coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           1.27324    0.22820   5.579 2.41e-08 ***
## neg2_PAY_0            0.01365    0.11039   0.124 0.901609    
## duly_PAY_0            0.09678    0.10693   0.905 0.365417    
## zero_PAY_0           -0.87332    0.11970  -7.296 2.97e-13 ***
## delay_months_PAY_0    0.65598    0.05217  12.573  < 2e-16 ***
## neg2_PAY_2           -1.60326    0.21546  -7.441 9.98e-14 ***
## duly_PAY_2           -1.44753    0.20614  -7.022 2.19e-12 ***
## zero_PAY_2           -0.87396    0.21034  -4.155 3.25e-05 ***
## delay_months_PAY_2   -0.61168    0.09093  -6.727 1.74e-11 ***
## neg2_PAY_3           -0.48324    0.24771  -1.951 0.051081 .  
## duly_PAY_3           -0.49514    0.22591  -2.192 0.028398 *  
## zero_PAY_3           -0.45769    0.21649  -2.114 0.034503 *  
## delay_months_PAY_3   -0.08273    0.10290  -0.804 0.421412    
## neg2_PAY_4           -1.06168    0.31582  -3.362 0.000775 ***
## duly_PAY_4           -0.95467    0.29874  -3.196 0.001395 ** 
## zero_PAY_4           -0.93751    0.29058  -3.226 0.001254 ** 
## delay_months_PAY_4   -0.29172    0.14083  -2.071 0.038323 *  
## neg2_PAY_5           -0.46026    0.35239  -1.306 0.191512    
## duly_PAY_5           -0.62961    0.33807  -1.862 0.062551 .  
## zero_PAY_5           -0.44468    0.32959  -1.349 0.177270    
## delay_months_PAY_5   -0.09565    0.15863  -0.603 0.546534    
## neg2_PAY_6            0.51548    0.28960   1.780 0.075084 .  
## duly_PAY_6            0.42994    0.28229   1.523 0.127746    
## zero_PAY_6            0.24927    0.27376   0.911 0.362544    
## delay_months_PAY_6    0.34356    0.12992   2.645 0.008181 ** 
## SEXM                  0.12246    0.03652   3.353 0.000798 ***
## EDUCATIONhigh_school -0.05804    0.05640  -1.029 0.303471    
## EDUCATIONothers      -0.85740    0.40293  -2.128 0.033346 *  
## EDUCATIONuniversity  -0.02617    0.04218  -0.621 0.534892    
## EDUCATIONunknown     -1.29344    0.25682  -5.036 4.75e-07 ***
## MARRIAGEothers       -0.17247    0.15825  -1.090 0.275789    
## MARRIAGEsingle       -0.16830    0.04177  -4.030 5.59e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                         edf Ref.df Chi.sq  p-value    
## s(log(LIMIT_BAL + 1)) 2.573  3.231 88.366  < 2e-16 ***
## s(log(AGE + 1))       2.819  3.557  6.360   0.1332    
## s(log(BILL_AMT1 + 1)) 5.989  7.177 37.878 3.92e-06 ***
## s(log(BILL_AMT2 + 1)) 1.002  1.004  1.168   0.2806    
## s(log(BILL_AMT3 + 1)) 1.001  1.003  5.258   0.0219 *  
## s(log(BILL_AMT4 + 1)) 2.039  2.599  1.737   0.4889    
## s(log(BILL_AMT5 + 1)) 1.942  2.485  1.976   0.5231    
## s(log(BILL_AMT6 + 1)) 1.001  1.002  1.421   0.2336    
## s(log(PAY_AMT1 + 1))  5.264  6.342 36.080 4.58e-06 ***
## s(log(PAY_AMT2 + 1))  7.613  7.892 64.448 5.76e-11 ***
## s(log(PAY_AMT3 + 1))  5.030  6.180 14.605   0.0262 *  
## s(log(PAY_AMT4 + 1))  1.344  1.613  0.708   0.6977    
## s(log(PAY_AMT5 + 1))  1.625  2.026  3.273   0.1968    
## s(log(PAY_AMT6 + 1))  5.442  6.570 11.431   0.0927 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.219   Deviance explained =   19%
## UBRE = -0.13767  Scale est. = 1         n = 23999

Visualization of non-linear relationship between predictors and response variable

# Visualize s() outputs
terms = predict(gam_model, type="terms")

frame1 = as.data.frame(terms)
# Remember that 1 is negative, while 2 is positive
# vars=setdiff(colnames(train_new),c("default","binary_default"))

df10= sum_gam$chi.sq
names(df10)
##  [1] "s(log(LIMIT_BAL + 1))" "s(log(AGE + 1))"       "s(log(BILL_AMT1 + 1))"
##  [4] "s(log(BILL_AMT2 + 1))" "s(log(BILL_AMT3 + 1))" "s(log(BILL_AMT4 + 1))"
##  [7] "s(log(BILL_AMT5 + 1))" "s(log(BILL_AMT6 + 1))" "s(log(PAY_AMT1 + 1))" 
## [10] "s(log(PAY_AMT2 + 1))"  "s(log(PAY_AMT3 + 1))"  "s(log(PAY_AMT4 + 1))" 
## [13] "s(log(PAY_AMT5 + 1))"  "s(log(PAY_AMT6 + 1))"
vars = names(sum_gam$chi.sq)[which(sum_gam$s.pv<0.05)]
vars
## [1] "s(log(LIMIT_BAL + 1))" "s(log(BILL_AMT1 + 1))" "s(log(BILL_AMT3 + 1))"
## [4] "s(log(PAY_AMT1 + 1))"  "s(log(PAY_AMT2 + 1))"  "s(log(PAY_AMT3 + 1))"
clean_vars = gsub('[()]','',vars)
clean_vars = sub("slog",'',clean_vars)
clean_vars = gsub('\\+','',clean_vars)
clean_vars = gsub('\\s','', clean_vars)
clean_vars = substr(clean_vars,1,nchar(clean_vars)-1)

frame1_visual = frame1[,vars]
train_gam_visual = train[,clean_vars]

# Long pivot
frame1_visual_long = unpivot_to_blocks(frame1_visual, nameForNewKeyColumn ="basis_function",
                                       nameForNewValueColumn = "basis_values",
                                       columnsToTakeFrom = vars)
train_gam_visual_long = unpivot_to_blocks(train_gam_visual, 
                                          nameForNewKeyColumn = "predictors",
                                          nameForNewValueColumn = "values",
                                          columnsToTakeFrom = clean_vars)

dat_visual = cbind(frame1_visual_long, train_gam_visual_long)

dat_visual %>% ggplot(aes(x = values, y = basis_values)) +
  geom_smooth() +
  facet_wrap(~predictors, ncol = 3, scales = "free")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

Model evaluation

GAM performances evaluated using training set (above) and test set (below).

##  accuracy    recall precision  f1_score       AUC 
##    0.8212    0.3674    0.6766    0.4762    0.7843
##  accuracy    recall precision  f1_score       AUC 
##    0.8142    0.3449    0.6515    0.4510    0.7560

kNN

Data transformation

Perform one-hot encoding on categorical features: SEX, MARRIAGE and EDUCATION.

# Use train_new
dummy=dummyVars("~.",data=train_new[,f_cat])
cat_new=data.frame(predict(dummy,newdata=train_new[,f_cat]))
var_removed = c("binary_default", f_cat)

train_onehot = train_new[, !(colnames(train_new) %in% var_removed)]
train_onehot = cbind(cat_new, train_onehot)

# Test data
cat_new = data.frame(predict(dummy,newdata=test_new[,f_cat]))
test_onehot = test_new[, !(colnames(test_new) %in% var_removed)]
test_onehot = cbind(cat_new, test_onehot)

cat('Training data no. of samples (kNN): ', dim(train_onehot)[1], 
    ' ; no. of features: ', dim(train_onehot)[2])
## Training data no. of samples (kNN):  23999  ; no. of features:  49
cat('Test data no. of samples (kNN): ', dim(test_onehot)[1], 
    ' ; no. of features: ', dim(test_onehot)[2])
## Test data no. of samples (kNN):  6001  ; no. of features:  49

Choose the best k (hyperparameter)

Hyperparameter k (number of nearest neighbors) needs to be fine-tuned. Too small of a k leads to overfitting (wiggly decision boundary and thus poor generalization); if k is too high, it would incur high computational time and results in low-variance high bias model. Refer to https://www.statlearning.com/ pg.41 for more details. The rule of thumb of the optimal k is \[\sqrt{m}\] where m is the number of training samples.

# Use train-test validation approach.
# First, prepare both training and test data.
set.seed(3)
split = initial_split(train_onehot, prop = 0.8, strata = "default")
train_knn = training(split)
test_knn = testing(split)

target_train = train_knn$default==1
target_test = as.numeric(test_knn$default)-1
train_knn = train_knn[, !(colnames(train_knn) %in% c("default"))]
test_knn = test_knn[, !(colnames(test_knn) %in% c("default"))]

# Set the range of k
k_range = seq(from = 115, to = 175, by = 2)

AUC_vec = rep(NA, length(k_range))
iter = 0
for (K in (k_range)) {
  iter = iter + 1
  knn_pred = knn(train_knn, test_knn, target_train, k=K)
  AUC_vec[iter] = calcAUC(ifelse(knn_pred==TRUE, 1, 0), target_test)
}

df_plot_k = data.frame(k = k_range, AUC = AUC_vec)
k_opt = df_plot_k[which.max(AUC_vec),1]

df_plot_k %>% 
  ggplot(aes(x = k, y = AUC)) +
  geom_line() +
  geom_vline(xintercept = k_opt, linetype = 2, color = 'red') +
  xlab('Number of nearest neighbors, k') +
  ylab('AUC')

Model evaluation

knn_pred = knn(train_onehot, test_onehot, (train_onehot$default==1), 
               k=k_opt, prob = TRUE)

prop_vote = attr(knn_pred,'prob')
prob_pos = ifelse(knn_pred==FALSE, 1-prop_vote, prop_vote)
test$pred_knn = prob_pos

(perf_test_knn = performance(truth = test_new$default, pred = prob_pos))
##  accuracy    recall precision  f1_score       AUC 
##    0.8994    0.6295    0.8819    0.7346    0.9703

Findings and summary

Tabulation of performances of classifiers

kable(rbind(perf_test_logreg, perf_test_lasso, perf_test_gam, perf_test_knn))
accuracy recall precision f1_score AUC
perf_test_logreg 0.8140 0.3456 0.6501 0.4513 0.7481
perf_test_lasso 0.8022 0.2304 0.6497 0.3402 0.7385
perf_test_gam 0.8142 0.3449 0.6515 0.4510 0.7560
perf_test_knn 0.8994 0.6295 0.8819 0.7346 0.9703

ROC plot

test$binary_default = (test$default==1)
ROCPlotList(
  frame = test,
  xvar_names = c("pred_logreg", "pred_lasso", "pred_gam", "pred_knn"),
  truthVar = "binary_default", truthTarget = TRUE,
  title = "ROC plots for all classifiers"
)

Classifiers’ score threshold manipulation

Set the classifier’s score threshold to be 0.2.

perf_test_logreg = performance(truth = test$default, pred = test$pred_logreg, 
                               threshold = 0.2)
perf_test_lasso = performance(test$default, test$pred_lasso, threshold = 0.2)
perf_test_gam = performance(test$default, test$pred_gam, threshold = 0.2)
perf_test_knn = performance(test$default, test$pred_knn, threshold = 0.2)
kable(rbind(perf_test_logreg, perf_test_lasso, perf_test_gam, perf_test_knn), 
      caption = "test performances for different models (thresholds=0.2)")
test performances for different models (thresholds=0.2)
accuracy recall precision f1_score AUC
perf_test_logreg 0.7445 0.5836 0.4416 0.5028 0.7481
perf_test_lasso 0.7580 0.5655 0.4619 0.5085 0.7385
perf_test_gam 0.7327 0.6152 0.4277 0.5046 0.7560
perf_test_knn 0.9100 0.9646 0.7221 0.8259 0.9703

Computing environment

##                _                           
## platform       x86_64-w64-mingw32          
## arch           x86_64                      
## os             mingw32                     
## system         x86_64, mingw32             
## status                                     
## major          4                           
## minor          0.2                         
## year           2020                        
## month          06                          
## day            22                          
## svn rev        78730                       
## language       R                           
## version.string R version 4.0.2 (2020-06-22)
## nickname       Taking Off Again

Reference

  1. Yeh, I. C., & Lien, C. H. (2009). The comparisons of data mining techniques for the predictive accuracy of probability of default of credit card clients. Expert Systems with Applications, 36(2), 2473-2480.