Load all the required R packages.
# install the required packages first
library(readxl) # load excel files
library(cdata) # data wrangling
## Loading required package: wrapr
library(dplyr) # tidy up the data (data wrangling)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:wrapr':
##
## coalesce
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(rsample) # stratified sampling
library(caret) # preprocessing (min-max normalization)
## Loading required package: lattice
## Loading required package: ggplot2
library(sigr) # for AUC calculation
library(knitr) # for table presentation
library(mgcv) # GAM model
## Loading required package: nlme
##
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
##
## collapse
## This is mgcv 1.8-31. For overview type 'help("mgcv-package")'.
##
## Attaching package: 'mgcv'
## The following object is masked from 'package:wrapr':
##
## %.%
library(glue) # for formula construction
##
## Attaching package: 'glue'
## The following object is masked from 'package:nlme':
##
## collapse
## The following object is masked from 'package:dplyr':
##
## collapse
library(WVPlots) # double density plots and ROC curves
library(car) # VIF calculation
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:wrapr':
##
## bc
library(glmnet) # Lasso model
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:wrapr':
##
## pack, unpack
## Loaded glmnet 4.1-2
library(glmnetUtils) # Lasso model
##
## Attaching package: 'glmnetUtils'
## The following objects are masked from 'package:glmnet':
##
## cv.glmnet, glmnet
library(class) # kNN
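If any of these packages are missing, they can be installed from CRAN first (a one-off step; the list mirrors the library calls above):
# install.packages(c("readxl", "cdata", "dplyr", "rsample", "caret", "sigr", "knitr",
#                    "mgcv", "glue", "WVPlots", "car", "glmnet", "glmnetUtils", "class"))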
The code is the same as in this RPubs document 7 and will not be shown here.
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(response)` instead of `response` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
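For orientation, here is a minimal sketch (not the original code) of what that hidden chunk presumably does, assuming the UCI "default of credit card clients" Excel file and the objects referenced later (train, test, features_cat); the actual recoding details are in the linked RPubs document:
# data = read_excel("default of credit card clients.xls", skip = 1)  # file name assumed
# ... recode EDUCATION/MARRIAGE levels and rename the response to 'default' ...
# split = initial_split(data, prop = 0.8, strata = "default")  # stratified 80/20 split
# train = training(split)
# test = testing(split)
# features_cat = c("SEX", "EDUCATION", "MARRIAGE",
#                  "PAY_0", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6")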
Min-max normalization puts all continuous predictors on a common [0, 1] scale; each value x is mapped to (x - min)/(max - min).
# Select the continuous features
features_cont = colnames(train)[c(1, 5, 12:23)]
train_cont = train[,features_cont]
# Min-max normalization on train data (continuous features)
preprocess_min_max = preProcess(train_cont, method = c("range"))
scaled_train_cont = predict(preprocess_min_max, train_cont)
train_scaled = cbind(scaled_train_cont, train[, features_cat], default = train$default)
Next, encode the repayment-status features (PAY_0, PAY_2 to PAY_6) with a mix of one-hot encoding (for the special values -2, -1 and 0) and ordinal encoding (for the number of months of payment delay).
m_train = dim(train_scaled)[1]
my_cat_encode = function(predictor, ncol, col_names, m){
  # predictor: vector of PAY_* values; ncol: number of output columns
  # col_names: column headers for the output; m: number of samples (rows)
  mat = matrix(NA, nrow = m, ncol = ncol)
  for (i in (1:m)){
    data = predictor[i]
    if (data == -2){
      vec = c(1,0,0,0)
    } else if (data == -1){
      vec = c(0,1,0,0)
    } else if (data == 0){
      vec = c(0,0,1,0)
    } else {
      # positive values are months of payment delay, kept as an ordinal value
      vec = c(0,0,0,data)
    }
    mat[i,] = vec
  }
  df_ps0 = data.frame(mat)
  colnames(df_ps0) = col_names
  df_ps0
}
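As a quick illustration on a hypothetical toy vector, the encoder turns each of -2, -1 and 0 into its own indicator column and passes positive values (months of delay) through unchanged:
# Toy example (not part of the original pipeline)
my_cat_encode(c(-2, -1, 0, 3), ncol = 4,
              col_names = c("neg2", "duly", "zero", "delay_months"), m = 4)
##   neg2 duly zero delay_months
## 1    1    0    0            0
## 2    0    1    0            0
## 3    0    0    1            0
## 4    0    0    0            3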
df = data.frame(matrix(, nrow = m_train, ncol = 0))
f_pay = c("PAY_0", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6")
for (j in f_pay){
  predictor = train_scaled[,j]
  col_names = sapply(c("neg2_{j}", "duly_{j}", "zero_{j}", "delay_months_{j}"), glue)
  names(col_names) = NULL
  df1 = my_cat_encode(predictor, ncol = 4, col_names = col_names, m = m_train)
  df = cbind(df, df1)
}
# concatenate original dataframe
train_new = cbind(train_scaled, df)
train_new = train_new[, !(colnames(train_new) %in% f_pay)]
The same preprocessing pipeline is applied to the test data, reusing the scaling parameters fitted on the training data.
# For test data
scaled_test_cont = predict(preprocess_min_max, test[,features_cont])
test_scaled = cbind(scaled_test_cont, test[, features_cat], default = test$default)
m_test = dim(test_scaled)[1]
# Repeat same pipeline as training data
df = data.frame(matrix(, nrow = m_test, ncol = 0))
f_pay = c("PAY_0", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6")
for (j in f_pay){
  predictor = test_scaled[,j]
  col_names = sapply(c("neg2_{j}", "duly_{j}", "zero_{j}", "delay_months_{j}"), glue)
  names(col_names) = NULL
  df1 = my_cat_encode(predictor, ncol = 4, col_names = col_names, m = m_test)
  df = cbind(df, df1)
}
test_new = cbind(test_scaled, df)
test_new = test_new[, !(colnames(test_new) %in% f_pay)]
cat('Training data no. of samples: ', dim(train_new)[1],
' ; no. of features: ', dim(train_new)[2])
## Training data no. of samples: 23999 ; no. of features: 42
cat('Test data no. of samples: ', dim(test_new)[1],
' ; no. of features: ', dim(test_new)[2])
## Test data no. of samples: 6001 ; no. of features: 42
We end up with 41 predictors (42 columns including the response). This transformed training data will be used to fit logistic regression (LR), regularized logistic regression (lasso), and a generalized additive model (GAM).
First, I write a function to calculate accuracy, precision, recall, F1 score and AUC, where AUC is the area under the receiver operating characteristic (ROC) curve.
performance = function(truth, pred, threshold=0.5){
  # pred is expected to be a vector of posterior probabilities (doubles)
  if (typeof(pred) == 'double'){
    tab = table(truth, pred >= threshold)
  } else {
    stop('The model prediction must be either scores or probability estimates')
  }
  acc = sum(diag(tab))/sum(tab)
  recall = tab[2,2]/sum(tab[2,])      # true positives / actual positives
  precision = tab[2,2]/sum(tab[,2])   # true positives / predicted positives
  f1_score = (2*recall*precision)/(recall+precision)
  # For AUC, convert the factor response to a 0/1 numeric vector
  AUC = calcAUC(pred, as.numeric(truth)-1)
  round(c(accuracy = acc, recall = recall, precision = precision,
          f1_score = f1_score, AUC = AUC), 4)
}
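A quick sanity check on hypothetical toy inputs (two positives, two negatives):
# Toy check: accuracy/recall/precision/F1 should all be 0.5 and AUC 0.75
toy_truth = factor(c(0, 0, 1, 1))
toy_pred = c(0.1, 0.6, 0.4, 0.8)
performance(toy_truth, toy_pred)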
Logarithmically transform the positively skewed continuous predictors, as shown in the code chunks below; the +1 offset keeps the log defined at zero (the predictors are already min-max scaled to [0, 1]).
# Construct formula with logarithmic transformation on continuous predictors
variables = glue('log({features_cont}+1)')
response = 'default'
cat_var = setdiff(colnames(train_new), c(features_cont, response))
all_vars = c(variables, cat_var)
f = as.formula(
paste(response, paste(all_vars, collapse = " + "), sep = " ~ ")
)
print(f)
## default ~ log(LIMIT_BAL + 1) + log(AGE + 1) + log(BILL_AMT1 +
## 1) + log(BILL_AMT2 + 1) + log(BILL_AMT3 + 1) + log(BILL_AMT4 +
## 1) + log(BILL_AMT5 + 1) + log(BILL_AMT6 + 1) + log(PAY_AMT1 +
## 1) + log(PAY_AMT2 + 1) + log(PAY_AMT3 + 1) + log(PAY_AMT4 +
## 1) + log(PAY_AMT5 + 1) + log(PAY_AMT6 + 1) + SEX + EDUCATION +
## MARRIAGE + neg2_PAY_0 + duly_PAY_0 + zero_PAY_0 + delay_months_PAY_0 +
## neg2_PAY_2 + duly_PAY_2 + zero_PAY_2 + delay_months_PAY_2 +
## neg2_PAY_3 + duly_PAY_3 + zero_PAY_3 + delay_months_PAY_3 +
## neg2_PAY_4 + duly_PAY_4 + zero_PAY_4 + delay_months_PAY_4 +
## neg2_PAY_5 + duly_PAY_5 + zero_PAY_5 + delay_months_PAY_5 +
## neg2_PAY_6 + duly_PAY_6 + zero_PAY_6 + delay_months_PAY_6
model_logreg = glm(f, data = train_new, family = binomial(link = "logit"))
summary(model_logreg)
##
## Call:
## glm(formula = f, family = binomial(link = "logit"), data = train_new)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2834 -0.5932 -0.5021 -0.3033 3.1770
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.623290 0.537022 1.161 0.245788
## log(LIMIT_BAL + 1) -2.370658 0.233152 -10.168 < 2e-16 ***
## log(AGE + 1) 0.299972 0.156123 1.921 0.054684 .
## log(BILL_AMT1 + 1) -1.131413 1.744292 -0.649 0.516573
## log(BILL_AMT2 + 1) 1.729930 2.032756 0.851 0.394754
## log(BILL_AMT3 + 1) 6.203739 3.046040 2.037 0.041684 *
## log(BILL_AMT4 + 1) -0.174320 1.736196 -0.100 0.920024
## log(BILL_AMT5 + 1) -1.630793 2.082356 -0.783 0.433540
## log(BILL_AMT6 + 1) 1.586931 2.415310 0.657 0.511162
## log(PAY_AMT1 + 1) -11.022535 2.415819 -4.563 5.05e-06 ***
## log(PAY_AMT2 + 1) -16.484680 4.092820 -4.028 5.63e-05 ***
## log(PAY_AMT3 + 1) -1.495719 1.954620 -0.765 0.444139
## log(PAY_AMT4 + 1) -1.042464 1.388120 -0.751 0.452659
## log(PAY_AMT5 + 1) -1.908810 0.994503 -1.919 0.054939 .
## log(PAY_AMT6 + 1) -1.953664 0.905576 -2.157 0.030977 *
## SEXM 0.146167 0.036082 4.051 5.10e-05 ***
## EDUCATIONhigh_school -0.020145 0.055910 -0.360 0.718616
## EDUCATIONothers -0.873374 0.403451 -2.165 0.030406 *
## EDUCATIONuniversity 0.002154 0.041825 0.051 0.958932
## EDUCATIONunknown -1.262168 0.255757 -4.935 8.01e-07 ***
## MARRIAGEothers -0.140138 0.157815 -0.888 0.374546
## MARRIAGEsingle -0.158288 0.040954 -3.865 0.000111 ***
## neg2_PAY_0 -0.216275 0.109050 -1.983 0.047337 *
## duly_PAY_0 0.038243 0.106513 0.359 0.719559
## zero_PAY_0 -0.938408 0.118341 -7.930 2.20e-15 ***
## delay_months_PAY_0 0.636710 0.051449 12.376 < 2e-16 ***
## neg2_PAY_2 -1.324896 0.212811 -6.226 4.79e-10 ***
## duly_PAY_2 -1.238668 0.204339 -6.062 1.35e-09 ***
## zero_PAY_2 -0.784737 0.209117 -3.753 0.000175 ***
## delay_months_PAY_2 -0.533255 0.089892 -5.932 2.99e-09 ***
## neg2_PAY_3 -0.438103 0.244374 -1.793 0.073012 .
## duly_PAY_3 -0.545635 0.221681 -2.461 0.013841 *
## zero_PAY_3 -0.491314 0.212467 -2.312 0.020754 *
## delay_months_PAY_3 -0.059587 0.100393 -0.594 0.552819
## neg2_PAY_4 -0.902047 0.315394 -2.860 0.004236 **
## duly_PAY_4 -0.861328 0.297962 -2.891 0.003843 **
## zero_PAY_4 -0.875994 0.290268 -3.018 0.002546 **
## delay_months_PAY_4 -0.282034 0.139951 -2.015 0.043879 *
## neg2_PAY_5 -0.395381 0.354870 -1.114 0.265212
## duly_PAY_5 -0.564350 0.340586 -1.657 0.097520 .
## zero_PAY_5 -0.415625 0.332329 -1.251 0.211064
## delay_months_PAY_5 -0.093923 0.159758 -0.588 0.556593
## neg2_PAY_6 0.675856 0.289830 2.332 0.019706 *
## duly_PAY_6 0.540918 0.282701 1.913 0.055697 .
## zero_PAY_6 0.345902 0.274818 1.259 0.208155
## delay_months_PAY_6 0.391511 0.130558 2.999 0.002711 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 25362 on 23998 degrees of freedom
## Residual deviance: 20803 on 23953 degrees of freedom
## AIC: 20895
##
## Number of Fisher Scoring iterations: 5
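As a quick gauge of fit (a computation added here for context, not in the original output), McFadden's pseudo-R-squared follows directly from the reported deviances:
# McFadden's pseudo-R-squared = 1 - residual deviance / null deviance
1 - 20803/25362  # ~0.18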
A VIF above 5 indicates strong multicollinearity, i.e. the predictor is highly collinear with the other variables (see this link). Multicollinearity makes it difficult to establish the relationship between predictors and the target variable, because the individual effects of collinear variables on the response cannot be separated out. Regularization is a good way to address multicollinearity, as it shrinks the coefficient magnitudes.
kable(data.frame(vif(model_logreg)), caption = "VIF for predictors")
| | GVIF | Df | GVIF^(1/(2*Df)) |
|---|---|---|---|
| log(LIMIT_BAL + 1) | 1.755492 | 1 | 1.324950 |
| log(AGE + 1) | 1.419511 | 1 | 1.191432 |
| log(BILL_AMT1 + 1) | 23.888635 | 1 | 4.887600 |
| log(BILL_AMT2 + 1) | 39.971787 | 1 | 6.322324 |
| log(BILL_AMT3 + 1) | 29.159909 | 1 | 5.399991 |
| log(BILL_AMT4 + 1) | 26.650667 | 1 | 5.162428 |
| log(BILL_AMT5 + 1) | 32.676032 | 1 | 5.716295 |
| log(BILL_AMT6 + 1) | 19.694999 | 1 | 4.437905 |
| log(PAY_AMT1 + 1) | 1.522892 | 1 | 1.234055 |
| log(PAY_AMT2 + 1) | 1.511449 | 1 | 1.229410 |
| log(PAY_AMT3 + 1) | 1.549805 | 1 | 1.244912 |
| log(PAY_AMT4 + 1) | 1.536855 | 1 | 1.239699 |
| log(PAY_AMT5 + 1) | 1.605836 | 1 | 1.267216 |
| log(PAY_AMT6 + 1) | 1.144700 | 1 | 1.069906 |
| SEX | 1.029312 | 1 | 1.014550 |
| EDUCATION | 1.250969 | 4 | 1.028385 |
| MARRIAGE | 1.364017 | 2 | 1.080699 |
| neg2_PAY_0 | 2.689367 | 1 | 1.639929 |
| duly_PAY_0 | 5.573788 | 1 | 2.360887 |
| zero_PAY_0 | 10.819927 | 1 | 3.289366 |
| delay_months_PAY_0 | 6.098382 | 1 | 2.469490 |
| neg2_PAY_2 | 16.582334 | 1 | 4.072141 |
| duly_PAY_2 | 20.570947 | 1 | 4.535521 |
| zero_PAY_2 | 35.267327 | 1 | 5.938630 |
| delay_months_PAY_2 | 22.769923 | 1 | 4.771784 |
| neg2_PAY_3 | 23.566585 | 1 | 4.854543 |
| duly_PAY_3 | 23.487948 | 1 | 4.846437 |
| zero_PAY_3 | 36.628169 | 1 | 6.052121 |
| delay_months_PAY_3 | 26.868115 | 1 | 5.183446 |
| neg2_PAY_4 | 41.488296 | 1 | 6.441141 |
| duly_PAY_4 | 41.912096 | 1 | 6.473955 |
| zero_PAY_4 | 68.433990 | 1 | 8.272484 |
| delay_months_PAY_4 | 46.683270 | 1 | 6.832516 |
| neg2_PAY_5 | 55.485950 | 1 | 7.448889 |
| duly_PAY_5 | 52.974433 | 1 | 7.278354 |
| zero_PAY_5 | 89.329811 | 1 | 9.451445 |
| delay_months_PAY_5 | 53.776321 | 1 | 7.333234 |
| neg2_PAY_6 | 39.470716 | 1 | 6.282572 |
| duly_PAY_6 | 38.232810 | 1 | 6.183269 |
| zero_PAY_6 | 61.363627 | 1 | 7.833494 |
| delay_months_PAY_6 | 35.558670 | 1 | 5.963109 |
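For intuition, the VIF of a predictor is 1/(1 - R_j^2), where R_j^2 is the R-squared from regressing that predictor on all the others; a minimal cross-check of this definition on a built-in dataset (not the credit data) illustrates the idea:
# Demonstrate VIF_j = 1/(1 - R_j^2) on mtcars
m = lm(mpg ~ disp + cyl + hp + wt, data = mtcars)
vif(m)["disp"]  # car's VIF for disp
1 / (1 - summary(lm(disp ~ cyl + hp + wt, data = mtcars))$r.squared)  # same value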
LR model performance evaluation using training and test data.
pred = predict(model_logreg, newdata = train_new, type = "response")
pred_test = predict(model_logreg, newdata = test_new, type = "response")
test$pred_logreg = pred_test
test$binary_default = (test$default==1)
(perf_train_logreg = performance(train_new$default, pred = pred))
## accuracy recall precision f1_score AUC
## 0.8210 0.3661 0.6761 0.4749 0.7743
(perf_test_logreg = performance(test_new$default, pred = pred_test))
## accuracy recall precision f1_score AUC
## 0.8140 0.3456 0.6501 0.4513 0.7481
For probabilistic models, it is useful to construct a double density plot. Refer to this online e-book for more information.
DoubleDensityPlot(test, xvar = "pred_logreg", truthVar = "default",
title = "Distribution of scores of LR model in test data") +
geom_vline(xintercept = 0.5, color = 'red', linetype = 2)
Visualize the model's enrichment-recall trade-off as a function of the threshold (enrichment is precision divided by the overall default rate).
train$pred = pred
train$binary_default = train$default == 1
PRTPlot(train, 'pred', 'binary_default', TRUE,
plotvars = c('enrichment', 'recall'), thresholdrange = c(0,1),
title = 'Enrichment/Recall vs thresholds for LR model')
The double density plot and the enrichment-recall plot are helpful for picking the classifier's score threshold.
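One way to act on these plots, sketched here over an assumed grid of candidate thresholds, is to sweep the test-set metrics and read off the trade-off directly:
# Sweep thresholds for the LR model on the test set (grid chosen for illustration)
thresholds = seq(0.1, 0.6, by = 0.1)
sweep = t(sapply(thresholds, function(th)
  performance(test_new$default, test$pred_logreg, threshold = th)))
data.frame(threshold = thresholds, sweep)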
# alpha = 1 selects the lasso penalty (alpha = 0 would fit ridge regression);
# standardize = FALSE since the predictors are already min-max scaled
model_lasso = cv.glmnet(f, data = train_new, alpha = 1,
                        family = "binomial", standardize = FALSE)
# summary(model_lasso)
coeff = coef(model_lasso)
coef_frame <- data.frame(coef = rownames(coeff)[-1],
value = coeff[-1,1])
ggplot(coef_frame, aes(x = coef, y = value)) +
geom_pointrange(aes(ymin = 0, ymax = value)) +
ggtitle("Coefficients of lasso model") +
coord_flip() +
theme(axis.text.y = element_text(size = 6))
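Note that coef() on a cv.glmnet object defaults to the lambda.1se solution (the most regularized model within one standard error of the minimum cross-validated error); the minimum-error fit can be pulled out explicitly:
# Coefficients at the lambda that minimizes cross-validated deviance
coef(model_lasso, s = "lambda.min")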
Lasso performance evaluated on the training set (first block) and the test set (second block); the evaluation code mirrors the LR case and is not shown.
## accuracy recall precision f1_score AUC
## 0.8112 0.2611 0.6947 0.3796 0.7588
## accuracy recall precision f1_score AUC
## 0.8022 0.2304 0.6497 0.3402 0.7385
Next, fit a generalized additive model (GAM), replacing each log-transformed continuous predictor with a spline smooth term s().
# Construct the formula
variables = glue('s(log({features_cont}+1))')
f_add = tail(colnames(train_new), 24) # the 24 encoded PAY_* columns
f_cat = c("SEX", "EDUCATION", "MARRIAGE")
all_vars = c(variables, c(f_add,f_cat))
# Change the response to binary variable (Boolean)
train_new$binary_default = (train_new$default==1)
f = as.formula(
paste("binary_default", paste(all_vars, collapse = " + "), sep = " ~ ")
)
print(f)
## binary_default ~ s(log(LIMIT_BAL + 1)) + s(log(AGE + 1)) + s(log(BILL_AMT1 +
## 1)) + s(log(BILL_AMT2 + 1)) + s(log(BILL_AMT3 + 1)) + s(log(BILL_AMT4 +
## 1)) + s(log(BILL_AMT5 + 1)) + s(log(BILL_AMT6 + 1)) + s(log(PAY_AMT1 +
## 1)) + s(log(PAY_AMT2 + 1)) + s(log(PAY_AMT3 + 1)) + s(log(PAY_AMT4 +
## 1)) + s(log(PAY_AMT5 + 1)) + s(log(PAY_AMT6 + 1)) + neg2_PAY_0 +
## duly_PAY_0 + zero_PAY_0 + delay_months_PAY_0 + neg2_PAY_2 +
## duly_PAY_2 + zero_PAY_2 + delay_months_PAY_2 + neg2_PAY_3 +
## duly_PAY_3 + zero_PAY_3 + delay_months_PAY_3 + neg2_PAY_4 +
## duly_PAY_4 + zero_PAY_4 + delay_months_PAY_4 + neg2_PAY_5 +
## duly_PAY_5 + zero_PAY_5 + delay_months_PAY_5 + neg2_PAY_6 +
## duly_PAY_6 + zero_PAY_6 + delay_months_PAY_6 + SEX + EDUCATION +
## MARRIAGE
gam_model = gam(f, data = train_new, family = binomial(link = "logit"))
gam_model$converged
## [1] TRUE
# Summarize the model; smooth-term significance guides which variables to visualize
(sum_gam = summary(gam_model))
##
## Family: binomial
## Link function: logit
##
## Formula:
## binary_default ~ s(log(LIMIT_BAL + 1)) + s(log(AGE + 1)) + s(log(BILL_AMT1 +
## 1)) + s(log(BILL_AMT2 + 1)) + s(log(BILL_AMT3 + 1)) + s(log(BILL_AMT4 +
## 1)) + s(log(BILL_AMT5 + 1)) + s(log(BILL_AMT6 + 1)) + s(log(PAY_AMT1 +
## 1)) + s(log(PAY_AMT2 + 1)) + s(log(PAY_AMT3 + 1)) + s(log(PAY_AMT4 +
## 1)) + s(log(PAY_AMT5 + 1)) + s(log(PAY_AMT6 + 1)) + neg2_PAY_0 +
## duly_PAY_0 + zero_PAY_0 + delay_months_PAY_0 + neg2_PAY_2 +
## duly_PAY_2 + zero_PAY_2 + delay_months_PAY_2 + neg2_PAY_3 +
## duly_PAY_3 + zero_PAY_3 + delay_months_PAY_3 + neg2_PAY_4 +
## duly_PAY_4 + zero_PAY_4 + delay_months_PAY_4 + neg2_PAY_5 +
## duly_PAY_5 + zero_PAY_5 + delay_months_PAY_5 + neg2_PAY_6 +
## duly_PAY_6 + zero_PAY_6 + delay_months_PAY_6 + SEX + EDUCATION +
## MARRIAGE
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.27324 0.22820 5.579 2.41e-08 ***
## neg2_PAY_0 0.01365 0.11039 0.124 0.901609
## duly_PAY_0 0.09678 0.10693 0.905 0.365417
## zero_PAY_0 -0.87332 0.11970 -7.296 2.97e-13 ***
## delay_months_PAY_0 0.65598 0.05217 12.573 < 2e-16 ***
## neg2_PAY_2 -1.60326 0.21546 -7.441 9.98e-14 ***
## duly_PAY_2 -1.44753 0.20614 -7.022 2.19e-12 ***
## zero_PAY_2 -0.87396 0.21034 -4.155 3.25e-05 ***
## delay_months_PAY_2 -0.61168 0.09093 -6.727 1.74e-11 ***
## neg2_PAY_3 -0.48324 0.24771 -1.951 0.051081 .
## duly_PAY_3 -0.49514 0.22591 -2.192 0.028398 *
## zero_PAY_3 -0.45769 0.21649 -2.114 0.034503 *
## delay_months_PAY_3 -0.08273 0.10290 -0.804 0.421412
## neg2_PAY_4 -1.06168 0.31582 -3.362 0.000775 ***
## duly_PAY_4 -0.95467 0.29874 -3.196 0.001395 **
## zero_PAY_4 -0.93751 0.29058 -3.226 0.001254 **
## delay_months_PAY_4 -0.29172 0.14083 -2.071 0.038323 *
## neg2_PAY_5 -0.46026 0.35239 -1.306 0.191512
## duly_PAY_5 -0.62961 0.33807 -1.862 0.062551 .
## zero_PAY_5 -0.44468 0.32959 -1.349 0.177270
## delay_months_PAY_5 -0.09565 0.15863 -0.603 0.546534
## neg2_PAY_6 0.51548 0.28960 1.780 0.075084 .
## duly_PAY_6 0.42994 0.28229 1.523 0.127746
## zero_PAY_6 0.24927 0.27376 0.911 0.362544
## delay_months_PAY_6 0.34356 0.12992 2.645 0.008181 **
## SEXM 0.12246 0.03652 3.353 0.000798 ***
## EDUCATIONhigh_school -0.05804 0.05640 -1.029 0.303471
## EDUCATIONothers -0.85740 0.40293 -2.128 0.033346 *
## EDUCATIONuniversity -0.02617 0.04218 -0.621 0.534892
## EDUCATIONunknown -1.29344 0.25682 -5.036 4.75e-07 ***
## MARRIAGEothers -0.17247 0.15825 -1.090 0.275789
## MARRIAGEsingle -0.16830 0.04177 -4.030 5.59e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(log(LIMIT_BAL + 1)) 2.573 3.231 88.366 < 2e-16 ***
## s(log(AGE + 1)) 2.819 3.557 6.360 0.1332
## s(log(BILL_AMT1 + 1)) 5.989 7.177 37.878 3.92e-06 ***
## s(log(BILL_AMT2 + 1)) 1.002 1.004 1.168 0.2806
## s(log(BILL_AMT3 + 1)) 1.001 1.003 5.258 0.0219 *
## s(log(BILL_AMT4 + 1)) 2.039 2.599 1.737 0.4889
## s(log(BILL_AMT5 + 1)) 1.942 2.485 1.976 0.5231
## s(log(BILL_AMT6 + 1)) 1.001 1.002 1.421 0.2336
## s(log(PAY_AMT1 + 1)) 5.264 6.342 36.080 4.58e-06 ***
## s(log(PAY_AMT2 + 1)) 7.613 7.892 64.448 5.76e-11 ***
## s(log(PAY_AMT3 + 1)) 5.030 6.180 14.605 0.0262 *
## s(log(PAY_AMT4 + 1)) 1.344 1.613 0.708 0.6977
## s(log(PAY_AMT5 + 1)) 1.625 2.026 3.273 0.1968
## s(log(PAY_AMT6 + 1)) 5.442 6.570 11.431 0.0927 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.219 Deviance explained = 19%
## UBRE = -0.13767 Scale est. = 1 n = 23999
# Visualize s() outputs
terms = predict(gam_model, type="terms")
frame1 = as.data.frame(terms)
df10= sum_gam$chi.sq
names(df10)
## [1] "s(log(LIMIT_BAL + 1))" "s(log(AGE + 1))" "s(log(BILL_AMT1 + 1))"
## [4] "s(log(BILL_AMT2 + 1))" "s(log(BILL_AMT3 + 1))" "s(log(BILL_AMT4 + 1))"
## [7] "s(log(BILL_AMT5 + 1))" "s(log(BILL_AMT6 + 1))" "s(log(PAY_AMT1 + 1))"
## [10] "s(log(PAY_AMT2 + 1))" "s(log(PAY_AMT3 + 1))" "s(log(PAY_AMT4 + 1))"
## [13] "s(log(PAY_AMT5 + 1))" "s(log(PAY_AMT6 + 1))"
vars = names(sum_gam$chi.sq)[which(sum_gam$s.pv<0.05)]
vars
## [1] "s(log(LIMIT_BAL + 1))" "s(log(BILL_AMT1 + 1))" "s(log(BILL_AMT3 + 1))"
## [4] "s(log(PAY_AMT1 + 1))" "s(log(PAY_AMT2 + 1))" "s(log(PAY_AMT3 + 1))"
# Strip the "s(log(... + 1))" decoration to recover the raw predictor names
clean_vars = gsub('[()]','',vars)
clean_vars = sub("slog",'',clean_vars)
clean_vars = gsub('\\+','',clean_vars)
clean_vars = gsub('\\s','', clean_vars)
clean_vars = substr(clean_vars,1,nchar(clean_vars)-1)
frame1_visual = frame1[,vars]
train_gam_visual = train[,clean_vars]
# Long pivot
frame1_visual_long = unpivot_to_blocks(frame1_visual, nameForNewKeyColumn ="basis_function",
nameForNewValueColumn = "basis_values",
columnsToTakeFrom = vars)
train_gam_visual_long = unpivot_to_blocks(train_gam_visual,
nameForNewKeyColumn = "predictors",
nameForNewValueColumn = "values",
columnsToTakeFrom = clean_vars)
dat_visual = cbind(frame1_visual_long, train_gam_visual_long)
dat_visual %>% ggplot(aes(x = values, y = basis_values)) +
geom_smooth() +
facet_wrap(~predictors, ncol = 3, scales = "free")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
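mgcv's built-in plot method gives a quicker, though less customizable, view of the same smooths:
# Plot all smooth terms on a single page with confidence shading
plot(gam_model, pages = 1, shade = TRUE)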
GAM performance evaluated on the training set (first block) and the test set (second block); the evaluation code mirrors the LR case.
## accuracy recall precision f1_score AUC
## 0.8212 0.3674 0.6766 0.4762 0.7843
## accuracy recall precision f1_score AUC
## 0.8142 0.3449 0.6515 0.4510 0.7560
Perform one-hot encoding on the remaining categorical features SEX, MARRIAGE and EDUCATION, since kNN needs purely numeric inputs to compute distances.
# Use train_new
dummy = dummyVars("~.", data = train_new[, f_cat])
cat_new = data.frame(predict(dummy, newdata = train_new[, f_cat]))
var_removed = c("binary_default", f_cat)
train_onehot = train_new[, !(colnames(train_new) %in% var_removed)]
train_onehot = cbind(cat_new, train_onehot)
# Test data
cat_new = data.frame(predict(dummy, newdata = test_new[, f_cat]))
test_onehot = test_new[, !(colnames(test_new) %in% var_removed)]
test_onehot = cbind(cat_new, test_onehot)
cat('Training data no. of samples (kNN): ', dim(train_onehot)[1],
' ; no. of features: ', dim(train_onehot)[2])
## Training data no. of samples (kNN): 23999 ; no. of features: 49
cat('Test data no. of samples (kNN): ', dim(test_onehot)[1],
' ; no. of features: ', dim(test_onehot)[2])
## Test data no. of samples (kNN): 6001 ; no. of features: 49
The hyperparameter k (the number of nearest neighbors) needs to be tuned. Too small a k leads to overfitting (a wiggly decision boundary and thus poor generalization); too large a k incurs high computational cost and yields a low-variance, high-bias model. Refer to https://www.statlearning.com/ (p. 41) for more details. A rule of thumb for the optimal k is \[\sqrt{m}\] where m is the number of training samples.
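Applying that rule of thumb to the 23,999 training samples (a quick check added here) points at the middle of the search grid used below:
sqrt(23999) # ~154.9, so search k in a band around 155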
# Use train-test validation approach.
# First, prepare both training and test data.
set.seed(3)
split = initial_split(train_onehot, prop = 0.8, strata = "default")
train_knn = training(split)
test_knn = testing(split)
target_train = train_knn$default==1
target_test = as.numeric(test_knn$default)-1
train_knn = train_knn[, !(colnames(train_knn) %in% c("default"))]
test_knn = test_knn[, !(colnames(test_knn) %in% c("default"))]
# Set the range of k
k_range = seq(from = 115, to = 175, by = 2)
AUC_vec = rep(NA, length(k_range))
iter = 0
for (K in k_range) {
  iter = iter + 1
  knn_pred = knn(train_knn, test_knn, target_train, k = K)
  AUC_vec[iter] = calcAUC(ifelse(knn_pred == TRUE, 1, 0), target_test)
}
df_plot_k = data.frame(k = k_range, AUC = AUC_vec)
k_opt = df_plot_k[which.max(AUC_vec),1]
df_plot_k %>%
ggplot(aes(x = k, y = AUC)) +
geom_line() +
geom_vline(xintercept = k_opt, linetype = 2, color = 'red') +
xlab('Number of nearest neighbors, k') +
ylab('AUC')
# Drop the response before computing distances; leaving 'default' in the
# feature set (train_onehot and test_onehot both contain it) would leak the label
feature_cols = setdiff(colnames(train_onehot), "default")
knn_pred = knn(train_onehot[, feature_cols], test_onehot[, feature_cols],
               (train_onehot$default==1), k = k_opt, prob = TRUE)
# attr(, 'prob') holds the vote share of the winning class, so flip it for
# negative predictions to obtain P(default)
prop_vote = attr(knn_pred, 'prob')
prob_pos = ifelse(knn_pred == FALSE, 1 - prop_vote, prop_vote)
test$pred_knn = prob_pos
(perf_test_knn = performance(truth = test_new$default, pred = prob_pos))
## accuracy recall precision f1_score AUC
## 0.8994 0.6295 0.8819 0.7346 0.9703
kable(rbind(perf_test_logreg, perf_test_lasso, perf_test_gam, perf_test_knn))
| | accuracy | recall | precision | f1_score | AUC |
|---|---|---|---|---|---|
| perf_test_logreg | 0.8140 | 0.3456 | 0.6501 | 0.4513 | 0.7481 |
| perf_test_lasso | 0.8022 | 0.2304 | 0.6497 | 0.3402 | 0.7385 |
| perf_test_gam | 0.8142 | 0.3449 | 0.6515 | 0.4510 | 0.7560 |
| perf_test_knn | 0.8994 | 0.6295 | 0.8819 | 0.7346 | 0.9703 |
test$binary_default = (test$default==1)
ROCPlotList(
frame = test,
xvar_names = c("pred_logreg", "pred_lasso", "pred_gam", "pred_knn"),
truthVar = "binary_default", truthTarget = TRUE,
title = "ROC plots for all classifiers"
)
Set each classifier's score threshold to 0.2 and re-evaluate on the test set.
perf_test_logreg = performance(truth = test$default, pred = test$pred_logreg,
threshold = 0.2)
perf_test_lasso = performance(test$default, test$pred_lasso, threshold = 0.2)
perf_test_gam = performance(test$default, test$pred_gam, threshold = 0.2)
perf_test_knn = performance(test$default, test$pred_knn, threshold = 0.2)
kable(rbind(perf_test_logreg, perf_test_lasso, perf_test_gam, perf_test_knn),
caption = "test performances for different models (thresholds=0.2)")
| | accuracy | recall | precision | f1_score | AUC |
|---|---|---|---|---|---|
| perf_test_logreg | 0.7445 | 0.5836 | 0.4416 | 0.5028 | 0.7481 |
| perf_test_lasso | 0.7580 | 0.5655 | 0.4619 | 0.5085 | 0.7385 |
| perf_test_gam | 0.7327 | 0.6152 | 0.4277 | 0.5046 | 0.7560 |
| perf_test_knn | 0.9100 | 0.9646 | 0.7221 | 0.8259 | 0.9703 |
Computing environment
## _
## platform x86_64-w64-mingw32
## arch x86_64
## os mingw32
## system x86_64, mingw32
## status
## major 4
## minor 0.2
## year 2020
## month 06
## day 22
## svn rev 78730
## language R
## version.string R version 4.0.2 (2020-06-22)
## nickname Taking Off Again