library(tidyverse)
library(GGally)
library(plyr)
library(knitr)
library(pROC)

library(DMwR)
library(caret)
library(rpart)
library(xgboost)
#library(LightGBM)

library(klaR)
library(gbm)
library(nnet)
library(e1071)
library(readxl)

library(PerformanceAnalytics)
library(kerndwd)
library(tree)
library(SMCRM)
library(randomForestSRC)

library(car)
library(dplyr)
library(earth)
library(mda)

library(bnclassify)
library(skimr)
library(rpart.plot)
library(corrplot)
library(ggthemes)
#str(df2)
colnames(df2)[24] <- "DEFAULT"
# Fit a baseline GLM on the numeric data and check predictor collinearity with VIF
set.seed(123)
glm.fit1 <- glm(DEFAULT~., data=df2)
preds.glm1 = predict(glm.fit1, newdata = df2)
glm1.mse = mean((preds.glm1 - df2$DEFAULT)^2)
glm1.mse
car::vif(glm.fit1)
chart.Correlation(df2, histogram=TRUE, pch=19)
chart.Correlation(df2[, c(24, 1:10)], histogram=TRUE, pch=19)
# Check correlation
df2$DEFAULT = as.numeric(df2$DEFAULT)
r = cor(df2[-c(3, 4, 5)])
corrplot(r, method = "circle")
set.seed(123)
options(warn=-1)

subsets <- c(1:5,12, 15, 18)

ctrl <- rfeControl(functions = rfFuncs,
                   method = "repeatedcv",
                   repeats = 5,
                   verbose = FALSE)

# Exclude the response (column 24) from the predictor set
lmProfile <- rfe(x=trainSet[, 1:23], y=trainSet$DEFAULT,
                 sizes = subsets,
                 rfeControl = ctrl)

lmProfile
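
To see which predictors the RFE run retained, the selected subset and the resampling profile can be inspected with caret's standard accessors for an rfe object:

predictors(lmProfile)                # variables chosen at the best subset size
plot(lmProfile, type = c("g", "o"))  # performance profile across subset sizes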

Data Pre-Processing

df$ID <- NULL
colnames(df)[24] <- "DEFAULT"
df$DEFAULT <- as.factor(ifelse(df$DEFAULT == 1, "Yes", "No"))
df$SEX <- as.factor(ifelse(df$SEX == 1, "Male", "Female"))
df$MARRIAGE <- as.factor(ifelse(df$MARRIAGE == 1, "Married",
                                ifelse(df$MARRIAGE == 2, "Single", "Other")))
df$EDUCATION <- as.factor(ifelse(df$EDUCATION == 1, "Graduate.School",
                                 ifelse(df$EDUCATION == 2, "University", 
                                        ifelse(df$EDUCATION == 3, "High.School",
                                               ifelse(df$EDUCATION == 4, "Other", "Unknown")))))
df$PAY_0 <- as.factor(df$PAY_0)
df$PAY_2 <- as.factor(df$PAY_2)
df$PAY_3 <- as.factor(df$PAY_3)
df$PAY_4 <- as.factor(df$PAY_4)
df$PAY_5 <- as.factor(df$PAY_5)
df$PAY_6 <- as.factor(df$PAY_6)
str(df)
# Create box plots to understand how the numeric variables relate to the response variable DEFAULT
par(mfrow = c(2, 3))
boxplot(LIMIT_BAL~ DEFAULT, data = df, xlab = "DEFAULT", ylab = "LIMIT_BAL")
boxplot(AGE~ DEFAULT, data = df, xlab = "DEFAULT", ylab = "AGE")
#boxplot(SEX~ DEFAULT, data = df, xlab = "DEFAULT", ylab = "SEX")
#boxplot(EDUCATION~ DEFAULT, data = df, xlab = "DEFAULT", ylab = "EDUCATION")
#boxplot(MARRIAGE~ DEFAULT, data = df, xlab = "DEFAULT", ylab = "MARRIAGE")
#boxplot(PAY_0~ DEFAULT, data = df, xlab = "DEFAULT", ylab = "PAY_0")

Modeling

set.seed(101)

index <- createDataPartition(df$DEFAULT,
                             p = 0.7,
                             list = F)
trainSet <- df[index,]
trainSet_remove_y <- trainSet
trainSet_remove_y$DEFAULT <- NULL

#xTrain <- trainSet %>% select(-DEFAULT)
xTrain <- trainSet_remove_y
yTrain <- trainSet$DEFAULT

testSet <- df[-index,]


fiveMetric <- function(...) c(twoClassSummary(...),
                              defaultSummary(...))

ctrl <- trainControl(method = "cv",
                     number = 5,
                     summaryFunction = fiveMetric,
                     classProbs = T,
                     verboseIter = T)

ctrlSMOTE <- trainControl(method = "cv",
                          number = 5,
                          summaryFunction = fiveMetric,
                          classProbs = T,
                          sampling = "smote",
                          verboseIter = T)
skimr::skim(trainSet)
# DEFAULT AND EDUCATION
ggplot(df2, aes(x = EDUCATION, fill = DEFAULT)) +
  geom_bar() +
  labs(x = 'Education') +
  theme_classic()

# DEFAULT AND SEX
ggplot(df2, aes(x = SEX, fill = DEFAULT)) +
  geom_bar() +
  labs(x = 'Sex') +
  theme_classic()

# DEFAULT AND MARRIAGE
ggplot(df2, aes(x = MARRIAGE, fill = DEFAULT)) +
  geom_bar() +
  labs(x = 'Marriage') +
  theme_classic()

# LIMIT_BAL AND EDUCATION (credit limit distribution by education level)
ggplot(df2, aes(x = EDUCATION, y = LIMIT_BAL)) +
  geom_boxplot() +
  labs(x = 'Education', y = 'Credit limit') +
  theme_classic()

# age and default
ggplot(df2, aes(AGE, fill = DEFAULT)) + 
  geom_histogram(binwidth = 6) + 
  facet_grid(.~EDUCATION) + 
  theme_classic()
#library(vcd)
#mosaic(~ PAY_0 + DEFAULT | AGE, 
#       data = df2, shade=TRUE, legend=TRUE)
featurePlot(x = df[,c("SEX", "EDUCATION", "AGE", "MARRIAGE", "LIMIT_BAL")],
            y = df$DEFAULT, 
            plot = "box",
            scales = list(y = list(relation="free"),
                          x = list(rot = 90)),  
            layout = c(4,1 ), 
            auto.key = list(columns = 2))
featurePlot(x = df[, 18:23], 
            y = df$DEFAULT, 
            plot = "box",
            strip=strip.custom(par.strip.text=list(cex=.7)),
            scales = list(x = list(relation="free"), 
                          y = list(relation="free")))
featurePlot(x = df[, 18:23], 
            y = df$DEFAULT, 
            plot = "density",
            strip=strip.custom(par.strip.text=list(cex=.7)),
            scales = list(x = list(relation="free"), 
                          y = list(relation="free")))

Logistic Regression (stepwise AIC, glmStepAIC) - Required packages: MASS

Type: Regression, Classification

No tuning parameters for this model

set.seed(101)
glm_model <- train(DEFAULT ~ ., data = trainSet,
                   method = "glmStepAIC",
                   preProc = c("center", "scale"),
                   trControl = ctrl,
                   metric = "Accuracy"
                   )
glm_model
summary(glm_model)
#Final GLM Model Formula
# ROC 0.953103, Accuracy 0.8194369: LIMIT_BAL + SEXMale + EDUCATIONOther + EDUCATIONUnknown + MARRIAGESingle + `PAY_0-1` + PAY_01 + PAY_02 + PAY_03 + PAY_04 + PAY_05 + PAY_06 + PAY_07 + `PAY_2-1` + PAY_24 + PAY_31 + PAY_32 + PAY_33 + PAY_35 + PAY_36 + `PAY_4-1` + PAY_40 + PAY_45 + PAY_46 + PAY_50 + PAY_52 + PAY_53 + PAY_57 + `PAY_6-1` + PAY_60 + PAY_63 + PAY_67 + BILL_AMT3 + BILL_AMT6 + PAY_AMT1 + PAY_AMT2 + PAY_AMT4 + PAY_AMT6

glm_model[["finalModel"]][["formula"]][[3]]
glm_model$bestTune
glm_res <- as_tibble(glm_model$results)
glm_res

Flexible Discriminant Analysis

Type: Classification - Required packages: earth, mda - Tuning parameters: degree (Product Degree) nprune (#Terms)
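
The train() call below relies on caret's default search over degree and nprune; an explicit grid (values here are illustrative, not the ones used in this run) would look like:

fda_grid <- expand.grid(.degree = 1:2,
                        .nprune = c(5, 10, 15))
# then pass tuneGrid = fda_grid to train()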

set.seed(101)
DA_model <- train(DEFAULT ~ ., data = trainSet,
                  method = "fda",
                  preProc = c("center", "scale"),
                  trControl = ctrl,
                  metric = "Accuracy")
DA_model
summary(DA_model)
varImp(DA_model)
DA_model$bestTune
DA_res <- as_tibble(DA_model$results)
DA_res

RandomForest - Type: Classification, Regression

Tuning parameters:

mtry (#Randomly Selected Predictors) Required packages: randomForest

A model-specific variable importance metric is available.

set.seed(101)
rf_model <- train(DEFAULT ~ ., data = trainSet,
                  method = "rf",
                  preProc = c("center", "scale"),
                  trControl = ctrl,
                  metric = "Accuracy",
                  tuneGrid = expand.grid(.mtry = c(4,8,12,22)
                                            ))
rf_model
summary(rf_model)
varImp(rf_model)
rf_model$bestTune
RF_res <- as_tibble(rf_model$results)
RF_res

KNN - Required packages: none (uses caret's built-in knn)

Type: Classification, Regression

Tuning parameters:

k (#Neighbors)
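
As with FDA above, caret picks a default grid for k here; an explicit grid (illustrative values) could be passed instead:

knn_grid <- expand.grid(.k = c(5, 15, 25, 35))
# then pass tuneGrid = knn_grid to train()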

set.seed(101)
KNN_model <- train(DEFAULT ~ ., data = trainSet,
                  method = "knn",
                  preProc = c("center", "scale"),
                  trControl = ctrl,
                  metric = "Accuracy")
KNN_model
summary(KNN_model)
#varImp(KNN_model)
KNN_model$bestTune
KNN_res <- as_tibble(KNN_model$results)
KNN_res

SVM Radial - Required packages: kernlab

Type: Classification

Tuning parameters:

sigma (Sigma) C (Cost)
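
An explicit grid over sigma and C (illustrative values, not the ones tried here) would look like:

svm_grid <- expand.grid(.sigma = c(0.005, 0.01, 0.05),
                        .C = c(0.5, 1, 2))
# then pass tuneGrid = svm_grid to train()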

set.seed(101)
SVM_model <- train(DEFAULT ~ ., data = trainSet,
                  method = "svmRadial",
                  preProc = c("center", "scale"),
                  trControl = ctrl,
                  metric = "Accuracy")
SVM_model
summary(SVM_model)
#varImp(SVM_model)
SVM_model$bestTune
SVM_res <- as_tibble(SVM_model$results)
SVM_res

Least Squares SVM (Radial Kernel) - Required packages: kernlab

Type: Classification

Tuning parameters:

sigma (Sigma) tau (Regularization Parameter)
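
Similarly, an explicit grid for lssvmRadial (illustrative values) would be:

lssvm_grid <- expand.grid(.sigma = c(0.005, 0.01, 0.05),
                          .tau = c(0.0625, 0.125, 0.25))
# then pass tuneGrid = lssvm_grid to train()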

set.seed(101)
SVM_model2 <- train(DEFAULT ~ ., data = trainSet,
                  method = "lssvmRadial",
                  preProc = c("center", "scale"),
                  trControl = ctrl,
                  metric = "Accuracy")
SVM_model2
summary(SVM_model2)
#varImp(SVM_model2)
SVM_model2$bestTune
SVM2_res <- as_tibble(SVM_model2$results)
SVM2_res

Decision Tree - Required packages: rpart

Type: Regression, Classification

Tuning parameters: cp (Complexity Parameter)

set.seed(101)
rpart_model <- train(DEFAULT ~ ., data = trainSet,
                     method = "rpart",
                     preProc = c("center", "scale"),
                     trControl = ctrl,
                     metric = "Accuracy",
                     tuneGrid = expand.grid(.cp = seq(0, 0.1, by = 0.01)))
rpart_model
varImp(rpart_model)
rpart_model$bestTune
rpart_res <- as_tibble(rpart_model$results)
rpart_res

Neural Network - Required Packages: nnet

Type: Classification, Regression

Tuning parameters: size (#Hidden Units) decay (Weight Decay)

nn_grid <- expand.grid(.size = c(2,4,8),
                       .decay = c(0.001,0.01))
set.seed(101)
nn_model <- train(x = xTrain, y = yTrain,
                  method = "nnet",
                  tuneGrid = nn_grid,
                  preProcess = c("center", "scale"),
                  trControl = ctrl)

nn_model
set.seed(101)
nn_modelSMOTE <- train(x = xTrain, y = yTrain,
                  method = "nnet",
                  tuneGrid = nn_grid,
                  preProcess = c("center", "scale"),
                  trControl = ctrlSMOTE)
nn_modelSMOTE
# SMOTE (Synthetic Minority Over-sampling Technique) resampling did not noticeably improve the model's cross-validated accuracy
nn_model$bestTune
nn_res <- as_tibble(nn_model$results)
nn_res
nn_modelSMOTE$bestTune
nnSMOTE_res <- as_tibble(nn_modelSMOTE$results)
nnSMOTE_res

eXtreme Gradient Boosting - Required packages: xgboost, plyr

Type: Regression and Classification

Tuning parameters: nrounds (# Boosting Iterations) max_depth (Max Tree Depth) eta (Shrinkage) gamma (Minimum Loss Reduction) colsample_bytree (Subsample Ratio of Columns) min_child_weight (Minimum Sum of Instance Weight) subsample (Subsample Percentage)

xgb_grid <- expand.grid(.nrounds = c(50,100,150),
                        .max_depth = 6,
                        .min_child_weight = 1,
                        .eta = .25,
                        .colsample_bytree = 1,
                        .subsample = 1,
                        .gamma = 1
                        )
set.seed(101)
xgb_model <- train(DEFAULT ~ ., data = trainSet,
                   method = "xgbTree",
                   preProcess = c("center", "scale"),
                   trControl = ctrl,
                   metric = "Accuracy",
                   tuneGrid = xgb_grid)
xgb_model
set.seed(101)
xgb_modelSMOTE <- train(DEFAULT ~ ., data = trainSet,
                         method = "xgbTree",
                         preProcess = c("center", "scale"),
                         trControl = ctrlSMOTE,
                         metric = "Accuracy",
                         tuneGrid = xgb_grid)
xgb_modelSMOTE
xgb_model$bestTune
xgb_res <- as_tibble(xgb_model$results)
xgb_res
xgb_modelSMOTE$bestTune
xgbSMOTE_res <- as_tibble(xgb_modelSMOTE$results)
xgbSMOTE_res

Naive Bayes Classifier - Required packages: klaR

Type: Classification

Tuning parameters: fL (Laplace Correction) usekernel (Distribution Type) adjust (Bandwidth Adjustment)
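
An explicit grid over these parameters (illustrative values) could be supplied via tuneGrid:

nb_grid <- expand.grid(.fL = 0:1,
                       .usekernel = c(TRUE, FALSE),
                       .adjust = 1)
# then pass tuneGrid = nb_grid to train()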

set.seed(101)
Naive_Bayes_model <- train(x = xTrain, y = yTrain,
                   method = "nb",
                   preProcess = c("center"),
                   trControl = ctrl,
                   metric = "Accuracy"
                   )
Naive_Bayes_model
summary(Naive_Bayes_model)
Naive_Bayes_model$bestTune
Naive_Bayes_res <- as_tibble(Naive_Bayes_model$results)
Naive_Bayes_res
library(ROCR)
# Use class probabilities (type = "prob") on the test set for the ROC curve
pred_test_naive <- predict(Naive_Bayes_model, newdata = testSet, type = "prob")
p_test_naive <- prediction(pred_test_naive[, "Yes"], testSet$DEFAULT)
perf_naive <- performance(p_test_naive, "tpr", "fpr")
plot(perf_naive, colorize = F)
performance(p_test_naive, "auc")@y.values

Comparing Models

all_models <- resamples(list(GLM = glm_model,
                             DA = DA_model,
                             RandomForest = rf_model,
                             DecisionTree = rpart_model,
                             KNN = KNN_model,
                             SVMR = SVM_model,
                             SVM2 = SVM_model2,
                             NN = nn_model,
                             NN_SMOTE = nn_modelSMOTE,
                             NB = Naive_Bayes_model,
                             XGB = xgb_model,
                             XGB_SMOTE = xgb_modelSMOTE))
summary(all_models)
bwplot(all_models)
dotplot(all_models)
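
Pairwise differences between models on the resampled metrics can also be tested directly with caret's diff() method for resamples objects:

model_diffs <- diff(all_models)
summary(model_diffs)
bwplot(model_diffs)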

Prediction:

Each trained model is now used to predict on the held-out test set (testSet); for each one we report a confusion matrix and an ROC curve with its AUC.
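
The blocks below repeat the same predict / confusionMatrix / ROC pattern for every model; a small helper along these lines (a sketch, not used in the runs that follow) would avoid the duplication:

evaluate_on_test <- function(model, test_data) {
  pred_class <- predict(model, newdata = test_data, type = "raw")
  pred_prob  <- predict(model, newdata = test_data, type = "prob")
  print(confusionMatrix(pred_class, test_data$DEFAULT))
  roc_obj <- roc(test_data$DEFAULT, pred_prob[, "No"])
  plot(roc_obj, print.auc = TRUE, legacy.axes = TRUE)
  invisible(roc_obj)
}
# e.g. evaluate_on_test(glm_model, testSet)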

GLM prediction:

predClass_glm <- predict(glm_model, newdata = testSet, type = "raw")
predProb_glm <- predict(glm_model, newdata = testSet, type = "prob")
confusionMatrix(predClass_glm, testSet$DEFAULT)
roc_glm <- roc(testSet$DEFAULT, predProb_glm[,"No"])
plot(roc_glm, print.auc = T,
     legacy.axes = T)

DA prediction:

predClass_DA <- predict(DA_model, newdata = testSet, type = "raw")
predProb_DA <- predict(DA_model, newdata = testSet, type = "prob")
confusionMatrix(predClass_DA, testSet$DEFAULT)
roc_DA <- roc(testSet$DEFAULT, predProb_DA[,"No"])
plot(roc_DA, print.auc = T,
     legacy.axes = T)

RF prediction:

predClass_rf <- predict(rf_model, newdata = testSet, type = "raw")
predProb_rf <- predict(rf_model, newdata = testSet, type = "prob")
confusionMatrix(predClass_rf, testSet$DEFAULT)
roc_rf <- roc(testSet$DEFAULT, predProb_rf[,"No"])
plot(roc_rf, print.auc = T,
     legacy.axes = T)

KNN prediction:

predClass_KNN <- predict(KNN_model, newdata = testSet, type = "raw")
predProb_KNN <- predict(KNN_model, newdata = testSet, type = "prob")
confusionMatrix(predClass_KNN, testSet$DEFAULT)
roc_KNN <- roc(testSet$DEFAULT, predProb_KNN[,"No"])
plot(roc_KNN, print.auc = T,
     legacy.axes = T)

SVM prediction:

predClass_SVM <- predict(SVM_model, newdata = testSet, type = "raw")
predProb_SVM <- predict(SVM_model, newdata = testSet, type = "prob")
confusionMatrix(predClass_SVM, testSet$DEFAULT)
roc_SVM <- roc(testSet$DEFAULT, predProb_SVM[,"No"])
plot(roc_SVM, print.auc = T,
     legacy.axes = T)

Decision Tree prediction:

predClass_rpart <- predict(rpart_model, newdata = testSet, type = "raw")
predProb_rpart <- predict(rpart_model, newdata = testSet, type = "prob")
confusionMatrix(predClass_rpart, testSet$DEFAULT)
roc_rpart <- roc(testSet$DEFAULT, predProb_rpart[,"No"])
plot(roc_rpart, print.auc = T,
     legacy.axes = T)

NN prediction:

predClass_nn <- predict(nn_model, newdata = testSet, type = "raw")
predProb_nn <- predict(nn_model, newdata = testSet, type = "prob")
confusionMatrix(predClass_nn, testSet$DEFAULT)
roc_nn <- roc(testSet$DEFAULT, predProb_nn[,"No"])
plot(roc_nn, print.auc = T,
     legacy.axes = T)

NN_SMOTE prediction:

predClass_nnSMOTE <- predict(nn_modelSMOTE, newdata = testSet, type = "raw")
predProb_nnSMOTE <- predict(nn_modelSMOTE, newdata = testSet, type = "prob")
confusionMatrix(predClass_nnSMOTE, testSet$DEFAULT)
roc_nnSMOTE <- roc(testSet$DEFAULT, predProb_nnSMOTE[,"No"])
plot(roc_nnSMOTE, print.auc = T,
     legacy.axes = T)

Naive_Bayes Prediction:
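
An ROCR-based curve for the Naive Bayes model was already produced above; for parity with the other models, the caret-style evaluation would be (a sketch, assuming Naive_Bayes_model and testSet as defined earlier):

predClass_nb <- predict(Naive_Bayes_model, newdata = testSet, type = "raw")
predProb_nb <- predict(Naive_Bayes_model, newdata = testSet, type = "prob")
confusionMatrix(predClass_nb, testSet$DEFAULT)
roc_nb <- roc(testSet$DEFAULT, predProb_nb[,"No"])
plot(roc_nb, print.auc = T,
     legacy.axes = T)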

XGB Prediction:

predClass_xgb <- predict(xgb_model, newdata = testSet, type = "raw")
predProb_xgb <- predict(xgb_model, newdata = testSet, type = "prob")
confusionMatrix(predClass_xgb, testSet$DEFAULT)
roc_xgb <- roc(testSet$DEFAULT, predProb_xgb[,"No"])
plot(roc_xgb, print.auc = T,
     legacy.axes = T)