library(tidyverse)
library(GGally)
library(plyr)
library(knitr)
library(pROC)
library(DMwR)
library(caret)
library(rpart)
library(xgboost)
#library(LightGBM)
library(klaR)
library(gbm)
library(nnet)
library(e1071)
library(readxl)
library(PerformanceAnalytics)
library(kerndwd)
library(tree)
library(SMCRM)
library(randomForestSRC)
library(rpart)
library(car)
library(dplyr)
library(earth)
library(mda)
library(bnclassify)
library(skimr)
library(rpart.plot)
library(corrplot)
library(ggthemes)
#str(df2)
colnames(df2)[24] <- "DEFAULT"
# GLM model for variable collinearity
set.seed(123)
glm.fit1 <- glm(DEFAULT~., data=df2)
preds.glm1 = predict(glm.fit1, newdata = df2)
glm1.mse = mean((preds.glm1 - df2$DEFAULT)^2)
glm1.mse
car::vif(glm.fit1)
chart.Correlation(df2, histogram=TRUE, pch=19)
chart.Correlation(df2[,c(24,0:10)], histogram=TRUE, pch=19)
# Check correlation
df2$DEFAULT = as.numeric(df2$DEFAULT)
r = cor(df2[-c(3, 4, 5)])
corrplot(r, method = "circle")
set.seed(123)
options(warn=-1)
subsets <- c(1:5,12, 15, 18)
ctrl <- rfeControl(functions = rfFuncs,
method = "repeatedcv",
repeats = 5,
verbose = FALSE)
lmProfile <- rfe(x=trainSet[, 1:24], y=trainSet$DEFAULT,
sizes = subsets,
rfeControl = ctrl)
lmProfile
df$ID <- NULL
colnames(df)[24] <- "DEFAULT"
df$DEFAULT <- as.factor(ifelse(df$DEFAULT == 1, "Yes", "No"))
df$SEX <- as.factor(ifelse(df$SEX == 1, "Male", "Female"))
df$MARRIAGE <- as.factor(ifelse(df$MARRIAGE == 1, "Married",
ifelse(df$MARRIAGE == 2, "Single", "Other")))
df$EDUCATION <- as.factor(ifelse(df$EDUCATION == 1, "Graduate.School",
ifelse(df$EDUCATION == 2, "University",
ifelse(df$EDUCATION == 3, "High.School",
ifelse(df$EDUCATION == 4, "Other", "Unkown")))))
df$PAY_0 <- as.factor(df$PAY_0)
df$PAY_2 <- as.factor(df$PAY_2)
df$PAY_3 <- as.factor(df$PAY_3)
df$PAY_4 <- as.factor(df$PAY_4)
df$PAY_5 <- as.factor(df$PAY_5)
df$PAY_6 <- as.factor(df$PAY_6)
str(df)
# Create box plots to understand variable correlation with the response variable Acquistion
par(mfrow = c(2, 3))
boxplot(LIMIT_BAL~ DEFAULT, data = df, xlab = "DEFAULT", ylab = "LIMIT_BAL")
boxplot(AGE~ DEFAULT, data = df, xlab = "DEFAULT", ylab = "AGE")
#boxplot(SEX~ DEFAULT, data = df, xlab = "DEFAULT", ylab = "SEX")
#boxplot(EDUCATION~ DEFAULT, data = df, xlab = "DEFAULT", ylab = "EDUCATION")
#boxplot(MARRAIGE~ DEFAULT, data = df, xlab = "DEFAULT", ylab = "MARRAIGE")
#boxplot(PAY_0~ DEFAULT, data = df, xlab = "DEFAULT", ylab = "PAY_0")
set.seed(101)
index <- createDataPartition(df$DEFAULT,
p = 0.7,
list = F)
trainSet <- df[index,]
trainSet_remove_y <- trainSet
trainSet_remove_y$DEFAULT <- NULL
#xTrain <- trainSet %>% select(-DEFAULT)
xTrain <- trainSet_remove_y
yTrain <- trainSet$DEFAULT
testSet <- df[-index,]
fiveMetric <- function(...) c(twoClassSummary(...),
defaultSummary(...))
ctrl <- trainControl(method = "cv",
number = 5,
summaryFunction = fiveMetric,
classProbs = T,
verboseIter = T)
ctrlSMOTE <- trainControl(method = "cv",
number = 5,
summaryFunction = fiveMetric,
classProbs = T,
sampling = "smote",
verboseIter = T)
skimr::skim_to_wide(trainSet)
# DEFAULT AND EDUCATION
ggplot(df2, aes(x = EDUCATION, fill = DEFAULT)) +
geom_bar() +
labs(x = 'Education') +
theme_classic()
# DEFAULT AND SEX
ggplot(df2, aes(x = SEX, fill = DEFAULT)) +
geom_bar() +
labs(x = 'Sex') +
theme_classic()
# DEFAULT AND MARIAGE
ggplot(df2, aes(x = MARRIAGE, fill = DEFAULT)) +
geom_bar() +
labs(x = 'Marriage') +
theme_classic()
# LIMIT_BAL AND EDUCATION
ggplot(df2, aes(x = EDUCATION, fill = LIMIT_BAL)) +
geom_bar() +
labs(x = 'Education') +
labs(y = 'Limit credit') +
theme_classic()
# age and default
ggplot(df2, aes(AGE, fill = DEFAULT)) +
geom_histogram(binwidth = 6) +
facet_grid(.~EDUCATION) +
theme_classic()
#library(vcd)
#mosaic(~ PAY_0 + DEFAULT | AGE,
# data = df2, shade=TRUE, legend=TRUE)
featurePlot(x = df[,c("SEX", "EDUCATION", "AGE", "MARRIAGE", "LIMIT_BAL")],
y = df$DEFAULT,
plot = "box",
scales = list(y = list(relation="free"),
x = list(rot = 90)),
layout = c(4,1 ),
auto.key = list(columns = 2))
featurePlot(x = df[, 18:23],
y = df$DEFAULT,
plot = "box",
strip=strip.custom(par.strip.text=list(cex=.7)),
scales = list(x = list(relation="free"),
y = list(relation="free")))
featurePlot(x = df[, 18:23],
y = df$DEFAULT,
plot = "density",
strip=strip.custom(par.strip.text=list(cex=.7)),
scales = list(x = list(relation="free"),
y = list(relation="free")))
Type: Regression, Classification
No tuning parameters for this model
set.seed(101)
glm_model <- train(DEFAULT ~ ., data = trainSet,
method = "glmStepAIC",
preProc = c("center", "scale"),
trControl = ctrl,
metric = "Accuracy"
)
glm_model
summary(glm_model)
#Final GLM Model Formula
# ROC 0.953103, Accuracy 0.8194369 : LIMIT_BAL + SEXMale + EDUCATIONOther + EDUCATIONUnkown + MARRIAGESingle + `PAY_0-1` + PAY_01 + PAY_02 + PAY_03 + PAY_04 + PAY_05 + PAY_06 + PAY_07 + `PAY_2-1` + PAY_24 + PAY_31 + PAY_32 + PAY_33 + PAY_35 + PAY_36 + `PAY_4-1` + PAY_40 + PAY_45 + PAY_46 + PAY_50 + PAY_52 + PAY_53 + PAY_57 + `PAY_6-1` + PAY_60 + PAY_63 + PAY_67 + BILL_AMT3 + BILL_AMT6 + PAY_AMT1 + PAY_AMT2 + PAY_AMT4 + PAY_AMT6
glm_model[["finalModel"]][["formula"]][[3]]
glm_model$bestTune
glm_res <- as_tibble(glm_model$results)
glm_res
Type: Classification - Required packages: earth, mda - Tuning parameters: degree (Product Degree) nprune (#Terms)
set.seed(101)
DA_model <- train (DEFAULT ~., data = trainSet,
method = "fda",
preProc = c("center", "scale"),
trControl = ctrl,
metric = "Accuracy")
DA_model
summary(DA_model)
varImp(DA_model)
DA_model$bestTune
DA_res <- as_tibble(DA_model$results)
DA_res
Tuning parameters:
mtry (#Randomly Selected Predictors) Required packages: randomForest
A model-specific variable importance metric is available.
set.seed(101)
rf_model <- train(DEFAULT ~ ., data = trainSet,
method = "rf",
preProc = c("center", "scale"),
trControl = ctrl,
metric = "Accuracy",
tuneGrid = expand.grid(.mtry = c(4,8,12,22)
))
rf_model
summary(rf_model)
varImp(rf_model)
rf_model$bestTune
RF_res <- as_tibble(rf_model$results)
RF_res
Type: Classification, Regression
Tuning parameters:
k (#Neighbors)
set.seed(101)
KNN_model <- train(DEFAULT ~ ., data = trainSet,
method = "knn",
preProc = c("center", "scale"),
trControl = ctrl,
metric = "Accuracy")
KNN_model
summary(KNN_model)
#varImp(SVM_model)
KNN_model$bestTune
KNN_res <- as_tibble(KNN_model$results)
KNN_res
Type: Classification
Tuning parameters:
sigma (Sigma) tau (Regularization Parameter)
set.seed(101)
SVM_model <- train(DEFAULT ~ ., data = trainSet,
method = "svmRadial",
preProc = c("center", "scale"),
trControl = ctrl,
metric = "Accuracy")
SVM_model
summary(SVM_model)
#varImp(SVM_model)
SVM_model$bestTune
SVM_res <- as_tibble(SVM_model$results)
SVM_res
Type: Classification
Tuning parameters:
sigma (Sigma) tau (Regularization Parameter)
set.seed(101)
SVM_model2 <- train(DEFAULT ~ ., data = trainSet,
method = "lssvmRadial",
preProc = c("center", "scale"),
trControl = ctrl,
metric = "Accuracy")
SVM_model2
summary(SVM_model2)
#varImp(SVM_model)
SVM_model2$bestTune
SVM2_res <- as_tibble(SVM_model2$results)
SVM2_res
Type: Regression, Classification
Tuning parameters: cp (Complexity Parameter)
set.seed(101)
C <- train(DEFAULT ~ ., data = trainSet,
method = "rpart",
preProc = c("center", "scale"),
trControl = ctrl,
metric = "Accuracy",
tuneGrid = expand.grid(.cp = seq(0,0.01,0.1)))
rpart_model
varImp(rpart_model)
rpart_model$bestTune
rpart_res <- as_tibble(rpart_model$results)
rpart_res
Type: Classification, Regression
Tuning parameters: size (#Hidden Units) decay (Weight Decay)
nn_grid <- expand.grid(.size = c(2,4,8),
.decay = c(0.001,0.01))
set.seed(101)
nn_model <- train(x = xTrain, y = yTrain,
method = "nnet",
tuneGrid = nn_grid,
preProcess = c("center", "scale"),
trControl = ctrl)
nn_model
set.seed(101)
nn_modelSMOTE <- train(x = xTrain, y = yTrain,
method = "nnet",
tuneGrid = nn_grid,
preProcess = c("center", "scale"),
trControl = ctrlSMOTE)
nn_modelSMOTE
# Seems as if our SMOTE (Synthetic Minority Over-Sampling Technique) was ineffective at improving our #models accuracy
nn_model$bestTune
nn_res <- as_tibble(nn_model$results)
nn_res
nn_modelSMOTE$bestTune
nnSMOTE_res <- as_tibble(nn_modelSMOTE$results)
nnSMOTE_res
Type: Regression and Classification
Tuning parameters: nrounds (# Boosting Iterations) max_depth (Max Tree Depth) eta (Shrinkage) gamma (Minimum Loss Reduction) colsample_bytree (Subsample Ratio of Columns) min_child_weight (Minimum Sum of Instance Weight) subsample (Subsample Percentage)
xgb_grid <- expand.grid(.nrounds = c(50,100,150),
.max_depth = 6,
.min_child_weight = 1,
.eta = .25,
.colsample_bytree = 1,
.subsample = 1,
.gamma = 1
)
set.seed(101)
xgb_model <- train(DEFAULT ~ ., data = trainSet,
method = "xgbTree",
preProcess = c("center", "scale"),
trControl = ctrl,
metric = "Accuracy",
tuneGrid = xgb_grid)
xgb_model
set.seed(101)
xgb_modelSMOTE <- train(DEFAULT ~ ., data = trainSet,
method = "xgbTree",
preProcess = c("center", "scale"),
trControl = ctrlSMOTE,
metric = "Accuracy",
tuneGrid = xgb_grid)
xgb_modelSMOTE
xgb_model$bestTune
xgb_res <- as_tibble(xgb_model$results)
xgb_res
xgb_modelSMOTE$bestTune
xgbSMOTE_res <- as_tibble(nn_modelSMOTE$results)
xgbSMOTE_res
Type: Classification
Tuning parameters: smooth (Smoothing Parameter) prior (Prior Probability)
set.seed(101)
Naive_Bayes_model <- train(x = xTrain, y = yTrain,
method = "nb",
preProcess = c("center"),
trControl = ctrl,
metric = "Accuracy"
)
Naive_Bayes_model
summary(Naive_Bayes_model)
Naive_Bayes_model$bestTune
Naive_Bayes_res <- as_tibble(Naive_Bayes_model$results)
Naive_Bayes_res
library(ROCR)
pred_test_naive<-predict(naive_model, newdata = T, type="raw")
p_test_naive<-prediction(pred_test_naive[,2], testSet$DEFAULT)
perf_naive<-performance(p_test_naive, "tpr", "fpr")
plot(perf_naive, colorize=F)
performance(p_test_naive, "auc")@y.values
all_models <- resamples((list(GLM = glm_model,
DA = DA_model,
RandomForest = rf_model,
DecisionTree = rpart_model,
KNN = KNN_model,
SVMR = SVM_model,
SVM2 = SVM_model2,
NN = nn_model,
NN_SMOTE = nn_modelSMOTE,
NB = Naive_Bayes_model,
XGB = xgb_model,
XGB_SMOTE = xgb_modelSMOTE)),decreasing = T)
summary(all_models)
bwplot(all_models)
dotplot(all_models)
We will use all of our models for predicting onto new unseen data sets.
predClass_glm <- predict(glm_model, newdata = testSet, type = "raw")
predProb_glm <- predict(glm_model, newdata = testSet, type = "prob")
confusionMatrix(predClass_glm, testSet$DEFAULT)
roc_glm <- roc(testSet$DEFAULT, predProb_glm[,"No"])
plot(roc_glm, print.auc = T,
legacy.axis = T)
predClass_DA <- predict(DA_model, newdata = testSet, type = "raw")
predProb_DA <- predict(DA_model, newdata = testSet, type = "prob")
confusionMatrix(predClass_DA, testSet$DEFAULT)
roc_DA <- roc(testSet$DEFAULT, predProb_DA[,"No"])
plot(roc_DA, print.auc = T,
legacy.axis = T)
predClass_rf <- predict(rf_model, newdata = testSet, type = "raw")
predProb_rf <- predict(rf_model, newdata = testSet, type = "prob")
confusionMatrix(predClass_rf, testSet$DEFAULT)
roc_nn <- roc(testSet$DEFAULT, predProb_rf[,"No"])
plot(roc_nn, print.auc = T,
legacy.axis = T)
predClass_KNN <- predict(KNN_model, newdata = testSet, type = "raw")
predProb_KNN <- predict(KNN_model, newdata = testSet, type = "prob")
confusionMatrix(predClass_KNN, testSet$DEFAULT)
roc_KNN <- roc(testSet$DEFAULT, predProb_KNN[,"No"])
plot(roc_KNN, print.auc = T,
legacy.axis = T)
predClass_SVM <- predict(SVM_model, newdata = testSet, type = "raw")
predProb_SVM <- predict(SVM_model, newdata = testSet, type = "prob")
confusionMatrix(predClass_SVM, testSet$DEFAULT)
roc_SVM <- roc(testSet$DEFAULT, predProb_SVM[,"No"])
plot(roc_SVM, print.auc = T,
legacy.axis = T)
predClass_rpart <- predict(rpart_model, newdata = testSet, type = "raw")
predProb_rpart <- predict(rpart_model, newdata = testSet, type = "prob")
confusionMatrix(predClass_rpart, testSet$DEFAULT)
roc_rpart <- roc(testSet$DEFAULT, predProb_rpart[,"No"])
plot(roc_rpart, print.auc = T,
legacy.axis = T)
predClass_nn <- predict(nn_model, newdata = testSet, type = "raw")
predProb_nn <- predict(nn_model, newdata = testSet, type = "prob")
confusionMatrix(predClass_nn, testSet$DEFAULT)
roc_nn <- roc(testSet$DEFAULT, predProb_nn[,"No"])
plot(roc_nn, print.auc = T,
legacy.axis = T)
predClass_nnSMOTE <- predict(nn_modelSMOTE, newdata = testSet, type = "raw")
predProb_nnSMOTE <- predict(nn_modelSMOTE, newdata = testSet, type = "prob")
confusionMatrix(predClass_nnSMOTE, testSet$DEFAULT)
roc_nnSMOTE <- roc(testSet$DEFAULT, predProb_nnSMOTE[,"No"])
plot(roc_nn, print.auc = T,
legacy.axis = T)
predClass_xgb <- predict(xgb_model, newdata = testSet, type = "raw")
predProb_xgb <- predict(xgb_model, newdata = testSet, type = "prob")
confusionMatrix(predClass_xgb, testSet$DEFAULT)
roc_nn <- roc(testSet$DEFAULT, predProb_xgb[,"No"])
plot(roc_xgb, print.auc = T,
legacy.axis = T)