# Convert the categorical predictors to factors so they are treated as
# categories rather than numeric codes (data.table `:=` syntax).
df[, Male := as.factor(Male)]
df[, Education := as.factor(Education)]
df[, MaritalStatus := as.factor(MaritalStatus)]
# Recode the outcome 'Default' as a factor with levels "No" / "Yes":
# values equal to "1" become "Yes", all other non-missing values become "No".
# Text labels are used because caret requires class levels that are valid R
# names whenever class probabilities are requested.
df[, Default := factor(ifelse(Default == "1", "Yes", "No"),
                       levels = c("No", "Yes"))]
# caret provides createDataPartition(), train() and confusionMatrix()
library(caret)
# data partition: stratified 80/20 split on the outcome 'Default'
# (seed fixed so the split is reproducible)
set.seed(2341)
trainIndex <- createDataPartition(df$Default, p = 0.80, list = FALSE)
# 80% training data
train.df <- df[trainIndex, ]
# 20% testing data
test.df <- df[-trainIndex, ]
# check the size of both partitions
dim(train.df)
## [1] 23681 9
dim(test.df)
## [1] 5920 9
# Fit a decision tree classifier (caret method "rpart") on the training data.
# The seed is fixed before fitting so the resampling is reproducible.
set.seed(3333)
DTModel <- train(
  Default ~ CreditLimit + Male + Education + MaritalStatus + Age +
    BillOutstanding + LastPayment,
  data = train.df,
  method = "rpart",
  metric = "ROC",
  # information-based splitting criterion, forwarded to rpart
  parms = list(split = "information"),
  # resampling settings; `trctrl` is defined outside this section
  trControl = trctrl
)
# NOTE(review): the printed summary below reports Accuracy/Kappa and states
# that Accuracy selected the model even though metric = "ROC" is requested;
# presumably `trctrl` does not use twoClassSummary with classProbs = TRUE --
# confirm against its definition.
# model summary
DTModel
## CART
##
## 23681 samples
## 7 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 21313, 21314, 21313, 21313, 21312, 21314, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.001703255 0.7776698 0.05668749
## 0.002081756 0.7777964 0.05365759
## 0.003406510 0.7765299 0.01096180
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.002081756.
# rpart.plot: visualization of the final fitted tree
library(rpart.plot)
prp(DTModel$finalModel, box.palette = "Reds", tweak = 1.2, varlen = 20)
# predicted class probabilities for the test data; the "Yes" column is the
# predicted probability of default
PredDTModel <- predict(DTModel, test.df, type = "prob")
# plot of probabilities
plot(PredDTModel$Yes,
     main = "Scatterplot of Probabilities of default (test data)",
     xlab = "Customer ID",
     ylab = "Predicted Probability of default")
# taking the cut-off probability 50%
# label a test observation "Yes" when its predicted probability of default
# exceeds 0.50, otherwise "No"
Predicted <- ifelse(PredDTModel$Yes > 0.50, "Yes", "No")
# observed outcomes for the same test rows
Actual <- test.df$Default
# making confusion matrix; both sides are tabulated as factors with the same
# fixed levels so the table stays 2 x 2 even if every prediction falls in a
# single class (confusionMatrix() rejects a non-square table)
cm <- confusionMatrix(table(
  Predicted = factor(Predicted, levels = c("No", "Yes")),
  Actual = factor(Actual, levels = c("No", "Yes"))
))
# NOTE(review): the output below uses "No" as the positive class, so
# Sensitivity describes non-defaulters; pass positive = "Yes" if defaulters
# should be the positive class -- confirm the intended reading.
cm
## Confusion Matrix and Statistics
##
## Actual
## Predicted No Yes
## No 4533 1234
## Yes 66 87
##
## Accuracy : 0.7804
## 95% CI : (0.7696, 0.7909)
## No Information Rate : 0.7769
## P-Value [Acc > NIR] : 0.2617
##
## Kappa : 0.0752
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.98565
## Specificity : 0.06586
## Pos Pred Value : 0.78602
## Neg Pred Value : 0.56863
## Prevalence : 0.77686
## Detection Rate : 0.76571
## Detection Prevalence : 0.97416
## Balanced Accuracy : 0.52575
##
## 'Positive' Class : No
##
# loading the package (prediction() and performance() come from ROCR)
library(ROCR)
# class probabilities for the test data
DTPrediction <- predict(DTModel, test.df, type = "prob")
# ROCR prediction object built from the "Yes" probabilities and the observed
# outcomes; the column is selected by name rather than by position
Prediction <- prediction(DTPrediction$Yes, test.df$Default)
# true positive rate against false positive rate.
# The result deliberately keeps the name `performance`; later calls to
# performance() still work because R skips non-function objects when it
# looks up a function name.
performance <- performance(Prediction, "tpr", "fpr")
# plotting ROC curve
plot(performance, main = "ROC Curve", col = 2, lwd = 2)
# dotted diagonal as the reference line of a random classifier
abline(a = 0, b = 1, lwd = 2, lty = 3, col = "black")
# area under curve, computed from the same prediction object as the ROC curve
aucDT <- performance(Prediction, measure = "auc")
aucDT <- aucDT@y.values[[1]]
aucDT
## [1] 0.601756