library(caret)
# data partition
set.seed(2341)
trainIndex <- createDataPartition(df$Default, p = 0.80, list = FALSE)
# 80% training data
train.df <- df[trainIndex, ]
# 20% testing data
test.df <- df[-trainIndex, ]
dim(train.df)
## [1] 23681 9
dim(test.df)
## [1] 5920 9
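# (not in the original excerpt) an optional check of the class balance in the
# training split; the prevalence reported later (~0.22) indicates that
# defaulters are the minority class
table(train.df$Default)
prop.table(table(train.df$Default))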
# recoding the levels as "Yes" and "No"
train.df$Default <- ifelse(train.df$Default == "1", "Yes", "No")
set.seed(123)
# trctrl is not defined in this excerpt; the model summary below reports
# "Resampling: None", so a no-resampling control is assumed here
trctrl <- trainControl(method = "none")
# fitting the random forest classification model
# (with method = "none", caret requires a single candidate mtry value;
#  mtry = 2, the randomForest default for 7 predictors, is an assumed value)
ModelRF <- train(Default ~ CreditLimit
                 + Male
                 + Education
                 + MaritalStatus
                 + Age
                 + BillOutstanding
                 + LastPayment,
                 data = train.df,
                 method = "rf",
                 tuneGrid = data.frame(mtry = 2),
                 nbagg = 50,                      # bagging argument; ignored by randomForest
                 parms = list(split = "gini"),    # rpart argument; ignored by randomForest
                 trControl = trctrl,
                 importance = TRUE)
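# (not in the original excerpt) an optional alternative to the no-resampling
# control assumed above: a 5-fold cross-validated trainControl, which would
# also let caret tune mtry automatically (the settings here are illustrative)
trctrlCV <- trainControl(method = "cv",
                         number = 5,
                         classProbs = TRUE,
                         summaryFunction = twoClassSummary)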
# model summary
ModelRF
## Random Forest
##
## 23681 samples
## 7 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: None
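# (not in the original excerpt) since importance = TRUE was passed to train(),
# the relative importance of the predictors can be inspected with varImp()
ImpRF <- varImp(ModelRF)
plot(ImpRF, main = "Variable Importance (Random Forest)")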
# predicted class probabilities on the test data
# (PredRFModel is used below but not created in this excerpt; it is assumed
#  to come from predict() with type = "prob")
PredRFModel <- predict(ModelRF, test.df, type = "prob")
# plot of probabilities
plot(PredRFModel$Yes,
     main = "Scatterplot of Probabilities of default (test data)",
     xlab = "Customer ID",
     ylab = "Predicted Probability of default")
# taking the cut-off probability as 50%
Predicted <- ifelse(PredRFModel$Yes > 0.50, "Yes", "No")
# recoding the actual test labels as "Yes"/"No"
Actual <- ifelse(test.df$Default == "1", "Yes", "No")
# building the confusion matrix
cm <- confusionMatrix(table(Predicted, Actual), positive = "Yes")
cm
## Confusion Matrix and Statistics
##
## Actual
## Predicted No Yes
## No 4519 1226
## Yes 80 95
##
## Accuracy : 0.7794
## 95% CI : (0.7686, 0.7899)
## No Information Rate : 0.7769
## P-Value [Acc > NIR] : 0.3262
##
## Kappa : 0.0789
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.07192
## Specificity : 0.98260
## Pos Pred Value : 0.54286
## Neg Pred Value : 0.78660
## Prevalence : 0.22314
## Detection Rate : 0.01605
## Detection Prevalence : 0.02956
## Balanced Accuracy : 0.52726
##
## 'Positive' Class : Yes
##
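# (not in the original excerpt) recomputing the key statistics by hand from
# the printed counts shows that the headline accuracy is driven almost
# entirely by the majority "No" class
TP <- 95; FP <- 80; FN <- 1226; TN <- 4519
(TP + TN) / (TP + TN + FP + FN)   # accuracy    ~ 0.7794
TP / (TP + FN)                    # sensitivity ~ 0.0719
TN / (TN + FP)                    # specificity ~ 0.9826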
# loading the package
library(ROCR)
RFPrediction <- predict(ModelRF, test.df, type = "prob")
Prediction <- prediction(RFPrediction$Yes, test.df$Default)
perfRF <- performance(Prediction, "tpr", "fpr")
# plotting the ROC curve
plot(perfRF, main = "ROC Curve", col = 2, lwd = 2)
abline(a = 0, b = 1, lwd = 2, lty = 3, col = "black")
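# (not in the original excerpt) the ROC object can also suggest a data-driven
# cut-off, e.g. the threshold maximising TPR - FPR (Youden's J); this is an
# optional extra, not part of the original analysis
cutoffs <- perfRF@alpha.values[[1]]
youdenJ <- perfRF@y.values[[1]] - perfRF@x.values[[1]]
cutoffs[which.max(youdenJ)]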
# area under the curve (reusing the prediction object created above)
aucRF <- performance(Prediction, measure = "auc")
aucRF <- aucRF@y.values[[1]]
aucRF
## [1] 0.6459245
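# (not in the original excerpt) the 0.50 cut-off gives very low sensitivity
# (0.07), so a lower threshold can be explored; 0.25 is purely illustrative
PredictedLow <- factor(ifelse(PredRFModel$Yes > 0.25, "Yes", "No"),
                       levels = c("No", "Yes"))
confusionMatrix(table(PredictedLow, Actual), positive = "Yes")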