# Load the My Bank case-study data set and report its dimensions.
# NOTE(review): the working directory is machine-specific. The call is kept so
# that any later relative paths keep resolving, but the path has to be adjusted
# before the script can run on another machine.
setwd("C:/Users/adminsa/Desktop/Pos Graduacao/Machine Learning/Mybank")
# stringsAsFactors is spelled out: the text columns are expected to load as
# factors, which is the read.table() default only before R 4.0.0.
ploan <- read.table(
  "My Bank Case Study-dataset.csv",
  sep = ",",
  header = TRUE,
  stringsAsFactors = TRUE
)
dim(ploan)
[1] 20000 40
names(ploan)
[1] "CUST_ID" "TARGET" "AGE" "GENDER" "BALANCE"
[6] "OCCUPATION" "AGE_BKT" "SCR" "HOLDING_PERIOD" "ACC_TYPE"
[11] "ACC_OP_DATE" "LEN_OF_RLTN_IN_MNTH" "NO_OF_L_CR_TXNS" "NO_OF_L_DR_TXNS" "TOT_NO_OF_L_TXNS"
[16] "NO_OF_BR_CSH_WDL_DR_TXNS" "NO_OF_ATM_DR_TXNS" "NO_OF_NET_DR_TXNS" "NO_OF_MOB_DR_TXNS" "NO_OF_CHQ_DR_TXNS"
[21] "FLG_HAS_CC" "AMT_ATM_DR" "AMT_BR_CSH_WDL_DR" "AMT_CHQ_DR" "AMT_NET_DR"
[26] "AMT_MOB_DR" "AMT_L_DR" "FLG_HAS_ANY_CHGS" "AMT_OTH_BK_ATM_USG_CHGS" "AMT_MIN_BAL_NMC_CHGS"
[31] "NO_OF_IW_CHQ_BNC_TXNS" "NO_OF_OW_CHQ_BNC_TXNS" "AVG_AMT_PER_ATM_TXN" "AVG_AMT_PER_CSH_WDL_TXN" "AVG_AMT_PER_CHQ_TXN"
[36] "AVG_AMT_PER_NET_TXN" "AVG_AMT_PER_MOB_TXN" "FLG_HAS_NOMINEE" "FLG_HAS_OLD_LOAN" "random"
str(ploan)
'data.frame': 20000 obs. of 40 variables:
$ CUST_ID : Factor w/ 20000 levels "C1","C10","C100",..: 17699 16532 11027 17984 2363 11747 18115 15556 15216 12494 ...
$ TARGET : int 0 0 0 0 0 0 0 0 0 0 ...
$ AGE : int 27 47 40 53 36 42 30 53 42 30 ...
$ GENDER : Factor w/ 3 levels "F","M","O": 2 2 2 2 2 1 2 1 1 2 ...
$ BALANCE : num 3384 287489 18217 71720 1671623 ...
$ OCCUPATION : Factor w/ 4 levels "PROF","SAL","SELF-EMP",..: 3 2 3 2 1 1 1 2 3 1 ...
$ AGE_BKT : Factor w/ 7 levels "<25",">50","26-30",..: 3 7 5 2 5 6 3 2 6 3 ...
$ SCR : int 776 324 603 196 167 493 479 562 105 170 ...
$ HOLDING_PERIOD : int 30 28 2 13 24 26 14 25 15 13 ...
$ ACC_TYPE : Factor w/ 2 levels "CA","SA": 2 2 2 1 2 2 2 1 2 2 ...
$ ACC_OP_DATE : Factor w/ 4869 levels "01-01-00","01-01-01",..: 3270 1806 3575 993 2861 862 4533 3160 257 334 ...
$ LEN_OF_RLTN_IN_MNTH : int 146 104 61 107 185 192 177 99 88 111 ...
$ NO_OF_L_CR_TXNS : int 7 8 10 36 20 5 6 14 18 14 ...
$ NO_OF_L_DR_TXNS : int 3 2 5 14 1 2 6 3 14 8 ...
$ TOT_NO_OF_L_TXNS : int 10 10 15 50 21 7 12 17 32 22 ...
$ NO_OF_BR_CSH_WDL_DR_TXNS: int 0 0 1 4 1 1 0 3 6 3 ...
$ NO_OF_ATM_DR_TXNS : int 1 1 1 2 0 1 1 0 2 1 ...
$ NO_OF_NET_DR_TXNS : int 2 1 1 3 0 0 1 0 4 0 ...
$ NO_OF_MOB_DR_TXNS : int 0 0 0 1 0 0 0 0 1 0 ...
$ NO_OF_CHQ_DR_TXNS : int 0 0 2 4 0 0 4 0 1 4 ...
$ FLG_HAS_CC : int 0 0 0 0 0 1 0 0 1 0 ...
$ AMT_ATM_DR : int 13100 6600 11200 26100 0 18500 6200 0 35400 18000 ...
$ AMT_BR_CSH_WDL_DR : int 0 0 561120 673590 808480 379310 0 945160 198430 869880 ...
$ AMT_CHQ_DR : int 0 0 49320 60780 0 0 10580 0 51490 32610 ...
$ AMT_NET_DR : num 973557 799813 997570 741506 0 ...
$ AMT_MOB_DR : int 0 0 0 71388 0 0 0 0 170332 0 ...
$ AMT_L_DR : num 986657 806413 1619210 1573364 808480 ...
$ FLG_HAS_ANY_CHGS : int 0 1 1 0 0 0 1 0 0 0 ...
$ AMT_OTH_BK_ATM_USG_CHGS : int 0 0 0 0 0 0 0 0 0 0 ...
$ AMT_MIN_BAL_NMC_CHGS : int 0 0 0 0 0 0 0 0 0 0 ...
$ NO_OF_IW_CHQ_BNC_TXNS : int 0 0 0 0 0 0 0 0 0 0 ...
$ NO_OF_OW_CHQ_BNC_TXNS : int 0 0 1 0 0 0 0 0 0 0 ...
$ AVG_AMT_PER_ATM_TXN : num 13100 6600 11200 13050 0 ...
$ AVG_AMT_PER_CSH_WDL_TXN : num 0 0 561120 168398 808480 ...
$ AVG_AMT_PER_CHQ_TXN : num 0 0 24660 15195 0 ...
$ AVG_AMT_PER_NET_TXN : num 486779 799813 997570 247169 0 ...
$ AVG_AMT_PER_MOB_TXN : num 0 0 0 71388 0 ...
$ FLG_HAS_NOMINEE : int 1 1 1 1 1 1 0 1 1 0 ...
$ FLG_HAS_OLD_LOAN : int 1 0 1 0 0 1 1 1 1 0 ...
$ random : num 1.14e-05 1.11e-04 1.20e-04 1.37e-04 1.74e-04 ...
colSums(is.na(ploan))
CUST_ID TARGET AGE GENDER BALANCE
0 0 0 0 0
OCCUPATION AGE_BKT SCR HOLDING_PERIOD ACC_TYPE
0 0 0 0 0
ACC_OP_DATE LEN_OF_RLTN_IN_MNTH NO_OF_L_CR_TXNS NO_OF_L_DR_TXNS TOT_NO_OF_L_TXNS
0 0 0 0 0
NO_OF_BR_CSH_WDL_DR_TXNS NO_OF_ATM_DR_TXNS NO_OF_NET_DR_TXNS NO_OF_MOB_DR_TXNS NO_OF_CHQ_DR_TXNS
0 0 0 0 0
FLG_HAS_CC AMT_ATM_DR AMT_BR_CSH_WDL_DR AMT_CHQ_DR AMT_NET_DR
0 0 0 0 0
AMT_MOB_DR AMT_L_DR FLG_HAS_ANY_CHGS AMT_OTH_BK_ATM_USG_CHGS AMT_MIN_BAL_NMC_CHGS
0 0 0 0 0
NO_OF_IW_CHQ_BNC_TXNS NO_OF_OW_CHQ_BNC_TXNS AVG_AMT_PER_ATM_TXN AVG_AMT_PER_CSH_WDL_TXN AVG_AMT_PER_CHQ_TXN
0 0 0 0 0
AVG_AMT_PER_NET_TXN AVG_AMT_PER_MOB_TXN FLG_HAS_NOMINEE FLG_HAS_OLD_LOAN random
0 0 0 0 0
# Treat the 0/1 indicator columns as categorical rather than numeric.
flag.cols <- c(
  "FLG_HAS_CC", "FLG_HAS_ANY_CHGS", "FLG_HAS_NOMINEE", "FLG_HAS_OLD_LOAN"
)
ploan[flag.cols] <- lapply(ploan[flag.cols], as.factor)

library(caret)

# 70/30 development/holdout split with a fixed seed for reproducibility.
# TARGET is passed as a factor so that createDataPartition() samples within
# each class. Given a numeric vector it groups by percentiles instead, which
# does not stratify a 0/1 outcome.
# Figures recorded in the transcript below predate this change and will shift
# slightly when the script is re-run.
set.seed(111)
trainIndex <- createDataPartition(
  factor(ploan$TARGET),
  p = 0.7,
  list = FALSE,
  times = 1
)
train.data <- ploan[trainIndex, ]
# Response rate (%) in the development sample.
mean(train.data$TARGET == 1) * 100
[1] 12.77143
dim(train.data)
[1] 14000 40
# Holdout sample: every row that was not drawn into the development index.
test.data <- ploan[-as.vector(trainIndex), ]
# Response rate (%) in the holdout sample.
sum(test.data$TARGET == 1, na.rm = TRUE) * 100 / nrow(test.data)
[1] 12.06667
dim(test.data)
[1] 6000 40
Model Building - CART (Unbalanced Dataset): setting the control parameters for rpart
library(rpart)
package 'rpart' was built under R version 3.6.3
# Growth limits for the tree: a node is only considered for splitting with at
# least 100 rows (minsplit), every leaf keeps at least 10 rows (minbucket),
# cp = 0 sets the complexity threshold for attempting a split to zero, and
# xval = 10 requests 10 cross-validations.
r.ctrl <- rpart.control(minsplit = 100, minbucket = 10, cp = 0, xval = 10)
# Work on a copy of the development sample and list its columns.
cart.dev <- train.data
colnames(cart.dev)
[1] "CUST_ID" "TARGET" "AGE" "GENDER" "BALANCE"
[6] "OCCUPATION" "AGE_BKT" "SCR" "HOLDING_PERIOD" "ACC_TYPE"
[11] "ACC_OP_DATE" "LEN_OF_RLTN_IN_MNTH" "NO_OF_L_CR_TXNS" "NO_OF_L_DR_TXNS" "TOT_NO_OF_L_TXNS"
[16] "NO_OF_BR_CSH_WDL_DR_TXNS" "NO_OF_ATM_DR_TXNS" "NO_OF_NET_DR_TXNS" "NO_OF_MOB_DR_TXNS" "NO_OF_CHQ_DR_TXNS"
[21] "FLG_HAS_CC" "AMT_ATM_DR" "AMT_BR_CSH_WDL_DR" "AMT_CHQ_DR" "AMT_NET_DR"
[26] "AMT_MOB_DR" "AMT_L_DR" "FLG_HAS_ANY_CHGS" "AMT_OTH_BK_ATM_USG_CHGS" "AMT_MIN_BAL_NMC_CHGS"
[31] "NO_OF_IW_CHQ_BNC_TXNS" "NO_OF_OW_CHQ_BNC_TXNS" "AVG_AMT_PER_ATM_TXN" "AVG_AMT_PER_CSH_WDL_TXN" "AVG_AMT_PER_CHQ_TXN"
[36] "AVG_AMT_PER_NET_TXN" "AVG_AMT_PER_MOB_TXN" "FLG_HAS_NOMINEE" "FLG_HAS_OLD_LOAN" "random"
# Fit a classification tree for TARGET on the development sample using every
# remaining column as a candidate predictor.
# Columns 1 and 11 are dropped by position; per the names() listing these are
# CUST_ID and ACC_OP_DATE.
# NOTE(review): positional dropping breaks silently if the column order of the
# input file changes - consider dropping by name.
m1 <- rpart(formula = TARGET~.,
data = cart.dev[,-c(1,11)],
method = "class",
control = r.ctrl)
# Show the complexity-parameter table (cp, number of splits, relative and
# cross-validated error) for the grown tree.
printcp(m1)
Classification tree:
rpart(formula = TARGET ~ ., data = cart.dev[, -c(1, 11)], method = "class",
control = r.ctrl)
Variables actually used in tree construction:
[1] AGE AGE_BKT AMT_ATM_DR AMT_BR_CSH_WDL_DR AMT_CHQ_DR
[6] AMT_L_DR AMT_MOB_DR AMT_NET_DR AVG_AMT_PER_ATM_TXN AVG_AMT_PER_CHQ_TXN
[11] AVG_AMT_PER_CSH_WDL_TXN AVG_AMT_PER_NET_TXN BALANCE FLG_HAS_CC GENDER
[16] HOLDING_PERIOD LEN_OF_RLTN_IN_MNTH NO_OF_ATM_DR_TXNS NO_OF_IW_CHQ_BNC_TXNS NO_OF_L_CR_TXNS
[21] NO_OF_L_DR_TXNS OCCUPATION SCR TOT_NO_OF_L_TXNS
Root node error: 1788/14000 = 0.12771
n= 14000
CP nsplit rel error xerror xstd
1 0.00643177 0 1.00000 1.00000 0.022087
2 0.00531320 2 0.98714 0.99273 0.022019
3 0.00335570 4 0.97651 0.99217 0.022013
4 0.00279642 5 0.97315 0.98937 0.021987
5 0.00268456 6 0.97036 0.99161 0.022008
6 0.00260999 15 0.94295 0.99161 0.022008
7 0.00223714 19 0.93233 0.98937 0.021987
8 0.00195749 20 0.93009 0.99664 0.022056
9 0.00167785 22 0.92617 1.00727 0.022156
10 0.00111857 43 0.88255 1.01119 0.022192
11 0.00095877 48 0.87696 1.01063 0.022187
12 0.00065250 55 0.87025 1.01566 0.022234
13 0.00055928 61 0.86633 1.02125 0.022286
14 0.00047939 75 0.85682 1.02125 0.022286
15 0.00037286 82 0.85347 1.01957 0.022271
16 0.00000000 85 0.85235 1.02740 0.022343
library(rattle)
package 'rattle' was built under R version 3.6.3
Rattle: A free graphical interface for data science with R.
Version 5.3.0 Copyright (c) 2006-2018 Togaware Pty Ltd.
Type 'rattle()' to shake, rattle, and roll your data.
Attaching package: 'rattle'
The following object is masked from 'package:randomForest':
importance
The following object is masked from 'package:VIM':
wine
library(RColorBrewer)
# Plot the full tree and its cross-validated error against cp.
fancyRpartPlot(m1)
plotcp(m1)
# Prune at the manually chosen complexity threshold. The stray positional
# "CP" argument was dropped: prune() takes only the tree and cp, so the extra
# value fell into `...` and was never used.
ptree <- prune(m1, cp = 0.0022)
printcp(ptree)
Classification tree:
rpart(formula = TARGET ~ ., data = cart.dev[, -c(1, 11)], method = "class",
control = r.ctrl)
Variables actually used in tree construction:
[1] AGE_BKT AMT_L_DR AMT_MOB_DR AVG_AMT_PER_ATM_TXN AVG_AMT_PER_CHQ_TXN AVG_AMT_PER_NET_TXN
[7] BALANCE GENDER HOLDING_PERIOD LEN_OF_RLTN_IN_MNTH NO_OF_L_CR_TXNS NO_OF_L_DR_TXNS
[13] OCCUPATION SCR
Root node error: 1788/14000 = 0.12771
n= 14000
CP nsplit rel error xerror xstd
1 0.0064318 0 1.00000 1.00000 0.022087
2 0.0053132 2 0.98714 0.99273 0.022019
3 0.0033557 4 0.97651 0.99217 0.022013
4 0.0027964 5 0.97315 0.98937 0.021987
5 0.0026846 6 0.97036 0.99161 0.022008
6 0.0026100 15 0.94295 0.99161 0.022008
7 0.0022371 19 0.93233 0.98937 0.021987
8 0.0022000 20 0.93009 0.99664 0.022056
# Plot the pruned tree under the title "Final Tree".
fancyRpartPlot(
  ptree,
  uniform = TRUE, main = "Final Tree", palettes = c("Blues", "Reds")
)
Performance measures (KPIs) for CART: rank ordering, KS, area under the curve (AUC), Gini coefficient, classification error
Let's predict on the data:
# Score the development sample with the pruned tree: first the predicted class
# label, then the class-probability matrix returned by type = "prob".
cart.dev$predict.class <- predict(ptree, newdata = cart.dev, type = "class")
cart.dev$predict.score <- predict(ptree, newdata = cart.dev, type = "prob")
Deciling
library(StatMeasures)
# Assign each value of `x` to a decile bucket from 1 (lowest) to 10 (highest).
#
# Cut points are the 10%, 20%, ..., 90% quantiles of `x` (NAs ignored). A
# value lands in bucket k when it is below the k-th cut point but not below
# the (k - 1)-th; values at or above the 90% quantile land in bucket 10.
# Tied cut points simply leave some buckets empty.
#
# Args:
#   x: numeric vector of scores; may contain NA.
# Returns:
#   Numeric vector the same length as `x`; NA wherever `x` is NA.
decile <- function(x) {
  if (length(x) == 0L || all(is.na(x))) {
    # No cut points can be computed, so there is nothing to rank.
    return(rep(NA_real_, length(x)))
  }
  # Probabilities are derived from integers so that no floating-point value
  # is ever used as a vector index.
  cuts <- quantile(x, probs = seq_len(9L) / 10, na.rm = TRUE, names = FALSE)
  # findInterval() counts the cut points <= x; adding 1 gives the bucket.
  findInterval(x, cuts) + 1
}
cart.dev$deciles <- decile(cart.dev$predict.score[,2])
Ranking the Code
KS and Area under Curve
cart.dev - Confusion matrix using caret, an excellent library that reports not just accuracy but also other useful measures such as sensitivity and specificity
library(caret)
library(e1071)
class(cart.dev$TARGET)
[1] "integer"
class(cart.dev$predict.class)
[1] "factor"
# Convert TARGET to a factor so it can be compared with the predicted class
# factor, then build the development-sample confusion matrix with class "1"
# as the positive class.
cart.dev$TARGET <- factor(cart.dev$TARGET)
cm.dev <- confusionMatrix(
  data = cart.dev$predict.class,
  reference = cart.dev$TARGET,
  positive = "1"
)
print(cm.dev)
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 12085 1536
1 127 252
Accuracy : 0.8812
95% CI : (0.8757, 0.8865)
No Information Rate : 0.8723
P-Value [Acc > NIR] : 0.0007325
Kappa : 0.1967
Mcnemar's Test P-Value : < 2.2e-16
Sensitivity : 0.14094
Specificity : 0.98960
Pos Pred Value : 0.66491
Neg Pred Value : 0.88723
Prevalence : 0.12771
Detection Rate : 0.01800
Detection Prevalence : 0.02707
Balanced Accuracy : 0.56527
'Positive' Class : 1
library(rpart)
# Copy the holdout sample for scoring and list the columns of its source.
cart.holdout <- test.data
colnames(test.data)
[1] "CUST_ID" "TARGET" "AGE" "GENDER" "BALANCE"
[6] "OCCUPATION" "AGE_BKT" "SCR" "HOLDING_PERIOD" "ACC_TYPE"
[11] "ACC_OP_DATE" "LEN_OF_RLTN_IN_MNTH" "NO_OF_L_CR_TXNS" "NO_OF_L_DR_TXNS" "TOT_NO_OF_L_TXNS"
[16] "NO_OF_BR_CSH_WDL_DR_TXNS" "NO_OF_ATM_DR_TXNS" "NO_OF_NET_DR_TXNS" "NO_OF_MOB_DR_TXNS" "NO_OF_CHQ_DR_TXNS"
[21] "FLG_HAS_CC" "AMT_ATM_DR" "AMT_BR_CSH_WDL_DR" "AMT_CHQ_DR" "AMT_NET_DR"
[26] "AMT_MOB_DR" "AMT_L_DR" "FLG_HAS_ANY_CHGS" "AMT_OTH_BK_ATM_USG_CHGS" "AMT_MIN_BAL_NMC_CHGS"
[31] "NO_OF_IW_CHQ_BNC_TXNS" "NO_OF_OW_CHQ_BNC_TXNS" "AVG_AMT_PER_ATM_TXN" "AVG_AMT_PER_CSH_WDL_TXN" "AVG_AMT_PER_CHQ_TXN"
[36] "AVG_AMT_PER_NET_TXN" "AVG_AMT_PER_MOB_TXN" "FLG_HAS_NOMINEE" "FLG_HAS_OLD_LOAN" "random"
names(cart.holdout)
[1] "CUST_ID" "TARGET" "AGE" "GENDER" "BALANCE"
[6] "OCCUPATION" "AGE_BKT" "SCR" "HOLDING_PERIOD" "ACC_TYPE"
[11] "ACC_OP_DATE" "LEN_OF_RLTN_IN_MNTH" "NO_OF_L_CR_TXNS" "NO_OF_L_DR_TXNS" "TOT_NO_OF_L_TXNS"
[16] "NO_OF_BR_CSH_WDL_DR_TXNS" "NO_OF_ATM_DR_TXNS" "NO_OF_NET_DR_TXNS" "NO_OF_MOB_DR_TXNS" "NO_OF_CHQ_DR_TXNS"
[21] "FLG_HAS_CC" "AMT_ATM_DR" "AMT_BR_CSH_WDL_DR" "AMT_CHQ_DR" "AMT_NET_DR"
[26] "AMT_MOB_DR" "AMT_L_DR" "FLG_HAS_ANY_CHGS" "AMT_OTH_BK_ATM_USG_CHGS" "AMT_MIN_BAL_NMC_CHGS"
[31] "NO_OF_IW_CHQ_BNC_TXNS" "NO_OF_OW_CHQ_BNC_TXNS" "AVG_AMT_PER_ATM_TXN" "AVG_AMT_PER_CSH_WDL_TXN" "AVG_AMT_PER_CHQ_TXN"
[36] "AVG_AMT_PER_NET_TXN" "AVG_AMT_PER_MOB_TXN" "FLG_HAS_NOMINEE" "FLG_HAS_OLD_LOAN" "random"
# Fit a separate classification tree on the holdout sample, with the same
# control settings and the same positional column drop (columns 1 and 11) as
# the development model.
# NOTE(review): in the visible part of the script m2 is only printed and
# plotted; the holdout predictions are produced with ptree, not with m2.
# Confirm that fitting on the holdout is intended.
m2 <- rpart(formula = TARGET~.,
data = cart.holdout[,-c(1,11)],
method = "class",
control = r.ctrl)
# Show the complexity-parameter table for the holdout-fitted tree.
printcp(m2)
Classification tree:
rpart(formula = TARGET ~ ., data = cart.holdout[, -c(1, 11)],
method = "class", control = r.ctrl)
Variables actually used in tree construction:
[1] AMT_ATM_DR AMT_BR_CSH_WDL_DR BALANCE FLG_HAS_CC HOLDING_PERIOD NO_OF_L_DR_TXNS OCCUPATION
[8] SCR TOT_NO_OF_L_TXNS
Root node error: 724/6000 = 0.12067
n= 6000
CP nsplit rel error xerror xstd
1 0.00966851 0 1.00000 1.0000 0.034850
2 0.00055249 4 0.95994 1.0166 0.035098
3 0.00046041 9 0.95718 1.0497 0.035584
4 0.00000000 12 0.95580 1.0497 0.035584
library(rattle)
library(RColorBrewer)
# Plot the tree that was fitted on the holdout sample.
fancyRpartPlot(m2)
# Score the holdout sample with the pruned development tree (ptree): the
# predicted class label and the class-probability matrix.
cart.holdout$predict.class <- predict(
  ptree,
  newdata = cart.holdout, type = "class"
)
cart.holdout$predict.score <- predict(
  ptree,
  newdata = cart.holdout, type = "prob"
)
Deciling: the function was already defined for the development subset, so just add a new column to the holdout subset
# Bucket the holdout scores (second column of the probability matrix) into
# deciles computed from the holdout's own score distribution.
cart.holdout$deciles <- decile(cart.holdout$predict.score[, 2])
# View() opens a GUI data viewer, so it is only called in an interactive
# session; batch runs should not depend on a display being available.
if (interactive()) {
  View(cart.holdout)
}
Ranking the Code
library(data.table)
library(scales)
# Rank-ordering table for the holdout sample: one row per decile, highest
# decile first, with counts of responders (TARGET == 1) and non-responders.
tmp_DT.holdout <- data.table(cart.holdout)
rank.holdout <- tmp_DT.holdout[
  ,
  .(
    cnt = .N,
    cnt_resp = sum(TARGET == 1),
    cnt_non_resp = sum(TARGET == 0)
  ),
  by = deciles
][order(-deciles)]
# Response rate within each decile.
rank.holdout$rrate <- with(rank.holdout, round(cnt_resp / cnt, 4))
# Running totals from the top decile downwards.
rank.holdout$cum_resp <- cumsum(rank.holdout$cnt_resp)
rank.holdout$cum_non_resp <- cumsum(rank.holdout$cnt_non_resp)
# Cumulative share of all responders / non-responders captured so far.
rank.holdout$cum_rel_resp <- with(
  rank.holdout,
  round(cum_resp / sum(cnt_resp), 4)
)
rank.holdout$cum_rel_non_resp <- with(
  rank.holdout,
  round(cum_non_resp / sum(cnt_non_resp), 4)
)
# KS per decile, in percentage points; computed before the shares are turned
# into formatted percent strings below.
rank.holdout$ks <- with(
  rank.holdout,
  abs(cum_rel_resp - cum_rel_non_resp) * 100
)
rank.holdout$rrate <- percent(rank.holdout$rrate)
rank.holdout$cum_rel_resp <- percent(rank.holdout$cum_rel_resp)
rank.holdout$cum_rel_non_resp <- percent(rank.holdout$cum_rel_non_resp)
rank.holdout
NA
KS and Area under Curve
library(ROCR)
library(ineq)
# ROC curve of the holdout scores against the observed TARGET.
pred.holdout <- prediction(
  predictions = cart.holdout$predict.score[, 2],
  labels = cart.holdout$TARGET
)
perf.holdout <- performance(pred.holdout, measure = "tpr", x.measure = "fpr")
plot(perf.holdout)
# KS statistic: the largest gap between true-positive and false-positive rate
# along the curve.
KS.holdout <- max(perf.holdout@y.values[[1]] - perf.holdout@x.values[[1]])
# Area under the ROC curve, unwrapped from the performance object.
auc.holdout <- performance(pred.holdout, measure = "auc")
auc.holdout <- as.numeric(auc.holdout@y.values)
# NOTE(review): ineq() is applied to the predicted scores alone, so this is
# the Gini index of the score distribution, not 2 * AUC - 1 - confirm which
# Gini is intended.
gini.holdout <- ineq(cart.holdout$predict.score[, 2], type = "Gini")
# Cross-tabulate actual versus predicted class.
table(
  TARGET = cart.holdout$TARGET,
  predict.class = cart.holdout$predict.class
)
predict.class
TARGET 0 1
0 5197 79
1 625 99
plot(perf.holdout)
cart.holdout - Confusion matrix using caret, an excellent library that reports not just accuracy but also other useful measures such as sensitivity and specificity
library(caret)
library(e1071)
# Convert TARGET to a factor so it can be compared with the predicted class
# factor, then build the holdout confusion matrix with class "1" as the
# positive class.
cart.holdout$TARGET <- factor(cart.holdout$TARGET)
cm.holdout <- confusionMatrix(
  data = cart.holdout$predict.class,
  reference = cart.holdout$TARGET,
  positive = "1"
)
print(cm.holdout)
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 5197 625
1 79 99
Accuracy : 0.8827
95% CI : (0.8743, 0.8907)
No Information Rate : 0.8793
P-Value [Acc > NIR] : 0.2204
Kappa : 0.1805
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.13674
Specificity : 0.98503
Pos Pred Value : 0.55618
Neg Pred Value : 0.89265
Prevalence : 0.12067
Detection Rate : 0.01650
Detection Prevalence : 0.02967
Balanced Accuracy : 0.56088
'Positive' Class : 1
class(cart.holdout$TARGET)
[1] "factor"
class(cart.holdout$predict.class)
[1] "factor"
#————————————————————————————————————————————————–*