Sameer Mathur
Classification Based on Decision Tree
---
# reading data
library(data.table)
autoF.dt <- fread("AutoFinanaceData.csv")
# attach data columns
attach(autoF.dt)
# dimension of the dataset
dim(autoF.dt)
[1] 28906 21
# structure of the data table
str(autoF.dt)
Classes 'data.table' and 'data.frame': 28906 obs. of 21 variables:
$ Agmt No : chr "AP18100057" "AP18100140" "AP18100198" "AP18100217" ...
$ ContractStatus: chr "Closed" "Closed" "Closed" "Closed" ...
$ StartDate : chr "19-01-01" "10-05-01" "05-08-01" "03-09-01" ...
$ AGE : int 26 28 32 31 36 33 41 47 43 27 ...
$ NOOFDEPE : int 2 2 2 0 2 2 2 0 0 0 ...
$ MTHINCTH : num 4.5 5.59 8.8 5 12 ...
$ SALDATFR : num 1 1 1 1 1 1 1 1 0.97 1 ...
$ TENORYR : num 1.5 2 1 1 1 2 1 2 1.5 2 ...
$ DWNPMFR : num 0.27 0.25 0.51 0.66 0.17 0.18 0.37 0.42 0.27 0.47 ...
$ PROFBUS : int 0 0 0 0 0 0 0 0 0 0 ...
$ QUALHSC : int 0 0 0 0 0 0 1 0 0 0 ...
$ QUAL_PG : int 0 0 0 0 0 0 0 0 0 0 ...
$ SEXCODE : int 1 1 1 1 1 1 1 1 1 1 ...
$ FULLPDC : int 1 1 1 1 1 0 0 1 1 1 ...
$ FRICODE : int 0 1 1 1 1 0 0 0 0 0 ...
$ WASHCODE : int 0 0 1 1 0 0 0 0 0 0 ...
$ Region : chr "AP2" "AP2" "AP2" "AP2" ...
$ Branch : chr "Vizag" "Vizag" "Vizag" "Vizag" ...
$ DefaulterFlag : int 0 0 0 0 0 0 0 0 0 0 ...
$ DefaulterType : int 0 0 0 0 0 0 0 0 0 0 ...
$ DATASET : chr "" "BUILD" "BUILD" "BUILD" ...
- attr(*, ".internal.selfref")=<externalptr>
# convert 'Contract Status' to a factor
autoF.dt[, ContractStatus := factor(ContractStatus)]
# convert 'PROFBUS' to a factor
autoF.dt[, PROFBUS := factor(PROFBUS)]
# convert 'QUALHSC' to a factor
autoF.dt[, QUALHSC := factor(QUALHSC)]
# convert 'QUAL_PG' to a factor
autoF.dt[, QUAL_PG := factor(QUAL_PG)]
# convert 'SEXCODE' to a factor
autoF.dt[, SEXCODE := factor(SEXCODE)]
# convert 'FULLPDC' to a factor
autoF.dt[, FULLPDC := factor(FULLPDC)]
# convert 'FRICODE' to a factor
autoF.dt[, FRICODE := factor(FRICODE)]
# convert 'WASHCODE' to a factor
autoF.dt[, WASHCODE := factor(WASHCODE)]
# convert 'DefaulterFlag' to a factor
autoF.dt[, DefaulterFlag := factor(DefaulterFlag)]
# convert 'DefaulterType' to a factor
autoF.dt[, DefaulterType := factor(DefaulterType)]
# convert 'Region' to a factor
autoF.dt[, Region := factor(Region)]
# convert 'Branch' to a factor
autoF.dt[, Branch := factor(Branch)]
# convert 'DATASET' to a factor
autoF.dt[, DATASET := factor(DATASET)]
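The thirteen individual conversions above can also be collapsed into a single data.table call; a minimal sketch (factorCols is an illustrative name, not from the original code):
# convert all categorical columns to factors in one step
factorCols <- c("ContractStatus", "PROFBUS", "QUALHSC", "QUAL_PG",
                "SEXCODE", "FULLPDC", "FRICODE", "WASHCODE",
                "Region", "Branch", "DefaulterFlag", "DefaulterType",
                "DATASET")
autoF.dt[, (factorCols) := lapply(.SD, factor), .SDcols = factorCols]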
# verify conversion
str(autoF.dt)
Classes 'data.table' and 'data.frame': 28906 obs. of 21 variables:
$ Agmt No : chr "AP18100057" "AP18100140" "AP18100198" "AP18100217" ...
$ ContractStatus: Factor w/ 4 levels "Closed","Foreclosed",..: 1 1 1 1 1 1 1 1 1 1 ...
$ StartDate : chr "19-01-01" "10-05-01" "05-08-01" "03-09-01" ...
$ AGE : int 26 28 32 31 36 33 41 47 43 27 ...
$ NOOFDEPE : int 2 2 2 0 2 2 2 0 0 0 ...
$ MTHINCTH : num 4.5 5.59 8.8 5 12 ...
$ SALDATFR : num 1 1 1 1 1 1 1 1 0.97 1 ...
$ TENORYR : num 1.5 2 1 1 1 2 1 2 1.5 2 ...
$ DWNPMFR : num 0.27 0.25 0.51 0.66 0.17 0.18 0.37 0.42 0.27 0.47 ...
$ PROFBUS : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
$ QUALHSC : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 2 1 1 1 ...
$ QUAL_PG : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
$ SEXCODE : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
$ FULLPDC : Factor w/ 2 levels "0","1": 2 2 2 2 2 1 1 2 2 2 ...
$ FRICODE : Factor w/ 2 levels "0","1": 1 2 2 2 2 1 1 1 1 1 ...
$ WASHCODE : Factor w/ 2 levels "0","1": 1 1 2 2 1 1 1 1 1 1 ...
$ Region : Factor w/ 8 levels "AP1","AP2","Chennai",..: 2 2 2 2 2 2 2 2 2 2 ...
$ Branch : Factor w/ 14 levels "Bangalore","Chennai",..: 14 14 14 14 14 14 14 14 14 14 ...
$ DefaulterFlag : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
$ DefaulterType : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
$ DATASET : Factor w/ 3 levels "","BUILD","VALIDATE": 1 2 2 2 2 2 2 2 2 2 ...
- attr(*, ".internal.selfref")=<externalptr>
# descriptive statistics of the dataframe
library(psych)
describe(autoF.dt)[, c(1:5)]
vars n mean sd median
Agmt No* 1 28906 NaN NA NA
ContractStatus* 2 28906 1.33 0.77 1.00
StartDate* 3 28906 NaN NA NA
AGE 4 28906 36.44 9.82 35.00
NOOFDEPE 5 28906 2.85 1.61 3.00
MTHINCTH 6 28906 8.94 4.81 8.00
SALDATFR 7 28906 0.44 0.46 0.17
TENORYR 8 28906 1.28 0.52 1.00
DWNPMFR 9 28906 0.38 0.16 0.38
PROFBUS* 10 28906 1.15 0.36 1.00
QUALHSC* 11 28906 1.23 0.42 1.00
QUAL_PG* 12 28906 1.04 0.20 1.00
SEXCODE* 13 28906 1.92 0.27 2.00
FULLPDC* 14 28906 1.39 0.49 1.00
FRICODE* 15 28906 1.42 0.49 1.00
WASHCODE* 16 28906 1.19 0.39 1.00
Region* 17 28906 5.33 1.51 6.00
Branch* 18 28906 5.93 3.47 6.00
DefaulterFlag* 19 28906 1.71 0.45 2.00
DefaulterType* 20 28906 1.85 0.63 2.00
DATASET* 21 28906 2.52 0.50 3.00
Reserve 75% of the data for training and 25% for testing
# loading the package
library(caTools)
# fix the random seed for reproducibility
set.seed(123)
# splitting the data, stratified on DefaulterFlag
split <- sample.split(autoF.dt$DefaulterFlag, SplitRatio = 0.75)
# creating the training set
trainingSet <- subset(autoF.dt, split == TRUE)
# creating the test set
testSet <- subset(autoF.dt, split == FALSE)
# dimensions of the full data
dim(autoF.dt)
[1] 28906 21
# dimensions of the training, testing data
dim(trainingSet)
[1] 21679 21
dim(testSet)
[1] 7227 21
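Because sample.split stratifies on the label it is given, the proportion of defaulters should be nearly identical in the full, training, and test data. A quick check, as a sketch:
# compare the class balance across the three datasets
prop.table(table(autoF.dt$DefaulterFlag))
prop.table(table(trainingSet$DefaulterFlag))
prop.table(table(testSet$DefaulterFlag))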
library(caret)
# Control Parameters to be used in the Decision Tree
ctrl <- trainControl(method = "cv",
number = 8,
classProbs = TRUE,
summaryFunction = twoClassSummary)
# Technical Details
# method : resampling method used, e.g. {"boot", "optimism_boot", "boot_all", "cv", "repeatedcv", "LOOCV", "LGOCV"}
# number : number of folds
# summaryFunction: compute performance metrics across resamples.
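For instance, the same control object could use repeated cross-validation instead of a single pass; a sketch (ctrlRepeated is an illustrative name):
# 8-fold cross-validation, repeated 3 times
ctrlRepeated <- trainControl(method = "repeatedcv",
                             number = 8,
                             repeats = 3,
                             classProbs = TRUE,
                             summaryFunction = twoClassSummary)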
set.seed(2345)
library(caret)
levels(trainingSet$DefaulterFlag) <- c("No", "Yes")
dTreeInfoGain <- train(DefaulterFlag ~ AGE
+ NOOFDEPE
+ MTHINCTH
+ SALDATFR
+ TENORYR
+ DWNPMFR
+ PROFBUS
+ QUALHSC
+ QUAL_PG
+ SEXCODE
+ FULLPDC
+ FRICODE
+ WASHCODE
+ Region,
data = trainingSet,
method = "rpart",
parms = list(split = "information"),
trControl = ctrl,
metric = "ROC")
CART
21679 samples
14 predictor
2 classes: 'No', 'Yes'
No pre-processing
Resampling: Cross-Validated (8 fold)
Summary of sample sizes: 18970, 18969, 18969, 18969, 18969, 18969, ...
Resampling results across tuning parameters:
cp ROC Sens Spec
0.01024164 0.6782253 0.2632497 0.9163298
0.01152184 0.6779262 0.2550870 0.9187274
0.01408225 0.6337774 0.1922215 0.9371999
ROC was used to select the optimal model using the largest value.
The final value used for the model was cp = 0.01024164.
dTreeInfoGain$bestTune
cp
1 0.01024164
cp (the complexity parameter) is the “minimum benefit” a split must add to the tree; it is used to control the size of the decision tree and to select the optimal tree size.
If a candidate split does not improve the fit at the current node by at least cp, the split is not attempted and tree building stops at that node.
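By default, caret tries only a few cp values (the three shown in the output above). To search a finer grid, a tuneGrid can be supplied to train(); a sketch (cpGrid and dTreeTuned are illustrative names):
# search cp over a finer grid of candidate values
cpGrid <- expand.grid(cp = seq(0.001, 0.02, by = 0.001))
dTreeTuned <- train(DefaulterFlag ~ AGE + NOOFDEPE + MTHINCTH + SALDATFR
                    + TENORYR + DWNPMFR + PROFBUS + QUALHSC + QUAL_PG
                    + SEXCODE + FULLPDC + FRICODE + WASHCODE + Region,
                    data = trainingSet,
                    method = "rpart",
                    parms = list(split = "information"),
                    tuneGrid = cpGrid,
                    trControl = ctrl,
                    metric = "ROC")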
# plot complexity parameter
plot(dTreeInfoGain)
Recall that the data has multiple explanatory variables.
colnames(autoF.dt)
[1] "Agmt No" "ContractStatus" "StartDate" "AGE"
[5] "NOOFDEPE" "MTHINCTH" "SALDATFR" "TENORYR"
[9] "DWNPMFR" "PROFBUS" "QUALHSC" "QUAL_PG"
[13] "SEXCODE" "FULLPDC" "FRICODE" "WASHCODE"
[17] "Region" "Branch" "DefaulterFlag" "DefaulterType"
[21] "DATASET"
Which ones are the most important?
varImp(dTreeInfoGain)
rpart variable importance
Overall
FULLPDC1 100.000
TENORYR 48.740
DWNPMFR 36.580
RegionChennai 21.117
FRICODE1 16.890
SALDATFR 5.642
AGE 5.222
PROFBUS1 4.909
RegionKA1 0.000
RegionKE2 0.000
RegionTN1 0.000
WASHCODE1 0.000
MTHINCTH 0.000
SEXCODE1 0.000
RegionVellore 0.000
RegionTN2 0.000
QUALHSC1 0.000
QUAL_PG1 0.000
NOOFDEPE 0.000
RegionAP2 0.000
# important variables
plot(varImp(dTreeInfoGain, scale = TRUE),
     main = "Important Variables")
rpart.plot
# visualization
library(rpart.plot)
prp(dTreeInfoGain$finalModel, box.palette = "Reds", tweak = 1.2, varlen = 20)
Segment 1:
Consumers who do not submit PDCs (post-dated checks)
Segment 2:
Consumers who (i) submit PDCs; (ii) have a loan tenure >= 1.4 years
Segment 3:
Consumers who (i) submit PDCs; (ii) have a loan tenure < 1.4 years; (iii) do not live in Chennai; (iv) made a down payment < 37% of the loan
Segment 4:
Consumers who (i) submit PDCs; (ii) have a loan tenure < 1.4 years; (iii) do not live in Chennai; (iv) made a down payment >= 37% of the loan; (v) are under 39 years of age
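These segments can also be read off the fitted tree programmatically; rpart.plot provides rpart.rules(), which prints one rule per leaf (a sketch; cover = TRUE adds the share of cases each rule covers):
# print the decision rules behind the segments above
library(rpart.plot)
rpart.rules(dTreeInfoGain$finalModel, cover = TRUE)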
# gini index
library(caret)
levels(trainingSet$DefaulterFlag) <- c("No", "Yes")
dTreeGiniIndex <- train(DefaulterFlag ~ AGE
+ NOOFDEPE
+ MTHINCTH
+ SALDATFR
+ TENORYR
+ DWNPMFR
+ PROFBUS
+ QUALHSC
+ QUAL_PG
+ SEXCODE
+ FULLPDC
+ FRICODE
+ WASHCODE
+ Region,
data = trainingSet,
method = "rpart",
parms = list(split = "gini"),
trControl = ctrl,
metric = "ROC")
CART
21679 samples
14 predictor
2 classes: 'No', 'Yes'
No pre-processing
Resampling: Cross-Validated (8 fold)
Summary of sample sizes: 18970, 18969, 18969, 18970, 18968, 18969, ...
Resampling results across tuning parameters:
cp ROC Sens Spec
0.01024164 0.6784573 0.2643651 0.9162687
0.01152184 0.6781444 0.2800501 0.9077798
0.01408225 0.6557567 0.2642051 0.9101078
ROC was used to select the optimal model using the largest value.
The final value used for the model was cp = 0.01024164.
# plot complexity parameter
plot(dTreeGiniIndex)
rpart.plot (Gini Index)
# visualization (Gini index)
library(rpart.plot)
prp(dTreeGiniIndex$finalModel, box.palette = "Reds", tweak = 1.2)
Comparing the Information Gain and Gini Index trees: we get the same decision tree from both methods.
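This can be verified programmatically: an rpart fit records the split variable for each node in $frame$var, so matching sequences imply the same tree structure. A sketch:
# do the two trees split on the same variables, node for node?
identical(as.character(dTreeInfoGain$finalModel$frame$var),
          as.character(dTreeGiniIndex$finalModel$frame$var))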
# prediction on test data
predClassTestInfoGain <- predict(dTreeInfoGain,
testSet,
type = 'raw')
levels(testSet$DefaulterFlag) <- c("No", "Yes")
# confusion matrix
confusionMatrix(predClassTestInfoGain, testSet$DefaulterFlag,
positive = "Yes")
Confusion Matrix and Statistics
Reference
Prediction No Yes
No 434 297
Yes 1649 4847
Accuracy : 0.7307
95% CI : (0.7203, 0.7409)
No Information Rate : 0.7118
P-Value [Acc > NIR] : 0.0001805
Kappa : 0.1867
Mcnemar's Test P-Value : < 2.2e-16
Sensitivity : 0.9423
Specificity : 0.2084
Pos Pred Value : 0.7462
Neg Pred Value : 0.5937
Prevalence : 0.7118
Detection Rate : 0.6707
Detection Prevalence : 0.8989
Balanced Accuracy : 0.5753
'Positive' Class : Yes
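The headline statistics follow directly from the four cells of the matrix; a worked check (TP, FN, TN, FP are illustrative names for the cell counts):
TP <- 4847; FN <- 297; TN <- 434; FP <- 1649
TP / (TP + FN)                   # sensitivity = 0.9423
TN / (TN + FP)                   # specificity = 0.2084
(TP + TN) / (TP + TN + FP + FN)  # accuracy = 0.7307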
predProbTestInfoGain <- predict(dTreeInfoGain, testSet, type = "prob")
library(ROCR)
# use the predicted probability of the positive class ("Yes")
lgPredObjtree <- prediction(predProbTestInfoGain[, "Yes"], testSet$DefaulterFlag)
lgPerfObjtree <- performance(lgPredObjtree, "tpr", "fpr")
plot(lgPerfObjtree, main = "ROC Curve", col = 2, lwd = 2)
abline(a = 0, b = 1, lwd = 2, lty = 3, col = "black")
# auc for decision tree
aucTree <- performance(lgPredObjtree, measure = "auc")
aucTree <- aucTree@y.values[[1]]
aucTree
[1] 0.6829464
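The Gini-index tree can be scored the same way for comparison; a sketch (predProbTestGini and giniPredObj are illustrative names). Since both criteria produced the same tree, the AUC should match:
# auc for the Gini-index decision tree on the same test set
predProbTestGini <- predict(dTreeGiniIndex, testSet, type = "prob")
giniPredObj <- prediction(predProbTestGini[, "Yes"], testSet$DefaulterFlag)
performance(giniPredObj, measure = "auc")@y.values[[1]]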