library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(caret)
## Loading required package: lattice
library(rpart)
library(rpart.plot)
library(classInt)
library(CHAID)
## Loading required package: partykit
## Loading required package: grid
## Loading required package: libcoin
## Loading required package: mvtnorm

Import Data

bankloan<-read.csv('C:/Users/User/PMS/bankloan.csv')
str(bankloan)
## 'data.frame':    700 obs. of  9 variables:
##  $ age     : int  41 27 40 41 24 41 39 43 24 36 ...
##  $ ed      : int  3 1 1 1 2 2 1 1 1 1 ...
##  $ employ  : int  17 10 15 15 2 5 20 12 3 0 ...
##  $ address : int  12 6 14 14 0 5 9 11 4 13 ...
##  $ income  : int  176 31 55 120 28 25 67 38 19 25 ...
##  $ debtinc : num  9.3 17.3 5.5 2.9 17.3 10.2 30.6 3.6 24.4 19.7 ...
##  $ creddebt: num  11.359 1.362 0.856 2.659 1.787 ...
##  $ othdebt : num  5.009 4.001 2.169 0.821 3.057 ...
##  $ default : int  1 0 0 0 1 0 0 0 1 0 ...

Peubah data:

  1. Age: age in years

  2. ed: education level

  3. employ: years with current employer

  4. address: years at current address

  5. income: household income in thousands

  6. debtinc: debt to income ratio (x100)

  7. creddebt: credit card debt in thousands

  8. othdebt: other debt in thousands

  9. default: default status (1/0)

#mengubah type peubah menjadi factor
bankloan$default<-factor(bankloan$default,
                         labels = c('non','def'))
bankloan$ed<-as.factor(bankloan$ed)

#memisahkan data training-testing
set.seed(100)
idx<-createDataPartition(bankloan$default,
                         p=0.3, list=F)
banktrain<-bankloan[-idx,]
banktest<-bankloan[idx,]

Eksplorasi Data

p<-ggplot(bankloan,aes(x=debtinc,fill=default))+ theme(legend.position = 'bottom')
p1<-p+geom_density(alpha=.3)
p2<-p+geom_boxplot()
grid.arrange(p1,p2,ncol=2)

p <- ggplot(bankloan,aes(x=address,fill=default)) + theme(legend.position="bottom")
p1 <- p + geom_density(alpha=.3)
p2 <- p + geom_boxplot()
grid.arrange(p1, p2, ncol=2)

p <- ggplot(bankloan,aes(x=employ,fill=default)) + theme(legend.position="bottom")
p1 <- p + geom_density(alpha=.3)
p2 <- p + geom_boxplot()
grid.arrange(p1, p2, ncol=2)

CART

#splitting criteria: Information Gain >> C5.0
model.cart1<-rpart(default~., data=banktrain,
                   parms = list(split='information'))
rpart.plot(model.cart1)

#Splitting Criteria: Gini Ratio >> CART
model.cart2<-rpart(default~., data=banktrain,
                   parms = list(split='gini'))
rpart.plot(model.cart2)

CHAID

CHAID menggunakan nilai chi-square sebagai kriteria pemberhentian algoritma. CHAID mendasarkan pada tabel frekuensi, sehingga semua peubah harus berupa kategorik. beberapa cara mengubah peubah menjadi kategorik diantaranya:

#menngunakan cut dgn break sendiri
bankloan1<-bankloan
bankloan1$age<-cut(bankloan1$age,
                   breaks=c(20,30,40,50,56),
                   label=1:4,
                   include.lowest=T,ordered_result=T)
bankloan1$employ<-cut(bankloan1$employ,
                      breaks=c(0,10,20,31),
                      label=1:3,
                      include.lowest=T,ordered_result=T)
bankloan1$address<-cut(bankloan1$address,
                       breaks=c(0,10,20,34),
                       label=1:3,
                       include.lowest=T,ordered_result=T)
bankloan1$income<-cut(bankloan1$income,
                      breaks=c(14,30,45,60,446),
                      label=1:4,
                      include.lowest=T,ordered_result=T)
bankloan1$debtinc<-cut(bankloan1$debtinc,
                       breaks=c(0.4,5,9,15,41.3),
                       label=1:4,
                       include.lowest=T,ordered_result=T)

#menggunakan classIntervals >>equal width
eqwid<-classIntervals(bankloan1$creddebt,4,
                      style = 'equal')
eqwid$brks
## [1]  0.011696  5.149099 10.286503 15.423906 20.561310
bankloan1$creddebt<-cut(bankloan1$creddebt,
                        breaks=eqwid$brks,
                        label=1:4,
                        include.lowest=T,ordered_result=T)

#equal frequency
eqfreq<-classIntervals(bankloan1$othdebt,4,
                       style = 'quantile')
eqfreq$brks
## [1]  0.045584  1.044178  1.987567  3.923065 27.033600
bankloan1$othdebt<-cut(bankloan1$othdebt,
                       breaks = eqfreq$brks,
                       label=1:4,
                       include.lowest=T,ordered_result=T)
str(bankloan1)
## 'data.frame':    700 obs. of  9 variables:
##  $ age     : Ord.factor w/ 4 levels "1"<"2"<"3"<"4": 3 1 2 3 1 3 2 3 1 2 ...
##  $ ed      : Factor w/ 5 levels "1","2","3","4",..: 3 1 1 1 2 2 1 1 1 1 ...
##  $ employ  : Ord.factor w/ 3 levels "1"<"2"<"3": 2 1 2 2 1 1 2 2 1 1 ...
##  $ address : Ord.factor w/ 3 levels "1"<"2"<"3": 2 1 2 2 1 1 1 2 1 2 ...
##  $ income  : Ord.factor w/ 4 levels "1"<"2"<"3"<"4": 4 2 3 4 1 1 4 2 1 1 ...
##  $ debtinc : Ord.factor w/ 4 levels "1"<"2"<"3"<"4": 3 4 2 1 4 3 4 1 4 4 ...
##  $ creddebt: Ord.factor w/ 4 levels "1"<"2"<"3"<"4": 3 1 1 1 1 1 1 1 1 1 ...
##  $ othdebt : Ord.factor w/ 4 levels "1"<"2"<"3"<"4": 4 4 3 1 3 3 4 2 3 3 ...
##  $ default : Factor w/ 2 levels "non","def": 2 1 1 1 2 1 1 1 2 1 ...
banktrain1 <- bankloan1[-idx,]
banktest1 <- bankloan1[idx,]
model.chaid1 <- chaid(default~.,data=banktrain1)
plot(model.chaid1)

pred.cart1 <- predict(model.cart1,banktest,type="class")
pred.cart2 <- predict(model.cart2,banktest,type="class")
pred.chaid1 <- predict(model.chaid1,banktest1)

confusionMatrix(pred.cart1, banktest$default, positive = "def")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction non def
##        non 146  31
##        def  10  24
##                                           
##                Accuracy : 0.8057          
##                  95% CI : (0.7458, 0.8568)
##     No Information Rate : 0.7393          
##     P-Value [Acc > NIR] : 0.015084        
##                                           
##                   Kappa : 0.4248          
##                                           
##  Mcnemar's Test P-Value : 0.001787        
##                                           
##             Sensitivity : 0.4364          
##             Specificity : 0.9359          
##          Pos Pred Value : 0.7059          
##          Neg Pred Value : 0.8249          
##              Prevalence : 0.2607          
##          Detection Rate : 0.1137          
##    Detection Prevalence : 0.1611          
##       Balanced Accuracy : 0.6861          
##                                           
##        'Positive' Class : def             
## 
confusionMatrix(pred.cart2, banktest$default, positive = "def")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction non def
##        non 132  29
##        def  24  26
##                                           
##                Accuracy : 0.7488          
##                  95% CI : (0.6847, 0.8058)
##     No Information Rate : 0.7393          
##     P-Value [Acc > NIR] : 0.4117          
##                                           
##                   Kappa : 0.3285          
##                                           
##  Mcnemar's Test P-Value : 0.5827          
##                                           
##             Sensitivity : 0.4727          
##             Specificity : 0.8462          
##          Pos Pred Value : 0.5200          
##          Neg Pred Value : 0.8199          
##              Prevalence : 0.2607          
##          Detection Rate : 0.1232          
##    Detection Prevalence : 0.2370          
##       Balanced Accuracy : 0.6594          
##                                           
##        'Positive' Class : def             
## 
confusionMatrix(pred.chaid1, banktest1$default, positive = "def")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction non def
##        non 144  32
##        def  12  23
##                                           
##                Accuracy : 0.7915          
##                  95% CI : (0.7304, 0.8442)
##     No Information Rate : 0.7393          
##     P-Value [Acc > NIR] : 0.047372        
##                                           
##                   Kappa : 0.3868          
##                                           
##  Mcnemar's Test P-Value : 0.004179        
##                                           
##             Sensitivity : 0.4182          
##             Specificity : 0.9231          
##          Pos Pred Value : 0.6571          
##          Neg Pred Value : 0.8182          
##              Prevalence : 0.2607          
##          Detection Rate : 0.1090          
##    Detection Prevalence : 0.1659          
##       Balanced Accuracy : 0.6706          
##                                           
##        'Positive' Class : def             
## 

Tuning Hyperparameter

Pada metode CART hyperparameter yang dituning adalah maxdepth (kedalaman pohon) dan minsplit

#Fungsi untuk melakukan tuning hyperparameter CART
tuning<-function(splitting){
  akurasi=c()
  sensitivitas=c()
  spesifikasi=c()
  depth=c()
  minsplit=c()
  i=1
  for (k in (3:20)){
    for(j in 3:10){
      modelcart=rpart(default~.,data=banktrain,
                    parms = list(split=splitting),
                    control=rpart.control(maxdepth=j,
                                          minsplit=k))
      prediksi=predict(modelcart,banktest,type="class")
      tabel <- confusionMatrix(prediksi, banktest$default,
                               positive = "def")
      akurasi[i]=tabel$overall[1]
      sensitivitas[i]=tabel$byClass[1]
      spesifikasi[i]=tabel$byClass[2]
      depth[i]=j
      minsplit[i]=k
      i=i+1
    }
  }
  akhir=data.frame(minsplit,depth,akurasi,sensitivitas,spesifikasi)
  return(akhir)
}

tuning_entropi=tuning('information')
tuning_gini=tuning('gini')
tuning_entropi[which.max(tuning_entropi$akurasi),]
##   minsplit depth   akurasi sensitivitas spesifikasi
## 3        3     5 0.8151659    0.4727273   0.9358974
tuning_gini[which.max(tuning_gini$akurasi),]
##   minsplit depth   akurasi sensitivitas spesifikasi
## 1        3     3 0.7819905          0.4   0.9166667

Berdasarkan hasil tuning, jika metode splitting menggunakan information gain maka hyperparameter optimum adalah minsplit=3 dan depth=5, jika metode splitting menggunakan gini ratio maka hyperparameter optimum adalah minsplit=3 dan maxdepth=3.

model_cart_opt1=rpart(default~.,data=banktrain,
                    parms = list(split='information'),
                    control=rpart.control(maxdepth=5,
                                          minsplit=3))
rpart.plot(model_cart_opt1)

model_cart_opt2=rpart(default~.,data=banktrain,
                    parms = list(split='gini'),
                    control=rpart.control(maxdepth=3,
                                          minsplit=3))
rpart.plot(model_cart_opt2)

Pada metode CHAID hyperparameter yang dituning adalah alpha4 yaitu nilai taraf signifikansi.

# Fungsi untuk tuning hyperparameter CHAID
tuning_chaid<-function(){
  akurasi=c()
  sensitivitas=c()
  spesifikasi=c()
  alpha=c()
  i=1
  for(k in seq(0.01,0.05,0.005)){
    modelchaid=chaid(default~.,data=banktrain1,
                      control=chaid_control(alpha4=k))
    prediksi=predict(modelchaid,banktest1)
    tabel <- confusionMatrix(prediksi, banktest1$default, positive = "def")
    akurasi[i]=tabel$overall[1]
    sensitivitas[i]=tabel$byClass[1]
    spesifikasi[i]=tabel$byClass[2]
    alpha[i]=k
    i=i+1
  }
  akhir=data.frame(alpha,akurasi,sensitivitas,spesifikasi)
  return(akhir)
}

modeltuning_chaid<-tuning_chaid()
modeltuning_chaid
##   alpha   akurasi sensitivitas spesifikasi
## 1 0.010 0.7630332    0.4545455   0.8717949
## 2 0.015 0.7630332    0.4545455   0.8717949
## 3 0.020 0.7630332    0.4545455   0.8717949
## 4 0.025 0.7630332    0.4545455   0.8717949
## 5 0.030 0.7630332    0.4545455   0.8717949
## 6 0.035 0.7630332    0.4545455   0.8717949
## 7 0.040 0.7630332    0.4545455   0.8717949
## 8 0.045 0.7630332    0.4545455   0.8717949
## 9 0.050 0.7914692    0.4181818   0.9230769
modeltuning_chaid[which.max(modeltuning_chaid$akurasi),]
##   alpha   akurasi sensitivitas spesifikasi
## 9  0.05 0.7914692    0.4181818   0.9230769

Berdasarkan hasil tuning diperoleh nilai alpha4 optimum adalah 0.05

modelchaid=chaid(default~.,data=banktrain1,
                      control=chaid_control(alpha4=0.05))
plot(modelchaid)