library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(caret)
## Loading required package: lattice
library(rpart)
library(rpart.plot)
library(classInt)
library(CHAID)
## Loading required package: partykit
## Loading required package: grid
## Loading required package: libcoin
## Loading required package: mvtnorm
bankloan<-read.csv('C:/Users/User/PMS/bankloan.csv')
str(bankloan)
## 'data.frame': 700 obs. of 9 variables:
## $ age : int 41 27 40 41 24 41 39 43 24 36 ...
## $ ed : int 3 1 1 1 2 2 1 1 1 1 ...
## $ employ : int 17 10 15 15 2 5 20 12 3 0 ...
## $ address : int 12 6 14 14 0 5 9 11 4 13 ...
## $ income : int 176 31 55 120 28 25 67 38 19 25 ...
## $ debtinc : num 9.3 17.3 5.5 2.9 17.3 10.2 30.6 3.6 24.4 19.7 ...
## $ creddebt: num 11.359 1.362 0.856 2.659 1.787 ...
## $ othdebt : num 5.009 4.001 2.169 0.821 3.057 ...
## $ default : int 1 0 0 0 1 0 0 0 1 0 ...
Peubah data:
Age: age in years
ed: education level
employ: years with current employer
address: years at current address
income: household income in thousands
debtinc: debt to income ratio (x100)
creddebt: credit card debt in thousands
othdebt: other debt in thousands
default: default status (1/0)
#mengubah type peubah menjadi factor
bankloan$default<-factor(bankloan$default,
labels = c('non','def'))
bankloan$ed<-as.factor(bankloan$ed)
#memisahkan data training-testing
set.seed(100)
idx<-createDataPartition(bankloan$default,
p=0.3, list=F)
banktrain<-bankloan[-idx,]
banktest<-bankloan[idx,]
p<-ggplot(bankloan,aes(x=debtinc,fill=default))+ theme(legend.position = 'bottom')
p1<-p+geom_density(alpha=.3)
p2<-p+geom_boxplot()
grid.arrange(p1,p2,ncol=2)
p <- ggplot(bankloan,aes(x=address,fill=default)) + theme(legend.position="bottom")
p1 <- p + geom_density(alpha=.3)
p2 <- p + geom_boxplot()
grid.arrange(p1, p2, ncol=2)
p <- ggplot(bankloan,aes(x=employ,fill=default)) + theme(legend.position="bottom")
p1 <- p + geom_density(alpha=.3)
p2 <- p + geom_boxplot()
grid.arrange(p1, p2, ncol=2)
#splitting criteria: Information Gain >> C5.0
model.cart1<-rpart(default~., data=banktrain,
parms = list(split='information'))
rpart.plot(model.cart1)
#Splitting Criteria: Gini Ratio >> CART
model.cart2<-rpart(default~., data=banktrain,
parms = list(split='gini'))
rpart.plot(model.cart2)
CHAID menggunakan nilai chi-square sebagai kriteria pemberhentian algoritma. CHAID mendasarkan pada tabel frekuensi, sehingga semua peubah harus berupa kategorik. beberapa cara mengubah peubah menjadi kategorik diantaranya:
langsung menggunakan fungsi as.factor()
menggunakan cut dengan breaks yang ditentukan sendiri
menggunakan fungsi cut dengan breaks yang sama lebar atau sama banyak (dengan fungsi classIntervals dari package classInt)
menggunakan fungsi mdlp dari package discretization
menggunakan fungsi chiM dari package discretization
#menngunakan cut dgn break sendiri
bankloan1<-bankloan
bankloan1$age<-cut(bankloan1$age,
breaks=c(20,30,40,50,56),
label=1:4,
include.lowest=T,ordered_result=T)
bankloan1$employ<-cut(bankloan1$employ,
breaks=c(0,10,20,31),
label=1:3,
include.lowest=T,ordered_result=T)
bankloan1$address<-cut(bankloan1$address,
breaks=c(0,10,20,34),
label=1:3,
include.lowest=T,ordered_result=T)
bankloan1$income<-cut(bankloan1$income,
breaks=c(14,30,45,60,446),
label=1:4,
include.lowest=T,ordered_result=T)
bankloan1$debtinc<-cut(bankloan1$debtinc,
breaks=c(0.4,5,9,15,41.3),
label=1:4,
include.lowest=T,ordered_result=T)
#menggunakan classIntervals >>equal width
eqwid<-classIntervals(bankloan1$creddebt,4,
style = 'equal')
eqwid$brks
## [1] 0.011696 5.149099 10.286503 15.423906 20.561310
bankloan1$creddebt<-cut(bankloan1$creddebt,
breaks=eqwid$brks,
label=1:4,
include.lowest=T,ordered_result=T)
#equal frequency
eqfreq<-classIntervals(bankloan1$othdebt,4,
style = 'quantile')
eqfreq$brks
## [1] 0.045584 1.044178 1.987567 3.923065 27.033600
bankloan1$othdebt<-cut(bankloan1$othdebt,
breaks = eqfreq$brks,
label=1:4,
include.lowest=T,ordered_result=T)
str(bankloan1)
## 'data.frame': 700 obs. of 9 variables:
## $ age : Ord.factor w/ 4 levels "1"<"2"<"3"<"4": 3 1 2 3 1 3 2 3 1 2 ...
## $ ed : Factor w/ 5 levels "1","2","3","4",..: 3 1 1 1 2 2 1 1 1 1 ...
## $ employ : Ord.factor w/ 3 levels "1"<"2"<"3": 2 1 2 2 1 1 2 2 1 1 ...
## $ address : Ord.factor w/ 3 levels "1"<"2"<"3": 2 1 2 2 1 1 1 2 1 2 ...
## $ income : Ord.factor w/ 4 levels "1"<"2"<"3"<"4": 4 2 3 4 1 1 4 2 1 1 ...
## $ debtinc : Ord.factor w/ 4 levels "1"<"2"<"3"<"4": 3 4 2 1 4 3 4 1 4 4 ...
## $ creddebt: Ord.factor w/ 4 levels "1"<"2"<"3"<"4": 3 1 1 1 1 1 1 1 1 1 ...
## $ othdebt : Ord.factor w/ 4 levels "1"<"2"<"3"<"4": 4 4 3 1 3 3 4 2 3 3 ...
## $ default : Factor w/ 2 levels "non","def": 2 1 1 1 2 1 1 1 2 1 ...
banktrain1 <- bankloan1[-idx,]
banktest1 <- bankloan1[idx,]
model.chaid1 <- chaid(default~.,data=banktrain1)
plot(model.chaid1)
pred.cart1 <- predict(model.cart1,banktest,type="class")
pred.cart2 <- predict(model.cart2,banktest,type="class")
pred.chaid1 <- predict(model.chaid1,banktest1)
confusionMatrix(pred.cart1, banktest$default, positive = "def")
## Confusion Matrix and Statistics
##
## Reference
## Prediction non def
## non 146 31
## def 10 24
##
## Accuracy : 0.8057
## 95% CI : (0.7458, 0.8568)
## No Information Rate : 0.7393
## P-Value [Acc > NIR] : 0.015084
##
## Kappa : 0.4248
##
## Mcnemar's Test P-Value : 0.001787
##
## Sensitivity : 0.4364
## Specificity : 0.9359
## Pos Pred Value : 0.7059
## Neg Pred Value : 0.8249
## Prevalence : 0.2607
## Detection Rate : 0.1137
## Detection Prevalence : 0.1611
## Balanced Accuracy : 0.6861
##
## 'Positive' Class : def
##
confusionMatrix(pred.cart2, banktest$default, positive = "def")
## Confusion Matrix and Statistics
##
## Reference
## Prediction non def
## non 132 29
## def 24 26
##
## Accuracy : 0.7488
## 95% CI : (0.6847, 0.8058)
## No Information Rate : 0.7393
## P-Value [Acc > NIR] : 0.4117
##
## Kappa : 0.3285
##
## Mcnemar's Test P-Value : 0.5827
##
## Sensitivity : 0.4727
## Specificity : 0.8462
## Pos Pred Value : 0.5200
## Neg Pred Value : 0.8199
## Prevalence : 0.2607
## Detection Rate : 0.1232
## Detection Prevalence : 0.2370
## Balanced Accuracy : 0.6594
##
## 'Positive' Class : def
##
confusionMatrix(pred.chaid1, banktest1$default, positive = "def")
## Confusion Matrix and Statistics
##
## Reference
## Prediction non def
## non 144 32
## def 12 23
##
## Accuracy : 0.7915
## 95% CI : (0.7304, 0.8442)
## No Information Rate : 0.7393
## P-Value [Acc > NIR] : 0.047372
##
## Kappa : 0.3868
##
## Mcnemar's Test P-Value : 0.004179
##
## Sensitivity : 0.4182
## Specificity : 0.9231
## Pos Pred Value : 0.6571
## Neg Pred Value : 0.8182
## Prevalence : 0.2607
## Detection Rate : 0.1090
## Detection Prevalence : 0.1659
## Balanced Accuracy : 0.6706
##
## 'Positive' Class : def
##
Pada metode CART hyperparameter yang dituning adalah maxdepth (kedalaman pohon) dan minsplit
#Fungsi untuk melakukan tuning hyperparameter CART
tuning<-function(splitting){
akurasi=c()
sensitivitas=c()
spesifikasi=c()
depth=c()
minsplit=c()
i=1
for (k in (3:20)){
for(j in 3:10){
modelcart=rpart(default~.,data=banktrain,
parms = list(split=splitting),
control=rpart.control(maxdepth=j,
minsplit=k))
prediksi=predict(modelcart,banktest,type="class")
tabel <- confusionMatrix(prediksi, banktest$default,
positive = "def")
akurasi[i]=tabel$overall[1]
sensitivitas[i]=tabel$byClass[1]
spesifikasi[i]=tabel$byClass[2]
depth[i]=j
minsplit[i]=k
i=i+1
}
}
akhir=data.frame(minsplit,depth,akurasi,sensitivitas,spesifikasi)
return(akhir)
}
tuning_entropi=tuning('information')
tuning_gini=tuning('gini')
tuning_entropi[which.max(tuning_entropi$akurasi),]
## minsplit depth akurasi sensitivitas spesifikasi
## 3 3 5 0.8151659 0.4727273 0.9358974
tuning_gini[which.max(tuning_gini$akurasi),]
## minsplit depth akurasi sensitivitas spesifikasi
## 1 3 3 0.7819905 0.4 0.9166667
Berdasarkan hasil tuning, jika metode splitting menggunakan information gain maka hyperparameter optimum adalah minsplit=3 dan depth=5, jika metode splitting menggunakan gini ratio maka hyperparameter optimum adalah minsplit=3 dan maxdepth=3.
model_cart_opt1=rpart(default~.,data=banktrain,
parms = list(split='information'),
control=rpart.control(maxdepth=5,
minsplit=3))
rpart.plot(model_cart_opt1)
model_cart_opt2=rpart(default~.,data=banktrain,
parms = list(split='gini'),
control=rpart.control(maxdepth=3,
minsplit=3))
rpart.plot(model_cart_opt2)
Pada metode CHAID hyperparameter yang dituning adalah alpha4 yaitu nilai taraf signifikansi.
# Fungsi untuk tuning hyperparameter CHAID
tuning_chaid<-function(){
akurasi=c()
sensitivitas=c()
spesifikasi=c()
alpha=c()
i=1
for(k in seq(0.01,0.05,0.005)){
modelchaid=chaid(default~.,data=banktrain1,
control=chaid_control(alpha4=k))
prediksi=predict(modelchaid,banktest1)
tabel <- confusionMatrix(prediksi, banktest1$default, positive = "def")
akurasi[i]=tabel$overall[1]
sensitivitas[i]=tabel$byClass[1]
spesifikasi[i]=tabel$byClass[2]
alpha[i]=k
i=i+1
}
akhir=data.frame(alpha,akurasi,sensitivitas,spesifikasi)
return(akhir)
}
modeltuning_chaid<-tuning_chaid()
modeltuning_chaid
## alpha akurasi sensitivitas spesifikasi
## 1 0.010 0.7630332 0.4545455 0.8717949
## 2 0.015 0.7630332 0.4545455 0.8717949
## 3 0.020 0.7630332 0.4545455 0.8717949
## 4 0.025 0.7630332 0.4545455 0.8717949
## 5 0.030 0.7630332 0.4545455 0.8717949
## 6 0.035 0.7630332 0.4545455 0.8717949
## 7 0.040 0.7630332 0.4545455 0.8717949
## 8 0.045 0.7630332 0.4545455 0.8717949
## 9 0.050 0.7914692 0.4181818 0.9230769
modeltuning_chaid[which.max(modeltuning_chaid$akurasi),]
## alpha akurasi sensitivitas spesifikasi
## 9 0.05 0.7914692 0.4181818 0.9230769
Berdasarkan hasil tuning diperoleh nilai alpha4 optimum adalah 0.05
modelchaid=chaid(default~.,data=banktrain1,
control=chaid_control(alpha4=0.05))
plot(modelchaid)