Supervised Classification - k-NN, LDA, Regresi Logistik, CHAID, CART
Pertemuan VIII
Library
library(ggplot2)
library(caret)
## Loading required package: lattice
library(class)
library(mvtnorm)
library(MASS)
Data
<-read.csv("dataset_37_diabetes.csv", stringsAsFactors = TRUE)
diabetesstr(diabetes)
## 'data.frame': 768 obs. of 9 variables:
## $ preg : int 6 1 8 1 0 5 3 10 2 8 ...
## $ plas : int 148 85 183 89 137 116 78 115 197 125 ...
## $ pres : int 72 66 64 66 40 74 50 0 70 96 ...
## $ skin : int 35 29 0 23 35 0 32 0 45 0 ...
## $ insu : int 0 0 0 94 168 0 88 0 543 0 ...
## $ mass : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ pedi : num 0.627 0.351 0.672 0.167 2.288 ...
## $ age : int 50 31 32 21 33 30 26 29 53 54 ...
## $ class: Factor w/ 2 levels "tested_negative",..: 2 1 2 1 2 1 2 1 2 2 ...
Pima Indians Diabetes Database
Pima Indians Diabetes Database
Sumber: https://www.jair.org/index.php/jair/article/view/10129
Deskripsi data:
The diagnostic, binary-valued variable investigated is whether the patient shows signs of diabetes according to World Health Organization criteria (i.e., if the 2 hour post-load plasma glucose was at least 200 mg/dl at any survey examination or if found during routine medical care). The population lives near Phoenix, Arizona, USA.
Several constraints were placed on the selection of these instances from a larger database. In particular, all patients here are females at least 21 years old of Pima Indian heritage. ADAP is an adaptive learning routine that generates and executes digital analogs of perceptron-like devices. It is a unique algorithm; see the paper for details.
Daftar Peubah:
preg
: Number of times pregnant
plas
: Plasma glucose concentration a 2 hours in an oral glucose tolerance test
pres
: Diastolic blood pressure (mm Hg)
skin
: Triceps skin fold thickness (mm)
insu
: 2-Hour serum insulin (mu U/ml)
mass
: Body mass index (weight in kg/(height in m)^2)
pedi
: Diabetes pedigree function
age
: Age (years)
class
: Class variable (tested_negative
ortested_positive
)
Eksplorasi Data
Hanya digunakan peubah pred
dan pedi
table(diabetes$class)
##
## tested_negative tested_positive
## 500 268
ggplot(diabetes,aes(x=pedi,fill=class)) + geom_density(alpha=.3) +
theme(legend.position="bottom")
ggplot(diabetes,aes(x=preg,fill=class)) + geom_density(alpha=.3) +
theme(legend.position="bottom")
K-Nearest Neighbors
plot(diabetes$preg,diabetes$pedi)
k-NN melakukan klasifikasi berdasarkan k tetangga terdekat, sehingga sangat tergantung pada jarak. Apapun metode perhitungan jarak yang digunakan, jarak bersifat sensitif terhadap skala dari peubah. Maka dari itu, jika skala dan rentang peubah yang digunakan berbeda-beda, perlu melakukan pembakuan (standardize) terhadap peubah tersebut. Namun tidak ada ketentuan khusus metode pembakuan mana yang harus digunakan. Pada panduan ini yang digunakan adalah pembakuan [0,1].
#standardize
<- function(X) (X-min(X))/(max(X)-min(X))
stdmaxmin <- stdmaxmin(diabetes$preg)
preg1 <- stdmaxmin(diabetes$pedi) pedi1
Untuk melakukan klasifikasi k-NN, bisa menggunakan fungsi knn dari package class. Pada ilustrasi berikut, data testing yang digunakan adalah grid pada [0,1] x [0,1] dari scatter plot peubah preg
vs pedi
.
<-NULL;
m <-b <-seq(0, 1, length.out = 70)
a
for (i in a) {
for (j in b) {
<-rbind(m, c(i, j))
m
} }
<-knn(cbind(preg1,pedi1), m, diabetes$class, k = 3)
prediksi
plot(m[,1], m[,2], col=ifelse(prediksi=="tested_positive", "cyan","yellow"), pch=ifelse(prediksi=="tested_positive",17,12), main="k=3")
points(preg1, pedi1, col=diabetes$class, pch=ifelse(diabetes$class=="tested_positive",17,12), cex=.7)
<-knn(cbind(preg1,pedi1), m, diabetes$class, k = 7)
prediksiplot(m[,1], m[,2], col=ifelse(prediksi=="tested_positive", "cyan","yellow"), pch=ifelse(prediksi=="tested_positive",17,12), main="k=7")
points(preg1, pedi1, col=diabetes$class, pch=ifelse(diabetes$class=="tested_positive",17,12), cex=.7)
Hasil pemodelan k-NN tidak menghasilkan sebuah “model” baik dalam bentuk persamaan matematis ataupun kumpulan aturan tree-logic. Yang dilakukan pada k-NN adalah menyimpan seluruh data training untuk kemudian ketika dimasukkan data testing akan dihitung jarak masing-masing amatan data testing terhadap seluruh amatan data training.
Mencari Nilai k terbaik.
Pembagian Data
<- diabetes
data.knn # data.knn[, 1:9] <- sapply(data.knn[, 1:9], as.numeric)
library(caret)
set.seed(1016)
<- createDataPartition(data.knn$class, p=0.7, list=FALSE)
acak <- data.knn[acak,]
data.training <- data.knn[-acak,] data.testing
Penyesuaian
<- data.training[,9]
y.train <- data.training[,-9]
X.train <- data.testing[,9]
y.test <- data.testing[,-9] X.test
Prediksi
<- knn(X.train, X.test, y.train, k=3)
prediksi <- mean(prediksi == y.test)
accuracy cat("Training Accuracy: ", accuracy, sep='')
## Training Accuracy: 0.7173913
Optimasi
<- c()
train_acc <- c()
valid_acc
for (i in 1:100){
set.seed(1)
<- knn(X.train, X.train, y.train, k=i)
train_pred <- c(train_acc, mean(train_pred == y.train))
train_acc
set.seed(1)
<- knn(X.train, X.test, y.train, k=i)
valid_pred <- c(valid_acc, mean(valid_pred == y.test))
valid_acc
}
plot(1:100, train_acc, pch='.', ylim=c(0.65, 1), col='salmon')
lines(1:100, train_acc, lwd=2, col='salmon')
lines(1:100, valid_acc, lwd=2, col='cornflowerblue')
legend(38, 1, legend=c("Training Acc", "Validation Acc"),
col=c("salmon", "cornflowerblue"), lty=1, lwd=2, cex=0.8)
LDA : Linear Discriminant Analysis
Ilustrasi
Pada sebaran normal tunggal, plot sebarannya seperti ini:
# Normal(7,1)
<-seq(3,11,by=.1)
x<-dnorm(x,7,1)
fxplot(x,fx,"l")
Kalau pada sebaran normal ganda satu populasi, plot sebarannya seperti ini:
# Membuat grid
<-seq(7,23,by=.2)
x1<-seq(7,23,by=.2)
x2<- NULL
grid for (i in x1) {
for (j in x2) {
<- rbind(grid, c(i, j))
grid
}
}
# Multivariate Normal
<- c(15, 15)
mean1 <- matrix(c(5, 3, 3, 5), 2, 2)
sigma # Matriks ragam-peragam sigma
## [,1] [,2]
## [1,] 5 3
## [2,] 3 5
<-dmvnorm(grid, mean1, sigma)
y<- matrix(y, length(x1), length(x2), byrow=TRUE)
z
# Perspective Plot
persp(x1, x2, z, phi=5, theta=25, col=3)
# Contour Plot
contour(x1, x2, z)
Sedangkan sebaran normal ganda dua populasi (matriks ragam-peragam sama), plot sebarannya seperti ini:
# Membuat grid
<- seq(10, 30,length.out=50)
a <- seq(10, 30,length.out=50)
b <- NULL
grid for (i in a) for (j in b) grid <- rbind(grid, c(i, j))
# Multivariate Normal
<- c(15, 15)
mean1 <- c(22, 22)
mean2 <- matrix(c(5, 3, 3, 5), 2, 2)
sigma = 0.5*dmvnorm(grid, mean1, sigma) + 0.5*dmvnorm(grid, mean2, sigma)
y <- matrix(y, length(a), length(b), byrow=TRUE)
z
# Perspective Plot
persp(a, b, z, phi=5, theta=30, col=3)
# Contour Plot
contour(a, b, z)
Ketika matriks ragam-peragamnya berbeda, plot sebarannya seperti ini:
<- matrix(c(5, 3, 3, 5), 2, 2)
sigma1 <- matrix(c(8, 3, 3, 8), 2, 2)
sigma2 = 0.5*dmvnorm(grid, mean1, sigma1) + 0.5*dmvnorm(grid, mean2, sigma2)
y <- matrix(y, length(a), length(b), byrow=TRUE)
z persp(a, b, z, phi=5, theta=30, col=3)
contour(a, b, z)
lines(c(15.5,22),c(25,14),lty=2,lwd=2,col=4)
Gambar di atas adalah ilustrasi bagaimana dua kategori pada peubah respon memiliki sebaran masing-masing dengan nilai tengah dan matriks ragam-peragam yang berbeda. LDA mencari pemisah (classifier) yang memisahkan sebaran kedua kategori respon yang digambarkan pada garis biru. Pada Quadratic Discriminant Analysis (QDA), (garis) classifier yang dibentuk bersifat kuadratik bukan linier.
LDA pada Data Diabetes
<- lda(class~preg+pedi,data=diabetes)
model.lda model.lda
## Call:
## lda(class ~ preg + pedi, data = diabetes)
##
## Prior probabilities of groups:
## tested_negative tested_positive
## 0.6510417 0.3489583
##
## Group means:
## preg pedi
## tested_negative 3.298000 0.429734
## tested_positive 4.865672 0.550500
##
## Coefficients of linear discriminants:
## LD1
## preg 0.2462512
## pedi 1.9936038
Group means pada model LDA yang didapatkan adalah vektor nilai tengah dari sebaran masing-masing kategori. Sedangkan Coefficients of linear discriminants adalah persamaan garis yang memisahkan kedua sebaran: 0.246preg + 1.993pedi = 0.
<- predict(model.lda,diabetes)
pred.lda confusionMatrix(pred.lda$class,diabetes$class,positive="tested_positive")
## Confusion Matrix and Statistics
##
## Reference
## Prediction tested_negative tested_positive
## tested_negative 453 195
## tested_positive 47 73
##
## Accuracy : 0.6849
## 95% CI : (0.6507, 0.7176)
## No Information Rate : 0.651
## P-Value [Acc > NIR] : 0.02609
##
## Kappa : 0.2046
##
## Mcnemar's Test P-Value : < 2e-16
##
## Sensitivity : 0.27239
## Specificity : 0.90600
## Pos Pred Value : 0.60833
## Neg Pred Value : 0.69907
## Prevalence : 0.34896
## Detection Rate : 0.09505
## Detection Prevalence : 0.15625
## Balanced Accuracy : 0.58919
##
## 'Positive' Class : tested_positive
##
Regresi Logistik
<-glm(class~preg+pedi, data=diabetes, family="binomial")
model.logistik model.logistik
##
## Call: glm(formula = class ~ preg + pedi, family = "binomial", data = diabetes)
##
## Coefficients:
## (Intercept) preg pedi
## -1.7806 0.1454 1.1789
##
## Degrees of Freedom: 767 Total (i.e. Null); 765 Residual
## Null Deviance: 993.5
## Residual Deviance: 930.5 AIC: 936.5
summary(model.logistik)
##
## Call:
## glm(formula = class ~ preg + pedi, family = "binomial", data = diabetes)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8653 -0.8871 -0.7228 1.2028 1.8952
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.78062 0.17833 -9.985 < 2e-16 ***
## preg 0.14538 0.02342 6.207 5.42e-10 ***
## pedi 1.17890 0.23841 4.945 7.62e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 993.48 on 767 degrees of freedom
## Residual deviance: 930.49 on 765 degrees of freedom
## AIC: 936.49
##
## Number of Fisher Scoring iterations: 4
<-predict(model.logistik, diabetes, type="response")
prob.prediksi<-as.factor(ifelse(prob.prediksi>0.5,
prediksi"tested_positive", "tested_negative"))
confusionMatrix(prediksi, diabetes$class, positive="tested_positive")
## Confusion Matrix and Statistics
##
## Reference
## Prediction tested_negative tested_positive
## tested_negative 455 198
## tested_positive 45 70
##
## Accuracy : 0.6836
## 95% CI : (0.6494, 0.7164)
## No Information Rate : 0.651
## P-Value [Acc > NIR] : 0.0311
##
## Kappa : 0.1973
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.26119
## Specificity : 0.91000
## Pos Pred Value : 0.60870
## Neg Pred Value : 0.69678
## Prevalence : 0.34896
## Detection Rate : 0.09115
## Detection Prevalence : 0.14974
## Balanced Accuracy : 0.58560
##
## 'Positive' Class : tested_positive
##
Regresi Logistik pada semua variabel
<-glm(class~., data=diabetes, family="binomial")
model.logistik model.logistik
##
## Call: glm(formula = class ~ ., family = "binomial", data = diabetes)
##
## Coefficients:
## (Intercept) preg plas pres skin insu
## -8.404696 0.123182 0.035164 -0.013296 0.000619 -0.001192
## mass pedi age
## 0.089701 0.945180 0.014869
##
## Degrees of Freedom: 767 Total (i.e. Null); 759 Residual
## Null Deviance: 993.5
## Residual Deviance: 723.4 AIC: 741.4
summary(model.logistik)
##
## Call:
## glm(formula = class ~ ., family = "binomial", data = diabetes)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5566 -0.7274 -0.4159 0.7267 2.9297
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.4046964 0.7166359 -11.728 < 2e-16 ***
## preg 0.1231823 0.0320776 3.840 0.000123 ***
## plas 0.0351637 0.0037087 9.481 < 2e-16 ***
## pres -0.0132955 0.0052336 -2.540 0.011072 *
## skin 0.0006190 0.0068994 0.090 0.928515
## insu -0.0011917 0.0009012 -1.322 0.186065
## mass 0.0897010 0.0150876 5.945 2.76e-09 ***
## pedi 0.9451797 0.2991475 3.160 0.001580 **
## age 0.0148690 0.0093348 1.593 0.111192
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 993.48 on 767 degrees of freedom
## Residual deviance: 723.45 on 759 degrees of freedom
## AIC: 741.45
##
## Number of Fisher Scoring iterations: 5
<-predict(model.logistik, diabetes, type="response")
prob.prediksi<-as.factor(ifelse(prob.prediksi>0.5,
prediksi"tested_positive", "tested_negative"))
confusionMatrix(prediksi, diabetes$class, positive="tested_positive")
## Confusion Matrix and Statistics
##
## Reference
## Prediction tested_negative tested_positive
## tested_negative 445 112
## tested_positive 55 156
##
## Accuracy : 0.7826
## 95% CI : (0.7517, 0.8112)
## No Information Rate : 0.651
## P-Value [Acc > NIR] : 1.373e-15
##
## Kappa : 0.4966
##
## Mcnemar's Test P-Value : 1.468e-05
##
## Sensitivity : 0.5821
## Specificity : 0.8900
## Pos Pred Value : 0.7393
## Neg Pred Value : 0.7989
## Prevalence : 0.3490
## Detection Rate : 0.2031
## Detection Prevalence : 0.2747
## Balanced Accuracy : 0.7360
##
## 'Positive' Class : tested_positive
##
Pertemuan IX
Anda perlu melakukan Instalasi library partykit
sebelum melakukan instalasi CHAID
install.packages("partykit")
install.packages("CHAID", repos = "http://R-Forge.R-project.org", type = "source")
Library
library(dplyr)
library(ggplot2)
library(gridExtra)
library(caret)
library(rpart)
library(rpart.plot)
library(classInt)
library(CHAID)
Data
<- read.csv("bankloan.csv") #sesuai alamat direktori
bankloan str(diabetes)
## 'data.frame': 768 obs. of 9 variables:
## $ preg : int 6 1 8 1 0 5 3 10 2 8 ...
## $ plas : int 148 85 183 89 137 116 78 115 197 125 ...
## $ pres : int 72 66 64 66 40 74 50 0 70 96 ...
## $ skin : int 35 29 0 23 35 0 32 0 45 0 ...
## $ insu : int 0 0 0 94 168 0 88 0 543 0 ...
## $ mass : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ pedi : num 0.627 0.351 0.672 0.167 2.288 ...
## $ age : int 50 31 32 21 33 30 26 29 53 54 ...
## $ class: Factor w/ 2 levels "tested_negative",..: 2 1 2 1 2 1 2 1 2 2 ...
Daftar Peubah:
age
: Age in years
ed
: Level of education
employ
: Years with current employer
address
: Years at current address
income
: Household income in thousands
debtinc
: Debt to income ratio (x100)
creddebt
: Credit card debt in thousands
othdebt
: Other debt in thousands
default
: default status (1/0)
Eksplorasi Data
$default <- factor(bankloan$default,labels=c("non","def"))
bankloan$ed <- as.factor(bankloan$ed)
bankloan
# Memisahkan training-testing
set.seed(100)
<- createDataPartition(bankloan$default, p=0.3, list=FALSE)
idx <- bankloan[-idx,]
banktrain <- bankloan[idx,] banktest
<- ggplot(bankloan,aes(x=debtinc,fill=default)) + theme(legend.position="bottom")
p <- p + geom_density(alpha=.3)
p1 <- p + geom_boxplot()
p2 grid.arrange(p1, p2, ncol=2)
<- ggplot(bankloan,aes(x=address,fill=default)) + theme(legend.position="bottom")
p <- p + geom_density(alpha=.3)
p1 <- p + geom_boxplot()
p2 grid.arrange(p1, p2, ncol=2)
<- ggplot(bankloan,aes(x=employ,fill=default)) + theme(legend.position="bottom")
p <- p + geom_density(alpha=.3)
p1 <- p + geom_boxplot()
p2 grid.arrange(p1, p2, ncol=2)
CART
# Splitting Criteria: Information Gain
<- rpart(default~.,data=banktrain,parms=list(split='information'))
model.cart1 rpart.plot(model.cart1)
# Splitting Criteria: Gini Ratio
<- rpart(default~.,data=banktrain,parms=list(split='gini'))
model.cart2 rpart.plot(model.cart2)
CHAID
Karena CHAID mendasarkan pada tabel frekuensi, sehingga perlu mengubah semua peubah menjadi kategorik. Beberapa cara mengubah peubah menjadi kategorik diantaranya, yaitu:
langsung menggunakan fungsi
as.factor()
menggunakan fungsi
cut
dengan breaks yang ditentukan sendirimenggunakan fungsi
cut
dengan breaks yang sama lebar atau sama banyak (dengan fungsiclassIntervals
dari packageclassInt
)menggunakan fungsi
mdlp
dari packagediscretization
menggunakan fungsi
chiM
dari packagediscretization
<- bankloan
bankloan1 $age <- cut(bankloan1$age,breaks=c(20,30,40,50,56),label=1:4,
bankloan1include.lowest = T)
$employ <- cut(bankloan1$employ,breaks=c(0,10,20,31),label=1:3,
bankloan1include.lowest = T)
$address <- cut(bankloan1$address,breaks=c(0,10,20,34),label=1:3,
bankloan1include.lowest = T)
$income <- cut(bankloan1$income,breaks=c(14,30,45,60,446),label=1:4,
bankloan1include.lowest = T)
$debtinc <- cut(bankloan1$debtinc,breaks=c(0.4,5,9,15,41.3),label=1:4,
bankloan1include.lowest = T)
# equal width discretization
<-classIntervals(bankloan1$creddebt, 4, style = 'equal')
eqwid$brks eqwid
## [1] 0.011696 5.149099 10.286503 15.423906 20.561310
$creddebt<-cut(bankloan1$creddebt, breaks=eqwid$brks,label=1:4,
bankloan1include.lowest=TRUE)
# equal frequency discretization
<-classIntervals(bankloan1$othdebt, 4, style = 'quantile')
eqfreq$brks eqfreq
## [1] 0.045584 1.044178 1.987567 3.923065 27.033600
$othdebt<-cut(bankloan1$othdebt, breaks=eqfreq$brks, label=1:4,
bankloan1include.lowest=TRUE)
str(bankloan1)
## 'data.frame': 700 obs. of 9 variables:
## $ age : Factor w/ 4 levels "1","2","3","4": 3 1 2 3 1 3 2 3 1 2 ...
## $ ed : Factor w/ 5 levels "1","2","3","4",..: 3 1 1 1 2 2 1 1 1 1 ...
## $ employ : Factor w/ 3 levels "1","2","3": 2 1 2 2 1 1 2 2 1 1 ...
## $ address : Factor w/ 3 levels "1","2","3": 2 1 2 2 1 1 1 2 1 2 ...
## $ income : Factor w/ 4 levels "1","2","3","4": 4 2 3 4 1 1 4 2 1 1 ...
## $ debtinc : Factor w/ 4 levels "1","2","3","4": 3 4 2 1 4 3 4 1 4 4 ...
## $ creddebt: Factor w/ 4 levels "1","2","3","4": 3 1 1 1 1 1 1 1 1 1 ...
## $ othdebt : Factor w/ 4 levels "1","2","3","4": 4 4 3 1 3 3 4 2 3 3 ...
## $ default : Factor w/ 2 levels "non","def": 2 1 1 1 2 1 1 1 2 1 ...
<- bankloan1[-idx,]
banktrain1 <- bankloan1[idx,]
banktest1 <- chaid(default~.,data=banktrain1)
model.chaid1 plot(model.chaid1)
<- predict(model.cart1,banktest,type="class")
pred.cart1 <- predict(model.cart2,banktest,type="class")
pred.cart2 <- predict(model.chaid1,banktest1)
pred.chaid1
confusionMatrix(pred.cart1, banktest$default, positive = "def")
## Confusion Matrix and Statistics
##
## Reference
## Prediction non def
## non 146 31
## def 10 24
##
## Accuracy : 0.8057
## 95% CI : (0.7458, 0.8568)
## No Information Rate : 0.7393
## P-Value [Acc > NIR] : 0.015084
##
## Kappa : 0.4248
##
## Mcnemar's Test P-Value : 0.001787
##
## Sensitivity : 0.4364
## Specificity : 0.9359
## Pos Pred Value : 0.7059
## Neg Pred Value : 0.8249
## Prevalence : 0.2607
## Detection Rate : 0.1137
## Detection Prevalence : 0.1611
## Balanced Accuracy : 0.6861
##
## 'Positive' Class : def
##
confusionMatrix(pred.cart2, banktest$default, positive = "def")
## Confusion Matrix and Statistics
##
## Reference
## Prediction non def
## non 132 29
## def 24 26
##
## Accuracy : 0.7488
## 95% CI : (0.6847, 0.8058)
## No Information Rate : 0.7393
## P-Value [Acc > NIR] : 0.4117
##
## Kappa : 0.3285
##
## Mcnemar's Test P-Value : 0.5827
##
## Sensitivity : 0.4727
## Specificity : 0.8462
## Pos Pred Value : 0.5200
## Neg Pred Value : 0.8199
## Prevalence : 0.2607
## Detection Rate : 0.1232
## Detection Prevalence : 0.2370
## Balanced Accuracy : 0.6594
##
## 'Positive' Class : def
##
confusionMatrix(pred.chaid1, banktest1$default, positive = "def")
## Confusion Matrix and Statistics
##
## Reference
## Prediction non def
## non 119 22
## def 37 33
##
## Accuracy : 0.7204
## 95% CI : (0.6546, 0.7798)
## No Information Rate : 0.7393
## P-Value [Acc > NIR] : 0.76173
##
## Kappa : 0.3334
##
## Mcnemar's Test P-Value : 0.06836
##
## Sensitivity : 0.6000
## Specificity : 0.7628
## Pos Pred Value : 0.4714
## Neg Pred Value : 0.8440
## Prevalence : 0.2607
## Detection Rate : 0.1564
## Detection Prevalence : 0.3318
## Balanced Accuracy : 0.6814
##
## 'Positive' Class : def
##
HyperParameter
<- rpart(default~.,data=banktrain,
model.cart3 control=rpart.control(maxdepth=5))
rpart.plot(model.cart3)
<- rpart(default~.,data=banktrain,
model.cart4 control=rpart.control(minsplit=40))
rpart.plot(model.cart4)
<- chaid(default~.,data=banktrain1,
model.chaid2 control=chaid_control(alpha4=0.15))
plot(model.chaid2)
<- predict(model.cart3,banktest,type="class")
pred.cart3 <- predict(model.cart4,banktest,type="class")
pred.cart4 <- predict(model.chaid2,banktest1)
pred.chaid2
<- function(pred,data){
perform <- confusionMatrix(pred, data$default, positive = "def")
tabel <- c(tabel$overall[1],tabel$byClass[1:2])
result return(result)
}
perform(pred.cart3,banktest)
## Accuracy Sensitivity Specificity
## 0.7393365 0.4727273 0.8333333
perform(pred.cart4,banktest)
## Accuracy Sensitivity Specificity
## 0.7156398 0.6363636 0.7435897
perform(pred.chaid2,banktest1)
## Accuracy Sensitivity Specificity
## 0.7440758 0.5636364 0.8076923
Badan Informasi Geospasial, abdul.aziz@big.go.id↩︎