Decision Tree Example
Example 8.5 (Data Mining dengan R)
library(party)
Data <- iris
Split Data (Training & Testing)
Sample <- sample(1:150, 50)
testing <- Data[Sample,]
learning <- Data[-Sample,]
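Note that sample() draws a different random split on every run, so the exact counts and accuracy reported below will vary; for a reproducible split, a seed can be fixed before sampling (any fixed value works; 123 here is an arbitrary choice):
set.seed(123)  # arbitrary seed, for reproducibility only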
Modelling
output.tree <- ctree(Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, data = learning)
output.tree
##
## Conditional inference tree with 4 terminal nodes
##
## Response: Species
## Inputs: Sepal.Length, Sepal.Width, Petal.Length, Petal.Width
## Number of observations: 100
##
## 1) Petal.Length <= 1.9; criterion = 1, statistic = 90.877
##   2)*  weights = 26
## 1) Petal.Length > 1.9
##   3) Petal.Width <= 1.7; criterion = 1, statistic = 48.344
##     4) Petal.Length <= 4.8; criterion = 0.991, statistic = 9.364
##       5)*  weights = 33
##     4) Petal.Length > 4.8
##       6)*  weights = 7
##   3) Petal.Width > 1.7
##     7)*  weights = 34
plot(output.tree)

Model Evaluation
Prediksi <- predict(output.tree, testing[,-5])
CM <- table(testing[,5], Prediksi)
CM
##             Prediksi
##              setosa versicolor virginica
##   setosa         24          0         0
##   versicolor      0         13         0
##   virginica       0          1        12
accuracy <- (sum(diag(CM)))/sum(CM)
accuracy
## [1] 0.98
From the model evaluation above, the Decision Tree model attains a high accuracy (0.98 on the test set), so the model is appropriate for this data.
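Overall accuracy can be complemented with per-class measures read off the same confusion matrix; a minimal sketch (rows of CM hold the true classes, so recall is the diagonal over the row sums):
recall <- diag(CM) / rowSums(CM)  # per-class recall from the confusion matrix
recall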
Naive Bayes Example
Example 8.8 (Data Mining dengan R)
Package
library('e1071')
## Warning: package 'e1071' was built under R version 4.0.5
Input the training data
data.training = as.data.frame(rbind(
  c("Sunny",    "Hot",  "High",   "False", "No"),
  c("Sunny",    "Hot",  "High",   "True",  "No"),
  c("Overcast", "Hot",  "High",   "False", "Yes"),
  c("Rainy",    "Mild", "High",   "False", "Yes"),
  c("Rainy",    "Cool", "Normal", "False", "Yes"),
  c("Rainy",    "Cool", "Normal", "True",  "No"),
  c("Overcast", "Cool", "Normal", "True",  "Yes"),
  c("Sunny",    "Mild", "High",   "False", "No"),
  c("Sunny",    "Cool", "Normal", "False", "Yes"),
  c("Rainy",    "Mild", "Normal", "False", "Yes"),
  c("Sunny",    "Mild", "Normal", "True",  "Yes"),
  c("Overcast", "Mild", "High",   "True",  "Yes"),
  c("Overcast", "Hot",  "Normal", "False", "Yes"),
  c("Rainy",    "Mild", "High",   "True",  "No")))
Rename the columns
names(data.training) <- c("OUTLOOK", "TEMP", "HUMIDITY", "WINDY", "PLAY")
Modelling
model <- naiveBayes(PLAY ~ ., data = data.training)
print(model)
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
##        No       Yes 
## 0.3571429 0.6428571 
##
## Conditional probabilities:
##      OUTLOOK
## Y      Overcast     Rainy     Sunny
##   No  0.0000000 0.4000000 0.6000000
##   Yes 0.4444444 0.3333333 0.2222222
##
##      TEMP
## Y          Cool       Hot      Mild
##   No  0.2000000 0.4000000 0.4000000
##   Yes 0.3333333 0.2222222 0.4444444
##
##      HUMIDITY
## Y          High    Normal
##   No  0.8000000 0.2000000
##   Yes 0.3333333 0.6666667
##
##      WINDY
## Y         False      True
##   No  0.4000000 0.6000000
##   Yes 0.6666667 0.3333333
Predict the Test Data
First construct the new observation to classify (the same one used in the manual Naive Bayes calculation below: Sunny, Cool, High, True):
data.test <- data.frame(OUTLOOK = "Sunny", TEMP = "Cool",
                        HUMIDITY = "High", WINDY = "True")
predict_result <- predict(model, data.test)
print(predict_result)
## [1] No
## Levels: No Yes
Based on this prediction, the test observation is classified as PLAY = 'No'.
Manual Decision Tree Calculation
Example 8.8
Change the Data Type
The data are converted to factors to simplify the calculations
library(dplyr)
data.training = data.training %>% mutate_if(is.character, as.factor)
Attribute Entropy Function
Defined once so the same calculation is not repeated for every attribute
entropy <- function(x, y){
  # two-class entropy of proportions x and y (x + y = 1)
  result = -(x*log(x, base = 2) + y*log(y, base = 2))
  return(result)
}
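The helper computes the two-class entropy used throughout this example:

$$H(p, q) = -\left(p \log_2 p + q \log_2 q\right), \qquad q = 1 - p$$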
System Entropy (Info(D))
Entropy.Sys = entropy(sum(data.training$PLAY == 'Yes') / nrow(data.training),
                      sum(data.training$PLAY == 'No')  / nrow(data.training))
Entropy.Sys
## [1] 0.940286
The value above is the entropy of the whole system, Info(D).
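As a hand check, with 9 'Yes' and 5 'No' among the 14 training observations:

$$\mathrm{Info}(D) = -\tfrac{9}{14}\log_2\tfrac{9}{14} - \tfrac{5}{14}\log_2\tfrac{5}{14} \approx 0.940$$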
Attribute Entropy
Entropy.Var = data.frame(matrix(ncol = 4, nrow = 0))
for (i in names(data.training[, names(data.training) != 'PLAY'])){
  for (j in levels(data.training[, i])){
    # class frequencies of PLAY within the subset where attribute i == level j
    Freq = as.table(summary(data.training$PLAY[data.training[, i] == j]))
    a = Freq['Yes'] / sum(Freq)
    b = Freq['No'] / sum(Freq)
    Hasil = entropy(a, b)
    Hasil = ifelse(is.nan(Hasil), 0, Hasil)  # a pure subset has entropy 0
    cat('Attribute =', i, ' Level =', j, '\n')
    cat('P(Yes) =', Freq['Yes'], '/', sum(Freq), '\n')
    cat('P(No) =', Freq['No'], '/', sum(Freq), '\n')
    cat('Entropy =', Hasil, '\n\n')
    new_data = c(i, j, Freq['Yes'] + Freq['No'], Hasil)
    Entropy.Var = rbind(Entropy.Var, new_data)
  }
}
## Attribute = OUTLOOK  Level = Overcast 
## P(Yes) = 4 / 4 
## P(No) = 0 / 4 
## Entropy = 0 
## 
## Attribute = OUTLOOK  Level = Rainy 
## P(Yes) = 3 / 5 
## P(No) = 2 / 5 
## Entropy = 0.9709506 
## 
## Attribute = OUTLOOK  Level = Sunny 
## P(Yes) = 2 / 5 
## P(No) = 3 / 5 
## Entropy = 0.9709506 
## 
## Attribute = TEMP  Level = Cool 
## P(Yes) = 3 / 4 
## P(No) = 1 / 4 
## Entropy = 0.8112781 
## 
## Attribute = TEMP  Level = Hot 
## P(Yes) = 2 / 4 
## P(No) = 2 / 4 
## Entropy = 1 
## 
## Attribute = TEMP  Level = Mild 
## P(Yes) = 4 / 6 
## P(No) = 2 / 6 
## Entropy = 0.9182958 
## 
## Attribute = HUMIDITY  Level = High 
## P(Yes) = 3 / 7 
## P(No) = 4 / 7 
## Entropy = 0.9852281 
## 
## Attribute = HUMIDITY  Level = Normal 
## P(Yes) = 6 / 7 
## P(No) = 1 / 7 
## Entropy = 0.5916728 
## 
## Attribute = WINDY  Level = False 
## P(Yes) = 6 / 8 
## P(No) = 2 / 8 
## Entropy = 0.8112781 
## 
## Attribute = WINDY  Level = True 
## P(Yes) = 3 / 6 
## P(No) = 3 / 6 
## Entropy = 1 
##
colnames(Entropy.Var) <- c("Att", "Levels","Dj","Entropy")
Entropy.Var$Dj = as.numeric(Entropy.Var$Dj)
Entropy.Var$Entropy = as.numeric(Entropy.Var$Entropy)
Computing the Attribute Gains
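The loop below computes, for each attribute A with levels j = 1, …, v, the standard information gain:

$$\mathrm{Gain}(A) = \mathrm{Info}(D) - \sum_{j=1}^{v} \frac{|D_j|}{|D|}\,\mathrm{Info}(D_j)$$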
Gain = data.frame(matrix(ncol = 2, nrow = 0))
for (i in names(data.training[, names(data.training) != 'PLAY'])){
  temp = 0
  for (j in levels(data.training[, i])){
    Dj = Entropy.Var$Dj[(Entropy.Var$Att == i) & (Entropy.Var$Levels == j)]
    D = length(data.training[, i])  # |D| = 14 observations
    En = Entropy.Var$Entropy[(Entropy.Var$Att == i) & (Entropy.Var$Levels == j)]
    temp = temp + (Dj / D * En)     # accumulate |Dj|/|D| * Info(Dj)
  }
  new_data = c(i, temp)
  Gain = rbind(Gain, new_data)
}
colnames(Gain) <- c("Att", "Entropy")
Gain$Entropy = as.numeric(Gain$Entropy)
Gain$Gain = Entropy.Sys - Gain$Entropy
print(Gain[,c('Att', 'Gain')])
##        Att       Gain
## 1  OUTLOOK 0.24674982
## 2     TEMP 0.02922257
## 3 HUMIDITY 0.15183550
## 4    WINDY 0.04812703
From the gains above, the ordering from largest to smallest is OUTLOOK > HUMIDITY > WINDY > TEMP, so OUTLOOK, the attribute with the largest gain, would be chosen as the root split of the tree.
Manual Naive Bayes Calculation
Example 8.8
Cross Tables
Cross tables of each attribute against the class are built to simplify the calculations
OUTLOOK.CT = table(data.training$OUTLOOK, data.training$PLAY)
TEMP.CT = table(data.training$TEMP, data.training$PLAY)
HUMIDITY.CT = table(data.training$HUMIDITY, data.training$PLAY)
WINDY.CT = table(data.training$WINDY, data.training$PLAY)
PLAY.CT = table(data.training$PLAY)
CT = rbind(OUTLOOK.CT, TEMP.CT, HUMIDITY.CT, WINDY.CT)
CT
##          No Yes
## Overcast  0   4
## Rainy     2   3
## Sunny     3   2
## Cool      1   3
## Hot       2   2
## Mild      2   4
## High      4   3
## Normal    1   6
## False     2   6
## True      3   3
Compute the Prior Probabilities
prior.prob = setNames(numeric(length(PLAY.CT)), names(PLAY.CT))
for (i in names(prior.prob)){
  prior.prob[i] = PLAY.CT[i] / sum(PLAY.CT)  # P(class) = class count / total
}
prior.prob
##        No       Yes 
## 0.3571429 0.6428571
The values above are the prior probabilities P(PLAY = No) and P(PLAY = Yes).
Compute the Conditional Probabilities
cond.prob = as.table(matrix(ncol = length(names(PLAY.CT)), nrow = nrow(CT)))
colnames(cond.prob) = names(PLAY.CT)
row.names(cond.prob) = row.names(CT)
for (i in row.names(cond.prob)){
  for (j in colnames(cond.prob)){
    cond.prob[i, j] = CT[i, j] / PLAY.CT[j]  # P(level i | class j)
  }
}
cond.prob
##                 No       Yes
## Overcast 0.0000000 0.4444444
## Rainy    0.4000000 0.3333333
## Sunny    0.6000000 0.2222222
## Cool     0.2000000 0.3333333
## Hot      0.4000000 0.2222222
## Mild     0.4000000 0.4444444
## High     0.8000000 0.3333333
## Normal   0.2000000 0.6666667
## False    0.4000000 0.6666667
## True     0.6000000 0.3333333
The values above are the class-conditional probabilities P(attribute level | class).
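Note the zero probability P(Overcast | No): a new observation with OUTLOOK = Overcast would make the entire product for class 'No' exactly zero. A common remedy is add-one (Laplace) smoothing, which e1071 supports through the laplace argument; a minimal sketch, with model.laplace a new name introduced here only for illustration:
model.laplace <- naiveBayes(PLAY ~ ., data = data.training, laplace = 1)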
Suppose the attributes of a new observation are as follows: Outlook = Sunny, Temp = Cool, Humidity = High, Windy = True.
Compute the Posterior Probabilities
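Under the naive conditional-independence assumption, the score computed below for each class C is the prior times the product of the class-conditional probabilities of the observed attribute values:

$$P(C \mid X) \;\propto\; P(C)\,\prod_{j=1}^{4} P(x_j \mid C)$$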
X = c("Sunny", "Cool", "High", "True")
post.prob = setNames(numeric(length(PLAY.CT)), names(PLAY.CT))
for (i in names(post.prob)){
  Temp = 1
  for (j in X){
    Temp = Temp * cond.prob[j, i]  # multiply the conditional probabilities
  }
  post.prob[i] = Temp * prior.prob[i]
}
post.prob
##          No         Yes 
## 0.020571429 0.005291005
The (unnormalized) posterior score for PLAY = No is 0.02057 and for PLAY = Yes it is 0.00529.
Prediction
prob_yes = post.prob["Yes"] / (post.prob["Yes"] + post.prob["No"])
prob_no  = post.prob["No"]  / (post.prob["Yes"] + post.prob["No"])
cat("The prediction is PLAY =", ifelse(prob_yes >= prob_no, "'Yes'", "'No'"),
    "with probability", ifelse(prob_yes >= prob_no, prob_yes, prob_no))
## The prediction is PLAY = 'No' with probability 0.7954173
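As a cross-check, the e1071 model fitted earlier can report the normalized posterior directly with type = "raw" in predict(); this should reproduce the 0.795 probability for 'No' obtained by hand (assuming data.test as defined above):
predict(model, data.test, type = "raw")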