Decision Tree Example

Example 8.5 (Data Mining with R)

library(party)  # provides ctree() for conditional inference trees
Data <- iris    # built-in iris data set (150 observations, 5 columns)

Split Data (Training & Testing)

# Note: sample() is random, so call set.seed() first if a reproducible split is needed
Sample <- sample(1:150, 50)   # draw 50 row indices for testing
testing  <- Data[Sample, ]
learning <- Data[-Sample, ]   # remaining 100 rows for training

Modelling

output.tree <- ctree(Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, data = learning)  # equivalently: Species ~ .
output.tree
## 
##   Conditional inference tree with 4 terminal nodes
## 
## Response:  Species 
## Inputs:  Sepal.Length, Sepal.Width, Petal.Length, Petal.Width 
## Number of observations:  100 
## 
## 1) Petal.Length <= 1.9; criterion = 1, statistic = 90.877
##   2)*  weights = 26 
## 1) Petal.Length > 1.9
##   3) Petal.Width <= 1.7; criterion = 1, statistic = 48.344
##     4) Petal.Length <= 4.8; criterion = 0.991, statistic = 9.364
##       5)*  weights = 33 
##     4) Petal.Length > 4.8
##       6)*  weights = 7 
##   3) Petal.Width > 1.7
##     7)*  weights = 34
plot(output.tree)
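
As an extra check (not in the source), the tree's fit on the training data itself can be inspected by comparing its fitted labels with the truth:

table(predict(output.tree), learning$Species)  # training-set confusion matrix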

Model Evaluation

Prediksi <- predict(output.tree, testing[,-5])  # drop the true label (column 5)
CM <- table(testing[,5], Prediksi)              # rows = actual, columns = predicted
CM
##             Prediksi
##              setosa versicolor virginica
##   setosa         24          0         0
##   versicolor      0         13         0
##   virginica       0          1        12
accuracy <- (sum(diag(CM)))/sum(CM)
accuracy
## [1] 0.98

The model evaluation above shows that the decision tree reaches an accuracy of 0.98 on the test set (49 of 50 observations classified correctly), so the decision tree model is a good fit for this data.
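
Beyond accuracy, per-class precision and recall can be read off the same confusion matrix. A minimal sketch (an addition to the source; recall that CM has actual classes in rows and predictions in columns):

precision <- diag(CM) / colSums(CM)  # correct / all predicted per class
recall    <- diag(CM) / rowSums(CM)  # correct / all actual per class
round(rbind(precision, recall), 3)

In the run above, for example, versicolor precision is 13/14 and virginica recall is 12/13, isolating the single misclassified virginica observation.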

Naive Bayes Example

Example 8.8 (Data Mining with R)

Package

library('e1071')
## Warning: package 'e1071' was built under R version 4.0.5

Input the Training Data

data.training = as.data.frame(rbind(
  c("Sunny",    "Hot",  "High",   "False", "No"),
  c("Sunny",    "Hot",  "High",   "True",  "No"),
  c("Overcast", "Hot",  "High",   "False", "Yes"),
  c("Rainy",    "Mild", "High",   "False", "Yes"),
  c("Rainy",    "Cool", "Normal", "False", "Yes"),
  c("Rainy",    "Cool", "Normal", "True",  "No"),
  c("Overcast", "Cool", "Normal", "True",  "Yes"),
  c("Sunny",    "Mild", "High",   "False", "No"),
  c("Sunny",    "Cool", "Normal", "False", "Yes"),
  c("Rainy",    "Mild", "Normal", "False", "Yes"),
  c("Sunny",    "Mild", "Normal", "True",  "Yes"),
  c("Overcast", "Mild", "High",   "True",  "Yes"),
  c("Overcast", "Hot",  "Normal", "False", "Yes"),
  c("Rainy",    "Mild", "High",   "True",  "No")))

Rename the Columns

names(data.training) <- c("OUTLOOK", "TEMP", "HUMIDITY", "WINDY", "PLAY")

Input the Test Data

data.test = as.data.frame(cbind("Sunny", "Cool", "High", "True"))  # one new observation
names(data.test) <- c("OUTLOOK", "TEMP", "HUMIDITY", "WINDY")

Modelling

model <- naiveBayes(PLAY ~ ., data = data.training)
print(model)
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##        No       Yes 
## 0.3571429 0.6428571 
## 
## Conditional probabilities:
##      OUTLOOK
## Y      Overcast     Rainy     Sunny
##   No  0.0000000 0.4000000 0.6000000
##   Yes 0.4444444 0.3333333 0.2222222
## 
##      TEMP
## Y          Cool       Hot      Mild
##   No  0.2000000 0.4000000 0.4000000
##   Yes 0.3333333 0.2222222 0.4444444
## 
##      HUMIDITY
## Y          High    Normal
##   No  0.8000000 0.2000000
##   Yes 0.3333333 0.6666667
## 
##      WINDY
## Y         False      True
##   No  0.4000000 0.6000000
##   Yes 0.6666667 0.3333333
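
Note that P(Overcast | No) = 0 in the conditional table above; a single zero likelihood forces the entire product for that class to zero for any observation with OUTLOOK = Overcast. naiveBayes() provides a laplace argument for additive smoothing; a hedged sketch of add-one smoothing:

model.smooth <- naiveBayes(PLAY ~ ., data = data.training, laplace = 1)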

Predict the Test Data

predict_result <- predict(model, data.test)
print(predict_result)
## [1] No
## Levels: No Yes

The prediction classifies the test observation as PLAY = 'No'.
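
As a small extension, e1071's predict() for naiveBayes also accepts type = "raw", which returns the class posterior probabilities instead of the predicted label:

predict(model, data.test, type = "raw")  # P(No) and P(Yes) for the new observation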

Manual Decision Tree

Example 8.8

The input data are the same as in the previous section.

data.training

Convert the Data Types

The character columns are converted to factors to simplify the calculations.

library(dplyr)
data.training = data.training %>% mutate_if(is.character, as.factor)
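
mutate_if() is superseded in newer dplyr releases; an equivalent form using across() (shown only as an alternative) is:

data.training <- data.training %>% mutate(across(where(is.character), as.factor))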

Entropy Function for an Attribute

Defined as a function to avoid repeating the same code.

# Binary entropy in bits; returns NaN when x or y is 0 (handled later)
entropy <- function(x, y){
  result = -(x*log(x, base = 2) + y*log(y, base = 2))
  return(result)
}
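
As an optional generalization (not in the source), the same idea extends to any vector of class proportions, treating 0 * log2(0) as 0:

entropy.vec <- function(p){
  p <- p[p > 0]        # drop zero proportions so they contribute 0
  -sum(p * log2(p))
}
entropy.vec(c(9/14, 5/14))  # same value as entropy(9/14, 5/14)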

System Entropy (Info(D))

Entropy.Sys = entropy(
  length(data.training$PLAY[data.training$PLAY == 'Yes']) / length(data.training$PLAY),
  length(data.training$PLAY[data.training$PLAY == 'No'])  / length(data.training$PLAY))
Entropy.Sys
## [1] 0.940286

The value above is the entropy of the whole system.
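
Written out, the calculation is Info(D) = -(9/14)*log2(9/14) - (5/14)*log2(5/14), since the 14 training observations contain 9 Yes and 5 No:

-(9/14)*log2(9/14) - (5/14)*log2(5/14)  # 0.940286, matching Entropy.Sys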

Attribute Entropy

Entropy.Var = data.frame(matrix(ncol = 4, nrow = 0))
for (i in names(data.training[, names(data.training) != 'PLAY'])){
  for (j in levels(data.training[, i])){
    # class frequencies of PLAY within level j of attribute i
    Freq = as.table(summary(data.training$PLAY[data.training[, i] == j]))
    a = Freq['Yes'] / sum(Freq)
    b = Freq['No'] / sum(Freq)
    Hasil = entropy(a, b)
    Hasil = ifelse(is.nan(Hasil), 0, Hasil)   # treat 0*log2(0) as 0

    cat('Attribute =', i, ' Level =', j, '\n')
    cat('P(Yes) =', Freq['Yes'], '/', sum(Freq), '\n')
    cat('P(No)  =', Freq['No'], '/', sum(Freq), '\n')
    cat('Entropy =', Hasil, '\n\n')

    new_data = c(i, j, Freq['Yes'] + Freq['No'], Hasil)
    Entropy.Var = rbind(Entropy.Var, new_data)
  }
}
## Attribute = OUTLOOK  Level = Overcast 
## P(Yes) = 4 / 4 
## P(No)  = 0 / 4 
## Entropy = 0 
## 
## Attribute = OUTLOOK  Level = Rainy 
## P(Yes) = 3 / 5 
## P(No)  = 2 / 5 
## Entropy = 0.9709506 
## 
## Attribute = OUTLOOK  Level = Sunny 
## P(Yes) = 2 / 5 
## P(No)  = 3 / 5 
## Entropy = 0.9709506 
## 
## Attribute = TEMP  Level = Cool 
## P(Yes) = 3 / 4 
## P(No)  = 1 / 4 
## Entropy = 0.8112781 
## 
## Attribute = TEMP  Level = Hot 
## P(Yes) = 2 / 4 
## P(No)  = 2 / 4 
## Entropy = 1 
## 
## Attribute = TEMP  Level = Mild 
## P(Yes) = 4 / 6 
## P(No)  = 2 / 6 
## Entropy = 0.9182958 
## 
## Attribute = HUMIDITY  Level = High 
## P(Yes) = 3 / 7 
## P(No)  = 4 / 7 
## Entropy = 0.9852281 
## 
## Attribute = HUMIDITY  Level = Normal 
## P(Yes) = 6 / 7 
## P(No)  = 1 / 7 
## Entropy = 0.5916728 
## 
## Attribute = WINDY  Level = False 
## P(Yes) = 6 / 8 
## P(No)  = 2 / 8 
## Entropy = 0.8112781 
## 
## Attribute = WINDY  Level = True 
## P(Yes) = 3 / 6 
## P(No)  = 3 / 6 
## Entropy = 1 
## 
colnames(Entropy.Var) <- c("Att", "Levels","Dj","Entropy")
Entropy.Var$Dj = as.numeric(Entropy.Var$Dj)
Entropy.Var$Entropy = as.numeric(Entropy.Var$Entropy)

Computing the Attribute Gain

# Gain(A) = Info(D) - sum over levels j of (|Dj|/|D|) * Entropy(Dj)
Gain = data.frame(matrix(ncol = 2, nrow = 0))
for (i in names(data.training[, names(data.training) != 'PLAY'])){
  temp = 0
  for (j in levels(data.training[, i])){
    Dj = Entropy.Var$Dj[(Entropy.Var$Att == i) & (Entropy.Var$Levels == j)]
    D = length(data.training[, i])
    En = Entropy.Var$Entropy[(Entropy.Var$Att == i) & (Entropy.Var$Levels == j)]
    temp = temp + (Dj/D * En)   # accumulate the weighted entropy Info_A(D)
  }
  new_data = c(i, temp)
  Gain = rbind(Gain, new_data)
}
colnames(Gain) <- c("Att", "Entropy")
Gain$Entropy = as.numeric(Gain$Entropy)
Gain$Gain = Entropy.Sys - Gain$Entropy
print(Gain[,c('Att', 'Gain')])
##        Att       Gain
## 1  OUTLOOK 0.24674982
## 2     TEMP 0.02922257
## 3 HUMIDITY 0.15183550
## 4    WINDY 0.04812703

From the gains above, the attributes ordered from largest to smallest gain are OUTLOOK > HUMIDITY > WINDY > TEMP, so OUTLOOK, having the highest gain, would be chosen as the root split of the tree.
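
As a hand check (an addition to the source), Gain(OUTLOOK) can be reproduced from the level entropies listed above:

# Info_OUTLOOK(D): Overcast 4/14 * 0, Rainy 5/14 * 0.9709506, Sunny 5/14 * 0.9709506
Entropy.Sys - (4/14*0 + 5/14*0.9709506 + 5/14*0.9709506)  # 0.2467498, matching the table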

Manual Naive Bayes

Example 8.8

The input data are the same as in the previous section.

data.training

Cross Table

Cross tables are built to simplify the probability calculations.

OUTLOOK.CT  = table(data.training$OUTLOOK,  data.training$PLAY)
TEMP.CT     = table(data.training$TEMP,     data.training$PLAY)
HUMIDITY.CT = table(data.training$HUMIDITY, data.training$PLAY)
WINDY.CT    = table(data.training$WINDY,    data.training$PLAY)
PLAY.CT     = table(data.training$PLAY)

CT = rbind(OUTLOOK.CT, TEMP.CT, HUMIDITY.CT, WINDY.CT)
CT
##          No Yes
## Overcast  0   4
## Rainy     2   3
## Sunny     3   2
## Cool      1   3
## Hot       2   2
## Mild      2   4
## High      4   3
## Normal    1   6
## False     2   6
## True      3   3

Compute the Prior Probabilities

# P(class) = count of class / total number of observations
prior.prob = as.table(matrix(ncol = length(names(PLAY.CT)), nrow = 1))
colnames(prior.prob) = c(names(PLAY.CT))

for (i in colnames(prior.prob)){
  prior.prob[i] = PLAY.CT[i] / sum(PLAY.CT)
}

prior.prob
##                            No       Yes 
##                     0.3571429 0.6428571

The values above are the prior probabilities P(No) and P(Yes).
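
The same prior probabilities can be obtained in one line with base R's prop.table():

prop.table(PLAY.CT)  # divides each class count by the total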

Compute the Conditional Probabilities

# P(level | class) = count(level, class) / count(class)
cond.prob = as.table(matrix(ncol = length(names(PLAY.CT)), nrow = nrow(CT)))
colnames(cond.prob) = c(names(PLAY.CT))
row.names(cond.prob) = c(row.names(CT))

for (i in row.names(cond.prob)){
  for (j in colnames(cond.prob)){
    cond.prob[i,j] = CT[i,j] / PLAY.CT[j]
  }
}

cond.prob
##                 No       Yes
## Overcast 0.0000000 0.4444444
## Rainy    0.4000000 0.3333333
## Sunny    0.6000000 0.2222222
## Cool     0.2000000 0.3333333
## Hot      0.4000000 0.2222222
## Mild     0.4000000 0.4444444
## High     0.8000000 0.3333333
## Normal   0.2000000 0.6666667
## False    0.4000000 0.6666667
## True     0.6000000 0.3333333

The values above are the class-conditional probabilities P(level | class).
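
Equivalently, as a one-line alternative, each column of CT can be divided by its class count with base R's sweep():

sweep(CT, 2, PLAY.CT, "/")  # same table of P(level | class)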

Suppose the attributes of a new observation are: OUTLOOK = Sunny, TEMP = Cool, HUMIDITY = High, WINDY = True.

Compute the Posterior Probabilities

X = c("Sunny", "Cool", "High", "True")

post.prob = as.table(matrix(ncol = length(names(PLAY.CT)), nrow = 1))
colnames(post.prob) = c(names(PLAY.CT))

for(i in colnames(post.prob)){
  Temp = 1
  for(j in X){
    Temp = Temp * cond.prob[j, i]   # multiply the likelihoods P(x_j | class)
  }
  post.prob[i] = Temp * prior.prob[i]   # times the prior P(class)
}
post.prob
##                                  No         Yes 
##                         0.020571429 0.005291005

The unnormalized posterior score for PLAY = No is 0.02057 and for PLAY = Yes is 0.00529; these are proportional to the true posterior probabilities, which are obtained by normalizing below.
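
As a hand check (an addition to the source), the PLAY = No score is the prior times the four likelihoods read from cond.prob:

# P(No) * P(Sunny|No) * P(Cool|No) * P(High|No) * P(True|No)
(5/14) * (3/5) * (1/5) * (4/5) * (3/5)  # 0.02057143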

Prediction

prob_yes = post.prob["Yes"] / (post.prob["Yes"] + post.prob["No"])
prob_no  = post.prob["No"]  / (post.prob["Yes"] + post.prob["No"])

cat('The prediction is PLAY =', ifelse(prob_yes >= prob_no, "'Yes'", "'No'"), 'with probability', ifelse(prob_yes >= prob_no, prob_yes, prob_no))
## The prediction is PLAY = 'No' with probability 0.7954173

This normalized result agrees with the PLAY = 'No' label produced by e1071's naiveBayes earlier.