Classification trees

Przykładowy zbiór danych

Metadata

http://archive.ics.uci.edu/ml/datasets/statlog+(heart)

Attribute Information:

– 1. age

– 2. sex

– 3. chest pain type (4 values)

– 4. resting blood pressure

– 5. serum cholesterol in mg/dl

– 6. fasting blood sugar > 120 mg/dl

– 7. resting electrocardiographic results (values 0,1,2)

– 8. maximum heart rate achieved

– 9. exercise induced angina

– 10. oldpeak = ST depression induced by exercise relative to rest

– 11. the slope of the peak exercise ST segment

– 12. number of major vessels (0-3) colored by fluoroscopy

– 13. thal: 3 = normal; 6 = fixed defect; 7 = reversible defect

Attributes types

Real: 1,4,5,8,10,12

Ordered: 11

Binary: 2,6,9

Nominal:7,3,13

Variable to be predicted

Absence (1) or presence (2) of heart disease

Import data

library(ggplot2)


# Load the Statlog (Heart) data set from a space-separated text file.
# header = FALSE is required: the raw file is expected to contain data rows
# only, and read.csv() defaults to header = TRUE, which would silently consume
# the first observation as column names (one record lost).
heart <- read.csv("heart.dat", sep = " ", header = FALSE)

# Column names follow the attribute order listed in the data set description;
# the 14th column is the class label (absence / presence of heart disease).
colnames(heart) <- c(
  "age",
  "sex",
  "chest_pain_type",
  "blood_pressure",
  "cholestor",
  "sugar",
  "resting_electrocardiographic_results",
  "heart_rate",
  "angina",
  "oldpeak",
  "slope",
  "vessles",
  "thal",
  "disease"
)


head(heart)

##   age sex chest_pain_type blood_pressure cholestor sugar
## 1  67   0               3            115       564     0
## 2  57   1               2            124       261     0
## 3  64   1               4            128       263     0
## 4  74   0               2            120       269     0
## 5  65   1               4            120       177     0
## 6  56   1               3            130       256     1
##   resting_electrocardiographic_results heart_rate angina oldpeak slope
## 1                                    2        160      0     1.6     2
## 2                                    0        141      0     0.3     1
## 3                                    0        105      1     0.2     2
## 4                                    2        121      1     0.2     1
## 5                                    0        140      0     0.4     1
## 6                                    2        142      1     0.6     2
##   vessles thal disease
## 1       0    7       1
## 2       0    7       2
## 3       1    7       1
## 4       1    3       1
## 5       0    7       1
## 6       1    6       2

# Names of the binary (columns 2, 6, 9) and nominal (columns 7, 3, 13)
# attributes.
bin_cols <- names(heart)[c(2, 6, 9)]
nom_cols <- names(heart)[c(7, 3, 13)]

# Categorical predictors are stored as factors so that the tree learners
# treat them as unordered categories rather than numbers.
heart[c(bin_cols, nom_cols)] <- lapply(heart[c(bin_cols, nom_cols)], factor)

# The class label is a factor as well, which makes the models classifiers.
target_col <- "disease"
heart[[target_col]] <- factor(heart[[target_col]])

# Shuffle the rows at random before cutting the data into partitions.
heart.shuffled <- heart[sample(nrow(heart)), ]

# Row indices that delimit the partitions (the rows are already shuffled):
# first 60% -> train, next 20% -> validation, remaining 20% -> test.
idx1 <- round(0.6 * nrow(heart.shuffled))
idx2 <- round(0.8 * nrow(heart.shuffled))

train <- heart.shuffled[1:idx1, ]
validation <- heart.shuffled[(idx1 + 1):idx2, ]
test <- heart.shuffled[(idx2 + 1):nrow(heart.shuffled), ]

# Model formula: disease explained by the binary and nominal attributes only.
predictors_string <- paste(c(bin_cols, nom_cols), collapse = "+")
formula <- as.formula(paste("disease~", predictors_string))

# Prior probabilities

# Number of observations in the training partition.
print(nrow(train))

## [1] 161

# Class shares of `disease`, computed on the full data set (not on `train`).
t <- table(heart$disease)
print(prop.table(t))

## 
##         1         2 
## 0.5576208 0.4423792

Budowa drzewa klasyfikującego

Biblioteka tree

Budowa drzewa

library(tree)

# Grow a classification tree on the training partition and report the
# variables used, the number of leaves and the training error.
heart.tree <- tree(formula = formula, data = train)
print(summary(heart.tree))

## 
## Classification tree:
## tree(formula = formula, data = train)
## Variables actually used in tree construction:
## [1] "chest_pain_type"                     
## [2] "thal"                                
## [3] "sex"                                 
## [4] "angina"                              
## [5] "resting_electrocardiographic_results"
## Number of terminal nodes:  9 
## Residual mean deviance:  0.6756 = 102.7 / 152 
## Misclassification error rate: 0.1739 = 28 / 161

# Text listing of every node: split, n, deviance, predicted class, class
# probabilities.
print(heart.tree)

## node), split, n, deviance, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 161 221.80 1 ( 0.54658 0.45342 )  
##    2) chest_pain_type: 1,2,3 83  78.43 1 ( 0.81928 0.18072 )  
##      4) thal: 3 59  23.72 1 ( 0.94915 0.05085 )  
##        8) sex: 0 25   0.00 1 ( 1.00000 0.00000 ) *
##        9) sex: 1 34  20.29 1 ( 0.91176 0.08824 )  
##         18) chest_pain_type: 2 15   0.00 1 ( 1.00000 0.00000 ) *
##         19) chest_pain_type: 1,3 19  16.57 1 ( 0.84211 0.15789 ) *
##      5) thal: 6,7 24  33.27 2 ( 0.50000 0.50000 ) *
##    3) chest_pain_type: 4 78  88.81 2 ( 0.25641 0.74359 )  
##      6) thal: 3,6 36  49.80 2 ( 0.47222 0.52778 )  
##       12) angina: 0 20  25.90 1 ( 0.65000 0.35000 )  
##         24) resting_electrocardiographic_results: 0 7   0.00 1 ( 1.00000 0.00000 ) *
##         25) resting_electrocardiographic_results: 2 13  17.94 2 ( 0.46154 0.53846 ) *
##       13) angina: 1 16  17.99 2 ( 0.25000 0.75000 ) *
##      7) thal: 7 42  21.61 2 ( 0.07143 0.92857 )  
##       14) resting_electrocardiographic_results: 0 20  16.91 2 ( 0.15000 0.85000 ) *
##       15) resting_electrocardiographic_results: 2 22   0.00 2 ( 0.00000 1.00000 ) *

# Draw the tree and label its splits.
plot(heart.tree)
text(heart.tree, pretty = 0)

# First column of the node table: the split variable of every node, with
# "<leaf>" marking terminal nodes.
nodes <- heart.tree[["frame"]][[1L]]
print(nodes)

##  [1] chest_pain_type                     
##  [2] thal                                
##  [3] sex                                 
##  [4] <leaf>                              
##  [5] chest_pain_type                     
##  [6] <leaf>                              
##  [7] <leaf>                              
##  [8] <leaf>                              
##  [9] thal                                
## [10] angina                              
## [11] resting_electrocardiographic_results
## [12] <leaf>                              
## [13] <leaf>                              
## [14] <leaf>                              
## [15] resting_electrocardiographic_results
## [16] <leaf>                              
## [17] <leaf>                              
## 7 Levels: <leaf> sex sugar ... thal

# Number of leaves of the fitted tree: nodes whose split variable is "<leaf>".
nleaves <- sum(nodes == "<leaf>")

Liczba liści drzewa: 9

Błąd klasyfikacji

ciąg trenujący

# Classification error on the training partition.
# Class labels are requested directly with type = "class". The previous
# approach took max.col() over the class-probability matrix; max.col() breaks
# ties at random by default, so leaves with equal class probabilities gave a
# run-dependent error count that could disagree with summary(heart.tree).
# It also compared a column index with a factor, which only works while the
# class labels happen to be "1" and "2".
p_disease <- predict(heart.tree, train, type = "class")
number_misclassified <- sum(p_disease != train$disease)
accuracy <- mean(p_disease == train$disease)
misclasification_rate <- 1 - accuracy

Liczba wszystkich obserwacji : 161

Liczba błędnie zaklasyfikowanych: 27

Dokładność: 0.8136646

ciąg testowy

# Classification error on the test partition.
# Class labels are requested directly with type = "class". Taking max.col()
# over the class-probability matrix breaks ties at random by default, which
# makes the result run-dependent for leaves with equal class probabilities,
# and comparing a column index with a factor only works while the class
# labels happen to be "1" and "2".
p_disease <- predict(heart.tree, test, type = "class")
# Accuracy = share of correctly classified observations.
number_misclassified <- sum(p_disease != test$disease)
accuracy <- mean(p_disease == test$disease)
misclasification_rate <- 1 - accuracy

Liczba wszystkich obserwacji : 54

Liczba błędnie zaklasyfikowanych: 15

Dokładność: 0.7407407

Przycinanie (pruning)

Algorytm oparty na kryterium kosztu - złożoności

\(Q(T)\) – miara błędu drzewa \(T\), na przykład ułamek błędnych klasyfikacji. Dla każdego nieujemnego współczynnika złożoności \(\alpha\) szukamy takiego poddrzewa \(T\) zakorzenionego w drzewie pełnym \(T_{0}\), że wartość minimalną osiąga funkcja kosztu-złożoności wyrażona wzorem:

\(S_{\alpha}(T) = Q(T) + \alpha \cdot |T|\), gdzie \(|T|\) oznacza liczbę liści drzewa \(T\).

Przycinanie drzewa pełnego \(T_{0}\) przebiega w dwóch etapach: 1. Skonstruowanie ciągu poddrzew minimalizujących funkcję kosztu-złożoności. J-te drzewo buduje się na podstawie dowolnie ustalonej wartości \(\alpha'_{j}\); spośród drzew minimalizujących funkcję kosztu-złożoności wybieramy najmniejsze i oznaczamy je jako \(T_{j}\). Wszystkie możliwe wartości \(\alpha\) można podzielić na rozłączne przedziały, wewnątrz których dla wszystkich \(\alpha\) wybierane jest to samo drzewo.

\(T(\alpha) = T_{0}\) dla \(\alpha < \alpha_{1}\)

\(T(\alpha) = T_{k}\) dla \(\alpha_{k} \le \alpha < \alpha_{k+1}\), \(1 \le k < K\)

\(T(\alpha) = T_{K}\) dla \(\alpha \ge \alpha_{K}\)

gdzie \(K\) jest najmniejszym indeksem, dla którego wybrany został sam korzeń.

2. Wybranie z ciągu \(T_{1}, T_{2}, \dots, T_{K}\) drzewa o najmniejszym ułamku błędów klasyfikacji dla próby testowej; w przypadku gdy nie dysponujemy próbą testową, wybór najlepszego drzewa odbywa się na podstawie kroswalidacji.

Sekwencja drzew budowana z wykorzystaniem liczby liści:

# Prune the full tree to every target size, from the full leaf count down to
# a single leaf, and print the summary of each pruned tree.
for (leaves_cnt in seq(nleaves, 1)) {
  print(leaves_cnt)
  pruned.heart.tree <- prune.misclass(heart.tree, best = leaves_cnt)
  print(summary(pruned.heart.tree))
}

## [1] 9
## 
## Classification tree:
## tree(formula = formula, data = train)
## Variables actually used in tree construction:
## [1] "chest_pain_type"                     
## [2] "thal"                                
## [3] "sex"                                 
## [4] "angina"                              
## [5] "resting_electrocardiographic_results"
## Number of terminal nodes:  9 
## Residual mean deviance:  0.6756 = 102.7 / 152 
## Misclassification error rate: 0.1739 = 28 / 161 
## [1] 8
## 
## Classification tree:
## tree(formula = formula, data = train)
## Variables actually used in tree construction:
## [1] "chest_pain_type"                     
## [2] "thal"                                
## [3] "sex"                                 
## [4] "angina"                              
## [5] "resting_electrocardiographic_results"
## Number of terminal nodes:  9 
## Residual mean deviance:  0.6756 = 102.7 / 152 
## Misclassification error rate: 0.1739 = 28 / 161 
## [1] 7
## 
## Classification tree:
## tree(formula = formula, data = train)
## Variables actually used in tree construction:
## [1] "chest_pain_type"                     
## [2] "thal"                                
## [3] "sex"                                 
## [4] "angina"                              
## [5] "resting_electrocardiographic_results"
## Number of terminal nodes:  9 
## Residual mean deviance:  0.6756 = 102.7 / 152 
## Misclassification error rate: 0.1739 = 28 / 161 
## [1] 6
## 
## Classification tree:
## tree(formula = formula, data = train)
## Variables actually used in tree construction:
## [1] "chest_pain_type"                     
## [2] "thal"                                
## [3] "sex"                                 
## [4] "angina"                              
## [5] "resting_electrocardiographic_results"
## Number of terminal nodes:  9 
## Residual mean deviance:  0.6756 = 102.7 / 152 
## Misclassification error rate: 0.1739 = 28 / 161 
## [1] 5
## 
## Classification tree:
## snip.tree(tree = heart.tree, nodes = c(2L, 7L))
## Variables actually used in tree construction:
## [1] "chest_pain_type"                     
## [2] "thal"                                
## [3] "angina"                              
## [4] "resting_electrocardiographic_results"
## Number of terminal nodes:  5 
## Residual mean deviance:  0.8717 = 136 / 156 
## Misclassification error rate: 0.1739 = 28 / 161 
## [1] 4
## 
## Classification tree:
## snip.tree(tree = heart.tree, nodes = c(2L, 7L, 12L))
## Variables actually used in tree construction:
## [1] "chest_pain_type" "thal"            "angina"         
## Number of terminal nodes:  4 
## Residual mean deviance:  0.9168 = 143.9 / 157 
## Misclassification error rate: 0.1801 = 29 / 161 
## [1] 3
## 
## Classification tree:
## snip.tree(tree = heart.tree, nodes = c(2L, 7L, 12L))
## Variables actually used in tree construction:
## [1] "chest_pain_type" "thal"            "angina"         
## Number of terminal nodes:  4 
## Residual mean deviance:  0.9168 = 143.9 / 157 
## Misclassification error rate: 0.1801 = 29 / 161 
## [1] 2
## 
## Classification tree:
## snip.tree(tree = heart.tree, nodes = 2:3)
## Variables actually used in tree construction:
## [1] "chest_pain_type"
## Number of terminal nodes:  2 
## Residual mean deviance:  1.052 = 167.2 / 159 
## Misclassification error rate: 0.2174 = 35 / 161 
## [1] 1
##         Length Class      Mode   
## frame     6    data.frame list   
## where   161    -none-     numeric
## terms     3    terms      call   
## call      3    -none-     call   
## y       161    factor     numeric
## weights 161    -none-     numeric

Sekwencja drzew budowana z wykorzystaniem parametru kosztu-złożoności \(\alpha\) (argument k):

# Prune the full tree for a grid of cost-complexity values k, stepping from
# 50 down to -10 by 5, and print the summary of each pruned tree.
for (alfa in seq(from = 50, to = -10, by = -5)) {
  print(alfa)
  pruned.heart.tree <- prune.misclass(heart.tree, k = alfa)
  print(summary(pruned.heart.tree))
}

## [1] 50
##         Length Class      Mode   
## frame     6    data.frame list   
## where   161    -none-     numeric
## terms     3    terms      call   
## call      3    -none-     call   
## y       161    factor     numeric
## weights 161    -none-     numeric
## [1] 45
##         Length Class      Mode   
## frame     6    data.frame list   
## where   161    -none-     numeric
## terms     3    terms      call   
## call      3    -none-     call   
## y       161    factor     numeric
## weights 161    -none-     numeric
## [1] 40
##         Length Class      Mode   
## frame     6    data.frame list   
## where   161    -none-     numeric
## terms     3    terms      call   
## call      3    -none-     call   
## y       161    factor     numeric
## weights 161    -none-     numeric
## [1] 35
## 
## Classification tree:
## snip.tree(tree = heart.tree, nodes = 2:3)
## Variables actually used in tree construction:
## [1] "chest_pain_type"
## Number of terminal nodes:  2 
## Residual mean deviance:  1.052 = 167.2 / 159 
## Misclassification error rate: 0.2174 = 35 / 161 
## [1] 30
## 
## Classification tree:
## snip.tree(tree = heart.tree, nodes = 2:3)
## Variables actually used in tree construction:
## [1] "chest_pain_type"
## Number of terminal nodes:  2 
## Residual mean deviance:  1.052 = 167.2 / 159 
## Misclassification error rate: 0.2174 = 35 / 161 
## [1] 25
## 
## Classification tree:
## snip.tree(tree = heart.tree, nodes = 2:3)
## Variables actually used in tree construction:
## [1] "chest_pain_type"
## Number of terminal nodes:  2 
## Residual mean deviance:  1.052 = 167.2 / 159 
## Misclassification error rate: 0.2174 = 35 / 161 
## [1] 20
## 
## Classification tree:
## snip.tree(tree = heart.tree, nodes = 2:3)
## Variables actually used in tree construction:
## [1] "chest_pain_type"
## Number of terminal nodes:  2 
## Residual mean deviance:  1.052 = 167.2 / 159 
## Misclassification error rate: 0.2174 = 35 / 161 
## [1] 15
## 
## Classification tree:
## snip.tree(tree = heart.tree, nodes = 2:3)
## Variables actually used in tree construction:
## [1] "chest_pain_type"
## Number of terminal nodes:  2 
## Residual mean deviance:  1.052 = 167.2 / 159 
## Misclassification error rate: 0.2174 = 35 / 161 
## [1] 10
## 
## Classification tree:
## snip.tree(tree = heart.tree, nodes = 2:3)
## Variables actually used in tree construction:
## [1] "chest_pain_type"
## Number of terminal nodes:  2 
## Residual mean deviance:  1.052 = 167.2 / 159 
## Misclassification error rate: 0.2174 = 35 / 161 
## [1] 5
## 
## Classification tree:
## snip.tree(tree = heart.tree, nodes = 2:3)
## Variables actually used in tree construction:
## [1] "chest_pain_type"
## Number of terminal nodes:  2 
## Residual mean deviance:  1.052 = 167.2 / 159 
## Misclassification error rate: 0.2174 = 35 / 161 
## [1] 0
## 
## Classification tree:
## snip.tree(tree = heart.tree, nodes = c(2L, 7L))
## Variables actually used in tree construction:
## [1] "chest_pain_type"                     
## [2] "thal"                                
## [3] "angina"                              
## [4] "resting_electrocardiographic_results"
## Number of terminal nodes:  5 
## Residual mean deviance:  0.8717 = 136 / 156 
## Misclassification error rate: 0.1739 = 28 / 161 
## [1] -5
## 
## Classification tree:
## tree(formula = formula, data = train)
## Variables actually used in tree construction:
## [1] "chest_pain_type"                     
## [2] "thal"                                
## [3] "sex"                                 
## [4] "angina"                              
## [5] "resting_electrocardiographic_results"
## Number of terminal nodes:  9 
## Residual mean deviance:  0.6756 = 102.7 / 152 
## Misclassification error rate: 0.1739 = 28 / 161 
## [1] -10
## 
## Classification tree:
## tree(formula = formula, data = train)
## Variables actually used in tree construction:
## [1] "chest_pain_type"                     
## [2] "thal"                                
## [3] "sex"                                 
## [4] "angina"                              
## [5] "resting_electrocardiographic_results"
## Number of terminal nodes:  9 
## Residual mean deviance:  0.6756 = 102.7 / 152 
## Misclassification error rate: 0.1739 = 28 / 161

  # Prune down to a single leaf (the root only) and summarise the result.
  pruned.heart.tree <- prune.misclass(heart.tree, best = 1)
  print(summary(pruned.heart.tree))

##         Length Class      Mode   
## frame     6    data.frame list   
## where   161    -none-     numeric
## terms     3    terms      call   
## call      3    -none-     call   
## y       161    factor     numeric
## weights 161    -none-     numeric

  ##plot(pruned.heart.tree)
  #text(pruned.heart.tree, pretty=0)

Wyznaczenie sekwencji drzew – w wyniku otrzymujemy liczbę liści drzewa, odpowiadający jej błąd klasyfikacji oraz graniczną wartość parametru kosztu-złożoności, czyli kary za rozmiar drzewa.

# Pruning sequence of the tree: error as a function of the number of leaves.
# FUN selects the pruning criterion (here: misclassification).
cv.heart.tree <- cv.tree(heart.tree, FUN = prune.misclass)
print(cv.heart.tree)

## $size
## [1] 9 5 4 2 1
## 
## $dev
## [1] 38 36 37 47 77
## 
## $k
## [1] -Inf    0    1    3   38
## 
## $method
## [1] "misclass"
## 
## attr(,"class")
## [1] "prune"         "tree.sequence"

Wykres przedstawiający błąd klasyfikacji w zależności od liczby liści oraz wartości parametru:

# Two panels side by side: error against tree size (left) and against the
# cost-complexity parameter k (right).
par(mfrow = c(1, 2))
plot(x = cv.heart.tree$size, y = cv.heart.tree$dev, type = "b")
plot(x = cv.heart.tree$k, y = cv.heart.tree$dev, type = "b")

Uzyskana sekwencja drzew:

# Summarise and draw every tree of the pruning sequence except the last entry.
# head(x, -1) drops that last element explicitly. The previous index
# `1:length(x) - 1` parses as `(1:length(x)) - 1`, i.e. 0:(n - 1), and only
# selected the intended elements because a zero index is silently ignored.
# NOTE(review): in the recorded run the last size is 1 (root only); presumably
# it is skipped because a single-node tree cannot be plotted - confirm.
for (leaves_cnt in head(cv.heart.tree$size, -1)) {
  pruned.heart.tree <- prune.misclass(heart.tree, best = leaves_cnt)
  print(summary(pruned.heart.tree))
  plot(pruned.heart.tree)
  text(pruned.heart.tree, pretty = 0)
}

## 
## Classification tree:
## tree(formula = formula, data = train)
## Variables actually used in tree construction:
## [1] "chest_pain_type"                     
## [2] "thal"                                
## [3] "sex"                                 
## [4] "angina"                              
## [5] "resting_electrocardiographic_results"
## Number of terminal nodes:  9 
## Residual mean deviance:  0.6756 = 102.7 / 152 
## Misclassification error rate: 0.1739 = 28 / 161

## 
## Classification tree:
## snip.tree(tree = heart.tree, nodes = c(2L, 7L))
## Variables actually used in tree construction:
## [1] "chest_pain_type"                     
## [2] "thal"                                
## [3] "angina"                              
## [4] "resting_electrocardiographic_results"
## Number of terminal nodes:  5 
## Residual mean deviance:  0.8717 = 136 / 156 
## Misclassification error rate: 0.1739 = 28 / 161

## 
## Classification tree:
## snip.tree(tree = heart.tree, nodes = c(2L, 7L, 12L))
## Variables actually used in tree construction:
## [1] "chest_pain_type" "thal"            "angina"         
## Number of terminal nodes:  4 
## Residual mean deviance:  0.9168 = 143.9 / 157 
## Misclassification error rate: 0.1801 = 29 / 161

## 
## Classification tree:
## snip.tree(tree = heart.tree, nodes = 2:3)
## Variables actually used in tree construction:
## [1] "chest_pain_type"
## Number of terminal nodes:  2 
## Residual mean deviance:  1.052 = 167.2 / 159 
## Misclassification error rate: 0.2174 = 35 / 161

# Plot the pruning sequence through the plot() generic instead of calling the
# S3 method plot.tree.sequence() by name: cv.heart.tree has class
# c("prune", "tree.sequence"), so dispatch reaches the same method without
# relying on it being visible as a standalone function.
plot(cv.heart.tree)

# Keep the 4-leaf tree from the pruning sequence for the evaluation that
# follows.
pruned.heart.tree <- prune.misclass(heart.tree, best = 4)
summary(pruned.heart.tree)

## 
## Classification tree:
## snip.tree(tree = heart.tree, nodes = c(2L, 7L, 12L))
## Variables actually used in tree construction:
## [1] "chest_pain_type" "thal"            "angina"         
## Number of terminal nodes:  4 
## Residual mean deviance:  0.9168 = 143.9 / 157 
## Misclassification error rate: 0.1801 = 29 / 161

# Draw the pruned tree and label its splits.
plot(pruned.heart.tree)
text(pruned.heart.tree, pretty = 0)

# Total deviance of the unpruned tree.
print(deviance(heart.tree))

## [1] 102.6931

# Total number of misclassified training observations of the unpruned tree.
print(misclass.tree(heart.tree, detail = FALSE))

## [1] 28

# Number of misclassifications at every node (not necessarily a leaf).
print(misclass.tree(heart.tree, detail = TRUE))

##  1  2  4  8  9 18 19  5  3  6 12 24 25 13  7 14 15 
## 73 15  3  0  3  0  3 12 20 17  7  0  6  4  3  3  0

Błąd klasyfikacji

# Accuracy of the pruned tree on the test partition.
# Class labels are requested directly with type = "class". Taking max.col()
# over the class-probability matrix breaks ties at random by default, which
# makes the result run-dependent for leaves with equal class probabilities,
# and comparing a column index with a factor only works while the class
# labels happen to be "1" and "2".
p_disease <- predict(pruned.heart.tree, test, type = "class")
# Accuracy = share of correctly classified observations.
accuracy <- mean(p_disease == test$disease)
misclasification_rate <- 1 - accuracy

accuracy

## [1] 0.7407407

Biblioteka rpart

library(rpart)
library(rpart.plot)


# Fit a classification tree with rpart on the training partition.
# Note that this overwrites the `heart.tree` object.
heart.tree <- rpart(formula = formula, data = train, method = "class")
print(summary(heart.tree))

## Call:
## rpart(formula = formula, data = train, method = "class")
##   n= 161 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.52054795      0 1.0000000 1.0000000 0.08653005
## 2 0.04109589      1 0.4794521 0.6164384 0.07800087
## 3 0.01369863      3 0.3972603 0.4931507 0.07242199
## 4 0.01000000      6 0.3561644 0.5479452 0.07510812
## 
## Variable importance
##                      chest_pain_type                                 thal 
##                                   35                                   29 
##                               angina resting_electrocardiographic_results 
##                                   22                                    9 
##                                  sex                                sugar 
##                                    3                                    3 
## 
## Node number 1: 161 observations,    complexity param=0.5205479
##   predicted class=1  expected loss=0.4534161  P(node) =1
##     class counts:    88    73
##    probabilities: 0.547 0.453 
##   left son=2 (83 obs) right son=3 (78 obs)
##   Primary splits:
##       chest_pain_type                      splits as  LLLR, improve=25.479340, (0 missing)
##       thal                                 splits as  LRR,  improve=25.109920, (0 missing)
##       angina                               splits as  LR,   improve=20.922100, (0 missing)
##       sex                                  splits as  LR,   improve= 4.361963, (0 missing)
##       resting_electrocardiographic_results splits as  LRR,  improve= 3.039587, (0 missing)
##   Surrogate splits:
##       angina                               splits as  LR,  agree=0.739, adj=0.462, (0 split)
##       thal                                 splits as  LRR, agree=0.665, adj=0.308, (0 split)
##       resting_electrocardiographic_results splits as  LRR, agree=0.578, adj=0.128, (0 split)
##       sugar                                splits as  RL,  agree=0.540, adj=0.051, (0 split)
## 
## Node number 2: 83 observations,    complexity param=0.01369863
##   predicted class=1  expected loss=0.1807229  P(node) =0.515528
##     class counts:    68    15
##    probabilities: 0.819 0.181 
##   left son=4 (59 obs) right son=5 (24 obs)
##   Primary splits:
##       thal                                 splits as  LRR,  improve=6.88339800, (0 missing)
##       sex                                  splits as  LR,   improve=1.65238700, (0 missing)
##       angina                               splits as  LR,   improve=0.84851530, (0 missing)
##       chest_pain_type                      splits as  RLR-, improve=0.45753400, (0 missing)
##       resting_electrocardiographic_results splits as  L-R,  improve=0.02393973, (0 missing)
##   Surrogate splits:
##       angina splits as  LR, agree=0.723, adj=0.042, (0 split)
## 
## Node number 3: 78 observations,    complexity param=0.04109589
##   predicted class=2  expected loss=0.2564103  P(node) =0.484472
##     class counts:    20    58
##    probabilities: 0.256 0.744 
##   left son=6 (36 obs) right son=7 (42 obs)
##   Primary splits:
##       thal                                 splits as  LLR, improve=6.2277170, (0 missing)
##       angina                               splits as  LR,  improve=5.3235490, (0 missing)
##       sex                                  splits as  LR,  improve=2.0756060, (0 missing)
##       resting_electrocardiographic_results splits as  LRR, improve=1.5261980, (0 missing)
##       sugar                                splits as  LR,  improve=0.1983181, (0 missing)
##   Surrogate splits:
##       sex                                  splits as  LR,  agree=0.679, adj=0.306, (0 split)
##       angina                               splits as  LR,  agree=0.654, adj=0.250, (0 split)
##       resting_electrocardiographic_results splits as  RLL, agree=0.564, adj=0.056, (0 split)
## 
## Node number 4: 59 observations
##   predicted class=1  expected loss=0.05084746  P(node) =0.3664596
##     class counts:    56     3
##    probabilities: 0.949 0.051 
## 
## Node number 5: 24 observations,    complexity param=0.01369863
##   predicted class=1  expected loss=0.5  P(node) =0.1490683
##     class counts:    12    12
##    probabilities: 0.500 0.500 
##   left son=10 (8 obs) right son=11 (16 obs)
##   Primary splits:
##       sugar                                splits as  RL,  improve=0.3750000, (0 missing)
##       resting_electrocardiographic_results splits as  L-R, improve=0.3428571, (0 missing)
##   Surrogate splits:
##       thal splits as  -LR, agree=0.708, adj=0.125, (0 split)
## 
## Node number 6: 36 observations,    complexity param=0.04109589
##   predicted class=2  expected loss=0.4722222  P(node) =0.2236025
##     class counts:    17    19
##    probabilities: 0.472 0.528 
##   left son=12 (20 obs) right son=13 (16 obs)
##   Primary splits:
##       angina                               splits as  LR,  improve=2.8444440, (0 missing)
##       resting_electrocardiographic_results splits as  LRR, improve=2.7777780, (0 missing)
##       sex                                  splits as  LR,  improve=0.8670451, (0 missing)
##   Surrogate splits:
##       sex                                  splits as  LR,  agree=0.583, adj=0.063, (0 split)
##       sugar                                splits as  LR,  agree=0.583, adj=0.063, (0 split)
##       resting_electrocardiographic_results splits as  LRL, agree=0.583, adj=0.063, (0 split)
## 
## Node number 7: 42 observations
##   predicted class=2  expected loss=0.07142857  P(node) =0.2608696
##     class counts:     3    39
##    probabilities: 0.071 0.929 
## 
## Node number 10: 8 observations
##   predicted class=1  expected loss=0.375  P(node) =0.04968944
##     class counts:     5     3
##    probabilities: 0.625 0.375 
## 
## Node number 11: 16 observations
##   predicted class=2  expected loss=0.4375  P(node) =0.09937888
##     class counts:     7     9
##    probabilities: 0.438 0.562 
## 
## Node number 12: 20 observations,    complexity param=0.01369863
##   predicted class=1  expected loss=0.35  P(node) =0.1242236
##     class counts:    13     7
##    probabilities: 0.650 0.350 
##   left son=24 (7 obs) right son=25 (13 obs)
##   Primary splits:
##       resting_electrocardiographic_results splits as  L-R, improve=2.6384620, (0 missing)
##       sex                                  splits as  LR,  improve=0.2919192, (0 missing)
## 
## Node number 13: 16 observations
##   predicted class=2  expected loss=0.25  P(node) =0.09937888
##     class counts:     4    12
##    probabilities: 0.250 0.750 
## 
## Node number 24: 7 observations
##   predicted class=1  expected loss=0  P(node) =0.04347826
##     class counts:     7     0
##    probabilities: 1.000 0.000 
## 
## Node number 25: 13 observations
##   predicted class=2  expected loss=0.4615385  P(node) =0.08074534
##     class counts:     6     7
##    probabilities: 0.462 0.538

# Draw the rpart tree.
rpart.plot(heart.tree, uniform = TRUE)

# Complexity-parameter table: CP, number of splits, relative error,
# cross-validated error and its standard deviation.
printcp(heart.tree)

## 
## Classification tree:
## rpart(formula = formula, data = train, method = "class")
## 
## Variables actually used in tree construction:
## [1] angina                              
## [2] chest_pain_type                     
## [3] resting_electrocardiographic_results
## [4] sugar                               
## [5] thal                                
## 
## Root node error: 73/161 = 0.45342
## 
## n= 161 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.520548      0   1.00000 1.00000 0.086530
## 2 0.041096      1   0.47945 0.61644 0.078001
## 3 0.013699      3   0.39726 0.49315 0.072422
## 4 0.010000      6   0.35616 0.54795 0.075108

# Complexity parameter of the cptable row with the lowest cross-validated
# error (xerror).
opt <- which.min(heart.tree[["cptable"]][, "xerror"])
cp <- heart.tree[["cptable"]][opt, "CP"]

# NOTE(review): the pruning step below is disabled, so `cp` is computed but
# not applied and the tree drawn afterwards is still the unpruned one.
#pruned<-prune(heart.tree,cp)

plot(heart.tree)
text(heart.tree)

Random forest

library (randomForest)

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:ggplot2':
## 
##     margin

# Fit a random forest of 50 trees on the training partition.
rforest <- randomForest(formula, train, ntree = 50)

# Predicted classes for the test partition.
p_disease <- predict(rforest, test)
# Accuracy = share of correctly classified observations.
accuracy <- mean(p_disease == test$disease)
misclasification_rate <- 1 - accuracy

print(accuracy)

## [1] 0.7777778

Decision Trees and Random Forests

Marcin Mazurek

2017-11-08

Classification trees

Przykładowy zbiór danych

Metadata

Attribute Information:

Attributes types

Variable to be predicted

Import data

Budowa drzewa klasyfikującego

Biblioteka tree

Budowa drzewa

Błąd klasyfikacji

Przycinanie (pruning)

Błąd klasyfikacji

Biblioteka rpart

Random forest

Regression trees