http://archive.ics.uci.edu/ml/datasets/statlog+(heart)
– 1. age
– 2. sex
– 3. chest pain type (4 values)
– 4. resting blood pressure
– 5. serum cholesterol in mg/dl
– 6. fasting blood sugar > 120 mg/dl
– 7. resting electrocardiographic results (values 0,1,2)
– 8. maximum heart rate achieved
– 9. exercise induced angina
– 10. oldpeak = ST depression induced by exercise relative to rest
– 11. the slope of the peak exercise ST segment
– 12. number of major vessels (0-3) colored by fluoroscopy
– 13. thal: 3 = normal; 6 = fixed defect; 7 = reversible defect
Real: 1, 4, 5, 8, 10, 12
Ordered: 11
Binary: 2, 6, 9
Nominal: 3, 7, 13
Target: absence (1) or presence (2) of heart disease
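The code below assumes heart.dat already sits in the working directory. A minimal sketch for fetching it from the UCI repository; the exact file path is an assumption based on the usual UCI layout for the Statlog data sets, not something stated in this document:
# sketch only: download heart.dat if it is not already present
# (the URL path below is assumed, verify against the UCI page linked above)
if (!file.exists("heart.dat")) {
  download.file(
    "http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/heart/heart.dat",
    destfile = "heart.dat")
}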
library(ggplot2)
# note: heart.dat has no header row; with the default header=TRUE the first
# record is consumed as column names, leaving 269 of the 270 observations
heart<-read.csv("heart.dat", sep=' ')
colnames(heart)<-c('age',
'sex' ,
'chest_pain_type' ,
'blood_pressure' ,
'cholestor',
'sugar',
'resting_electrocardiographic_results' ,
'heart_rate',
'angina' ,
'oldpeak',
'slope' ,
'vessles' ,
'thal',
'disease')
head(heart)
## age sex chest_pain_type blood_pressure cholestor sugar
## 1 67 0 3 115 564 0
## 2 57 1 2 124 261 0
## 3 64 1 4 128 263 0
## 4 74 0 2 120 269 0
## 5 65 1 4 120 177 0
## 6 56 1 3 130 256 1
## resting_electrocardiographic_results heart_rate angina oldpeak slope
## 1 2 160 0 1.6 2
## 2 0 141 0 0.3 1
## 3 0 105 1 0.2 2
## 4 2 121 1 0.2 1
## 5 0 140 0 0.4 1
## 6 2 142 1 0.6 2
## vessles thal disease
## 1 0 7 1
## 2 0 7 2
## 3 1 7 1
## 4 1 3 1
## 5 0 7 1
## 6 1 6 2
# binary columns
bin_cols <- colnames(heart)[c(2,6,9)]
# nominal columns
nom_cols <- colnames(heart)[c(7,3,13)]
# convert the categorical attributes to factors
heart[bin_cols]<-lapply(heart[bin_cols], factor)
heart[nom_cols]<-lapply(heart[nom_cols], factor)
# target column (class label)
target_col<-'disease'
heart[target_col]<-lapply(heart[target_col], factor)
# randomly shuffle the order of the records
heart.shuffled<-heart[sample(nrow(heart)),]
# indices delimiting the partitions (records are already shuffled)
idx1<-round(0.6*nrow(heart.shuffled))
idx2<-round(0.8*nrow(heart.shuffled))
train<-heart.shuffled[1:idx1,]
validation<-heart.shuffled[(idx1+1):idx2,]
test<-heart.shuffled[(idx2+1):nrow(heart.shuffled),]
# only the categorical (binary and nominal) attributes are used as predictors
predictors_string<-paste( c(bin_cols , nom_cols), collapse='+')
formula<-as.formula(paste('disease~', predictors_string))
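As a quick sanity check (not part of the original output), printing the constructed formula should list the six categorical predictors:
print(formula)
# expected, given the column names defined above:
# disease ~ sex + sugar + angina + resting_electrocardiographic_results +
#   chest_pain_type + thal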
###prior probabilities
nrow(train)
## [1] 161
t<-table(heart$disease)
prop.table(t)
##
## 1 2
## 0.5576208 0.4423792
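The proportions above are computed on the full data set; the same check on the training partition (a sketch, its output was not recorded here) would be:
prop.table(table(train$disease))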
library(tree)
heart.tree <- tree(formula, train)
summary(heart.tree)
##
## Classification tree:
## tree(formula = formula, data = train)
## Variables actually used in tree construction:
## [1] "chest_pain_type"
## [2] "thal"
## [3] "sex"
## [4] "angina"
## [5] "resting_electrocardiographic_results"
## Number of terminal nodes: 9
## Residual mean deviance: 0.6756 = 102.7 / 152
## Misclassification error rate: 0.1739 = 28 / 161
heart.tree
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 161 221.80 1 ( 0.54658 0.45342 )
## 2) chest_pain_type: 1,2,3 83 78.43 1 ( 0.81928 0.18072 )
## 4) thal: 3 59 23.72 1 ( 0.94915 0.05085 )
## 8) sex: 0 25 0.00 1 ( 1.00000 0.00000 ) *
## 9) sex: 1 34 20.29 1 ( 0.91176 0.08824 )
## 18) chest_pain_type: 2 15 0.00 1 ( 1.00000 0.00000 ) *
## 19) chest_pain_type: 1,3 19 16.57 1 ( 0.84211 0.15789 ) *
## 5) thal: 6,7 24 33.27 2 ( 0.50000 0.50000 ) *
## 3) chest_pain_type: 4 78 88.81 2 ( 0.25641 0.74359 )
## 6) thal: 3,6 36 49.80 2 ( 0.47222 0.52778 )
## 12) angina: 0 20 25.90 1 ( 0.65000 0.35000 )
## 24) resting_electrocardiographic_results: 0 7 0.00 1 ( 1.00000 0.00000 ) *
## 25) resting_electrocardiographic_results: 2 13 17.94 2 ( 0.46154 0.53846 ) *
## 13) angina: 1 16 17.99 2 ( 0.25000 0.75000 ) *
## 7) thal: 7 42 21.61 2 ( 0.07143 0.92857 )
## 14) resting_electrocardiographic_results: 0 20 16.91 2 ( 0.15000 0.85000 ) *
## 15) resting_electrocardiographic_results: 2 22 0.00 2 ( 0.00000 1.00000 ) *
plot(heart.tree)
text(heart.tree, pretty=0)
nodes <- heart.tree$frame[[1]]
print(nodes)
## [1] chest_pain_type
## [2] thal
## [3] sex
## [4] <leaf>
## [5] chest_pain_type
## [6] <leaf>
## [7] <leaf>
## [8] <leaf>
## [9] thal
## [10] angina
## [11] resting_electrocardiographic_results
## [12] <leaf>
## [13] <leaf>
## [14] <leaf>
## [15] resting_electrocardiographic_results
## [16] <leaf>
## [17] <leaf>
## 7 Levels: <leaf> sex sugar ... thal
# number of leaves of the resulting tree
nleaves <- sum(nodes == '<leaf>')
Number of leaves in the tree: 9
p_disease <- predict(heart.tree, train)
number_misclassified <- sum(max.col(p_disease) != train$disease)
accuracy <- mean(max.col(p_disease) == train$disease)
misclassification_rate <- 1 - accuracy
Total number of observations: 161
Number misclassified: 27
Accuracy: 0.8136646
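The three summary lines above come from code that is not shown in the chunk; a minimal sketch reproducing them from the variables defined above:
cat("Total number of observations:", nrow(train), "\n")
cat("Number misclassified:", number_misclassified, "\n")
cat("Accuracy:", accuracy, "\n")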
p_disease <- predict(heart.tree, test)
# proportion of correctly classified observations
number_misclassified <- sum(max.col(p_disease) != test$disease)
accuracy <- mean(max.col(p_disease) == test$disease)
misclassification_rate <- 1 - accuracy
Total number of observations: 54
Number misclassified: 15
Accuracy: 0.7407407
An algorithm based on the cost-complexity criterion
\(Q(T)\) is a measure of the tree's error, for example the fraction of misclassified observations. For every non-negative complexity parameter \(\alpha\) we look for a subtree rooted in the full tree \(T_{0}\) that minimizes the cost-complexity function:
\(S_{\alpha}(T) = Q(T) + \alpha \cdot |T|\)
where \(|T|\) is the number of leaves of \(T\). Pruning the full tree \(T_{0}\) proceeds in stages: first, a sequence of subtrees minimizing the cost-complexity function is constructed. The \(j\)-th tree is built for an arbitrarily fixed value \(\alpha'_{j}\); among the minimizers the smallest tree is chosen and denoted \(T_{j}\). All possible values of \(\alpha\) can be partitioned into disjoint intervals within which the same tree is selected for every \(\alpha\):
\(T(\alpha) = T_{0}\) for \(\alpha < \alpha_{1}\)
\(T(\alpha) = T_{k}\) for \(\alpha_{k} \le \alpha < \alpha_{k+1}\), \(1 \le k < K\)
\(T(\alpha) = T_{K}\) for \(\alpha \ge \alpha_{K}\)
where \(K\) is the smallest index for which only the root is selected.
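To make the trade-off concrete, the following small sketch (not part of the original analysis) evaluates \(S_{\alpha}(T)\) for the subtree sizes that appear in the pruning summaries further below, using their reported training misclassification counts (28/161 at 9 and 5 leaves, 29/161 at 4 leaves, 35/161 at 2 leaves):
# worked sketch of S_alpha(T) = Q(T) + alpha * |T|
Q <- c(28, 28, 29, 35) / 161      # misclassification fractions Q(T)
sizes <- c(9, 5, 4, 2)            # numbers of leaves |T|
alpha <- 0.02                     # an example complexity parameter
S_alpha <- Q + alpha * sizes
names(S_alpha) <- paste(sizes, "leaves")
round(S_alpha, 3)
# small alpha favours the large tree; increasing alpha shifts the minimum
# towards smaller trees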
The tree sequence built using the number of leaves (the best argument of prune.misclass):
for (leaves_cnt in nleaves:1 ){
print(leaves_cnt)
pruned.heart.tree = prune.misclass(heart.tree, best=leaves_cnt)
print(summary(pruned.heart.tree))
}
## [1] 9
##
## Classification tree:
## tree(formula = formula, data = train)
## Variables actually used in tree construction:
## [1] "chest_pain_type"
## [2] "thal"
## [3] "sex"
## [4] "angina"
## [5] "resting_electrocardiographic_results"
## Number of terminal nodes: 9
## Residual mean deviance: 0.6756 = 102.7 / 152
## Misclassification error rate: 0.1739 = 28 / 161
## [1] 8
##
## Classification tree:
## tree(formula = formula, data = train)
## Variables actually used in tree construction:
## [1] "chest_pain_type"
## [2] "thal"
## [3] "sex"
## [4] "angina"
## [5] "resting_electrocardiographic_results"
## Number of terminal nodes: 9
## Residual mean deviance: 0.6756 = 102.7 / 152
## Misclassification error rate: 0.1739 = 28 / 161
## [1] 7
##
## Classification tree:
## tree(formula = formula, data = train)
## Variables actually used in tree construction:
## [1] "chest_pain_type"
## [2] "thal"
## [3] "sex"
## [4] "angina"
## [5] "resting_electrocardiographic_results"
## Number of terminal nodes: 9
## Residual mean deviance: 0.6756 = 102.7 / 152
## Misclassification error rate: 0.1739 = 28 / 161
## [1] 6
##
## Classification tree:
## tree(formula = formula, data = train)
## Variables actually used in tree construction:
## [1] "chest_pain_type"
## [2] "thal"
## [3] "sex"
## [4] "angina"
## [5] "resting_electrocardiographic_results"
## Number of terminal nodes: 9
## Residual mean deviance: 0.6756 = 102.7 / 152
## Misclassification error rate: 0.1739 = 28 / 161
## [1] 5
##
## Classification tree:
## snip.tree(tree = heart.tree, nodes = c(2L, 7L))
## Variables actually used in tree construction:
## [1] "chest_pain_type"
## [2] "thal"
## [3] "angina"
## [4] "resting_electrocardiographic_results"
## Number of terminal nodes: 5
## Residual mean deviance: 0.8717 = 136 / 156
## Misclassification error rate: 0.1739 = 28 / 161
## [1] 4
##
## Classification tree:
## snip.tree(tree = heart.tree, nodes = c(2L, 7L, 12L))
## Variables actually used in tree construction:
## [1] "chest_pain_type" "thal" "angina"
## Number of terminal nodes: 4
## Residual mean deviance: 0.9168 = 143.9 / 157
## Misclassification error rate: 0.1801 = 29 / 161
## [1] 3
##
## Classification tree:
## snip.tree(tree = heart.tree, nodes = c(2L, 7L, 12L))
## Variables actually used in tree construction:
## [1] "chest_pain_type" "thal" "angina"
## Number of terminal nodes: 4
## Residual mean deviance: 0.9168 = 143.9 / 157
## Misclassification error rate: 0.1801 = 29 / 161
## [1] 2
##
## Classification tree:
## snip.tree(tree = heart.tree, nodes = 2:3)
## Variables actually used in tree construction:
## [1] "chest_pain_type"
## Number of terminal nodes: 2
## Residual mean deviance: 1.052 = 167.2 / 159
## Misclassification error rate: 0.2174 = 35 / 161
## [1] 1
## Length Class Mode
## frame 6 data.frame list
## where 161 -none- numeric
## terms 3 terms call
## call 3 -none- call
## y 161 factor numeric
## weights 161 -none- numeric
The tree sequence built using the cost-complexity parameter (the k argument of prune.misclass):
for (alfa in seq(50,-10,-5)){
print(alfa)
pruned.heart.tree = prune.misclass(heart.tree, k=alfa)
print(summary(pruned.heart.tree))
}
## [1] 50
## Length Class Mode
## frame 6 data.frame list
## where 161 -none- numeric
## terms 3 terms call
## call 3 -none- call
## y 161 factor numeric
## weights 161 -none- numeric
## [1] 45
## Length Class Mode
## frame 6 data.frame list
## where 161 -none- numeric
## terms 3 terms call
## call 3 -none- call
## y 161 factor numeric
## weights 161 -none- numeric
## [1] 40
## Length Class Mode
## frame 6 data.frame list
## where 161 -none- numeric
## terms 3 terms call
## call 3 -none- call
## y 161 factor numeric
## weights 161 -none- numeric
## [1] 35
##
## Classification tree:
## snip.tree(tree = heart.tree, nodes = 2:3)
## Variables actually used in tree construction:
## [1] "chest_pain_type"
## Number of terminal nodes: 2
## Residual mean deviance: 1.052 = 167.2 / 159
## Misclassification error rate: 0.2174 = 35 / 161
## [1] 30
##
## Classification tree:
## snip.tree(tree = heart.tree, nodes = 2:3)
## Variables actually used in tree construction:
## [1] "chest_pain_type"
## Number of terminal nodes: 2
## Residual mean deviance: 1.052 = 167.2 / 159
## Misclassification error rate: 0.2174 = 35 / 161
## [1] 25
##
## Classification tree:
## snip.tree(tree = heart.tree, nodes = 2:3)
## Variables actually used in tree construction:
## [1] "chest_pain_type"
## Number of terminal nodes: 2
## Residual mean deviance: 1.052 = 167.2 / 159
## Misclassification error rate: 0.2174 = 35 / 161
## [1] 20
##
## Classification tree:
## snip.tree(tree = heart.tree, nodes = 2:3)
## Variables actually used in tree construction:
## [1] "chest_pain_type"
## Number of terminal nodes: 2
## Residual mean deviance: 1.052 = 167.2 / 159
## Misclassification error rate: 0.2174 = 35 / 161
## [1] 15
##
## Classification tree:
## snip.tree(tree = heart.tree, nodes = 2:3)
## Variables actually used in tree construction:
## [1] "chest_pain_type"
## Number of terminal nodes: 2
## Residual mean deviance: 1.052 = 167.2 / 159
## Misclassification error rate: 0.2174 = 35 / 161
## [1] 10
##
## Classification tree:
## snip.tree(tree = heart.tree, nodes = 2:3)
## Variables actually used in tree construction:
## [1] "chest_pain_type"
## Number of terminal nodes: 2
## Residual mean deviance: 1.052 = 167.2 / 159
## Misclassification error rate: 0.2174 = 35 / 161
## [1] 5
##
## Classification tree:
## snip.tree(tree = heart.tree, nodes = 2:3)
## Variables actually used in tree construction:
## [1] "chest_pain_type"
## Number of terminal nodes: 2
## Residual mean deviance: 1.052 = 167.2 / 159
## Misclassification error rate: 0.2174 = 35 / 161
## [1] 0
##
## Classification tree:
## snip.tree(tree = heart.tree, nodes = c(2L, 7L))
## Variables actually used in tree construction:
## [1] "chest_pain_type"
## [2] "thal"
## [3] "angina"
## [4] "resting_electrocardiographic_results"
## Number of terminal nodes: 5
## Residual mean deviance: 0.8717 = 136 / 156
## Misclassification error rate: 0.1739 = 28 / 161
## [1] -5
##
## Classification tree:
## tree(formula = formula, data = train)
## Variables actually used in tree construction:
## [1] "chest_pain_type"
## [2] "thal"
## [3] "sex"
## [4] "angina"
## [5] "resting_electrocardiographic_results"
## Number of terminal nodes: 9
## Residual mean deviance: 0.6756 = 102.7 / 152
## Misclassification error rate: 0.1739 = 28 / 161
## [1] -10
##
## Classification tree:
## tree(formula = formula, data = train)
## Variables actually used in tree construction:
## [1] "chest_pain_type"
## [2] "thal"
## [3] "sex"
## [4] "angina"
## [5] "resting_electrocardiographic_results"
## Number of terminal nodes: 9
## Residual mean deviance: 0.6756 = 102.7 / 152
## Misclassification error rate: 0.1739 = 28 / 161
pruned.heart.tree = prune.misclass(heart.tree, best=1)
print(summary(pruned.heart.tree))
## Length Class Mode
## frame 6 data.frame list
## where 161 -none- numeric
## terms 3 terms call
## call 3 -none- call
## y 161 factor numeric
## weights 161 -none- numeric
# plotting is skipped here: the tree pruned with best=1 collapses to the root only
##plot(pruned.heart.tree)
#text(pruned.heart.tree, pretty=0)
Determining the tree sequence: the result gives the number of leaves of each tree, the corresponding classification error, and the threshold value of the parameter tied to the cost of the tree's size.
# cross-validated pruning: error as a function of the number of leaves
# FUN - the pruning criterion
cv.heart.tree<-cv.tree(heart.tree, FUN = prune.misclass)
cv.heart.tree
## $size
## [1] 9 5 4 2 1
##
## $dev
## [1] 38 36 37 47 77
##
## $k
## [1] -Inf 0 1 3 38
##
## $method
## [1] "misclass"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
A plot of the classification error as a function of the number of leaves and of the parameter value:
par(mfrow=c(1,2))
plot(cv.heart.tree$size, cv.heart.tree$dev, type="b")
plot(cv.heart.tree$k, cv.heart.tree$dev, type="b")
The resulting tree sequence:
# iterate over all tree sizes except the last one (the root-only tree)
for (leaves_cnt in head(cv.heart.tree$size, -1)) {
pruned.heart.tree = prune.misclass(heart.tree, best = leaves_cnt)
print(summary(pruned.heart.tree))
plot(pruned.heart.tree)
text(pruned.heart.tree, pretty = 0)
}
##
## Classification tree:
## tree(formula = formula, data = train)
## Variables actually used in tree construction:
## [1] "chest_pain_type"
## [2] "thal"
## [3] "sex"
## [4] "angina"
## [5] "resting_electrocardiographic_results"
## Number of terminal nodes: 9
## Residual mean deviance: 0.6756 = 102.7 / 152
## Misclassification error rate: 0.1739 = 28 / 161
##
## Classification tree:
## snip.tree(tree = heart.tree, nodes = c(2L, 7L))
## Variables actually used in tree construction:
## [1] "chest_pain_type"
## [2] "thal"
## [3] "angina"
## [4] "resting_electrocardiographic_results"
## Number of terminal nodes: 5
## Residual mean deviance: 0.8717 = 136 / 156
## Misclassification error rate: 0.1739 = 28 / 161
##
## Classification tree:
## snip.tree(tree = heart.tree, nodes = c(2L, 7L, 12L))
## Variables actually used in tree construction:
## [1] "chest_pain_type" "thal" "angina"
## Number of terminal nodes: 4
## Residual mean deviance: 0.9168 = 143.9 / 157
## Misclassification error rate: 0.1801 = 29 / 161
##
## Classification tree:
## snip.tree(tree = heart.tree, nodes = 2:3)
## Variables actually used in tree construction:
## [1] "chest_pain_type"
## Number of terminal nodes: 2
## Residual mean deviance: 1.052 = 167.2 / 159
## Misclassification error rate: 0.2174 = 35 / 161
# plot the cross-validated error against tree size (dispatches to the tree.sequence method)
plot(cv.heart.tree)
pruned.heart.tree=prune.misclass(heart.tree, best=4)
summary(pruned.heart.tree)
##
## Classification tree:
## snip.tree(tree = heart.tree, nodes = c(2L, 7L, 12L))
## Variables actually used in tree construction:
## [1] "chest_pain_type" "thal" "angina"
## Number of terminal nodes: 4
## Residual mean deviance: 0.9168 = 143.9 / 157
## Misclassification error rate: 0.1801 = 29 / 161
plot(pruned.heart.tree)
text(pruned.heart.tree, pretty=0)
deviance(heart.tree)
## [1] 102.6931
misclass.tree(heart.tree, detail=FALSE)
## [1] 28
# number of misclassifications in each node (not only in the leaves)
misclass.tree(heart.tree, detail=TRUE)
## 1 2 4 8 9 18 19 5 3 6 12 24 25 13 7 14 15
## 73 15 3 0 3 0 3 12 20 17 7 0 6 4 3 3 0
p_disease <- predict(pruned.heart.tree, test)
# proportion of correctly classified observations
accuracy <- mean(max.col(p_disease) == test$disease)
misclassification_rate <- 1 - accuracy
accuracy
## [1] 0.7407407
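Accuracy alone hides the error structure; a confusion matrix on the test set (a sketch, its counts depend on the random split and were not recorded in the original) can be obtained with:
# rows: predicted class (1/2), columns: actual class
table(predicted = max.col(p_disease), actual = test$disease)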
library(rpart)
library(rpart.plot)
heart.tree <- rpart(formula, train ,method="class")
summary(heart.tree)
## Call:
## rpart(formula = formula, data = train, method = "class")
## n= 161
##
## CP nsplit rel error xerror xstd
## 1 0.52054795 0 1.0000000 1.0000000 0.08653005
## 2 0.04109589 1 0.4794521 0.6164384 0.07800087
## 3 0.01369863 3 0.3972603 0.4931507 0.07242199
## 4 0.01000000 6 0.3561644 0.5479452 0.07510812
##
## Variable importance
## chest_pain_type thal
## 35 29
## angina resting_electrocardiographic_results
## 22 9
## sex sugar
## 3 3
##
## Node number 1: 161 observations, complexity param=0.5205479
## predicted class=1 expected loss=0.4534161 P(node) =1
## class counts: 88 73
## probabilities: 0.547 0.453
## left son=2 (83 obs) right son=3 (78 obs)
## Primary splits:
## chest_pain_type splits as LLLR, improve=25.479340, (0 missing)
## thal splits as LRR, improve=25.109920, (0 missing)
## angina splits as LR, improve=20.922100, (0 missing)
## sex splits as LR, improve= 4.361963, (0 missing)
## resting_electrocardiographic_results splits as LRR, improve= 3.039587, (0 missing)
## Surrogate splits:
## angina splits as LR, agree=0.739, adj=0.462, (0 split)
## thal splits as LRR, agree=0.665, adj=0.308, (0 split)
## resting_electrocardiographic_results splits as LRR, agree=0.578, adj=0.128, (0 split)
## sugar splits as RL, agree=0.540, adj=0.051, (0 split)
##
## Node number 2: 83 observations, complexity param=0.01369863
## predicted class=1 expected loss=0.1807229 P(node) =0.515528
## class counts: 68 15
## probabilities: 0.819 0.181
## left son=4 (59 obs) right son=5 (24 obs)
## Primary splits:
## thal splits as LRR, improve=6.88339800, (0 missing)
## sex splits as LR, improve=1.65238700, (0 missing)
## angina splits as LR, improve=0.84851530, (0 missing)
## chest_pain_type splits as RLR-, improve=0.45753400, (0 missing)
## resting_electrocardiographic_results splits as L-R, improve=0.02393973, (0 missing)
## Surrogate splits:
## angina splits as LR, agree=0.723, adj=0.042, (0 split)
##
## Node number 3: 78 observations, complexity param=0.04109589
## predicted class=2 expected loss=0.2564103 P(node) =0.484472
## class counts: 20 58
## probabilities: 0.256 0.744
## left son=6 (36 obs) right son=7 (42 obs)
## Primary splits:
## thal splits as LLR, improve=6.2277170, (0 missing)
## angina splits as LR, improve=5.3235490, (0 missing)
## sex splits as LR, improve=2.0756060, (0 missing)
## resting_electrocardiographic_results splits as LRR, improve=1.5261980, (0 missing)
## sugar splits as LR, improve=0.1983181, (0 missing)
## Surrogate splits:
## sex splits as LR, agree=0.679, adj=0.306, (0 split)
## angina splits as LR, agree=0.654, adj=0.250, (0 split)
## resting_electrocardiographic_results splits as RLL, agree=0.564, adj=0.056, (0 split)
##
## Node number 4: 59 observations
## predicted class=1 expected loss=0.05084746 P(node) =0.3664596
## class counts: 56 3
## probabilities: 0.949 0.051
##
## Node number 5: 24 observations, complexity param=0.01369863
## predicted class=1 expected loss=0.5 P(node) =0.1490683
## class counts: 12 12
## probabilities: 0.500 0.500
## left son=10 (8 obs) right son=11 (16 obs)
## Primary splits:
## sugar splits as RL, improve=0.3750000, (0 missing)
## resting_electrocardiographic_results splits as L-R, improve=0.3428571, (0 missing)
## Surrogate splits:
## thal splits as -LR, agree=0.708, adj=0.125, (0 split)
##
## Node number 6: 36 observations, complexity param=0.04109589
## predicted class=2 expected loss=0.4722222 P(node) =0.2236025
## class counts: 17 19
## probabilities: 0.472 0.528
## left son=12 (20 obs) right son=13 (16 obs)
## Primary splits:
## angina splits as LR, improve=2.8444440, (0 missing)
## resting_electrocardiographic_results splits as LRR, improve=2.7777780, (0 missing)
## sex splits as LR, improve=0.8670451, (0 missing)
## Surrogate splits:
## sex splits as LR, agree=0.583, adj=0.063, (0 split)
## sugar splits as LR, agree=0.583, adj=0.063, (0 split)
## resting_electrocardiographic_results splits as LRL, agree=0.583, adj=0.063, (0 split)
##
## Node number 7: 42 observations
## predicted class=2 expected loss=0.07142857 P(node) =0.2608696
## class counts: 3 39
## probabilities: 0.071 0.929
##
## Node number 10: 8 observations
## predicted class=1 expected loss=0.375 P(node) =0.04968944
## class counts: 5 3
## probabilities: 0.625 0.375
##
## Node number 11: 16 observations
## predicted class=2 expected loss=0.4375 P(node) =0.09937888
## class counts: 7 9
## probabilities: 0.438 0.562
##
## Node number 12: 20 observations, complexity param=0.01369863
## predicted class=1 expected loss=0.35 P(node) =0.1242236
## class counts: 13 7
## probabilities: 0.650 0.350
## left son=24 (7 obs) right son=25 (13 obs)
## Primary splits:
## resting_electrocardiographic_results splits as L-R, improve=2.6384620, (0 missing)
## sex splits as LR, improve=0.2919192, (0 missing)
##
## Node number 13: 16 observations
## predicted class=2 expected loss=0.25 P(node) =0.09937888
## class counts: 4 12
## probabilities: 0.250 0.750
##
## Node number 24: 7 observations
## predicted class=1 expected loss=0 P(node) =0.04347826
## class counts: 7 0
## probabilities: 1.000 0.000
##
## Node number 25: 13 observations
## predicted class=2 expected loss=0.4615385 P(node) =0.08074534
## class counts: 6 7
## probabilities: 0.462 0.538
rpart.plot(heart.tree, uniform=TRUE)
printcp(heart.tree)
##
## Classification tree:
## rpart(formula = formula, data = train, method = "class")
##
## Variables actually used in tree construction:
## [1] angina
## [2] chest_pain_type
## [3] resting_electrocardiographic_results
## [4] sugar
## [5] thal
##
## Root node error: 73/161 = 0.45342
##
## n= 161
##
## CP nsplit rel error xerror xstd
## 1 0.520548 0 1.00000 1.00000 0.086530
## 2 0.041096 1 0.47945 0.61644 0.078001
## 3 0.013699 3 0.39726 0.49315 0.072422
## 4 0.010000 6 0.35616 0.54795 0.075108
opt <- which.min(heart.tree$cptable[,'xerror'])
cp<-heart.tree$cptable[opt,"CP"]
#pruned<-prune(heart.tree,cp)
plot(heart.tree)
text(heart.tree)
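The prune() call above is left commented out; actually pruning at the selected cp value and plotting the result would look like this sketch (output not part of the original document):
# prune at the cp value with the lowest cross-validated error (xerror)
pruned.rpart <- prune(heart.tree, cp = cp)
rpart.plot(pruned.rpart)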
library (randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
rforest<-randomForest(formula, train, ntree=50)
p_disease <- predict(rforest, test)
# proportion of correctly classified observations
accuracy <- mean(p_disease == test$disease)
misclassification_rate <- 1 - accuracy
accuracy
## [1] 0.7777778
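As a follow-up, the forest's variable importance can be examined (a sketch; the values depend on the random split and seed, so no output is shown here):
importance(rforest)    # mean decrease in Gini per predictor
varImpPlot(rforest)    # graphical summary of the same information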