# Switch the session to the folder that holds the dataset, then load the CSV
# from that folder into a data frame.
setwd('F:/Machine Learning/Data Science/Machine Learning/DTree')
data <- read.csv(file = 'Cardiotocographic.csv')
# Preview the first six rows.
head(data, n = 6L)
## LB AC FM UC DL DS DP ASTV MSTV ALTV MLTV Width Min Max Nmax
## 1 120 0.00 0 0.00 0.00 0 0 73 0.5 43 2.4 64 62 126 2
## 2 132 0.01 0 0.01 0.00 0 0 17 2.1 0 10.4 130 68 198 6
## 3 133 0.00 0 0.01 0.00 0 0 16 2.1 0 13.4 130 68 198 5
## 4 134 0.00 0 0.01 0.00 0 0 16 2.4 0 23.0 117 53 170 11
## 5 132 0.01 0 0.01 0.00 0 0 16 2.4 0 19.9 117 53 170 9
## 6 134 0.00 0 0.01 0.01 0 0 26 5.9 0 0.0 150 50 200 5
## Nzeros Mode Mean Median Variance Tendency NSP
## 1 0 120 137 121 73 1 2
## 2 1 141 136 140 12 0 1
## 3 1 141 135 138 13 0 1
## 4 0 137 134 137 13 1 1
## 5 0 137 136 138 11 1 1
## 6 3 76 107 107 170 0 3
#show the type and leading values of every column in the data frame
str(data)
## 'data.frame': 2126 obs. of 22 variables:
## $ LB : num 120 132 133 134 132 134 134 122 122 122 ...
## $ AC : num 0 0.01 0 0 0.01 0 0 0 0 0 ...
## $ FM : num 0 0 0 0 0 0 0 0 0 0 ...
## $ UC : num 0 0.01 0.01 0.01 0.01 0.01 0.01 0 0 0 ...
## $ DL : num 0 0 0 0 0 0.01 0.01 0 0 0 ...
## $ DS : num 0 0 0 0 0 0 0 0 0 0 ...
## $ DP : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ASTV : num 73 17 16 16 16 26 29 83 84 86 ...
## $ MSTV : num 0.5 2.1 2.1 2.4 2.4 5.9 6.3 0.5 0.5 0.3 ...
## $ ALTV : num 43 0 0 0 0 0 0 6 5 6 ...
## $ MLTV : num 2.4 10.4 13.4 23 19.9 0 0 15.6 13.6 10.6 ...
## $ Width : num 64 130 130 117 117 150 150 68 68 68 ...
## $ Min : num 62 68 68 53 53 50 50 62 62 62 ...
## $ Max : num 126 198 198 170 170 200 200 130 130 130 ...
## $ Nmax : num 2 6 5 11 9 5 6 0 0 1 ...
## $ Nzeros : num 0 1 1 0 0 3 3 0 0 0 ...
## $ Mode : num 120 141 141 137 137 76 71 122 122 122 ...
## $ Mean : num 137 136 135 134 136 107 107 122 122 122 ...
## $ Median : num 121 140 138 137 138 107 106 123 123 123 ...
## $ Variance: num 73 12 13 13 11 170 215 3 3 1 ...
## $ Tendency: num 1 0 0 1 1 0 0 1 1 1 ...
## $ NSP : num 2 1 1 1 1 3 3 3 3 3 ...
# Add NSPF, a factor copy of the numeric NSP column, to serve as the
# classification response; NSP itself is left in place.
data[["NSPF"]] <- factor(data[["NSP"]])
# Re-inspect the structure to confirm the new column.
str(data)
## 'data.frame': 2126 obs. of 23 variables:
## $ LB : num 120 132 133 134 132 134 134 122 122 122 ...
## $ AC : num 0 0.01 0 0 0.01 0 0 0 0 0 ...
## $ FM : num 0 0 0 0 0 0 0 0 0 0 ...
## $ UC : num 0 0.01 0.01 0.01 0.01 0.01 0.01 0 0 0 ...
## $ DL : num 0 0 0 0 0 0.01 0.01 0 0 0 ...
## $ DS : num 0 0 0 0 0 0 0 0 0 0 ...
## $ DP : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ASTV : num 73 17 16 16 16 26 29 83 84 86 ...
## $ MSTV : num 0.5 2.1 2.1 2.4 2.4 5.9 6.3 0.5 0.5 0.3 ...
## $ ALTV : num 43 0 0 0 0 0 0 6 5 6 ...
## $ MLTV : num 2.4 10.4 13.4 23 19.9 0 0 15.6 13.6 10.6 ...
## $ Width : num 64 130 130 117 117 150 150 68 68 68 ...
## $ Min : num 62 68 68 53 53 50 50 62 62 62 ...
## $ Max : num 126 198 198 170 170 200 200 130 130 130 ...
## $ Nmax : num 2 6 5 11 9 5 6 0 0 1 ...
## $ Nzeros : num 0 1 1 0 0 3 3 0 0 0 ...
## $ Mode : num 120 141 141 137 137 76 71 122 122 122 ...
## $ Mean : num 137 136 135 134 136 107 107 122 122 122 ...
## $ Median : num 121 140 138 137 138 107 106 123 123 123 ...
## $ Variance: num 73 12 13 13 11 170 215 3 3 1 ...
## $ Tendency: num 1 0 0 1 1 0 0 1 1 1 ...
## $ NSP : num 2 1 1 1 1 3 3 3 3 3 ...
## $ NSPF : Factor w/ 3 levels "1","2","3": 2 1 1 1 1 3 3 3 3 3 ...
#Data Partition with random seed
# Fix the RNG state so the split below is reproducible.
set.seed(2498)
# Draw a label (1 or 2) for every row with probabilities 0.8 / 0.2, so the
# split is only approximately 80/20. `replace` is a logical flag: pass TRUE
# explicitly instead of relying on the number 2 being coerced to TRUE.
pd <- sample(2, nrow(data), replace = TRUE, prob = c(0.8, 0.2))
# Rows labelled 1 form the training set, rows labelled 2 the test set.
train <- data[pd == 1, ]
test <- data[pd == 2, ]
dim(train)
## [1] 1696 23
dim(test)
## [1] 430 23
#Decision Tree with 'party' package
library(party)
## Warning: package 'party' was built under R version 3.5.3
## Loading required package: grid
## Loading required package: mvtnorm
## Warning: package 'mvtnorm' was built under R version 3.5.2
## Loading required package: modeltools
## Warning: package 'modeltools' was built under R version 3.5.2
## Loading required package: stats4
## Loading required package: strucchange
## Warning: package 'strucchange' was built under R version 3.5.3
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 3.5.2
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 3.5.3
# Fit a conditional inference tree for NSPF on the training rows, using only
# the three predictors LB, AC and FM.
tree <- ctree(formula = NSPF ~ LB + AC + FM, data = train)
# Print the fitted tree.
tree
##
## Conditional inference tree with 8 terminal nodes
##
## Response: NSPF
## Inputs: LB, AC, FM
## Number of observations: 1696
##
## 1) LB <= 136; criterion = 1, statistic = 202.856
## 2) AC <= 0; criterion = 1, statistic = 77.569
## 3) FM <= 0; criterion = 1, statistic = 22.958
## 4)* weights = 613
## 3) FM > 0
## 5) FM <= 0.19; criterion = 1, statistic = 24.958
## 6) LB <= 124; criterion = 0.999, statistic = 15.433
## 7)* weights = 31
## 6) LB > 124
## 8)* weights = 86
## 5) FM > 0.19
## 9)* weights = 14
## 2) AC > 0
## 10)* weights = 318
## 1) LB > 136
## 11) AC <= 0; criterion = 1, statistic = 99.327
## 12) LB <= 143; criterion = 1, statistic = 23.034
## 13)* weights = 251
## 12) LB > 143
## 14)* weights = 230
## 11) AC > 0
## 15)* weights = 153
#plot
# cex = 0.5 is forwarded to the tree's plot method.
plot(tree, cex = 0.5)
#predict train data
#predict probabilities
# One vector of class probabilities per training row.
train.prob <- predict(tree, newdata = train, type = "prob")
head(train.prob, n = 6L)
## [[1]]
## [1] 1 0 0
##
## [[2]]
## [1] 0.80750408 0.03588907 0.15660685
##
## [[3]]
## [1] 0.80750408 0.03588907 0.15660685
##
## [[4]]
## [1] 0.80750408 0.03588907 0.15660685
##
## [[5]]
## [1] 0.80750408 0.03588907 0.15660685
##
## [[6]]
## [1] 0.80750408 0.03588907 0.15660685
#predict classes
# Predicted class label for every training row ("response" is the default
# prediction type, spelled out here).
train.class <- predict(tree, newdata = train, type = "response")
head(train.class, n = 6L)
## [1] 1 1 1 1 1 1
## Levels: 1 2 3
#Predict test data
#predict probabilities
# One vector of class probabilities per held-out test row.
test.prob <- predict(tree, newdata = test, type = "prob")
head(test.prob, n = 6L)
## [[1]]
## [1] 0.80750408 0.03588907 0.15660685
##
## [[2]]
## [1] 1 0 0
##
## [[3]]
## [1] 0.80750408 0.03588907 0.15660685
##
## [[4]]
## [1] 1 0 0
##
## [[5]]
## [1] 1 0 0
##
## [[6]]
## [1] 0.2142857 0.2142857 0.5714286
#predict classes
# Predicted class label for every test row ("response" is the default
# prediction type, spelled out here).
test.class <- predict(tree, newdata = test, type = "response")
head(test.class, n = 6L)
## [1] 1 1 1 1 1 3
## Levels: 1 2 3
library(caret)
## Warning: package 'caret' was built under R version 3.5.2
## Loading required package: lattice
## Loading required package: ggplot2
#accuracy
# confusionMatrix() takes the predicted classes first (`data`) and the true
# classes second (`reference`); both are named here so the roles are explicit.
# NOTE(review): the statistics recorded below this call were produced with the
# two arguments in the opposite order, so that table is transposed and its
# per-class figures (sensitivity, specificity, prevalence, ...) must be
# regenerated. Overall accuracy is unaffected by the swap.
confusionMatrix(data = test.class, reference = test$NSPF)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3
## 1 306 32 1
## 2 28 26 0
## 3 29 4 4
##
## Overall Statistics
##
## Accuracy : 0.7814
## 95% CI : (0.7393, 0.8196)
## No Information Rate : 0.8442
## P-Value [Acc > NIR] : 0.9998
##
## Kappa : 0.3068
## Mcnemar's Test P-Value : 1.137e-06
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3
## Sensitivity 0.8430 0.41935 0.800000
## Specificity 0.5075 0.92391 0.922353
## Pos Pred Value 0.9027 0.48148 0.108108
## Neg Pred Value 0.3736 0.90426 0.997455
## Prevalence 0.8442 0.14419 0.011628
## Detection Rate 0.7116 0.06047 0.009302
## Detection Prevalence 0.7884 0.12558 0.086047
## Balanced Accuracy 0.6752 0.67163 0.861176
#misclassification error of train data
# Cross-tabulate true classes (rows) against predicted classes (columns) once,
# then print the stored table.
tab <- table(train[["NSPF"]], train.class)
tab
## train.class
## 1 2 3
## 1 1219 94 3
## 2 125 113 3
## 3 108 23 8
# Error rate = 1 - (correctly classified on the diagonal) / (all observations).
1 - sum(diag(tab)) / sum(tab)
## [1] 0.2099057
#misclassification error with test data
# Cross-tabulate true classes (rows) against predicted classes (columns) once,
# then print the stored table.
tab1 <- table(test[["NSPF"]], test.class)
tab1
## test.class
## 1 2 3
## 1 306 32 1
## 2 28 26 0
## 3 29 4 4
# Error rate = 1 - (correctly classified on the diagonal) / (all observations).
1 - sum(diag(tab1)) / sum(tab1)
## [1] 0.2186047
The misclassification error is about 21.0% on the training data and, similarly, about 21.9% on the test data.