Decision tree

setwd('F:/Machine Learning/Data Science/Machine Learning/DTree')

data <- read.csv('Cardiotocographic.csv')
head(data)
##    LB   AC FM   UC   DL DS DP ASTV MSTV ALTV MLTV Width Min Max Nmax
## 1 120 0.00  0 0.00 0.00  0  0   73  0.5   43  2.4    64  62 126    2
## 2 132 0.01  0 0.01 0.00  0  0   17  2.1    0 10.4   130  68 198    6
## 3 133 0.00  0 0.01 0.00  0  0   16  2.1    0 13.4   130  68 198    5
## 4 134 0.00  0 0.01 0.00  0  0   16  2.4    0 23.0   117  53 170   11
## 5 132 0.01  0 0.01 0.00  0  0   16  2.4    0 19.9   117  53 170    9
## 6 134 0.00  0 0.01 0.01  0  0   26  5.9    0  0.0   150  50 200    5
##   Nzeros Mode Mean Median Variance Tendency NSP
## 1      0  120  137    121       73        1   2
## 2      1  141  136    140       12        0   1
## 3      1  141  135    138       13        0   1
## 4      0  137  134    137       13        1   1
## 5      0  137  136    138       11        1   1
## 6      3   76  107    107      170        0   3
str(data)
## 'data.frame':    2126 obs. of  22 variables:
##  $ LB      : num  120 132 133 134 132 134 134 122 122 122 ...
##  $ AC      : num  0 0.01 0 0 0.01 0 0 0 0 0 ...
##  $ FM      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ UC      : num  0 0.01 0.01 0.01 0.01 0.01 0.01 0 0 0 ...
##  $ DL      : num  0 0 0 0 0 0.01 0.01 0 0 0 ...
##  $ DS      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ DP      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ASTV    : num  73 17 16 16 16 26 29 83 84 86 ...
##  $ MSTV    : num  0.5 2.1 2.1 2.4 2.4 5.9 6.3 0.5 0.5 0.3 ...
##  $ ALTV    : num  43 0 0 0 0 0 0 6 5 6 ...
##  $ MLTV    : num  2.4 10.4 13.4 23 19.9 0 0 15.6 13.6 10.6 ...
##  $ Width   : num  64 130 130 117 117 150 150 68 68 68 ...
##  $ Min     : num  62 68 68 53 53 50 50 62 62 62 ...
##  $ Max     : num  126 198 198 170 170 200 200 130 130 130 ...
##  $ Nmax    : num  2 6 5 11 9 5 6 0 0 1 ...
##  $ Nzeros  : num  0 1 1 0 0 3 3 0 0 0 ...
##  $ Mode    : num  120 141 141 137 137 76 71 122 122 122 ...
##  $ Mean    : num  137 136 135 134 136 107 107 122 122 122 ...
##  $ Median  : num  121 140 138 137 138 107 106 123 123 123 ...
##  $ Variance: num  73 12 13 13 11 170 215 3 3 1 ...
##  $ Tendency: num  1 0 0 1 1 0 0 1 1 1 ...
##  $ NSP     : num  2 1 1 1 1 3 3 3 3 3 ...
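
Before modelling, a quick sanity check (not part of the original script) can confirm the file loaded cleanly; colSums(is.na()) and summary() are base R calls. A minimal sketch:

#check for missing values and look at the response coding
colSums(is.na(data))   # count of NAs per column
summary(data$NSP)      # NSP is numeric; commonly coded 1 = normal, 2 = suspect, 3 = pathologic in this dataset
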
#create new factor response variable

data$NSPF <- as.factor(data$NSP)

str(data)
## 'data.frame':    2126 obs. of  23 variables:
##  $ LB      : num  120 132 133 134 132 134 134 122 122 122 ...
##  $ AC      : num  0 0.01 0 0 0.01 0 0 0 0 0 ...
##  $ FM      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ UC      : num  0 0.01 0.01 0.01 0.01 0.01 0.01 0 0 0 ...
##  $ DL      : num  0 0 0 0 0 0.01 0.01 0 0 0 ...
##  $ DS      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ DP      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ASTV    : num  73 17 16 16 16 26 29 83 84 86 ...
##  $ MSTV    : num  0.5 2.1 2.1 2.4 2.4 5.9 6.3 0.5 0.5 0.3 ...
##  $ ALTV    : num  43 0 0 0 0 0 0 6 5 6 ...
##  $ MLTV    : num  2.4 10.4 13.4 23 19.9 0 0 15.6 13.6 10.6 ...
##  $ Width   : num  64 130 130 117 117 150 150 68 68 68 ...
##  $ Min     : num  62 68 68 53 53 50 50 62 62 62 ...
##  $ Max     : num  126 198 198 170 170 200 200 130 130 130 ...
##  $ Nmax    : num  2 6 5 11 9 5 6 0 0 1 ...
##  $ Nzeros  : num  0 1 1 0 0 3 3 0 0 0 ...
##  $ Mode    : num  120 141 141 137 137 76 71 122 122 122 ...
##  $ Mean    : num  137 136 135 134 136 107 107 122 122 122 ...
##  $ Median  : num  121 140 138 137 138 107 106 123 123 123 ...
##  $ Variance: num  73 12 13 13 11 170 215 3 3 1 ...
##  $ Tendency: num  1 0 0 1 1 0 0 1 1 1 ...
##  $ NSP     : num  2 1 1 1 1 3 3 3 3 3 ...
##  $ NSPF    : Factor w/ 3 levels "1","2","3": 2 1 1 1 1 3 3 3 3 3 ...
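
An optional check (not in the original script) on how the three classes are distributed; the confusion matrices later show class 1 dominating, which this makes visible up front:

#class distribution of the new factor response
table(data$NSPF)
prop.table(table(data$NSPF))
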
#Data partition: random 80/20 train/test split with a fixed seed

set.seed(2498)

pd <- sample(2, nrow(data), replace = TRUE, prob = c(.8,.2))

train <- data[pd==1, ]
test <- data[pd==2, ]

dim(train)
## [1] 1696   23
dim(test)
## [1] 430  23
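
The split above labels rows 1 or 2 at random without regard to class balance. As a sketch of a stratified alternative, caret (loaded later in this walkthrough for the confusion matrix) provides createDataPartition(), which preserves the NSPF proportions in both sets; train.s and test.s are names introduced here for illustration:

#stratified 80/20 split on the factor response
library(caret)
set.seed(2498)
idx <- createDataPartition(data$NSPF, p = 0.8, list = FALSE)
train.s <- data[idx, ]
test.s  <- data[-idx, ]
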
#Decision Tree with 'party' package

library(party)
## Warning: package 'party' was built under R version 3.5.3
## Loading required package: grid
## Loading required package: mvtnorm
## Warning: package 'mvtnorm' was built under R version 3.5.2
## Loading required package: modeltools
## Warning: package 'modeltools' was built under R version 3.5.2
## Loading required package: stats4
## Loading required package: strucchange
## Warning: package 'strucchange' was built under R version 3.5.3
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 3.5.2
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 3.5.3
# Decision tree with 3 features only

tree <- ctree(NSPF ~ LB+AC+FM, data=train)

tree
## 
##   Conditional inference tree with 8 terminal nodes
## 
## Response:  NSPF 
## Inputs:  LB, AC, FM 
## Number of observations:  1696 
## 
## 1) LB <= 136; criterion = 1, statistic = 202.856
##   2) AC <= 0; criterion = 1, statistic = 77.569
##     3) FM <= 0; criterion = 1, statistic = 22.958
##       4)*  weights = 613 
##     3) FM > 0
##       5) FM <= 0.19; criterion = 1, statistic = 24.958
##         6) LB <= 124; criterion = 0.999, statistic = 15.433
##           7)*  weights = 31 
##         6) LB > 124
##           8)*  weights = 86 
##       5) FM > 0.19
##         9)*  weights = 14 
##   2) AC > 0
##     10)*  weights = 318 
## 1) LB > 136
##   11) AC <= 0; criterion = 1, statistic = 99.327
##     12) LB <= 143; criterion = 1, statistic = 23.034
##       13)*  weights = 251 
##     12) LB > 143
##       14)*  weights = 230 
##   11) AC > 0
##     15)*  weights = 153
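
The model above deliberately restricts itself to three features. A tree on all predictors is fitted the same way; the sketch below (train.full and tree.full are names introduced here) drops the raw numeric NSP column so the factor response is not predicted from its own numeric copy:

#fit on every predictor except the numeric NSP
train.full <- train[, !(names(train) %in% "NSP")]
tree.full <- ctree(NSPF ~ ., data = train.full)
tree.full
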
#plot

plot(tree, cex=.5)
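
For a tree with this many terminal nodes the default (extended) plot can be crowded; party's plot method also offers a compact display (a sketch on the same fitted tree):

#compact node display instead of the default terminal-node barplots
plot(tree, type = "simple")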

#predict train data

#predict probabilities
train.prob <- predict(tree, train, type="prob")

head(train.prob)
## [[1]]
## [1] 1 0 0
## 
## [[2]]
## [1] 0.80750408 0.03588907 0.15660685
## 
## [[3]]
## [1] 0.80750408 0.03588907 0.15660685
## 
## [[4]]
## [1] 0.80750408 0.03588907 0.15660685
## 
## [[5]]
## [1] 0.80750408 0.03588907 0.15660685
## 
## [[6]]
## [1] 0.80750408 0.03588907 0.15660685
#predict classes
train.class <- predict(tree,train)

head(train.class)
## [1] 1 1 1 1 1 1
## Levels: 1 2 3
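
predict(..., type = "prob") on a party tree returns a list with one probability vector per row, as seen above. A small sketch (prob.mat is a name introduced here) binds them into a matrix for easier inspection:

#stack the per-row probability vectors into an n x 3 matrix, one column per class
prob.mat <- do.call(rbind, train.prob)
colnames(prob.mat) <- levels(train$NSPF)
head(prob.mat)
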
#Predict test data

#predict probabilities
test.prob <- predict(tree, test, type="prob")

head(test.prob)
## [[1]]
## [1] 0.80750408 0.03588907 0.15660685
## 
## [[2]]
## [1] 1 0 0
## 
## [[3]]
## [1] 0.80750408 0.03588907 0.15660685
## 
## [[4]]
## [1] 1 0 0
## 
## [[5]]
## [1] 1 0 0
## 
## [[6]]
## [1] 0.2142857 0.2142857 0.5714286
#predict classes
test.class <- predict(tree, test)

head(test.class)
## [1] 1 1 1 1 1 3
## Levels: 1 2 3
library(caret)
## Warning: package 'caret' was built under R version 3.5.2
## Loading required package: lattice
## Loading required package: ggplot2
#accuracy

confusionMatrix(test$NSPF, test.class)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   1   2   3
##          1 306  32   1
##          2  28  26   0
##          3  29   4   4
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7814          
##                  95% CI : (0.7393, 0.8196)
##     No Information Rate : 0.8442          
##     P-Value [Acc > NIR] : 0.9998          
##                                           
##                   Kappa : 0.3068          
##  Mcnemar's Test P-Value : 1.137e-06       
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3
## Sensitivity            0.8430  0.41935 0.800000
## Specificity            0.5075  0.92391 0.922353
## Pos Pred Value         0.9027  0.48148 0.108108
## Neg Pred Value         0.3736  0.90426 0.997455
## Prevalence             0.8442  0.14419 0.011628
## Detection Rate         0.7116  0.06047 0.009302
## Detection Prevalence   0.7884  0.12558 0.086047
## Balanced Accuracy      0.6752  0.67163 0.861176
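
Note that caret's confusionMatrix() takes the predicted classes as its first argument (data) and the true classes as the second (reference); the call above passes them the other way round, so the "Prediction" rows are actually the observed classes, and the per-class "Sensitivity" is really the precision (and "Pos Pred Value" the recall), although the overall accuracy is unaffected. A sketch of the conventional call (output not shown):

#predictions first, true labels second
confusionMatrix(data = test.class, reference = test$NSPF)
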
#misclassification error of train data

table(train$NSPF, train.class)
##    train.class
##        1    2    3
##   1 1219   94    3
##   2  125  113    3
##   3  108   23    8
tab <- table(train$NSPF, train.class)

1-sum(diag(tab))/sum(tab)
## [1] 0.2099057
#misclassification error with test data

table(test$NSPF, test.class)
##    test.class
##       1   2   3
##   1 306  32   1
##   2  28  26   0
##   3  29   4   4
tab1 <- table(test$NSPF, test.class)

1-sum(diag(tab1))/sum(tab1)
## [1] 0.2186047
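
The same two-step computation is repeated for the train and test sets; a small helper (misclass_rate is a name introduced here, not from the original script) keeps it in one place:

#misclassification rate = 1 - accuracy from the confusion table
misclass_rate <- function(actual, predicted) {
  tab <- table(actual, predicted)
  1 - sum(diag(tab)) / sum(tab)
}

misclass_rate(train$NSPF, train.class)  # ~0.210
misclass_rate(test$NSPF, test.class)    # ~0.219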

The misclassification error is about 21.0% on the training data and about 21.9% on the test data. The two rates are close, so the tree is not badly overfitting, but the confusion tables show that most of its errors come from predicting observations of classes 2 and 3 as class 1.