# D-TREE: decision-tree classification on the Messidor features data set.
# install.packages("RWeka")
library("RWeka")
## Warning: package 'RWeka' was built under R version 4.2.2
# Read the ARFF file into a data frame (read.arff is provided by RWeka).
# NOTE(review): absolute, machine-specific path -- adjust for your environment.
data2 <- read.arff("C:/R pubs/messidor_features.arff")
# Keep columns 2 through 20 only, i.e. drop the first column of the raw file.
data2 <- data2[2:20]
# View() opens a GUI data viewer; only call it in an interactive session so
# the script also runs unattended (Rscript / knitr) without a display.
if (interactive()) {
  View(data2)
}
# Creating Train and Test Set
library(caTools)
##
## Attaching package: 'caTools'
## The following object is masked from 'package:RWeka':
##
## LogitBoost
# Fix the RNG seed so the random partition below is reproducible.
set.seed(123)
# Row mask over data2 built from the Class column: TRUE rows go to the
# training set (SplitRatio = 2/3), FALSE rows to the test set.
split <- sample.split(Y = data2$Class, SplitRatio = 2 / 3)
train_set <- data2[split, , drop = FALSE]
test_set <- data2[!split, , drop = FALSE]
# Report rows x columns of each partition.
dim(train_set)
## [1] 767 19
dim(test_set)
## [1] 384 19
# Building a Model
library(rpart)
# Fit a classification tree (method = "class") predicting Class from all
# remaining columns of the training set.
fit <- rpart(formula = Class ~ ., data = train_set, method = "class")
# Print the detailed fit report: CP table, variable importance and per-node
# splits (output reproduced below).
summary(fit)
## Call:
## rpart(formula = Class ~ ., data = train_set, method = "class")
## n= 767
##
## CP nsplit rel error xerror xstd
## 1 0.14722222 0 1.0000000 1.0000000 0.03839267
## 2 0.10000000 1 0.8527778 1.0166667 0.03842488
## 3 0.04166667 2 0.7527778 0.8277778 0.03749681
## 4 0.01666667 3 0.7111111 0.8361111 0.03756436
## 5 0.01527778 4 0.6944444 0.8611111 0.03775190
## 6 0.01250000 7 0.6388889 0.8638889 0.03777134
## 7 0.01166667 9 0.6138889 0.8722222 0.03782803
## 8 0.01000000 14 0.5555556 0.8833333 0.03789977
##
## Variable importance
## 2 3 4 5 6 10 7 8 14 15 9 11 13 12 16 17 1
## 13 12 11 9 8 6 6 6 6 5 4 3 3 3 2 1 1
##
## Node number 1: 767 observations, complexity param=0.1472222
## predicted class=1 expected loss=0.4693611 P(node) =1
## class counts: 360 407
## probabilities: 0.469 0.531
## left son=2 (587 obs) right son=3 (180 obs)
## Primary splits:
## 2 < 55.5 to the left, improve=28.73043, (0 missing)
## 14 < 0.0203045 to the left, improve=28.64074, (0 missing)
## 15 < 0.008936 to the left, improve=26.01861, (0 missing)
## 3 < 53.5 to the left, improve=24.25770, (0 missing)
## 4 < 54.5 to the left, improve=22.01306, (0 missing)
## Surrogate splits:
## 3 < 53.5 to the left, agree=0.983, adj=0.928, (0 split)
## 4 < 53.5 to the left, agree=0.969, adj=0.867, (0 split)
## 5 < 50.5 to the left, agree=0.937, adj=0.733, (0 split)
## 6 < 45.5 to the left, agree=0.917, adj=0.644, (0 split)
## 7 < 35.5 to the left, agree=0.880, adj=0.489, (0 split)
##
## Node number 2: 587 observations, complexity param=0.1
## predicted class=0 expected loss=0.4548552 P(node) =0.7653194
## class counts: 320 267
## probabilities: 0.545 0.455
## left son=4 (533 obs) right son=5 (54 obs)
## Primary splits:
## 14 < 0.061455 to the left, improve=17.037910, (0 missing)
## 15 < 0.0205125 to the left, improve=13.610980, (0 missing)
## 13 < 0.0623935 to the left, improve= 9.624010, (0 missing)
## 8 < 130.8606 to the left, improve= 9.083049, (0 missing)
## 12 < 0.429422 to the left, improve= 7.123798, (0 missing)
## Surrogate splits:
## 15 < 0.0331355 to the left, agree=0.981, adj=0.796, (0 split)
## 13 < 0.1667775 to the left, agree=0.956, adj=0.519, (0 split)
## 12 < 1.636641 to the left, agree=0.925, adj=0.185, (0 split)
##
## Node number 3: 180 observations
## predicted class=1 expected loss=0.2222222 P(node) =0.2346806
## class counts: 40 140
## probabilities: 0.222 0.778
##
## Node number 4: 533 observations, complexity param=0.04166667
## predicted class=0 expected loss=0.4165103 P(node) =0.6949153
## class counts: 311 222
## probabilities: 0.583 0.417
## left son=8 (458 obs) right son=9 (75 obs)
## Primary splits:
## 8 < 130.8606 to the left, improve=5.877279, (0 missing)
## 7 < 29.5 to the right, improve=3.534037, (0 missing)
## 17 < 0.0810385 to the right, improve=3.339984, (0 missing)
## 5 < 44.5 to the right, improve=3.244881, (0 missing)
## 9 < 5.291094 to the right, improve=2.997497, (0 missing)
## Surrogate splits:
## 9 < 65.59749 to the left, agree=0.893, adj=0.240, (0 split)
## 10 < 31.28616 to the left, agree=0.891, adj=0.227, (0 split)
## 11 < 4.126161 to the left, agree=0.874, adj=0.107, (0 split)
## 12 < 2.083382 to the left, agree=0.861, adj=0.013, (0 split)
## 17 < 0.1566735 to the left, agree=0.861, adj=0.013, (0 split)
##
## Node number 5: 54 observations
## predicted class=1 expected loss=0.1666667 P(node) =0.07040417
## class counts: 9 45
## probabilities: 0.167 0.833
##
## Node number 8: 458 observations, complexity param=0.01527778
## predicted class=0 expected loss=0.3864629 P(node) =0.5971317
## class counts: 281 177
## probabilities: 0.614 0.386
## left son=16 (315 obs) right son=17 (143 obs)
## Primary splits:
## 10 < 1.212989 to the right, improve=5.695636, (0 missing)
## 11 < 0.0401665 to the right, improve=5.522135, (0 missing)
## 8 < 27.31944 to the right, improve=4.944502, (0 missing)
## 9 < 5.291094 to the right, improve=4.917847, (0 missing)
## 12 < 0.0074575 to the right, improve=4.407162, (0 missing)
## Surrogate splits:
## 8 < 22.50991 to the right, agree=0.900, adj=0.678, (0 split)
## 9 < 8.449127 to the right, agree=0.889, adj=0.643, (0 split)
## 11 < 0.066131 to the right, agree=0.876, adj=0.601, (0 split)
## 12 < 0.001019 to the right, agree=0.784, adj=0.308, (0 split)
## 17 < 0.139616 to the left, agree=0.703, adj=0.049, (0 split)
##
## Node number 9: 75 observations, complexity param=0.01666667
## predicted class=1 expected loss=0.4 P(node) =0.09778357
## class counts: 30 45
## probabilities: 0.400 0.600
## left son=18 (14 obs) right son=19 (61 obs)
## Primary splits:
## 2 < 6.5 to the left, improve=3.400468, (0 missing)
## 3 < 6.5 to the left, improve=3.400468, (0 missing)
## 4 < 6.5 to the left, improve=3.400468, (0 missing)
## 16 < 0.4944415 to the left, improve=3.368421, (0 missing)
## 5 < 4.5 to the left, improve=3.226891, (0 missing)
## Surrogate splits:
## 3 < 6.5 to the left, agree=1.000, adj=1.000, (0 split)
## 4 < 6.5 to the left, agree=1.000, adj=1.000, (0 split)
## 5 < 6.5 to the left, agree=0.973, adj=0.857, (0 split)
## 6 < 5.5 to the left, agree=0.960, adj=0.786, (0 split)
## 7 < 3.5 to the left, agree=0.880, adj=0.357, (0 split)
##
## Node number 16: 315 observations, complexity param=0.0125
## predicted class=0 expected loss=0.3333333 P(node) =0.410691
## class counts: 210 105
## probabilities: 0.667 0.333
## left son=32 (284 obs) right son=33 (31 obs)
## Primary splits:
## 1 < 0.5 to the right, improve=3.180373, (0 missing)
## 16 < 0.504126 to the left, improve=2.926817, (0 missing)
## 17 < 0.082538 to the right, improve=2.867893, (0 missing)
## 9 < 5.249262 to the right, improve=2.850163, (0 missing)
## 7 < 31.5 to the right, improve=2.258065, (0 missing)
## Surrogate splits:
## 8 < 14.32749 to the right, agree=0.914, adj=0.129, (0 split)
## 9 < 3.455908 to the right, agree=0.908, adj=0.065, (0 split)
##
## Node number 17: 143 observations, complexity param=0.01527778
## predicted class=1 expected loss=0.4965035 P(node) =0.1864407
## class counts: 71 72
## probabilities: 0.497 0.503
## left son=34 (122 obs) right son=35 (21 obs)
## Primary splits:
## 10 < 0.930042 to the left, improve=3.287292, (0 missing)
## 16 < 0.5225855 to the right, improve=2.951263, (0 missing)
## 17 < 0.123821 to the right, improve=2.696002, (0 missing)
## 2 < 5 to the left, improve=1.914571, (0 missing)
## 3 < 5 to the left, improve=1.914571, (0 missing)
## Surrogate splits:
## 8 < 37.07348 to the left, agree=0.895, adj=0.286, (0 split)
## 9 < 14.107 to the left, agree=0.860, adj=0.048, (0 split)
##
## Node number 18: 14 observations
## predicted class=0 expected loss=0.2857143 P(node) =0.01825293
## class counts: 10 4
## probabilities: 0.714 0.286
##
## Node number 19: 61 observations
## predicted class=1 expected loss=0.3278689 P(node) =0.07953064
## class counts: 20 41
## probabilities: 0.328 0.672
##
## Node number 32: 284 observations, complexity param=0.01166667
## predicted class=0 expected loss=0.3098592 P(node) =0.3702738
## class counts: 196 88
## probabilities: 0.690 0.310
## left son=64 (81 obs) right son=65 (203 obs)
## Primary splits:
## 16 < 0.504126 to the left, improve=2.859668, (0 missing)
## 17 < 0.082538 to the right, improve=2.666276, (0 missing)
## 2 < 15.5 to the left, improve=1.888548, (0 missing)
## 7 < 31.5 to the right, improve=1.872371, (0 missing)
## 9 < 12.5133 to the left, improve=1.794989, (0 missing)
## Surrogate splits:
## 2 < 4.5 to the left, agree=0.725, adj=0.037, (0 split)
## 3 < 4.5 to the left, agree=0.725, adj=0.037, (0 split)
## 4 < 4.5 to the left, agree=0.725, adj=0.037, (0 split)
## 7 < 3.5 to the left, agree=0.725, adj=0.037, (0 split)
## 11 < 0.002563 to the left, agree=0.725, adj=0.037, (0 split)
##
## Node number 33: 31 observations, complexity param=0.0125
## predicted class=1 expected loss=0.4516129 P(node) =0.04041721
## class counts: 14 17
## probabilities: 0.452 0.548
## left son=66 (10 obs) right son=67 (21 obs)
## Primary splits:
## 8 < 62.02538 to the right, improve=3.583410, (0 missing)
## 9 < 27.90475 to the right, improve=2.698273, (0 missing)
## 11 < 1.587035 to the right, improve=1.920056, (0 missing)
## 17 < 0.1060835 to the right, improve=1.334637, (0 missing)
## 5 < 16.5 to the left, improve=1.279839, (0 missing)
## Surrogate splits:
## 9 < 27.90475 to the right, agree=0.903, adj=0.7, (0 split)
## 10 < 7.007994 to the right, agree=0.806, adj=0.4, (0 split)
## 11 < 1.081676 to the right, agree=0.806, adj=0.4, (0 split)
## 2 < 10.5 to the left, agree=0.710, adj=0.1, (0 split)
## 16 < 0.5638935 to the right, agree=0.710, adj=0.1, (0 split)
##
## Node number 34: 122 observations, complexity param=0.01527778
## predicted class=0 expected loss=0.4590164 P(node) =0.1590613
## class counts: 66 56
## probabilities: 0.541 0.459
## left son=68 (63 obs) right son=69 (59 obs)
## Primary splits:
## 16 < 0.5224985 to the right, improve=3.141684, (0 missing)
## 2 < 42.5 to the left, improve=1.988755, (0 missing)
## 17 < 0.0866185 to the right, improve=1.974628, (0 missing)
## 8 < 28.00077 to the right, improve=1.857977, (0 missing)
## 3 < 32 to the left, improve=1.805850, (0 missing)
## Surrogate splits:
## 9 < 4.731862 to the left, agree=0.631, adj=0.237, (0 split)
## 12 < 0.003474 to the left, agree=0.582, adj=0.136, (0 split)
## 14 < 0.0005085 to the left, agree=0.574, adj=0.119, (0 split)
## 15 < 0.0005085 to the left, agree=0.574, adj=0.119, (0 split)
## 17 < 0.1084225 to the left, agree=0.574, adj=0.119, (0 split)
##
## Node number 35: 21 observations
## predicted class=1 expected loss=0.2380952 P(node) =0.0273794
## class counts: 5 16
## probabilities: 0.238 0.762
##
## Node number 64: 81 observations
## predicted class=0 expected loss=0.1975309 P(node) =0.1056063
## class counts: 65 16
## probabilities: 0.802 0.198
##
## Node number 65: 203 observations, complexity param=0.01166667
## predicted class=0 expected loss=0.3546798 P(node) =0.2646675
## class counts: 131 72
## probabilities: 0.645 0.355
## left son=130 (192 obs) right son=131 (11 obs)
## Primary splits:
## 17 < 0.081813 to the right, improve=3.229139, (0 missing)
## 8 < 58.10918 to the left, improve=2.917744, (0 missing)
## 9 < 13.74342 to the left, improve=2.811121, (0 missing)
## 6 < 37.5 to the right, improve=2.383777, (0 missing)
## 7 < 26.5 to the right, improve=2.242087, (0 missing)
##
## Node number 66: 10 observations
## predicted class=0 expected loss=0.2 P(node) =0.01303781
## class counts: 8 2
## probabilities: 0.800 0.200
##
## Node number 67: 21 observations
## predicted class=1 expected loss=0.2857143 P(node) =0.0273794
## class counts: 6 15
## probabilities: 0.286 0.714
##
## Node number 68: 63 observations
## predicted class=0 expected loss=0.3492063 P(node) =0.0821382
## class counts: 41 22
## probabilities: 0.651 0.349
##
## Node number 69: 59 observations
## predicted class=1 expected loss=0.4237288 P(node) =0.07692308
## class counts: 25 34
## probabilities: 0.424 0.576
##
## Node number 130: 192 observations, complexity param=0.01166667
## predicted class=0 expected loss=0.3333333 P(node) =0.2503259
## class counts: 128 64
## probabilities: 0.667 0.333
## left son=260 (101 obs) right son=261 (91 obs)
## Primary splits:
## 8 < 58.10918 to the left, improve=3.138142, (0 missing)
## 6 < 43.5 to the right, improve=2.844444, (0 missing)
## 17 < 0.0864105 to the left, improve=2.810980, (0 missing)
## 7 < 35.5 to the right, improve=2.593002, (0 missing)
## 5 < 43.5 to the right, improve=2.188473, (0 missing)
## Surrogate splits:
## 9 < 21.76366 to the left, agree=0.792, adj=0.560, (0 split)
## 10 < 6.228308 to the left, agree=0.786, adj=0.549, (0 split)
## 11 < 0.5863835 to the left, agree=0.729, adj=0.429, (0 split)
## 13 < 0.000995 to the left, agree=0.672, adj=0.308, (0 split)
## 6 < 30.5 to the right, agree=0.646, adj=0.253, (0 split)
##
## Node number 131: 11 observations
## predicted class=1 expected loss=0.2727273 P(node) =0.01434159
## class counts: 3 8
## probabilities: 0.273 0.727
##
## Node number 260: 101 observations
## predicted class=0 expected loss=0.2475248 P(node) =0.1316819
## class counts: 76 25
## probabilities: 0.752 0.248
##
## Node number 261: 91 observations, complexity param=0.01166667
## predicted class=0 expected loss=0.4285714 P(node) =0.1186441
## class counts: 52 39
## probabilities: 0.571 0.429
## left son=522 (35 obs) right son=523 (56 obs)
## Primary splits:
## 2 < 15.5 to the left, improve=4.550000, (0 missing)
## 17 < 0.1023955 to the left, improve=3.183673, (0 missing)
## 10 < 15.86594 to the right, improve=3.162225, (0 missing)
## 3 < 25.5 to the left, improve=3.142063, (0 missing)
## 4 < 24.5 to the left, improve=3.103836, (0 missing)
## Surrogate splits:
## 3 < 15.5 to the left, agree=0.978, adj=0.943, (0 split)
## 4 < 15.5 to the left, agree=0.945, adj=0.857, (0 split)
## 5 < 14.5 to the left, agree=0.912, adj=0.771, (0 split)
## 6 < 12.5 to the left, agree=0.868, adj=0.657, (0 split)
## 7 < 7.5 to the left, agree=0.857, adj=0.629, (0 split)
##
## Node number 522: 35 observations
## predicted class=0 expected loss=0.2285714 P(node) =0.04563233
## class counts: 27 8
## probabilities: 0.771 0.229
##
## Node number 523: 56 observations, complexity param=0.01166667
## predicted class=1 expected loss=0.4464286 P(node) =0.07301173
## class counts: 25 31
## probabilities: 0.446 0.554
## left son=1046 (20 obs) right son=1047 (36 obs)
## Primary splits:
## 10 < 15.86594 to the right, improve=5.734127, (0 missing)
## 11 < 1.397867 to the right, improve=4.514469, (0 missing)
## 7 < 8.5 to the right, improve=3.720238, (0 missing)
## 12 < 0.2241915 to the right, improve=3.286415, (0 missing)
## 9 < 24.05174 to the right, improve=2.922050, (0 missing)
## Surrogate splits:
## 11 < 2.035929 to the right, agree=0.821, adj=0.5, (0 split)
## 12 < 0.2500485 to the right, agree=0.821, adj=0.5, (0 split)
## 9 < 35.20614 to the right, agree=0.750, adj=0.3, (0 split)
## 2 < 52.5 to the right, agree=0.714, adj=0.2, (0 split)
## 3 < 51.5 to the right, agree=0.714, adj=0.2, (0 split)
##
## Node number 1046: 20 observations
## predicted class=0 expected loss=0.25 P(node) =0.02607562
## class counts: 15 5
## probabilities: 0.750 0.250
##
## Node number 1047: 36 observations
## predicted class=1 expected loss=0.2777778 P(node) =0.04693611
## class counts: 10 26
## probabilities: 0.278 0.722
# Show the structure of the full data set: one line per column with its type
# and first few values (output reproduced below).
str(data2)
## 'data.frame': 1151 obs. of 19 variables:
## $ 1 : num 1 1 1 1 1 1 0 1 1 1 ...
## $ 2 : num 22 24 62 55 44 44 29 6 22 79 ...
## $ 3 : num 22 24 60 53 44 43 29 6 21 75 ...
## $ 4 : num 22 22 59 53 44 41 29 6 18 73 ...
## $ 5 : num 19 18 54 50 41 41 27 6 15 71 ...
## $ 6 : num 18 16 47 43 39 37 25 2 13 64 ...
## $ 7 : num 14 13 33 31 27 29 16 1 10 47 ...
## $ 8 : num 49.9 57.7 55.8 40.5 18 ...
## $ 9 : num 17.78 23.8 27.99 18.45 8.57 ...
## $ 10 : num 5.27 3.33 12.69 9.12 0.41 ...
## $ 11 : num 0.772 0.234 4.852 3.079 0 ...
## $ 12 : num 0.0186 0.0039 1.3939 0.8403 0 ...
## $ 13 : num 0.00686 0.0039 0.37325 0.27243 0 ...
## $ 14 : num 0.00392 0.0039 0.04182 0.00765 0 ...
## $ 15 : num 0.00392 0.0039 0.00774 0.00153 0 ...
## $ 16 : num 0.487 0.521 0.531 0.483 0.476 ...
## $ 17 : num 0.1 0.144 0.129 0.115 0.124 ...
## $ 18 : num 1 0 0 0 0 0 0 1 0 0 ...
## $ Class: Factor w/ 2 levels "0","1": 1 1 2 1 2 2 2 1 2 2 ...
library(rpart.plot)
# Draw the fitted tree with rpart.plot.
rpart.plot(fit)

# Predict a class label (type = "class") for every row of the held-out test
# set, then print the predictions.
predict_unseen <- predict(object = fit, newdata = test_set, type = "class")
predict_unseen
## 2 3 6 8 9 10 11 14 16 17 21 26 27 32 34 40
## 0 1 0 1 0 1 0 0 1 1 0 1 0 0 1 1
## 41 44 45 48 51 52 54 56 59 63 65 70 72 73 74 75
## 1 1 0 0 0 1 0 1 1 1 1 1 0 1 1 1
## 76 77 81 82 85 87 92 97 102 107 115 116 120 121 130 133
## 1 0 1 1 0 1 1 1 1 1 1 1 1 0 1 1
## 134 135 136 137 139 141 142 143 146 152 154 155 158 159 160 161
## 1 0 0 1 0 1 1 1 1 0 0 0 0 0 1 0
## 164 165 168 172 174 176 180 181 187 192 193 195 197 201 202 203
## 0 1 1 0 0 0 1 0 1 1 0 1 1 0 0 0
## 204 205 207 213 214 220 224 228 232 236 240 246 250 251 252 257
## 0 1 0 1 1 1 1 1 1 0 0 0 0 0 0 1
## 259 262 264 269 270 271 282 286 287 293 296 297 298 299 300 303
## 0 0 1 0 1 0 0 0 0 0 1 0 1 0 1 1
## 311 318 319 325 329 330 335 338 340 342 343 346 353 356 363 364
## 0 1 1 1 0 0 1 1 1 1 0 1 0 0 1 1
## 365 369 377 378 379 385 387 388 390 392 400 401 413 417 420 422
## 1 1 1 0 1 0 1 1 0 1 1 1 0 1 1 1
## 424 425 426 428 429 438 442 446 447 455 456 457 465 468 473 474
## 1 0 0 0 1 0 1 0 1 0 1 1 1 1 1 0
## 481 483 484 488 494 496 498 499 501 502 506 507 517 520 522 523
## 1 0 1 1 0 0 1 1 0 1 1 1 1 0 1 1
## 526 530 532 535 539 540 541 543 547 549 550 551 552 560 563 570
## 0 0 0 1 0 1 1 1 0 0 0 1 1 0 1 0
## 572 573 574 575 578 579 580 586 590 591 601 602 603 609 621 624
## 0 1 1 1 0 0 0 1 1 1 1 0 0 0 0 0
## 626 628 630 631 635 636 637 638 639 640 643 644 647 648 649 655
## 0 1 0 1 1 1 1 1 0 1 1 1 1 1 1 1
## 657 659 663 664 672 673 675 678 679 681 682 684 685 687 692 693
## 1 0 1 1 0 0 1 1 1 1 0 1 0 1 0 1
## 694 699 700 707 710 711 714 715 716 719 720 726 727 728 730 732
## 0 0 1 1 0 0 1 1 1 0 0 0 1 0 1 1
## 737 739 741 746 748 756 762 766 770 774 778 780 786 797 799 804
## 1 1 0 0 0 0 1 0 1 1 1 1 0 1 1 0
## 805 807 809 813 815 819 822 824 827 828 829 834 835 838 840 843
## 1 1 1 1 1 0 0 1 0 1 0 1 1 0 0 0
## 853 854 857 860 863 869 870 873 876 878 883 890 898 901 902 920
## 1 0 1 1 0 1 1 1 1 0 0 1 0 0 0 1
## 926 929 933 934 936 946 947 951 953 955 956 960 961 963 969 970
## 1 0 1 1 0 1 1 1 1 0 1 0 0 0 0 1
## 971 979 984 985 987 988 989 990 991 993 997 1011 1015 1018 1020 1022
## 1 1 1 1 0 1 0 1 1 1 0 1 1 1 1 0
## 1023 1024 1028 1032 1035 1036 1038 1039 1041 1043 1047 1050 1051 1052 1054 1057
## 1 1 1 0 0 0 1 1 1 0 0 0 1 1 1 0
## 1059 1060 1063 1064 1067 1068 1069 1070 1073 1081 1089 1093 1096 1100 1101 1103
## 1 0 0 1 1 1 0 0 1 1 0 1 1 1 1 0
## 1110 1116 1118 1119 1120 1123 1125 1126 1127 1128 1133 1137 1143 1146 1148 1150
## 0 1 0 0 1 1 0 0 0 1 1 1 1 1 1 1
## Levels: 0 1
# Confusion Matrix
# Cross-tabulate the test set's Class values (rows) against the predicted
# labels (columns).
cmat <- table(test_set$Class, predict_unseen)
cmat
## predict_unseen
## 0 1
## 0 107 73
## 1 53 151
# Accuracy: diagonal (matching) counts divided by the total count.
sum(diag(cmat)) / sum(cmat)
## [1] 0.671875
# Base-graphics rendering of the tree: draw the branches, then add labels.
plot(fit)
text(fit)
