1.1 Sample Data
The Sonar dataset from the mlbench package was used for this illustrative example. The original dataset was transformed to simulate class imbalance.
Preliminary dataset assessment:
[A] 136 rows (observations)
[A.1] Train Set = 96 observations with an approximately 80:20 class ratio
[A.2] Test Set = 40 observations with an approximately 80:20 class ratio
[B] 61 columns (variables)
[B.1] 1/61 response = Class variable (factor)
[B.1.1] Levels = Class=M < Class=R
[B.2] 60/61 predictors = All remaining variables (60/60 numeric)
##################################
# Loading R libraries
##################################
library(AppliedPredictiveModeling)
library(caret)
library(rpart)
library(lattice)
library(dplyr)
library(tidyr)
library(moments)
library(skimr)
library(RANN)
library(mlbench)
library(pls)
library(corrplot)
library(tidyverse)
library(lares)
library(DMwR)
library(gridExtra)
library(rattle)
library(rpart.plot)
library(RColorBrewer)
library(stats)
library(nnet)
library(elasticnet)
library(earth)
library(party)
library(kernlab)
library(randomForest)
library(Cubist)
library(pROC)
library(mda)
library(klaR)
library(pamr)
library(MLmetrics)
##################################
# Loading the source data and
# formulating the train and test sets
##################################
data(Sonar)
Sonar.Original <- Sonar
Sonar.M <- Sonar[Sonar$Class=="M",]
Sonar.R <- Sonar[Sonar$Class=="R",]
set.seed(12345678)
Sonar.R.Reduced <- Sonar.R[sample(1:nrow(Sonar.R),25),]
Sonar <- as.data.frame(rbind(Sonar.M,Sonar.R.Reduced))
set.seed(12345678)
Sonar_Partition <- createDataPartition(Sonar$Class, p = .70, list = FALSE)
Sonar_Train <- Sonar[Sonar_Partition,]
Sonar_Test <- Sonar[-Sonar_Partition,]
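To confirm the simulated imbalance described in the preliminary assessment, the class distributions of the formulated train and test sets can be tabulated directly. The code below is a minimal illustrative check (not part of the original pipeline) using only the Sonar_Train and Sonar_Test objects created above.
##################################
# Verifying the simulated class imbalance
# in the train and test sets (illustrative check)
##################################
table(Sonar_Train$Class)
prop.table(table(Sonar_Train$Class))
table(Sonar_Test$Class)
prop.table(table(Sonar_Test$Class))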
##################################
# Performing a general exploration of the train set
##################################
dim(Sonar_Train)
## [1] 96 61
str(Sonar_Train)
## 'data.frame': 96 obs. of 61 variables:
## $ V1 : num 0.0629 0.0587 0.0428 0.0599 0.0264 0.0454 0.0283 0.0114 0.0414 0.0228 ...
## $ V2 : num 0.1065 0.121 0.0555 0.0474 0.0071 ...
## $ V3 : num 0.1526 0.1268 0.0708 0.0498 0.0342 ...
## $ V4 : num 0.1229 0.1498 0.0618 0.0387 0.0793 ...
## $ V5 : num 0.144 0.144 0.121 0.103 0.104 ...
## $ V6 : num 0.119 0.0561 0.1524 0.0773 0.0783 ...
## $ V7 : num 0.0884 0.0832 0.1543 0.0853 0.1417 ...
## $ V8 : num 0.0907 0.0672 0.0391 0.0447 0.1176 ...
## $ V9 : num 0.2107 0.1372 0.061 0.1094 0.0453 ...
## $ V10 : num 0.3597 0.2352 0.0113 0.0351 0.0945 ...
## $ V11 : num 0.547 0.321 0.126 0.158 0.113 ...
## $ V12 : num 0.52 0.426 0.247 0.202 0.084 ...
## $ V13 : num 0.5127 0.5201 0.3011 0.2268 0.0717 ...
## $ V14 : num 0.539 0.491 0.375 0.283 0.197 ...
## $ V15 : num 0.656 0.595 0.452 0.382 0.263 ...
## $ V16 : num 0.871 0.722 0.539 0.467 0.419 ...
## $ V17 : num 0.979 0.904 0.659 0.669 0.505 ...
## $ V18 : num 0.933 0.911 0.711 0.865 0.671 ...
## $ V19 : num 0.792 0.872 0.76 0.936 0.792 ...
## $ V20 : num 0.738 0.769 0.867 0.937 0.838 ...
## $ V21 : num 0.691 0.733 0.842 0.914 0.876 ...
## $ V22 : num 0.385 0.522 0.797 0.916 0.942 ...
## $ V23 : num 0.0671 0.3097 0.8385 0.9311 1 ...
## $ V24 : num 0.0502 0.3172 0.9317 0.8604 0.9931 ...
## $ V25 : num 0.272 0.227 0.856 0.733 0.958 ...
## $ V26 : num 0.284 0.164 0.616 0.576 0.865 ...
## $ V27 : num 0.223 0.175 0.414 0.416 0.722 ...
## $ V28 : num 0.191 0.183 0.327 0.411 0.58 ...
## $ V29 : num 0.0408 0.2048 0.3108 0.4146 0.4964 ...
## $ V30 : num 0.253 0.167 0.255 0.315 0.489 ...
## $ V31 : num 0.198 0.277 0.337 0.294 0.408 ...
## $ V32 : num 0.189 0.31 0.447 0.317 0.244 ...
## $ V33 : num 0.243 0.34 0.5 0.315 0.177 ...
## $ V34 : num 0.196 0.444 0.511 0.413 0.247 ...
## $ V35 : num 0.267 0.505 0.519 0.399 0.352 ...
## $ V36 : num 0.134 0.281 0.462 0.419 0.376 ...
## $ V37 : num 0.107 0.168 0.423 0.453 0.291 ...
## $ V38 : num 0.202 0.263 0.437 0.442 0.231 ...
## $ V39 : num 0.179 0.32 0.428 0.474 0.317 ...
## $ V40 : num 0.0227 0.1933 0.4433 0.3431 0.3554 ...
## $ V41 : num 0.1313 0.0934 0.37 0.3194 0.3741 ...
## $ V42 : num 0.1775 0.0443 0.3324 0.337 0.4443 ...
## $ V43 : num 0.155 0.078 0.256 0.249 0.326 ...
## $ V44 : num 0.1626 0.0722 0.2527 0.265 0.1963 ...
## $ V45 : num 0.0708 0.0405 0.2137 0.1748 0.0864 ...
## $ V46 : num 0.0129 0.0553 0.1789 0.0932 0.1688 ...
## $ V47 : num 0.0795 0.1081 0.101 0.053 0.1991 ...
## $ V48 : num 0.0762 0.1139 0.0528 0.0081 0.1217 ...
## $ V49 : num 0.0117 0.0767 0.0453 0.0342 0.0628 0.038 0.0244 0.0728 0.0177 0.0649 ...
## $ V50 : num 0.0061 0.0265 0.0118 0.0137 0.0323 0.0142 0.0179 0.0174 0.0065 0.0313 ...
## $ V51 : num 0.0257 0.0215 0.0009 0.0028 0.0253 0.0137 0.0109 0.0213 0.0222 0.0185 ...
## $ V52 : num 0.0089 0.0331 0.0142 0.0013 0.0214 0.012 0.0147 0.0269 0.0045 0.0098 ...
## $ V53 : num 0.0262 0.0111 0.0179 0.0005 0.0262 0.0042 0.017 0.0152 0.0136 0.0178 ...
## $ V54 : num 0.0108 0.0088 0.0079 0.0227 0.0177 0.0238 0.0158 0.0257 0.0113 0.0077 ...
## $ V55 : num 0.0138 0.0158 0.006 0.0209 0.0037 0.0129 0.0046 0.0097 0.0053 0.0074 ...
## $ V56 : num 0.0187 0.0122 0.0131 0.0081 0.0068 0.0084 0.0073 0.0041 0.0165 0.0095 ...
## $ V57 : num 0.023 0.0038 0.0089 0.0117 0.0121 0.0218 0.0054 0.005 0.0141 0.0055 ...
## $ V58 : num 0.0057 0.0101 0.0084 0.0114 0.0077 0.0321 0.0033 0.0145 0.0077 0.0045 ...
## $ V59 : num 0.0113 0.0228 0.0113 0.0112 0.0078 0.0154 0.0045 0.0103 0.0246 0.0063 ...
## $ V60 : num 0.0131 0.0124 0.0049 0.01 0.0066 0.0053 0.0079 0.0025 0.0198 0.0039 ...
## $ Class: Factor w/ 2 levels "M","R": 1 1 1 1 1 1 1 1 1 1 ...
summary(Sonar_Train)
## V1 V2 V3 V4
## Min. :0.00150 Min. :0.00060 Min. :0.00150 Min. :0.00580
## 1st Qu.:0.01362 1st Qu.:0.01897 1st Qu.:0.02448 1st Qu.:0.02960
## Median :0.02320 Median :0.03385 Median :0.03880 Median :0.04905
## Mean :0.03171 Mean :0.04387 Mean :0.04718 Mean :0.05631
## 3rd Qu.:0.03982 3rd Qu.:0.05892 3rd Qu.:0.06212 3rd Qu.:0.07220
## Max. :0.13710 Max. :0.15740 Max. :0.16650 Max. :0.16440
## V5 V6 V7 V8
## Min. :0.00670 Min. :0.0102 Min. :0.0182 Min. :0.0124
## 1st Qu.:0.04530 1st Qu.:0.0782 1st Qu.:0.0937 1st Qu.:0.0950
## Median :0.07430 Median :0.1135 Median :0.1298 Median :0.1356
## Mean :0.08196 Mean :0.1178 Mean :0.1332 Mean :0.1511
## 3rd Qu.:0.10855 3rd Qu.:0.1496 3rd Qu.:0.1683 3rd Qu.:0.1906
## Max. :0.24820 Max. :0.3823 Max. :0.3729 Max. :0.4566
## V9 V10 V11 V12
## Min. :0.0075 Min. :0.0113 Min. :0.0526 Min. :0.0236
## 1st Qu.:0.1299 1st Qu.:0.1424 1st Qu.:0.1926 1st Qu.:0.1837
## Median :0.1815 Median :0.2124 Median :0.2515 Median :0.2781
## Mean :0.2039 Mean :0.2334 Mean :0.2662 Mean :0.2796
## 3rd Qu.:0.2596 3rd Qu.:0.2940 3rd Qu.:0.3335 3rd Qu.:0.3501
## Max. :0.6828 Max. :0.5965 Max. :0.6675 Max. :0.5679
## V13 V14 V15 V16
## Min. :0.0616 Min. :0.0273 Min. :0.0092 Min. :0.0422
## 1st Qu.:0.2122 1st Qu.:0.1855 1st Qu.:0.1673 1st Qu.:0.1911
## Median :0.2930 Median :0.2904 Median :0.2751 Median :0.3203
## Mean :0.3021 Mean :0.3139 Mean :0.3194 Mean :0.3753
## 3rd Qu.:0.3730 3rd Qu.:0.4051 3rd Qu.:0.4403 3rd Qu.:0.5332
## Max. :0.7131 Max. :0.9970 Max. :0.9137 Max. :0.9751
## V17 V18 V19 V20
## Min. :0.0367 Min. :0.0375 Min. :0.1316 Min. :0.0656
## 1st Qu.:0.2087 1st Qu.:0.2427 1st Qu.:0.2964 1st Qu.:0.3972
## Median :0.3160 Median :0.3730 Median :0.4462 Median :0.6223
## Mean :0.4137 Mean :0.4475 Mean :0.5134 Mean :0.5861
## 3rd Qu.:0.6466 3rd Qu.:0.6731 3rd Qu.:0.7310 3rd Qu.:0.7978
## Max. :1.0000 Max. :0.9335 Max. :0.9828 Max. :1.0000
## V21 V22 V23 V24
## Min. :0.0512 Min. :0.0219 Min. :0.0610 Min. :0.0502
## 1st Qu.:0.4412 1st Qu.:0.3991 1st Qu.:0.4533 1st Qu.:0.5795
## Median :0.6939 Median :0.7021 Median :0.7139 Median :0.6985
## Mean :0.6393 Mean :0.6364 Mean :0.6500 Mean :0.6795
## 3rd Qu.:0.8449 3rd Qu.:0.8498 3rd Qu.:0.8690 3rd Qu.:0.8968
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## V25 V26 V27 V28
## Min. :0.0240 Min. :0.1640 Min. :0.1036 Min. :0.0598
## 1st Qu.:0.5690 1st Qu.:0.5637 1st Qu.:0.4955 1st Qu.:0.5582
## Median :0.7211 Median :0.7560 Median :0.7930 Median :0.7762
## Mean :0.6807 Mean :0.7079 Mean :0.7074 Mean :0.7076
## 3rd Qu.:0.8749 3rd Qu.:0.8766 3rd Qu.:0.9109 3rd Qu.:0.9116
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## V29 V30 V31 V32
## Min. :0.0144 Min. :0.0613 Min. :0.1000 Min. :0.0877
## 1st Qu.:0.4667 1st Qu.:0.4140 1st Qu.:0.3268 1st Qu.:0.2771
## Median :0.7096 Median :0.6028 Median :0.4416 Median :0.4078
## Mean :0.6518 Mean :0.5869 Mean :0.4970 Mean :0.4364
## 3rd Qu.:0.8672 3rd Qu.:0.7189 3rd Qu.:0.6461 3rd Qu.:0.5816
## Max. :1.0000 Max. :1.0000 Max. :0.9657 Max. :0.9306
## V33 V34 V35 V36
## Min. :0.0477 Min. :0.0588 Min. :0.0223 Min. :0.0080
## 1st Qu.:0.2364 1st Qu.:0.2164 1st Qu.:0.1746 1st Qu.:0.1381
## Median :0.3875 Median :0.3644 Median :0.2930 Median :0.2808
## Mean :0.4079 Mean :0.3940 Mean :0.3772 Mean :0.3649
## 3rd Qu.:0.5409 3rd Qu.:0.5421 3rd Qu.:0.5775 3rd Qu.:0.5348
## Max. :1.0000 Max. :0.9536 Max. :0.9518 Max. :1.0000
## V37 V38 V39 V40
## Min. :0.0351 Min. :0.0618 Min. :0.0436 Min. :0.0227
## 1st Qu.:0.1447 1st Qu.:0.1747 1st Qu.:0.1827 1st Qu.:0.1962
## Median :0.2594 Median :0.3245 Median :0.3058 Median :0.2812
## Mean :0.3525 Mean :0.3458 Mean :0.3464 Mean :0.3173
## 3rd Qu.:0.4884 3rd Qu.:0.4405 3rd Qu.:0.4801 3rd Qu.:0.4269
## Max. :0.9123 Max. :0.9480 Max. :0.9709 Max. :0.9297
## V41 V42 V43 V44
## Min. :0.0438 Min. :0.0443 Min. :0.0308 Min. :0.0255
## 1st Qu.:0.1696 1st Qu.:0.1688 1st Qu.:0.1611 1st Qu.:0.1386
## Median :0.2658 Median :0.2808 Median :0.2580 Median :0.1916
## Mean :0.3043 Mean :0.3053 Mean :0.2722 Mean :0.2370
## 3rd Qu.:0.4094 3rd Qu.:0.3973 3rd Qu.:0.3471 3rd Qu.:0.3081
## Max. :0.8995 Max. :0.8246 Max. :0.7517 Max. :0.5772
## V45 V46 V47 V48
## Min. :0.0352 Min. :0.0080 Min. :0.01790 Min. :0.0081
## 1st Qu.:0.1105 1st Qu.:0.0846 1st Qu.:0.07727 1st Qu.:0.0531
## Median :0.1736 Median :0.1445 Median :0.10900 Median :0.0935
## Mean :0.2362 Mean :0.1930 Mean :0.14301 Mean :0.1088
## 3rd Qu.:0.3626 3rd Qu.:0.2283 3rd Qu.:0.18247 3rd Qu.:0.1351
## Max. :0.7034 Max. :0.7292 Max. :0.55220 Max. :0.3339
## V49 V50 V51 V52
## Min. :0.00730 Min. :0.00440 Min. :0.00090 Min. :0.00130
## 1st Qu.:0.03322 1st Qu.:0.01310 1st Qu.:0.01040 1st Qu.:0.00875
## Median :0.05445 Median :0.01920 Median :0.01565 Median :0.01215
## Mean :0.06444 Mean :0.02375 Mean :0.01878 Mean :0.01516
## 3rd Qu.:0.09137 3rd Qu.:0.02902 3rd Qu.:0.02363 3rd Qu.:0.01830
## Max. :0.19810 Max. :0.08250 Max. :0.10040 Max. :0.07090
## V53 V54 V55 V56
## Min. :0.000500 Min. :0.001000 Min. :0.001100 Min. :0.000400
## 1st Qu.:0.004975 1st Qu.:0.005375 1st Qu.:0.003700 1st Qu.:0.004350
## Median :0.007900 Median :0.009700 Median :0.007700 Median :0.007050
## Mean :0.010800 Mean :0.011600 Mean :0.009373 Mean :0.008372
## 3rd Qu.:0.015375 3rd Qu.:0.015050 3rd Qu.:0.012625 3rd Qu.:0.011625
## Max. :0.036100 Max. :0.035200 Max. :0.044700 Max. :0.039400
## V57 V58 V59 V60
## Min. :0.001100 Min. :0.000900 Min. :0.000100 Min. :0.000600
## 1st Qu.:0.003700 1st Qu.:0.003600 1st Qu.:0.003550 1st Qu.:0.003100
## Median :0.005750 Median :0.006300 Median :0.007000 Median :0.005100
## Mean :0.007678 Mean :0.008472 Mean :0.008259 Mean :0.006066
## 3rd Qu.:0.010725 3rd Qu.:0.010275 3rd Qu.:0.010750 3rd Qu.:0.008125
## Max. :0.035500 Max. :0.044000 Max. :0.029400 Max. :0.021800
## Class
## M:78
## R:18
##
##
##
##
##################################
# Performing a general exploration of the test set
##################################
dim(Sonar_Test)
## [1] 40 61
str(Sonar_Test)
## 'data.frame': 40 obs. of 61 variables:
## $ V1 : num 0.0491 0.1313 0.0201 0.0335 0.0162 ...
## $ V2 : num 0.0279 0.2339 0.0423 0.0134 0.0253 ...
## $ V3 : num 0.0592 0.3059 0.0554 0.0696 0.0262 ...
## $ V4 : num 0.127 0.4264 0.0783 0.118 0.0386 ...
## $ V5 : num 0.1772 0.401 0.062 0.0348 0.0645 ...
## $ V6 : num 0.1908 0.1791 0.0871 0.118 0.0472 ...
## $ V7 : num 0.222 0.185 0.12 0.195 0.106 ...
## $ V8 : num 0.0768 0.0055 0.2707 0.1607 0.1388 ...
## $ V9 : num 0.1246 0.1929 0.1206 0.3036 0.0598 ...
## $ V10 : num 0.2028 0.2231 0.0279 0.4372 0.1334 ...
## $ V11 : num 0.0947 0.2907 0.2251 0.5533 0.2969 ...
## $ V12 : num 0.25 0.226 0.262 0.577 0.475 ...
## $ V13 : num 0.221 0.314 0.177 0.702 0.568 ...
## $ V14 : num 0.32 0.33 0.371 0.707 0.569 ...
## $ V15 : num 0.334 0.366 0.453 0.737 0.642 ...
## $ V16 : num 0.332 0.396 0.555 0.739 0.749 ...
## $ V17 : num 0.278 0.439 0.462 0.862 0.9 ...
## $ V18 : num 0.297 0.467 0.38 0.946 1 ...
## $ V19 : num 0.295 0.525 0.345 0.878 0.969 ...
## $ V20 : num 0.173 0.373 0.267 0.791 0.903 ...
## $ V21 : num 0.326 0.224 0.239 0.576 0.768 ...
## $ V22 : num 0.383 0.197 0.113 0.306 0.7 ...
## $ V23 : num 0.3523 0.4337 0.2556 0.0563 0.6644 ...
## $ V24 : num 0.541 0.6532 0.5169 0.0239 0.5964 ...
## $ V25 : num 0.523 0.507 0.378 0.255 0.371 ...
## $ V26 : num 0.4475 0.2796 0.4082 0.4862 0.0921 ...
## $ V27 : num 0.534 0.4163 0.5353 0.5027 0.0481 ...
## $ V28 : num 0.5323 0.595 0.5116 0.4402 0.0876 ...
## $ V29 : num 0.391 0.524 0.454 0.285 0.104 ...
## $ V30 : num 0.346 0.418 0.426 0.18 0.171 ...
## $ V31 : num 0.409 0.371 0.387 0.356 0.326 ...
## $ V32 : num 0.464 0.237 0.394 0.352 0.461 ...
## $ V33 : num 0.558 0.0863 0.4661 0.3321 0.3939 ...
## $ V34 : num 0.573 0.144 0.397 0.311 0.505 ...
## $ V35 : num 0.635 0.29 0.219 0.364 0.483 ...
## $ V36 : num 0.7563 0.4577 0.1816 0.0754 0.3511 ...
## $ V37 : num 0.69 0.372 0.102 0.183 0.232 ...
## $ V38 : num 0.618 0.337 0.211 0.182 0.403 ...
## $ V39 : num 0.538 0.38 0.325 0.181 0.368 ...
## $ V40 : num 0.562 0.418 0.37 0.159 0.151 ...
## $ V41 : num 0.6508 0.3603 0.2912 0.0576 0.0745 ...
## $ V42 : num 0.4797 0.2711 0.301 0.0954 0.1395 ...
## $ V43 : num 0.374 0.165 0.256 0.109 0.155 ...
## $ V44 : num 0.2804 0.1951 0.1927 0.0812 0.0377 ...
## $ V45 : num 0.1982 0.2811 0.2062 0.0784 0.0636 ...
## $ V46 : num 0.2438 0.2246 0.1751 0.0487 0.0443 ...
## $ V47 : num 0.1789 0.1921 0.0841 0.0439 0.0264 ...
## $ V48 : num 0.1706 0.15 0.1035 0.0586 0.0223 ...
## $ V49 : num 0.0762 0.0665 0.0641 0.037 0.0187 0.0245 0.0102 0.0436 0.0293 0.0469 ...
## $ V50 : num 0.0238 0.0193 0.0153 0.0185 0.0077 0.019 0.0057 0.0224 0.0183 0.0114 ...
## $ V51 : num 0.0268 0.0156 0.0081 0.0302 0.0137 0.0063 0.0031 0.0133 0.0104 0.0299 ...
## $ V52 : num 0.0081 0.0362 0.0191 0.0244 0.0071 0.0321 0.0163 0.0078 0.0117 0.0244 ...
## $ V53 : num 0.0129 0.021 0.0182 0.0232 0.0082 0.0189 0.0099 0.0174 0.0101 0.0199 ...
## $ V54 : num 0.0161 0.0154 0.016 0.0093 0.0232 0.0137 0.0084 0.0176 0.0061 0.0257 ...
## $ V55 : num 0.0063 0.018 0.029 0.0159 0.0198 0.0277 0.027 0.0038 0.0031 0.0082 ...
## $ V56 : num 0.0119 0.0013 0.009 0.0193 0.0074 0.0152 0.0277 0.0129 0.0099 0.0151 ...
## $ V57 : num 0.0194 0.0106 0.0242 0.0032 0.0035 0.0052 0.0097 0.0066 0.008 0.0171 ...
## $ V58 : num 0.014 0.0127 0.0224 0.0377 0.01 0.0121 0.0054 0.0044 0.0107 0.0146 ...
## $ V59 : num 0.0332 0.0178 0.019 0.0126 0.0048 0.0124 0.0148 0.0134 0.0161 0.0134 ...
## $ V60 : num 0.0439 0.0231 0.0096 0.0156 0.0019 0.0055 0.0092 0.0092 0.0133 0.0056 ...
## $ Class: Factor w/ 2 levels "M","R": 1 1 1 1 1 1 1 1 1 1 ...
summary(Sonar_Test)
## V1 V2 V3 V4
## Min. :0.00470 Min. :0.00220 Min. :0.00450 Min. :0.00760
## 1st Qu.:0.01620 1st Qu.:0.01392 1st Qu.:0.01770 1st Qu.:0.02615
## Median :0.02495 Median :0.03190 Median :0.03660 Median :0.04465
## Mean :0.03229 Mean :0.03954 Mean :0.04819 Mean :0.07107
## 3rd Qu.:0.03665 3rd Qu.:0.04850 3rd Qu.:0.05635 3rd Qu.:0.08830
## Max. :0.13130 Max. :0.23390 Max. :0.30590 Max. :0.42640
## V5 V6 V7 V8
## Min. :0.00970 Min. :0.02260 Min. :0.00330 Min. :0.00550
## 1st Qu.:0.03470 1st Qu.:0.05325 1st Qu.:0.06792 1st Qu.:0.08903
## Median :0.06155 Median :0.07610 Median :0.09480 Median :0.11180
## Mean :0.08154 Mean :0.08995 Mean :0.11237 Mean :0.12967
## 3rd Qu.:0.08470 3rd Qu.:0.11365 3rd Qu.:0.14510 3rd Qu.:0.15188
## Max. :0.40100 Max. :0.22470 Max. :0.33220 Max. :0.45900
## V9 V10 V11 V12
## Min. :0.0494 Min. :0.0193 Min. :0.0523 Min. :0.0259
## 1st Qu.:0.1000 1st Qu.:0.1261 1st Qu.:0.1572 1st Qu.:0.2245
## Median :0.1439 Median :0.1813 Median :0.2363 Median :0.2599
## Mean :0.1795 Mean :0.2212 Mean :0.2595 Mean :0.2809
## 3rd Qu.:0.2196 3rd Qu.:0.2596 3rd Qu.:0.2991 3rd Qu.:0.3141
## Max. :0.5664 Max. :0.7106 Max. :0.7342 Max. :0.5771
## V13 V14 V15 V16
## Min. :0.1184 Min. :0.0336 Min. :0.0166 Min. :0.0572
## 1st Qu.:0.2081 1st Qu.:0.2122 1st Qu.:0.1990 1st Qu.:0.2072
## Median :0.2581 Median :0.2959 Median :0.3125 Median :0.3199
## Mean :0.2880 Mean :0.3048 Mean :0.3301 Mean :0.3778
## 3rd Qu.:0.3155 3rd Qu.:0.3464 3rd Qu.:0.4298 3rd Qu.:0.5161
## Max. :0.7022 Max. :0.7067 Max. :0.7367 Max. :0.8278
## V17 V18 V19 V20
## Min. :0.1162 Min. :0.0837 Min. :0.1151 Min. :0.0902
## 1st Qu.:0.2159 1st Qu.:0.2492 1st Qu.:0.3366 1st Qu.:0.3652
## Median :0.3154 Median :0.3607 Median :0.5134 Median :0.6252
## Mean :0.4086 Mean :0.4693 Mean :0.5419 Mean :0.5995
## 3rd Qu.:0.6000 3rd Qu.:0.6776 3rd Qu.:0.8178 3rd Qu.:0.8684
## Max. :0.8999 Max. :1.0000 Max. :0.9975 Max. :0.9911
## V21 V22 V23 V24
## Min. :0.1354 Min. :0.1127 Min. :0.0563 Min. :0.0239
## 1st Qu.:0.4244 1st Qu.:0.4482 1st Qu.:0.5467 1st Qu.:0.5782
## Median :0.7064 Median :0.7190 Median :0.7579 Median :0.7542
## Mean :0.6382 Mean :0.6577 Mean :0.6836 Mean :0.7058
## 3rd Qu.:0.8115 3rd Qu.:0.8320 3rd Qu.:0.8524 3rd Qu.:0.8771
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## V25 V26 V27 V28
## Min. :0.1934 Min. :0.0921 Min. :0.0481 Min. :0.0284
## 1st Qu.:0.5189 1st Qu.:0.4807 1st Qu.:0.4598 1st Qu.:0.5109
## Median :0.7201 Median :0.7925 Median :0.7719 Median :0.7435
## Mean :0.6937 Mean :0.6907 Mean :0.6910 Mean :0.6893
## 3rd Qu.:0.9090 3rd Qu.:0.9534 3rd Qu.:0.9674 3rd Qu.:0.9476
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## V29 V30 V31 V32
## Min. :0.1008 Min. :0.1714 Min. :0.0482 Min. :0.0404
## 1st Qu.:0.4543 1st Qu.:0.4012 1st Qu.:0.3486 1st Qu.:0.3132
## Median :0.6583 Median :0.6019 Median :0.4360 Median :0.4179
## Mean :0.6327 Mean :0.5438 Mean :0.4626 Mean :0.4103
## 3rd Qu.:0.8402 3rd Qu.:0.6990 3rd Qu.:0.5930 3rd Qu.:0.4918
## Max. :1.0000 Max. :0.9151 Max. :0.8828 Max. :0.9108
## V33 V34 V35 V36
## Min. :0.0637 Min. :0.0212 Min. :0.0619 Min. :0.0271
## 1st Qu.:0.2634 1st Qu.:0.2005 1st Qu.:0.1475 1st Qu.:0.1501
## Median :0.3797 Median :0.3052 Median :0.2669 Median :0.2350
## Mean :0.3832 Mean :0.3476 Mean :0.3285 Mean :0.3095
## 3rd Qu.:0.5090 3rd Qu.:0.4620 3rd Qu.:0.4560 3rd Qu.:0.4424
## Max. :0.7927 Max. :0.8703 Max. :1.0000 Max. :0.9212
## V37 V38 V39 V40
## Min. :0.0476 Min. :0.0411 Min. :0.0712 Min. :0.0325
## 1st Qu.:0.1535 1st Qu.:0.1741 1st Qu.:0.1754 1st Qu.:0.1572
## Median :0.2416 Median :0.3095 Median :0.3251 Median :0.2807
## Mean :0.2919 Mean :0.3190 Mean :0.3071 Mean :0.2859
## 3rd Qu.:0.4083 3rd Qu.:0.4115 3rd Qu.:0.3901 3rd Qu.:0.4062
## Max. :0.9386 Max. :0.9303 Max. :0.7601 Max. :0.6034
## V41 V42 V43 V44
## Min. :0.0360 Min. :0.0300 Min. :0.0537 Min. :0.0255
## 1st Qu.:0.1157 1st Qu.:0.1473 1st Qu.:0.1704 1st Qu.:0.1412
## Median :0.2497 Median :0.2228 Median :0.2265 Median :0.1953
## Mean :0.2644 Mean :0.2729 Mean :0.2534 Mean :0.2204
## 3rd Qu.:0.3752 3rd Qu.:0.4326 3rd Qu.:0.3649 3rd Qu.:0.2792
## Max. :0.6508 Max. :0.6443 Max. :0.4478 Max. :0.5245
## V45 V46 V47 V48
## Min. :0.0298 Min. :0.01380 Min. :0.0237 Min. :0.00410
## 1st Qu.:0.0908 1st Qu.:0.07405 1st Qu.:0.0744 1st Qu.:0.04977
## Median :0.1463 Median :0.12550 Median :0.1134 Median :0.08030
## Mean :0.1969 Mean :0.15892 Mean :0.1220 Mean :0.08778
## 3rd Qu.:0.2072 3rd Qu.:0.20820 3rd Qu.:0.1572 3rd Qu.:0.12095
## Max. :0.6149 Max. :0.52930 Max. :0.3385 Max. :0.20520
## V49 V50 V51 V52
## Min. :0.01020 Min. :0.00500 Min. :0.00260 Min. :0.00400
## 1st Qu.:0.02652 1st Qu.:0.01155 1st Qu.:0.01093 1st Qu.:0.00945
## Median :0.04525 Median :0.01875 Median :0.01550 Median :0.01340
## Mean :0.04845 Mean :0.01904 Mean :0.01648 Mean :0.01561
## 3rd Qu.:0.06732 3rd Qu.:0.02312 3rd Qu.:0.02050 3rd Qu.:0.01770
## Max. :0.10690 Max. :0.06370 Max. :0.03800 Max. :0.04590
## V53 V54 V55 V56
## Min. :0.001500 Min. :0.00180 Min. :0.001300 Min. :0.00130
## 1st Qu.:0.009125 1st Qu.:0.00605 1st Qu.:0.003875 1st Qu.:0.00450
## Median :0.012000 Median :0.00905 Median :0.006250 Median :0.00700
## Mean :0.012740 Mean :0.01205 Mean :0.010433 Mean :0.00858
## 3rd Qu.:0.015675 3rd Qu.:0.01638 3rd Qu.:0.014550 3rd Qu.:0.01063
## Max. :0.039000 Max. :0.03350 Max. :0.037600 Max. :0.02770
## V57 V58 V59 V60
## Min. :0.000900 Min. :0.000600 Min. :0.000200 Min. :0.00150
## 1st Qu.:0.003425 1st Qu.:0.003600 1st Qu.:0.003575 1st Qu.:0.00310
## Median :0.005800 Median :0.005800 Median :0.006000 Median :0.00570
## Mean :0.007403 Mean :0.008155 Mean :0.009057 Mean :0.00817
## 3rd Qu.:0.009025 3rd Qu.:0.011650 3rd Qu.:0.012450 3rd Qu.:0.01020
## Max. :0.024200 Max. :0.037700 Max. :0.036400 Max. :0.04390
## Class
## M:33
## R: 7
##
##
##
##
##################################
# Formulating a data type assessment summary
##################################
PDA <- Sonar_Train
(PDA.Summary <- data.frame(
Column.Index=c(1:length(names(PDA))),
Column.Name= names(PDA),
Column.Type=sapply(PDA, function(x) class(x)),
row.names=NULL)
)
## Column.Index Column.Name Column.Type
## 1 1 V1 numeric
## 2 2 V2 numeric
## 3 3 V3 numeric
## 4 4 V4 numeric
## 5 5 V5 numeric
## 6 6 V6 numeric
## 7 7 V7 numeric
## 8 8 V8 numeric
## 9 9 V9 numeric
## 10 10 V10 numeric
## 11 11 V11 numeric
## 12 12 V12 numeric
## 13 13 V13 numeric
## 14 14 V14 numeric
## 15 15 V15 numeric
## 16 16 V16 numeric
## 17 17 V17 numeric
## 18 18 V18 numeric
## 19 19 V19 numeric
## 20 20 V20 numeric
## 21 21 V21 numeric
## 22 22 V22 numeric
## 23 23 V23 numeric
## 24 24 V24 numeric
## 25 25 V25 numeric
## 26 26 V26 numeric
## 27 27 V27 numeric
## 28 28 V28 numeric
## 29 29 V29 numeric
## 30 30 V30 numeric
## 31 31 V31 numeric
## 32 32 V32 numeric
## 33 33 V33 numeric
## 34 34 V34 numeric
## 35 35 V35 numeric
## 36 36 V36 numeric
## 37 37 V37 numeric
## 38 38 V38 numeric
## 39 39 V39 numeric
## 40 40 V40 numeric
## 41 41 V41 numeric
## 42 42 V42 numeric
## 43 43 V43 numeric
## 44 44 V44 numeric
## 45 45 V45 numeric
## 46 46 V46 numeric
## 47 47 V47 numeric
## 48 48 V48 numeric
## 49 49 V49 numeric
## 50 50 V50 numeric
## 51 51 V51 numeric
## 52 52 V52 numeric
## 53 53 V53 numeric
## 54 54 V54 numeric
## 55 55 V55 numeric
## 56 56 V56 numeric
## 57 57 V57 numeric
## 58 58 V58 numeric
## 59 59 V59 numeric
## 60 60 V60 numeric
## 61 61 Class factor
1.2 Data Quality Assessment
Data quality assessment:
[A] No missing observations noted for any variable.
[B] Low variance observed for 17 variables with First.Second.Mode.Ratio>5 (see the sketch after this list for how these metrics are computed).
[B.1] V9 variable (numeric)
[B.2] V10 variable (numeric)
[B.3] V12 variable (numeric)
[B.4] V16 variable (numeric)
[B.5] V19 variable (numeric)
[B.6] V26 variable (numeric)
[B.7] V28 variable (numeric)
[B.8] V32 variable (numeric)
[B.9] V34 variable (numeric)
[B.10] V35 variable (numeric)
[B.11] V37 variable (numeric)
[B.12] V38 variable (numeric)
[B.13] V41 variable (numeric)
[B.14] V42 variable (numeric)
[B.15] V43 variable (numeric)
[B.16] V45 variable (numeric)
[B.17] V48 variable (numeric)
[C] No low variance noted for any variable with Unique.Count.Ratio<0.01.
[D] No high skewness noted for any variable with Skewness>3 or Skewness<(-3).
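For reference, the screening metrics cited above can be reproduced for a single predictor. The sketch below approximates the fill rate, unique-count ratio, first-to-second mode ratio, and skewness for V1 using base R tabulation and the moments package; it is a simplified illustration of the mode logic implemented in the functions later in this section.
##################################
# Illustrative computation of the screening
# metrics for a single predictor (V1);
# a simplified approximation of the full summary below
##################################
x <- Sonar_Train$V1
ValueCounts <- sort(table(x), decreasing = TRUE)
c(Fill.Rate = sum(!is.na(x))/length(x),
  Unique.Count.Ratio = length(unique(x))/length(x),
  First.Second.Mode.Ratio = as.numeric(ValueCounts[1])/as.numeric(ValueCounts[2]),
  Skewness = moments::skewness(x, na.rm = TRUE))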
##################################
# Loading dataset
##################################
DQA <- Sonar_Train
##################################
# Formulating an overall data quality assessment summary
##################################
(DQA.Summary <- data.frame(
Column.Index=c(1:length(names(DQA))),
Column.Name= names(DQA),
Column.Type=sapply(DQA, function(x) class(x)),
Row.Count=sapply(DQA, function(x) nrow(DQA)),
NA.Count=sapply(DQA,function(x)sum(is.na(x))),
Fill.Rate=sapply(DQA,function(x)format(round((sum(!is.na(x))/nrow(DQA)),3),nsmall=3)),
row.names=NULL)
)
## Column.Index Column.Name Column.Type Row.Count NA.Count Fill.Rate
## 1 1 V1 numeric 96 0 1.000
## 2 2 V2 numeric 96 0 1.000
## 3 3 V3 numeric 96 0 1.000
## 4 4 V4 numeric 96 0 1.000
## 5 5 V5 numeric 96 0 1.000
## 6 6 V6 numeric 96 0 1.000
## 7 7 V7 numeric 96 0 1.000
## 8 8 V8 numeric 96 0 1.000
## 9 9 V9 numeric 96 0 1.000
## 10 10 V10 numeric 96 0 1.000
## 11 11 V11 numeric 96 0 1.000
## 12 12 V12 numeric 96 0 1.000
## 13 13 V13 numeric 96 0 1.000
## 14 14 V14 numeric 96 0 1.000
## 15 15 V15 numeric 96 0 1.000
## 16 16 V16 numeric 96 0 1.000
## 17 17 V17 numeric 96 0 1.000
## 18 18 V18 numeric 96 0 1.000
## 19 19 V19 numeric 96 0 1.000
## 20 20 V20 numeric 96 0 1.000
## 21 21 V21 numeric 96 0 1.000
## 22 22 V22 numeric 96 0 1.000
## 23 23 V23 numeric 96 0 1.000
## 24 24 V24 numeric 96 0 1.000
## 25 25 V25 numeric 96 0 1.000
## 26 26 V26 numeric 96 0 1.000
## 27 27 V27 numeric 96 0 1.000
## 28 28 V28 numeric 96 0 1.000
## 29 29 V29 numeric 96 0 1.000
## 30 30 V30 numeric 96 0 1.000
## 31 31 V31 numeric 96 0 1.000
## 32 32 V32 numeric 96 0 1.000
## 33 33 V33 numeric 96 0 1.000
## 34 34 V34 numeric 96 0 1.000
## 35 35 V35 numeric 96 0 1.000
## 36 36 V36 numeric 96 0 1.000
## 37 37 V37 numeric 96 0 1.000
## 38 38 V38 numeric 96 0 1.000
## 39 39 V39 numeric 96 0 1.000
## 40 40 V40 numeric 96 0 1.000
## 41 41 V41 numeric 96 0 1.000
## 42 42 V42 numeric 96 0 1.000
## 43 43 V43 numeric 96 0 1.000
## 44 44 V44 numeric 96 0 1.000
## 45 45 V45 numeric 96 0 1.000
## 46 46 V46 numeric 96 0 1.000
## 47 47 V47 numeric 96 0 1.000
## 48 48 V48 numeric 96 0 1.000
## 49 49 V49 numeric 96 0 1.000
## 50 50 V50 numeric 96 0 1.000
## 51 51 V51 numeric 96 0 1.000
## 52 52 V52 numeric 96 0 1.000
## 53 53 V53 numeric 96 0 1.000
## 54 54 V54 numeric 96 0 1.000
## 55 55 V55 numeric 96 0 1.000
## 56 56 V56 numeric 96 0 1.000
## 57 57 V57 numeric 96 0 1.000
## 58 58 V58 numeric 96 0 1.000
## 59 59 V59 numeric 96 0 1.000
## 60 60 V60 numeric 96 0 1.000
## 61 61 Class factor 96 0 1.000
##################################
# Listing all predictors
##################################
DQA.Predictors <- DQA[,!names(DQA) %in% c("Class")]
##################################
# Listing all numeric predictors
##################################
DQA.Predictors.Numeric <- DQA.Predictors[,sapply(DQA.Predictors, is.numeric)]
if (length(names(DQA.Predictors.Numeric))>0) {
print(paste0("There are ",
(length(names(DQA.Predictors.Numeric))),
" numeric predictor variable(s)."))
} else {
print("There are no numeric predictor variables.")
}
## [1] "There are 60 numeric predictor variable(s)."
##################################
# Listing all factor predictors
##################################
DQA.Predictors.Factor <- DQA.Predictors[,sapply(DQA.Predictors, is.factor)]
if (length(names(DQA.Predictors.Factor))>0) {
print(paste0("There are ",
(length(names(DQA.Predictors.Factor))),
" factor predictor variable(s)."))
} else {
print("There are no factor predictor variables.")
}
## [1] "There are no factor predictor variables."
##################################
# Formulating a data quality assessment summary for factor predictors
##################################
if (length(names(DQA.Predictors.Factor))>0) {
##################################
# Formulating a function to determine the first mode
##################################
FirstModes <- function(x) {
ux <- unique(na.omit(x))
tab <- tabulate(match(x, ux))
ux[tab == max(tab)]
}
##################################
# Formulating a function to determine the second mode
##################################
SecondModes <- function(x) {
ux <- unique(na.omit(x))
tab <- tabulate(match(x, ux))
fm = ux[tab == max(tab)]
sm = x[!(x %in% fm)]
usm <- unique(sm)
tabsm <- tabulate(match(sm, usm))
ifelse(is.na(usm[tabsm == max(tabsm)])==TRUE,
return("x"),
return(usm[tabsm == max(tabsm)]))
}
(DQA.Predictors.Factor.Summary <- data.frame(
Column.Name= names(DQA.Predictors.Factor),
Column.Type=sapply(DQA.Predictors.Factor, function(x) class(x)),
Unique.Count=sapply(DQA.Predictors.Factor, function(x) length(unique(x))),
First.Mode.Value=sapply(DQA.Predictors.Factor, function(x) as.character(FirstModes(x)[1])),
Second.Mode.Value=sapply(DQA.Predictors.Factor, function(x) as.character(SecondModes(x)[1])),
First.Mode.Count=sapply(DQA.Predictors.Factor, function(x) sum(na.omit(x) == FirstModes(x)[1])),
Second.Mode.Count=sapply(DQA.Predictors.Factor, function(x) sum(na.omit(x) == SecondModes(x)[1])),
Unique.Count.Ratio=sapply(DQA.Predictors.Factor, function(x) format(round((length(unique(x))/nrow(DQA.Predictors.Factor)),3), nsmall=3)),
First.Second.Mode.Ratio=sapply(DQA.Predictors.Factor, function(x) format(round((sum(na.omit(x) == FirstModes(x)[1])/sum(na.omit(x) == SecondModes(x)[1])),3), nsmall=3)),
row.names=NULL)
)
}
##################################
# Formulating a data quality assessment summary for numeric predictors
##################################
if (length(names(DQA.Predictors.Numeric))>0) {
##################################
# Formulating a function to determine the first mode
##################################
FirstModes <- function(x) {
ux <- unique(na.omit(x))
tab <- tabulate(match(x, ux))
ux[tab == max(tab)]
}
##################################
# Formulating a function to determine the second mode
##################################
SecondModes <- function(x) {
ux <- unique(na.omit(x))
tab <- tabulate(match(x, ux))
fm = ux[tab == max(tab)]
sm = na.omit(x)[!(na.omit(x) %in% fm)]
usm <- unique(sm)
tabsm <- tabulate(match(sm, usm))
ifelse(is.na(usm[tabsm == max(tabsm)])==TRUE,
return(0.00001),
return(usm[tabsm == max(tabsm)]))
}
(DQA.Predictors.Numeric.Summary <- data.frame(
Column.Name= names(DQA.Predictors.Numeric),
Column.Type=sapply(DQA.Predictors.Numeric, function(x) class(x)),
Unique.Count=sapply(DQA.Predictors.Numeric, function(x) length(unique(x))),
Unique.Count.Ratio=sapply(DQA.Predictors.Numeric, function(x) format(round((length(unique(x))/nrow(DQA.Predictors.Numeric)),3), nsmall=3)),
First.Mode.Value=sapply(DQA.Predictors.Numeric, function(x) format(round((FirstModes(x)[1]),3),nsmall=3)),
Second.Mode.Value=sapply(DQA.Predictors.Numeric, function(x) format(round((SecondModes(x)[1]),3),nsmall=3)),
First.Mode.Count=sapply(DQA.Predictors.Numeric, function(x) sum(na.omit(x) == FirstModes(x)[1])),
Second.Mode.Count=sapply(DQA.Predictors.Numeric, function(x) sum(na.omit(x) == SecondModes(x)[1])),
First.Second.Mode.Ratio=sapply(DQA.Predictors.Numeric, function(x) format(round((sum(na.omit(x) == FirstModes(x)[1])/sum(na.omit(x) == SecondModes(x)[1])),3), nsmall=3)),
Minimum=sapply(DQA.Predictors.Numeric, function(x) format(round(min(x,na.rm = TRUE),3), nsmall=3)),
Mean=sapply(DQA.Predictors.Numeric, function(x) format(round(mean(x,na.rm = TRUE),3), nsmall=3)),
Median=sapply(DQA.Predictors.Numeric, function(x) format(round(median(x,na.rm = TRUE),3), nsmall=3)),
Maximum=sapply(DQA.Predictors.Numeric, function(x) format(round(max(x,na.rm = TRUE),3), nsmall=3)),
Skewness=sapply(DQA.Predictors.Numeric, function(x) format(round(skewness(x,na.rm = TRUE),3), nsmall=3)),
Kurtosis=sapply(DQA.Predictors.Numeric, function(x) format(round(kurtosis(x,na.rm = TRUE),3), nsmall=3)),
Percentile25th=sapply(DQA.Predictors.Numeric, function(x) format(round(quantile(x,probs=0.25,na.rm = TRUE),3), nsmall=3)),
Percentile75th=sapply(DQA.Predictors.Numeric, function(x) format(round(quantile(x,probs=0.75,na.rm = TRUE),3), nsmall=3)),
row.names=NULL)
)
}
## Column.Name Column.Type Unique.Count Unique.Count.Ratio First.Mode.Value
## 1 V1 numeric 91 0.948 0.021
## 2 V2 numeric 93 0.969 0.019
## 3 V3 numeric 92 0.958 0.030
## 4 V4 numeric 90 0.938 0.061
## 5 V5 numeric 93 0.969 0.112
## 6 V6 numeric 94 0.979 0.152
## 7 V7 numeric 94 0.979 0.149
## 8 V8 numeric 95 0.990 0.168
## 9 V9 numeric 96 1.000 0.211
## 10 V10 numeric 96 1.000 0.360
## 11 V11 numeric 94 0.979 0.213
## 12 V12 numeric 96 1.000 0.520
## 13 V13 numeric 95 0.990 0.286
## 14 V14 numeric 93 0.969 0.290
## 15 V15 numeric 93 0.969 0.377
## 16 V16 numeric 96 1.000 0.870
## 17 V17 numeric 94 0.979 1.000
## 18 V18 numeric 95 0.990 0.243
## 19 V19 numeric 96 1.000 0.792
## 20 V20 numeric 93 0.969 0.769
## 21 V21 numeric 93 0.969 1.000
## 22 V22 numeric 94 0.979 1.000
## 23 V23 numeric 94 0.979 1.000
## 24 V24 numeric 94 0.979 1.000
## 25 V25 numeric 93 0.969 1.000
## 26 V26 numeric 91 0.948 1.000
## 27 V27 numeric 90 0.938 1.000
## 28 V28 numeric 88 0.917 1.000
## 29 V29 numeric 93 0.969 1.000
## 30 V30 numeric 93 0.969 1.000
## 31 V31 numeric 95 0.990 0.386
## 32 V32 numeric 96 1.000 0.189
## 33 V33 numeric 94 0.979 0.525
## 34 V34 numeric 96 1.000 0.196
## 35 V35 numeric 96 1.000 0.267
## 36 V36 numeric 95 0.990 0.233
## 37 V37 numeric 96 1.000 0.107
## 38 V38 numeric 96 1.000 0.202
## 39 V39 numeric 95 0.990 0.089
## 40 V40 numeric 95 0.990 0.443
## 41 V41 numeric 96 1.000 0.131
## 42 V42 numeric 96 1.000 0.178
## 43 V43 numeric 96 1.000 0.155
## 44 V44 numeric 95 0.990 0.192
## 45 V45 numeric 96 1.000 0.071
## 46 V46 numeric 95 0.990 0.096
## 47 V47 numeric 94 0.979 0.080
## 48 V48 numeric 96 1.000 0.076
## 49 V49 numeric 95 0.990 0.108
## 50 V50 numeric 83 0.865 0.018
## 51 V51 numeric 83 0.865 0.014
## 52 V52 numeric 83 0.865 0.009
## 53 V53 numeric 78 0.812 0.018
## 54 V54 numeric 79 0.823 0.011
## 55 V55 numeric 75 0.781 0.008
## 56 V56 numeric 79 0.823 0.003
## 57 V57 numeric 72 0.750 0.005
## 58 V58 numeric 71 0.740 0.010
## 59 V59 numeric 70 0.729 0.008
## 60 V60 numeric 70 0.729 0.003
## Second.Mode.Value First.Mode.Count Second.Mode.Count First.Second.Mode.Ratio
## 1 0.023 3 2 1.500
## 2 0.106 2 1 2.000
## 3 0.153 2 1 2.000
## 4 0.039 3 2 1.500
## 5 0.144 2 1 2.000
## 6 0.119 2 1 2.000
## 7 0.088 2 1 2.000
## 8 0.091 2 1 2.000
## 9 0.000 1 0 Inf
## 10 0.000 1 0 Inf
## 11 0.547 2 1 2.000
## 12 0.000 1 0 Inf
## 13 0.513 2 1 2.000
## 14 0.540 2 1 2.000
## 15 0.656 2 1 2.000
## 16 0.000 1 0 Inf
## 17 0.979 2 1 2.000
## 18 0.934 2 1 2.000
## 19 0.000 1 0 Inf
## 20 0.738 2 1 2.000
## 21 0.691 4 1 4.000
## 22 0.385 3 1 3.000
## 23 0.067 3 1 3.000
## 24 0.050 3 1 3.000
## 25 0.272 4 1 4.000
## 26 0.284 6 1 6.000
## 27 0.892 5 2 2.500
## 28 0.191 9 1 9.000
## 29 0.904 3 2 1.500
## 30 0.253 4 1 4.000
## 31 0.198 2 1 2.000
## 32 0.000 1 0 Inf
## 33 0.243 2 1 2.000
## 34 0.000 1 0 Inf
## 35 0.000 1 0 Inf
## 36 0.134 2 1 2.000
## 37 0.000 1 0 Inf
## 38 0.000 1 0 Inf
## 39 0.179 2 1 2.000
## 40 0.023 2 1 2.000
## 41 0.000 1 0 Inf
## 42 0.000 1 0 Inf
## 43 0.000 1 0 Inf
## 44 0.163 2 1 2.000
## 45 0.000 1 0 Inf
## 46 0.013 2 1 2.000
## 47 0.080 2 1 2.000
## 48 0.000 1 0 Inf
## 49 0.012 2 1 2.000
## 50 0.026 3 2 1.500
## 51 0.025 3 2 1.500
## 52 0.009 3 2 1.500
## 53 0.026 3 2 1.500
## 54 0.008 3 2 1.500
## 55 0.004 4 3 1.333
## 56 0.008 3 2 1.500
## 57 0.004 4 3 1.333
## 58 0.006 3 2 1.500
## 59 0.007 4 3 1.333
## 60 0.002 4 3 1.333
## Minimum Mean Median Maximum Skewness Kurtosis Percentile25th Percentile75th
## 1 0.002 0.032 0.023 0.137 1.792 6.412 0.014 0.040
## 2 0.001 0.044 0.034 0.157 1.257 4.025 0.019 0.059
## 3 0.002 0.047 0.039 0.166 1.468 5.010 0.024 0.062
## 4 0.006 0.056 0.049 0.164 1.121 3.944 0.030 0.072
## 5 0.007 0.082 0.074 0.248 0.841 3.808 0.045 0.109
## 6 0.010 0.118 0.113 0.382 1.173 5.889 0.078 0.150
## 7 0.018 0.133 0.130 0.373 0.754 4.295 0.094 0.168
## 8 0.012 0.151 0.136 0.457 1.233 4.917 0.095 0.191
## 9 0.008 0.204 0.182 0.683 1.542 6.658 0.130 0.260
## 10 0.011 0.233 0.212 0.596 0.912 3.670 0.142 0.294
## 11 0.053 0.266 0.252 0.668 0.727 3.824 0.193 0.334
## 12 0.024 0.280 0.278 0.568 0.325 2.699 0.184 0.350
## 13 0.062 0.302 0.293 0.713 0.544 3.116 0.212 0.373
## 14 0.027 0.314 0.290 0.997 1.098 4.726 0.185 0.405
## 15 0.009 0.319 0.275 0.914 0.891 3.188 0.167 0.440
## 16 0.042 0.375 0.320 0.975 0.810 2.804 0.191 0.533
## 17 0.037 0.414 0.316 1.000 0.678 2.281 0.209 0.647
## 18 0.038 0.447 0.373 0.934 0.480 1.909 0.243 0.673
## 19 0.132 0.513 0.446 0.983 0.265 1.756 0.296 0.731
## 20 0.066 0.586 0.622 1.000 -0.329 1.943 0.397 0.798
## 21 0.051 0.639 0.694 1.000 -0.553 2.254 0.441 0.845
## 22 0.022 0.636 0.702 1.000 -0.522 2.078 0.399 0.850
## 23 0.061 0.650 0.714 1.000 -0.642 2.271 0.453 0.869
## 24 0.050 0.680 0.698 1.000 -0.741 2.742 0.580 0.897
## 25 0.024 0.681 0.721 1.000 -0.824 3.019 0.569 0.875
## 26 0.164 0.708 0.756 1.000 -0.686 2.618 0.564 0.877
## 27 0.104 0.707 0.793 1.000 -0.682 2.297 0.496 0.911
## 28 0.060 0.708 0.776 1.000 -0.681 2.421 0.558 0.912
## 29 0.014 0.652 0.710 1.000 -0.601 2.411 0.467 0.867
## 30 0.061 0.587 0.603 1.000 -0.025 2.431 0.414 0.719
## 31 0.100 0.497 0.442 0.966 0.465 2.307 0.327 0.646
## 32 0.088 0.436 0.408 0.931 0.460 2.307 0.277 0.582
## 33 0.048 0.408 0.387 1.000 0.470 2.647 0.236 0.541
## 34 0.059 0.394 0.364 0.954 0.588 2.552 0.216 0.542
## 35 0.022 0.377 0.293 0.952 0.570 2.141 0.175 0.578
## 36 0.008 0.365 0.281 1.000 0.794 2.447 0.138 0.535
## 37 0.035 0.352 0.259 0.912 0.783 2.269 0.145 0.488
## 38 0.062 0.346 0.324 0.948 0.995 3.280 0.175 0.441
## 39 0.044 0.346 0.306 0.971 0.796 3.067 0.183 0.480
## 40 0.023 0.317 0.281 0.930 0.806 3.719 0.196 0.427
## 41 0.044 0.304 0.266 0.900 0.879 3.612 0.170 0.409
## 42 0.044 0.305 0.281 0.825 0.869 3.538 0.169 0.397
## 43 0.031 0.272 0.258 0.752 0.658 3.105 0.161 0.347
## 44 0.026 0.237 0.192 0.577 0.861 2.674 0.139 0.308
## 45 0.035 0.236 0.174 0.703 0.958 2.730 0.111 0.363
## 46 0.008 0.193 0.145 0.729 1.458 4.505 0.085 0.228
## 47 0.018 0.143 0.109 0.552 1.636 5.675 0.077 0.182
## 48 0.008 0.109 0.094 0.334 1.102 3.696 0.053 0.135
## 49 0.007 0.064 0.054 0.198 0.974 3.513 0.033 0.091
## 50 0.004 0.024 0.019 0.082 1.591 5.793 0.013 0.029
## 51 0.001 0.019 0.016 0.100 2.728 14.443 0.010 0.024
## 52 0.001 0.015 0.012 0.071 2.229 10.249 0.009 0.018
## 53 0.000 0.011 0.008 0.036 1.024 3.674 0.005 0.015
## 54 0.001 0.012 0.010 0.035 0.991 3.390 0.005 0.015
## 55 0.001 0.009 0.008 0.045 1.958 8.464 0.004 0.013
## 56 0.000 0.008 0.007 0.039 2.124 10.327 0.004 0.012
## 57 0.001 0.008 0.006 0.036 1.859 8.338 0.004 0.011
## 58 0.001 0.008 0.006 0.044 2.152 9.133 0.004 0.010
## 59 0.000 0.008 0.007 0.029 1.280 4.224 0.004 0.011
## 60 0.001 0.006 0.005 0.022 1.381 5.177 0.003 0.008
##################################
# Identifying potential data quality issues
##################################
##################################
# Checking for missing observations
##################################
if ((nrow(DQA.Summary[DQA.Summary$NA.Count>0,]))>0){
print(paste0("Missing observations noted for ",
(nrow(DQA.Summary[DQA.Summary$NA.Count>0,])),
" variable(s) with NA.Count>0 and Fill.Rate<1.0."))
DQA.Summary[DQA.Summary$NA.Count>0,]
} else {
print("No missing observations noted.")
}
## [1] "No missing observations noted."
##################################
# Checking for zero or near-zero variance predictors
##################################
if (length(names(DQA.Predictors.Factor))==0) {
print("No factor predictors noted.")
} else if (nrow(DQA.Predictors.Factor.Summary[as.numeric(as.character(DQA.Predictors.Factor.Summary$First.Second.Mode.Ratio))>5,])>0){
print(paste0("Low variance observed for ",
(nrow(DQA.Predictors.Factor.Summary[as.numeric(as.character(DQA.Predictors.Factor.Summary$First.Second.Mode.Ratio))>5,])),
" factor variable(s) with First.Second.Mode.Ratio>5."))
DQA.Predictors.Factor.Summary[as.numeric(as.character(DQA.Predictors.Factor.Summary$First.Second.Mode.Ratio))>5,]
} else {
print("No low variance factor predictors due to high first-second mode ratio noted.")
}
## [1] "No factor predictors noted."
if (length(names(DQA.Predictors.Numeric))==0) {
print("No numeric predictors noted.")
} else if (nrow(DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$First.Second.Mode.Ratio))>5,])>0){
print(paste0("Low variance observed for ",
(nrow(DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$First.Second.Mode.Ratio))>5,])),
" numeric variable(s) with First.Second.Mode.Ratio>5."))
DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$First.Second.Mode.Ratio))>5,]
} else {
print("No low variance numeric predictors due to high first-second mode ratio noted.")
}
## [1] "Low variance observed for 17 numeric variable(s) with First.Second.Mode.Ratio>5."
## Column.Name Column.Type Unique.Count Unique.Count.Ratio First.Mode.Value
## 9 V9 numeric 96 1.000 0.211
## 10 V10 numeric 96 1.000 0.360
## 12 V12 numeric 96 1.000 0.520
## 16 V16 numeric 96 1.000 0.870
## 19 V19 numeric 96 1.000 0.792
## 26 V26 numeric 91 0.948 1.000
## 28 V28 numeric 88 0.917 1.000
## 32 V32 numeric 96 1.000 0.189
## 34 V34 numeric 96 1.000 0.196
## 35 V35 numeric 96 1.000 0.267
## 37 V37 numeric 96 1.000 0.107
## 38 V38 numeric 96 1.000 0.202
## 41 V41 numeric 96 1.000 0.131
## 42 V42 numeric 96 1.000 0.178
## 43 V43 numeric 96 1.000 0.155
## 45 V45 numeric 96 1.000 0.071
## 48 V48 numeric 96 1.000 0.076
## Second.Mode.Value First.Mode.Count Second.Mode.Count First.Second.Mode.Ratio
## 9 0.000 1 0 Inf
## 10 0.000 1 0 Inf
## 12 0.000 1 0 Inf
## 16 0.000 1 0 Inf
## 19 0.000 1 0 Inf
## 26 0.284 6 1 6.000
## 28 0.191 9 1 9.000
## 32 0.000 1 0 Inf
## 34 0.000 1 0 Inf
## 35 0.000 1 0 Inf
## 37 0.000 1 0 Inf
## 38 0.000 1 0 Inf
## 41 0.000 1 0 Inf
## 42 0.000 1 0 Inf
## 43 0.000 1 0 Inf
## 45 0.000 1 0 Inf
## 48 0.000 1 0 Inf
## Minimum Mean Median Maximum Skewness Kurtosis Percentile25th Percentile75th
## 9 0.008 0.204 0.182 0.683 1.542 6.658 0.130 0.260
## 10 0.011 0.233 0.212 0.596 0.912 3.670 0.142 0.294
## 12 0.024 0.280 0.278 0.568 0.325 2.699 0.184 0.350
## 16 0.042 0.375 0.320 0.975 0.810 2.804 0.191 0.533
## 19 0.132 0.513 0.446 0.983 0.265 1.756 0.296 0.731
## 26 0.164 0.708 0.756 1.000 -0.686 2.618 0.564 0.877
## 28 0.060 0.708 0.776 1.000 -0.681 2.421 0.558 0.912
## 32 0.088 0.436 0.408 0.931 0.460 2.307 0.277 0.582
## 34 0.059 0.394 0.364 0.954 0.588 2.552 0.216 0.542
## 35 0.022 0.377 0.293 0.952 0.570 2.141 0.175 0.578
## 37 0.035 0.352 0.259 0.912 0.783 2.269 0.145 0.488
## 38 0.062 0.346 0.324 0.948 0.995 3.280 0.175 0.441
## 41 0.044 0.304 0.266 0.900 0.879 3.612 0.170 0.409
## 42 0.044 0.305 0.281 0.825 0.869 3.538 0.169 0.397
## 43 0.031 0.272 0.258 0.752 0.658 3.105 0.161 0.347
## 45 0.035 0.236 0.174 0.703 0.958 2.730 0.111 0.363
## 48 0.008 0.109 0.094 0.334 1.102 3.696 0.053 0.135
if (length(names(DQA.Predictors.Numeric))==0) {
print("No numeric predictors noted.")
} else if (nrow(DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$Unique.Count.Ratio))<0.01,])>0){
print(paste0("Low variance observed for ",
(nrow(DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$Unique.Count.Ratio))<0.01,])),
" numeric variable(s) with Unique.Count.Ratio<0.01."))
DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$Unique.Count.Ratio))<0.01,]
} else {
print("No low variance numeric predictors due to low unique count ratio noted.")
}
## [1] "No low variance numeric predictors due to low unique count ratio noted."
##################################
# Checking for skewed predictors
##################################
if (length(names(DQA.Predictors.Numeric))==0) {
print("No numeric predictors noted.")
} else if (nrow(DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness))>3 |
as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness))<(-3),])>0){
print(paste0("High skewness observed for ",
(nrow(DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness))>3 |
as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness))<(-3),])),
" numeric variable(s) with Skewness>3 or Skewness<(-3)."))
DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness))>3 |
as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness))<(-3),]
} else {
print("No skewed numeric predictors noted.")
}
## [1] "No skewed numeric predictors noted."
1.3 Data Preprocessing
1.3.1 Outlier
Outlier data assessment:
[A] Outliers noted for 38 variables, based on boxplots of the numeric data in which observations are classified as suspected outliers using the IQR criterion (see the sketch after this list). Under the IQR criterion, all observations above the (75th percentile + 1.5 x IQR) or below the (25th percentile - 1.5 x IQR) are suspected outliers, where IQR is the difference between the third quartile (75th percentile) and the first quartile (25th percentile). Outlier treatment for numerical stability remains optional, depending on the requirements of the models applied in subsequent steps.
[A.1] V1 variable (5 outliers detected)
[A.2] V2 variable (6 outliers detected)
[A.3] V3 variable (6 outliers detected)
[A.4] V4 variable (5 outliers detected)
[A.5] V5 variable (2 outliers detected)
[A.6] V6 variable (4 outliers detected)
[A.7] V7 variable (2 outliers detected)
[A.8] V8 variable (6 outliers detected)
[A.9] V9 variable (4 outliers detected)
[A.10] V10 variable (4 outliers detected)
[A.11] V11 variable (2 outliers detected)
[A.12] V13 variable (1 outlier detected)
[A.13] V14 variable (2 outliers detected)
[A.14] V15 variable (2 outliers detected)
[A.15] V24 variable (1 outlier detected)
[A.16] V25 variable (3 outliers detected)
[A.17] V38 variable (5 outliers detected)
[A.18] V39 variable (1 outlier detected)
[A.19] V40 variable (1 outlier detected)
[A.20] V41 variable (1 outlier detected)
[A.21] V42 variable (3 outliers detected)
[A.22] V43 variable (1 outlier detected)
[A.23] V44 variable (2 outliers detected)
[A.24] V46 variable (8 outliers detected)
[A.25] V47 variable (6 outliers detected)
[A.26] V48 variable (6 outliers detected)
[A.27] V49 variable (1 outlier detected)
[A.28] V50 variable (4 outliers detected)
[A.29] V51 variable (4 outliers detected)
[A.30] V52 variable (5 outliers detected)
[A.31] V53 variable (2 outliers detected)
[A.32] V54 variable (4 outliers detected)
[A.33] V55 variable (3 outliers detected)
[A.34] V56 variable (2 outliers detected)
[A.35] V57 variable (3 outliers detected)
[A.36] V58 variable (6 outliers detected)
[A.37] V59 variable (6 outliers detected)
[A.38] V60 variable (3 outliers detected)
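As referenced in [A], the sketch below applies the IQR fences manually to a single predictor and compares the resulting count against boxplot.stats(), which implements the same rule but derives its fences from Tukey's hinges, so the two counts can differ slightly for some variables. This is an illustrative check, not part of the original pipeline.
##################################
# Minimal sketch: applying the IQR criterion
# manually to a single predictor (V1)
##################################
x <- Sonar_Train$V1
Q1 <- quantile(x, probs = 0.25, na.rm = TRUE)
Q3 <- quantile(x, probs = 0.75, na.rm = TRUE)
IQRValue <- Q3 - Q1
LowerFence <- Q1 - 1.5*IQRValue
UpperFence <- Q3 + 1.5*IQRValue
# Observations outside the fences are suspected outliers
sum(x < LowerFence | x > UpperFence)
# boxplot.stats() applies the same rule using Tukey's hinges
length(boxplot.stats(x)$out)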
##################################
# Loading dataset
##################################
DPA <- Sonar_Train
##################################
# Listing all predictors
##################################
DPA.Predictors <- DPA[,!names(DPA) %in% c("Class")]
##################################
# Listing all numeric predictors
##################################
DPA.Predictors.Numeric <- DPA.Predictors[,sapply(DPA.Predictors, is.numeric)]
##################################
# Identifying outliers for the numeric predictors
##################################
OutlierCountList <- c()
for (i in 1:ncol(DPA.Predictors.Numeric)) {
Outliers <- boxplot.stats(DPA.Predictors.Numeric[,i])$out
OutlierCount <- length(Outliers)
OutlierCountList <- append(OutlierCountList,OutlierCount)
OutlierIndices <- which(DPA.Predictors.Numeric[,i] %in% c(Outliers))
boxplot(DPA.Predictors.Numeric[,i],
ylab = names(DPA.Predictors.Numeric)[i],
main = names(DPA.Predictors.Numeric)[i],
horizontal=TRUE)
mtext(paste0(OutlierCount, " Outlier(s) Detected"))
}
OutlierCountSummary <- as.data.frame(cbind(names(DPA.Predictors.Numeric),(OutlierCountList)))
names(OutlierCountSummary) <- c("NumericPredictors","OutlierCount")
OutlierCountSummary$OutlierCount <- as.numeric(as.character(OutlierCountSummary$OutlierCount))
NumericPredictorWithOutlierCount <- nrow(OutlierCountSummary[OutlierCountSummary$OutlierCount>0,])
print(paste0(NumericPredictorWithOutlierCount, " numeric variable(s) were noted with outlier(s)." ))
## [1] "38 numeric variable(s) were noted with outlier(s)."
##################################
# Gathering descriptive statistics
##################################
(DPA_Skimmed <- skim(DPA.Predictors.Numeric))
Data summary

| Name | DPA.Predictors.Numeric |
| Number of rows | 96 |
| Number of columns | 60 |
| Column type frequency: | |
| numeric | 60 |
| Group variables | None |

Variable type: numeric

| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
| V1 | 0 | 1 | 0.03 | 0.03 | 0.00 | 0.01 | 0.02 | 0.04 | 0.14 | ▇▃▂▁▁ |
| V2 | 0 | 1 | 0.04 | 0.03 | 0.00 | 0.02 | 0.03 | 0.06 | 0.16 | ▇▆▂▁▁ |
| V3 | 0 | 1 | 0.05 | 0.03 | 0.00 | 0.02 | 0.04 | 0.06 | 0.17 | ▇▆▂▁▁ |
| V4 | 0 | 1 | 0.06 | 0.04 | 0.01 | 0.03 | 0.05 | 0.07 | 0.16 | ▇▇▃▁▁ |
| V5 | 0 | 1 | 0.08 | 0.05 | 0.01 | 0.05 | 0.07 | 0.11 | 0.25 | ▇▇▅▁▁ |
| V6 | 0 | 1 | 0.12 | 0.06 | 0.01 | 0.08 | 0.11 | 0.15 | 0.38 | ▅▇▂▁▁ |
| V7 | 0 | 1 | 0.13 | 0.06 | 0.02 | 0.09 | 0.13 | 0.17 | 0.37 | ▃▇▃▁▁ |
| V8 | 0 | 1 | 0.15 | 0.09 | 0.01 | 0.10 | 0.14 | 0.19 | 0.46 | ▆▇▃▁▁ |
| V9 | 0 | 1 | 0.20 | 0.12 | 0.01 | 0.13 | 0.18 | 0.26 | 0.68 | ▅▇▃▁▁ |
| V10 | 0 | 1 | 0.23 | 0.13 | 0.01 | 0.14 | 0.21 | 0.29 | 0.60 | ▃▇▅▂▁ |
| V11 | 0 | 1 | 0.27 | 0.12 | 0.05 | 0.19 | 0.25 | 0.33 | 0.67 | ▅▇▅▂▁ |
| V12 | 0 | 1 | 0.28 | 0.13 | 0.02 | 0.18 | 0.28 | 0.35 | 0.57 | ▃▅▇▃▂ |
| V13 | 0 | 1 | 0.30 | 0.13 | 0.06 | 0.21 | 0.29 | 0.37 | 0.71 | ▃▇▅▂▁ |
| V14 | 0 | 1 | 0.31 | 0.17 | 0.03 | 0.19 | 0.29 | 0.41 | 1.00 | ▇▇▃▁▁ |
| V15 | 0 | 1 | 0.32 | 0.21 | 0.01 | 0.17 | 0.28 | 0.44 | 0.91 | ▇▅▆▁▁ |
| V16 | 0 | 1 | 0.38 | 0.23 | 0.04 | 0.19 | 0.32 | 0.53 | 0.98 | ▇▅▅▂▂ |
| V17 | 0 | 1 | 0.41 | 0.25 | 0.04 | 0.21 | 0.32 | 0.65 | 1.00 | ▇▇▃▅▂ |
| V18 | 0 | 1 | 0.45 | 0.25 | 0.04 | 0.24 | 0.37 | 0.67 | 0.93 | ▃▇▂▃▃ |
| V19 | 0 | 1 | 0.51 | 0.25 | 0.13 | 0.30 | 0.45 | 0.73 | 0.98 | ▇▇▃▅▅ |
| V20 | 0 | 1 | 0.59 | 0.26 | 0.07 | 0.40 | 0.62 | 0.80 | 1.00 | ▅▆▅▇▆ |
| V21 | 0 | 1 | 0.64 | 0.27 | 0.05 | 0.44 | 0.69 | 0.84 | 1.00 | ▂▃▅▆▇ |
| V22 | 0 | 1 | 0.64 | 0.27 | 0.02 | 0.40 | 0.70 | 0.85 | 1.00 | ▂▃▂▆▇ |
| V23 | 0 | 1 | 0.65 | 0.26 | 0.06 | 0.45 | 0.71 | 0.87 | 1.00 | ▂▂▃▅▇ |
| V24 | 0 | 1 | 0.68 | 0.24 | 0.05 | 0.58 | 0.70 | 0.90 | 1.00 | ▂▂▃▇▇ |
| V25 | 0 | 1 | 0.68 | 0.25 | 0.02 | 0.57 | 0.72 | 0.87 | 1.00 | ▂▂▃▇▇ |
| V26 | 0 | 1 | 0.71 | 0.22 | 0.16 | 0.56 | 0.76 | 0.88 | 1.00 | ▂▂▅▇▇ |
| V27 | 0 | 1 | 0.71 | 0.25 | 0.10 | 0.50 | 0.79 | 0.91 | 1.00 | ▁▂▂▃▇ |
| V28 | 0 | 1 | 0.71 | 0.25 | 0.06 | 0.56 | 0.78 | 0.91 | 1.00 | ▁▂▃▅▇ |
| V29 | 0 | 1 | 0.65 | 0.25 | 0.01 | 0.47 | 0.71 | 0.87 | 1.00 | ▂▂▅▅▇ |
| V30 | 0 | 1 | 0.59 | 0.22 | 0.06 | 0.41 | 0.60 | 0.72 | 1.00 | ▁▆▇▇▅ |
| V31 | 0 | 1 | 0.50 | 0.23 | 0.10 | 0.33 | 0.44 | 0.65 | 0.97 | ▃▇▅▃▃ |
| V32 | 0 | 1 | 0.44 | 0.22 | 0.09 | 0.28 | 0.41 | 0.58 | 0.93 | ▆▇▆▃▂ |
| V33 | 0 | 1 | 0.41 | 0.21 | 0.05 | 0.24 | 0.39 | 0.54 | 1.00 | ▆▇▇▃▁ |
| V34 | 0 | 1 | 0.39 | 0.22 | 0.06 | 0.22 | 0.36 | 0.54 | 0.95 | ▇▇▃▅▂ |
| V35 | 0 | 1 | 0.38 | 0.26 | 0.02 | 0.17 | 0.29 | 0.58 | 0.95 | ▇▆▃▃▂ |
| V36 | 0 | 1 | 0.36 | 0.28 | 0.01 | 0.14 | 0.28 | 0.53 | 1.00 | ▇▅▂▂▂ |
| V37 | 0 | 1 | 0.35 | 0.26 | 0.04 | 0.14 | 0.26 | 0.49 | 0.91 | ▇▃▂▁▃ |
| V38 | 0 | 1 | 0.35 | 0.22 | 0.06 | 0.17 | 0.32 | 0.44 | 0.95 | ▇▆▂▂▁ |
| V39 | 0 | 1 | 0.35 | 0.20 | 0.04 | 0.18 | 0.31 | 0.48 | 0.97 | ▇▇▅▂▁ |
| V40 | 0 | 1 | 0.32 | 0.18 | 0.02 | 0.20 | 0.28 | 0.43 | 0.93 | ▅▇▃▁▁ |
| V41 | 0 | 1 | 0.30 | 0.17 | 0.04 | 0.17 | 0.27 | 0.41 | 0.90 | ▇▇▃▂▁ |
| V42 | 0 | 1 | 0.31 | 0.18 | 0.04 | 0.17 | 0.28 | 0.40 | 0.82 | ▇▇▆▁▁ |
| V43 | 0 | 1 | 0.27 | 0.15 | 0.03 | 0.16 | 0.26 | 0.35 | 0.75 | ▆▇▅▂▁ |
| V44 | 0 | 1 | 0.24 | 0.14 | 0.03 | 0.14 | 0.19 | 0.31 | 0.58 | ▅▇▂▂▂ |
| V45 | 0 | 1 | 0.24 | 0.17 | 0.04 | 0.11 | 0.17 | 0.36 | 0.70 | ▇▃▂▂▁ |
| V46 | 0 | 1 | 0.19 | 0.16 | 0.01 | 0.08 | 0.14 | 0.23 | 0.73 | ▇▅▂▁▁ |
| V47 | 0 | 1 | 0.14 | 0.10 | 0.02 | 0.08 | 0.11 | 0.18 | 0.55 | ▇▃▁▁▁ |
| V48 | 0 | 1 | 0.11 | 0.07 | 0.01 | 0.05 | 0.09 | 0.14 | 0.33 | ▆▇▂▁▁ |
| V49 | 0 | 1 | 0.06 | 0.04 | 0.01 | 0.03 | 0.05 | 0.09 | 0.20 | ▇▇▃▂▁ |
| V50 | 0 | 1 | 0.02 | 0.02 | 0.00 | 0.01 | 0.02 | 0.03 | 0.08 | ▇▅▂▁▁ |
| V51 | 0 | 1 | 0.02 | 0.01 | 0.00 | 0.01 | 0.02 | 0.02 | 0.10 | ▇▃▁▁▁ |
| V52 | 0 | 1 | 0.02 | 0.01 | 0.00 | 0.01 | 0.01 | 0.02 | 0.07 | ▇▃▁▁▁ |
| V53 | 0 | 1 | 0.01 | 0.01 | 0.00 | 0.00 | 0.01 | 0.02 | 0.04 | ▇▃▃▁▁ |
| V54 | 0 | 1 | 0.01 | 0.01 | 0.00 | 0.01 | 0.01 | 0.02 | 0.04 | ▆▇▂▂▁ |
| V55 | 0 | 1 | 0.01 | 0.01 | 0.00 | 0.00 | 0.01 | 0.01 | 0.04 | ▇▃▁▁▁ |
| V56 | 0 | 1 | 0.01 | 0.01 | 0.00 | 0.00 | 0.01 | 0.01 | 0.04 | ▇▅▁▁▁ |
| V57 | 0 | 1 | 0.01 | 0.01 | 0.00 | 0.00 | 0.01 | 0.01 | 0.04 | ▇▅▁▁▁ |
| V58 | 0 | 1 | 0.01 | 0.01 | 0.00 | 0.00 | 0.01 | 0.01 | 0.04 | ▇▂▁▁▁ |
| V59 | 0 | 1 | 0.01 | 0.01 | 0.00 | 0.00 | 0.01 | 0.01 | 0.03 | ▇▇▂▁▁ |
| V60 | 0 | 1 | 0.01 | 0.00 | 0.00 | 0.00 | 0.01 | 0.01 | 0.02 | ▇▆▂▁▁ |
###################################
# Verifying the data dimensions
###################################
dim(DPA.Predictors.Numeric)
## [1] 96 60
1.3.2 Zero and Near-Zero Variance
Zero and near-zero variance data assessment:
[A] Low variance noted for 17 variables from the previous data quality assessment, which used a lower threshold.
[B] No low variance noted for any variable using the preprocessing summary from the caret package. The nearZeroVar method, with the freqCut and uniqueCut criteria set at 95/5 and 10, respectively, was applied to the dataset (see the sketch after this list for how these criteria translate to per-variable quantities).
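As noted in [B], nearZeroVar flags a predictor only when its frequency ratio exceeds freqCut and its percentage of unique values falls at or below uniqueCut. The sketch below computes both quantities manually for V1 so the 95/5 and 10 thresholds can be interpreted directly; it is an illustrative check, not part of the original pipeline.
##################################
# Minimal sketch: manually computing the
# nearZeroVar screening quantities for V1
##################################
x <- Sonar_Train$V1
ValueCounts <- sort(table(x), decreasing = TRUE)
FreqRatio <- as.numeric(ValueCounts[1])/as.numeric(ValueCounts[2])
PercentUnique <- 100*length(unique(x))/length(x)
# Flagged as near-zero variance only when FreqRatio > 95/5
# and PercentUnique is at or below 10
c(freqRatio = FreqRatio, percentUnique = PercentUnique)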
##################################
# Loading dataset
##################################
DPA <- Sonar_Train
##################################
# Gathering descriptive statistics
##################################
(DPA_Skimmed <- skim(DPA))
Data summary

| Name | DPA |
| Number of rows | 96 |
| Number of columns | 61 |
| Column type frequency: | |
| factor | 1 |
| numeric | 60 |
| Group variables | None |

Variable type: factor

| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
| Class | 0 | 1 | FALSE | 2 | M: 78, R: 18 |

Variable type: numeric

| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
| V1 | 0 | 1 | 0.03 | 0.03 | 0.00 | 0.01 | 0.02 | 0.04 | 0.14 | ▇▃▂▁▁ |
| V2 | 0 | 1 | 0.04 | 0.03 | 0.00 | 0.02 | 0.03 | 0.06 | 0.16 | ▇▆▂▁▁ |
| V3 | 0 | 1 | 0.05 | 0.03 | 0.00 | 0.02 | 0.04 | 0.06 | 0.17 | ▇▆▂▁▁ |
| V4 | 0 | 1 | 0.06 | 0.04 | 0.01 | 0.03 | 0.05 | 0.07 | 0.16 | ▇▇▃▁▁ |
| V5 | 0 | 1 | 0.08 | 0.05 | 0.01 | 0.05 | 0.07 | 0.11 | 0.25 | ▇▇▅▁▁ |
| V6 | 0 | 1 | 0.12 | 0.06 | 0.01 | 0.08 | 0.11 | 0.15 | 0.38 | ▅▇▂▁▁ |
| V7 | 0 | 1 | 0.13 | 0.06 | 0.02 | 0.09 | 0.13 | 0.17 | 0.37 | ▃▇▃▁▁ |
| V8 | 0 | 1 | 0.15 | 0.09 | 0.01 | 0.10 | 0.14 | 0.19 | 0.46 | ▆▇▃▁▁ |
| V9 | 0 | 1 | 0.20 | 0.12 | 0.01 | 0.13 | 0.18 | 0.26 | 0.68 | ▅▇▃▁▁ |
| V10 | 0 | 1 | 0.23 | 0.13 | 0.01 | 0.14 | 0.21 | 0.29 | 0.60 | ▃▇▅▂▁ |
| V11 | 0 | 1 | 0.27 | 0.12 | 0.05 | 0.19 | 0.25 | 0.33 | 0.67 | ▅▇▅▂▁ |
| V12 | 0 | 1 | 0.28 | 0.13 | 0.02 | 0.18 | 0.28 | 0.35 | 0.57 | ▃▅▇▃▂ |
| V13 | 0 | 1 | 0.30 | 0.13 | 0.06 | 0.21 | 0.29 | 0.37 | 0.71 | ▃▇▅▂▁ |
| V14 | 0 | 1 | 0.31 | 0.17 | 0.03 | 0.19 | 0.29 | 0.41 | 1.00 | ▇▇▃▁▁ |
| V15 | 0 | 1 | 0.32 | 0.21 | 0.01 | 0.17 | 0.28 | 0.44 | 0.91 | ▇▅▆▁▁ |
| V16 | 0 | 1 | 0.38 | 0.23 | 0.04 | 0.19 | 0.32 | 0.53 | 0.98 | ▇▅▅▂▂ |
| V17 | 0 | 1 | 0.41 | 0.25 | 0.04 | 0.21 | 0.32 | 0.65 | 1.00 | ▇▇▃▅▂ |
| V18 | 0 | 1 | 0.45 | 0.25 | 0.04 | 0.24 | 0.37 | 0.67 | 0.93 | ▃▇▂▃▃ |
| V19 | 0 | 1 | 0.51 | 0.25 | 0.13 | 0.30 | 0.45 | 0.73 | 0.98 | ▇▇▃▅▅ |
| V20 | 0 | 1 | 0.59 | 0.26 | 0.07 | 0.40 | 0.62 | 0.80 | 1.00 | ▅▆▅▇▆ |
| V21 | 0 | 1 | 0.64 | 0.27 | 0.05 | 0.44 | 0.69 | 0.84 | 1.00 | ▂▃▅▆▇ |
| V22 | 0 | 1 | 0.64 | 0.27 | 0.02 | 0.40 | 0.70 | 0.85 | 1.00 | ▂▃▂▆▇ |
| V23 | 0 | 1 | 0.65 | 0.26 | 0.06 | 0.45 | 0.71 | 0.87 | 1.00 | ▂▂▃▅▇ |
| V24 | 0 | 1 | 0.68 | 0.24 | 0.05 | 0.58 | 0.70 | 0.90 | 1.00 | ▂▂▃▇▇ |
| V25 | 0 | 1 | 0.68 | 0.25 | 0.02 | 0.57 | 0.72 | 0.87 | 1.00 | ▂▂▃▇▇ |
| V26 | 0 | 1 | 0.71 | 0.22 | 0.16 | 0.56 | 0.76 | 0.88 | 1.00 | ▂▂▅▇▇ |
| V27 | 0 | 1 | 0.71 | 0.25 | 0.10 | 0.50 | 0.79 | 0.91 | 1.00 | ▁▂▂▃▇ |
| V28 | 0 | 1 | 0.71 | 0.25 | 0.06 | 0.56 | 0.78 | 0.91 | 1.00 | ▁▂▃▅▇ |
| V29 | 0 | 1 | 0.65 | 0.25 | 0.01 | 0.47 | 0.71 | 0.87 | 1.00 | ▂▂▅▅▇ |
| V30 | 0 | 1 | 0.59 | 0.22 | 0.06 | 0.41 | 0.60 | 0.72 | 1.00 | ▁▆▇▇▅ |
| V31 | 0 | 1 | 0.50 | 0.23 | 0.10 | 0.33 | 0.44 | 0.65 | 0.97 | ▃▇▅▃▃ |
| V32 | 0 | 1 | 0.44 | 0.22 | 0.09 | 0.28 | 0.41 | 0.58 | 0.93 | ▆▇▆▃▂ |
| V33 | 0 | 1 | 0.41 | 0.21 | 0.05 | 0.24 | 0.39 | 0.54 | 1.00 | ▆▇▇▃▁ |
| V34 | 0 | 1 | 0.39 | 0.22 | 0.06 | 0.22 | 0.36 | 0.54 | 0.95 | ▇▇▃▅▂ |
| V35 | 0 | 1 | 0.38 | 0.26 | 0.02 | 0.17 | 0.29 | 0.58 | 0.95 | ▇▆▃▃▂ |
| V36 | 0 | 1 | 0.36 | 0.28 | 0.01 | 0.14 | 0.28 | 0.53 | 1.00 | ▇▅▂▂▂ |
| V37 | 0 | 1 | 0.35 | 0.26 | 0.04 | 0.14 | 0.26 | 0.49 | 0.91 | ▇▃▂▁▃ |
| V38 | 0 | 1 | 0.35 | 0.22 | 0.06 | 0.17 | 0.32 | 0.44 | 0.95 | ▇▆▂▂▁ |
| V39 | 0 | 1 | 0.35 | 0.20 | 0.04 | 0.18 | 0.31 | 0.48 | 0.97 | ▇▇▅▂▁ |
| V40 | 0 | 1 | 0.32 | 0.18 | 0.02 | 0.20 | 0.28 | 0.43 | 0.93 | ▅▇▃▁▁ |
| V41 | 0 | 1 | 0.30 | 0.17 | 0.04 | 0.17 | 0.27 | 0.41 | 0.90 | ▇▇▃▂▁ |
| V42 | 0 | 1 | 0.31 | 0.18 | 0.04 | 0.17 | 0.28 | 0.40 | 0.82 | ▇▇▆▁▁ |
| V43 | 0 | 1 | 0.27 | 0.15 | 0.03 | 0.16 | 0.26 | 0.35 | 0.75 | ▆▇▅▂▁ |
| V44 | 0 | 1 | 0.24 | 0.14 | 0.03 | 0.14 | 0.19 | 0.31 | 0.58 | ▅▇▂▂▂ |
| V45 | 0 | 1 | 0.24 | 0.17 | 0.04 | 0.11 | 0.17 | 0.36 | 0.70 | ▇▃▂▂▁ |
| V46 | 0 | 1 | 0.19 | 0.16 | 0.01 | 0.08 | 0.14 | 0.23 | 0.73 | ▇▅▂▁▁ |
| V47 | 0 | 1 | 0.14 | 0.10 | 0.02 | 0.08 | 0.11 | 0.18 | 0.55 | ▇▃▁▁▁ |
| V48 | 0 | 1 | 0.11 | 0.07 | 0.01 | 0.05 | 0.09 | 0.14 | 0.33 | ▆▇▂▁▁ |
| V49 | 0 | 1 | 0.06 | 0.04 | 0.01 | 0.03 | 0.05 | 0.09 | 0.20 | ▇▇▃▂▁ |
| V50 | 0 | 1 | 0.02 | 0.02 | 0.00 | 0.01 | 0.02 | 0.03 | 0.08 | ▇▅▂▁▁ |
| V51 | 0 | 1 | 0.02 | 0.01 | 0.00 | 0.01 | 0.02 | 0.02 | 0.10 | ▇▃▁▁▁ |
| V52 | 0 | 1 | 0.02 | 0.01 | 0.00 | 0.01 | 0.01 | 0.02 | 0.07 | ▇▃▁▁▁ |
| V53 | 0 | 1 | 0.01 | 0.01 | 0.00 | 0.00 | 0.01 | 0.02 | 0.04 | ▇▃▃▁▁ |
| V54 | 0 | 1 | 0.01 | 0.01 | 0.00 | 0.01 | 0.01 | 0.02 | 0.04 | ▆▇▂▂▁ |
| V55 | 0 | 1 | 0.01 | 0.01 | 0.00 | 0.00 | 0.01 | 0.01 | 0.04 | ▇▃▁▁▁ |
| V56 | 0 | 1 | 0.01 | 0.01 | 0.00 | 0.00 | 0.01 | 0.01 | 0.04 | ▇▅▁▁▁ |
| V57 | 0 | 1 | 0.01 | 0.01 | 0.00 | 0.00 | 0.01 | 0.01 | 0.04 | ▇▅▁▁▁ |
| V58 | 0 | 1 | 0.01 | 0.01 | 0.00 | 0.00 | 0.01 | 0.01 | 0.04 | ▇▂▁▁▁ |
| V59 | 0 | 1 | 0.01 | 0.01 | 0.00 | 0.00 | 0.01 | 0.01 | 0.03 | ▇▇▂▁▁ |
| V60 | 0 | 1 | 0.01 | 0.00 | 0.00 | 0.00 | 0.01 | 0.01 | 0.02 | ▇▆▂▁▁ |
##################################
# Identifying columns with low variance
###################################
DPA_LowVariance <- nearZeroVar(DPA,
freqCut = 95/5,
uniqueCut = 10,
saveMetrics= TRUE)
(DPA_LowVariance[DPA_LowVariance$nzv,])
## [1] freqRatio percentUnique zeroVar nzv
## <0 rows> (or 0-length row.names)
if ((nrow(DPA_LowVariance[DPA_LowVariance$nzv,]))==0){
print("No low variance predictors noted.")
} else {
print(paste0("Low variance observed for ",
(nrow(DPA_LowVariance[DPA_LowVariance$nzv,])),
" numeric variable(s) with First.Second.Mode.Ratio>4 and Unique.Count.Ratio<0.10."))
DPA_LowVarianceForRemoval <- (nrow(DPA_LowVariance[DPA_LowVariance$nzv,]))
print(paste0("Low variance can be resolved by removing ",
(nrow(DPA_LowVariance[DPA_LowVariance$nzv,])),
" numeric variable(s)."))
for (j in 1:DPA_LowVarianceForRemoval) {
DPA_LowVarianceRemovedVariable <- rownames(DPA_LowVariance[DPA_LowVariance$nzv,])[j]
print(paste0("Variable ",
j,
" for removal: ",
DPA_LowVarianceRemovedVariable))
}
DPA %>%
skim() %>%
dplyr::filter(skim_variable %in% rownames(DPA_LowVariance[DPA_LowVariance$nzv,]))
##################################
# Filtering out columns with low variance
#################################
DPA_ExcludedLowVariance <- DPA[,!names(DPA) %in% rownames(DPA_LowVariance[DPA_LowVariance$nzv,])]
##################################
# Gathering descriptive statistics
##################################
(DPA_ExcludedLowVariance_Skimmed <- skim(DPA_ExcludedLowVariance))
###################################
# Verifying the data dimensions
###################################
dim(DPA_ExcludedLowVariance)
}
## [1] "No low variance predictors noted."
1.3.3 Collinearity
High collinearity data assessment:
[A] No pairwise correlations greater than 95% were noted for any variable, as confirmed using the pre-processing summaries from the caret and lares packages (a toy illustration of the correlation filter follows below).
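As a hedged toy illustration of the correlation filter referenced in [A] (hypothetical data, not part of the analysis), findCorrelation flags one member of any pair whose absolute Pearson correlation exceeds the cutoff:
##################################
# Toy illustration only (hypothetical data):
# findCorrelation flags one column of a nearly collinear pair
##################################
set.seed(12345678)
ToyPredictors <- data.frame(A = rnorm(100), C = rnorm(100))
ToyPredictors$B <- ToyPredictors$A + rnorm(100, sd = 0.01)   # near-duplicate of A
ToyCorrelation <- cor(ToyPredictors, method = "pearson")
findCorrelation(ToyCorrelation, cutoff = 0.95, names = TRUE)
## expected to return one of the near-duplicate pair (A or B) for removal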
##################################
# Loading dataset
##################################
DPA <- Sonar_Train
##################################
# Listing all predictors
##################################
DPA.Predictors <- DPA[,!names(DPA) %in% c("Class")]
##################################
# Listing all numeric predictors
##################################
DPA.Predictors.Numeric <- DPA.Predictors[,sapply(DPA.Predictors, is.numeric)]
##################################
# Visualizing pairwise correlation between predictors
##################################
DPA_CorrelationTest <- cor.mtest(DPA.Predictors.Numeric,
method = "pearson",
conf.level = .95)
corrplot(cor(DPA.Predictors.Numeric,
method = "pearson",
use="pairwise.complete.obs"),
method = "circle",
type = "upper",
order = "original",
tl.col = "black",
tl.cex = 0.75,
tl.srt = 90,
sig.level = 0.05,
p.mat = DPA_CorrelationTest$p,
insig = "blank")

##################################
# Identifying the highly correlated variables
##################################
DPA_Correlation <- cor(DPA.Predictors.Numeric,
method = "pearson",
use="pairwise.complete.obs")
(DPA_HighlyCorrelatedCount <- sum(abs(DPA_Correlation[upper.tri(DPA_Correlation)]) > 0.95))
## [1] 0
if (DPA_HighlyCorrelatedCount == 0) {
print("No highly correlated predictors noted.")
} else {
print(paste0("High correlation observed for ",
(DPA_HighlyCorrelatedCount),
" pairs of numeric variable(s) with Correlation.Coefficient>0.95."))
(DPA_HighlyCorrelatedPairs <- corr_cross(DPA.Predictors.Numeric,
max_pvalue = 0.05,
top = DPA_HighlyCorrelatedCount,
rm.na = TRUE,
grid = FALSE
))
}
## [1] "No highly correlated predictors noted."
if (DPA_HighlyCorrelatedCount > 0) {
DPA_HighlyCorrelated <- findCorrelation(DPA_Correlation, cutoff = 0.95)
(DPA_HighlyCorrelatedForRemoval <- length(DPA_HighlyCorrelated))
print(paste0("High correlation can be resolved by removing ",
(DPA_HighlyCorrelatedForRemoval),
" numeric variable(s)."))
for (j in 1:DPA_HighlyCorrelatedForRemoval) {
DPA_HighlyCorrelatedRemovedVariable <- colnames(DPA.Predictors.Numeric)[DPA_HighlyCorrelated[j]]
print(paste0("Variable ",
j,
" for removal: ",
DPA_HighlyCorrelatedRemovedVariable))
}
##################################
# Filtering out columns with high correlation
#################################
DPA_ExcludedHighCorrelation <- DPA[,-DPA_HighlyCorrelated]
##################################
# Gathering descriptive statistics
##################################
(DPA_ExcludedHighCorrelation_Skimmed <- skim(DPA_ExcludedHighCorrelation))
###################################
# Verifying the data dimensions
###################################
dim(DPA_ExcludedHighCorrelation)
}
1.3.4 Linear Dependencies
Linear dependency data assessment:
[A] No linear dependencies were noted for any subset of variables based on the pre-processing summary from the caret package, applying the findLinearCombos method which uses the QR decomposition of the predictor matrix to enumerate sets of linear combinations (if any exist); a toy illustration follows below.
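As a hedged toy illustration of findLinearCombos (hypothetical data, not part of the analysis), a column constructed as an exact linear combination of two others is detected and nominated for removal:
##################################
# Toy illustration only (hypothetical data):
# detecting an exact linear combination with findLinearCombos
##################################
set.seed(12345678)
ToyPredictors <- data.frame(X1 = rnorm(100), X2 = rnorm(100))
ToyPredictors$X3 <- 2*ToyPredictors$X1 - 3*ToyPredictors$X2   # exact linear dependency
findLinearCombos(ToyPredictors)
## $linearCombos is expected to list the dependent column subset
## and $remove the column whose removal resolves the dependency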
##################################
# Loading dataset
##################################
DPA <- Sonar_Train
##################################
# Listing all predictors
##################################
DPA.Predictors <- DPA[,!names(DPA) %in% c("Class")]
##################################
# Listing all numeric predictors
##################################
DPA.Predictors.Numeric <- DPA.Predictors[,sapply(DPA.Predictors, is.numeric)]
##################################
# Identifying the linearly dependent variables
##################################
DPA_LinearlyDependent <- findLinearCombos(DPA.Predictors.Numeric)
(DPA_LinearlyDependentCount <- length(DPA_LinearlyDependent$linearCombos))
## [1] 0
if (DPA_LinearlyDependentCount == 0) {
print("No linearly dependent predictors noted.")
} else {
print(paste0("Linear dependency observed for ",
(DPA_LinearlyDependentCount),
" subset(s) of numeric variable(s)."))
for (i in 1:DPA_LinearlyDependentCount) {
DPA_LinearlyDependentSubset <- colnames(DPA.Predictors.Numeric)[DPA_LinearlyDependent$linearCombos[[i]]]
print(paste0("Linear dependent variable(s) for subset ",
i,
" include: ",
DPA_LinearlyDependentSubset))
}
}
## [1] "No linearly dependent predictors noted."
##################################
# Identifying the linearly dependent variables for removal
##################################
if (DPA_LinearlyDependentCount > 0) {
DPA_LinearlyDependent <- findLinearCombos(DPA.Predictors.Numeric)
DPA_LinearlyDependentForRemoval <- length(DPA_LinearlyDependent$remove)
print(paste0("Linear dependency can be resolved by removing ",
(DPA_LinearlyDependentForRemoval),
" numeric variable(s)."))
for (j in 1:DPA_LinearlyDependentForRemoval) {
DPA_LinearlyDependentRemovedVariable <- colnames(DPA.Predictors.Numeric)[DPA_LinearlyDependent$remove[j]]
print(paste0("Variable ",
j,
" for removal: ",
DPA_LinearlyDependentRemovedVariable))
}
##################################
# Filtering out columns with linear dependency
#################################
DPA_ExcludedLinearlyDependent <- DPA[,-DPA_LinearlyDependent$remove]
##################################
# Gathering descriptive statistics
##################################
(DPA_ExcludedLinearlyDependent_Skimmed <- skim(DPA_ExcludedLinearlyDependent))
###################################
# Verifying the data dimensions
###################################
dim(DPA_ExcludedLinearlyDependent)
} else {
###################################
# Verifying the data dimensions
###################################
dim(DPA)
}
## [1] 96 61
1.3.6 Centering and Scaling
Centering and scaling data assessment:
[A] To maintain numerical stability during modelling, centering and scaling transformations were applied to the transformed numeric variables. The center method from the caret package subtracts the mean of a numeric variable from all of its values, so the centered variables have zero mean. The scale method, also from the caret package, divides each value of a centered variable by its standard deviation, coercing the values to a common standard deviation of one (a minimal equivalence sketch against base R follows below).
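A minimal sketch of this equivalence (toy data, not part of the pipeline): the combined center and scale methods reproduce the usual (x - mean(x)) / sd(x) standardization from base R.
##################################
# Minimal sketch (toy data): preProcess center/scale versus manual standardization
##################################
ToyStandardize <- data.frame(x = c(2, 4, 6, 8, 10))
ToyCenterScale <- preProcess(ToyStandardize, method = c("center", "scale"))
cbind(caret  = predict(ToyCenterScale, ToyStandardize)$x,
      manual = (ToyStandardize$x - mean(ToyStandardize$x)) / sd(ToyStandardize$x))
## the two columns are expected to be identical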
##################################
# Loading dataset
##################################
DPA <- Sonar_Train
##################################
# Listing all predictors
##################################
DPA.Predictors <- DPA[,!names(DPA) %in% c("Class")]
##################################
# Listing all numeric predictors
##################################
DPA.Predictors.Numeric <- DPA.Predictors[,sapply(DPA.Predictors, is.numeric)]
##################################
# Applying a Box-Cox transformation
##################################
DPA_BoxCox <- preProcess(DPA.Predictors.Numeric, method = c("BoxCox"))
DPA_BoxCoxTransformed <- predict(DPA_BoxCox, DPA.Predictors.Numeric)
##################################
# Applying a center and scale data transformation
##################################
DPA.Predictors.Numeric_BoxCoxTransformed_CenteredScaled <- preProcess(DPA_BoxCoxTransformed, method = c("center","scale"))
DPA.Predictors.Numeric_BoxCoxTransformed_CenteredScaledTransformed <- predict(DPA.Predictors.Numeric_BoxCoxTransformed_CenteredScaled, DPA_BoxCoxTransformed)
##################################
# Gathering descriptive statistics
##################################
(DPA.Predictors.Numeric_BoxCoxTransformed_CenteredScaledTransformedSkimmed <- skim(DPA.Predictors.Numeric_BoxCoxTransformed_CenteredScaledTransformed))
Data summary
Name | DPA.Predictors.Numeric_Bo… |
Number of rows | 96 |
Number of columns | 60 |
Column type frequency: numeric | 60 |
Group variables | None |
Variable type: numeric
variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
V1 | 0 | 1 | 0 | 1 | -2.69 | -0.72 | -0.09 | 0.61 | 2.54 | ▁▅▇▅▁ |
V2 | 0 | 1 | 0 | 1 | -2.81 | -0.68 | -0.05 | 0.66 | 2.26 | ▁▃▇▆▂ |
V3 | 0 | 1 | 0 | 1 | -2.79 | -0.60 | -0.02 | 0.65 | 2.41 | ▁▃▇▅▂ |
V4 | 0 | 1 | 0 | 1 | -2.40 | -0.71 | 0.00 | 0.63 | 2.23 | ▁▆▇▅▂ |
V5 | 0 | 1 | 0 | 1 | -2.44 | -0.69 | 0.02 | 0.67 | 2.49 | ▂▆▇▆▁ |
V6 | 0 | 1 | 0 | 1 | -2.57 | -0.58 | 0.06 | 0.62 | 3.20 | ▂▅▇▂▁ |
V7 | 0 | 1 | 0 | 1 | -2.33 | -0.57 | 0.04 | 0.62 | 3.08 | ▂▆▇▃▁ |
V8 | 0 | 1 | 0 | 1 | -2.54 | -0.55 | -0.01 | 0.59 | 2.56 | ▁▃▇▃▁ |
V9 | 0 | 1 | 0 | 1 | -2.90 | -0.52 | -0.01 | 0.60 | 2.79 | ▁▃▇▃▁ |
V10 | 0 | 1 | 0 | 1 | -2.73 | -0.67 | -0.03 | 0.59 | 2.34 | ▁▃▇▅▂ |
V11 | 0 | 1 | 0 | 1 | -2.29 | -0.53 | -0.01 | 0.63 | 2.64 | ▂▃▇▃▁ |
V12 | 0 | 1 | 0 | 1 | -2.42 | -0.70 | 0.06 | 0.59 | 2.02 | ▂▅▇▇▃ |
V13 | 0 | 1 | 0 | 1 | -2.30 | -0.60 | 0.05 | 0.60 | 2.47 | ▂▅▇▅▂ |
V14 | 0 | 1 | 0 | 1 | -2.92 | -0.68 | 0.05 | 0.67 | 2.67 | ▁▃▇▆▁ |
V15 | 0 | 1 | 0 | 1 | -3.02 | -0.65 | 0.01 | 0.73 | 2.07 | ▁▃▇▇▃ |
V16 | 0 | 1 | 0 | 1 | -2.63 | -0.78 | -0.01 | 0.83 | 1.95 | ▁▆▇▇▅ |
V17 | 0 | 1 | 0 | 1 | -2.82 | -0.77 | -0.17 | 0.99 | 1.78 | ▁▅▇▅▇ |
V18 | 0 | 1 | 0 | 1 | -2.53 | -0.76 | -0.13 | 0.94 | 1.64 | ▁▅▇▅▆ |
V19 | 0 | 1 | 0 | 1 | -1.93 | -0.82 | -0.12 | 0.90 | 1.62 | ▃▇▅▇▇ |
V20 | 0 | 1 | 0 | 1 | -2.01 | -0.73 | 0.14 | 0.82 | 1.60 | ▅▆▅▇▆ |
V21 | 0 | 1 | 0 | 1 | -2.03 | -0.80 | 0.17 | 0.78 | 1.43 | ▃▃▅▇▇ |
V22 | 0 | 1 | 0 | 1 | -2.06 | -0.93 | 0.21 | 0.80 | 1.42 | ▃▅▅▇▇ |
V23 | 0 | 1 | 0 | 1 | -1.95 | -0.86 | 0.18 | 0.87 | 1.50 | ▃▅▃▇▇ |
V24 | 0 | 1 | 0 | 1 | -2.09 | -0.53 | -0.02 | 0.94 | 1.49 | ▅▂▇▆▇ |
V25 | 0 | 1 | 0 | 1 | -2.13 | -0.59 | 0.08 | 0.82 | 1.48 | ▃▃▇▇▇ |
V26 | 0 | 1 | 0 | 1 | -1.99 | -0.77 | 0.12 | 0.77 | 1.51 | ▃▅▆▇▇ |
V27 | 0 | 1 | 0 | 1 | -1.94 | -0.95 | 0.26 | 0.83 | 1.29 | ▃▃▃▃▇ |
V28 | 0 | 1 | 0 | 1 | -2.01 | -0.73 | 0.19 | 0.84 | 1.30 | ▃▂▅▆▇ |
V29 | 0 | 1 | 0 | 1 | -2.16 | -0.81 | 0.18 | 0.88 | 1.50 | ▂▆▃▆▇ |
V30 | 0 | 1 | 0 | 1 | -2.42 | -0.80 | 0.07 | 0.61 | 1.90 | ▁▆▇▇▅ |
V31 | 0 | 1 | 0 | 1 | -2.40 | -0.69 | -0.11 | 0.73 | 1.77 | ▁▅▇▆▆ |
V32 | 0 | 1 | 0 | 1 | -2.15 | -0.65 | 0.02 | 0.74 | 1.87 | ▃▇▇▇▆ |
V33 | 0 | 1 | 0 | 1 | -2.40 | -0.79 | 0.03 | 0.71 | 2.31 | ▁▇▇▇▂ |
V34 | 0 | 1 | 0 | 1 | -2.14 | -0.75 | 0.04 | 0.76 | 2.02 | ▂▆▇▆▃ |
V35 | 0 | 1 | 0 | 1 | -2.08 | -0.68 | -0.11 | 0.85 | 1.74 | ▃▇▇▆▇ |
V36 | 0 | 1 | 0 | 1 | -2.57 | -0.78 | -0.04 | 0.78 | 1.75 | ▁▅▇▆▆ |
V37 | 0 | 1 | 0 | 1 | -2.48 | -0.73 | -0.01 | 0.77 | 1.54 | ▁▅▇▇▇ |
V38 | 0 | 1 | 0 | 1 | -2.25 | -0.70 | 0.22 | 0.67 | 1.81 | ▂▅▅▇▅ |
V39 | 0 | 1 | 0 | 1 | -2.48 | -0.81 | -0.01 | 0.79 | 2.29 | ▁▇▇▆▂ |
V40 | 0 | 1 | 0 | 1 | -2.46 | -0.62 | -0.07 | 0.71 | 2.67 | ▁▆▇▅▁ |
V41 | 0 | 1 | 0 | 1 | -2.35 | -0.73 | -0.03 | 0.74 | 2.42 | ▂▆▇▆▂ |
V42 | 0 | 1 | 0 | 1 | -2.09 | -0.71 | 0.04 | 0.65 | 2.23 | ▃▅▇▅▂ |
V43 | 0 | 1 | 0 | 1 | -2.23 | -0.68 | 0.05 | 0.60 | 2.51 | ▃▆▇▅▂ |
V44 | 0 | 1 | 0 | 1 | -2.66 | -0.58 | -0.10 | 0.67 | 1.81 | ▁▃▇▅▅ |
V45 | 0 | 1 | 0 | 1 | -2.21 | -0.67 | -0.06 | 0.93 | 1.82 | ▂▅▇▃▆ |
V46 | 0 | 1 | 0 | 1 | -2.72 | -0.67 | -0.05 | 0.53 | 2.27 | ▁▅▇▅▃ |
V47 | 0 | 1 | 0 | 1 | -2.81 | -0.60 | -0.08 | 0.70 | 2.37 | ▁▅▇▅▂ |
V48 | 0 | 1 | 0 | 1 | -2.44 | -0.72 | 0.01 | 0.57 | 2.21 | ▂▅▇▅▂ |
V49 | 0 | 1 | 0 | 1 | -2.21 | -0.69 | -0.02 | 0.79 | 2.26 | ▃▆▇▆▂ |
V50 | 0 | 1 | 0 | 1 | -2.31 | -0.61 | -0.02 | 0.62 | 2.25 | ▂▆▇▆▃ |
V51 | 0 | 1 | 0 | 1 | -2.60 | -0.53 | -0.01 | 0.59 | 3.36 | ▂▅▇▂▁ |
V52 | 0 | 1 | 0 | 1 | -3.42 | -0.52 | -0.02 | 0.60 | 2.65 | ▁▂▇▆▁ |
V53 | 0 | 1 | 0 | 1 | -2.66 | -0.76 | -0.19 | 0.78 | 2.34 | ▁▆▇▇▂ |
V54 | 0 | 1 | 0 | 1 | -2.12 | -0.69 | 0.01 | 0.62 | 2.05 | ▂▃▇▃▃ |
V55 | 0 | 1 | 0 | 1 | -2.39 | -0.83 | 0.11 | 0.75 | 2.37 | ▂▇▇▇▁ |
V56 | 0 | 1 | 0 | 1 | -2.63 | -0.61 | 0.00 | 0.74 | 3.08 | ▁▆▇▃▁ |
V57 | 0 | 1 | 0 | 1 | -2.40 | -0.69 | -0.07 | 0.81 | 2.49 | ▂▇▇▇▂ |
V58 | 0 | 1 | 0 | 1 | -2.44 | -0.70 | 0.01 | 0.62 | 2.45 | ▁▅▇▅▂ |
V59 | 0 | 1 | 0 | 1 | -2.88 | -0.69 | 0.06 | 0.62 | 2.25 | ▁▃▇▅▂ |
V60 | 0 | 1 | 0 | 1 | -2.43 | -0.65 | 0.02 | 0.71 | 2.39 | ▂▆▇▆▂ |
###################################
# Verifying the data dimensions
###################################
dim(DPA.Predictors.Numeric_BoxCoxTransformed_CenteredScaledTransformed)
## [1] 96 60
1.3.7 Pre-Processed Dataset
Pre-processed dataset assessment:
[A] 136 rows (observations)
[A.1] Train Set = 96 observations with class ratio
of 80:20
[A.2] Test Set = 40 observations with class ratio
of 80:20
[B] 61 columns (variables)
[B.1] 1/61 response = Class variable (factor)
[B.1.1] Levels = Class=R < Class=M
[B.2] 60/61 predictors = All remaining variables
(60/60 numeric)
[C] Pre-processing actions applied:
[C.1] Centering, scaling and shape transformation
applied to improve data quality
[C.2] No outlier treatment applied since the high
values noted were contextually valid and sensible
[C.3] No predictors removed due to zero or
near-zero variance
[C.4] No predictors removed due to high
correlation
[C.5] No predictors removed due to linear
dependencies
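For reference, a minimal sketch (an alternative formulation, not the code actually used below) of requesting the same pre-processing steps in a single caret preProcess call; when the methods are combined, caret applies the Box-Cox transformation before centering and scaling. The object names are illustrative only.
##################################
# Minimal sketch (alternative, not the code used below):
# combining Box-Cox, centering and scaling in one preProcess call
##################################
PMA_PreProcess_Combined <- preProcess(Sonar_Train[,!names(Sonar_Train) %in% c("Class")],
                                      method = c("BoxCox", "center", "scale"))
PMA_Train_Combined <- predict(PMA_PreProcess_Combined,
                              Sonar_Train[,!names(Sonar_Train) %in% c("Class")])
dim(PMA_Train_Combined)
## expected: 96 60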
##################################
# Creating the pre-modelling
# train set
##################################
Class <- DPA$Class
PMA.Predictors.Numeric <- DPA.Predictors.Numeric_BoxCoxTransformed_CenteredScaledTransformed
PMA_BoxCoxTransformed_CenteredScaledTransformed <- cbind(Class,PMA.Predictors.Numeric)
PMA_PreModelling_Train <- PMA_BoxCoxTransformed_CenteredScaledTransformed
##################################
# Gathering descriptive statistics
##################################
(PMA_PreModelling_Train_Skimmed <- skim(PMA_PreModelling_Train))
Data summary
Name | PMA_PreModelling_Train |
Number of rows | 96 |
Number of columns | 61 |
Column type frequency: factor | 1 |
Column type frequency: numeric | 60 |
Group variables | None |
Variable type: factor
variable | n_missing | complete_rate | ordered | n_unique | top_counts |
Class | 0 | 1 | FALSE | 2 | M: 78, R: 18 |
Variable type: numeric
variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
V1 | 0 | 1 | 0 | 1 | -2.69 | -0.72 | -0.09 | 0.61 | 2.54 | ▁▅▇▅▁ |
V2 | 0 | 1 | 0 | 1 | -2.81 | -0.68 | -0.05 | 0.66 | 2.26 | ▁▃▇▆▂ |
V3 | 0 | 1 | 0 | 1 | -2.79 | -0.60 | -0.02 | 0.65 | 2.41 | ▁▃▇▅▂ |
V4 | 0 | 1 | 0 | 1 | -2.40 | -0.71 | 0.00 | 0.63 | 2.23 | ▁▆▇▅▂ |
V5 | 0 | 1 | 0 | 1 | -2.44 | -0.69 | 0.02 | 0.67 | 2.49 | ▂▆▇▆▁ |
V6 | 0 | 1 | 0 | 1 | -2.57 | -0.58 | 0.06 | 0.62 | 3.20 | ▂▅▇▂▁ |
V7 | 0 | 1 | 0 | 1 | -2.33 | -0.57 | 0.04 | 0.62 | 3.08 | ▂▆▇▃▁ |
V8 | 0 | 1 | 0 | 1 | -2.54 | -0.55 | -0.01 | 0.59 | 2.56 | ▁▃▇▃▁ |
V9 | 0 | 1 | 0 | 1 | -2.90 | -0.52 | -0.01 | 0.60 | 2.79 | ▁▃▇▃▁ |
V10 | 0 | 1 | 0 | 1 | -2.73 | -0.67 | -0.03 | 0.59 | 2.34 | ▁▃▇▅▂ |
V11 | 0 | 1 | 0 | 1 | -2.29 | -0.53 | -0.01 | 0.63 | 2.64 | ▂▃▇▃▁ |
V12 | 0 | 1 | 0 | 1 | -2.42 | -0.70 | 0.06 | 0.59 | 2.02 | ▂▅▇▇▃ |
V13 | 0 | 1 | 0 | 1 | -2.30 | -0.60 | 0.05 | 0.60 | 2.47 | ▂▅▇▅▂ |
V14 | 0 | 1 | 0 | 1 | -2.92 | -0.68 | 0.05 | 0.67 | 2.67 | ▁▃▇▆▁ |
V15 | 0 | 1 | 0 | 1 | -3.02 | -0.65 | 0.01 | 0.73 | 2.07 | ▁▃▇▇▃ |
V16 | 0 | 1 | 0 | 1 | -2.63 | -0.78 | -0.01 | 0.83 | 1.95 | ▁▆▇▇▅ |
V17 | 0 | 1 | 0 | 1 | -2.82 | -0.77 | -0.17 | 0.99 | 1.78 | ▁▅▇▅▇ |
V18 | 0 | 1 | 0 | 1 | -2.53 | -0.76 | -0.13 | 0.94 | 1.64 | ▁▅▇▅▆ |
V19 | 0 | 1 | 0 | 1 | -1.93 | -0.82 | -0.12 | 0.90 | 1.62 | ▃▇▅▇▇ |
V20 | 0 | 1 | 0 | 1 | -2.01 | -0.73 | 0.14 | 0.82 | 1.60 | ▅▆▅▇▆ |
V21 | 0 | 1 | 0 | 1 | -2.03 | -0.80 | 0.17 | 0.78 | 1.43 | ▃▃▅▇▇ |
V22 | 0 | 1 | 0 | 1 | -2.06 | -0.93 | 0.21 | 0.80 | 1.42 | ▃▅▅▇▇ |
V23 | 0 | 1 | 0 | 1 | -1.95 | -0.86 | 0.18 | 0.87 | 1.50 | ▃▅▃▇▇ |
V24 | 0 | 1 | 0 | 1 | -2.09 | -0.53 | -0.02 | 0.94 | 1.49 | ▅▂▇▆▇ |
V25 | 0 | 1 | 0 | 1 | -2.13 | -0.59 | 0.08 | 0.82 | 1.48 | ▃▃▇▇▇ |
V26 | 0 | 1 | 0 | 1 | -1.99 | -0.77 | 0.12 | 0.77 | 1.51 | ▃▅▆▇▇ |
V27 | 0 | 1 | 0 | 1 | -1.94 | -0.95 | 0.26 | 0.83 | 1.29 | ▃▃▃▃▇ |
V28 | 0 | 1 | 0 | 1 | -2.01 | -0.73 | 0.19 | 0.84 | 1.30 | ▃▂▅▆▇ |
V29 | 0 | 1 | 0 | 1 | -2.16 | -0.81 | 0.18 | 0.88 | 1.50 | ▂▆▃▆▇ |
V30 | 0 | 1 | 0 | 1 | -2.42 | -0.80 | 0.07 | 0.61 | 1.90 | ▁▆▇▇▅ |
V31 | 0 | 1 | 0 | 1 | -2.40 | -0.69 | -0.11 | 0.73 | 1.77 | ▁▅▇▆▆ |
V32 | 0 | 1 | 0 | 1 | -2.15 | -0.65 | 0.02 | 0.74 | 1.87 | ▃▇▇▇▆ |
V33 | 0 | 1 | 0 | 1 | -2.40 | -0.79 | 0.03 | 0.71 | 2.31 | ▁▇▇▇▂ |
V34 | 0 | 1 | 0 | 1 | -2.14 | -0.75 | 0.04 | 0.76 | 2.02 | ▂▆▇▆▃ |
V35 | 0 | 1 | 0 | 1 | -2.08 | -0.68 | -0.11 | 0.85 | 1.74 | ▃▇▇▆▇ |
V36 | 0 | 1 | 0 | 1 | -2.57 | -0.78 | -0.04 | 0.78 | 1.75 | ▁▅▇▆▆ |
V37 | 0 | 1 | 0 | 1 | -2.48 | -0.73 | -0.01 | 0.77 | 1.54 | ▁▅▇▇▇ |
V38 | 0 | 1 | 0 | 1 | -2.25 | -0.70 | 0.22 | 0.67 | 1.81 | ▂▅▅▇▅ |
V39 | 0 | 1 | 0 | 1 | -2.48 | -0.81 | -0.01 | 0.79 | 2.29 | ▁▇▇▆▂ |
V40 | 0 | 1 | 0 | 1 | -2.46 | -0.62 | -0.07 | 0.71 | 2.67 | ▁▆▇▅▁ |
V41 | 0 | 1 | 0 | 1 | -2.35 | -0.73 | -0.03 | 0.74 | 2.42 | ▂▆▇▆▂ |
V42 | 0 | 1 | 0 | 1 | -2.09 | -0.71 | 0.04 | 0.65 | 2.23 | ▃▅▇▅▂ |
V43 | 0 | 1 | 0 | 1 | -2.23 | -0.68 | 0.05 | 0.60 | 2.51 | ▃▆▇▅▂ |
V44 | 0 | 1 | 0 | 1 | -2.66 | -0.58 | -0.10 | 0.67 | 1.81 | ▁▃▇▅▅ |
V45 | 0 | 1 | 0 | 1 | -2.21 | -0.67 | -0.06 | 0.93 | 1.82 | ▂▅▇▃▆ |
V46 | 0 | 1 | 0 | 1 | -2.72 | -0.67 | -0.05 | 0.53 | 2.27 | ▁▅▇▅▃ |
V47 | 0 | 1 | 0 | 1 | -2.81 | -0.60 | -0.08 | 0.70 | 2.37 | ▁▅▇▅▂ |
V48 | 0 | 1 | 0 | 1 | -2.44 | -0.72 | 0.01 | 0.57 | 2.21 | ▂▅▇▅▂ |
V49 | 0 | 1 | 0 | 1 | -2.21 | -0.69 | -0.02 | 0.79 | 2.26 | ▃▆▇▆▂ |
V50 | 0 | 1 | 0 | 1 | -2.31 | -0.61 | -0.02 | 0.62 | 2.25 | ▂▆▇▆▃ |
V51 | 0 | 1 | 0 | 1 | -2.60 | -0.53 | -0.01 | 0.59 | 3.36 | ▂▅▇▂▁ |
V52 | 0 | 1 | 0 | 1 | -3.42 | -0.52 | -0.02 | 0.60 | 2.65 | ▁▂▇▆▁ |
V53 | 0 | 1 | 0 | 1 | -2.66 | -0.76 | -0.19 | 0.78 | 2.34 | ▁▆▇▇▂ |
V54 | 0 | 1 | 0 | 1 | -2.12 | -0.69 | 0.01 | 0.62 | 2.05 | ▂▃▇▃▃ |
V55 | 0 | 1 | 0 | 1 | -2.39 | -0.83 | 0.11 | 0.75 | 2.37 | ▂▇▇▇▁ |
V56 | 0 | 1 | 0 | 1 | -2.63 | -0.61 | 0.00 | 0.74 | 3.08 | ▁▆▇▃▁ |
V57 | 0 | 1 | 0 | 1 | -2.40 | -0.69 | -0.07 | 0.81 | 2.49 | ▂▇▇▇▂ |
V58 | 0 | 1 | 0 | 1 | -2.44 | -0.70 | 0.01 | 0.62 | 2.45 | ▁▅▇▅▂ |
V59 | 0 | 1 | 0 | 1 | -2.88 | -0.69 | 0.06 | 0.62 | 2.25 | ▁▃▇▅▂ |
V60 | 0 | 1 | 0 | 1 | -2.43 | -0.65 | 0.02 | 0.71 | 2.39 | ▂▆▇▆▂ |
###################################
# Verifying the data dimensions
# for the train set
###################################
dim(PMA_PreModelling_Train)
## [1] 96 61
##################################
# Formulating the test set
##################################
DPA_Test <- Sonar_Test
DPA_Test.Predictors <- DPA_Test[,!names(DPA_Test) %in% c("Class")]
DPA_Test.Predictors.Numeric <- DPA_Test.Predictors[,sapply(DPA_Test.Predictors, is.numeric)]
DPA_Test_BoxCox <- preProcess(DPA_Test.Predictors.Numeric, method = c("BoxCox"))
DPA_Test_BoxCoxTransformed <- predict(DPA_Test_BoxCox, DPA_Test.Predictors.Numeric)
DPA_Test.Predictors.Numeric_BoxCoxTransformed_CenteredScaled <- preProcess(DPA_Test_BoxCoxTransformed, method = c("center","scale"))
DPA_Test.Predictors.Numeric_BoxCoxTransformed_CenteredScaledTransformed <- predict(DPA_Test.Predictors.Numeric_BoxCoxTransformed_CenteredScaled, DPA_Test_BoxCoxTransformed)
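For reference, a minimal sketch of the alternative convention of reusing the transformation objects estimated on the train set (DPA_BoxCox and DPA.Predictors.Numeric_BoxCoxTransformed_CenteredScaled from the earlier code) on the test predictors, rather than estimating separate ones as done above; the object names below are illustrative only.
##################################
# Minimal sketch (alternative, not the code used above):
# applying the train-set preProcess objects to the test predictors
##################################
DPA_Test_TrainBoxCox <- predict(DPA_BoxCox, DPA_Test.Predictors.Numeric)
DPA_Test_TrainCenteredScaled <- predict(DPA.Predictors.Numeric_BoxCoxTransformed_CenteredScaled,
                                        DPA_Test_TrainBoxCox)
dim(DPA_Test_TrainCenteredScaled)
## expected: 40 60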
##################################
# Creating the pre-modelling
# test set
##################################
Class <- DPA_Test$Class
PMA_Test.Predictors.Numeric <- DPA_Test.Predictors.Numeric_BoxCoxTransformed_CenteredScaledTransformed
PMA_Test_BoxCoxTransformed_CenteredScaledTransformed <- cbind(Class,PMA_Test.Predictors.Numeric)
PMA_PreModelling_Test <- PMA_Test_BoxCoxTransformed_CenteredScaledTransformed
##################################
# Gathering descriptive statistics
##################################
(PMA_PreModelling_Test_Skimmed <- skim(PMA_PreModelling_Test))
Data summary
Name | PMA_PreModelling_Test |
Number of rows | 40 |
Number of columns | 61 |
Column type frequency: factor | 1 |
Column type frequency: numeric | 60 |
Group variables | None |
Variable type: factor
variable | n_missing | complete_rate | ordered | n_unique | top_counts |
Class | 0 | 1 | FALSE | 2 | M: 33, R: 7 |
Variable type: numeric
variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
V1 | 0 | 1 | 0 | 1 | -2.49 | -0.67 | -0.04 | 0.53 | 2.40 | ▁▃▇▃▁ |
V2 | 0 | 1 | 0 | 1 | -2.15 | -0.72 | 0.11 | 0.59 | 2.77 | ▂▆▇▃▁ |
V3 | 0 | 1 | 0 | 1 | -2.02 | -0.59 | 0.17 | 0.62 | 2.39 | ▃▃▇▃▁ |
V4 | 0 | 1 | 0 | 1 | -2.20 | -0.74 | -0.11 | 0.69 | 2.55 | ▂▇▇▅▁ |
V5 | 0 | 1 | 0 | 1 | -2.24 | -0.66 | 0.06 | 0.46 | 2.40 | ▂▅▇▃▂ |
V6 | 0 | 1 | 0 | 1 | -2.15 | -0.65 | -0.03 | 0.67 | 1.87 | ▂▅▇▆▅ |
V7 | 0 | 1 | 0 | 1 | -2.66 | -0.61 | -0.13 | 0.61 | 2.59 | ▁▅▇▅▁ |
V8 | 0 | 1 | 0 | 1 | -2.93 | -0.43 | -0.08 | 0.45 | 3.01 | ▁▂▇▂▁ |
V9 | 0 | 1 | 0 | 1 | -1.95 | -0.73 | -0.09 | 0.64 | 2.29 | ▃▆▇▅▂ |
V10 | 0 | 1 | 0 | 1 | -2.23 | -0.50 | -0.04 | 0.47 | 2.23 | ▂▃▇▃▁ |
V11 | 0 | 1 | 0 | 1 | -2.20 | -0.62 | 0.06 | 0.47 | 2.26 | ▂▃▇▂▂ |
V12 | 0 | 1 | 0 | 1 | -2.47 | -0.39 | -0.10 | 0.33 | 2.13 | ▁▂▇▃▂ |
V13 | 0 | 1 | 0 | 1 | -2.21 | -0.57 | -0.01 | 0.48 | 2.17 | ▂▃▇▃▂ |
V14 | 0 | 1 | 0 | 1 | -2.48 | -0.66 | 0.00 | 0.37 | 2.67 | ▁▃▇▃▁ |
V15 | 0 | 1 | 0 | 1 | -2.49 | -0.71 | 0.01 | 0.64 | 2.04 | ▁▅▇▅▃ |
V16 | 0 | 1 | 0 | 1 | -2.52 | -0.85 | -0.12 | 0.79 | 1.84 | ▁▆▆▇▃ |
V17 | 0 | 1 | 0 | 1 | -1.87 | -0.81 | -0.16 | 0.94 | 1.63 | ▂▇▅▃▇ |
V18 | 0 | 1 | 0 | 1 | -2.20 | -0.76 | -0.20 | 0.86 | 1.59 | ▁▆▇▃▇ |
V19 | 0 | 1 | 0 | 1 | -1.82 | -0.69 | 0.00 | 1.00 | 1.52 | ▃▇▆▂▇ |
V20 | 0 | 1 | 0 | 1 | -1.85 | -0.85 | 0.09 | 0.98 | 1.43 | ▃▃▅▅▇ |
V21 | 0 | 1 | 0 | 1 | -1.83 | -0.91 | 0.22 | 0.68 | 1.56 | ▅▂▅▇▆ |
V22 | 0 | 1 | 0 | 1 | -1.98 | -0.90 | 0.20 | 0.70 | 1.49 | ▃▃▃▇▅ |
V23 | 0 | 1 | 0 | 1 | -2.05 | -0.68 | 0.22 | 0.67 | 1.42 | ▃▃▅▇▇ |
V24 | 0 | 1 | 0 | 1 | -2.24 | -0.71 | 0.11 | 0.75 | 1.45 | ▂▅▆▇▇ |
V25 | 0 | 1 | 0 | 1 | -1.79 | -0.83 | 0.00 | 0.92 | 1.41 | ▃▂▆▃▇ |
V26 | 0 | 1 | 0 | 1 | -1.98 | -0.80 | 0.34 | 0.96 | 1.15 | ▂▃▂▂▇ |
V27 | 0 | 1 | 0 | 1 | -1.91 | -0.86 | 0.22 | 0.97 | 1.10 | ▂▃▂▂▇ |
V28 | 0 | 1 | 0 | 1 | -2.12 | -0.74 | 0.14 | 1.00 | 1.23 | ▂▃▅▃▇ |
V29 | 0 | 1 | 0 | 1 | -2.01 | -0.81 | 0.05 | 0.90 | 1.69 | ▃▇▅▇▆ |
V30 | 0 | 1 | 0 | 1 | -1.85 | -0.71 | 0.29 | 0.77 | 1.84 | ▃▇▃▇▃ |
V31 | 0 | 1 | 0 | 1 | -2.08 | -0.57 | -0.13 | 0.65 | 2.11 | ▂▅▇▃▃ |
V32 | 0 | 1 | 0 | 1 | -2.19 | -0.41 | 0.12 | 0.46 | 2.19 | ▂▂▇▂▂ |
V33 | 0 | 1 | 0 | 1 | -2.15 | -0.59 | 0.08 | 0.73 | 1.96 | ▂▃▇▆▂ |
V34 | 0 | 1 | 0 | 1 | -2.44 | -0.68 | -0.07 | 0.67 | 2.14 | ▁▅▇▆▃ |
V35 | 0 | 1 | 0 | 1 | -1.85 | -0.72 | 0.07 | 0.77 | 1.80 | ▅▅▅▇▃ |
V36 | 0 | 1 | 0 | 1 | -2.08 | -0.63 | -0.11 | 0.76 | 1.98 | ▃▃▇▇▂ |
V37 | 0 | 1 | 0 | 1 | -2.28 | -0.61 | 0.05 | 0.80 | 1.99 | ▂▃▇▆▂ |
V38 | 0 | 1 | 0 | 1 | -2.53 | -0.79 | 0.14 | 0.66 | 2.44 | ▁▇▇▇▂ |
V39 | 0 | 1 | 0 | 1 | -1.77 | -0.71 | 0.27 | 0.61 | 2.08 | ▆▅▇▇▂ |
V40 | 0 | 1 | 0 | 1 | -2.18 | -0.84 | 0.07 | 0.84 | 1.89 | ▁▇▇▇▂ |
V41 | 0 | 1 | 0 | 1 | -1.77 | -0.81 | 0.12 | 0.74 | 1.76 | ▆▃▅▇▃ |
V42 | 0 | 1 | 0 | 1 | -2.08 | -0.74 | -0.17 | 1.01 | 1.94 | ▂▇▇▅▃ |
V43 | 0 | 1 | 0 | 1 | -2.14 | -0.69 | -0.15 | 0.99 | 1.59 | ▂▅▇▂▇ |
V44 | 0 | 1 | 0 | 1 | -2.40 | -0.56 | -0.04 | 0.61 | 2.01 | ▁▃▇▅▃ |
V45 | 0 | 1 | 0 | 1 | -2.22 | -0.70 | -0.04 | 0.43 | 1.92 | ▁▇▇▅▅ |
V46 | 0 | 1 | 0 | 1 | -2.34 | -0.68 | -0.03 | 0.66 | 2.13 | ▁▇▇▇▃ |
V47 | 0 | 1 | 0 | 1 | -2.06 | -0.61 | 0.07 | 0.66 | 2.28 | ▂▆▇▆▁ |
V48 | 0 | 1 | 0 | 1 | -2.05 | -0.70 | -0.05 | 0.70 | 2.07 | ▃▆▇▆▂ |
V49 | 0 | 1 | 0 | 1 | -1.83 | -0.75 | 0.05 | 0.78 | 1.77 | ▅▅▅▇▃ |
V50 | 0 | 1 | 0 | 1 | -1.81 | -0.60 | 0.20 | 0.57 | 2.59 | ▃▃▇▂▁ |
V51 | 0 | 1 | 0 | 1 | -2.15 | -0.57 | 0.02 | 0.57 | 2.09 | ▂▃▇▃▂ |
V52 | 0 | 1 | 0 | 1 | -2.12 | -0.61 | 0.01 | 0.50 | 2.17 | ▃▆▇▅▃ |
V53 | 0 | 1 | 0 | 1 | -2.04 | -0.36 | 0.05 | 0.51 | 2.65 | ▃▆▇▃▁ |
V54 | 0 | 1 | 0 | 1 | -2.06 | -0.68 | -0.15 | 0.72 | 1.92 | ▃▆▇▇▅ |
V55 | 0 | 1 | 0 | 1 | -1.93 | -0.71 | -0.17 | 0.77 | 1.84 | ▃▆▇▇▅ |
V56 | 0 | 1 | 0 | 1 | -2.19 | -0.53 | 0.06 | 0.62 | 1.91 | ▂▃▇▅▃ |
V57 | 0 | 1 | 0 | 1 | -2.34 | -0.64 | 0.04 | 0.60 | 1.86 | ▂▃▇▇▅ |
V58 | 0 | 1 | 0 | 1 | -2.16 | -0.60 | -0.08 | 0.77 | 2.49 | ▃▇▇▇▁ |
V59 | 0 | 1 | 0 | 1 | -2.56 | -0.74 | -0.21 | 0.70 | 2.43 | ▁▅▇▆▁ |
V60 | 0 | 1 | 0 | 1 | -1.76 | -0.83 | -0.06 | 0.69 | 2.55 | ▅▇▇▅▁ |
###################################
# Verifying the data dimensions
# for the test set
###################################
dim(PMA_PreModelling_Test)
## [1] 40 61
1.5 Cost-Sensitive Learning Applied for Class Imbalance
1.5.1 Support Vector Machine - Radial Basis Function Kernel (SVM_R)
[A] The support vector machine (radial basis function kernel) model from the kernlab package was implemented through the caret package. This version of the model applied uniform case weights to the classes of the response variable (Class=M:1, Class=R:1); a minimal direct-kernlab sketch of how these case weights enter the model follows this summary.
[B] The model contains 2 hyperparameters:
[B.1] sigma = kernel width, held constant at a value of 0.00873
[B.2] C = cost, varied across a range from 2^(-6) to 2^(1)
[C] Specificity was used as the assessment metric to gauge the effect of cost-sensitive learning on the minority class.
[D] The cross-validated model performance of the final model is summarized as follows:
[D.1] Final model configuration involves sigma=0.00873 and C=2.00000
[D.2] Specificity = 0.60000
[E] The model does not allow for ranking of predictors in terms of variable importance.
[F] The independent test model performance of the final model is summarized as follows:
[F.1] Specificity = 0.14286
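For context on the uniform case weights in [A]: kernlab's ksvm accepts a class.weights argument, a named vector of per-class weights that effectively scales the misclassification cost for each class, and caret's train forwards it to ksvm. A minimal direct-kernlab sketch with the settings of the final model (the object name SVM_R_Direct is illustrative only):
##################################
# Minimal sketch (illustrative, not the caret-tuned model used below):
# fitting ksvm directly with uniform class weights
##################################
set.seed(12345678)
SVM_R_Direct <- ksvm(Class ~ .,
                     data = PMA_PreModelling_Train,
                     kernel = "rbfdot",
                     kpar = list(sigma = 0.00873),
                     C = 2,
                     class.weights = c(M = 1, R = 1))
SVM_R_Direct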
##################################
# Verifying the class distribution
# for the original data
##################################
table(PMA_PreModelling_Train$Class)
##
## M R
## 78 18
##################################
# Creating a function for
# a customized summary metrics
##################################
fourMetricSummary <- function (data, lev = levels(data$obs), model = NULL)
{
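# Custom summary passed to trainControl():
# - postResample() returns Accuracy and Kappa
# - sensitivity() treats the first factor level (M) as the event of interest
# - specificity() treats the second factor level (R) as the negative class
# Exposing "Specificity" as a named metric lets train() optimize on the minority class.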
accKapp <- postResample(data[, "pred"], data[, "obs"])
out <- c(accKapp,
sensitivity(data[, "pred"], data[, "obs"], lev[1]),
specificity(data[, "pred"], data[, "obs"], lev[2]))
names(out)[3:4] <- c("Sensitivity", "Specificity")
out
}
##################################
# Creating consistent fold assignments
# for the Cross Validation process
##################################
set.seed(12345678)
KFold_Control <- trainControl(method = "cv",
classProbs = FALSE,
summaryFunction = fourMetricSummary)
##################################
# Setting the conditions
# for hyperparameter tuning
##################################
##################################
# Pre-computing the sigma parameter
##################################
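# kernlab's sigest() returns three candidate values for the RBF kernel width
# (the 0.1, 0.5 and 0.9 quantiles of the estimated sigma distribution);
# sigma[2], the median, is used below and held constant across the tuning grid.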
set.seed(12345678)
sigma <- sigest(Class~.,
data=PMA_PreModelling_Train,
frac=0.75)
names(sigma)=NULL
SVM_R_Grid <- data.frame(sigma=sigma[2], C=2^seq(-6,1,length=15))
##################################
# Running the support vector machine (radial basis function kernel) model
# by setting the caret method to 'svmRadial'
##################################
set.seed(12345678)
SVM_R_Tune <- train(x = PMA_PreModelling_Train[,!names(PMA_PreModelling_Train) %in% c("Class")],
y = PMA_PreModelling_Train$Class,
method = "svmRadial",
tuneGrid = SVM_R_Grid,
metric = "Specificity",
preProc = c("center", "scale"),
trControl = KFold_Control,
class.weights = c(M=1,R=1))
##################################
# Reporting the cross-validation results
# for the train set
##################################
SVM_R_Tune
## Support Vector Machines with Radial Basis Function Kernel
##
## 96 samples
## 60 predictors
## 2 classes: 'M', 'R'
##
## Pre-processing: centered (60), scaled (60)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 86, 87, 86, 87, 87, 86, ...
## Resampling results across tuning parameters:
##
## C Accuracy Kappa Sensitivity Specificity
## 0.01562500 0.8133333 0.0000000 1 0.0
## 0.02209709 0.8133333 0.0000000 1 0.0
## 0.03125000 0.8133333 0.0000000 1 0.0
## 0.04419417 0.8133333 0.0000000 1 0.0
## 0.06250000 0.8133333 0.0000000 1 0.0
## 0.08838835 0.8133333 0.0000000 1 0.0
## 0.12500000 0.8133333 0.0000000 1 0.0
## 0.17677670 0.8133333 0.0000000 1 0.0
## 0.25000000 0.8133333 0.0000000 1 0.0
## 0.35355339 0.8133333 0.0000000 1 0.0
## 0.50000000 0.8133333 0.0000000 1 0.0
## 0.70710678 0.8133333 0.0000000 1 0.0
## 1.00000000 0.8655556 0.3224080 1 0.3
## 1.41421356 0.9066667 0.5678930 1 0.5
## 2.00000000 0.9277778 0.6454849 1 0.6
##
## Tuning parameter 'sigma' was held constant at a value of 0.008732089
## Specificity was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.008732089 and C = 2.
## Support Vector Machine object of class "ksvm"
##
## SV type: C-svc (classification)
## parameter : cost C = 2
##
## Gaussian Radial Basis kernel function.
## Hyperparameter : sigma = 0.00873208894584695
##
## Number of Support Vectors : 57
##
## Objective Function Value : -33.2466
## Training error : 0.020833
## sigma C Accuracy Kappa Sensitivity Specificity
## 1 0.008732089 0.01562500 0.8133333 0.0000000 1 0.0
## 2 0.008732089 0.02209709 0.8133333 0.0000000 1 0.0
## 3 0.008732089 0.03125000 0.8133333 0.0000000 1 0.0
## 4 0.008732089 0.04419417 0.8133333 0.0000000 1 0.0
## 5 0.008732089 0.06250000 0.8133333 0.0000000 1 0.0
## 6 0.008732089 0.08838835 0.8133333 0.0000000 1 0.0
## 7 0.008732089 0.12500000 0.8133333 0.0000000 1 0.0
## 8 0.008732089 0.17677670 0.8133333 0.0000000 1 0.0
## 9 0.008732089 0.25000000 0.8133333 0.0000000 1 0.0
## 10 0.008732089 0.35355339 0.8133333 0.0000000 1 0.0
## 11 0.008732089 0.50000000 0.8133333 0.0000000 1 0.0
## 12 0.008732089 0.70710678 0.8133333 0.0000000 1 0.0
## 13 0.008732089 1.00000000 0.8655556 0.3224080 1 0.3
## 14 0.008732089 1.41421356 0.9066667 0.5678930 1 0.5
## 15 0.008732089 2.00000000 0.9277778 0.6454849 1 0.6
## AccuracySD KappaSD SensitivitySD SpecificitySD
## 1 0.04084163 0.0000000 0 0.0000000
## 2 0.04084163 0.0000000 0 0.0000000
## 3 0.04084163 0.0000000 0 0.0000000
## 4 0.04084163 0.0000000 0 0.0000000
## 5 0.04084163 0.0000000 0 0.0000000
## 6 0.04084163 0.0000000 0 0.0000000
## 7 0.04084163 0.0000000 0 0.0000000
## 8 0.04084163 0.0000000 0 0.0000000
## 9 0.04084163 0.0000000 0 0.0000000
## 10 0.04084163 0.0000000 0 0.0000000
## 11 0.04084163 0.0000000 0 0.0000000
## 12 0.04084163 0.0000000 0 0.0000000
## 13 0.08387887 0.4358567 0 0.4216370
## 14 0.05766371 0.3384260 0 0.3333333
## 15 0.06874337 0.3858752 0 0.3944053
(SVM_R_Train_Specificity <- SVM_R_Tune$results[SVM_R_Tune$results$C==SVM_R_Tune$bestTune$C,
c("Specificity")])
## [1] 0.6
SVM_R_Train <- data.frame(SVM_R_Observed = PMA_PreModelling_Train$Class,
SVM_R_Predicted = predict(SVM_R_Tune,
PMA_PreModelling_Train[,!names(PMA_PreModelling_Train) %in% c("Class")],
type = "raw"))
(SVM_R_Train_ConfusionMatrix <- confusionMatrix(data = SVM_R_Train$SVM_R_Predicted,
reference = SVM_R_Train$SVM_R_Observed))
## Confusion Matrix and Statistics
##
## Reference
## Prediction M R
## M 78 2
## R 0 16
##
## Accuracy : 0.9792
## 95% CI : (0.9268, 0.9975)
## No Information Rate : 0.8125
## P-Value [Acc > NIR] : 5.86e-07
##
## Kappa : 0.9286
##
## Mcnemar's Test P-Value : 0.4795
##
## Sensitivity : 1.0000
## Specificity : 0.8889
## Pos Pred Value : 0.9750
## Neg Pred Value : 1.0000
## Prevalence : 0.8125
## Detection Rate : 0.8125
## Detection Prevalence : 0.8333
## Balanced Accuracy : 0.9444
##
## 'Positive' Class : M
##
##################################
# Identifying and plotting the
# best model predictors
##################################
# model does not support variable importance measurement
##################################
# Independently evaluating the model
# on the test set
##################################
SVM_R_Test <- data.frame(SVM_R_Observed = PMA_PreModelling_Test$Class,
SVM_R_Predicted = predict(SVM_R_Tune,
PMA_PreModelling_Test[,!names(PMA_PreModelling_Test) %in% c("Class")],
type = "raw"))
SVM_R_Test
## SVM_R_Observed SVM_R_Predicted
## 1 M M
## 2 M M
## 3 M M
## 4 M M
## 5 M M
## 6 M M
## 7 M M
## 8 M M
## 9 M M
## 10 M M
## 11 M M
## 12 M M
## 13 M M
## 14 M M
## 15 M M
## 16 M M
## 17 M M
## 18 M M
## 19 M M
## 20 M M
## 21 M M
## 22 M M
## 23 M M
## 24 M R
## 25 M M
## 26 M M
## 27 M M
## 28 M M
## 29 M M
## 30 M M
## 31 M M
## 32 M M
## 33 M M
## 34 R M
## 35 R M
## 36 R M
## 37 R M
## 38 R M
## 39 R R
## 40 R M
##################################
# Reporting the independent evaluation results
# for the test set
##################################
(SVM_R_Test_Specificity <- Specificity(y_pred = SVM_R_Test$SVM_R_Predicted,
y_true = SVM_R_Test$SVM_R_Observed))
## [1] 0.1428571
(SVM_R_Test_ConfusionMatrix <- confusionMatrix(data = SVM_R_Test$SVM_R_Predicted,
reference = SVM_R_Test$SVM_R_Observed))
## Confusion Matrix and Statistics
##
## Reference
## Prediction M R
## M 32 6
## R 1 1
##
## Accuracy : 0.825
## 95% CI : (0.6722, 0.9266)
## No Information Rate : 0.825
## P-Value [Acc > NIR] : 0.5992
##
## Kappa : 0.1566
##
## Mcnemar's Test P-Value : 0.1306
##
## Sensitivity : 0.9697
## Specificity : 0.1429
## Pos Pred Value : 0.8421
## Neg Pred Value : 0.5000
## Prevalence : 0.8250
## Detection Rate : 0.8000
## Detection Prevalence : 0.9500
## Balanced Accuracy : 0.5563
##
## 'Positive' Class : M
##
1.5.2 Class-Weighted Support Vector Machine - Radial Basis Function Kernel (CW_SVM_R)
[A] The support vector machine (radial basis function kernel) model from the kernlab package was implemented through the caret package. This version of the model applied non-uniform case weights to the classes of the response variable (Class=M:1, Class=R:4); a minimal sketch of deriving such weights from the class frequencies follows this summary.
[B] The model contains 2 hyperparameters:
[B.1] sigma = kernel width, held constant at a value of 0.00873
[B.2] C = cost, varied across a range from 2^(-6) to 2^(1)
[C] Specificity was used as the assessment metric to gauge the effect of cost-sensitive learning on the minority class.
[D] The cross-validated model performance of the final model is summarized as follows:
[D.1] Final model configuration involves sigma=0.00873 and C=0.35355
[D.2] Specificity = 0.65000
[E] The model does not allow for ranking of predictors in terms of variable importance.
[F] The independent test model performance of the final model is summarized as follows:
[F.1] Specificity = 0.28571
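The 1:4 case weighting in [A] roughly mirrors the train-set imbalance of 78 M to 18 R observations (about 4.3:1). A minimal sketch, assuming one preferred to derive the weights from the class frequencies instead of fixing them (the object name CW_SVM_R_Weights is illustrative only):
##################################
# Minimal sketch (illustrative): deriving class weights from the class frequencies
##################################
ClassCounts <- table(PMA_PreModelling_Train$Class)
(CW_SVM_R_Weights <- round(max(ClassCounts) / ClassCounts, 2))
## expected: M = 1.00, R = 4.33
## these could be supplied as class.weights = c(M = 1, R = 4.33) in the train() call below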
##################################
# Verifying the class distribution
# for the original data
##################################
table(PMA_PreModelling_Train$Class)
##
## M R
## 78 18
##################################
# Creating a function for
# a customized summary metrics
##################################
fourMetricSummary <- function (data, lev = levels(data$obs), model = NULL)
{
accKapp <- postResample(data[, "pred"], data[, "obs"])
out <- c(accKapp,
sensitivity(data[, "pred"], data[, "obs"], lev[1]),
specificity(data[, "pred"], data[, "obs"], lev[2]))
names(out)[3:4] <- c("Sensitivity", "Specificity")
out
}
##################################
# Creating consistent fold assignments
# for the Cross Validation process
##################################
set.seed(12345678)
KFold_Control <- trainControl(method = "cv",
classProbs = FALSE,
summaryFunction = fourMetricSummary)
##################################
# Setting the conditions
# for hyperparameter tuning
##################################
##################################
# Pre-computing the sigma parameter
##################################
set.seed(12345678)
sigma <- sigest(Class~.,
data=PMA_PreModelling_Train,
frac=0.75)
names(sigma)=NULL
CW_SVM_R_Grid <- data.frame(sigma=sigma[2], C=2^seq(-6,1,length=15))
##################################
# Running the support vector machine (radial basis function kernel) model
# by setting the caret method to 'svmRadial'
##################################
set.seed(12345678)
CW_SVM_R_Tune <- train(x = PMA_PreModelling_Train[,!names(PMA_PreModelling_Train) %in% c("Class")],
y = PMA_PreModelling_Train$Class,
method = "svmRadial",
tuneGrid = CW_SVM_R_Grid,
metric = "Specificity",
preProc = c("center", "scale"),
trControl = KFold_Control,
class.weights = c(M=1,R=4))
##################################
# Reporting the cross-validation results
# for the train set
##################################
CW_SVM_R_Tune
## Support Vector Machines with Radial Basis Function Kernel
##
## 96 samples
## 60 predictors
## 2 classes: 'M', 'R'
##
## Pre-processing: centered (60), scaled (60)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 86, 87, 86, 87, 87, 86, ...
## Resampling results across tuning parameters:
##
## C Accuracy Kappa Sensitivity Specificity
## 0.01562500 0.8133333 0.0000000 1.0000000 0.00
## 0.02209709 0.8133333 0.0000000 1.0000000 0.00
## 0.03125000 0.8133333 0.0000000 1.0000000 0.00
## 0.04419417 0.8133333 0.0000000 1.0000000 0.00
## 0.06250000 0.8133333 0.0000000 1.0000000 0.00
## 0.08838835 0.8444444 0.2230769 1.0000000 0.20
## 0.12500000 0.8633333 0.4848567 0.9357143 0.55
## 0.17677670 0.8422222 0.4554315 0.8964286 0.60
## 0.25000000 0.8322222 0.4389841 0.8839286 0.60
## 0.35355339 0.8633333 0.5094884 0.9089286 0.65
## 0.50000000 0.8844444 0.5566738 0.9339286 0.65
## 0.70710678 0.8944444 0.5758125 0.9464286 0.65
## 1.00000000 0.9166667 0.6324754 0.9732143 0.65
## 1.41421356 0.9066667 0.5940139 0.9732143 0.60
## 2.00000000 0.9066667 0.5940139 0.9732143 0.60
##
## Tuning parameter 'sigma' was held constant at a value of 0.008732089
## Specificity was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.008732089 and C = 0.3535534.
## Support Vector Machine object of class "ksvm"
##
## SV type: C-svc (classification)
## parameter : cost C = 0.353553390593274
##
## Gaussian Radial Basis kernel function.
## Hyperparameter : sigma = 0.00873208894584695
##
## Number of Support Vectors : 78
##
## Objective Function Value : -25.2282
## Training error : 0.020833
## sigma C Accuracy Kappa Sensitivity Specificity
## 1 0.008732089 0.01562500 0.8133333 0.0000000 1.0000000 0.00
## 2 0.008732089 0.02209709 0.8133333 0.0000000 1.0000000 0.00
## 3 0.008732089 0.03125000 0.8133333 0.0000000 1.0000000 0.00
## 4 0.008732089 0.04419417 0.8133333 0.0000000 1.0000000 0.00
## 5 0.008732089 0.06250000 0.8133333 0.0000000 1.0000000 0.00
## 6 0.008732089 0.08838835 0.8444444 0.2230769 1.0000000 0.20
## 7 0.008732089 0.12500000 0.8633333 0.4848567 0.9357143 0.55
## 8 0.008732089 0.17677670 0.8422222 0.4554315 0.8964286 0.60
## 9 0.008732089 0.25000000 0.8322222 0.4389841 0.8839286 0.60
## 10 0.008732089 0.35355339 0.8633333 0.5094884 0.9089286 0.65
## 11 0.008732089 0.50000000 0.8844444 0.5566738 0.9339286 0.65
## 12 0.008732089 0.70710678 0.8944444 0.5758125 0.9464286 0.65
## 13 0.008732089 1.00000000 0.9166667 0.6324754 0.9732143 0.65
## 14 0.008732089 1.41421356 0.9066667 0.5940139 0.9732143 0.60
## 15 0.008732089 2.00000000 0.9066667 0.5940139 0.9732143 0.60
## AccuracySD KappaSD SensitivitySD SpecificitySD
## 1 0.04084163 0.0000000 0.00000000 0.0000000
## 2 0.04084163 0.0000000 0.00000000 0.0000000
## 3 0.04084163 0.0000000 0.00000000 0.0000000
## 4 0.04084163 0.0000000 0.00000000 0.0000000
## 5 0.04084163 0.0000000 0.00000000 0.0000000
## 6 0.07388866 0.3741306 0.00000000 0.3496029
## 7 0.10331276 0.3537822 0.12262483 0.3689324
## 8 0.11814757 0.3704066 0.13410702 0.3944053
## 9 0.12608378 0.3779497 0.14192404 0.3944053
## 10 0.10331276 0.3644062 0.12441758 0.4116363
## 11 0.10659977 0.3886770 0.11357979 0.4116363
## 12 0.10240590 0.3927539 0.09671474 0.4116363
## 13 0.08318918 0.3982785 0.05662589 0.4116363
## 14 0.07790114 0.3768373 0.05662589 0.3944053
## 15 0.07790114 0.3768373 0.05662589 0.3944053
(CW_SVM_R_Train_Specificity <- CW_SVM_R_Tune$results[CW_SVM_R_Tune$results$C==CW_SVM_R_Tune$bestTune$C,
c("Specificity")])
## [1] 0.65
CW_SVM_R_Train <- data.frame(CW_SVM_R_Observed = PMA_PreModelling_Train$Class,
CW_SVM_R_Predicted = predict(CW_SVM_R_Tune,
PMA_PreModelling_Train[,!names(PMA_PreModelling_Train) %in% c("Class")],
type = "raw"))
(CW_SVM_R_Train_ConfusionMatrix <- confusionMatrix(data = CW_SVM_R_Train$CW_SVM_R_Predicted,
reference = CW_SVM_R_Train$CW_SVM_R_Observed))
## Confusion Matrix and Statistics
##
## Reference
## Prediction M R
## M 77 1
## R 1 17
##
## Accuracy : 0.9792
## 95% CI : (0.9268, 0.9975)
## No Information Rate : 0.8125
## P-Value [Acc > NIR] : 5.86e-07
##
## Kappa : 0.9316
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9872
## Specificity : 0.9444
## Pos Pred Value : 0.9872
## Neg Pred Value : 0.9444
## Prevalence : 0.8125
## Detection Rate : 0.8021
## Detection Prevalence : 0.8125
## Balanced Accuracy : 0.9658
##
## 'Positive' Class : M
##
##################################
# Identifying and plotting the
# best model predictors
##################################
# model does not support variable importance measurement
##################################
# Independently evaluating the model
# on the test set
##################################
CW_SVM_R_Test <- data.frame(CW_SVM_R_Observed = PMA_PreModelling_Test$Class,
CW_SVM_R_Predicted = predict(CW_SVM_R_Tune,
PMA_PreModelling_Test[,!names(PMA_PreModelling_Test) %in% c("Class")],
type = "raw"))
CW_SVM_R_Test
## CW_SVM_R_Observed CW_SVM_R_Predicted
## 1 M M
## 2 M M
## 3 M R
## 4 M M
## 5 M R
## 6 M M
## 7 M M
## 8 M M
## 9 M M
## 10 M M
## 11 M M
## 12 M M
## 13 M M
## 14 M M
## 15 M M
## 16 M M
## 17 M M
## 18 M M
## 19 M M
## 20 M M
## 21 M M
## 22 M M
## 23 M M
## 24 M R
## 25 M M
## 26 M M
## 27 M M
## 28 M M
## 29 M M
## 30 M M
## 31 M M
## 32 M M
## 33 M M
## 34 R M
## 35 R M
## 36 R R
## 37 R M
## 38 R M
## 39 R R
## 40 R M
##################################
# Reporting the independent evaluation results
# for the test set
##################################
(CW_SVM_R_Test_Specificity <- Specificity(y_pred = CW_SVM_R_Test$CW_SVM_R_Predicted,
y_true = CW_SVM_R_Test$CW_SVM_R_Observed))
## [1] 0.2857143
(CW_SVM_R_Test_ConfusionMatrix <- confusionMatrix(data = CW_SVM_R_Test$CW_SVM_R_Predicted,
reference = CW_SVM_R_Test$CW_SVM_R_Observed))
## Confusion Matrix and Statistics
##
## Reference
## Prediction M R
## M 30 5
## R 3 2
##
## Accuracy : 0.8
## 95% CI : (0.6435, 0.9095)
## No Information Rate : 0.825
## P-Value [Acc > NIR] : 0.7427
##
## Kappa : 0.2195
##
## Mcnemar's Test P-Value : 0.7237
##
## Sensitivity : 0.9091
## Specificity : 0.2857
## Pos Pred Value : 0.8571
## Neg Pred Value : 0.4000
## Prevalence : 0.8250
## Detection Rate : 0.7500
## Detection Prevalence : 0.8750
## Balanced Accuracy : 0.5974
##
## 'Positive' Class : M
##
1.5.3 Classification and Regression Trees (CART)
[A] The classification and regression trees model from the rpart package was implemented through the caret package. This version of the model applied uniform case weights to the classes of the response variable (Class=M:1, Class=R:1).
[B] The model contains 1 hyperparameter:
[B.1] cp = complexity parameter threshold, varied across a range of values from 0.0001 to 0.0200
[C] Specificity was used as the assessment metric to gauge the effect of cost-sensitive learning on the minority class.
[D] The cross-validated model performance of the final model is summarized as follows:
[D.1] Final model configuration involves cp=0.0200
[D.2] Specificity = 0.45000
[E] The model allows for ranking of predictors in terms of variable importance. The top-performing predictors in the model are as follows (a sketch for visualizing the final fitted tree follows this summary):
[E.1] V49 variable (numeric)
[E.2] V11 variable (numeric)
[E.3] V21 variable (numeric)
[E.4] V22 variable (numeric)
[E.5] V10 variable (numeric)
[F] The independent test model performance of the final model is summarized as follows:
[F.1] Specificity = 0.28571
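Once the model below has been trained, the final CART fit is stored as a plain rpart object in CART_Tune$finalModel and can be drawn with the rattle and rpart.plot packages loaded earlier; a minimal sketch (plot labels are illustrative only):
##################################
# Minimal sketch (illustrative): visualizing the final fitted CART tree
##################################
fancyRpartPlot(CART_Tune$finalModel,
               main = "Final CART Model (cp = 0.02)",
               sub  = "")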
##################################
# Verifying the class distribution
# for the original data
##################################
table(PMA_PreModelling_Train$Class)
##
## M R
## 78 18
##################################
# Creating a function for
# a customized summary metrics
##################################
fourMetricSummary <- function (data, lev = levels(data$obs), model = NULL)
{
accKapp <- postResample(data[, "pred"], data[, "obs"])
out <- c(accKapp,
sensitivity(data[, "pred"], data[, "obs"], lev[1]),
specificity(data[, "pred"], data[, "obs"], lev[2]))
names(out)[3:4] <- c("Sensitivity", "Specificity")
out
}
##################################
# Creating consistent fold assignments
# for the Cross Validation process
##################################
set.seed(12345678)
KFold_Control <- trainControl(method = "cv",
classProbs = FALSE,
summaryFunction = fourMetricSummary)
##################################
# Setting the conditions
# for hyperparameter tuning
##################################
CART_Grid = data.frame(cp = c(0.0001, 0.0005, 0.001, 0.005, 0.010, 0.015, 0.020))
##################################
# Formulating the cost matrix
##################################
CART_CostMatrix <- matrix(c(0,1,1,0), ncol=2)
rownames(CART_CostMatrix) <- levels(PMA_PreModelling_Train$Class)
colnames(CART_CostMatrix) <- levels(PMA_PreModelling_Train$Class)
CART_CostMatrix
## M R
## M 0 1
## R 1 0
##################################
# Running the classification and regression trees model
# by setting the caret method to 'rpart'
##################################
set.seed(12345678)
CART_Tune <- train(x = PMA_PreModelling_Train[,!names(PMA_PreModelling_Train) %in% c("Class")],
y = PMA_PreModelling_Train$Class,
method = "rpart",
tuneGrid = CART_Grid,
metric = "Specificity",
trControl = KFold_Control,
parms = list(loss=CART_CostMatrix))
##################################
# Reporting the cross-validation results
# for the train set
##################################
CART_Tune
## CART
##
## 96 samples
## 60 predictors
## 2 classes: 'M', 'R'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 86, 87, 86, 87, 87, 86, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa Sensitivity Specificity
## 0.0001 0.7922222 0.2829196 0.8714286 0.45
## 0.0005 0.7922222 0.2829196 0.8714286 0.45
## 0.0010 0.7922222 0.2829196 0.8714286 0.45
## 0.0050 0.7922222 0.2829196 0.8714286 0.45
## 0.0100 0.7922222 0.2829196 0.8714286 0.45
## 0.0150 0.7922222 0.2829196 0.8714286 0.45
## 0.0200 0.7922222 0.2829196 0.8714286 0.45
##
## Specificity was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.02.
## n= 96
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 96 18 M (0.81250000 0.18750000)
## 2) V11>=-1.453302 87 10 M (0.88505747 0.11494253)
## 4) V49>=-1.421354 79 5 M (0.93670886 0.06329114) *
## 5) V49< -1.421354 8 3 R (0.37500000 0.62500000) *
## 3) V11< -1.453302 9 1 R (0.11111111 0.88888889) *
## cp Accuracy Kappa Sensitivity Specificity AccuracySD KappaSD
## 1 0.0001 0.7922222 0.2829196 0.8714286 0.45 0.1193317 0.4149074
## 2 0.0005 0.7922222 0.2829196 0.8714286 0.45 0.1193317 0.4149074
## 3 0.0010 0.7922222 0.2829196 0.8714286 0.45 0.1193317 0.4149074
## 4 0.0050 0.7922222 0.2829196 0.8714286 0.45 0.1193317 0.4149074
## 5 0.0100 0.7922222 0.2829196 0.8714286 0.45 0.1193317 0.4149074
## 6 0.0150 0.7922222 0.2829196 0.8714286 0.45 0.1193317 0.4149074
## 7 0.0200 0.7922222 0.2829196 0.8714286 0.45 0.1193317 0.4149074
## SensitivitySD SpecificitySD
## 1 0.1225092 0.4377975
## 2 0.1225092 0.4377975
## 3 0.1225092 0.4377975
## 4 0.1225092 0.4377975
## 5 0.1225092 0.4377975
## 6 0.1225092 0.4377975
## 7 0.1225092 0.4377975
(CART_Train_Specificity <- CART_Tune$results[CART_Tune$results$cp==CART_Tune$bestTune$cp,
c("Specificity")])
## [1] 0.45
CART_Train <- data.frame(CART_Observed = PMA_PreModelling_Train$Class,
CART_Predicted = predict(CART_Tune,
PMA_PreModelling_Train[,!names(PMA_PreModelling_Train) %in% c("Class")],
type = "raw"))
(CART_Train_ConfusionMatrix <- confusionMatrix(data = CART_Train$CART_Predicted,
reference = CART_Train$CART_Observed))
## Confusion Matrix and Statistics
##
## Reference
## Prediction M R
## M 74 5
## R 4 13
##
## Accuracy : 0.9062
## 95% CI : (0.8295, 0.9562)
## No Information Rate : 0.8125
## P-Value [Acc > NIR] : 0.008989
##
## Kappa : 0.6856
##
## Mcnemar's Test P-Value : 1.000000
##
## Sensitivity : 0.9487
## Specificity : 0.7222
## Pos Pred Value : 0.9367
## Neg Pred Value : 0.7647
## Prevalence : 0.8125
## Detection Rate : 0.7708
## Detection Prevalence : 0.8229
## Balanced Accuracy : 0.8355
##
## 'Positive' Class : M
##
##################################
# Identifying and plotting the
# best model predictors
##################################
CART_VarImp <- varImp(CART_Tune, scale = TRUE)
plot(CART_VarImp,
top=25,
scales=list(y=list(cex = .95)),
main="Ranked Variable Importance : Classification and Regression Trees",
xlab="Scaled Variable Importance Metrics",
ylab="Predictors",
cex=2,
origin=0,
alpha=0.45)

##################################
# Independently evaluating the model
# on the test set
##################################
CART_Test <- data.frame(CART_Observed = PMA_PreModelling_Test$Class,
CART_Predicted = predict(CART_Tune,
PMA_PreModelling_Test[,!names(PMA_PreModelling_Test) %in% c("Class")],
type = "raw"))
CART_Test
## CART_Observed CART_Predicted
## 1 M M
## 2 M M
## 3 M M
## 4 M M
## 5 M M
## 6 M M
## 7 M R
## 8 M R
## 9 M M
## 10 M M
## 11 M M
## 12 M M
## 13 M M
## 14 M M
## 15 M M
## 16 M M
## 17 M M
## 18 M R
## 19 M M
## 20 M M
## 21 M R
## 22 M R
## 23 M M
## 24 M M
## 25 M M
## 26 M M
## 27 M M
## 28 M M
## 29 M M
## 30 M M
## 31 M M
## 32 M M
## 33 M M
## 34 R M
## 35 R R
## 36 R R
## 37 R M
## 38 R M
## 39 R M
## 40 R M
##################################
# Reporting the independent evaluation results
# for the test set
##################################
(CART_Test_Specificity <- Specificity(y_pred = CART_Test$CART_Predicted,
y_true = CART_Test$CART_Observed))
## [1] 0.2857143
(CART_Test_ConfusionMatrix <- confusionMatrix(data = CART_Test$CART_Predicted,
reference = CART_Test$CART_Observed))
## Confusion Matrix and Statistics
##
## Reference
## Prediction M R
## M 28 5
## R 5 2
##
## Accuracy : 0.75
## 95% CI : (0.588, 0.8731)
## No Information Rate : 0.825
## P-Value [Acc > NIR] : 0.922
##
## Kappa : 0.1342
##
## Mcnemar's Test P-Value : 1.000
##
## Sensitivity : 0.8485
## Specificity : 0.2857
## Pos Pred Value : 0.8485
## Neg Pred Value : 0.2857
## Prevalence : 0.8250
## Detection Rate : 0.7000
## Detection Prevalence : 0.8250
## Balanced Accuracy : 0.5671
##
## 'Positive' Class : M
##
1.5.4 Cost-Sensitive Classification and Regression Trees (CS_CART)
[A] The classification and regression trees model from the rpart package was implemented through the caret package. This version of the model applied non-uniform case weights to the classes of the response variable (Class=M:1, Class=R:4); a minimal sketch of deriving such a cost from the class frequencies follows this summary.
[B] The model contains 1 hyperparameter:
[B.1] cp = complexity parameter threshold, varied across a range of values from 0.0001 to 0.0200
[C] Specificity was used as the assessment metric to gauge the effect of cost-sensitive learning on the minority class.
[D] The cross-validated model performance of the final model is summarized as follows:
[D.1] Final model configuration involves cp=0.0200
[D.2] Specificity = 0.30000
[E] The model allows for ranking of predictors in terms of variable importance. The top-performing predictors in the model are as follows:
[E.1] V21 variable (numeric)
[E.2] V11 variable (numeric)
[E.3] V10 variable (numeric)
[E.4] V35 variable (numeric)
[E.5] V48 variable (numeric)
[F] The independent test model performance of the final model is summarized as follows:
[F.1] Specificity = 0.71428
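The cost of 4 for a missed Class=R case in [A] is consistent with the train-set imbalance of 78 M to 18 R observations; a minimal sketch of deriving such a cost from the data (the object name CS_CART_DerivedCost is illustrative only), which matches the off-diagonal entry placed in the CS_CART_CostMatrix constructed below:
##################################
# Minimal sketch (illustrative): deriving the misclassification cost from the class frequencies
##################################
ClassCounts <- table(PMA_PreModelling_Train$Class)
(CS_CART_DerivedCost <- round(max(ClassCounts) / min(ClassCounts)))
## expected: 4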
##################################
# Verifying the class distribution
# for the original data
##################################
table(PMA_PreModelling_Train$Class)
##
## M R
## 78 18
##################################
# Creating a function for
# a customized summary metrics
##################################
fourMetricSummary <- function (data, lev = levels(data$obs), model = NULL)
{
accKapp <- postResample(data[, "pred"], data[, "obs"])
out <- c(accKapp,
sensitivity(data[, "pred"], data[, "obs"], lev[1]),
specificity(data[, "pred"], data[, "obs"], lev[2]))
names(out)[3:4] <- c("Sensitivity", "Specificity")
out
}
##################################
# Creating consistent fold assignments
# for the Cross Validation process
##################################
set.seed(12345678)
KFold_Control <- trainControl(method = "cv",
classProbs = FALSE,
summaryFunction = fourMetricSummary)
##################################
# Setting the conditions
# for hyperparameter tuning
##################################
CS_CART_Grid = data.frame(cp = c(0.0001, 0.0005, 0.0010, 0.0050, 0.0100, 0.0150, 0.0200))
##################################
# Formulating the cost matrix
##################################
CS_CART_CostMatrix <- matrix(c(0,4,1,0), ncol=2)
rownames(CS_CART_CostMatrix) <- levels(PMA_PreModelling_Train$Class)
colnames(CS_CART_CostMatrix) <- levels(PMA_PreModelling_Train$Class)
CS_CART_CostMatrix
## M R
## M 0 1
## R 4 0
##################################
# Running the classification and regression trees model
# by setting the caret method to 'rpart'
##################################
set.seed(12345678)
CS_CART_Tune <- train(x = PMA_PreModelling_Train[,!names(PMA_PreModelling_Train) %in% c("Class")],
y = PMA_PreModelling_Train$Class,
method = "rpart",
tuneGrid = CS_CART_Grid,
metric = "Specificity",
trControl = KFold_Control,
parms = list(loss=CS_CART_CostMatrix))
##################################
# Reporting the cross-validation results
# for the train set
##################################
CS_CART_Tune
## CART
##
## 96 samples
## 60 predictors
## 2 classes: 'M', 'R'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 86, 87, 86, 87, 87, 86, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa Sensitivity Specificity
## 0.0001 0.76 0.169608 0.8535714 0.3
## 0.0005 0.76 0.169608 0.8535714 0.3
## 0.0010 0.76 0.169608 0.8535714 0.3
## 0.0050 0.76 0.169608 0.8535714 0.3
## 0.0100 0.76 0.169608 0.8535714 0.3
## 0.0150 0.76 0.169608 0.8535714 0.3
## 0.0200 0.76 0.169608 0.8535714 0.3
##
## Specificity was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.02.
## n= 96
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 96 72 M (0.81250000 0.18750000)
## 2) V11>=-0.92108 78 28 M (0.91025641 0.08974359)
## 4) V49>=-1.421354 71 12 M (0.95774648 0.04225352) *
## 5) V49< -1.421354 7 3 R (0.42857143 0.57142857) *
## 3) V11< -0.92108 18 7 R (0.38888889 0.61111111) *
## cp Accuracy Kappa Sensitivity Specificity AccuracySD KappaSD
## 1 0.0001 0.76 0.169608 0.8535714 0.3 0.1607766 0.3396056
## 2 0.0005 0.76 0.169608 0.8535714 0.3 0.1607766 0.3396056
## 3 0.0010 0.76 0.169608 0.8535714 0.3 0.1607766 0.3396056
## 4 0.0050 0.76 0.169608 0.8535714 0.3 0.1607766 0.3396056
## 5 0.0100 0.76 0.169608 0.8535714 0.3 0.1607766 0.3396056
## 6 0.0150 0.76 0.169608 0.8535714 0.3 0.1607766 0.3396056
## 7 0.0200 0.76 0.169608 0.8535714 0.3 0.1607766 0.3396056
## SensitivitySD SpecificitySD
## 1 0.1921061 0.3496029
## 2 0.1921061 0.3496029
## 3 0.1921061 0.3496029
## 4 0.1921061 0.3496029
## 5 0.1921061 0.3496029
## 6 0.1921061 0.3496029
## 7 0.1921061 0.3496029
(CS_CART_Train_Specificity <- CS_CART_Tune$results[CS_CART_Tune$results$cp==CS_CART_Tune$bestTune$cp,
c("Specificity")])
## [1] 0.3
CS_CART_Train <- data.frame(CS_CART_Observed = PMA_PreModelling_Train$Class,
CS_CART_Predicted = predict(CS_CART_Tune,
PMA_PreModelling_Train[,!names(PMA_PreModelling_Train) %in% c("Class")],
type = "raw"))
(CS_CART_Train_ConfusionMatrix <- confusionMatrix(data = CS_CART_Train$CS_CART_Predicted,
reference = CS_CART_Train$CS_CART_Observed))
## Confusion Matrix and Statistics
##
## Reference
## Prediction M R
## M 68 3
## R 10 15
##
## Accuracy : 0.8646
## 95% CI : (0.7796, 0.9259)
## No Information Rate : 0.8125
## P-Value [Acc > NIR] : 0.11704
##
## Kappa : 0.6134
##
## Mcnemar's Test P-Value : 0.09609
##
## Sensitivity : 0.8718
## Specificity : 0.8333
## Pos Pred Value : 0.9577
## Neg Pred Value : 0.6000
## Prevalence : 0.8125
## Detection Rate : 0.7083
## Detection Prevalence : 0.7396
## Balanced Accuracy : 0.8526
##
## 'Positive' Class : M
##
##################################
# Identifying and plotting the
# best model predictors
##################################
CS_CART_VarImp <- varImp(CS_CART_Tune, scale = TRUE)
plot(CS_CART_VarImp,
top=25,
scales=list(y=list(cex = .95)),
main="Ranked Variable Importance : Cost-Sensitive Classification and Regression Trees",
xlab="Scaled Variable Importance Metrics",
ylab="Predictors",
cex=2,
origin=0,
alpha=0.45)
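The numeric importances behind the plot can also be inspected directly from the varImp object; a minimal sketch (the Overall column name is assumed from caret's rpart importance method):
# Top five predictors by scaled importance (assumed column name: Overall)
CS_CART_VarImp_Table <- CS_CART_VarImp$importance
head(CS_CART_VarImp_Table[order(-CS_CART_VarImp_Table$Overall), , drop = FALSE], 5)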

##################################
# Independently evaluating the model
# on the test set
##################################
CS_CART_Test <- data.frame(CS_CART_Observed = PMA_PreModelling_Test$Class,
CS_CART_Predicted = predict(CS_CART_Tune,
PMA_PreModelling_Test[,!names(PMA_PreModelling_Test) %in% c("Class")],
type = "raw"))
CS_CART_Test
## CS_CART_Observed CS_CART_Predicted
## 1 M R
## 2 M M
## 3 M M
## 4 M M
## 5 M M
## 6 M M
## 7 M R
## 8 M R
## 9 M R
## 10 M M
## 11 M M
## 12 M M
## 13 M M
## 14 M M
## 15 M M
## 16 M M
## 17 M M
## 18 M R
## 19 M M
## 20 M M
## 21 M R
## 22 M R
## 23 M M
## 24 M M
## 25 M M
## 26 M M
## 27 M M
## 28 M M
## 29 M M
## 30 M M
## 31 M M
## 32 M M
## 33 M M
## 34 R M
## 35 R R
## 36 R R
## 37 R R
## 38 R M
## 39 R R
## 40 R R
##################################
# Reporting the independent evaluation results
# for the test set
##################################
(CS_CART_Test_Specificity <- Specificity(y_pred = CS_CART_Test$CS_CART_Predicted,
y_true = CS_CART_Test$CS_CART_Observed))
## [1] 0.7142857
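As a cross-check, the same value can be recovered by hand from the test-set cross-tabulation, since specificity here is the proportion of observed R cases predicted as R:
# Manual specificity computation from the cross-tabulation (deterministic)
CS_CART_Test_Table <- table(Predicted = CS_CART_Test$CS_CART_Predicted,
                            Observed = CS_CART_Test$CS_CART_Observed)
CS_CART_Test_Table["R","R"] / sum(CS_CART_Test_Table[,"R"])
# expected: 0.7142857 (5 of the 7 observed R cases correctly predicted)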
(CS_CART_Test_ConfusionMatrix <- confusionMatrix(data = CS_CART_Test$CS_CART_Predicted,
reference = CS_CART_Test$CS_CART_Observed))
## Confusion Matrix and Statistics
##
## Reference
## Prediction M R
## M 26 2
## R 7 5
##
## Accuracy : 0.775
## 95% CI : (0.6155, 0.8916)
## No Information Rate : 0.825
## P-Value [Acc > NIR] : 0.8509
##
## Kappa : 0.3919
##
## Mcnemar's Test P-Value : 0.1824
##
## Sensitivity : 0.7879
## Specificity : 0.7143
## Pos Pred Value : 0.9286
## Neg Pred Value : 0.4167
## Prevalence : 0.8250
## Detection Rate : 0.6500
## Detection Prevalence : 0.7000
## Balanced Accuracy : 0.7511
##
## 'Positive' Class : M
##
1.5.5 C5.0 Decision Trees (C50)
[A] The C5.0 decision trees model from the
C50 and
plyr
packages was implemented through the
caret
package. This version of the model applied uniform misclassification costs
between the classes of the response variable (Class=M:1, Class=R:1).
[B] The model contains 3 hyperparameters:
[B.1] trials =
number of boosting iterations made to vary across a range of values
from 1 to 100
[B.2] model =
model type made to vary across the levels TREE and RULES
[B.3] winnow =
winnowing option made to vary across the levels TRUE and FALSE
[C] Specificity was used as the assessment metric to
evaluate the effect of cost-sensitive learning on the minority class.
[D] The cross-validated model performance of the final
model is summarized as follows:
[D.1] Final model configuration involves trials=1,
model=TREE and winnow=FALSE
[D.2] Specificity = 0.55000
[E] The model allows for ranking of predictors in terms
of variable importance. The top-performing predictors in the model are
as follows:
[E.1] V11
variable (numeric)
[E.2] V49
variable (numeric)
[E.3] V37
variable (numeric)
[E.4] V17
variable (numeric)
[E.5] V20
variable (numeric)
[F] The independent test model performance of the final
model is summarized as follows:
[F.1] Specificity = 0.14286
##################################
# Verifying the class distribution
# for the original data
##################################
table(PMA_PreModelling_Train$Class)
##
## M R
## 78 18
##################################
# Creating a function for
# customized summary metrics
##################################
fourMetricSummary <- function (data, lev = levels(data$obs), model = NULL)
{
accKapp <- postResample(data[, "pred"], data[, "obs"])
out <- c(accKapp,
sensitivity(data[, "pred"], data[, "obs"], lev[1]),
specificity(data[, "pred"], data[, "obs"], lev[2]))
names(out)[3:4] <- c("Sensitivity", "Specificity")
out
}
##################################
# Creating consistent fold assignments
# for the Cross-Validation process
##################################
set.seed(12345678)
KFold_Control <- trainControl(method = "cv",
classProbs = FALSE,
summaryFunction = fourMetricSummary)
##################################
# Setting the conditions
# for hyperparameter tuning
##################################
C50_Grid = expand.grid(trials = c(1:9, (1:10)*10),
model = c("tree", "rules"),
winnow = c(TRUE, FALSE))
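The grid crosses 19 boosting-iteration settings with both model types and both winnowing options, so 76 candidate configurations are resampled in total; a quick size check (deterministic from the expand.grid call above):
nrow(C50_Grid)
# 19 trial values x 2 model types x 2 winnow settings = 76 rows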
##################################
# Formulating the cost matrix
##################################
C50_CostMatrix <- matrix(c(0,1,1,0), ncol=2)
rownames(C50_CostMatrix) <- levels(PMA_PreModelling_Train$Class)
colnames(C50_CostMatrix) <- levels(PMA_PreModelling_Train$Class)
C50_CostMatrix
## M R
## M 0 1
## R 1 0
##################################
# Running the C5.0 decision trees model
# by setting the caret method to 'C5.0'
##################################
set.seed(12345678)
C50_Tune <- train(x = PMA_PreModelling_Train[,!names(PMA_PreModelling_Train) %in% c("Class")],
y = PMA_PreModelling_Train$Class,
method = "C5.0",
tuneGrid = C50_Grid,
metric = "Specificity",
trControl = KFold_Control,
parms = list(loss=C50_CostMatrix))
##################################
# Reporting the cross-validation results
# for the train set
##################################
C50_Tune
## C5.0
##
## 96 samples
## 60 predictors
## 2 classes: 'M', 'R'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 86, 87, 86, 87, 87, 86, ...
## Resampling results across tuning parameters:
##
## model winnow trials Accuracy Kappa Sensitivity Specificity
## rules FALSE 1 0.8444444 0.3945908 0.9232143 0.50
## rules FALSE 2 0.8333333 0.2971608 0.9482143 0.35
## rules FALSE 3 0.8533333 0.4043357 0.9464286 0.45
## rules FALSE 4 0.8333333 0.1839465 0.9875000 0.20
## rules FALSE 5 0.8655556 0.3811992 0.9732143 0.40
## rules FALSE 6 0.8655556 0.3811992 0.9732143 0.40
## rules FALSE 7 0.8655556 0.3811992 0.9732143 0.40
## rules FALSE 8 0.8655556 0.3811992 0.9732143 0.40
## rules FALSE 9 0.8544444 0.3603297 0.9607143 0.40
## rules FALSE 10 0.8655556 0.3811992 0.9732143 0.40
## rules FALSE 20 0.8877778 0.4454849 1.0000000 0.40
## rules FALSE 30 0.8877778 0.4454849 1.0000000 0.40
## rules FALSE 40 0.8766667 0.4063545 0.9875000 0.40
## rules FALSE 50 0.8655556 0.3672241 0.9875000 0.35
## rules FALSE 60 0.8655556 0.3672241 0.9875000 0.35
## rules FALSE 70 0.8655556 0.3672241 0.9875000 0.35
## rules FALSE 80 0.8655556 0.3672241 0.9875000 0.35
## rules FALSE 90 0.8655556 0.3672241 0.9875000 0.35
## rules FALSE 100 0.8766667 0.4063545 0.9875000 0.40
## rules TRUE 1 0.8155556 0.1465909 0.9500000 0.20
## rules TRUE 2 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 3 0.8044444 0.1865909 0.9250000 0.30
## rules TRUE 4 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 5 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 6 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 7 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 8 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 9 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 10 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 20 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 30 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 40 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 50 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 60 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 70 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 80 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 90 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 100 0.8144444 0.2106294 0.9375000 0.30
## tree FALSE 1 0.8322222 0.4030199 0.8964286 0.55
## tree FALSE 2 0.8233333 0.2482776 0.9500000 0.30
## tree FALSE 3 0.8222222 0.3149126 0.9214286 0.40
## tree FALSE 4 0.8322222 0.2764632 0.9607143 0.30
## tree FALSE 5 0.8322222 0.3495363 0.9339286 0.40
## tree FALSE 6 0.8333333 0.2323161 0.9750000 0.25
## tree FALSE 7 0.8544444 0.3686992 0.9607143 0.40
## tree FALSE 8 0.8444444 0.3071608 0.9607143 0.35
## tree FALSE 9 0.8433333 0.3633125 0.9339286 0.45
## tree FALSE 10 0.8233333 0.2439919 0.9482143 0.30
## tree FALSE 20 0.8655556 0.3721154 0.9875000 0.35
## tree FALSE 30 0.8544444 0.3329849 0.9750000 0.35
## tree FALSE 40 0.8433333 0.2938545 0.9750000 0.30
## tree FALSE 50 0.8544444 0.3547241 0.9750000 0.35
## tree FALSE 60 0.8544444 0.3547241 0.9750000 0.35
## tree FALSE 70 0.8433333 0.3295688 0.9607143 0.35
## tree FALSE 80 0.8544444 0.3547241 0.9750000 0.35
## tree FALSE 90 0.8433333 0.3295688 0.9607143 0.35
## tree FALSE 100 0.8433333 0.3295688 0.9607143 0.35
## tree TRUE 1 0.7955556 0.1147589 0.9250000 0.20
## tree TRUE 2 0.8155556 0.1552448 0.9500000 0.20
## tree TRUE 3 0.7844444 0.1547589 0.9000000 0.30
## tree TRUE 4 0.8044444 0.1952448 0.9250000 0.30
## tree TRUE 5 0.8044444 0.1952448 0.9250000 0.30
## tree TRUE 6 0.8044444 0.1952448 0.9250000 0.30
## tree TRUE 7 0.8044444 0.1952448 0.9250000 0.30
## tree TRUE 8 0.8044444 0.1952448 0.9250000 0.30
## tree TRUE 9 0.8044444 0.1952448 0.9250000 0.30
## tree TRUE 10 0.8044444 0.1952448 0.9250000 0.30
## tree TRUE 20 0.8044444 0.1952448 0.9250000 0.30
## tree TRUE 30 0.8044444 0.1952448 0.9250000 0.30
## tree TRUE 40 0.8044444 0.1952448 0.9250000 0.30
## tree TRUE 50 0.8044444 0.1952448 0.9250000 0.30
## tree TRUE 60 0.8044444 0.1952448 0.9250000 0.30
## tree TRUE 70 0.8044444 0.1952448 0.9250000 0.30
## tree TRUE 80 0.8044444 0.1952448 0.9250000 0.30
## tree TRUE 90 0.8044444 0.1952448 0.9250000 0.30
## tree TRUE 100 0.8044444 0.1952448 0.9250000 0.30
##
## Specificity was used to select the optimal model using the largest value.
## The final values used for the model were trials = 1, model = tree and winnow
## = FALSE.
##
## Call:
## (function (x, y, trials = 1, rules = FALSE, weights = NULL, control
## = 3942L), parms = list(loss = structure(c(0, 1, 1, 0), .Dim = c(2L,
## 2L), .Dimnames = list(c("M", "R"), c("M", "R")))))
##
## Classification Tree
## Number of samples: 96
## Number of predictors: 60
##
## Tree size: 6
##
## Non-standard options: attempt to group attributes
## model winnow trials Accuracy Kappa Sensitivity Specificity AccuracySD
## 39 rules FALSE 1 0.8444444 0.3945908 0.9232143 0.50 0.08764563
## 58 rules TRUE 1 0.8155556 0.1465909 0.9500000 0.20 0.10213093
## 1 tree FALSE 1 0.8322222 0.4030199 0.8964286 0.55 0.11468930
## 20 tree TRUE 1 0.7955556 0.1147589 0.9250000 0.20 0.11357756
## 40 rules FALSE 2 0.8333333 0.2971608 0.9482143 0.35 0.05341557
## 59 rules TRUE 2 0.8144444 0.2106294 0.9375000 0.30 0.10371032
## 2 tree FALSE 2 0.8233333 0.2482776 0.9500000 0.30 0.04870054
## 21 tree TRUE 2 0.8155556 0.1552448 0.9500000 0.20 0.11248533
## 41 rules FALSE 3 0.8533333 0.4043357 0.9464286 0.45 0.09293271
## 60 rules TRUE 3 0.8044444 0.1865909 0.9250000 0.30 0.09927031
## 3 tree FALSE 3 0.8222222 0.3149126 0.9214286 0.40 0.10210406
## 22 tree TRUE 3 0.7844444 0.1547589 0.9000000 0.30 0.10876536
## 42 rules FALSE 4 0.8333333 0.1839465 0.9875000 0.20 0.05341557
## 61 rules TRUE 4 0.8144444 0.2106294 0.9375000 0.30 0.10371032
## 4 tree FALSE 4 0.8322222 0.2764632 0.9607143 0.30 0.07723799
## 23 tree TRUE 4 0.8044444 0.1952448 0.9250000 0.30 0.10989457
## 43 rules FALSE 5 0.8655556 0.3811992 0.9732143 0.40 0.06937898
## 62 rules TRUE 5 0.8144444 0.2106294 0.9375000 0.30 0.10371032
## 5 tree FALSE 5 0.8322222 0.3495363 0.9339286 0.40 0.07723799
## 24 tree TRUE 5 0.8044444 0.1952448 0.9250000 0.30 0.10989457
## 44 rules FALSE 6 0.8655556 0.3811992 0.9732143 0.40 0.06937898
## 63 rules TRUE 6 0.8144444 0.2106294 0.9375000 0.30 0.10371032
## 6 tree FALSE 6 0.8333333 0.2323161 0.9750000 0.25 0.05341557
## 25 tree TRUE 6 0.8044444 0.1952448 0.9250000 0.30 0.10989457
## 45 rules FALSE 7 0.8655556 0.3811992 0.9732143 0.40 0.06937898
## 64 rules TRUE 7 0.8144444 0.2106294 0.9375000 0.30 0.10371032
## 7 tree FALSE 7 0.8544444 0.3686992 0.9607143 0.40 0.07397215
## 26 tree TRUE 7 0.8044444 0.1952448 0.9250000 0.30 0.10989457
## 46 rules FALSE 8 0.8655556 0.3811992 0.9732143 0.40 0.06937898
## 65 rules TRUE 8 0.8144444 0.2106294 0.9375000 0.30 0.10371032
## 8 tree FALSE 8 0.8444444 0.3071608 0.9607143 0.35 0.07388866
## 27 tree TRUE 8 0.8044444 0.1952448 0.9250000 0.30 0.10989457
## 47 rules FALSE 9 0.8544444 0.3603297 0.9607143 0.40 0.07397215
## 66 rules TRUE 9 0.8144444 0.2106294 0.9375000 0.30 0.10371032
## 9 tree FALSE 9 0.8433333 0.3633125 0.9339286 0.45 0.09273323
## 28 tree TRUE 9 0.8044444 0.1952448 0.9250000 0.30 0.10989457
## 48 rules FALSE 10 0.8655556 0.3811992 0.9732143 0.40 0.06937898
## 67 rules TRUE 10 0.8144444 0.2106294 0.9375000 0.30 0.10371032
## 10 tree FALSE 10 0.8233333 0.2439919 0.9482143 0.30 0.04870054
## 29 tree TRUE 10 0.8044444 0.1952448 0.9250000 0.30 0.10989457
## 49 rules FALSE 20 0.8877778 0.4454849 1.0000000 0.40 0.07360034
## 68 rules TRUE 20 0.8144444 0.2106294 0.9375000 0.30 0.10371032
## 11 tree FALSE 20 0.8655556 0.3721154 0.9875000 0.35 0.08693059
## 30 tree TRUE 20 0.8044444 0.1952448 0.9250000 0.30 0.10989457
## 50 rules FALSE 30 0.8877778 0.4454849 1.0000000 0.40 0.07360034
## 69 rules TRUE 30 0.8144444 0.2106294 0.9375000 0.30 0.10371032
## 12 tree FALSE 30 0.8544444 0.3329849 0.9750000 0.35 0.07397215
## 31 tree TRUE 30 0.8044444 0.1952448 0.9250000 0.30 0.10989457
## 51 rules FALSE 40 0.8766667 0.4063545 0.9875000 0.40 0.06229493
## 70 rules TRUE 40 0.8144444 0.2106294 0.9375000 0.30 0.10371032
## 13 tree FALSE 40 0.8433333 0.2938545 0.9750000 0.30 0.05578963
## 32 tree TRUE 40 0.8044444 0.1952448 0.9250000 0.30 0.10989457
## 52 rules FALSE 50 0.8655556 0.3672241 0.9875000 0.35 0.04549680
## 71 rules TRUE 50 0.8144444 0.2106294 0.9375000 0.30 0.10371032
## 14 tree FALSE 50 0.8544444 0.3547241 0.9750000 0.35 0.05223404
## 33 tree TRUE 50 0.8044444 0.1952448 0.9250000 0.30 0.10989457
## 53 rules FALSE 60 0.8655556 0.3672241 0.9875000 0.35 0.04549680
## 72 rules TRUE 60 0.8144444 0.2106294 0.9375000 0.30 0.10371032
## 15 tree FALSE 60 0.8544444 0.3547241 0.9750000 0.35 0.05223404
## 34 tree TRUE 60 0.8044444 0.1952448 0.9250000 0.30 0.10989457
## 54 rules FALSE 70 0.8655556 0.3672241 0.9875000 0.35 0.04549680
## 73 rules TRUE 70 0.8144444 0.2106294 0.9375000 0.30 0.10371032
## 16 tree FALSE 70 0.8433333 0.3295688 0.9607143 0.35 0.05578963
## 35 tree TRUE 70 0.8044444 0.1952448 0.9250000 0.30 0.10989457
## 55 rules FALSE 80 0.8655556 0.3672241 0.9875000 0.35 0.04549680
## 74 rules TRUE 80 0.8144444 0.2106294 0.9375000 0.30 0.10371032
## 17 tree FALSE 80 0.8544444 0.3547241 0.9750000 0.35 0.05223404
## 36 tree TRUE 80 0.8044444 0.1952448 0.9250000 0.30 0.10989457
## 56 rules FALSE 90 0.8655556 0.3672241 0.9875000 0.35 0.04549680
## 75 rules TRUE 90 0.8144444 0.2106294 0.9375000 0.30 0.10371032
## 18 tree FALSE 90 0.8433333 0.3295688 0.9607143 0.35 0.05578963
## 37 tree TRUE 90 0.8044444 0.1952448 0.9250000 0.30 0.10989457
## 57 rules FALSE 100 0.8766667 0.4063545 0.9875000 0.40 0.06229493
## 76 rules TRUE 100 0.8144444 0.2106294 0.9375000 0.30 0.10371032
## 19 tree FALSE 100 0.8433333 0.3295688 0.9607143 0.35 0.05578963
## 38 tree TRUE 100 0.8044444 0.1952448 0.9250000 0.30 0.10989457
## KappaSD SensitivitySD SpecificitySD
## 39 0.3644110 0.08870845 0.4082483
## 58 0.3221457 0.12076147 0.3496029
## 1 0.4308081 0.10507259 0.4377975
## 20 0.3239613 0.13437096 0.3496029
## 40 0.2734629 0.08926587 0.3374743
## 59 0.3501294 0.13501543 0.4216370
## 2 0.3017636 0.08740074 0.3496029
## 21 0.3604969 0.12076147 0.3496029
## 41 0.3452821 0.11325177 0.3689324
## 60 0.3267224 0.13437096 0.4216370
## 3 0.3911162 0.11237238 0.3944053
## 22 0.3327914 0.14191155 0.4216370
## 42 0.2961876 0.03952847 0.3496029
## 61 0.3501294 0.13501543 0.4216370
## 4 0.3588047 0.06344244 0.3496029
## 23 0.3635361 0.13437096 0.4216370
## 43 0.3622503 0.05662589 0.3944053
## 62 0.3501294 0.13501543 0.4216370
## 5 0.3063525 0.09709864 0.3162278
## 24 0.3635361 0.13437096 0.4216370
## 44 0.3622503 0.05662589 0.3944053
## 63 0.3501294 0.13501543 0.4216370
## 6 0.3290280 0.05270463 0.3535534
## 25 0.3635361 0.13437096 0.4216370
## 45 0.3622503 0.05662589 0.3944053
## 64 0.3501294 0.13501543 0.4216370
## 7 0.3786511 0.06344244 0.3944053
## 26 0.3635361 0.13437096 0.4216370
## 46 0.3622503 0.05662589 0.3944053
## 65 0.3501294 0.13501543 0.4216370
## 8 0.3840725 0.06344244 0.4116363
## 27 0.3635361 0.13437096 0.4216370
## 47 0.3535960 0.08658617 0.3944053
## 66 0.3501294 0.13501543 0.4216370
## 9 0.3943442 0.09709864 0.4377975
## 28 0.3635361 0.13437096 0.4216370
## 48 0.3622503 0.05662589 0.3944053
## 67 0.3501294 0.13501543 0.4216370
## 10 0.2996664 0.06705351 0.3496029
## 29 0.3635361 0.13437096 0.4216370
## 49 0.4112362 0.00000000 0.3944053
## 68 0.3501294 0.13501543 0.4216370
## 11 0.4430978 0.03952847 0.4116363
## 30 0.3635361 0.13437096 0.4216370
## 50 0.4112362 0.00000000 0.3944053
## 69 0.3501294 0.13501543 0.4216370
## 12 0.3962940 0.05270463 0.4116363
## 31 0.3635361 0.13437096 0.4216370
## 51 0.3690642 0.03952847 0.3944053
## 70 0.3501294 0.13501543 0.4216370
## 13 0.3381708 0.05270463 0.3496029
## 32 0.3635361 0.13437096 0.4216370
## 52 0.3160680 0.03952847 0.3374743
## 71 0.3501294 0.13501543 0.4216370
## 14 0.3341588 0.05270463 0.3374743
## 33 0.3635361 0.13437096 0.4216370
## 53 0.3160680 0.03952847 0.3374743
## 72 0.3501294 0.13501543 0.4216370
## 15 0.3341588 0.05270463 0.3374743
## 34 0.3635361 0.13437096 0.4216370
## 54 0.3160680 0.03952847 0.3374743
## 73 0.3501294 0.13501543 0.4216370
## 16 0.3221690 0.06344244 0.3374743
## 35 0.3635361 0.13437096 0.4216370
## 55 0.3160680 0.03952847 0.3374743
## 74 0.3501294 0.13501543 0.4216370
## 17 0.3341588 0.05270463 0.3374743
## 36 0.3635361 0.13437096 0.4216370
## 56 0.3160680 0.03952847 0.3374743
## 75 0.3501294 0.13501543 0.4216370
## 18 0.3221690 0.06344244 0.3374743
## 37 0.3635361 0.13437096 0.4216370
## 57 0.3690642 0.03952847 0.3944053
## 76 0.3501294 0.13501543 0.4216370
## 19 0.3221690 0.06344244 0.3374743
## 38 0.3635361 0.13437096 0.4216370
(C50_Train_Specificity <- C50_Tune$results[C50_Tune$results$trials==C50_Tune$bestTune$trials &
C50_Tune$results$model==C50_Tune$bestTune$model &
C50_Tune$results$winnow==C50_Tune$bestTune$winnow,
c("Specificity")])
## [1] 0.55
C50_Train <- data.frame(C50_Observed = PMA_PreModelling_Train$Class,
C50_Predicted = predict(C50_Tune,
PMA_PreModelling_Train[,!names(PMA_PreModelling_Train) %in% c("Class")],
type = "raw"))
(C50_Train_ConfusionMatrix <- confusionMatrix(data = C50_Train$C50_Predicted,
reference = C50_Train$C50_Observed))
## Confusion Matrix and Statistics
##
## Reference
## Prediction M R
## M 77 1
## R 1 17
##
## Accuracy : 0.9792
## 95% CI : (0.9268, 0.9975)
## No Information Rate : 0.8125
## P-Value [Acc > NIR] : 5.86e-07
##
## Kappa : 0.9316
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9872
## Specificity : 0.9444
## Pos Pred Value : 0.9872
## Neg Pred Value : 0.9444
## Prevalence : 0.8125
## Detection Rate : 0.8021
## Detection Prevalence : 0.8125
## Balanced Accuracy : 0.9658
##
## 'Positive' Class : M
##
##################################
# Identifying and plotting the
# best model predictors
##################################
C50_VarImp <- varImp(C50_Tune, scale = TRUE)
plot(C50_VarImp,
top=25,
scales=list(y=list(cex = .95)),
main="Ranked Variable Importance : C5.0 Decision Trees",
xlab="Scaled Variable Importance Metrics",
ylab="Predictors",
cex=2,
origin=0,
alpha=0.45)

##################################
# Independently evaluating the model
# on the test set
##################################
C50_Test <- data.frame(C50_Observed = PMA_PreModelling_Test$Class,
C50_Predicted = predict(C50_Tune,
PMA_PreModelling_Test[,!names(PMA_PreModelling_Test) %in% c("Class")],
type = "raw"))
C50_Test
## C50_Observed C50_Predicted
## 1 M M
## 2 M M
## 3 M M
## 4 M M
## 5 M M
## 6 M R
## 7 M M
## 8 M R
## 9 M M
## 10 M R
## 11 M M
## 12 M M
## 13 M M
## 14 M M
## 15 M M
## 16 M M
## 17 M M
## 18 M R
## 19 M M
## 20 M M
## 21 M R
## 22 M M
## 23 M M
## 24 M M
## 25 M M
## 26 M M
## 27 M M
## 28 M M
## 29 M M
## 30 M M
## 31 M M
## 32 M M
## 33 M M
## 34 R M
## 35 R R
## 36 R M
## 37 R M
## 38 R M
## 39 R M
## 40 R M
##################################
# Reporting the independent evaluation results
# for the test set
##################################
(C50_Test_Specificity <- Specificity(y_pred = C50_Test$C50_Predicted,
y_true = C50_Test$C50_Observed))
## [1] 0.1428571
(C50_Test_ConfusionMatrix <- confusionMatrix(data = C50_Test$C50_Predicted,
reference = C50_Test$C50_Observed))
## Confusion Matrix and Statistics
##
## Reference
## Prediction M R
## M 28 6
## R 5 1
##
## Accuracy : 0.725
## 95% CI : (0.5611, 0.854)
## No Information Rate : 0.825
## P-Value [Acc > NIR] : 0.9632
##
## Kappa : -0.0092
##
## Mcnemar's Test P-Value : 1.0000
##
## Sensitivity : 0.8485
## Specificity : 0.1429
## Pos Pred Value : 0.8235
## Neg Pred Value : 0.1667
## Prevalence : 0.8250
## Detection Rate : 0.7000
## Detection Prevalence : 0.8500
## Balanced Accuracy : 0.4957
##
## 'Positive' Class : M
##
1.5.6 Cost-Sensitive C5.0 Decision Trees (CS_C50)
[A] The C5.0 decision trees model from the
C50 and
plyr
packages was implemented through the
caret
package. This version of the model applied non-uniform misclassification
costs between the classes of the response variable (Class=M:1, Class=R:4).
[B] The model contains 3 hyperparameters:
[B.1] trials =
number of boosting iterations made to vary across a range of values
from 1 to 100
[B.2] model =
model type made to vary across the levels TREE and RULES
[B.3] winnow =
winnowing option made to vary across the levels TRUE and FALSE
[C] Specificity was used as the assessment metric to
evaluate the effect of cost-sensitive learning on the minority class.
[D] The cross-validated model performance of the final
model is summarized as follows:
[D.1] Final model configuration involves trials=1,
model=TREE and winnow=FALSE
[D.2] Specificity = 0.55000
[E] The model allows for ranking of predictors in terms
of variable importance. The top-performing predictors in the model are
as follows:
[E.1] V11
variable (numeric)
[E.2] V49
variable (numeric)
[E.3] V37
variable (numeric)
[E.4] V17
variable (numeric)
[E.5] V20
variable (numeric)
[F] The independent test model performance of the final
model is summarized as follows:
[F.1] Specificity = 0.14286
##################################
# Verifying the class distribution
# for the original data
##################################
table(PMA_PreModelling_Train$Class)
##
## M R
## 78 18
##################################
# Creating a function for
# customized summary metrics
##################################
fourMetricSummary <- function (data, lev = levels(data$obs), model = NULL)
{
accKapp <- postResample(data[, "pred"], data[, "obs"])
out <- c(accKapp,
sensitivity(data[, "pred"], data[, "obs"], lev[1]),
specificity(data[, "pred"], data[, "obs"], lev[2]))
names(out)[3:4] <- c("Sensitivity", "Specificity")
out
}
##################################
# Creating consistent fold assignments
# for the Cross-Validation process
##################################
set.seed(12345678)
KFold_Control <- trainControl(method = "cv",
classProbs = FALSE,
summaryFunction = fourMetricSummary)
##################################
# Setting the conditions
# for hyperparameter tuning
##################################
CS_C50_Grid = expand.grid(trials = c(1:9, (1:10)*10),
model = c("tree", "rules"),
winnow = c(TRUE, FALSE))
##################################
# Formulating the cost matrix
##################################
CS_C50_CostMatrix <- matrix(c(0,1,4,0), ncol=2)
rownames(CS_C50_CostMatrix) <- levels(PMA_PreModelling_Train$Class)
colnames(CS_C50_CostMatrix) <- levels(PMA_PreModelling_Train$Class)
CS_C50_CostMatrix
## M R
## M 0 4
## R 1 0
##################################
# Running the C5.0 decision trees model
# by setting the caret method to 'C5.0'
##################################
set.seed(12345678)
CS_C50_Tune <- train(x = PMA_PreModelling_Train[,!names(PMA_PreModelling_Train) %in% c("Class")],
y = PMA_PreModelling_Train$Class,
method = "C5.0",
tuneGrid = CS_C50_Grid,
metric = "Specificity",
trControl = KFold_Control,
parms = list(loss=CS_C50_CostMatrix))
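For reference, caret also lists a dedicated cost-sensitive C5.0 method, C5.0Cost, in which the misclassification cost is itself a tuning parameter rather than an argument forwarded through the training call; the sketch below is a hypothetical alternative formulation (the grid column names, including cost, are assumptions to be verified against getModelInfo("C5.0Cost") and were not executed in this run):
# Hypothetical alternative using caret's cost-sensitive C5.0 method;
# the assumed 'cost' column scales the penalty on minority-class errors
# (the exact direction of the penalty should be verified in the method's source)
CS_C50_Cost_Grid <- expand.grid(trials = c(1:9, (1:10)*10),
                                model = c("tree", "rules"),
                                winnow = c(TRUE, FALSE),
                                cost = 4)
set.seed(12345678)
CS_C50_Cost_Tune <- train(x = PMA_PreModelling_Train[,!names(PMA_PreModelling_Train) %in% c("Class")],
                          y = PMA_PreModelling_Train$Class,
                          method = "C5.0Cost",
                          tuneGrid = CS_C50_Cost_Grid,
                          metric = "Specificity",
                          trControl = KFold_Control)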
##################################
# Reporting the cross-validation results
# for the train set
##################################
CS_C50_Tune
## C5.0
##
## 96 samples
## 60 predictors
## 2 classes: 'M', 'R'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 86, 87, 86, 87, 87, 86, ...
## Resampling results across tuning parameters:
##
## model winnow trials Accuracy Kappa Sensitivity Specificity
## rules FALSE 1 0.8444444 0.3945908 0.9232143 0.50
## rules FALSE 2 0.8333333 0.2971608 0.9482143 0.35
## rules FALSE 3 0.8533333 0.4043357 0.9464286 0.45
## rules FALSE 4 0.8333333 0.1839465 0.9875000 0.20
## rules FALSE 5 0.8655556 0.3811992 0.9732143 0.40
## rules FALSE 6 0.8655556 0.3811992 0.9732143 0.40
## rules FALSE 7 0.8655556 0.3811992 0.9732143 0.40
## rules FALSE 8 0.8655556 0.3811992 0.9732143 0.40
## rules FALSE 9 0.8544444 0.3603297 0.9607143 0.40
## rules FALSE 10 0.8655556 0.3811992 0.9732143 0.40
## rules FALSE 20 0.8877778 0.4454849 1.0000000 0.40
## rules FALSE 30 0.8877778 0.4454849 1.0000000 0.40
## rules FALSE 40 0.8766667 0.4063545 0.9875000 0.40
## rules FALSE 50 0.8655556 0.3672241 0.9875000 0.35
## rules FALSE 60 0.8655556 0.3672241 0.9875000 0.35
## rules FALSE 70 0.8655556 0.3672241 0.9875000 0.35
## rules FALSE 80 0.8655556 0.3672241 0.9875000 0.35
## rules FALSE 90 0.8655556 0.3672241 0.9875000 0.35
## rules FALSE 100 0.8766667 0.4063545 0.9875000 0.40
## rules TRUE 1 0.8155556 0.1465909 0.9500000 0.20
## rules TRUE 2 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 3 0.8044444 0.1865909 0.9250000 0.30
## rules TRUE 4 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 5 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 6 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 7 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 8 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 9 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 10 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 20 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 30 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 40 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 50 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 60 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 70 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 80 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 90 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 100 0.8144444 0.2106294 0.9375000 0.30
## tree FALSE 1 0.8322222 0.4030199 0.8964286 0.55
## tree FALSE 2 0.8233333 0.2482776 0.9500000 0.30
## tree FALSE 3 0.8222222 0.3149126 0.9214286 0.40
## tree FALSE 4 0.8322222 0.2764632 0.9607143 0.30
## tree FALSE 5 0.8322222 0.3495363 0.9339286 0.40
## tree FALSE 6 0.8333333 0.2323161 0.9750000 0.25
## tree FALSE 7 0.8544444 0.3686992 0.9607143 0.40
## tree FALSE 8 0.8444444 0.3071608 0.9607143 0.35
## tree FALSE 9 0.8433333 0.3633125 0.9339286 0.45
## tree FALSE 10 0.8233333 0.2439919 0.9482143 0.30
## tree FALSE 20 0.8655556 0.3721154 0.9875000 0.35
## tree FALSE 30 0.8544444 0.3329849 0.9750000 0.35
## tree FALSE 40 0.8433333 0.2938545 0.9750000 0.30
## tree FALSE 50 0.8544444 0.3547241 0.9750000 0.35
## tree FALSE 60 0.8544444 0.3547241 0.9750000 0.35
## tree FALSE 70 0.8433333 0.3295688 0.9607143 0.35
## tree FALSE 80 0.8544444 0.3547241 0.9750000 0.35
## tree FALSE 90 0.8433333 0.3295688 0.9607143 0.35
## tree FALSE 100 0.8433333 0.3295688 0.9607143 0.35
## tree TRUE 1 0.7955556 0.1147589 0.9250000 0.20
## tree TRUE 2 0.8155556 0.1552448 0.9500000 0.20
## tree TRUE 3 0.7844444 0.1547589 0.9000000 0.30
## tree TRUE 4 0.8044444 0.1952448 0.9250000 0.30
## tree TRUE 5 0.8044444 0.1952448 0.9250000 0.30
## tree TRUE 6 0.8044444 0.1952448 0.9250000 0.30
## tree TRUE 7 0.8044444 0.1952448 0.9250000 0.30
## tree TRUE 8 0.8044444 0.1952448 0.9250000 0.30
## tree TRUE 9 0.8044444 0.1952448 0.9250000 0.30
## tree TRUE 10 0.8044444 0.1952448 0.9250000 0.30
## tree TRUE 20 0.8044444 0.1952448 0.9250000 0.30
## tree TRUE 30 0.8044444 0.1952448 0.9250000 0.30
## tree TRUE 40 0.8044444 0.1952448 0.9250000 0.30
## tree TRUE 50 0.8044444 0.1952448 0.9250000 0.30
## tree TRUE 60 0.8044444 0.1952448 0.9250000 0.30
## tree TRUE 70 0.8044444 0.1952448 0.9250000 0.30
## tree TRUE 80 0.8044444 0.1952448 0.9250000 0.30
## tree TRUE 90 0.8044444 0.1952448 0.9250000 0.30
## tree TRUE 100 0.8044444 0.1952448 0.9250000 0.30
##
## Specificity was used to select the optimal model using the largest value.
## The final values used for the model were trials = 1, model = tree and winnow
## = FALSE.
##
## Call:
## (function (x, y, trials = 1, rules = FALSE, weights = NULL, control
## = 3942L), parms = list(loss = structure(c(0, 1, 4, 0), .Dim = c(2L,
## 2L), .Dimnames = list(c("M", "R"), c("M", "R")))))
##
## Classification Tree
## Number of samples: 96
## Number of predictors: 60
##
## Tree size: 6
##
## Non-standard options: attempt to group attributes
## model winnow trials Accuracy Kappa Sensitivity Specificity AccuracySD
## 39 rules FALSE 1 0.8444444 0.3945908 0.9232143 0.50 0.08764563
## 58 rules TRUE 1 0.8155556 0.1465909 0.9500000 0.20 0.10213093
## 1 tree FALSE 1 0.8322222 0.4030199 0.8964286 0.55 0.11468930
## 20 tree TRUE 1 0.7955556 0.1147589 0.9250000 0.20 0.11357756
## 40 rules FALSE 2 0.8333333 0.2971608 0.9482143 0.35 0.05341557
## 59 rules TRUE 2 0.8144444 0.2106294 0.9375000 0.30 0.10371032
## 2 tree FALSE 2 0.8233333 0.2482776 0.9500000 0.30 0.04870054
## 21 tree TRUE 2 0.8155556 0.1552448 0.9500000 0.20 0.11248533
## 41 rules FALSE 3 0.8533333 0.4043357 0.9464286 0.45 0.09293271
## 60 rules TRUE 3 0.8044444 0.1865909 0.9250000 0.30 0.09927031
## 3 tree FALSE 3 0.8222222 0.3149126 0.9214286 0.40 0.10210406
## 22 tree TRUE 3 0.7844444 0.1547589 0.9000000 0.30 0.10876536
## 42 rules FALSE 4 0.8333333 0.1839465 0.9875000 0.20 0.05341557
## 61 rules TRUE 4 0.8144444 0.2106294 0.9375000 0.30 0.10371032
## 4 tree FALSE 4 0.8322222 0.2764632 0.9607143 0.30 0.07723799
## 23 tree TRUE 4 0.8044444 0.1952448 0.9250000 0.30 0.10989457
## 43 rules FALSE 5 0.8655556 0.3811992 0.9732143 0.40 0.06937898
## 62 rules TRUE 5 0.8144444 0.2106294 0.9375000 0.30 0.10371032
## 5 tree FALSE 5 0.8322222 0.3495363 0.9339286 0.40 0.07723799
## 24 tree TRUE 5 0.8044444 0.1952448 0.9250000 0.30 0.10989457
## 44 rules FALSE 6 0.8655556 0.3811992 0.9732143 0.40 0.06937898
## 63 rules TRUE 6 0.8144444 0.2106294 0.9375000 0.30 0.10371032
## 6 tree FALSE 6 0.8333333 0.2323161 0.9750000 0.25 0.05341557
## 25 tree TRUE 6 0.8044444 0.1952448 0.9250000 0.30 0.10989457
## 45 rules FALSE 7 0.8655556 0.3811992 0.9732143 0.40 0.06937898
## 64 rules TRUE 7 0.8144444 0.2106294 0.9375000 0.30 0.10371032
## 7 tree FALSE 7 0.8544444 0.3686992 0.9607143 0.40 0.07397215
## 26 tree TRUE 7 0.8044444 0.1952448 0.9250000 0.30 0.10989457
## 46 rules FALSE 8 0.8655556 0.3811992 0.9732143 0.40 0.06937898
## 65 rules TRUE 8 0.8144444 0.2106294 0.9375000 0.30 0.10371032
## 8 tree FALSE 8 0.8444444 0.3071608 0.9607143 0.35 0.07388866
## 27 tree TRUE 8 0.8044444 0.1952448 0.9250000 0.30 0.10989457
## 47 rules FALSE 9 0.8544444 0.3603297 0.9607143 0.40 0.07397215
## 66 rules TRUE 9 0.8144444 0.2106294 0.9375000 0.30 0.10371032
## 9 tree FALSE 9 0.8433333 0.3633125 0.9339286 0.45 0.09273323
## 28 tree TRUE 9 0.8044444 0.1952448 0.9250000 0.30 0.10989457
## 48 rules FALSE 10 0.8655556 0.3811992 0.9732143 0.40 0.06937898
## 67 rules TRUE 10 0.8144444 0.2106294 0.9375000 0.30 0.10371032
## 10 tree FALSE 10 0.8233333 0.2439919 0.9482143 0.30 0.04870054
## 29 tree TRUE 10 0.8044444 0.1952448 0.9250000 0.30 0.10989457
## 49 rules FALSE 20 0.8877778 0.4454849 1.0000000 0.40 0.07360034
## 68 rules TRUE 20 0.8144444 0.2106294 0.9375000 0.30 0.10371032
## 11 tree FALSE 20 0.8655556 0.3721154 0.9875000 0.35 0.08693059
## 30 tree TRUE 20 0.8044444 0.1952448 0.9250000 0.30 0.10989457
## 50 rules FALSE 30 0.8877778 0.4454849 1.0000000 0.40 0.07360034
## 69 rules TRUE 30 0.8144444 0.2106294 0.9375000 0.30 0.10371032
## 12 tree FALSE 30 0.8544444 0.3329849 0.9750000 0.35 0.07397215
## 31 tree TRUE 30 0.8044444 0.1952448 0.9250000 0.30 0.10989457
## 51 rules FALSE 40 0.8766667 0.4063545 0.9875000 0.40 0.06229493
## 70 rules TRUE 40 0.8144444 0.2106294 0.9375000 0.30 0.10371032
## 13 tree FALSE 40 0.8433333 0.2938545 0.9750000 0.30 0.05578963
## 32 tree TRUE 40 0.8044444 0.1952448 0.9250000 0.30 0.10989457
## 52 rules FALSE 50 0.8655556 0.3672241 0.9875000 0.35 0.04549680
## 71 rules TRUE 50 0.8144444 0.2106294 0.9375000 0.30 0.10371032
## 14 tree FALSE 50 0.8544444 0.3547241 0.9750000 0.35 0.05223404
## 33 tree TRUE 50 0.8044444 0.1952448 0.9250000 0.30 0.10989457
## 53 rules FALSE 60 0.8655556 0.3672241 0.9875000 0.35 0.04549680
## 72 rules TRUE 60 0.8144444 0.2106294 0.9375000 0.30 0.10371032
## 15 tree FALSE 60 0.8544444 0.3547241 0.9750000 0.35 0.05223404
## 34 tree TRUE 60 0.8044444 0.1952448 0.9250000 0.30 0.10989457
## 54 rules FALSE 70 0.8655556 0.3672241 0.9875000 0.35 0.04549680
## 73 rules TRUE 70 0.8144444 0.2106294 0.9375000 0.30 0.10371032
## 16 tree FALSE 70 0.8433333 0.3295688 0.9607143 0.35 0.05578963
## 35 tree TRUE 70 0.8044444 0.1952448 0.9250000 0.30 0.10989457
## 55 rules FALSE 80 0.8655556 0.3672241 0.9875000 0.35 0.04549680
## 74 rules TRUE 80 0.8144444 0.2106294 0.9375000 0.30 0.10371032
## 17 tree FALSE 80 0.8544444 0.3547241 0.9750000 0.35 0.05223404
## 36 tree TRUE 80 0.8044444 0.1952448 0.9250000 0.30 0.10989457
## 56 rules FALSE 90 0.8655556 0.3672241 0.9875000 0.35 0.04549680
## 75 rules TRUE 90 0.8144444 0.2106294 0.9375000 0.30 0.10371032
## 18 tree FALSE 90 0.8433333 0.3295688 0.9607143 0.35 0.05578963
## 37 tree TRUE 90 0.8044444 0.1952448 0.9250000 0.30 0.10989457
## 57 rules FALSE 100 0.8766667 0.4063545 0.9875000 0.40 0.06229493
## 76 rules TRUE 100 0.8144444 0.2106294 0.9375000 0.30 0.10371032
## 19 tree FALSE 100 0.8433333 0.3295688 0.9607143 0.35 0.05578963
## 38 tree TRUE 100 0.8044444 0.1952448 0.9250000 0.30 0.10989457
## KappaSD SensitivitySD SpecificitySD
## 39 0.3644110 0.08870845 0.4082483
## 58 0.3221457 0.12076147 0.3496029
## 1 0.4308081 0.10507259 0.4377975
## 20 0.3239613 0.13437096 0.3496029
## 40 0.2734629 0.08926587 0.3374743
## 59 0.3501294 0.13501543 0.4216370
## 2 0.3017636 0.08740074 0.3496029
## 21 0.3604969 0.12076147 0.3496029
## 41 0.3452821 0.11325177 0.3689324
## 60 0.3267224 0.13437096 0.4216370
## 3 0.3911162 0.11237238 0.3944053
## 22 0.3327914 0.14191155 0.4216370
## 42 0.2961876 0.03952847 0.3496029
## 61 0.3501294 0.13501543 0.4216370
## 4 0.3588047 0.06344244 0.3496029
## 23 0.3635361 0.13437096 0.4216370
## 43 0.3622503 0.05662589 0.3944053
## 62 0.3501294 0.13501543 0.4216370
## 5 0.3063525 0.09709864 0.3162278
## 24 0.3635361 0.13437096 0.4216370
## 44 0.3622503 0.05662589 0.3944053
## 63 0.3501294 0.13501543 0.4216370
## 6 0.3290280 0.05270463 0.3535534
## 25 0.3635361 0.13437096 0.4216370
## 45 0.3622503 0.05662589 0.3944053
## 64 0.3501294 0.13501543 0.4216370
## 7 0.3786511 0.06344244 0.3944053
## 26 0.3635361 0.13437096 0.4216370
## 46 0.3622503 0.05662589 0.3944053
## 65 0.3501294 0.13501543 0.4216370
## 8 0.3840725 0.06344244 0.4116363
## 27 0.3635361 0.13437096 0.4216370
## 47 0.3535960 0.08658617 0.3944053
## 66 0.3501294 0.13501543 0.4216370
## 9 0.3943442 0.09709864 0.4377975
## 28 0.3635361 0.13437096 0.4216370
## 48 0.3622503 0.05662589 0.3944053
## 67 0.3501294 0.13501543 0.4216370
## 10 0.2996664 0.06705351 0.3496029
## 29 0.3635361 0.13437096 0.4216370
## 49 0.4112362 0.00000000 0.3944053
## 68 0.3501294 0.13501543 0.4216370
## 11 0.4430978 0.03952847 0.4116363
## 30 0.3635361 0.13437096 0.4216370
## 50 0.4112362 0.00000000 0.3944053
## 69 0.3501294 0.13501543 0.4216370
## 12 0.3962940 0.05270463 0.4116363
## 31 0.3635361 0.13437096 0.4216370
## 51 0.3690642 0.03952847 0.3944053
## 70 0.3501294 0.13501543 0.4216370
## 13 0.3381708 0.05270463 0.3496029
## 32 0.3635361 0.13437096 0.4216370
## 52 0.3160680 0.03952847 0.3374743
## 71 0.3501294 0.13501543 0.4216370
## 14 0.3341588 0.05270463 0.3374743
## 33 0.3635361 0.13437096 0.4216370
## 53 0.3160680 0.03952847 0.3374743
## 72 0.3501294 0.13501543 0.4216370
## 15 0.3341588 0.05270463 0.3374743
## 34 0.3635361 0.13437096 0.4216370
## 54 0.3160680 0.03952847 0.3374743
## 73 0.3501294 0.13501543 0.4216370
## 16 0.3221690 0.06344244 0.3374743
## 35 0.3635361 0.13437096 0.4216370
## 55 0.3160680 0.03952847 0.3374743
## 74 0.3501294 0.13501543 0.4216370
## 17 0.3341588 0.05270463 0.3374743
## 36 0.3635361 0.13437096 0.4216370
## 56 0.3160680 0.03952847 0.3374743
## 75 0.3501294 0.13501543 0.4216370
## 18 0.3221690 0.06344244 0.3374743
## 37 0.3635361 0.13437096 0.4216370
## 57 0.3690642 0.03952847 0.3944053
## 76 0.3501294 0.13501543 0.4216370
## 19 0.3221690 0.06344244 0.3374743
## 38 0.3635361 0.13437096 0.4216370
(CS_C50_Train_Specificity <- CS_C50_Tune$results[CS_C50_Tune$results$trials==CS_C50_Tune$bestTune$trials &
CS_C50_Tune$results$model==CS_C50_Tune$bestTune$model &
CS_C50_Tune$results$winnow==CS_C50_Tune$bestTune$winnow,
c("Specificity")])
## [1] 0.55
CS_C50_Train <- data.frame(CS_C50_Observed = PMA_PreModelling_Train$Class,
CS_C50_Predicted = predict(CS_C50_Tune,
PMA_PreModelling_Train[,!names(PMA_PreModelling_Train) %in% c("Class")],
type = "raw"))
(CS_C50_Train_ConfusionMatrix <- confusionMatrix(data = CS_C50_Train$CS_C50_Predicted,
reference = CS_C50_Train$CS_C50_Observed))
## Confusion Matrix and Statistics
##
## Reference
## Prediction M R
## M 77 1
## R 1 17
##
## Accuracy : 0.9792
## 95% CI : (0.9268, 0.9975)
## No Information Rate : 0.8125
## P-Value [Acc > NIR] : 5.86e-07
##
## Kappa : 0.9316
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9872
## Specificity : 0.9444
## Pos Pred Value : 0.9872
## Neg Pred Value : 0.9444
## Prevalence : 0.8125
## Detection Rate : 0.8021
## Detection Prevalence : 0.8125
## Balanced Accuracy : 0.9658
##
## 'Positive' Class : M
##
##################################
# Identifying and plotting the
# best model predictors
##################################
CS_C50_VarImp <- varImp(CS_C50_Tune, scale = TRUE)
plot(CS_C50_VarImp,
top=25,
scales=list(y=list(cex = .95)),
main="Ranked Variable Importance : Cost-Sensitive C5.0 Decision Trees",
xlab="Scaled Variable Importance Metrics",
ylab="Predictors",
cex=2,
origin=0,
alpha=0.45)

##################################
# Independently evaluating the model
# on the test set
##################################
CS_C50_Test <- data.frame(CS_C50_Observed = PMA_PreModelling_Test$Class,
CS_C50_Predicted = predict(CS_C50_Tune,
PMA_PreModelling_Test[,!names(PMA_PreModelling_Test) %in% c("Class")],
type = "raw"))
CS_C50_Test
## CS_C50_Observed CS_C50_Predicted
## 1 M M
## 2 M M
## 3 M M
## 4 M M
## 5 M M
## 6 M R
## 7 M M
## 8 M R
## 9 M M
## 10 M R
## 11 M M
## 12 M M
## 13 M M
## 14 M M
## 15 M M
## 16 M M
## 17 M M
## 18 M R
## 19 M M
## 20 M M
## 21 M R
## 22 M M
## 23 M M
## 24 M M
## 25 M M
## 26 M M
## 27 M M
## 28 M M
## 29 M M
## 30 M M
## 31 M M
## 32 M M
## 33 M M
## 34 R M
## 35 R R
## 36 R M
## 37 R M
## 38 R M
## 39 R M
## 40 R M
##################################
# Reporting the independent evaluation results
# for the test set
##################################
(CS_C50_Test_Specificity <- Specificity(y_pred = CS_C50_Test$CS_C50_Predicted,
y_true = CS_C50_Test$CS_C50_Observed))
## [1] 0.1428571
(CS_C50_Test_ConfusionMatrix <- confusionMatrix(data = CS_C50_Test$CS_C50_Predicted,
reference = CS_C50_Test$CS_C50_Observed))
## Confusion Matrix and Statistics
##
## Reference
## Prediction M R
## M 28 6
## R 5 1
##
## Accuracy : 0.725
## 95% CI : (0.5611, 0.854)
## No Information Rate : 0.825
## P-Value [Acc > NIR] : 0.9632
##
## Kappa : -0.0092
##
## Mcnemar's Test P-Value : 1.0000
##
## Sensitivity : 0.8485
## Specificity : 0.1429
## Pos Pred Value : 0.8235
## Neg Pred Value : 0.1667
## Prevalence : 0.8250
## Detection Rate : 0.7000
## Detection Prevalence : 0.8500
## Balanced Accuracy : 0.4957
##
## 'Positive' Class : M
##
1.7 Evaluation Summary
Model performance comparison:
[A] The model which demonstrated the most consistent
improvement in the specificity metric after cost-sensitive training is as
follows:
[A.1] SVM_R: Support Vector Machine - Radial Basis
Function Kernel
[A.1.1] Test Specificity = 0.14286,
Cross-Validation Specificity = 0.60000
[A.2] CW_SVM_R: Class-Weighted Support Vector
Machine - Radial Basis Function Kernel
[A.2.1] Test Specificity = 0.28571,
Cross-Validation Specificity = 0.65000
##################################
# Consolidating all evaluation results
# for the train and test sets
# using the specificity metric
##################################
Model <- c('SVM_R','CW_SVM_R','CART','CS_CART','C50','CS_C50',
'SVM_R','CW_SVM_R','CART','CS_CART','C50','CS_C50')
Set <- c(rep('Cross-Validation',6),rep('Test',6))
Specificity <- c(SVM_R_Train_Specificity,
CW_SVM_R_Train_Specificity,
CART_Train_Specificity,
CS_CART_Train_Specificity,
C50_Train_Specificity,
CS_C50_Train_Specificity,
SVM_R_Test_Specificity,
CW_SVM_R_Test_Specificity,
CART_Test_Specificity,
CS_CART_Test_Specificity,
C50_Test_Specificity,
CS_C50_Test_Specificity)
Specificity_Summary <- as.data.frame(cbind(Model,Set,Specificity))
Specificity_Summary$Specificity <- as.numeric(as.character(Specificity_Summary$Specificity))
Specificity_Summary$Set <- factor(Specificity_Summary$Set,
levels = c("Cross-Validation",
"Test"))
Specificity_Summary$Model <- factor(Specificity_Summary$Model,
levels =c('SVM_R',
'CW_SVM_R',
'CART',
'CS_CART',
'C50',
'CS_C50'))
print(Specificity_Summary, row.names=FALSE)
## Model Set Specificity
## SVM_R Cross-Validation 0.6000000
## CW_SVM_R Cross-Validation 0.6500000
## CART Cross-Validation 0.4500000
## CS_CART Cross-Validation 0.3000000
## C50 Cross-Validation 0.5500000
## CS_C50 Cross-Validation 0.5500000
## SVM_R Test 0.1428571
## CW_SVM_R Test 0.2857143
## CART Test 0.2857143
## CS_CART Test 0.7142857
## C50 Test 0.1428571
## CS_C50 Test 0.1428571
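A wide view of the same summary makes the cross-validation-to-test change per model easier to scan; a minimal sketch using the dplyr and tidyr packages loaded earlier (pivot_wider assumes tidyr version 1.0 or later):
# Hypothetical wide reshaping of the specificity summary
Specificity_Summary %>%
pivot_wider(names_from = Set, values_from = Specificity) %>%
mutate(Difference = Test - `Cross-Validation`)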
(Specificity_Plot <- dotplot(Model ~ Specificity,
data = Specificity_Summary,
groups = Set,
main = "Classification Model Performance Comparison",
ylab = "Model",
xlab = "Specificity",
auto.key = list(adj = 1),
type=c("p", "h"),
origin = 0,
alpha = 0.45,
pch = 16,
cex = 2))

##################################
# Consolidating the resampling results
# for the candidate models
##################################
(COST_COMPARISON_RESAMPLING <- resamples(list(SVM_R = SVM_R_Tune,
CW_SVM_R = CW_SVM_R_Tune,
CART = CART_Tune,
CS_CART = CS_CART_Tune,
C50 = C50_Tune,
CS_C50 = CS_C50_Tune)))
##
## Call:
## resamples.default(x = list(SVM_R = SVM_R_Tune, CW_SVM_R = CW_SVM_R_Tune, CART
## = CART_Tune, CS_CART = CS_CART_Tune, C50 = C50_Tune, CS_C50 = CS_C50_Tune))
##
## Models: SVM_R, CW_SVM_R, CART, CS_CART, C50, CS_C50
## Number of resamples: 10
## Performance metrics: Accuracy, Kappa, Sensitivity, Specificity
## Time estimates for: everything, final model fit
summary(COST_COMPARISON_RESAMPLING)
##
## Call:
## summary.resamples(object = COST_COMPARISON_RESAMPLING)
##
## Models: SVM_R, CW_SVM_R, CART, CS_CART, C50, CS_C50
## Number of resamples: 10
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## SVM_R 0.8000000 0.8916667 0.9000000 0.9277778 1.0000000 1.0 0
## CW_SVM_R 0.8000000 0.8916667 0.9000000 0.9277778 1.0000000 1.0 0
## CART 0.6000000 0.7194444 0.7888889 0.7922222 0.8750000 1.0 0
## CS_CART 0.4000000 0.7777778 0.8000000 0.7600000 0.8666667 0.9 0
## C50 0.6666667 0.7777778 0.8000000 0.8322222 0.9000000 1.0 0
## CS_C50 0.6666667 0.7777778 0.8000000 0.8322222 0.9000000 1.0 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## SVM_R 0.0000000 0.61036789 0.61538462 0.6454849 1.0000000 1.0000000 0
## CW_SVM_R 0.0000000 0.61036789 0.61538462 0.6454849 1.0000000 1.0000000 0
## CART -0.2500000 -0.09375000 0.29090909 0.2829196 0.5979021 1.0000000 0
## CS_CART -0.3636364 0.00000000 0.02631579 0.1696080 0.4884868 0.6153846 0
## C50 -0.1538462 0.04545455 0.38750000 0.4030199 0.7064777 1.0000000 0
## CS_C50 -0.1538462 0.04545455 0.38750000 0.4030199 0.7064777 1.0000000 0
##
## Sensitivity
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## SVM_R 1.0000000 1.0000000 1.0000 1.0000000 1 1 0
## CW_SVM_R 1.0000000 1.0000000 1.0000 1.0000000 1 1 0
## CART 0.7142857 0.7500000 0.8750 0.8714286 1 1 0
## CS_CART 0.5000000 0.7544643 0.9375 0.8535714 1 1 0
## C50 0.7142857 0.8750000 0.8750 0.8964286 1 1 0
## CS_C50 0.7142857 0.8750000 0.8750 0.8964286 1 1 0
##
## Specificity
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## SVM_R 0 0.500 0.50 0.60 1.000 1 0
## CW_SVM_R 0 0.500 0.50 0.60 1.000 1 0
## CART 0 0.000 0.50 0.45 0.875 1 0
## CS_CART 0 0.000 0.25 0.30 0.500 1 0
## C50 0 0.125 0.50 0.55 1.000 1 0
## CS_C50 0 0.125 0.50 0.55 1.000 1 0
##################################
# Exploring the resampling results
##################################
bwplot(COST_COMPARISON_RESAMPLING,
main = "Model Resampling Performance Comparison (Range)",
ylab = "Model",
pch = 16,
cex = 2,
layout=c(4,1))

dotplot(COST_COMPARISON_RESAMPLING,
main = "Model Resampling Performance Comparison (95% Confidence Interval)",
ylab = "Model",
pch = 16,
cex = 2,
layout=c(4,1))
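Pairwise differences between the resampled specificities can also be summarized formally through caret's diff method for resamples objects; a minimal sketch (not executed in this run):
# Paired differences in resampled Specificity between candidate models,
# summarized with the default paired t-tests and Bonferroni adjustment
COST_COMPARISON_DIFFERENCES <- diff(COST_COMPARISON_RESAMPLING, metric = "Specificity")
summary(COST_COMPARISON_DIFFERENCES)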

##################################
# Consolidating all models
##################################
(COST_COMPARISON_MODELS <- (list(SVM_R = SVM_R_Tune,
CW_SVM_R = CW_SVM_R_Tune,
CART = CART_Tune,
CS_CART = CS_CART_Tune,
C50 = C50_Tune,
CS_C50 = CS_C50_Tune)))
## $SVM_R
## Support Vector Machines with Radial Basis Function Kernel
##
## 96 samples
## 60 predictors
## 2 classes: 'M', 'R'
##
## Pre-processing: centered (60), scaled (60)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 86, 87, 86, 87, 87, 86, ...
## Resampling results across tuning parameters:
##
## C Accuracy Kappa Sensitivity Specificity
## 0.01562500 0.8133333 0.0000000 1 0.0
## 0.02209709 0.8133333 0.0000000 1 0.0
## 0.03125000 0.8133333 0.0000000 1 0.0
## 0.04419417 0.8133333 0.0000000 1 0.0
## 0.06250000 0.8133333 0.0000000 1 0.0
## 0.08838835 0.8133333 0.0000000 1 0.0
## 0.12500000 0.8133333 0.0000000 1 0.0
## 0.17677670 0.8133333 0.0000000 1 0.0
## 0.25000000 0.8133333 0.0000000 1 0.0
## 0.35355339 0.8133333 0.0000000 1 0.0
## 0.50000000 0.8133333 0.0000000 1 0.0
## 0.70710678 0.8133333 0.0000000 1 0.0
## 1.00000000 0.8655556 0.3224080 1 0.3
## 1.41421356 0.9066667 0.5678930 1 0.5
## 2.00000000 0.9277778 0.6454849 1 0.6
##
## Tuning parameter 'sigma' was held constant at a value of 0.008732089
## Specificity was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.008732089 and C = 2.
##
## $CW_SVM_R
## Support Vector Machines with Radial Basis Function Kernel
##
## 96 samples
## 60 predictors
## 2 classes: 'M', 'R'
##
## Pre-processing: centered (60), scaled (60)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 86, 87, 86, 87, 87, 86, ...
## Resampling results across tuning parameters:
##
## C Accuracy Kappa Sensitivity Specificity
## 0.01562500 0.8133333 0.0000000 1.0000000 0.00
## 0.02209709 0.8133333 0.0000000 1.0000000 0.00
## 0.03125000 0.8133333 0.0000000 1.0000000 0.00
## 0.04419417 0.8133333 0.0000000 1.0000000 0.00
## 0.06250000 0.8133333 0.0000000 1.0000000 0.00
## 0.08838835 0.8444444 0.2230769 1.0000000 0.20
## 0.12500000 0.8633333 0.4848567 0.9357143 0.55
## 0.17677670 0.8422222 0.4554315 0.8964286 0.60
## 0.25000000 0.8322222 0.4389841 0.8839286 0.60
## 0.35355339 0.8633333 0.5094884 0.9089286 0.65
## 0.50000000 0.8844444 0.5566738 0.9339286 0.65
## 0.70710678 0.8944444 0.5758125 0.9464286 0.65
## 1.00000000 0.9166667 0.6324754 0.9732143 0.65
## 1.41421356 0.9066667 0.5940139 0.9732143 0.60
## 2.00000000 0.9066667 0.5940139 0.9732143 0.60
##
## Tuning parameter 'sigma' was held constant at a value of 0.008732089
## Specificity was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.008732089 and C = 0.3535534.
##
## $CART
## CART
##
## 96 samples
## 60 predictors
## 2 classes: 'M', 'R'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 86, 87, 86, 87, 87, 86, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa Sensitivity Specificity
## 0.0001 0.7922222 0.2829196 0.8714286 0.45
## 0.0005 0.7922222 0.2829196 0.8714286 0.45
## 0.0010 0.7922222 0.2829196 0.8714286 0.45
## 0.0050 0.7922222 0.2829196 0.8714286 0.45
## 0.0100 0.7922222 0.2829196 0.8714286 0.45
## 0.0150 0.7922222 0.2829196 0.8714286 0.45
## 0.0200 0.7922222 0.2829196 0.8714286 0.45
##
## Specificity was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.02.
##
## $CS_CART
## CART
##
## 96 samples
## 60 predictors
## 2 classes: 'M', 'R'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 86, 87, 86, 87, 87, 86, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa Sensitivity Specificity
## 0.0001 0.76 0.169608 0.8535714 0.3
## 0.0005 0.76 0.169608 0.8535714 0.3
## 0.0010 0.76 0.169608 0.8535714 0.3
## 0.0050 0.76 0.169608 0.8535714 0.3
## 0.0100 0.76 0.169608 0.8535714 0.3
## 0.0150 0.76 0.169608 0.8535714 0.3
## 0.0200 0.76 0.169608 0.8535714 0.3
##
## Specificity was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.02.
##
## $C50
## C5.0
##
## 96 samples
## 60 predictors
## 2 classes: 'M', 'R'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 86, 87, 86, 87, 87, 86, ...
## Resampling results across tuning parameters:
##
## model winnow trials Accuracy Kappa Sensitivity Specificity
## rules FALSE 1 0.8444444 0.3945908 0.9232143 0.50
## rules FALSE 2 0.8333333 0.2971608 0.9482143 0.35
## rules FALSE 3 0.8533333 0.4043357 0.9464286 0.45
## rules FALSE 4 0.8333333 0.1839465 0.9875000 0.20
## rules FALSE 5 0.8655556 0.3811992 0.9732143 0.40
## rules FALSE 6 0.8655556 0.3811992 0.9732143 0.40
## rules FALSE 7 0.8655556 0.3811992 0.9732143 0.40
## rules FALSE 8 0.8655556 0.3811992 0.9732143 0.40
## rules FALSE 9 0.8544444 0.3603297 0.9607143 0.40
## rules FALSE 10 0.8655556 0.3811992 0.9732143 0.40
## rules FALSE 20 0.8877778 0.4454849 1.0000000 0.40
## rules FALSE 30 0.8877778 0.4454849 1.0000000 0.40
## rules FALSE 40 0.8766667 0.4063545 0.9875000 0.40
## rules FALSE 50 0.8655556 0.3672241 0.9875000 0.35
## rules FALSE 60 0.8655556 0.3672241 0.9875000 0.35
## rules FALSE 70 0.8655556 0.3672241 0.9875000 0.35
## rules FALSE 80 0.8655556 0.3672241 0.9875000 0.35
## rules FALSE 90 0.8655556 0.3672241 0.9875000 0.35
## rules FALSE 100 0.8766667 0.4063545 0.9875000 0.40
## rules TRUE 1 0.8155556 0.1465909 0.9500000 0.20
## rules TRUE 2 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 3 0.8044444 0.1865909 0.9250000 0.30
## rules TRUE 4 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 5 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 6 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 7 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 8 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 9 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 10 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 20 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 30 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 40 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 50 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 60 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 70 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 80 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 90 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 100 0.8144444 0.2106294 0.9375000 0.30
## tree FALSE 1 0.8322222 0.4030199 0.8964286 0.55
## tree FALSE 2 0.8233333 0.2482776 0.9500000 0.30
## tree FALSE 3 0.8222222 0.3149126 0.9214286 0.40
## tree FALSE 4 0.8322222 0.2764632 0.9607143 0.30
## tree FALSE 5 0.8322222 0.3495363 0.9339286 0.40
## tree FALSE 6 0.8333333 0.2323161 0.9750000 0.25
## tree FALSE 7 0.8544444 0.3686992 0.9607143 0.40
## tree FALSE 8 0.8444444 0.3071608 0.9607143 0.35
## tree FALSE 9 0.8433333 0.3633125 0.9339286 0.45
## tree FALSE 10 0.8233333 0.2439919 0.9482143 0.30
## tree FALSE 20 0.8655556 0.3721154 0.9875000 0.35
## tree FALSE 30 0.8544444 0.3329849 0.9750000 0.35
## tree FALSE 40 0.8433333 0.2938545 0.9750000 0.30
## tree FALSE 50 0.8544444 0.3547241 0.9750000 0.35
## tree FALSE 60 0.8544444 0.3547241 0.9750000 0.35
## tree FALSE 70 0.8433333 0.3295688 0.9607143 0.35
## tree FALSE 80 0.8544444 0.3547241 0.9750000 0.35
## tree FALSE 90 0.8433333 0.3295688 0.9607143 0.35
## tree FALSE 100 0.8433333 0.3295688 0.9607143 0.35
## tree TRUE 1 0.7955556 0.1147589 0.9250000 0.20
## tree TRUE 2 0.8155556 0.1552448 0.9500000 0.20
## tree TRUE 3 0.7844444 0.1547589 0.9000000 0.30
## tree TRUE 4 0.8044444 0.1952448 0.9250000 0.30
## tree TRUE 5 0.8044444 0.1952448 0.9250000 0.30
## tree TRUE 6 0.8044444 0.1952448 0.9250000 0.30
## tree TRUE 7 0.8044444 0.1952448 0.9250000 0.30
## tree TRUE 8 0.8044444 0.1952448 0.9250000 0.30
## tree TRUE 9 0.8044444 0.1952448 0.9250000 0.30
## tree TRUE 10 0.8044444 0.1952448 0.9250000 0.30
## tree TRUE 20 0.8044444 0.1952448 0.9250000 0.30
## tree TRUE 30 0.8044444 0.1952448 0.9250000 0.30
## tree TRUE 40 0.8044444 0.1952448 0.9250000 0.30
## tree TRUE 50 0.8044444 0.1952448 0.9250000 0.30
## tree TRUE 60 0.8044444 0.1952448 0.9250000 0.30
## tree TRUE 70 0.8044444 0.1952448 0.9250000 0.30
## tree TRUE 80 0.8044444 0.1952448 0.9250000 0.30
## tree TRUE 90 0.8044444 0.1952448 0.9250000 0.30
## tree TRUE 100 0.8044444 0.1952448 0.9250000 0.30
##
## Specificity was used to select the optimal model using the largest value.
## The final values used for the model were trials = 1, model = tree and winnow
## = FALSE.
##
## $CS_C50
## C5.0
##
## 96 samples
## 60 predictors
## 2 classes: 'M', 'R'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 86, 87, 86, 87, 87, 86, ...
## Resampling results across tuning parameters:
##
## model winnow trials Accuracy Kappa Sensitivity Specificity
## rules FALSE 1 0.8444444 0.3945908 0.9232143 0.50
## rules FALSE 2 0.8333333 0.2971608 0.9482143 0.35
## rules FALSE 3 0.8533333 0.4043357 0.9464286 0.45
## rules FALSE 4 0.8333333 0.1839465 0.9875000 0.20
## rules FALSE 5 0.8655556 0.3811992 0.9732143 0.40
## rules FALSE 6 0.8655556 0.3811992 0.9732143 0.40
## rules FALSE 7 0.8655556 0.3811992 0.9732143 0.40
## rules FALSE 8 0.8655556 0.3811992 0.9732143 0.40
## rules FALSE 9 0.8544444 0.3603297 0.9607143 0.40
## rules FALSE 10 0.8655556 0.3811992 0.9732143 0.40
## rules FALSE 20 0.8877778 0.4454849 1.0000000 0.40
## rules FALSE 30 0.8877778 0.4454849 1.0000000 0.40
## rules FALSE 40 0.8766667 0.4063545 0.9875000 0.40
## rules FALSE 50 0.8655556 0.3672241 0.9875000 0.35
## rules FALSE 60 0.8655556 0.3672241 0.9875000 0.35
## rules FALSE 70 0.8655556 0.3672241 0.9875000 0.35
## rules FALSE 80 0.8655556 0.3672241 0.9875000 0.35
## rules FALSE 90 0.8655556 0.3672241 0.9875000 0.35
## rules FALSE 100 0.8766667 0.4063545 0.9875000 0.40
## rules TRUE 1 0.8155556 0.1465909 0.9500000 0.20
## rules TRUE 2 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 3 0.8044444 0.1865909 0.9250000 0.30
## rules TRUE 4 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 5 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 6 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 7 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 8 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 9 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 10 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 20 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 30 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 40 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 50 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 60 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 70 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 80 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 90 0.8144444 0.2106294 0.9375000 0.30
## rules TRUE 100 0.8144444 0.2106294 0.9375000 0.30
## tree FALSE 1 0.8322222 0.4030199 0.8964286 0.55
## tree FALSE 2 0.8233333 0.2482776 0.9500000 0.30
## tree FALSE 3 0.8222222 0.3149126 0.9214286 0.40
## tree FALSE 4 0.8322222 0.2764632 0.9607143 0.30
## tree FALSE 5 0.8322222 0.3495363 0.9339286 0.40
## tree FALSE 6 0.8333333 0.2323161 0.9750000 0.25
## tree FALSE 7 0.8544444 0.3686992 0.9607143 0.40
## tree FALSE 8 0.8444444 0.3071608 0.9607143 0.35
## tree FALSE 9 0.8433333 0.3633125 0.9339286 0.45
## tree FALSE 10 0.8233333 0.2439919 0.9482143 0.30
## tree FALSE 20 0.8655556 0.3721154 0.9875000 0.35
## tree FALSE 30 0.8544444 0.3329849 0.9750000 0.35
## tree FALSE 40 0.8433333 0.2938545 0.9750000 0.30
## tree FALSE 50 0.8544444 0.3547241 0.9750000 0.35
## tree FALSE 60 0.8544444 0.3547241 0.9750000 0.35
## tree FALSE 70 0.8433333 0.3295688 0.9607143 0.35
## tree FALSE 80 0.8544444 0.3547241 0.9750000 0.35
## tree FALSE 90 0.8433333 0.3295688 0.9607143 0.35
## tree FALSE 100 0.8433333 0.3295688 0.9607143 0.35
## tree TRUE 1 0.7955556 0.1147589 0.9250000 0.20
## tree TRUE 2 0.8155556 0.1552448 0.9500000 0.20
## tree TRUE 3 0.7844444 0.1547589 0.9000000 0.30
## tree TRUE 4 0.8044444 0.1952448 0.9250000 0.30
## tree TRUE 5 0.8044444 0.1952448 0.9250000 0.30
## tree TRUE 6 0.8044444 0.1952448 0.9250000 0.30
## tree TRUE 7 0.8044444 0.1952448 0.9250000 0.30
## tree TRUE 8 0.8044444 0.1952448 0.9250000 0.30
## tree TRUE 9 0.8044444 0.1952448 0.9250000 0.30
## tree TRUE 10 0.8044444 0.1952448 0.9250000 0.30
## tree TRUE 20 0.8044444 0.1952448 0.9250000 0.30
## tree TRUE 30 0.8044444 0.1952448 0.9250000 0.30
## tree TRUE 40 0.8044444 0.1952448 0.9250000 0.30
## tree TRUE 50 0.8044444 0.1952448 0.9250000 0.30
## tree TRUE 60 0.8044444 0.1952448 0.9250000 0.30
## tree TRUE 70 0.8044444 0.1952448 0.9250000 0.30
## tree TRUE 80 0.8044444 0.1952448 0.9250000 0.30
## tree TRUE 90 0.8044444 0.1952448 0.9250000 0.30
## tree TRUE 100 0.8044444 0.1952448 0.9250000 0.30
##
## Specificity was used to select the optimal model using the largest value.
## The final values used for the model were trials = 1, model = tree and winnow
## = FALSE.
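For readers who want a self-contained reference for how Specificity-driven model selection can be specified in caret, a minimal sketch is given below. This is an illustration under assumed settings, not the exact code used to fit the models above: the CustomSpecificitySummary, C50_Grid, CS_C50_Control and C50_Sketch object names are hypothetical, while the tuning grid simply mirrors the trials, model and winnow values shown in the resampling output.

##################################
# Illustrative sketch only (not the exact
# code used above): tuning a C5.0 model
# with Specificity as the selection metric
##################################
CustomSpecificitySummary <- function(data, lev = NULL, model = NULL) {
  # Combining Accuracy/Kappa with class-level Sensitivity and Specificity
  c(defaultSummary(data, lev, model),
    Sensitivity = sensitivity(data$pred, data$obs, positive = lev[1]),
    Specificity = specificity(data$pred, data$obs, negative = lev[2]))
}
C50_Grid <- expand.grid(trials = c(1:10, seq(20, 100, by = 10)),
                        model  = c("rules", "tree"),
                        winnow = c(TRUE, FALSE))
CS_C50_Control <- trainControl(method = "cv",
                               number = 10,
                               summaryFunction = CustomSpecificitySummary)
set.seed(12345678)
C50_Sketch <- train(x = Sonar_Train[, !names(Sonar_Train) %in% c("Class")],
                    y = Sonar_Train$Class,
                    method = "C5.0",
                    metric = "Specificity",
                    tuneGrid = C50_Grid,
                    trControl = CS_C50_Control)

Because the metric is not an error measure, train() maximizes it by default, which matches the "largest value" selection rule reported in the output above.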
##################################
# Creating a function to evaluate
# model performance on the test data
##################################
COST_COMPARISON_TEST_Specificity <- function(model, data) {
  # Generating class predictions for the test data
  Data_Test <- data.frame(Observed  = data$Class,
                          Predicted = predict(model,
                                              data[, !names(data) %in% c("Class")],
                                              type = "raw"))
  # Computing the test set specificity using MLmetrics::Specificity
  Specificity <- Specificity(y_pred = Data_Test$Predicted,
                             y_true = Data_Test$Observed)
  return(Specificity)
}
# Applying the function to each candidate model
COST_COMPARISON_TEST_SUMMARY <- lapply(COST_COMPARISON_MODELS,
                                       COST_COMPARISON_TEST_Specificity,
                                       data = PMA_PreModelling_Test)
# Consolidating the specificity values into a single data frame
COST_COMPARISON_TEST_SUMMARY <- lapply(COST_COMPARISON_TEST_SUMMARY, as.vector)
COST_COMPARISON_TEST_SUMMARY <- do.call("rbind", COST_COMPARISON_TEST_SUMMARY)
colnames(COST_COMPARISON_TEST_SUMMARY) <- c("Specificity")
(COST_COMPARISON_TEST_SUMMARY <- as.data.frame(COST_COMPARISON_TEST_SUMMARY))
## Specificity
## SVM_R 0.1428571
## CW_SVM_R 0.2857143
## CART 0.2857143
## CS_CART 0.7142857
## C50 0.1428571
## CS_C50 0.1428571
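For a quick side-by-side view of these results, the test set specificities can also be plotted. The sketch below assumes the COST_COMPARISON_TEST_SUMMARY data frame produced above and uses ggplot2 (already loaded via tidyverse); the COST_COMPARISON_TEST_PLOT object name is illustrative.

##################################
# Visualizing the test set specificity
# of the formulated models
# (illustrative sketch only)
##################################
COST_COMPARISON_TEST_PLOT <- COST_COMPARISON_TEST_SUMMARY
COST_COMPARISON_TEST_PLOT$Model <- rownames(COST_COMPARISON_TEST_SUMMARY)
ggplot(COST_COMPARISON_TEST_PLOT,
       aes(x = reorder(Model, Specificity), y = Specificity)) +
  geom_col() +
  coord_flip() +
  labs(x = "Model",
       y = "Test Set Specificity",
       title = "Model Comparison : Test Set Specificity")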