1.1 Sample Data
The Sonar dataset from the mlbench package was used for this illustrative example.
Preliminary dataset assessment:
[A] 208 rows (observations)
[A.1] Train Set = 167 observations
[A.2] Test Set = 41 observations
[B] 61 columns (variables)
[B.1] 1/61 response = Class variable (factor)
[B.1.1] Levels = Class=M < Class=R
[B.2] 60/61 predictors = All remaining variables
(60/60 numeric)
##################################
# Loading R libraries
##################################
library(AppliedPredictiveModeling)
library(caret)
library(rpart)
library(lattice)
library(dplyr)
library(tidyr)
library(moments)
library(skimr)
library(RANN)
library(mlbench)
library(pls)
library(corrplot)
library(tidyverse)
library(lares)
library(DMwR)
library(gridExtra)
library(rattle)
library(rpart.plot)
library(RColorBrewer)
library(stats)
library(nnet)
library(elasticnet)
library(earth)
library(party)
library(kernlab)
library(randomForest)
library(Cubist)
library(pROC)
library(mda)
library(klaR)
library(pamr)
##################################
# Loading source and
# formulating the train set
##################################
# Bring the Sonar data frame into the workspace.
data(Sonar)
# Fix the random number generator state so the split below is reproducible.
set.seed(12345678)
# Draw row indices for the training sample: 80% of the rows (p = 0.80),
# sampled against the Class outcome.
Sonar_Partition <- createDataPartition(
  y = Sonar[["Class"]],
  p = 0.80,
  list = FALSE
)
# Rows selected by the partition form the train set; all others form the test set.
Sonar_Train <- Sonar[Sonar_Partition, ]
Sonar_Test <- Sonar[-Sonar_Partition, ]
##################################
# Performing a general exploration of the train set
##################################
# Dimensions (rows, columns) of the train set.
dim(Sonar_Train)
# Column-by-column structure: type and leading values of each variable.
# (This call produces the "'data.frame': ... obs. of ... variables" listing.)
str(Sonar_Train)
# Per-variable summaries: min/quartiles/mean/max for numeric columns and
# counts per level for the Class factor.
summary(Sonar_Train)
## [1] 167 61
## 'data.frame': 167 obs. of 61 variables:
## $ V1 : num 0.0262 0.01 0.0762 0.0286 0.0519 0.0223 0.0164 0.0039 0.0123 0.0124 ...
## $ V2 : num 0.0582 0.0171 0.0666 0.0453 0.0548 0.0375 0.0173 0.0063 0.0309 0.0433 ...
## $ V3 : num 0.1099 0.0623 0.0481 0.0277 0.0842 ...
## $ V4 : num 0.1083 0.0205 0.0394 0.0174 0.0319 ...
## $ V5 : num 0.0974 0.0205 0.059 0.0384 0.1158 ...
## $ V6 : num 0.228 0.0368 0.0649 0.099 0.0922 0.0591 0.0671 0.0284 0.0102 0.0355 ...
## $ V7 : num 0.243 0.11 0.121 0.12 0.103 ...
## $ V8 : num 0.3771 0.1276 0.2467 0.1833 0.0613 ...
## $ V9 : num 0.5598 0.0598 0.3564 0.2105 0.1465 ...
## $ V10 : num 0.619 0.126 0.446 0.304 0.284 ...
## $ V11 : num 0.6333 0.0881 0.4152 0.2988 0.2802 ...
## $ V12 : num 0.706 0.199 0.395 0.425 0.309 ...
## $ V13 : num 0.5544 0.0184 0.4256 0.6343 0.2657 ...
## $ V14 : num 0.532 0.226 0.413 0.82 0.38 ...
## $ V15 : num 0.648 0.173 0.453 1 0.563 ...
## $ V16 : num 0.693 0.213 0.533 0.999 0.438 ...
## $ V17 : num 0.6759 0.0693 0.7306 0.9508 0.2617 ...
## $ V18 : num 0.755 0.228 0.619 0.902 0.12 ...
## $ V19 : num 0.893 0.406 0.203 0.723 0.668 ...
## $ V20 : num 0.862 0.397 0.464 0.512 0.94 ...
## $ V21 : num 0.797 0.274 0.415 0.207 0.783 ...
## $ V22 : num 0.674 0.369 0.429 0.399 0.535 ...
## $ V23 : num 0.429 0.556 0.573 0.589 0.681 ...
## $ V24 : num 0.365 0.485 0.54 0.287 0.917 ...
## $ V25 : num 0.533 0.314 0.316 0.204 0.761 ...
## $ V26 : num 0.241 0.533 0.229 0.578 0.822 ...
## $ V27 : num 0.507 0.526 0.7 0.539 0.887 ...
## $ V28 : num 0.853 0.252 1 0.375 0.609 ...
## $ V29 : num 0.604 0.209 0.726 0.341 0.297 ...
## $ V30 : num 0.851 0.356 0.472 0.507 0.11 ...
## $ V31 : num 0.851 0.626 0.51 0.558 0.132 ...
## $ V32 : num 0.5045 0.734 0.5459 0.4778 0.0624 ...
## $ V33 : num 0.186 0.612 0.288 0.33 0.099 ...
## $ V34 : num 0.2709 0.3497 0.0981 0.2198 0.4006 ...
## $ V35 : num 0.423 0.395 0.195 0.141 0.367 ...
## $ V36 : num 0.304 0.301 0.418 0.286 0.105 ...
## $ V37 : num 0.612 0.541 0.46 0.381 0.192 ...
## $ V38 : num 0.676 0.881 0.322 0.416 0.393 ...
## $ V39 : num 0.537 0.986 0.283 0.405 0.429 ...
## $ V40 : num 0.472 0.917 0.243 0.33 0.255 ...
## $ V41 : num 0.465 0.612 0.198 0.271 0.115 ...
## $ V42 : num 0.259 0.501 0.244 0.265 0.22 ...
## $ V43 : num 0.2129 0.321 0.1847 0.0723 0.1879 ...
## $ V44 : num 0.2222 0.3202 0.0841 0.1238 0.1437 ...
## $ V45 : num 0.2111 0.4295 0.0692 0.1192 0.2146 ...
## $ V46 : num 0.0176 0.3654 0.0528 0.1089 0.236 ...
## $ V47 : num 0.1348 0.2655 0.0357 0.0623 0.1125 ...
## $ V48 : num 0.0744 0.1576 0.0085 0.0494 0.0254 ...
## $ V49 : num 0.013 0.0681 0.023 0.0264 0.0285 0.0777 0.0092 0.0228 0.0134 0.045 ...
## $ V50 : num 0.0106 0.0294 0.0046 0.0081 0.0178 0.0439 0.0198 0.0073 0.0217 0.0167 ...
## $ V51 : num 0.0033 0.0241 0.0156 0.0104 0.0052 0.0061 0.0118 0.0062 0.0188 0.0078 ...
## $ V52 : num 0.0232 0.0121 0.0031 0.0045 0.0081 0.0145 0.009 0.0062 0.0133 0.0083 ...
## $ V53 : num 0.0166 0.0036 0.0054 0.0014 0.012 0.0128 0.0223 0.012 0.0265 0.0057 ...
## $ V54 : num 0.0095 0.015 0.0105 0.0038 0.0045 0.0145 0.0179 0.0052 0.0224 0.0174 ...
## $ V55 : num 0.018 0.0085 0.011 0.0013 0.0121 0.0058 0.0084 0.0056 0.0074 0.0188 ...
## $ V56 : num 0.0244 0.0073 0.0015 0.0089 0.0097 0.0049 0.0068 0.0093 0.0118 0.0054 ...
## $ V57 : num 0.0316 0.005 0.0072 0.0057 0.0085 0.0065 0.0032 0.0042 0.0026 0.0114 ...
## $ V58 : num 0.0164 0.0044 0.0048 0.0027 0.0047 0.0093 0.0035 0.0003 0.0092 0.0196 ...
## $ V59 : num 0.0095 0.004 0.0107 0.0051 0.0048 0.0059 0.0056 0.0053 0.0009 0.0147 ...
## $ V60 : num 0.0078 0.0117 0.0094 0.0062 0.0053 0.0022 0.004 0.0036 0.0044 0.0062 ...
## $ Class: Factor w/ 2 levels "M","R": 2 2 2 2 2 2 2 2 2 2 ...
## V1 V2 V3 V4
## Min. :0.00150 Min. :0.00170 Min. :0.00150 Min. :0.0058
## 1st Qu.:0.01380 1st Qu.:0.01645 1st Qu.:0.01845 1st Qu.:0.0238
## Median :0.02280 Median :0.03080 Median :0.03470 Median :0.0444
## Mean :0.02928 Mean :0.03823 Mean :0.04328 Mean :0.0515
## 3rd Qu.:0.03640 3rd Qu.:0.04755 3rd Qu.:0.06015 3rd Qu.:0.0657
## Max. :0.13710 Max. :0.16320 Max. :0.16650 Max. :0.1732
## V5 V6 V7 V8
## Min. :0.00670 Min. :0.01020 Min. :0.01300 Min. :0.0057
## 1st Qu.:0.03640 1st Qu.:0.06665 1st Qu.:0.08365 1st Qu.:0.0780
## Median :0.06130 Median :0.09210 Median :0.10540 Median :0.1119
## Mean :0.07196 Mean :0.10333 Mean :0.12062 Mean :0.1345
## 3rd Qu.:0.09905 3rd Qu.:0.13145 3rd Qu.:0.15035 3rd Qu.:0.1723
## Max. :0.25650 Max. :0.38230 Max. :0.37290 Max. :0.4566
## V9 V10 V11 V12
## Min. :0.01170 Min. :0.0113 Min. :0.0289 Min. :0.0236
## 1st Qu.:0.09555 1st Qu.:0.1069 1st Qu.:0.1258 1st Qu.:0.1283
## Median :0.15220 Median :0.1799 Median :0.2210 Median :0.2484
## Mean :0.17995 Mean :0.2087 Mean :0.2367 Mean :0.2490
## 3rd Qu.:0.23940 3rd Qu.:0.2736 3rd Qu.:0.3081 3rd Qu.:0.3341
## Max. :0.68280 Max. :0.7106 Max. :0.7342 Max. :0.7060
## V13 V14 V15 V16
## Min. :0.0184 Min. :0.0273 Min. :0.0031 Min. :0.0162
## 1st Qu.:0.1626 1st Qu.:0.1665 1st Qu.:0.1548 1st Qu.:0.1842
## Median :0.2655 Median :0.2793 Median :0.2616 Median :0.2934
## Mean :0.2727 Mean :0.2970 Mean :0.3151 Mean :0.3707
## 3rd Qu.:0.3584 3rd Qu.:0.3946 3rd Qu.:0.4524 3rd Qu.:0.5361
## Max. :0.7131 Max. :0.9970 Max. :1.0000 Max. :0.9988
## V17 V18 V19 V20
## Min. :0.0349 Min. :0.0375 Min. :0.0494 Min. :0.0740
## 1st Qu.:0.2019 1st Qu.:0.2344 1st Qu.:0.2990 1st Qu.:0.3346
## Median :0.3041 Median :0.3657 Median :0.4309 Median :0.5224
## Mean :0.4092 Mean :0.4484 Mean :0.5012 Mean :0.5571
## 3rd Qu.:0.6601 3rd Qu.:0.6759 3rd Qu.:0.7307 3rd Qu.:0.7990
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## V21 V22 V23 V24
## Min. :0.0512 Min. :0.0219 Min. :0.0563 Min. :0.0239
## 1st Qu.:0.3919 1st Qu.:0.3994 1st Qu.:0.4485 1st Qu.:0.5413
## Median :0.5911 Median :0.6464 Median :0.6809 Median :0.6954
## Mean :0.5992 Mean :0.6130 Mean :0.6385 Mean :0.6675
## 3rd Qu.:0.8153 3rd Qu.:0.8318 3rd Qu.:0.8517 3rd Qu.:0.8692
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## V25 V26 V27 V28
## Min. :0.0240 Min. :0.1543 Min. :0.0874 Min. :0.0284
## 1st Qu.:0.5246 1st Qu.:0.5468 1st Qu.:0.5163 1st Qu.:0.5104
## Median :0.7221 Median :0.7529 Median :0.7207 Median :0.7278
## Mean :0.6702 Mean :0.6948 Mean :0.6976 Mean :0.6955
## 3rd Qu.:0.8623 3rd Qu.:0.8801 3rd Qu.:0.9000 3rd Qu.:0.9055
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## V29 V30 V31 V32
## Min. :0.0144 Min. :0.0613 Min. :0.1000 Min. :0.0404
## 1st Qu.:0.4736 1st Qu.:0.4447 1st Qu.:0.3493 1st Qu.:0.2806
## Median :0.6898 Median :0.6213 Median :0.4973 Median :0.4241
## Mean :0.6528 Mean :0.6009 Mean :0.5211 Mean :0.4464
## 3rd Qu.:0.8620 3rd Qu.:0.7541 3rd Qu.:0.6797 3rd Qu.:0.6219
## Max. :1.0000 Max. :1.0000 Max. :0.9657 Max. :0.9306
## V33 V34 V35 V36
## Min. :0.0477 Min. :0.0212 Min. :0.0223 Min. :0.0080
## 1st Qu.:0.2691 1st Qu.:0.2183 1st Qu.:0.1840 1st Qu.:0.1552
## Median :0.3921 Median :0.3785 Median :0.3330 Median :0.3172
## Mean :0.4259 Mean :0.4121 Mean :0.3976 Mean :0.3873
## 3rd Qu.:0.5774 3rd Qu.:0.6046 3rd Qu.:0.6110 3rd Qu.:0.5663
## Max. :1.0000 Max. :0.9647 Max. :1.0000 Max. :1.0000
## V37 V38 V39 V40
## Min. :0.0351 Min. :0.0383 Min. :0.0371 Min. :0.0117
## 1st Qu.:0.1593 1st Qu.:0.1784 1st Qu.:0.1787 1st Qu.:0.1921
## Median :0.3039 Median :0.3104 Median :0.2828 Median :0.2792
## Mean :0.3677 Mean :0.3433 Mean :0.3319 Mean :0.3148
## 3rd Qu.:0.5395 3rd Qu.:0.4410 3rd Qu.:0.4587 3rd Qu.:0.4254
## Max. :0.9497 Max. :1.0000 Max. :0.9857 Max. :0.9167
## V41 V42 V43 V44
## Min. :0.0438 Min. :0.0056 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.1691 1st Qu.:0.1722 1st Qu.:0.1555 1st Qu.:0.1253
## Median :0.2649 Median :0.2587 Median :0.2275 Median :0.1753
## Mean :0.2977 Mean :0.2847 Mean :0.2495 Mean :0.2147
## 3rd Qu.:0.4024 3rd Qu.:0.3851 3rd Qu.:0.3225 3rd Qu.:0.2655
## Max. :0.7751 Max. :0.8246 Max. :0.7733 Max. :0.7762
## V45 V46 V47 V48
## Min. :0.00000 Min. :0.0000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.09695 1st Qu.:0.0693 1st Qu.:0.06635 1st Qu.:0.04625
## Median :0.14730 Median :0.1199 Median :0.10340 Median :0.08120
## Mean :0.20028 Mean :0.1631 Mean :0.12474 Mean :0.09464
## 3rd Qu.:0.23645 3rd Qu.:0.2006 3rd Qu.:0.15475 3rd Qu.:0.12245
## Max. :0.70340 Max. :0.7292 Max. :0.55220 Max. :0.33390
## V49 V50 V51 V52
## Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.00080
## 1st Qu.:0.02875 1st Qu.:0.01170 1st Qu.:0.00785 1st Qu.:0.00725
## Median :0.04520 Median :0.01790 Median :0.01360 Median :0.01120
## Mean :0.05411 Mean :0.02075 Mean :0.01595 Mean :0.01351
## 3rd Qu.:0.07235 3rd Qu.:0.02545 3rd Qu.:0.02115 3rd Qu.:0.01670
## Max. :0.17940 Max. :0.08250 Max. :0.10040 Max. :0.07090
## V53 V54 V55 V56
## Min. :0.00050 Min. :0.00100 Min. :0.000600 Min. :0.000400
## 1st Qu.:0.00470 1st Qu.:0.00545 1st Qu.:0.004100 1st Qu.:0.004450
## Median :0.00800 Median :0.00950 Median :0.007500 Median :0.007000
## Mean :0.01067 Mean :0.01083 Mean :0.009063 Mean :0.008187
## 3rd Qu.:0.01525 3rd Qu.:0.01375 3rd Qu.:0.012100 3rd Qu.:0.010350
## Max. :0.03900 Max. :0.03520 Max. :0.044700 Max. :0.039400
## V57 V58 V59 V60
## Min. :0.000300 Min. :0.000300 Min. :0.000100 Min. :0.000600
## 1st Qu.:0.003700 1st Qu.:0.003600 1st Qu.:0.003500 1st Qu.:0.003000
## Median :0.006000 Median :0.006000 Median :0.006000 Median :0.005100
## Mean :0.007763 Mean :0.008172 Mean :0.007586 Mean :0.006029
## 3rd Qu.:0.010350 3rd Qu.:0.010350 3rd Qu.:0.009800 3rd Qu.:0.008500
## Max. :0.035500 Max. :0.044000 Max. :0.029400 Max. :0.021800
## Class
## M:89
## R:78
##
##
##
##
##################################
# Performing a general exploration of the test set
##################################
# Dimensions (rows, columns) of the test set.
dim(Sonar_Test)
# Column-by-column structure: type and leading values of each variable.
# (This call produces the "'data.frame': ... obs. of ... variables" listing.)
str(Sonar_Test)
# Per-variable summaries: min/quartiles/mean/max for numeric columns and
# counts per level for the Class factor.
summary(Sonar_Test)
## [1] 41 61
## 'data.frame': 41 obs. of 61 variables:
## $ V1 : num 0.02 0.0453 0.0317 0.0079 0.009 0.0352 0.0099 0.01 0.0189 0.0123 ...
## $ V2 : num 0.0371 0.0523 0.0956 0.0086 0.0062 0.0116 0.0484 0.0275 0.0308 0.0022 ...
## $ V3 : num 0.0428 0.0843 0.1321 0.0055 0.0253 ...
## $ V4 : num 0.0207 0.0689 0.1408 0.025 0.0489 ...
## $ V5 : num 0.0954 0.1183 0.1674 0.0344 0.1197 ...
## $ V6 : num 0.0986 0.2583 0.171 0.0546 0.1589 ...
## $ V7 : num 0.1539 0.2156 0.0731 0.0528 0.1392 ...
## $ V8 : num 0.1601 0.3481 0.1401 0.0958 0.0987 ...
## $ V9 : num 0.3109 0.3337 0.2083 0.1009 0.0955 ...
## $ V10 : num 0.211 0.287 0.351 0.124 0.19 ...
## $ V11 : num 0.161 0.492 0.179 0.11 0.19 ...
## $ V12 : num 0.1582 0.6552 0.0658 0.1215 0.2547 ...
## $ V13 : num 0.2238 0.6919 0.0513 0.1874 0.4073 ...
## $ V14 : num 0.0645 0.7797 0.3752 0.3383 0.2988 ...
## $ V15 : num 0.066 0.746 0.542 0.323 0.29 ...
## $ V16 : num 0.227 0.944 0.544 0.272 0.533 ...
## $ V17 : num 0.31 1 0.515 0.394 0.402 ...
## $ V18 : num 0.3 0.887 0.426 0.643 0.157 ...
## $ V19 : num 0.508 0.802 0.202 0.727 0.302 ...
## $ V20 : num 0.48 0.782 0.423 0.867 0.391 ...
## $ V21 : num 0.578 0.521 0.772 0.967 0.354 ...
## $ V22 : num 0.507 0.405 0.974 0.985 0.444 ...
## $ V23 : num 0.433 0.396 0.939 0.948 0.641 ...
## $ V24 : num 0.555 0.391 0.556 0.804 0.46 ...
## $ V25 : num 0.671 0.325 0.527 0.683 0.601 ...
## $ V26 : num 0.641 0.32 0.683 0.514 0.869 ...
## $ V27 : num 0.71 0.327 0.571 0.309 0.835 ...
## $ V28 : num 0.808 0.2767 0.5429 0.0832 0.7669 ...
## $ V29 : num 0.679 0.442 0.218 0.402 0.508 ...
## $ V30 : num 0.386 0.203 0.215 0.234 0.462 ...
## $ V31 : num 0.131 0.379 0.581 0.19 0.538 ...
## $ V32 : num 0.26 0.295 0.632 0.123 0.537 ...
## $ V33 : num 0.512 0.198 0.296 0.172 0.384 ...
## $ V34 : num 0.755 0.234 0.187 0.235 0.36 ...
## $ V35 : num 0.854 0.131 0.297 0.249 0.74 ...
## $ V36 : num 0.851 0.418 0.516 0.365 0.776 ...
## $ V37 : num 0.669 0.384 0.615 0.338 0.386 ...
## $ V38 : num 0.6097 0.1057 0.4283 0.1589 0.0667 ...
## $ V39 : num 0.4943 0.184 0.5479 0.0989 0.3684 ...
## $ V40 : num 0.274 0.197 0.613 0.109 0.611 ...
## $ V41 : num 0.051 0.167 0.502 0.104 0.351 ...
## $ V42 : num 0.2834 0.0583 0.2377 0.0839 0.2312 ...
## $ V43 : num 0.282 0.14 0.196 0.139 0.22 ...
## $ V44 : num 0.4256 0.1628 0.1749 0.0819 0.3051 ...
## $ V45 : num 0.2641 0.0621 0.1304 0.0678 0.1937 ...
## $ V46 : num 0.1386 0.0203 0.0597 0.0663 0.157 ...
## $ V47 : num 0.1051 0.053 0.1124 0.1202 0.0479 ...
## $ V48 : num 0.1343 0.0742 0.1047 0.0692 0.0538 ...
## $ V49 : num 0.0383 0.0409 0.0507 0.0152 0.0146 ...
## $ V50 : num 0.0324 0.0061 0.0159 0.0266 0.0068 0.0469 0.0779 0.0247 0.0143 0.0074 ...
## $ V51 : num 0.0232 0.0125 0.0195 0.0174 0.0187 0.0426 0.0396 0.0118 0.0091 0.0149 ...
## $ V52 : num 0.0027 0.0084 0.0201 0.0176 0.0059 0.0346 0.0173 0.0088 0.0038 0.0125 ...
## $ V53 : num 0.0065 0.0089 0.0248 0.0127 0.0095 0.0158 0.0149 0.0104 0.0096 0.0134 ...
## $ V54 : num 0.0159 0.0048 0.0131 0.0088 0.0194 0.0154 0.0115 0.0036 0.0142 0.0026 ...
## $ V55 : num 0.0072 0.0094 0.007 0.0098 0.008 0.0109 0.0202 0.0088 0.019 0.0038 ...
## $ V56 : num 0.0167 0.0191 0.0138 0.0019 0.0152 0.0048 0.0139 0.0047 0.014 0.0018 ...
## $ V57 : num 0.018 0.014 0.0092 0.0059 0.0158 0.0095 0.0029 0.0117 0.0099 0.0113 ...
## $ V58 : num 0.0084 0.0049 0.0143 0.0058 0.0053 0.0015 0.016 0.002 0.0092 0.0058 ...
## $ V59 : num 0.009 0.0052 0.0036 0.0059 0.0189 0.0073 0.0106 0.0091 0.0052 0.0047 ...
## $ V60 : num 0.0032 0.0044 0.0103 0.0032 0.0102 0.0067 0.0134 0.0058 0.0075 0.0071 ...
## $ Class: Factor w/ 2 levels "M","R": 2 2 2 2 2 2 2 2 2 2 ...
## V1 V2 V3 V4
## Min. :0.00790 Min. :0.00060 Min. :0.00300 Min. :0.00610
## 1st Qu.:0.01290 1st Qu.:0.01650 1st Qu.:0.01910 1st Qu.:0.02500
## Median :0.02100 Median :0.03080 Median :0.03060 Median :0.03990
## Mean :0.02868 Mean :0.03928 Mean :0.04607 Mean :0.06363
## 3rd Qu.:0.03460 3rd Qu.:0.05090 3rd Qu.:0.04660 3rd Qu.:0.06270
## Max. :0.13130 Max. :0.23390 Max. :0.30590 Max. :0.42640
## V5 V6 V7 V8
## Min. :0.0080 Min. :0.0201 Min. :0.0033 Min. :0.0055
## 1st Qu.:0.0397 1st Qu.:0.0696 1st Qu.:0.0742 1st Qu.:0.0941
## Median :0.0652 Median :0.0924 Median :0.1178 Median :0.1134
## Mean :0.0884 Mean :0.1096 Mean :0.1264 Mean :0.1358
## 3rd Qu.:0.1158 3rd Qu.:0.1589 3rd Qu.:0.1683 3rd Qu.:0.1601
## Max. :0.4010 Max. :0.2587 Max. :0.3322 Max. :0.4590
## V9 V10 V11 V12
## Min. :0.0075 Min. :0.0279 Min. :0.0575 Min. :0.0259
## 1st Qu.:0.1063 1st Qu.:0.1370 1st Qu.:0.1532 1st Qu.:0.1741
## Median :0.1523 Median :0.2028 Median :0.2295 Median :0.2497
## Mean :0.1701 Mean :0.2066 Mean :0.2333 Mean :0.2552
## 3rd Qu.:0.2083 3rd Qu.:0.2571 3rd Qu.:0.2931 3rd Qu.:0.3134
## Max. :0.5526 Max. :0.5966 Max. :0.5304 Max. :0.6552
## V13 V14 V15 V16
## Min. :0.0513 Min. :0.0336 Min. :0.0660 Min. :0.0742
## 1st Qu.:0.1874 1st Qu.:0.1943 1st Qu.:0.1840 1st Qu.:0.2285
## Median :0.2508 Median :0.2917 Median :0.3024 Median :0.3323
## Mean :0.2757 Mean :0.2950 Mean :0.3408 Mean :0.4100
## 3rd Qu.:0.3206 3rd Qu.:0.3383 3rd Qu.:0.4533 3rd Qu.:0.5343
## Max. :0.6919 Max. :0.7797 Max. :0.7464 Max. :0.9751
## V17 V18 V19 V20
## Min. :0.0699 Min. :0.0837 Min. :0.1151 Min. :0.0656
## 1st Qu.:0.2177 1st Qu.:0.2838 1st Qu.:0.3024 1st Qu.:0.3915
## Median :0.3943 Median :0.3797 Median :0.5036 Median :0.6127
## Mean :0.4438 Mean :0.4682 Mean :0.5194 Mean :0.5873
## 3rd Qu.:0.6441 3rd Qu.:0.6921 3rd Qu.:0.7708 3rd Qu.:0.8627
## Max. :1.0000 Max. :1.0000 Max. :0.9832 Max. :0.9634
## V21 V22 V23 V24
## Min. :0.1354 Min. :0.1127 Min. :0.1668 Min. :0.1611
## 1st Qu.:0.4539 1st Qu.:0.4789 1st Qu.:0.4645 1st Qu.:0.5410
## Median :0.7209 Median :0.7676 Median :0.7609 Median :0.7605
## Mean :0.6491 Mean :0.6703 Mean :0.6814 Mean :0.6936
## 3rd Qu.:0.8646 3rd Qu.:0.8318 3rd Qu.:0.8449 3rd Qu.:0.8760
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## V25 V26 V27 V28
## Min. :0.1934 Min. :0.0921 Min. :0.0481 Min. :0.0832
## 1st Qu.:0.5268 1st Qu.:0.5423 1st Qu.:0.5353 1st Qu.:0.5897
## Median :0.7115 Median :0.7867 Median :0.7750 Median :0.7325
## Mean :0.6967 Mean :0.7206 Mean :0.7208 Mean :0.6881
## 3rd Qu.:0.9066 3rd Qu.:0.9481 3rd Qu.:0.9673 3rd Qu.:0.8664
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## V29 V30 V31 V32
## Min. :0.1040 Min. :0.0823 Min. :0.0482 Min. :0.0994
## 1st Qu.:0.4421 1st Qu.:0.3822 1st Qu.:0.3264 1st Qu.:0.2947
## Median :0.5807 Median :0.4301 Median :0.4302 Median :0.4444
## Mean :0.5983 Mean :0.4997 Mean :0.4369 Mean :0.4092
## 3rd Qu.:0.8184 3rd Qu.:0.6616 3rd Qu.:0.5524 3rd Qu.:0.5141
## Max. :1.0000 Max. :0.8660 Max. :0.8787 Max. :0.9108
## V33 V34 V35 V36
## Min. :0.0507 Min. :0.0431 Min. :0.0619 Min. :0.0271
## 1st Qu.:0.2277 1st Qu.:0.2017 1st Qu.:0.1641 1st Qu.:0.1519
## Median :0.3844 Median :0.3095 Median :0.2896 Median :0.3649
## Mean :0.3820 Mean :0.3669 Mean :0.3722 Mean :0.3750
## 3rd Qu.:0.5121 3rd Qu.:0.5050 3rd Qu.:0.5531 3rd Qu.:0.5163
## Max. :0.8032 Max. :0.8703 Max. :1.0000 Max. :0.9212
## V37 V38 V39 V40
## Min. :0.0535 Min. :0.0411 Min. :0.0477 Min. :0.0202
## 1st Qu.:0.1644 1st Qu.:0.1730 1st Qu.:0.1252 1st Qu.:0.1485
## Median :0.3201 Median :0.3170 Median :0.2916 Median :0.2715
## Mean :0.3481 Mean :0.3249 Mean :0.3009 Mean :0.2964
## 3rd Qu.:0.4801 3rd Qu.:0.4283 3rd Qu.:0.3803 3rd Qu.:0.4022
## Max. :0.9386 Max. :0.9303 Max. :0.9709 Max. :0.9297
## V41 V42 V43 V44
## Min. :0.0360 Min. :0.0300 Min. :0.0550 Min. :0.0375
## 1st Qu.:0.1043 1st Qu.:0.1197 1st Qu.:0.1552 1st Qu.:0.1381
## Median :0.1921 Median :0.1986 Median :0.2195 Median :0.1927
## Mean :0.2549 Mean :0.2521 Mean :0.2344 Mean :0.2114
## 3rd Qu.:0.3510 3rd Qu.:0.3010 3rd Qu.:0.3370 3rd Qu.:0.2804
## Max. :0.8995 Max. :0.7911 Max. :0.5600 Max. :0.5245
## V45 V46 V47 V48
## Min. :0.0335 Min. :0.0203 Min. :0.0179 Min. :0.00410
## 1st Qu.:0.0902 1st Qu.:0.0654 1st Qu.:0.0530 1st Qu.:0.04180
## Median :0.1625 Median :0.1294 Median :0.1013 Median :0.06920
## Mean :0.1848 Mean :0.1505 Mean :0.1131 Mean :0.07832
## 3rd Qu.:0.2078 3rd Qu.:0.1985 3rd Qu.:0.1521 3rd Qu.:0.10470
## Max. :0.6149 Max. :0.5507 Max. :0.4331 Max. :0.29050
## V49 V50 V51 V52
## Min. :0.00730 Min. :0.00060 Min. :0.00190 Min. :0.00250
## 1st Qu.:0.01870 1st Qu.:0.00950 1st Qu.:0.01040 1st Qu.:0.00780
## Median :0.03830 Median :0.01700 Median :0.01540 Median :0.01170
## Mean :0.04304 Mean :0.01909 Mean :0.01655 Mean :0.01304
## 3rd Qu.:0.05490 3rd Qu.:0.02470 3rd Qu.:0.01950 3rd Qu.:0.01680
## Max. :0.19810 Max. :0.07790 Max. :0.04260 Max. :0.03620
## V53 V54 V55 V56
## Min. :0.00190 Min. :0.00130 Min. :0.00110 Min. :0.001300
## 1st Qu.:0.00820 1st Qu.:0.00510 1st Qu.:0.00430 1st Qu.:0.004200
## Median :0.01030 Median :0.00840 Median :0.00720 Median :0.006500
## Mean :0.01089 Mean :0.01139 Mean :0.01022 Mean :0.008363
## 3rd Qu.:0.01290 3rd Qu.:0.01600 3rd Qu.:0.01230 3rd Qu.:0.011900
## Max. :0.02480 Max. :0.03350 Max. :0.03760 Max. :0.027700
## V57 V58 V59 V60
## Min. :0.000900 Min. :0.000600 Min. :0.002300 Min. :0.001600
## 1st Qu.:0.003700 1st Qu.:0.003400 1st Qu.:0.004400 1st Qu.:0.004300
## Median :0.005900 Median :0.005500 Median :0.007300 Median :0.006000
## Mean :0.008051 Mean :0.007039 Mean :0.009388 Mean :0.008454
## 3rd Qu.:0.010600 3rd Qu.:0.010300 3rd Qu.:0.011000 3rd Qu.:0.010200
## Max. :0.024200 Max. :0.022400 Max. :0.036400 Max. :0.043900
## Class
## M:22
## R:19
##
##
##
##
##################################
# Formulating a data type assessment summary
##################################
# The data type assessment is run on the train set.
PDA <- Sonar_Train
# One row per column of PDA: its position, its name and its class.
# - seq_along() stays correct for a zero-column input (1:0 would yield c(1, 0)).
# - class() can return several values (e.g. an ordered factor); they are
#   collapsed into one string so every column contributes exactly one entry.
# The outer parentheses print the summary as well as assigning it.
(PDA.Summary <- data.frame(
  Column.Index = seq_along(PDA),
  Column.Name = names(PDA),
  Column.Type = vapply(
    PDA,
    function(x) paste(class(x), collapse = "/"),
    character(1)
  ),
  row.names = NULL
))
## Column.Index Column.Name Column.Type
## 1 1 V1 numeric
## 2 2 V2 numeric
## 3 3 V3 numeric
## 4 4 V4 numeric
## 5 5 V5 numeric
## 6 6 V6 numeric
## 7 7 V7 numeric
## 8 8 V8 numeric
## 9 9 V9 numeric
## 10 10 V10 numeric
## 11 11 V11 numeric
## 12 12 V12 numeric
## 13 13 V13 numeric
## 14 14 V14 numeric
## 15 15 V15 numeric
## 16 16 V16 numeric
## 17 17 V17 numeric
## 18 18 V18 numeric
## 19 19 V19 numeric
## 20 20 V20 numeric
## 21 21 V21 numeric
## 22 22 V22 numeric
## 23 23 V23 numeric
## 24 24 V24 numeric
## 25 25 V25 numeric
## 26 26 V26 numeric
## 27 27 V27 numeric
## 28 28 V28 numeric
## 29 29 V29 numeric
## 30 30 V30 numeric
## 31 31 V31 numeric
## 32 32 V32 numeric
## 33 33 V33 numeric
## 34 34 V34 numeric
## 35 35 V35 numeric
## 36 36 V36 numeric
## 37 37 V37 numeric
## 38 38 V38 numeric
## 39 39 V39 numeric
## 40 40 V40 numeric
## 41 41 V41 numeric
## 42 42 V42 numeric
## 43 43 V43 numeric
## 44 44 V44 numeric
## 45 45 V45 numeric
## 46 46 V46 numeric
## 47 47 V47 numeric
## 48 48 V48 numeric
## 49 49 V49 numeric
## 50 50 V50 numeric
## 51 51 V51 numeric
## 52 52 V52 numeric
## 53 53 V53 numeric
## 54 54 V54 numeric
## 55 55 V55 numeric
## 56 56 V56 numeric
## 57 57 V57 numeric
## 58 58 V58 numeric
## 59 59 V59 numeric
## 60 60 V60 numeric
## 61 61 Class factor
1.2 Data Quality Assessment
Data quality assessment:
[A] No missing observations noted for any
variable.
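[B] Low variance flagged for 8 variables with First.Second.Mode.Ratio>5. Four of these (V10, V34, V37 and V42) are flagged only because every value is unique: no second mode exists, the second mode count is 0 and the ratio evaluates to Inf, which is an artifact of the ratio rather than evidence of low variance.
[B.1] V10 variable (numeric)
[B.2] V21 variable (numeric)
[B.3] V27 variable (numeric)
[B.4] V28 variable (numeric)
[B.5] V30 variable (numeric)
[B.6] V34 variable (numeric)
[B.7] V37 variable (numeric)
[B.8] V42 variable (numeric)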
[C] No low variance noted for any variable with
Unique.Count.Ratio<0.01.
[D] No high skewness noted for any variable with
Skewness>3 or Skewness<(-3).
##################################
# Loading dataset
##################################
# The data quality assessment is run on the train set.
DQA <- Sonar_Train
##################################
# Formulating an overall data quality assessment summary
##################################
# One row per column of DQA: position, name, class, number of rows,
# number of missing values and the share of non-missing values (3 decimals).
# - seq_along() stays correct for a zero-column input.
# - class() may return several values; they are collapsed into one string.
# - vapply() pins the type of each per-column result.
# The outer parentheses print the summary as well as assigning it.
(DQA.Summary <- data.frame(
  Column.Index = seq_along(DQA),
  Column.Name = names(DQA),
  Column.Type = vapply(
    DQA,
    function(x) paste(class(x), collapse = "/"),
    character(1)
  ),
  # The row count is the same for every column, so it is simply repeated.
  Row.Count = rep(nrow(DQA), ncol(DQA)),
  NA.Count = vapply(DQA, function(x) sum(is.na(x)), integer(1)),
  Fill.Rate = vapply(
    DQA,
    function(x) format(round(sum(!is.na(x)) / nrow(DQA), 3), nsmall = 3),
    character(1)
  ),
  row.names = NULL
))
## Column.Index Column.Name Column.Type Row.Count NA.Count Fill.Rate
## 1 1 V1 numeric 167 0 1.000
## 2 2 V2 numeric 167 0 1.000
## 3 3 V3 numeric 167 0 1.000
## 4 4 V4 numeric 167 0 1.000
## 5 5 V5 numeric 167 0 1.000
## 6 6 V6 numeric 167 0 1.000
## 7 7 V7 numeric 167 0 1.000
## 8 8 V8 numeric 167 0 1.000
## 9 9 V9 numeric 167 0 1.000
## 10 10 V10 numeric 167 0 1.000
## 11 11 V11 numeric 167 0 1.000
## 12 12 V12 numeric 167 0 1.000
## 13 13 V13 numeric 167 0 1.000
## 14 14 V14 numeric 167 0 1.000
## 15 15 V15 numeric 167 0 1.000
## 16 16 V16 numeric 167 0 1.000
## 17 17 V17 numeric 167 0 1.000
## 18 18 V18 numeric 167 0 1.000
## 19 19 V19 numeric 167 0 1.000
## 20 20 V20 numeric 167 0 1.000
## 21 21 V21 numeric 167 0 1.000
## 22 22 V22 numeric 167 0 1.000
## 23 23 V23 numeric 167 0 1.000
## 24 24 V24 numeric 167 0 1.000
## 25 25 V25 numeric 167 0 1.000
## 26 26 V26 numeric 167 0 1.000
## 27 27 V27 numeric 167 0 1.000
## 28 28 V28 numeric 167 0 1.000
## 29 29 V29 numeric 167 0 1.000
## 30 30 V30 numeric 167 0 1.000
## 31 31 V31 numeric 167 0 1.000
## 32 32 V32 numeric 167 0 1.000
## 33 33 V33 numeric 167 0 1.000
## 34 34 V34 numeric 167 0 1.000
## 35 35 V35 numeric 167 0 1.000
## 36 36 V36 numeric 167 0 1.000
## 37 37 V37 numeric 167 0 1.000
## 38 38 V38 numeric 167 0 1.000
## 39 39 V39 numeric 167 0 1.000
## 40 40 V40 numeric 167 0 1.000
## 41 41 V41 numeric 167 0 1.000
## 42 42 V42 numeric 167 0 1.000
## 43 43 V43 numeric 167 0 1.000
## 44 44 V44 numeric 167 0 1.000
## 45 45 V45 numeric 167 0 1.000
## 46 46 V46 numeric 167 0 1.000
## 47 47 V47 numeric 167 0 1.000
## 48 48 V48 numeric 167 0 1.000
## 49 49 V49 numeric 167 0 1.000
## 50 50 V50 numeric 167 0 1.000
## 51 51 V51 numeric 167 0 1.000
## 52 52 V52 numeric 167 0 1.000
## 53 53 V53 numeric 167 0 1.000
## 54 54 V54 numeric 167 0 1.000
## 55 55 V55 numeric 167 0 1.000
## 56 56 V56 numeric 167 0 1.000
## 57 57 V57 numeric 167 0 1.000
## 58 58 V58 numeric 167 0 1.000
## 59 59 V59 numeric 167 0 1.000
## 60 60 V60 numeric 167 0 1.000
## 61 61 Class factor 167 0 1.000
##################################
# Listing all predictors
##################################
# Every column except the response (Class) is a predictor.
# drop = FALSE keeps the result a data frame even if only one predictor
# remains; without it the selection would collapse to a bare vector.
DQA.Predictors <- DQA[, !names(DQA) %in% c("Class"), drop = FALSE]
##################################
# Listing all numeric predictors
##################################
# Keep only the numeric predictor columns.
# - vapply() always yields a logical vector (sapply() yields a list when
#   there are no columns, which cannot be used as a column index).
# - drop = FALSE keeps a data frame even when exactly one column is numeric;
#   otherwise the result is a bare vector without names and the check below
#   would report that there are no numeric predictors.
DQA.Predictors.Numeric <- DQA.Predictors[,
  vapply(DQA.Predictors, is.numeric, logical(1)),
  drop = FALSE
]
# Report how many numeric predictors were found.
if (ncol(DQA.Predictors.Numeric) > 0) {
  print(paste0(
    "There are ",
    ncol(DQA.Predictors.Numeric),
    " numeric predictor variable(s)."
  ))
} else {
  print("There are no numeric predictor variables.")
}
## [1] "There are 60 numeric predictor variable(s)."
##################################
# Listing all factor predictors
##################################
# Keep only the factor predictor columns.
# - vapply() always yields a logical vector (sapply() yields a list when
#   there are no columns, which cannot be used as a column index).
# - drop = FALSE keeps a data frame even when exactly one column is a factor;
#   otherwise the result is a bare factor without names and the check below
#   would report that there are no factor predictors.
DQA.Predictors.Factor <- DQA.Predictors[,
  vapply(DQA.Predictors, is.factor, logical(1)),
  drop = FALSE
]
# Report how many factor predictors were found.
if (ncol(DQA.Predictors.Factor) > 0) {
  print(paste0(
    "There are ",
    ncol(DQA.Predictors.Factor),
    " factor predictor variable(s)."
  ))
} else {
  print("There are no factor predictor variables.")
}
## [1] "There are no factor predictor variables."
##################################
# Formulating a data quality assessment summary for factor predictors
##################################
if (length(names(DQA.Predictors.Factor))>0) {
##################################
# Formulating a function to determine the first mode
##################################
# Return the most frequent non-missing value(s) of x.
# All values tied for the highest frequency are returned, in order of first
# appearance; callers take element [1] when a single value is needed.
FirstModes <- function(x) {
  # Distinct non-missing values.
  candidates <- unique(na.omit(x))
  # Frequency of each candidate; missing entries of x match nothing and are
  # therefore not counted.
  frequencies <- tabulate(match(x, candidates))
  top_frequency <- max(frequencies)
  candidates[frequencies == top_frequency]
}
##################################
# Formulating a function to determine the second mode
##################################
# Return the second mode(s) of a factor: the most frequent value(s) once
# every first mode (all values tied for the highest frequency) is set aside.
# Missing values are removed up front, as in the numeric variant of this
# function and as in the mode counts of the summary (which use na.omit), so
# an NA can never be picked as the second mode and hide a real level.
# Returns the sentinel "x" when no value is left after removing the first
# mode(s); otherwise returns all values tied for second place.
SecondModes <- function(x) {
  observed <- na.omit(x)
  distinct_values <- unique(observed)
  frequencies <- tabulate(match(observed, distinct_values))
  first_modes <- distinct_values[frequencies == max(frequencies)]
  # Everything that is not a first mode competes for second place.
  remainder <- observed[!(observed %in% first_modes)]
  if (length(remainder) == 0) {
    return("x")
  }
  remainder_values <- unique(remainder)
  remainder_counts <- tabulate(match(remainder, remainder_values))
  remainder_values[remainder_counts == max(remainder_counts)]
}
# One row per factor predictor: class, number of distinct values, first and
# second mode with their counts, and two low-variance indicators
# (Unique.Count.Ratio and First.Second.Mode.Ratio, formatted to 3 decimals).
# - class() returns two values for an ordered factor; they are collapsed
#   into one string so each column contributes exactly one entry.
# - vapply() pins the type of every per-column result.
# - local() keeps the helper functions out of the workspace.
# NOTE: when a column has no second mode, SecondModes() returns a sentinel,
# the second mode count is 0 and the ratio becomes Inf.
# The outer parentheses print the summary as well as assigning it.
(DQA.Predictors.Factor.Summary <- local({
  predictors <- DQA.Predictors.Factor
  n_rows <- nrow(predictors)

  fmt3 <- function(value) format(round(value, 3), nsmall = 3)
  first_mode <- function(x) FirstModes(x)[1]
  second_mode <- function(x) SecondModes(x)[1]
  # Number of non-missing entries equal to the given value.
  count_of <- function(x, value) sum(na.omit(x) == value)
  each_chr <- function(f) vapply(predictors, f, character(1), USE.NAMES = FALSE)
  each_int <- function(f) vapply(predictors, f, integer(1), USE.NAMES = FALSE)

  data.frame(
    Column.Name = names(predictors),
    Column.Type = each_chr(function(x) paste(class(x), collapse = "/")),
    Unique.Count = each_int(function(x) length(unique(x))),
    First.Mode.Value = each_chr(function(x) as.character(first_mode(x))),
    Second.Mode.Value = each_chr(function(x) as.character(second_mode(x))),
    First.Mode.Count = each_int(function(x) count_of(x, first_mode(x))),
    Second.Mode.Count = each_int(function(x) count_of(x, second_mode(x))),
    Unique.Count.Ratio = each_chr(function(x) fmt3(length(unique(x)) / n_rows)),
    First.Second.Mode.Ratio = each_chr(function(x) {
      fmt3(count_of(x, first_mode(x)) / count_of(x, second_mode(x)))
    }),
    row.names = NULL
  )
}))
}
##################################
# Formulating a data quality assessment summary for numeric predictors
##################################
if (length(names(DQA.Predictors.Numeric))>0) {
##################################
# Formulating a function to determine the first mode
##################################
# Return the most frequent non-missing value(s) of x.
# All values tied for the highest frequency are returned, in order of first
# appearance; callers take element [1] when a single value is needed.
FirstModes <- function(x) {
  # Distinct non-missing values.
  distinct_values <- unique(na.omit(x))
  # How often each distinct value occurs; missing entries of x match nothing
  # and are therefore not counted.
  occurrences <- tabulate(match(x, distinct_values))
  is_most_frequent <- occurrences == max(occurrences)
  distinct_values[is_most_frequent]
}
##################################
# Formulating a function to determine the second mode
##################################
# Return the second mode(s) of a numeric vector: the most frequent value(s)
# once every first mode (all values tied for the highest frequency) is set
# aside. Missing values are ignored.
# Returns the sentinel 0.00001 when no value is left after removing the
# first mode(s) - for example when every value is unique, since all values
# then tie as first modes. Otherwise returns all values tied for second place.
# An explicit if/else replaces ifelse() with return() in both branches,
# which is meant for vectors; the results are unchanged.
SecondModes <- function(x) {
  observed <- na.omit(x)
  distinct_values <- unique(observed)
  frequencies <- tabulate(match(observed, distinct_values))
  first_modes <- distinct_values[frequencies == max(frequencies)]
  # Everything that is not a first mode competes for second place.
  remainder <- observed[!(observed %in% first_modes)]
  if (length(remainder) == 0) {
    return(0.00001)
  }
  remainder_values <- unique(remainder)
  remainder_counts <- tabulate(match(remainder, remainder_values))
  remainder_values[remainder_counts == max(remainder_counts)]
}
# One row per numeric predictor: class, number of distinct values, first and
# second mode with their counts, two low-variance indicators
# (Unique.Count.Ratio and First.Second.Mode.Ratio) and descriptive
# statistics. All non-count figures are formatted to 3 decimals as text.
# - vapply() pins the type of every per-column result.
# - class() may return several values; they are collapsed into one string.
# - local() keeps the helper functions out of the workspace.
# NOTE: when a column has no second mode (e.g. every value is unique),
# SecondModes() returns a sentinel, the second mode count is 0 and
# First.Second.Mode.Ratio becomes Inf; such a column is not low-variance.
# The outer parentheses print the summary as well as assigning it.
(DQA.Predictors.Numeric.Summary <- local({
  predictors <- DQA.Predictors.Numeric
  n_rows <- nrow(predictors)

  fmt3 <- function(value) format(round(value, 3), nsmall = 3)
  first_mode <- function(x) FirstModes(x)[1]
  second_mode <- function(x) SecondModes(x)[1]
  # Number of non-missing entries equal to the given value.
  count_of <- function(x, value) sum(na.omit(x) == value)
  each_chr <- function(f) vapply(predictors, f, character(1), USE.NAMES = FALSE)
  each_int <- function(f) vapply(predictors, f, integer(1), USE.NAMES = FALSE)

  data.frame(
    Column.Name = names(predictors),
    Column.Type = each_chr(function(x) paste(class(x), collapse = "/")),
    Unique.Count = each_int(function(x) length(unique(x))),
    Unique.Count.Ratio = each_chr(function(x) fmt3(length(unique(x)) / n_rows)),
    First.Mode.Value = each_chr(function(x) fmt3(first_mode(x))),
    Second.Mode.Value = each_chr(function(x) fmt3(second_mode(x))),
    First.Mode.Count = each_int(function(x) count_of(x, first_mode(x))),
    Second.Mode.Count = each_int(function(x) count_of(x, second_mode(x))),
    First.Second.Mode.Ratio = each_chr(function(x) {
      fmt3(count_of(x, first_mode(x)) / count_of(x, second_mode(x)))
    }),
    Minimum = each_chr(function(x) fmt3(min(x, na.rm = TRUE))),
    Mean = each_chr(function(x) fmt3(mean(x, na.rm = TRUE))),
    Median = each_chr(function(x) fmt3(median(x, na.rm = TRUE))),
    Maximum = each_chr(function(x) fmt3(max(x, na.rm = TRUE))),
    Skewness = each_chr(function(x) fmt3(skewness(x, na.rm = TRUE))),
    Kurtosis = each_chr(function(x) fmt3(kurtosis(x, na.rm = TRUE))),
    Percentile25th = each_chr(function(x) {
      fmt3(quantile(x, probs = 0.25, na.rm = TRUE, names = FALSE))
    }),
    Percentile75th = each_chr(function(x) {
      fmt3(quantile(x, probs = 0.75, na.rm = TRUE, names = FALSE))
    }),
    row.names = NULL
  )
}))
}
## Column.Name Column.Type Unique.Count Unique.Count.Ratio First.Mode.Value
## 1 V1 numeric 149 0.892 0.020
## 2 V2 numeric 154 0.922 0.045
## 3 V3 numeric 153 0.916 0.028
## 4 V4 numeric 150 0.898 0.011
## 5 V5 numeric 156 0.934 0.065
## 6 V6 numeric 156 0.934 0.028
## 7 V7 numeric 158 0.946 0.077
## 8 V8 numeric 161 0.964 0.183
## 9 V9 numeric 165 0.988 0.210
## 10 V10 numeric 167 1.000 0.619
## 11 V11 numeric 163 0.976 0.248
## 12 V12 numeric 165 0.988 0.374
## 13 V13 numeric 160 0.958 0.266
## 14 V14 numeric 162 0.970 0.280
## 15 V15 numeric 164 0.982 0.187
## 16 V16 numeric 166 0.994 0.203
## 17 V17 numeric 163 0.976 0.216
## 18 V18 numeric 164 0.982 1.000
## 19 V19 numeric 166 0.994 0.406
## 20 V20 numeric 162 0.970 0.334
## 21 V21 numeric 161 0.964 1.000
## 22 V22 numeric 164 0.982 1.000
## 23 V23 numeric 161 0.964 1.000
## 24 V24 numeric 161 0.964 1.000
## 25 V25 numeric 158 0.946 1.000
## 26 V26 numeric 158 0.946 1.000
## 27 V27 numeric 155 0.928 1.000
## 28 V28 numeric 154 0.922 1.000
## 29 V29 numeric 159 0.952 1.000
## 30 V30 numeric 162 0.970 1.000
## 31 V31 numeric 166 0.994 0.386
## 32 V32 numeric 165 0.988 0.290
## 33 V33 numeric 165 0.988 0.525
## 34 V34 numeric 167 1.000 0.271
## 35 V35 numeric 165 0.988 1.000
## 36 V36 numeric 165 0.988 1.000
## 37 V37 numeric 167 1.000 0.612
## 38 V38 numeric 165 0.988 0.315
## 39 V39 numeric 163 0.976 0.168
## 40 V40 numeric 166 0.994 0.443
## 41 V41 numeric 165 0.988 0.305
## 42 V42 numeric 167 1.000 0.259
## 43 V43 numeric 165 0.988 0.212
## 44 V44 numeric 156 0.934 0.320
## 45 V45 numeric 164 0.982 0.119
## 46 V46 numeric 162 0.970 0.143
## 47 V47 numeric 164 0.982 0.079
## 48 V48 numeric 166 0.994 0.075
## 49 V49 numeric 157 0.940 0.032
## 50 V50 numeric 135 0.808 0.018
## 51 V51 numeric 138 0.826 0.015
## 52 V52 numeric 124 0.743 0.009
## 53 V53 numeric 115 0.689 0.004
## 54 V54 numeric 112 0.671 0.014
## 55 V55 numeric 105 0.629 0.004
## 56 V56 numeric 106 0.635 0.007
## 57 V57 numeric 107 0.641 0.005
## 58 V58 numeric 107 0.641 0.004
## 59 V59 numeric 103 0.617 0.007
## 60 V60 numeric 94 0.563 0.005
## Second.Mode.Value First.Mode.Count Second.Mode.Count First.Second.Mode.Ratio
## 1 0.010 3 2 1.500
## 2 0.058 2 1 2.000
## 3 0.110 2 1 2.000
## 4 0.039 3 2 1.500
## 5 0.097 2 1 2.000
## 6 0.228 2 1 2.000
## 7 0.099 3 2 1.500
## 8 0.377 2 1 2.000
## 9 0.560 2 1 2.000
## 10 0.000 1 0 Inf
## 11 0.633 2 1 2.000
## 12 0.706 2 1 2.000
## 13 0.554 2 1 2.000
## 14 0.532 2 1 2.000
## 15 0.648 2 1 2.000
## 16 0.693 2 1 2.000
## 17 0.676 2 1 2.000
## 18 0.755 2 1 2.000
## 19 0.893 2 1 2.000
## 20 0.862 2 1 2.000
## 21 0.797 7 1 7.000
## 22 0.674 4 1 4.000
## 23 0.143 6 2 3.000
## 24 0.943 6 2 3.000
## 25 0.695 6 2 3.000
## 26 0.754 6 2 3.000
## 27 0.684 11 2 5.500
## 28 0.440 11 2 5.500
## 29 0.904 6 3 2.000
## 30 0.851 6 1 6.000
## 31 0.851 2 1 2.000
## 32 0.504 2 1 2.000
## 33 0.186 2 1 2.000
## 34 0.000 1 0 Inf
## 35 0.423 2 1 2.000
## 36 0.304 2 1 2.000
## 37 0.000 1 0 Inf
## 38 0.676 2 1 2.000
## 39 0.538 2 1 2.000
## 40 0.472 2 1 2.000
## 41 0.465 2 1 2.000
## 42 0.000 1 0 Inf
## 43 0.213 2 1 2.000
## 44 0.222 2 1 2.000
## 45 0.211 2 1 2.000
## 46 0.018 2 1 2.000
## 47 0.135 2 1 2.000
## 48 0.074 2 1 2.000
## 49 0.078 3 2 1.500
## 50 0.022 5 3 1.667
## 51 0.003 3 2 1.500
## 52 0.003 4 3 1.333
## 53 0.005 3 2 1.500
## 54 0.002 4 3 1.333
## 55 0.008 5 4 1.250
## 56 0.005 4 3 1.333
## 57 0.004 6 4 1.500
## 58 0.004 5 4 1.250
## 59 0.005 5 4 1.250
## 60 0.003 6 5 1.200
## Minimum Mean Median Maximum Skewness Kurtosis Percentile25th Percentile75th
## 1 0.002 0.029 0.023 0.137 1.945 7.608 0.014 0.036
## 2 0.002 0.038 0.031 0.163 1.604 5.752 0.016 0.048
## 3 0.002 0.043 0.035 0.166 1.467 5.425 0.018 0.060
## 4 0.006 0.052 0.044 0.173 1.259 4.317 0.024 0.066
## 5 0.007 0.072 0.061 0.256 1.167 4.555 0.036 0.099
## 6 0.010 0.103 0.092 0.382 1.375 6.358 0.067 0.131
## 7 0.013 0.121 0.105 0.373 1.008 4.620 0.084 0.150
## 8 0.006 0.135 0.112 0.457 1.400 5.331 0.078 0.172
## 9 0.012 0.180 0.152 0.683 1.611 6.269 0.096 0.239
## 10 0.011 0.209 0.180 0.711 1.278 4.581 0.107 0.274
## 11 0.029 0.237 0.221 0.734 0.986 4.104 0.126 0.308
## 12 0.024 0.249 0.248 0.706 0.584 2.867 0.128 0.334
## 13 0.018 0.273 0.266 0.713 0.687 3.213 0.163 0.358
## 14 0.027 0.297 0.279 0.997 1.028 4.457 0.166 0.395
## 15 0.003 0.315 0.262 1.000 0.811 3.092 0.155 0.452
## 16 0.016 0.371 0.293 0.999 0.679 2.388 0.184 0.536
## 17 0.035 0.409 0.304 1.000 0.653 2.145 0.202 0.660
## 18 0.038 0.448 0.366 1.000 0.541 1.961 0.234 0.676
## 19 0.049 0.501 0.431 1.000 0.297 1.842 0.299 0.731
## 20 0.074 0.557 0.522 1.000 -0.048 1.764 0.335 0.799
## 21 0.051 0.599 0.591 1.000 -0.206 1.955 0.392 0.815
## 22 0.022 0.613 0.646 1.000 -0.325 2.010 0.399 0.832
## 23 0.056 0.639 0.681 1.000 -0.571 2.356 0.448 0.852
## 24 0.024 0.668 0.695 1.000 -0.684 2.746 0.541 0.869
## 25 0.024 0.670 0.722 1.000 -0.815 2.860 0.525 0.862
## 26 0.154 0.695 0.753 1.000 -0.690 2.496 0.547 0.880
## 27 0.087 0.698 0.721 1.000 -0.562 2.352 0.516 0.900
## 28 0.028 0.695 0.728 1.000 -0.544 2.358 0.510 0.906
## 29 0.014 0.653 0.690 1.000 -0.503 2.401 0.474 0.862
## 30 0.061 0.601 0.621 1.000 -0.182 2.404 0.445 0.754
## 31 0.100 0.521 0.497 0.966 0.218 2.146 0.349 0.680
## 32 0.040 0.446 0.424 0.931 0.283 2.161 0.281 0.622
## 33 0.048 0.426 0.392 1.000 0.406 2.421 0.269 0.577
## 34 0.021 0.412 0.378 0.965 0.489 2.345 0.218 0.605
## 35 0.022 0.398 0.333 1.000 0.558 2.265 0.184 0.611
## 36 0.008 0.387 0.317 1.000 0.659 2.385 0.155 0.566
## 37 0.035 0.368 0.304 0.950 0.648 2.311 0.159 0.540
## 38 0.038 0.343 0.310 1.000 0.979 3.362 0.178 0.441
## 39 0.037 0.332 0.283 0.986 0.858 3.179 0.179 0.459
## 40 0.012 0.315 0.279 0.917 0.773 3.539 0.192 0.425
## 41 0.044 0.298 0.265 0.775 0.740 3.037 0.169 0.402
## 42 0.006 0.285 0.259 0.825 0.814 3.530 0.172 0.385
## 43 0.000 0.250 0.228 0.773 0.921 3.992 0.155 0.322
## 44 0.000 0.215 0.175 0.776 1.271 4.382 0.125 0.265
## 45 0.000 0.200 0.147 0.703 1.314 3.825 0.097 0.236
## 46 0.000 0.163 0.120 0.729 1.674 5.816 0.069 0.201
## 47 0.000 0.125 0.103 0.552 1.770 7.024 0.066 0.155
## 48 0.000 0.095 0.081 0.334 1.224 4.510 0.046 0.122
## 49 0.000 0.054 0.045 0.179 1.065 3.769 0.029 0.072
## 50 0.000 0.021 0.018 0.082 1.681 6.763 0.012 0.025
## 51 0.000 0.016 0.014 0.100 2.850 17.472 0.008 0.021
## 52 0.001 0.014 0.011 0.071 2.156 10.103 0.007 0.017
## 53 0.000 0.011 0.008 0.039 1.067 3.957 0.005 0.015
## 54 0.001 0.011 0.010 0.035 1.173 4.193 0.005 0.014
## 55 0.001 0.009 0.008 0.045 1.857 8.514 0.004 0.012
## 56 0.000 0.008 0.007 0.039 1.960 9.547 0.004 0.010
## 57 0.000 0.008 0.006 0.036 1.784 7.243 0.004 0.010
## 58 0.000 0.008 0.006 0.044 2.125 9.392 0.004 0.010
## 59 0.000 0.008 0.006 0.029 1.456 5.083 0.003 0.010
## 60 0.001 0.006 0.005 0.022 1.290 4.732 0.003 0.008
##################################
# Identifying potential data quality issues
##################################
##################################
# Checking for missing observations
##################################
# Report whether any variable in the data quality summary carries missing
# values; when some do, announce how many and show their summary rows.
if (nrow(DQA.Summary[DQA.Summary$NA.Count > 0, ]) == 0) {
  print("No missing observations noted.")
} else {
  print(paste0(
    "Missing observations noted for ",
    nrow(DQA.Summary[DQA.Summary$NA.Count > 0, ]),
    " variable(s) with NA.Count>0 and Fill.Rate<1.0."
  ))
  # Last expression of the branch, so the flagged rows are auto-printed.
  DQA.Summary[DQA.Summary$NA.Count > 0, ]
}
## [1] "No missing observations noted."
##################################
# Checking for zero or near-zero variance predictors
##################################
# Flag factor predictors whose most frequent level outnumbers the second most
# frequent one by more than 5 to 1. The ratio column is stored as formatted
# text, hence the as.numeric(as.character()) round trip.
if (length(names(DQA.Predictors.Factor)) == 0) {
  print("No factor predictors noted.")
} else if (nrow(DQA.Predictors.Factor.Summary[
  as.numeric(as.character(DQA.Predictors.Factor.Summary$First.Second.Mode.Ratio)) > 5,
]) == 0) {
  print("No low variance factor predictors due to high first-second mode ratio noted.")
} else {
  print(paste0(
    "Low variance observed for ",
    nrow(DQA.Predictors.Factor.Summary[
      as.numeric(as.character(DQA.Predictors.Factor.Summary$First.Second.Mode.Ratio)) > 5,
    ]),
    " factor variable(s) with First.Second.Mode.Ratio>5."
  ))
  # Last expression of the branch, so the flagged rows are auto-printed.
  DQA.Predictors.Factor.Summary[
    as.numeric(as.character(DQA.Predictors.Factor.Summary$First.Second.Mode.Ratio)) > 5,
  ]
}
## [1] "No factor predictors noted."
# Flag numeric predictors whose first-to-second mode count ratio exceeds 5.
# The ratio column is stored as formatted text, hence the
# as.numeric(as.character()) round trip.
# NOTE(review): a ratio of Inf (second mode count of 0) also satisfies ">5",
# so columns in which every value is unique are reported here as well --
# confirm that this is intended.
if (length(names(DQA.Predictors.Numeric)) == 0) {
  print("No numeric predictors noted.")
} else if (nrow(DQA.Predictors.Numeric.Summary[
  as.numeric(as.character(DQA.Predictors.Numeric.Summary$First.Second.Mode.Ratio)) > 5,
]) == 0) {
  print("No low variance numeric predictors due to high first-second mode ratio noted.")
} else {
  print(paste0(
    "Low variance observed for ",
    nrow(DQA.Predictors.Numeric.Summary[
      as.numeric(as.character(DQA.Predictors.Numeric.Summary$First.Second.Mode.Ratio)) > 5,
    ]),
    " numeric variable(s) with First.Second.Mode.Ratio>5."
  ))
  # Last expression of the branch, so the flagged rows are auto-printed.
  DQA.Predictors.Numeric.Summary[
    as.numeric(as.character(DQA.Predictors.Numeric.Summary$First.Second.Mode.Ratio)) > 5,
  ]
}
## [1] "Low variance observed for 8 numeric variable(s) with First.Second.Mode.Ratio>5."
## Column.Name Column.Type Unique.Count Unique.Count.Ratio First.Mode.Value
## 10 V10 numeric 167 1.000 0.619
## 21 V21 numeric 161 0.964 1.000
## 27 V27 numeric 155 0.928 1.000
## 28 V28 numeric 154 0.922 1.000
## 30 V30 numeric 162 0.970 1.000
## 34 V34 numeric 167 1.000 0.271
## 37 V37 numeric 167 1.000 0.612
## 42 V42 numeric 167 1.000 0.259
## Second.Mode.Value First.Mode.Count Second.Mode.Count First.Second.Mode.Ratio
## 10 0.000 1 0 Inf
## 21 0.797 7 1 7.000
## 27 0.684 11 2 5.500
## 28 0.440 11 2 5.500
## 30 0.851 6 1 6.000
## 34 0.000 1 0 Inf
## 37 0.000 1 0 Inf
## 42 0.000 1 0 Inf
## Minimum Mean Median Maximum Skewness Kurtosis Percentile25th Percentile75th
## 10 0.011 0.209 0.180 0.711 1.278 4.581 0.107 0.274
## 21 0.051 0.599 0.591 1.000 -0.206 1.955 0.392 0.815
## 27 0.087 0.698 0.721 1.000 -0.562 2.352 0.516 0.900
## 28 0.028 0.695 0.728 1.000 -0.544 2.358 0.510 0.906
## 30 0.061 0.601 0.621 1.000 -0.182 2.404 0.445 0.754
## 34 0.021 0.412 0.378 0.965 0.489 2.345 0.218 0.605
## 37 0.035 0.368 0.304 0.950 0.648 2.311 0.159 0.540
## 42 0.006 0.285 0.259 0.825 0.814 3.530 0.172 0.385
# Flag numeric predictors for which fewer than 1% of the observations are
# distinct values (Unique.Count.Ratio below 0.01).
if (length(names(DQA.Predictors.Numeric)) == 0) {
  print("No numeric predictors noted.")
} else if (nrow(DQA.Predictors.Numeric.Summary[
  as.numeric(as.character(DQA.Predictors.Numeric.Summary$Unique.Count.Ratio)) < 0.01,
]) == 0) {
  print("No low variance numeric predictors due to low unique count ratio noted.")
} else {
  print(paste0(
    "Low variance observed for ",
    nrow(DQA.Predictors.Numeric.Summary[
      as.numeric(as.character(DQA.Predictors.Numeric.Summary$Unique.Count.Ratio)) < 0.01,
    ]),
    " numeric variable(s) with Unique.Count.Ratio<0.01."
  ))
  # Last expression of the branch, so the flagged rows are auto-printed.
  DQA.Predictors.Numeric.Summary[
    as.numeric(as.character(DQA.Predictors.Numeric.Summary$Unique.Count.Ratio)) < 0.01,
  ]
}
## [1] "No low variance numeric predictors due to low unique count ratio noted."
##################################
# Checking for skewed predictors
##################################
# Flag numeric predictors whose skewness lies outside [-3, 3]. The two-sided
# test is expressed through the absolute value of the skewness.
if (length(names(DQA.Predictors.Numeric)) == 0) {
  print("No numeric predictors noted.")
} else if (nrow(DQA.Predictors.Numeric.Summary[
  abs(as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness))) > 3,
]) == 0) {
  print("No skewed numeric predictors noted.")
} else {
  print(paste0(
    "High skewness observed for ",
    nrow(DQA.Predictors.Numeric.Summary[
      abs(as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness))) > 3,
    ]),
    " numeric variable(s) with Skewness>3 or Skewness<(-3)."
  ))
  # Last expression of the branch, so the flagged rows are auto-printed.
  DQA.Predictors.Numeric.Summary[
    abs(as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness))) > 3,
  ]
}
## [1] "No skewed numeric predictors noted."
1.3 Data Preprocessing
1.3.1 Outlier
Outlier data assessment:
[A] Outliers were noted for 39 numeric variables. Each numeric
variable was visualized through a boxplot, with observations classified as
suspected outliers using the IQR criterion. Under the IQR criterion,
all observations above the (75th percentile + 1.5 x IQR) or below the
(25th percentile - 1.5 x IQR) are suspected outliers, where IQR is the
difference between the third quartile (75th percentile) and the first
quartile (25th percentile). Outlier treatment for numerical stability
remains optional depending on potential model requirements for the
subsequent steps.
[A.1] V1
variable (13 outliers detected)
[A.2] V2
variable (11 outliers detected)
[A.3] V3
variable (7 outliers detected)
[A.4] V4
variable (8 outliers detected)
[A.5] V5
variable (3 outliers detected)
[A.6] V6
variable (7 outliers detected)
[A.7] V7
variable (5 outliers detected)
[A.8] V8
variable (8 outliers detected)
[A.9] V9
variable (7 outliers detected)
[A.10] V10
variable (8 outliers detected)
[A.11] V11
variable (5 outliers detected)
[A.12] V12
variable (1 outlier detected)
[A.13] V13
variable (2 outliers detected)
[A.14] V14
variable (3 outliers detected)
[A.15] V15
variable (2 outliers detected)
[A.16] V24
variable (2 outliers detected)
[A.17] V38
variable (7 outliers detected)
[A.18] V39
variable (3 outliers detected)
[A.19] V40
variable (3 outliers detected)
[A.20] V41
variable (2 outliers detected)
[A.21] V42
variable (4 outliers detected)
[A.22] V43
variable (5 outliers detected)
[A.23] V44
variable (12 outliers detected)
[A.24] V45
variable (19 outliers detected)
[A.25] V46
variable (14 outliers detected)
[A.26] V47
variable (11 outliers detected)
[A.27] V48
variable (8 outliers detected)
[A.28] V49
variable (5 outliers detected)
[A.29] V50
variable (12 outliers detected)
[A.30] V51
variable (4 outliers detected)
[A.31] V52
variable (11 outliers detected)
[A.32] V53
variable (3 outliers detected)
[A.33] V54
variable (9 outliers detected)
[A.34] V55
variable (5 outliers detected)
[A.35] V56
variable (7 outliers detected)
[A.36] V57
variable (7 outliers detected)
[A.37] V58
variable (8 outliers detected)
[A.38] V59
variable (12 outliers detected)
[A.39] V60
variable (5 outliers detected)
##################################
# Loading dataset
##################################
DPA <- Sonar_Train
##################################
# Listing all predictors
##################################
# drop = FALSE keeps a data frame even if only one predictor column remains.
DPA.Predictors <- DPA[, !names(DPA) %in% c("Class"), drop = FALSE]
##################################
# Listing all numeric predictors
##################################
# vapply() guarantees a logical mask; drop = FALSE keeps a data frame even
# when exactly one numeric predictor exists, so ncol() below stays valid.
DPA.Predictors.Numeric <- DPA.Predictors[,
  vapply(DPA.Predictors, is.numeric, logical(1)),
  drop = FALSE
]
##################################
# Identifying outliers for the numeric predictors
##################################
# For each numeric predictor, count the points that boxplot.stats() reports
# in $out (those lying beyond the boxplot whiskers) and draw a horizontal
# boxplot annotated with that count.
# The count vector is preallocated instead of grown, and seq_len() makes the
# loop a no-op when there are no numeric predictors.
OutlierCountList <- integer(ncol(DPA.Predictors.Numeric))
for (i in seq_len(ncol(DPA.Predictors.Numeric))) {
  Outliers <- boxplot.stats(DPA.Predictors.Numeric[, i])$out
  OutlierCount <- length(Outliers)
  OutlierCountList[i] <- OutlierCount
  boxplot(DPA.Predictors.Numeric[, i],
    ylab = names(DPA.Predictors.Numeric)[i],
    main = names(DPA.Predictors.Numeric)[i],
    horizontal = TRUE
  )
  mtext(paste0(OutlierCount, " Outlier(s) Detected"))
}




























































# Tabulate the outlier count per numeric predictor and report how many
# predictors have at least one suspected outlier.
OutlierCountSummary <- data.frame(
  NumericPredictors = names(DPA.Predictors.Numeric),
  OutlierCount = as.numeric(OutlierCountList)
)
NumericPredictorWithOutlierCount <- sum(OutlierCountSummary$OutlierCount > 0)
print(paste0(
  NumericPredictorWithOutlierCount,
  " numeric variable(s) were noted with outlier(s)."
))
## [1] "39 numeric variable(s) were noted with outlier(s)."
##################################
# Gathering descriptive statistics
##################################
# Summarize the numeric predictors with skimr, keep the result, and display it.
DPA_Skimmed <- skim(DPA.Predictors.Numeric)
DPA_Skimmed
Data summary
Name |
DPA.Predictors.Numeric |
Number of rows |
167 |
Number of columns |
60 |
_______________________ |
|
Column type frequency: |
|
numeric |
60 |
________________________ |
|
Group variables |
None |
Variable type: numeric
V1 |
0 |
1 |
0.03 |
0.02 |
0.00 |
0.01 |
0.02 |
0.04 |
0.14 |
▇▃▁▁▁ |
V2 |
0 |
1 |
0.04 |
0.03 |
0.00 |
0.02 |
0.03 |
0.05 |
0.16 |
▇▅▁▁▁ |
V3 |
0 |
1 |
0.04 |
0.03 |
0.00 |
0.02 |
0.03 |
0.06 |
0.17 |
▇▆▂▁▁ |
V4 |
0 |
1 |
0.05 |
0.04 |
0.01 |
0.02 |
0.04 |
0.07 |
0.17 |
▇▆▂▁▁ |
V5 |
0 |
1 |
0.07 |
0.05 |
0.01 |
0.04 |
0.06 |
0.10 |
0.26 |
▇▆▃▁▁ |
V6 |
0 |
1 |
0.10 |
0.06 |
0.01 |
0.07 |
0.09 |
0.13 |
0.38 |
▇▇▂▁▁ |
V7 |
0 |
1 |
0.12 |
0.06 |
0.01 |
0.08 |
0.11 |
0.15 |
0.37 |
▅▇▃▁▁ |
V8 |
0 |
1 |
0.13 |
0.09 |
0.01 |
0.08 |
0.11 |
0.17 |
0.46 |
▇▇▃▁▁ |
V9 |
0 |
1 |
0.18 |
0.12 |
0.01 |
0.10 |
0.15 |
0.24 |
0.68 |
▇▆▂▁▁ |
V10 |
0 |
1 |
0.21 |
0.14 |
0.01 |
0.11 |
0.18 |
0.27 |
0.71 |
▇▆▃▁▁ |
V11 |
0 |
1 |
0.24 |
0.14 |
0.03 |
0.13 |
0.22 |
0.31 |
0.73 |
▇▇▃▁▁ |
V12 |
0 |
1 |
0.25 |
0.14 |
0.02 |
0.13 |
0.25 |
0.33 |
0.71 |
▇▇▆▂▁ |
V13 |
0 |
1 |
0.27 |
0.14 |
0.02 |
0.16 |
0.27 |
0.36 |
0.71 |
▅▇▅▂▁ |
V14 |
0 |
1 |
0.30 |
0.17 |
0.03 |
0.17 |
0.28 |
0.39 |
1.00 |
▇▇▃▁▁ |
V15 |
0 |
1 |
0.32 |
0.21 |
0.00 |
0.15 |
0.26 |
0.45 |
1.00 |
▇▆▅▂▁ |
V16 |
0 |
1 |
0.37 |
0.23 |
0.02 |
0.18 |
0.29 |
0.54 |
1.00 |
▇▆▃▃▁ |
V17 |
0 |
1 |
0.41 |
0.27 |
0.03 |
0.20 |
0.30 |
0.66 |
1.00 |
▇▇▃▅▃ |
V18 |
0 |
1 |
0.45 |
0.26 |
0.04 |
0.23 |
0.37 |
0.68 |
1.00 |
▆▇▂▅▃ |
V19 |
0 |
1 |
0.50 |
0.26 |
0.05 |
0.30 |
0.43 |
0.73 |
1.00 |
▃▇▃▅▅ |
V20 |
0 |
1 |
0.56 |
0.26 |
0.07 |
0.33 |
0.52 |
0.80 |
1.00 |
▅▇▆▆▇ |
V21 |
0 |
1 |
0.60 |
0.26 |
0.05 |
0.39 |
0.59 |
0.82 |
1.00 |
▃▆▆▆▇ |
V22 |
0 |
1 |
0.61 |
0.26 |
0.02 |
0.40 |
0.65 |
0.83 |
1.00 |
▂▅▅▆▇ |
V23 |
0 |
1 |
0.64 |
0.25 |
0.06 |
0.45 |
0.68 |
0.85 |
1.00 |
▂▃▅▆▇ |
V24 |
0 |
1 |
0.67 |
0.24 |
0.02 |
0.54 |
0.70 |
0.87 |
1.00 |
▂▂▅▇▇ |
V25 |
0 |
1 |
0.67 |
0.25 |
0.02 |
0.52 |
0.72 |
0.86 |
1.00 |
▂▂▃▇▇ |
V26 |
0 |
1 |
0.69 |
0.23 |
0.15 |
0.55 |
0.75 |
0.88 |
1.00 |
▂▂▅▆▇ |
V27 |
0 |
1 |
0.70 |
0.24 |
0.09 |
0.52 |
0.72 |
0.90 |
1.00 |
▁▂▅▃▇ |
V28 |
0 |
1 |
0.70 |
0.24 |
0.03 |
0.51 |
0.73 |
0.91 |
1.00 |
▁▂▃▅▇ |
V29 |
0 |
1 |
0.65 |
0.24 |
0.01 |
0.47 |
0.69 |
0.86 |
1.00 |
▁▂▆▅▇ |
V30 |
0 |
1 |
0.60 |
0.22 |
0.06 |
0.44 |
0.62 |
0.75 |
1.00 |
▂▅▇▇▅ |
V31 |
0 |
1 |
0.52 |
0.22 |
0.10 |
0.35 |
0.50 |
0.68 |
0.97 |
▅▇▇▅▅ |
V32 |
0 |
1 |
0.45 |
0.22 |
0.04 |
0.28 |
0.42 |
0.62 |
0.93 |
▃▇▆▅▃ |
V33 |
0 |
1 |
0.43 |
0.21 |
0.05 |
0.27 |
0.39 |
0.58 |
1.00 |
▅▇▆▅▁ |
V34 |
0 |
1 |
0.41 |
0.24 |
0.02 |
0.22 |
0.38 |
0.60 |
0.96 |
▆▇▅▅▂ |
V35 |
0 |
1 |
0.40 |
0.26 |
0.02 |
0.18 |
0.33 |
0.61 |
1.00 |
▇▇▃▅▂ |
V36 |
0 |
1 |
0.39 |
0.27 |
0.01 |
0.16 |
0.32 |
0.57 |
1.00 |
▇▆▅▃▂ |
V37 |
0 |
1 |
0.37 |
0.24 |
0.04 |
0.16 |
0.30 |
0.54 |
0.95 |
▇▅▅▂▂ |
V38 |
0 |
1 |
0.34 |
0.22 |
0.04 |
0.18 |
0.31 |
0.44 |
1.00 |
▇▇▃▂▁ |
V39 |
0 |
1 |
0.33 |
0.20 |
0.04 |
0.18 |
0.28 |
0.46 |
0.99 |
▇▇▃▂▁ |
V40 |
0 |
1 |
0.31 |
0.18 |
0.01 |
0.19 |
0.28 |
0.43 |
0.92 |
▅▇▅▂▁ |
V41 |
0 |
1 |
0.30 |
0.16 |
0.04 |
0.17 |
0.26 |
0.40 |
0.78 |
▆▇▃▂▁ |
V42 |
0 |
1 |
0.28 |
0.17 |
0.01 |
0.17 |
0.26 |
0.39 |
0.82 |
▅▇▅▁▁ |
V43 |
0 |
1 |
0.25 |
0.14 |
0.00 |
0.16 |
0.23 |
0.32 |
0.77 |
▃▇▃▁▁ |
V44 |
0 |
1 |
0.21 |
0.14 |
0.00 |
0.13 |
0.18 |
0.27 |
0.78 |
▇▇▂▂▁ |
V45 |
0 |
1 |
0.20 |
0.16 |
0.00 |
0.10 |
0.15 |
0.24 |
0.70 |
▇▆▁▂▁ |
V46 |
0 |
1 |
0.16 |
0.14 |
0.00 |
0.07 |
0.12 |
0.20 |
0.73 |
▇▃▁▁▁ |
V47 |
0 |
1 |
0.12 |
0.09 |
0.00 |
0.07 |
0.10 |
0.15 |
0.55 |
▇▅▁▁▁ |
V48 |
0 |
1 |
0.09 |
0.06 |
0.00 |
0.05 |
0.08 |
0.12 |
0.33 |
▇▇▂▁▁ |
V49 |
0 |
1 |
0.05 |
0.04 |
0.00 |
0.03 |
0.05 |
0.07 |
0.18 |
▇▇▃▂▁ |
V50 |
0 |
1 |
0.02 |
0.01 |
0.00 |
0.01 |
0.02 |
0.03 |
0.08 |
▇▇▂▁▁ |
V51 |
0 |
1 |
0.02 |
0.01 |
0.00 |
0.01 |
0.01 |
0.02 |
0.10 |
▇▃▁▁▁ |
V52 |
0 |
1 |
0.01 |
0.01 |
0.00 |
0.01 |
0.01 |
0.02 |
0.07 |
▇▃▁▁▁ |
V53 |
0 |
1 |
0.01 |
0.01 |
0.00 |
0.00 |
0.01 |
0.02 |
0.04 |
▇▅▂▁▁ |
V54 |
0 |
1 |
0.01 |
0.01 |
0.00 |
0.01 |
0.01 |
0.01 |
0.04 |
▆▇▂▁▁ |
V55 |
0 |
1 |
0.01 |
0.01 |
0.00 |
0.00 |
0.01 |
0.01 |
0.04 |
▇▃▁▁▁ |
V56 |
0 |
1 |
0.01 |
0.01 |
0.00 |
0.00 |
0.01 |
0.01 |
0.04 |
▇▅▁▁▁ |
V57 |
0 |
1 |
0.01 |
0.01 |
0.00 |
0.00 |
0.01 |
0.01 |
0.04 |
▇▅▁▁▁ |
V58 |
0 |
1 |
0.01 |
0.01 |
0.00 |
0.00 |
0.01 |
0.01 |
0.04 |
▇▃▁▁▁ |
V59 |
0 |
1 |
0.01 |
0.01 |
0.00 |
0.00 |
0.01 |
0.01 |
0.03 |
▇▆▁▁▁ |
V60 |
0 |
1 |
0.01 |
0.00 |
0.00 |
0.00 |
0.01 |
0.01 |
0.02 |
▇▆▂▁▁ |
###################################
# Verifying the data dimensions
###################################
# Rows x columns of the numeric-predictor frame; outlier assessment above only
# counts and plots, so no rows or columns are expected to have been removed.
dim(DPA.Predictors.Numeric)
## [1] 167 60
1.3.2 Zero and Near-Zero Variance
Zero and near-zero variance data assessment:
[A] Low variance was noted for 8 variables in the
previous data quality assessment, which used a lower threshold
(First.Second.Mode.Ratio > 5).
[B] No low variance was noted for any variable using a
preprocessing summary from the
caret
package. The nearZeroVar method,
with the freqCut and uniqueCut criteria set at 95/5 and 10,
respectively, was applied on the dataset.
##################################
# Loading dataset
##################################
# Work on a fresh copy of the training set for this assessment.
DPA <- Sonar_Train
##################################
# Gathering descriptive statistics
##################################
# Summarize every column (response included), keep the result, and display it.
DPA_Skimmed <- skim(DPA)
DPA_Skimmed
Data summary
Name |
DPA |
Number of rows |
167 |
Number of columns |
61 |
_______________________ |
|
Column type frequency: |
|
factor |
1 |
numeric |
60 |
________________________ |
|
Group variables |
None |
Variable type: factor
Class |
0 |
1 |
FALSE |
2 |
M: 89, R: 78 |
Variable type: numeric
V1 |
0 |
1 |
0.03 |
0.02 |
0.00 |
0.01 |
0.02 |
0.04 |
0.14 |
▇▃▁▁▁ |
V2 |
0 |
1 |
0.04 |
0.03 |
0.00 |
0.02 |
0.03 |
0.05 |
0.16 |
▇▅▁▁▁ |
V3 |
0 |
1 |
0.04 |
0.03 |
0.00 |
0.02 |
0.03 |
0.06 |
0.17 |
▇▆▂▁▁ |
V4 |
0 |
1 |
0.05 |
0.04 |
0.01 |
0.02 |
0.04 |
0.07 |
0.17 |
▇▆▂▁▁ |
V5 |
0 |
1 |
0.07 |
0.05 |
0.01 |
0.04 |
0.06 |
0.10 |
0.26 |
▇▆▃▁▁ |
V6 |
0 |
1 |
0.10 |
0.06 |
0.01 |
0.07 |
0.09 |
0.13 |
0.38 |
▇▇▂▁▁ |
V7 |
0 |
1 |
0.12 |
0.06 |
0.01 |
0.08 |
0.11 |
0.15 |
0.37 |
▅▇▃▁▁ |
V8 |
0 |
1 |
0.13 |
0.09 |
0.01 |
0.08 |
0.11 |
0.17 |
0.46 |
▇▇▃▁▁ |
V9 |
0 |
1 |
0.18 |
0.12 |
0.01 |
0.10 |
0.15 |
0.24 |
0.68 |
▇▆▂▁▁ |
V10 |
0 |
1 |
0.21 |
0.14 |
0.01 |
0.11 |
0.18 |
0.27 |
0.71 |
▇▆▃▁▁ |
V11 |
0 |
1 |
0.24 |
0.14 |
0.03 |
0.13 |
0.22 |
0.31 |
0.73 |
▇▇▃▁▁ |
V12 |
0 |
1 |
0.25 |
0.14 |
0.02 |
0.13 |
0.25 |
0.33 |
0.71 |
▇▇▆▂▁ |
V13 |
0 |
1 |
0.27 |
0.14 |
0.02 |
0.16 |
0.27 |
0.36 |
0.71 |
▅▇▅▂▁ |
V14 |
0 |
1 |
0.30 |
0.17 |
0.03 |
0.17 |
0.28 |
0.39 |
1.00 |
▇▇▃▁▁ |
V15 |
0 |
1 |
0.32 |
0.21 |
0.00 |
0.15 |
0.26 |
0.45 |
1.00 |
▇▆▅▂▁ |
V16 |
0 |
1 |
0.37 |
0.23 |
0.02 |
0.18 |
0.29 |
0.54 |
1.00 |
▇▆▃▃▁ |
V17 |
0 |
1 |
0.41 |
0.27 |
0.03 |
0.20 |
0.30 |
0.66 |
1.00 |
▇▇▃▅▃ |
V18 |
0 |
1 |
0.45 |
0.26 |
0.04 |
0.23 |
0.37 |
0.68 |
1.00 |
▆▇▂▅▃ |
V19 |
0 |
1 |
0.50 |
0.26 |
0.05 |
0.30 |
0.43 |
0.73 |
1.00 |
▃▇▃▅▅ |
V20 |
0 |
1 |
0.56 |
0.26 |
0.07 |
0.33 |
0.52 |
0.80 |
1.00 |
▅▇▆▆▇ |
V21 |
0 |
1 |
0.60 |
0.26 |
0.05 |
0.39 |
0.59 |
0.82 |
1.00 |
▃▆▆▆▇ |
V22 |
0 |
1 |
0.61 |
0.26 |
0.02 |
0.40 |
0.65 |
0.83 |
1.00 |
▂▅▅▆▇ |
V23 |
0 |
1 |
0.64 |
0.25 |
0.06 |
0.45 |
0.68 |
0.85 |
1.00 |
▂▃▅▆▇ |
V24 |
0 |
1 |
0.67 |
0.24 |
0.02 |
0.54 |
0.70 |
0.87 |
1.00 |
▂▂▅▇▇ |
V25 |
0 |
1 |
0.67 |
0.25 |
0.02 |
0.52 |
0.72 |
0.86 |
1.00 |
▂▂▃▇▇ |
V26 |
0 |
1 |
0.69 |
0.23 |
0.15 |
0.55 |
0.75 |
0.88 |
1.00 |
▂▂▅▆▇ |
V27 |
0 |
1 |
0.70 |
0.24 |
0.09 |
0.52 |
0.72 |
0.90 |
1.00 |
▁▂▅▃▇ |
V28 |
0 |
1 |
0.70 |
0.24 |
0.03 |
0.51 |
0.73 |
0.91 |
1.00 |
▁▂▃▅▇ |
V29 |
0 |
1 |
0.65 |
0.24 |
0.01 |
0.47 |
0.69 |
0.86 |
1.00 |
▁▂▆▅▇ |
V30 |
0 |
1 |
0.60 |
0.22 |
0.06 |
0.44 |
0.62 |
0.75 |
1.00 |
▂▅▇▇▅ |
V31 |
0 |
1 |
0.52 |
0.22 |
0.10 |
0.35 |
0.50 |
0.68 |
0.97 |
▅▇▇▅▅ |
V32 |
0 |
1 |
0.45 |
0.22 |
0.04 |
0.28 |
0.42 |
0.62 |
0.93 |
▃▇▆▅▃ |
V33 |
0 |
1 |
0.43 |
0.21 |
0.05 |
0.27 |
0.39 |
0.58 |
1.00 |
▅▇▆▅▁ |
V34 |
0 |
1 |
0.41 |
0.24 |
0.02 |
0.22 |
0.38 |
0.60 |
0.96 |
▆▇▅▅▂ |
V35 |
0 |
1 |
0.40 |
0.26 |
0.02 |
0.18 |
0.33 |
0.61 |
1.00 |
▇▇▃▅▂ |
V36 |
0 |
1 |
0.39 |
0.27 |
0.01 |
0.16 |
0.32 |
0.57 |
1.00 |
▇▆▅▃▂ |
V37 |
0 |
1 |
0.37 |
0.24 |
0.04 |
0.16 |
0.30 |
0.54 |
0.95 |
▇▅▅▂▂ |
V38 |
0 |
1 |
0.34 |
0.22 |
0.04 |
0.18 |
0.31 |
0.44 |
1.00 |
▇▇▃▂▁ |
V39 |
0 |
1 |
0.33 |
0.20 |
0.04 |
0.18 |
0.28 |
0.46 |
0.99 |
▇▇▃▂▁ |
V40 |
0 |
1 |
0.31 |
0.18 |
0.01 |
0.19 |
0.28 |
0.43 |
0.92 |
▅▇▅▂▁ |
V41 |
0 |
1 |
0.30 |
0.16 |
0.04 |
0.17 |
0.26 |
0.40 |
0.78 |
▆▇▃▂▁ |
V42 |
0 |
1 |
0.28 |
0.17 |
0.01 |
0.17 |
0.26 |
0.39 |
0.82 |
▅▇▅▁▁ |
V43 |
0 |
1 |
0.25 |
0.14 |
0.00 |
0.16 |
0.23 |
0.32 |
0.77 |
▃▇▃▁▁ |
V44 |
0 |
1 |
0.21 |
0.14 |
0.00 |
0.13 |
0.18 |
0.27 |
0.78 |
▇▇▂▂▁ |
V45 |
0 |
1 |
0.20 |
0.16 |
0.00 |
0.10 |
0.15 |
0.24 |
0.70 |
▇▆▁▂▁ |
V46 |
0 |
1 |
0.16 |
0.14 |
0.00 |
0.07 |
0.12 |
0.20 |
0.73 |
▇▃▁▁▁ |
V47 |
0 |
1 |
0.12 |
0.09 |
0.00 |
0.07 |
0.10 |
0.15 |
0.55 |
▇▅▁▁▁ |
V48 |
0 |
1 |
0.09 |
0.06 |
0.00 |
0.05 |
0.08 |
0.12 |
0.33 |
▇▇▂▁▁ |
V49 |
0 |
1 |
0.05 |
0.04 |
0.00 |
0.03 |
0.05 |
0.07 |
0.18 |
▇▇▃▂▁ |
V50 |
0 |
1 |
0.02 |
0.01 |
0.00 |
0.01 |
0.02 |
0.03 |
0.08 |
▇▇▂▁▁ |
V51 |
0 |
1 |
0.02 |
0.01 |
0.00 |
0.01 |
0.01 |
0.02 |
0.10 |
▇▃▁▁▁ |
V52 |
0 |
1 |
0.01 |
0.01 |
0.00 |
0.01 |
0.01 |
0.02 |
0.07 |
▇▃▁▁▁ |
V53 |
0 |
1 |
0.01 |
0.01 |
0.00 |
0.00 |
0.01 |
0.02 |
0.04 |
▇▅▂▁▁ |
V54 |
0 |
1 |
0.01 |
0.01 |
0.00 |
0.01 |
0.01 |
0.01 |
0.04 |
▆▇▂▁▁ |
V55 |
0 |
1 |
0.01 |
0.01 |
0.00 |
0.00 |
0.01 |
0.01 |
0.04 |
▇▃▁▁▁ |
V56 |
0 |
1 |
0.01 |
0.01 |
0.00 |
0.00 |
0.01 |
0.01 |
0.04 |
▇▅▁▁▁ |
V57 |
0 |
1 |
0.01 |
0.01 |
0.00 |
0.00 |
0.01 |
0.01 |
0.04 |
▇▅▁▁▁ |
V58 |
0 |
1 |
0.01 |
0.01 |
0.00 |
0.00 |
0.01 |
0.01 |
0.04 |
▇▃▁▁▁ |
V59 |
0 |
1 |
0.01 |
0.01 |
0.00 |
0.00 |
0.01 |
0.01 |
0.03 |
▇▆▁▁▁ |
V60 |
0 |
1 |
0.01 |
0.00 |
0.00 |
0.00 |
0.01 |
0.01 |
0.02 |
▇▆▂▁▁ |
##################################
# Identifying columns with low variance
###################################
# Compute caret's per-column near-zero-variance metrics (frequency ratio,
# percent unique, zeroVar and nzv flags) instead of just the flagged indices.
DPA_LowVariance <- nearZeroVar(
  DPA,
  saveMetrics = TRUE,
  freqCut = 95 / 5,
  uniqueCut = 10
)
# Show the metric rows of the columns flagged as near-zero variance.
DPA_LowVariance[DPA_LowVariance$nzv, ]
## [1] freqRatio percentUnique zeroVar nzv
## <0 rows> (or 0-length row.names)
# Report the columns flagged by nearZeroVar and, when there are any, build a
# copy of the data without them.
# Names of the flagged columns (empty when nothing was flagged).
DPA_LowVarianceNames <- rownames(DPA_LowVariance[DPA_LowVariance$nzv, ])
if (length(DPA_LowVarianceNames) == 0) {
  print("No low variance predictors noted.")
} else {
  # The reported thresholds mirror the nearZeroVar call: freqCut = 95/5 (= 19)
  # and uniqueCut = 10 percent.
  print(paste0(
    "Low variance observed for ",
    length(DPA_LowVarianceNames),
    " numeric variable(s) with First.Second.Mode.Ratio>19 and Unique.Count.Ratio<0.10."
  ))
  DPA_LowVarianceForRemoval <- length(DPA_LowVarianceNames)
  print(paste0(
    "Low variance can be resolved by removing ",
    DPA_LowVarianceForRemoval,
    " numeric variable(s)."
  ))
  for (j in seq_len(DPA_LowVarianceForRemoval)) {
    DPA_LowVarianceRemovedVariable <- DPA_LowVarianceNames[j]
    print(paste0(
      "Variable ",
      j,
      " for removal: ",
      DPA_LowVarianceRemovedVariable
    ))
  }
  # Only the last expression of a braced block is auto-printed, so the
  # intermediate summaries are printed explicitly.
  print(
    DPA %>%
      skim() %>%
      dplyr::filter(skim_variable %in% DPA_LowVarianceNames)
  )
  ##################################
  # Filtering out columns with low variance
  #################################
  DPA_ExcludedLowVariance <- DPA[, !names(DPA) %in% DPA_LowVarianceNames, drop = FALSE]
  ##################################
  # Gathering descriptive statistics
  ##################################
  DPA_ExcludedLowVariance_Skimmed <- skim(DPA_ExcludedLowVariance)
  print(DPA_ExcludedLowVariance_Skimmed)
  ###################################
  # Verifying the data dimensions
  ###################################
  dim(DPA_ExcludedLowVariance)
}
## [1] "No low variance predictors noted."
1.3.3 Collinearity
High collinearity data assessment:
[A] No high correlations (absolute Pearson coefficient > 0.95) were noted
for any pair of variables, as confirmed using the preprocessing summaries from the
caret
and
lares
packages.
##################################
# Loading dataset
##################################
# Work on a fresh copy of the training set for this assessment.
DPA <- Sonar_Train
##################################
# Listing all predictors
##################################
# Every column except the response.
DPA.Predictors <- DPA[, setdiff(names(DPA), "Class")]
##################################
# Listing all numeric predictors
##################################
DPA.Predictors.Numeric <- DPA.Predictors[, sapply(DPA.Predictors, is.numeric)]
##################################
# Visualizing pairwise correlation between predictors
##################################
# Significance tests for each pairwise Pearson correlation; the resulting
# p-value matrix is used below to blank out non-significant cells.
DPA_CorrelationTest <- cor.mtest(
  DPA.Predictors.Numeric,
  conf.level = .95,
  method = "pearson"
)
# Upper-triangle circle plot in the original column order; cells with
# p-values above 0.05 are left blank.
corrplot(
  cor(
    DPA.Predictors.Numeric,
    use = "pairwise.complete.obs",
    method = "pearson"
  ),
  type = "upper",
  method = "circle",
  order = "original",
  p.mat = DPA_CorrelationTest$p,
  sig.level = 0.05,
  insig = "blank",
  tl.srt = 90,
  tl.cex = 0.75,
  tl.col = "black"
)

##################################
# Identifying the highly correlated variables
##################################
# Pearson correlation matrix of the numeric predictors.
DPA_Correlation <- cor(
  DPA.Predictors.Numeric,
  use = "pairwise.complete.obs",
  method = "pearson"
)
# Count the distinct predictor pairs (upper triangle only) whose absolute
# correlation exceeds 0.95, then display the count.
DPA_HighlyCorrelatedCount <- sum(
  abs(DPA_Correlation[upper.tri(DPA_Correlation)]) > 0.95
)
DPA_HighlyCorrelatedCount
## [1] 0
# When highly correlated pairs exist, announce the count and rank the top
# pairs with lares::corr_cross; otherwise state that none were found.
if (DPA_HighlyCorrelatedCount > 0) {
  print(paste0(
    "High correlation observed for ",
    (DPA_HighlyCorrelatedCount),
    " pairs of numeric variable(s) with Correlation.Coefficient>0.95."
  ))
  # Last expression of the branch, so the result is displayed.
  (DPA_HighlyCorrelatedPairs <- corr_cross(
    DPA.Predictors.Numeric,
    top = DPA_HighlyCorrelatedCount,
    max_pvalue = 0.05,
    grid = FALSE,
    rm.na = TRUE
  ))
} else {
  print("No highly correlated predictors noted.")
}
## [1] "No highly correlated predictors noted."
# When highly correlated pairs exist, list the predictors that caret suggests
# removing and build a copy of the data without them.
if (DPA_HighlyCorrelatedCount > 0) {
  # Column indices into the correlation matrix (numeric predictors only).
  DPA_HighlyCorrelated <- findCorrelation(DPA_Correlation, cutoff = 0.95)
  DPA_HighlyCorrelatedForRemoval <- length(DPA_HighlyCorrelated)
  print(paste0(
    "High correlation can be resolved by removing ",
    (DPA_HighlyCorrelatedForRemoval),
    " numeric variable(s)."
  ))
  # Translate the indices to names once: the indices refer to the numeric
  # predictor columns, not to the columns of DPA.
  DPA_HighlyCorrelatedNames <- colnames(DPA.Predictors.Numeric)[DPA_HighlyCorrelated]
  for (j in seq_along(DPA_HighlyCorrelatedNames)) {
    DPA_HighlyCorrelatedRemovedVariable <- DPA_HighlyCorrelatedNames[j]
    print(paste0(
      "Variable ",
      j,
      " for removal: ",
      DPA_HighlyCorrelatedRemovedVariable
    ))
  }
  ##################################
  # Filtering out columns with high correlation
  #################################
  # Removing by name is safe for any column order and keeps every column
  # when nothing is selected (a negative empty index would drop them all).
  DPA_ExcludedHighCorrelation <- DPA[, !names(DPA) %in% DPA_HighlyCorrelatedNames, drop = FALSE]
  ##################################
  # Gathering descriptive statistics
  ##################################
  # Only the last expression of a braced block is auto-printed, so the
  # summary is printed explicitly.
  DPA_ExcludedHighCorrelation_Skimmed <- skim(DPA_ExcludedHighCorrelation)
  print(DPA_ExcludedHighCorrelation_Skimmed)
  ###################################
  # Verifying the data dimensions
  ###################################
  dim(DPA_ExcludedHighCorrelation)
}
1.3.4 Linear Dependencies
Linear dependency data assessment:
[A] No linear dependencies noted for any subset of
variables using the preprocessing summary from the
caret
package applying the findLinearCombos method which utilizes the
QR decomposition of a matrix to enumerate sets of linear combinations
(if they exist).
##################################
# Loading dataset
##################################
# Work on a fresh copy of the training set for this assessment.
DPA <- Sonar_Train
##################################
# Listing all predictors
##################################
# Every column except the response.
DPA.Predictors <- DPA[, setdiff(names(DPA), "Class")]
##################################
# Listing all numeric predictors
##################################
DPA.Predictors.Numeric <- DPA.Predictors[, sapply(DPA.Predictors, is.numeric)]
##################################
# Identifying the linearly dependent variables
##################################
DPA_LinearlyDependent <- findLinearCombos(DPA.Predictors.Numeric)
# Number of linearly dependent column subsets found; displayed after counting.
DPA_LinearlyDependentCount <- length(DPA_LinearlyDependent[["linearCombos"]])
DPA_LinearlyDependentCount
## [1] 0
# Report each subset of linearly dependent numeric predictors, one message
# per subset.
if (DPA_LinearlyDependentCount == 0) {
  print("No linearly dependent predictors noted.")
} else {
  print(paste0(
    "Linear dependency observed for ",
    (DPA_LinearlyDependentCount),
    " subset(s) of numeric variable(s)."
  ))
  for (i in seq_len(DPA_LinearlyDependentCount)) {
    DPA_LinearlyDependentSubset <- colnames(DPA.Predictors.Numeric)[DPA_LinearlyDependent$linearCombos[[i]]]
    # The subset holds several names; collapse them so a single message is
    # printed per subset rather than one fragment per variable.
    print(paste0(
      "Linear dependent variable(s) for subset ",
      i,
      " include: ",
      paste(DPA_LinearlyDependentSubset, collapse = ", ")
    ))
  }
}
## [1] "No linearly dependent predictors noted."
##################################
# Identifying the linearly dependent variables for removal
##################################
# When linear dependencies exist, list the predictors that caret suggests
# removing and build a copy of the data without them; otherwise just report
# the unchanged dimensions.
# Reuses DPA_LinearlyDependent from the detection step above rather than
# recomputing it.
if (DPA_LinearlyDependentCount > 0) {
  DPA_LinearlyDependentForRemoval <- length(DPA_LinearlyDependent$remove)
  print(paste0(
    "Linear dependency can be resolved by removing ",
    (DPA_LinearlyDependentForRemoval),
    " numeric variable(s)."
  ))
  # Translate the indices to names once: $remove refers to the columns of the
  # numeric predictor frame, not to the columns of DPA.
  DPA_LinearlyDependentNames <- colnames(DPA.Predictors.Numeric)[DPA_LinearlyDependent$remove]
  for (j in seq_along(DPA_LinearlyDependentNames)) {
    DPA_LinearlyDependentRemovedVariable <- DPA_LinearlyDependentNames[j]
    print(paste0(
      "Variable ",
      j,
      " for removal: ",
      DPA_LinearlyDependentRemovedVariable
    ))
  }
  ##################################
  # Filtering out columns with linear dependency
  #################################
  # Removing by name is safe for any column order.
  DPA_ExcludedLinearlyDependent <- DPA[, !names(DPA) %in% DPA_LinearlyDependentNames, drop = FALSE]
  ##################################
  # Gathering descriptive statistics
  ##################################
  # Only the last expression of a braced block is auto-printed, so the
  # summary is printed explicitly.
  DPA_ExcludedLinearlyDependent_Skimmed <- skim(DPA_ExcludedLinearlyDependent)
  print(DPA_ExcludedLinearlyDependent_Skimmed)
  ###################################
  # Verifying the data dimensions
  ###################################
  dim(DPA_ExcludedLinearlyDependent)
} else {
  ###################################
  # Verifying the data dimensions
  ###################################
  dim(DPA)
}
## [1] 167 61
1.3.6 Centering and Scaling
Centering and scaling data assessment:
[A] To maintain numerical stability during modelling,
centering and scaling transformations were applied on the Box-Cox
transformed numeric variables. The center method
from the
caret
package was implemented, which subtracts the average value of a numeric
variable from all of its values. As a result of centering, the variables had
zero mean values. In addition, the scale method, also from the
caret
package, was applied, which divides each
value of the variable by its standard deviation. Scaling the
data coerced the values to have a common standard deviation of
one.
##################################
# Loading dataset
##################################
DPA <- Sonar_Train
##################################
# Listing all predictors
# (every column except the Class response)
##################################
# drop = FALSE keeps a data frame even if only one predictor remains
DPA.Predictors <- DPA[, !names(DPA) %in% c("Class"), drop = FALSE]
##################################
# Listing all numeric predictors
##################################
# vapply() always returns a logical vector, unlike sapply() whose
# return type depends on its input
DPA.Predictors.Numeric <- DPA.Predictors[, vapply(DPA.Predictors, is.numeric, logical(1)), drop = FALSE]
##################################
# Applying a Box-Cox transformation
# (fitted on, and then applied to, the numeric train predictors)
##################################
DPA_BoxCox <- preProcess(DPA.Predictors.Numeric, method = c("BoxCox"))
DPA_BoxCoxTransformed <- predict(DPA_BoxCox, DPA.Predictors.Numeric)
##################################
# Applying a center and scale data transformation
# on the Box-Cox transformed predictors
##################################
DPA.Predictors.Numeric_BoxCoxTransformed_CenteredScaled <- preProcess(DPA_BoxCoxTransformed, method = c("center", "scale"))
DPA.Predictors.Numeric_BoxCoxTransformed_CenteredScaledTransformed <- predict(DPA.Predictors.Numeric_BoxCoxTransformed_CenteredScaled, DPA_BoxCoxTransformed)
##################################
# Gathering descriptive statistics
##################################
(DPA.Predictors.Numeric_BoxCoxTransformed_CenteredScaledTransformedSkimmed <- skim(DPA.Predictors.Numeric_BoxCoxTransformed_CenteredScaledTransformed))
Data summary
Name |
DPA.Predictors.Numeric_Bo… |
Number of rows |
167 |
Number of columns |
60 |
_______________________ |
|
Column type frequency: |
|
numeric |
60 |
________________________ |
|
Group variables |
None |
Variable type: numeric
V1 |
0 |
1 |
0 |
1 |
-3.60 |
-0.65 |
0.02 |
0.64 |
2.40 |
▁▂▇▇▂ |
V2 |
0 |
1 |
0 |
1 |
-2.55 |
-0.65 |
0.05 |
0.59 |
2.40 |
▁▅▇▆▂ |
V3 |
0 |
1 |
0 |
1 |
-2.55 |
-0.74 |
-0.02 |
0.72 |
2.46 |
▂▅▇▅▂ |
V4 |
0 |
1 |
0 |
1 |
-2.17 |
-0.71 |
0.08 |
0.62 |
2.18 |
▂▅▇▅▂ |
V5 |
0 |
1 |
0 |
1 |
-2.29 |
-0.68 |
0.00 |
0.73 |
2.53 |
▂▇▇▆▂ |
V6 |
0 |
1 |
0 |
1 |
-2.54 |
-0.55 |
-0.03 |
0.62 |
3.25 |
▂▆▇▂▁ |
V7 |
0 |
1 |
0 |
1 |
-2.75 |
-0.52 |
-0.11 |
0.61 |
2.96 |
▁▃▇▃▁ |
V8 |
0 |
1 |
0 |
1 |
-2.70 |
-0.60 |
-0.10 |
0.61 |
2.72 |
▁▅▇▅▁ |
V9 |
0 |
1 |
0 |
1 |
-2.94 |
-0.64 |
0.01 |
0.71 |
2.58 |
▁▃▇▅▁ |
V10 |
0 |
1 |
0 |
1 |
-2.72 |
-0.69 |
0.01 |
0.66 |
2.48 |
▁▅▇▅▂ |
V11 |
0 |
1 |
0 |
1 |
-2.43 |
-0.77 |
0.09 |
0.67 |
2.48 |
▂▆▇▆▂ |
V12 |
0 |
1 |
0 |
1 |
-2.22 |
-0.81 |
0.14 |
0.69 |
2.48 |
▃▆▇▅▂ |
V13 |
0 |
1 |
0 |
1 |
-2.64 |
-0.72 |
0.09 |
0.68 |
2.45 |
▁▅▇▅▂ |
V14 |
0 |
1 |
0 |
1 |
-2.76 |
-0.72 |
0.09 |
0.71 |
2.74 |
▁▅▇▆▁ |
V15 |
0 |
1 |
0 |
1 |
-2.79 |
-0.67 |
-0.04 |
0.76 |
2.30 |
▁▅▇▇▂ |
V16 |
0 |
1 |
0 |
1 |
-2.90 |
-0.75 |
-0.12 |
0.82 |
2.00 |
▁▅▇▆▅ |
V17 |
0 |
1 |
0 |
1 |
-2.59 |
-0.70 |
-0.16 |
1.00 |
1.70 |
▁▅▇▅▇ |
V18 |
0 |
1 |
0 |
1 |
-2.68 |
-0.76 |
-0.12 |
0.93 |
1.71 |
▁▃▇▃▆ |
V19 |
0 |
1 |
0 |
1 |
-2.40 |
-0.71 |
-0.13 |
0.90 |
1.66 |
▂▅▇▅▇ |
V20 |
0 |
1 |
0 |
1 |
-1.83 |
-0.84 |
-0.13 |
0.92 |
1.68 |
▅▇▆▆▇ |
V21 |
0 |
1 |
0 |
1 |
-2.12 |
-0.80 |
-0.03 |
0.84 |
1.55 |
▃▆▆▆▇ |
V22 |
0 |
1 |
0 |
1 |
-2.32 |
-0.84 |
0.13 |
0.86 |
1.52 |
▂▅▅▆▇ |
V23 |
0 |
1 |
0 |
1 |
-2.02 |
-0.82 |
0.11 |
0.86 |
1.54 |
▃▆▇▇▇ |
V24 |
0 |
1 |
0 |
1 |
-2.17 |
-0.62 |
0.04 |
0.86 |
1.52 |
▃▃▇▆▇ |
V25 |
0 |
1 |
0 |
1 |
-2.09 |
-0.72 |
0.13 |
0.81 |
1.53 |
▅▃▆▇▇ |
V26 |
0 |
1 |
0 |
1 |
-1.91 |
-0.78 |
0.15 |
0.83 |
1.53 |
▅▅▆▇▇ |
V27 |
0 |
1 |
0 |
1 |
-2.06 |
-0.84 |
0.01 |
0.86 |
1.38 |
▂▃▅▃▇ |
V28 |
0 |
1 |
0 |
1 |
-2.25 |
-0.85 |
0.07 |
0.91 |
1.38 |
▂▅▅▅▇ |
V29 |
0 |
1 |
0 |
1 |
-2.23 |
-0.81 |
0.10 |
0.89 |
1.56 |
▂▆▅▆▇ |
V30 |
0 |
1 |
0 |
1 |
-2.47 |
-0.72 |
0.09 |
0.70 |
1.83 |
▂▅▇▇▅ |
V31 |
0 |
1 |
0 |
1 |
-2.33 |
-0.73 |
-0.02 |
0.75 |
1.81 |
▂▆▇▇▆ |
V32 |
0 |
1 |
0 |
1 |
-2.38 |
-0.68 |
0.01 |
0.82 |
1.91 |
▂▅▇▆▅ |
V33 |
0 |
1 |
0 |
1 |
-2.48 |
-0.67 |
-0.03 |
0.77 |
2.21 |
▁▆▇▇▃ |
V34 |
0 |
1 |
0 |
1 |
-2.44 |
-0.76 |
0.01 |
0.86 |
1.93 |
▂▆▇▆▅ |
V35 |
0 |
1 |
0 |
1 |
-2.24 |
-0.74 |
-0.04 |
0.88 |
1.81 |
▂▆▇▆▆ |
V36 |
0 |
1 |
0 |
1 |
-2.53 |
-0.85 |
-0.05 |
0.79 |
1.82 |
▁▇▇▇▆ |
V37 |
0 |
1 |
0 |
1 |
-2.16 |
-0.81 |
-0.02 |
0.82 |
1.81 |
▂▇▆▇▆ |
V38 |
0 |
1 |
0 |
1 |
-2.52 |
-0.71 |
0.09 |
0.65 |
2.11 |
▂▆▇▇▃ |
V39 |
0 |
1 |
0 |
1 |
-2.40 |
-0.71 |
-0.04 |
0.77 |
2.31 |
▂▇▇▆▂ |
V40 |
0 |
1 |
0 |
1 |
-2.65 |
-0.61 |
-0.06 |
0.71 |
2.59 |
▁▅▇▅▁ |
V41 |
0 |
1 |
0 |
1 |
-2.25 |
-0.75 |
-0.04 |
0.75 |
2.27 |
▂▆▇▅▂ |
V42 |
0 |
1 |
0 |
1 |
-2.71 |
-0.59 |
0.00 |
0.70 |
2.49 |
▁▃▇▆▁ |
V43 |
0 |
1 |
0 |
1 |
-1.73 |
-0.65 |
-0.15 |
0.51 |
3.64 |
▃▇▃▁▁ |
V44 |
0 |
1 |
0 |
1 |
-1.56 |
-0.65 |
-0.29 |
0.37 |
4.08 |
▇▇▂▂▁ |
V45 |
0 |
1 |
0 |
1 |
-1.28 |
-0.66 |
-0.34 |
0.23 |
3.22 |
▇▆▁▂▁ |
V46 |
0 |
1 |
0 |
1 |
-1.18 |
-0.68 |
-0.31 |
0.27 |
4.11 |
▇▃▁▁▁ |
V47 |
0 |
1 |
0 |
1 |
-1.42 |
-0.66 |
-0.24 |
0.34 |
4.86 |
▇▅▁▁▁ |
V48 |
0 |
1 |
0 |
1 |
-1.49 |
-0.76 |
-0.21 |
0.44 |
3.76 |
▇▇▂▁▁ |
V49 |
0 |
1 |
0 |
1 |
-1.50 |
-0.70 |
-0.25 |
0.51 |
3.48 |
▇▇▃▂▁ |
V50 |
0 |
1 |
0 |
1 |
-1.52 |
-0.66 |
-0.21 |
0.34 |
4.51 |
▇▇▂▁▁ |
V51 |
0 |
1 |
0 |
1 |
-1.27 |
-0.64 |
-0.19 |
0.41 |
6.72 |
▇▃▁▁▁ |
V52 |
0 |
1 |
0 |
1 |
-3.63 |
-0.54 |
0.07 |
0.63 |
2.66 |
▁▂▇▇▂ |
V53 |
0 |
1 |
0 |
1 |
-2.84 |
-0.75 |
-0.10 |
0.79 |
2.30 |
▁▅▇▇▂ |
V54 |
0 |
1 |
0 |
1 |
-2.32 |
-0.70 |
0.04 |
0.60 |
2.35 |
▂▃▇▃▂ |
V55 |
0 |
1 |
0 |
1 |
-2.67 |
-0.75 |
0.02 |
0.70 |
2.93 |
▁▆▇▅▁ |
V56 |
0 |
1 |
0 |
1 |
-2.77 |
-0.61 |
0.00 |
0.60 |
3.26 |
▁▆▇▃▁ |
V57 |
0 |
1 |
0 |
1 |
-3.11 |
-0.70 |
-0.07 |
0.70 |
2.80 |
▁▃▇▅▁ |
V58 |
0 |
1 |
0 |
1 |
-3.73 |
-0.65 |
-0.01 |
0.67 |
2.46 |
▁▂▇▇▂ |
V59 |
0 |
1 |
0 |
1 |
-2.93 |
-0.65 |
-0.04 |
0.61 |
2.46 |
▁▃▇▅▂ |
V60 |
0 |
1 |
0 |
1 |
-2.95 |
-0.67 |
0.09 |
0.81 |
2.15 |
▁▃▆▇▂ |
###################################
# Verifying the data dimensions (rows, columns)
# of the Box-Cox transformed, centered and scaled
# numeric predictors
###################################
dim(DPA.Predictors.Numeric_BoxCoxTransformed_CenteredScaledTransformed)
## [1] 167 60
1.3.7 Pre-Processed Dataset
Preliminary dataset assessment:
[A] 208 rows (observations)
[A.1] Train Set = 167 observations
[A.2] Test Set = 41 observations
[B] 61 columns (variables)
[B.1] 1/61 response = Class variable (factor)
[B.1.1] Levels = Class=R < Class=M
[B.2] 60/61 predictors = All remaining variables
(60/60 numeric)
[C] Pre-processing actions applied:
[C.1] Centering, scaling and shape transformation
applied to improve data quality
[C.2] No outlier treatment applied since the high
values noted were contextually valid and sensible
[C.3] No predictors removed due to zero or
near-zero variance
[C.4] No predictors removed due to high
correlation
[C.5] No predictors removed due to linear
dependencies
##################################
# Assembling the pre-modelling train set:
# the Class response placed in front of the
# transformed numeric predictors
##################################
Class <- DPA$Class
PMA.Predictors.Numeric <- DPA.Predictors.Numeric_BoxCoxTransformed_CenteredScaledTransformed
PMA_BoxCoxTransformed_CenteredScaledTransformed <- data.frame(
  Class,
  PMA.Predictors.Numeric,
  check.names = FALSE
)
PMA_PreModelling_Train <- PMA_BoxCoxTransformed_CenteredScaledTransformed
##################################
# Summarising the assembled train set
##################################
PMA_PreModelling_Train_Skimmed <- skim(PMA_PreModelling_Train)
PMA_PreModelling_Train_Skimmed
Data summary
Name |
PMA_PreModelling_Train |
Number of rows |
167 |
Number of columns |
61 |
_______________________ |
|
Column type frequency: |
|
factor |
1 |
numeric |
60 |
________________________ |
|
Group variables |
None |
Variable type: factor
Class |
0 |
1 |
FALSE |
2 |
M: 89, R: 78 |
Variable type: numeric
V1 |
0 |
1 |
0 |
1 |
-3.60 |
-0.65 |
0.02 |
0.64 |
2.40 |
▁▂▇▇▂ |
V2 |
0 |
1 |
0 |
1 |
-2.55 |
-0.65 |
0.05 |
0.59 |
2.40 |
▁▅▇▆▂ |
V3 |
0 |
1 |
0 |
1 |
-2.55 |
-0.74 |
-0.02 |
0.72 |
2.46 |
▂▅▇▅▂ |
V4 |
0 |
1 |
0 |
1 |
-2.17 |
-0.71 |
0.08 |
0.62 |
2.18 |
▂▅▇▅▂ |
V5 |
0 |
1 |
0 |
1 |
-2.29 |
-0.68 |
0.00 |
0.73 |
2.53 |
▂▇▇▆▂ |
V6 |
0 |
1 |
0 |
1 |
-2.54 |
-0.55 |
-0.03 |
0.62 |
3.25 |
▂▆▇▂▁ |
V7 |
0 |
1 |
0 |
1 |
-2.75 |
-0.52 |
-0.11 |
0.61 |
2.96 |
▁▃▇▃▁ |
V8 |
0 |
1 |
0 |
1 |
-2.70 |
-0.60 |
-0.10 |
0.61 |
2.72 |
▁▅▇▅▁ |
V9 |
0 |
1 |
0 |
1 |
-2.94 |
-0.64 |
0.01 |
0.71 |
2.58 |
▁▃▇▅▁ |
V10 |
0 |
1 |
0 |
1 |
-2.72 |
-0.69 |
0.01 |
0.66 |
2.48 |
▁▅▇▅▂ |
V11 |
0 |
1 |
0 |
1 |
-2.43 |
-0.77 |
0.09 |
0.67 |
2.48 |
▂▆▇▆▂ |
V12 |
0 |
1 |
0 |
1 |
-2.22 |
-0.81 |
0.14 |
0.69 |
2.48 |
▃▆▇▅▂ |
V13 |
0 |
1 |
0 |
1 |
-2.64 |
-0.72 |
0.09 |
0.68 |
2.45 |
▁▅▇▅▂ |
V14 |
0 |
1 |
0 |
1 |
-2.76 |
-0.72 |
0.09 |
0.71 |
2.74 |
▁▅▇▆▁ |
V15 |
0 |
1 |
0 |
1 |
-2.79 |
-0.67 |
-0.04 |
0.76 |
2.30 |
▁▅▇▇▂ |
V16 |
0 |
1 |
0 |
1 |
-2.90 |
-0.75 |
-0.12 |
0.82 |
2.00 |
▁▅▇▆▅ |
V17 |
0 |
1 |
0 |
1 |
-2.59 |
-0.70 |
-0.16 |
1.00 |
1.70 |
▁▅▇▅▇ |
V18 |
0 |
1 |
0 |
1 |
-2.68 |
-0.76 |
-0.12 |
0.93 |
1.71 |
▁▃▇▃▆ |
V19 |
0 |
1 |
0 |
1 |
-2.40 |
-0.71 |
-0.13 |
0.90 |
1.66 |
▂▅▇▅▇ |
V20 |
0 |
1 |
0 |
1 |
-1.83 |
-0.84 |
-0.13 |
0.92 |
1.68 |
▅▇▆▆▇ |
V21 |
0 |
1 |
0 |
1 |
-2.12 |
-0.80 |
-0.03 |
0.84 |
1.55 |
▃▆▆▆▇ |
V22 |
0 |
1 |
0 |
1 |
-2.32 |
-0.84 |
0.13 |
0.86 |
1.52 |
▂▅▅▆▇ |
V23 |
0 |
1 |
0 |
1 |
-2.02 |
-0.82 |
0.11 |
0.86 |
1.54 |
▃▆▇▇▇ |
V24 |
0 |
1 |
0 |
1 |
-2.17 |
-0.62 |
0.04 |
0.86 |
1.52 |
▃▃▇▆▇ |
V25 |
0 |
1 |
0 |
1 |
-2.09 |
-0.72 |
0.13 |
0.81 |
1.53 |
▅▃▆▇▇ |
V26 |
0 |
1 |
0 |
1 |
-1.91 |
-0.78 |
0.15 |
0.83 |
1.53 |
▅▅▆▇▇ |
V27 |
0 |
1 |
0 |
1 |
-2.06 |
-0.84 |
0.01 |
0.86 |
1.38 |
▂▃▅▃▇ |
V28 |
0 |
1 |
0 |
1 |
-2.25 |
-0.85 |
0.07 |
0.91 |
1.38 |
▂▅▅▅▇ |
V29 |
0 |
1 |
0 |
1 |
-2.23 |
-0.81 |
0.10 |
0.89 |
1.56 |
▂▆▅▆▇ |
V30 |
0 |
1 |
0 |
1 |
-2.47 |
-0.72 |
0.09 |
0.70 |
1.83 |
▂▅▇▇▅ |
V31 |
0 |
1 |
0 |
1 |
-2.33 |
-0.73 |
-0.02 |
0.75 |
1.81 |
▂▆▇▇▆ |
V32 |
0 |
1 |
0 |
1 |
-2.38 |
-0.68 |
0.01 |
0.82 |
1.91 |
▂▅▇▆▅ |
V33 |
0 |
1 |
0 |
1 |
-2.48 |
-0.67 |
-0.03 |
0.77 |
2.21 |
▁▆▇▇▃ |
V34 |
0 |
1 |
0 |
1 |
-2.44 |
-0.76 |
0.01 |
0.86 |
1.93 |
▂▆▇▆▅ |
V35 |
0 |
1 |
0 |
1 |
-2.24 |
-0.74 |
-0.04 |
0.88 |
1.81 |
▂▆▇▆▆ |
V36 |
0 |
1 |
0 |
1 |
-2.53 |
-0.85 |
-0.05 |
0.79 |
1.82 |
▁▇▇▇▆ |
V37 |
0 |
1 |
0 |
1 |
-2.16 |
-0.81 |
-0.02 |
0.82 |
1.81 |
▂▇▆▇▆ |
V38 |
0 |
1 |
0 |
1 |
-2.52 |
-0.71 |
0.09 |
0.65 |
2.11 |
▂▆▇▇▃ |
V39 |
0 |
1 |
0 |
1 |
-2.40 |
-0.71 |
-0.04 |
0.77 |
2.31 |
▂▇▇▆▂ |
V40 |
0 |
1 |
0 |
1 |
-2.65 |
-0.61 |
-0.06 |
0.71 |
2.59 |
▁▅▇▅▁ |
V41 |
0 |
1 |
0 |
1 |
-2.25 |
-0.75 |
-0.04 |
0.75 |
2.27 |
▂▆▇▅▂ |
V42 |
0 |
1 |
0 |
1 |
-2.71 |
-0.59 |
0.00 |
0.70 |
2.49 |
▁▃▇▆▁ |
V43 |
0 |
1 |
0 |
1 |
-1.73 |
-0.65 |
-0.15 |
0.51 |
3.64 |
▃▇▃▁▁ |
V44 |
0 |
1 |
0 |
1 |
-1.56 |
-0.65 |
-0.29 |
0.37 |
4.08 |
▇▇▂▂▁ |
V45 |
0 |
1 |
0 |
1 |
-1.28 |
-0.66 |
-0.34 |
0.23 |
3.22 |
▇▆▁▂▁ |
V46 |
0 |
1 |
0 |
1 |
-1.18 |
-0.68 |
-0.31 |
0.27 |
4.11 |
▇▃▁▁▁ |
V47 |
0 |
1 |
0 |
1 |
-1.42 |
-0.66 |
-0.24 |
0.34 |
4.86 |
▇▅▁▁▁ |
V48 |
0 |
1 |
0 |
1 |
-1.49 |
-0.76 |
-0.21 |
0.44 |
3.76 |
▇▇▂▁▁ |
V49 |
0 |
1 |
0 |
1 |
-1.50 |
-0.70 |
-0.25 |
0.51 |
3.48 |
▇▇▃▂▁ |
V50 |
0 |
1 |
0 |
1 |
-1.52 |
-0.66 |
-0.21 |
0.34 |
4.51 |
▇▇▂▁▁ |
V51 |
0 |
1 |
0 |
1 |
-1.27 |
-0.64 |
-0.19 |
0.41 |
6.72 |
▇▃▁▁▁ |
V52 |
0 |
1 |
0 |
1 |
-3.63 |
-0.54 |
0.07 |
0.63 |
2.66 |
▁▂▇▇▂ |
V53 |
0 |
1 |
0 |
1 |
-2.84 |
-0.75 |
-0.10 |
0.79 |
2.30 |
▁▅▇▇▂ |
V54 |
0 |
1 |
0 |
1 |
-2.32 |
-0.70 |
0.04 |
0.60 |
2.35 |
▂▃▇▃▂ |
V55 |
0 |
1 |
0 |
1 |
-2.67 |
-0.75 |
0.02 |
0.70 |
2.93 |
▁▆▇▅▁ |
V56 |
0 |
1 |
0 |
1 |
-2.77 |
-0.61 |
0.00 |
0.60 |
3.26 |
▁▆▇▃▁ |
V57 |
0 |
1 |
0 |
1 |
-3.11 |
-0.70 |
-0.07 |
0.70 |
2.80 |
▁▃▇▅▁ |
V58 |
0 |
1 |
0 |
1 |
-3.73 |
-0.65 |
-0.01 |
0.67 |
2.46 |
▁▂▇▇▂ |
V59 |
0 |
1 |
0 |
1 |
-2.93 |
-0.65 |
-0.04 |
0.61 |
2.46 |
▁▃▇▅▂ |
V60 |
0 |
1 |
0 |
1 |
-2.95 |
-0.67 |
0.09 |
0.81 |
2.15 |
▁▃▆▇▂ |
###################################
# Verifying the data dimensions (rows, columns)
# for the train set, response column included
###################################
dim(PMA_PreModelling_Train)
## [1] 167 61
##################################
# Formulating the test set
##################################
DPA_Test <- Sonar_Test
DPA_Test.Predictors <- DPA_Test[, !names(DPA_Test) %in% c("Class"), drop = FALSE]
DPA_Test.Predictors.Numeric <- DPA_Test.Predictors[, vapply(DPA_Test.Predictors, is.numeric, logical(1)), drop = FALSE]
##################################
# Applying the transformations estimated
# on the train set to the test set
##################################
# The Box-Cox and center/scale parameters must come from the train set:
# re-estimating them on the test set gives the two sets different
# transformations and lets test-set statistics leak into the evaluation.
# The *_Test_* object names are kept as aliases of the train-fitted objects.
# NOTE(review): outputs rendered from the earlier test-fitted transformation
# will change once the document is re-run.
DPA_Test_BoxCox <- DPA_BoxCox
DPA_Test_BoxCoxTransformed <- predict(DPA_Test_BoxCox, DPA_Test.Predictors.Numeric)
DPA_Test.Predictors.Numeric_BoxCoxTransformed_CenteredScaled <- DPA.Predictors.Numeric_BoxCoxTransformed_CenteredScaled
DPA_Test.Predictors.Numeric_BoxCoxTransformed_CenteredScaledTransformed <- predict(DPA_Test.Predictors.Numeric_BoxCoxTransformed_CenteredScaled, DPA_Test_BoxCoxTransformed)
##################################
# Assembling the pre-modelling test set:
# the Class response placed in front of the
# transformed numeric predictors
##################################
Class <- DPA_Test$Class
PMA_Test.Predictors.Numeric <- DPA_Test.Predictors.Numeric_BoxCoxTransformed_CenteredScaledTransformed
PMA_Test_BoxCoxTransformed_CenteredScaledTransformed <- data.frame(
  Class,
  PMA_Test.Predictors.Numeric,
  check.names = FALSE
)
PMA_PreModelling_Test <- PMA_Test_BoxCoxTransformed_CenteredScaledTransformed
##################################
# Summarising the assembled test set
##################################
PMA_PreModelling_Test_Skimmed <- skim(PMA_PreModelling_Test)
PMA_PreModelling_Test_Skimmed
Data summary
Name |
PMA_PreModelling_Test |
Number of rows |
41 |
Number of columns |
61 |
_______________________ |
|
Column type frequency: |
|
factor |
1 |
numeric |
60 |
________________________ |
|
Group variables |
None |
Variable type: factor
Class |
0 |
1 |
FALSE |
2 |
M: 22, R: 19 |
Variable type: numeric
V1 |
0 |
1 |
0 |
1 |
-1.96 |
-0.88 |
0.00 |
0.74 |
2.12 |
▃▆▇▇▁ |
V2 |
0 |
1 |
0 |
1 |
-2.49 |
-0.60 |
0.03 |
0.62 |
3.07 |
▁▆▇▂▁ |
V3 |
0 |
1 |
0 |
1 |
-2.39 |
-0.45 |
0.05 |
0.49 |
2.46 |
▁▂▇▂▂ |
V4 |
0 |
1 |
0 |
1 |
-2.44 |
-0.70 |
-0.13 |
0.43 |
2.80 |
▁▇▇▃▁ |
V5 |
0 |
1 |
0 |
1 |
-2.69 |
-0.64 |
-0.01 |
0.72 |
2.31 |
▁▃▇▅▁ |
V6 |
0 |
1 |
0 |
1 |
-2.11 |
-0.54 |
-0.09 |
0.88 |
1.90 |
▂▅▇▃▅ |
V7 |
0 |
1 |
0 |
1 |
-2.66 |
-0.74 |
-0.02 |
0.69 |
2.56 |
▁▅▇▅▁ |
V8 |
0 |
1 |
0 |
1 |
-2.87 |
-0.40 |
-0.12 |
0.46 |
2.82 |
▁▂▇▂▁ |
V9 |
0 |
1 |
0 |
1 |
-2.60 |
-0.59 |
-0.04 |
0.51 |
2.93 |
▁▅▇▂▁ |
V10 |
0 |
1 |
0 |
1 |
-2.17 |
-0.54 |
0.10 |
0.56 |
2.69 |
▂▅▇▂▁ |
V11 |
0 |
1 |
0 |
1 |
-2.13 |
-0.63 |
0.14 |
0.64 |
2.04 |
▃▅▇▇▂ |
V12 |
0 |
1 |
0 |
1 |
-1.99 |
-0.52 |
0.05 |
0.49 |
2.51 |
▂▃▇▂▁ |
V13 |
0 |
1 |
0 |
1 |
-2.29 |
-0.56 |
-0.04 |
0.46 |
2.36 |
▁▃▇▂▂ |
V14 |
0 |
1 |
0 |
1 |
-2.47 |
-0.61 |
0.10 |
0.40 |
2.58 |
▁▅▇▃▁ |
V15 |
0 |
1 |
0 |
1 |
-2.01 |
-0.82 |
-0.05 |
0.71 |
1.82 |
▃▃▇▃▅ |
V16 |
0 |
1 |
0 |
1 |
-2.28 |
-0.73 |
-0.13 |
0.69 |
1.85 |
▂▆▇▇▆ |
V17 |
0 |
1 |
0 |
1 |
-2.37 |
-0.89 |
0.03 |
0.88 |
1.71 |
▁▇▆▆▇ |
V18 |
0 |
1 |
0 |
1 |
-2.11 |
-0.60 |
-0.15 |
0.91 |
1.66 |
▂▃▇▅▇ |
V19 |
0 |
1 |
0 |
1 |
-1.97 |
-0.81 |
0.06 |
0.99 |
1.61 |
▃▇▆▅▇ |
V20 |
0 |
1 |
0 |
1 |
-2.01 |
-0.75 |
0.10 |
1.06 |
1.45 |
▂▆▅▅▇ |
V21 |
0 |
1 |
0 |
1 |
-1.84 |
-0.83 |
0.23 |
0.86 |
1.48 |
▅▃▃▇▇ |
V22 |
0 |
1 |
0 |
1 |
-1.87 |
-0.86 |
0.32 |
0.61 |
1.44 |
▃▃▂▇▅ |
V23 |
0 |
1 |
0 |
1 |
-1.86 |
-1.01 |
0.26 |
0.69 |
1.54 |
▃▅▃▇▆ |
V24 |
0 |
1 |
0 |
1 |
-1.97 |
-0.78 |
0.22 |
0.83 |
1.53 |
▃▇▃▇▇ |
V25 |
0 |
1 |
0 |
1 |
-1.81 |
-0.82 |
-0.05 |
0.91 |
1.41 |
▃▂▅▂▇ |
V26 |
0 |
1 |
0 |
1 |
-1.97 |
-0.78 |
0.17 |
0.89 |
1.14 |
▂▂▃▁▇ |
V27 |
0 |
1 |
0 |
1 |
-2.05 |
-0.82 |
0.12 |
0.99 |
1.14 |
▂▃▃▅▇ |
V28 |
0 |
1 |
0 |
1 |
-2.06 |
-0.56 |
0.09 |
0.78 |
1.53 |
▃▅▇▇▇ |
V29 |
0 |
1 |
0 |
1 |
-2.06 |
-0.65 |
-0.07 |
0.92 |
1.67 |
▃▇▆▇▇ |
V30 |
0 |
1 |
0 |
1 |
-1.94 |
-0.55 |
-0.32 |
0.75 |
1.70 |
▅▆▆▇▅ |
V31 |
0 |
1 |
0 |
1 |
-2.24 |
-0.64 |
-0.04 |
0.67 |
2.55 |
▂▅▇▅▁ |
V32 |
0 |
1 |
0 |
1 |
-1.94 |
-0.59 |
0.26 |
0.62 |
2.46 |
▃▃▇▃▁ |
V33 |
0 |
1 |
0 |
1 |
-2.19 |
-0.77 |
0.12 |
0.73 |
1.93 |
▂▆▇▆▃ |
V34 |
0 |
1 |
0 |
1 |
-2.52 |
-0.75 |
-0.09 |
0.78 |
1.90 |
▁▅▇▆▃ |
V35 |
0 |
1 |
0 |
1 |
-1.75 |
-0.74 |
-0.05 |
0.84 |
1.76 |
▇▇▇▇▆ |
V36 |
0 |
1 |
0 |
1 |
-1.76 |
-0.78 |
0.15 |
0.65 |
1.70 |
▆▇▆▇▆ |
V37 |
0 |
1 |
0 |
1 |
-1.91 |
-0.78 |
0.10 |
0.73 |
1.95 |
▅▅▇▇▃ |
V38 |
0 |
1 |
0 |
1 |
-2.29 |
-0.72 |
0.17 |
0.68 |
2.21 |
▂▇▇▇▂ |
V39 |
0 |
1 |
0 |
1 |
-2.00 |
-0.92 |
0.21 |
0.61 |
2.19 |
▃▆▃▇▂ |
V40 |
0 |
1 |
0 |
1 |
-2.67 |
-0.76 |
0.08 |
0.72 |
2.35 |
▁▆▇▅▂ |
V41 |
0 |
1 |
0 |
1 |
-1.95 |
-0.69 |
0.04 |
0.76 |
1.88 |
▅▆▇▇▅ |
V42 |
0 |
1 |
0 |
1 |
-2.16 |
-0.70 |
-0.05 |
0.53 |
2.08 |
▃▅▇▃▃ |
V43 |
0 |
1 |
0 |
1 |
-1.90 |
-0.60 |
0.00 |
0.91 |
2.27 |
▃▅▇▅▁ |
V44 |
0 |
1 |
0 |
1 |
-2.07 |
-0.54 |
0.00 |
0.70 |
2.13 |
▃▆▇▇▃ |
V45 |
0 |
1 |
0 |
1 |
-2.18 |
-0.73 |
0.14 |
0.50 |
2.10 |
▂▇▇▇▃ |
V46 |
0 |
1 |
0 |
1 |
-2.21 |
-0.71 |
0.16 |
0.71 |
2.02 |
▂▇▇▇▂ |
V47 |
0 |
1 |
0 |
1 |
-1.90 |
-0.72 |
0.12 |
0.70 |
2.45 |
▃▅▇▃▁ |
V48 |
0 |
1 |
0 |
1 |
-2.18 |
-0.55 |
0.05 |
0.64 |
2.60 |
▂▅▇▃▁ |
V49 |
0 |
1 |
0 |
1 |
-1.98 |
-0.74 |
0.20 |
0.68 |
2.37 |
▃▇▇▇▁ |
V50 |
0 |
1 |
0 |
1 |
-2.56 |
-0.70 |
0.03 |
0.60 |
2.99 |
▁▆▇▃▁ |
V51 |
0 |
1 |
0 |
1 |
-2.13 |
-0.57 |
0.02 |
0.44 |
2.22 |
▂▃▇▂▂ |
V52 |
0 |
1 |
0 |
1 |
-2.05 |
-0.56 |
0.06 |
0.66 |
2.08 |
▂▅▇▅▂ |
V53 |
0 |
1 |
0 |
1 |
-2.14 |
-0.49 |
-0.05 |
0.46 |
2.51 |
▂▃▇▂▁ |
V54 |
0 |
1 |
0 |
1 |
-1.96 |
-0.74 |
-0.16 |
0.74 |
2.00 |
▃▆▆▇▃ |
V55 |
0 |
1 |
0 |
1 |
-2.20 |
-0.62 |
-0.02 |
0.60 |
1.89 |
▃▃▇▆▅ |
V56 |
0 |
1 |
0 |
1 |
-1.81 |
-0.58 |
-0.05 |
0.78 |
2.11 |
▆▇▇▇▃ |
V57 |
0 |
1 |
0 |
1 |
-2.16 |
-0.71 |
-0.14 |
0.67 |
1.97 |
▂▆▇▇▃ |
V58 |
0 |
1 |
0 |
1 |
-2.07 |
-0.62 |
-0.07 |
0.79 |
2.09 |
▅▆▇▇▃ |
V59 |
0 |
1 |
0 |
1 |
-1.66 |
-0.73 |
0.00 |
0.59 |
2.31 |
▅▅▇▃▁ |
V60 |
0 |
1 |
0 |
1 |
-2.01 |
-0.60 |
-0.12 |
0.64 |
2.72 |
▃▇▇▃▁ |
###################################
# Verifying the data dimensions (rows, columns)
# for the test set, response column included
###################################
dim(PMA_PreModelling_Test)
## [1] 41 61
1.5 Predictive Model Development and Hyperparameter Tuning
1.5.1 Support Vector Machine - Radial Basis Function Kernel - Manual
Grid Search (SVM_R_MGS)
[A] The support vector machine (radial basis function
kernel) model from the
kernlab
package was implemented through the
caret
package.
[B] The model contains 2 hyperparameters:
[B.1] sigma =
sigma
[B.2] C =
cost
[C] Performance of the applied manual grid search
method for hyperparameter tuning is summarized as follows:
[C.1] Final model configuration involves
sigma=0.03000 and C=2
[C.2] Cross-Validation ROC Curve AUC =
0.96629
[C.3] Test ROC Curve AUC = 0.93780
[D] The model does not allow for ranking of predictors
in terms of variable importance.
##################################
# Transforming factor predictors
# as required by the nature of the model
##################################
# Creating a local object
# for the train and test sets
##################################
# Train set: every predictor is stored as numeric and the Class response
# is appended as the last column.
# Columns that are already numeric are passed through unchanged, because
# a round trip through as.character() only keeps 15 significant digits
# and would silently alter the values.
PMA_PreModelling_Train_SVM_R <- as.data.frame(lapply(
  PMA_PreModelling_Train[, !names(PMA_PreModelling_Train) %in% c("Class"), drop = FALSE],
  function(x) if (is.numeric(x)) x else as.numeric(as.character(x))))
PMA_PreModelling_Train_SVM_R$Class <- PMA_PreModelling_Train$Class
dim(PMA_PreModelling_Train_SVM_R)
## [1] 167 61
# Test set: every predictor is stored as numeric and the Class response
# is appended as the last column.
# Columns that are already numeric are passed through unchanged, because
# a round trip through as.character() only keeps 15 significant digits
# and would silently alter the values.
PMA_PreModelling_Test_SVM_R <- as.data.frame(lapply(
  PMA_PreModelling_Test[, !names(PMA_PreModelling_Test) %in% c("Class"), drop = FALSE],
  function(x) if (is.numeric(x)) x else as.numeric(as.character(x))))
PMA_PreModelling_Test_SVM_R$Class <- PMA_PreModelling_Test$Class
dim(PMA_PreModelling_Test_SVM_R)
## [1] 41 61
##################################
# Defining the resampling scheme:
# one fixed set of 10 training-fold index vectors,
# shared by the random-search and grid-search controls
##################################
set.seed(12345678)
KFold_Indices <- createFolds(
  y = PMA_PreModelling_Train_SVM_R$Class,
  k = 10,
  returnTrain = TRUE
)
# Control object for random hyperparameter search
KFold_Control_RandomSearch <- trainControl(
  method = "cv",
  search = "random",
  index = KFold_Indices,
  classProbs = TRUE,
  summaryFunction = twoClassSummary
)
# Control object for grid-based hyperparameter search
KFold_Control_GridSearch <- trainControl(
  method = "cv",
  index = KFold_Indices,
  classProbs = TRUE,
  summaryFunction = twoClassSummary
)
##################################
# Defining the candidate values
# for the manual grid search:
# 6 sigma values crossed with 12 cost values
##################################
SVM_R_Grid <- expand.grid(
  sigma = c(0.030, 0.025, 0.020, 0.015, 0.010, 0.005),
  C = 2^seq(-2, 9)
)
##################################
# Fitting the support vector machine (radial basis function kernel) model
# through the caret method 'svmRadial',
# evaluating every row of the manually specified grid
##################################
set.seed(12345678)
SVM_R_Tune_GridSearch_Manual <- train(
  x = PMA_PreModelling_Train_SVM_R[, setdiff(names(PMA_PreModelling_Train_SVM_R), "Class")],
  y = PMA_PreModelling_Train_SVM_R$Class,
  method = "svmRadial",
  preProcess = c("center", "scale"),
  metric = "ROC",
  tuneGrid = SVM_R_Grid,
  trControl = KFold_Control_GridSearch,
  # NOTE(review): returnResamp is documented as a trainControl() argument;
  # confirm whether it has any effect when supplied to train() here
  returnResamp = "all"
)
# Displaying the model refitted with the selected hyperparameters
SVM_R_Tune_GridSearch_Manual$finalModel
## Support Vector Machine object of class "ksvm"
##
## SV type: C-svc (classification)
## parameter : cost C = 2
##
## Gaussian Radial Basis kernel function.
## Hyperparameter : sigma = 0.03
##
## Number of Support Vectors : 156
##
## Objective Function Value : -54.7816
## Training error : 0
## Probability model included.
# Displaying the cross-validated performance (ROC, Sens, Spec and their
# standard deviations) for every sigma and C combination in the grid
SVM_R_Tune_GridSearch_Manual$results
## sigma C ROC Sens Spec ROCSD SensSD SpecSD
## 1 0.005 0.25 0.8260169 0.7888889 0.7321429 0.13266813 0.16101530 0.21179103
## 2 0.005 0.50 0.8414683 0.8333333 0.7303571 0.13145729 0.14103284 0.18412976
## 3 0.005 1.00 0.8714286 0.8777778 0.7696429 0.10662639 0.11049210 0.19351190
## 4 0.005 2.00 0.8863095 0.8888889 0.7964286 0.09551350 0.11712139 0.19852774
## 5 0.005 4.00 0.8880952 0.9111111 0.8089286 0.10012450 0.11475506 0.19652775
## 6 0.005 8.00 0.8938492 0.8888889 0.7964286 0.09055897 0.11712139 0.18675684
## 7 0.005 16.00 0.9109375 0.9000000 0.7821429 0.08273844 0.12227833 0.14526737
## 8 0.005 32.00 0.9151042 0.8652778 0.8214286 0.08203275 0.17198934 0.12371791
## 9 0.005 64.00 0.9151042 0.8652778 0.8214286 0.08203275 0.17198934 0.12371791
## 10 0.005 128.00 0.9151042 0.8763889 0.8214286 0.08203275 0.16927613 0.12371791
## 11 0.005 256.00 0.9151042 0.8986111 0.8214286 0.08203275 0.09756333 0.12371791
## 12 0.005 512.00 0.9151042 0.8652778 0.8071429 0.08203275 0.17198934 0.10806482
## 13 0.010 0.25 0.8458333 0.8222222 0.7196429 0.13137608 0.16728281 0.19742712
## 14 0.010 0.50 0.8684524 0.8444444 0.7821429 0.11588634 0.11944086 0.18706014
## 15 0.010 1.00 0.9017857 0.8763889 0.7964286 0.08970483 0.11042419 0.19852774
## 16 0.010 2.00 0.9063492 0.9097222 0.8232143 0.09227459 0.11513732 0.20541321
## 17 0.010 4.00 0.9236359 0.8875000 0.8357143 0.08044389 0.11720372 0.16683665
## 18 0.010 8.00 0.9323661 0.9097222 0.8339286 0.07378842 0.11513732 0.12200203
## 19 0.010 16.00 0.9337550 0.9097222 0.8339286 0.07092083 0.11513732 0.12200203
## 20 0.010 32.00 0.9337550 0.8986111 0.8339286 0.07092083 0.11073431 0.12200203
## 21 0.010 64.00 0.9337550 0.9097222 0.8339286 0.07092083 0.11513732 0.12200203
## 22 0.010 128.00 0.9337550 0.8875000 0.8339286 0.07092083 0.11720372 0.12200203
## 23 0.010 256.00 0.9337550 0.8986111 0.8214286 0.07092083 0.11073431 0.12371791
## 24 0.010 512.00 0.9337550 0.9097222 0.8339286 0.07092083 0.11513732 0.12200203
## 25 0.015 0.25 0.8599206 0.8333333 0.7571429 0.12319069 0.14103284 0.21249250
## 26 0.015 0.50 0.8900794 0.8666667 0.7964286 0.10042985 0.10210406 0.19852774
## 27 0.015 1.00 0.9190724 0.8986111 0.8232143 0.07313455 0.11073431 0.20541321
## 28 0.015 2.00 0.9337550 0.9319444 0.8232143 0.06785716 0.09452838 0.16824297
## 29 0.015 4.00 0.9522073 0.9430556 0.8607143 0.05912173 0.09543103 0.12423234
## 30 0.015 8.00 0.9535962 0.9208333 0.8607143 0.05648576 0.09214010 0.09213701
## 31 0.015 16.00 0.9535962 0.9208333 0.8464286 0.05648576 0.11822337 0.10316697
## 32 0.015 32.00 0.9535962 0.9208333 0.8464286 0.05648576 0.10598718 0.10316697
## 33 0.015 64.00 0.9535962 0.9097222 0.8607143 0.05648576 0.11513732 0.09213701
## 34 0.015 128.00 0.9535962 0.9208333 0.8607143 0.05648576 0.11822337 0.09213701
## 35 0.015 256.00 0.9535962 0.9208333 0.8607143 0.05648576 0.11822337 0.09213701
## 36 0.015 512.00 0.9535962 0.8986111 0.8607143 0.05648576 0.11073431 0.09213701
## 37 0.020 0.25 0.8599454 0.7986111 0.7839286 0.12654947 0.12512853 0.21386367
## 38 0.020 0.50 0.9093254 0.8875000 0.7964286 0.08581104 0.07420417 0.19852774
## 39 0.020 1.00 0.9365327 0.8986111 0.8482143 0.06463239 0.11073431 0.16436549
## 40 0.020 2.00 0.9522073 0.9541667 0.8607143 0.05728040 0.07912469 0.12423234
## 41 0.020 4.00 0.9563740 0.9652778 0.8607143 0.05151736 0.07670390 0.09213701
## 42 0.020 8.00 0.9563740 0.9541667 0.8607143 0.05151736 0.07912469 0.09213701
## 43 0.020 16.00 0.9563740 0.9541667 0.8607143 0.05151736 0.07912469 0.09213701
## 44 0.020 32.00 0.9563740 0.9430556 0.8607143 0.05151736 0.10886030 0.09213701
## 45 0.020 64.00 0.9563740 0.9541667 0.8607143 0.05151736 0.07912469 0.09213701
## 46 0.020 128.00 0.9563740 0.9652778 0.8607143 0.05151736 0.07670390 0.09213701
## 47 0.020 256.00 0.9563740 0.9430556 0.8732143 0.05151736 0.07977216 0.10221817
## 48 0.020 512.00 0.9563740 0.9541667 0.8607143 0.05151736 0.07912469 0.09213701
## 49 0.025 0.25 0.8744296 0.7875000 0.8250000 0.11673972 0.12108938 0.20581815
## 50 0.025 0.50 0.9238095 0.8875000 0.8107143 0.07900159 0.07420417 0.19709483
## 51 0.025 1.00 0.9478423 0.9208333 0.8607143 0.05851844 0.07580444 0.12423234
## 52 0.025 2.00 0.9607391 0.9652778 0.8607143 0.04456594 0.07670390 0.12423234
## 53 0.025 4.00 0.9649058 0.9430556 0.8732143 0.03865818 0.07977216 0.10221817
## 54 0.025 8.00 0.9649058 0.9541667 0.8607143 0.03865818 0.07912469 0.09213701
## 55 0.025 16.00 0.9649058 0.9652778 0.8607143 0.03865818 0.07670390 0.09213701
## 56 0.025 32.00 0.9649058 0.9319444 0.8732143 0.03865818 0.07869009 0.10221817
## 57 0.025 64.00 0.9649058 0.9319444 0.8607143 0.03865818 0.07869009 0.09213701
## 58 0.025 128.00 0.9649058 0.9541667 0.8732143 0.03865818 0.07912469 0.10221817
## 59 0.025 256.00 0.9649058 0.9430556 0.8607143 0.03865818 0.07977216 0.09213701
## 60 0.025 512.00 0.9649058 0.9541667 0.8732143 0.03865818 0.07912469 0.10221817
## 61 0.030 0.25 0.8918899 0.6986111 0.8750000 0.09764678 0.19489036 0.16666667
## 62 0.030 0.50 0.9264137 0.9097222 0.8107143 0.07046736 0.07089534 0.19709483
## 63 0.030 1.00 0.9565724 0.9208333 0.8607143 0.05274606 0.07580444 0.12423234
## 64 0.030 2.00 0.9662946 0.9319444 0.8857143 0.03635921 0.09452838 0.09267381
## 65 0.030 4.00 0.9662946 0.9430556 0.8482143 0.03635921 0.07977216 0.11457367
## 66 0.030 8.00 0.9662946 0.9319444 0.8732143 0.03635921 0.07869009 0.08352444
## 67 0.030 16.00 0.9662946 0.9430556 0.8607143 0.03635921 0.07977216 0.12423234
## 68 0.030 32.00 0.9662946 0.9430556 0.8857143 0.03635921 0.07977216 0.09267381
## 69 0.030 64.00 0.9662946 0.9541667 0.8482143 0.03635921 0.07912469 0.11457367
## 70 0.030 128.00 0.9662946 0.9541667 0.8607143 0.03635921 0.07912469 0.09213701
## 71 0.030 256.00 0.9662946 0.9319444 0.8607143 0.03635921 0.07869009 0.09213701
## 72 0.030 512.00 0.9662946 0.9319444 0.8607143 0.03635921 0.07869009 0.09213701
##################################
# Extracting the cross-validation ROC curve AUC
# of the selected hyperparameter combination
##################################
SVM_R_Train_GridSearch_Manual_ROCCurveAUC <- local({
  tuning_results <- SVM_R_Tune_GridSearch_Manual$results
  best_tune <- SVM_R_Tune_GridSearch_Manual$bestTune
  # Row of the results table holding the selected sigma and C
  is_best <- tuning_results$C == best_tune$C & tuning_results$sigma == best_tune$sigma
  tuning_results[is_best, "ROC"]
})
SVM_R_Train_GridSearch_Manual_ROCCurveAUC
## [1] 0.9662946
##################################
# Scoring the test set with the tuned model:
# observed class next to the predicted class probabilities
##################################
SVM_R_Test_GridSearch_Manual <- local({
  test_predictors <- PMA_PreModelling_Test_SVM_R[, setdiff(names(PMA_PreModelling_Test_SVM_R), "Class")]
  class_probabilities <- predict(SVM_R_Tune_GridSearch_Manual,
                                 test_predictors,
                                 type = "prob")
  data.frame(SVM_R_Observed = PMA_PreModelling_Test_SVM_R$Class,
             SVM_R_Predicted = class_probabilities)
})
SVM_R_Test_GridSearch_Manual
## SVM_R_Observed SVM_R_Predicted.M SVM_R_Predicted.R
## 1 R 0.498313517 0.501686483
## 2 R 0.252823888 0.747176112
## 3 R 0.432002237 0.567997763
## 4 R 0.338372591 0.661627409
## 5 R 0.044367096 0.955632904
## 6 R 0.792956664 0.207043336
## 7 R 0.372759443 0.627240557
## 8 R 0.518259266 0.481740734
## 9 R 0.166319456 0.833680544
## 10 R 0.007429286 0.992570714
## 11 R 0.409778215 0.590221785
## 12 R 0.187868114 0.812131886
## 13 R 0.001983070 0.998016930
## 14 R 0.015027703 0.984972297
## 15 R 0.117879991 0.882120009
## 16 R 0.075579467 0.924420533
## 17 R 0.037979966 0.962020034
## 18 R 0.053744989 0.946255011
## 19 R 0.212654747 0.787345253
## 20 M 0.233770876 0.766229124
## 21 M 0.593526187 0.406473813
## 22 M 0.321828992 0.678171008
## 23 M 0.523709904 0.476290096
## 24 M 0.739237045 0.260762955
## 25 M 0.980117995 0.019882005
## 26 M 0.984706536 0.015293464
## 27 M 0.978129469 0.021870531
## 28 M 0.756532422 0.243467578
## 29 M 0.918799066 0.081200934
## 30 M 0.963846419 0.036153581
## 31 M 0.430839955 0.569160045
## 32 M 0.603029446 0.396970554
## 33 M 0.931623529 0.068376471
## 34 M 0.538337898 0.461662102
## 35 M 0.995676794 0.004323206
## 36 M 0.987575783 0.012424217
## 37 M 0.998846682 0.001153318
## 38 M 0.974067410 0.025932590
## 39 M 0.998920687 0.001079313
## 40 M 0.997777114 0.002222886
## 41 M 0.589129927 0.410870073
##################################
# Computing the ROC curve and its AUC
# from the test set predictions
##################################
SVM_R_Test_GridSearch_Manual_ROC <- roc(
  response = SVM_R_Test_GridSearch_Manual$SVM_R_Observed,
  predictor = SVM_R_Test_GridSearch_Manual$SVM_R_Predicted.R,
  levels = rev(levels(SVM_R_Test_GridSearch_Manual$SVM_R_Observed))
)
# Keeping the AUC as a plain number
SVM_R_Test_GridSearch_Manual_ROCCurveAUC <- as.numeric(auc(SVM_R_Test_GridSearch_Manual_ROC))
SVM_R_Test_GridSearch_Manual_ROCCurveAUC
## [1] 0.937799
1.5.2 Support Vector Machine - Radial Basis Function Kernel -
Automated Grid Search (SVM_R_AGS)
[A] The support vector machine (radial basis function
kernel) model from the
kernlab
package was implemented through the
caret
package.
[B] The model contains 2 hyperparameters:
[B.1] sigma =
sigma
[B.2] C =
cost
[C] Performance of the applied automated grid search
method for hyperparameter tuning is summarized as follows:
[C.1] Final model configuration involves
sigma=0.01012 and C=8
[C.2] Cross-Validation ROC Curve AUC =
0.93514
[C.3] Test ROC Curve AUC = 0.91148
[D] The model does not allow for ranking of predictors
in terms of variable importance.
##################################
# Fitting the 'svmRadial' model with an automated grid search:
# caret generates the candidate grid from tuneLength
##################################
set.seed(12345678)
SVM_R_Tune_GridSearch_Auto <- train(
  x = PMA_PreModelling_Train_SVM_R[, setdiff(names(PMA_PreModelling_Train_SVM_R), "Class")],
  y = PMA_PreModelling_Train_SVM_R$Class,
  method = "svmRadial",
  preProcess = c("center", "scale"),
  metric = "ROC",
  tuneLength = 12,
  trControl = KFold_Control_GridSearch,
  # NOTE(review): returnResamp is documented as a trainControl() argument;
  # confirm whether it has any effect when supplied to train() here
  returnResamp = "all"
)
# Displaying the model refitted with the selected hyperparameters
SVM_R_Tune_GridSearch_Auto$finalModel
## Support Vector Machine object of class "ksvm"
##
## SV type: C-svc (classification)
## parameter : cost C = 8
##
## Gaussian Radial Basis kernel function.
## Hyperparameter : sigma = 0.0101286917816145
##
## Number of Support Vectors : 107
##
## Objective Function Value : -116.4213
## Training error : 0
## Probability model included.
# Displaying the cross-validated performance (ROC, Sens, Spec and their
# standard deviations) for every sigma and C combination evaluated
SVM_R_Tune_GridSearch_Auto$results
## sigma C ROC Sens Spec ROCSD SensSD
## 1 0.01012869 0.25 0.8472222 0.8222222 0.7321429 0.13031954 0.1672828
## 2 0.01012869 0.50 0.8698413 0.8555556 0.7821429 0.11287340 0.1054093
## 3 0.01012869 1.00 0.9017857 0.8986111 0.7964286 0.08970483 0.1107343
## 4 0.01012869 2.00 0.9047867 0.9097222 0.8232143 0.09063022 0.1151373
## 5 0.01012869 4.00 0.9252232 0.9097222 0.7821429 0.08053464 0.1025335
## 6 0.01012869 8.00 0.9351438 0.8986111 0.8339286 0.07128847 0.1224972
## 7 0.01012869 16.00 0.9321925 0.9097222 0.8214286 0.06984564 0.1151373
## 8 0.01012869 32.00 0.9321925 0.8986111 0.8339286 0.06984564 0.1224972
## 9 0.01012869 64.00 0.9321925 0.9097222 0.8214286 0.06984564 0.1151373
## 10 0.01012869 128.00 0.9321925 0.9097222 0.8339286 0.06984564 0.1151373
## 11 0.01012869 256.00 0.9321925 0.8986111 0.8339286 0.06984564 0.1107343
## 12 0.01012869 512.00 0.9321925 0.8986111 0.8339286 0.06984564 0.1107343
## SpecSD
## 1 0.2117910
## 2 0.1870601
## 3 0.1985277
## 4 0.2054132
## 5 0.1870601
## 6 0.1220020
## 7 0.1237179
## 8 0.1220020
## 9 0.1237179
## 10 0.1220020
## 11 0.1220020
## 12 0.1220020
##################################
# Extracting the cross-validation ROC curve AUC
# of the selected hyperparameter combination
##################################
SVM_R_Train_GridSearch_Auto_ROCCurveAUC <- local({
  tuning_results <- SVM_R_Tune_GridSearch_Auto$results
  best_tune <- SVM_R_Tune_GridSearch_Auto$bestTune
  # Row of the results table holding the selected sigma and C
  is_best <- tuning_results$C == best_tune$C & tuning_results$sigma == best_tune$sigma
  tuning_results[is_best, "ROC"]
})
SVM_R_Train_GridSearch_Auto_ROCCurveAUC
## [1] 0.9351438
##################################
# Scoring the test set with the tuned model:
# observed class next to the predicted class probabilities
##################################
SVM_R_Test_GridSearch_Auto <- local({
  test_predictors <- PMA_PreModelling_Test_SVM_R[, setdiff(names(PMA_PreModelling_Test_SVM_R), "Class")]
  class_probabilities <- predict(SVM_R_Tune_GridSearch_Auto,
                                 test_predictors,
                                 type = "prob")
  data.frame(SVM_R_Observed = PMA_PreModelling_Test_SVM_R$Class,
             SVM_R_Predicted = class_probabilities)
})
SVM_R_Test_GridSearch_Auto
## SVM_R_Observed SVM_R_Predicted.M SVM_R_Predicted.R
## 1 R 0.644520120 0.355479880
## 2 R 0.068227365 0.931772635
## 3 R 0.274246432 0.725753568
## 4 R 0.306557842 0.693442158
## 5 R 0.035774492 0.964225508
## 6 R 0.630076881 0.369923119
## 7 R 0.182377193 0.817622807
## 8 R 0.592301365 0.407698635
## 9 R 0.302447356 0.697552644
## 10 R 0.025172209 0.974827791
## 11 R 0.280726484 0.719273516
## 12 R 0.159681848 0.840318152
## 13 R 0.005997015 0.994002985
## 14 R 0.030660134 0.969339866
## 15 R 0.305052217 0.694947783
## 16 R 0.164417547 0.835582453
## 17 R 0.077980882 0.922019118
## 18 R 0.003954464 0.996045536
## 19 R 0.198233556 0.801766444
## 20 M 0.099944550 0.900055450
## 21 M 0.538894167 0.461105833
## 22 M 0.272072827 0.727927173
## 23 M 0.327887741 0.672112259
## 24 M 0.452517107 0.547482893
## 25 M 0.952480800 0.047519200
## 26 M 0.989378821 0.010621179
## 27 M 0.920060058 0.079939942
## 28 M 0.735616095 0.264383905
## 29 M 0.701311496 0.298688504
## 30 M 0.964975997 0.035024003
## 31 M 0.677307373 0.322692627
## 32 M 0.535382523 0.464617477
## 33 M 0.877683205 0.122316795
## 34 M 0.453848395 0.546151605
## 35 M 0.996891033 0.003108967
## 36 M 0.981416116 0.018583884
## 37 M 0.992227874 0.007772126
## 38 M 0.923369542 0.076630458
## 39 M 0.987060650 0.012939350
## 40 M 0.993618617 0.006381383
## 41 M 0.625963239 0.374036761
##################################
# Reporting the independent evaluation results
# for the test set
##################################
# roc() reads levels as c(control, case); reversing the factor levels
# therefore treats the last level as the control group. The predictor is
# the predicted probability of class R, so with R as the control the
# controls score higher than the cases. The direction is stated explicitly
# so the AUC is not oriented automatically from the data.
# NOTE(review): assumes levels(Class) is c("M", "R"), as suggested by the
# order of the predicted probability columns - confirm.
SVM_R_Test_GridSearch_Auto_ROC <- roc(
  response = SVM_R_Test_GridSearch_Auto$SVM_R_Observed,
  predictor = SVM_R_Test_GridSearch_Auto$SVM_R_Predicted.R,
  levels = rev(levels(SVM_R_Test_GridSearch_Auto$SVM_R_Observed)),
  direction = ">"
)
# [1] keeps the bare numeric AUC value.
(SVM_R_Test_GridSearch_Auto_ROCCurveAUC <- auc(SVM_R_Test_GridSearch_Auto_ROC)[1])
## [1] 0.9114833
1.5.3 Support Vector Machine - Radial Basis Function Kernel -
Automated Random Search (SVM_R_ARS)
[A] The support vector machine (radial basis function
kernel) model from the
kernlab
package was implemented through the
caret
package.
[B] The model contains 2 hyperparameters:
[B.1] sigma =
sigma
[B.2] C =
cost
[C] Performance of the applied automated random search
method for hyperparameter tuning is summarized as follows :
[C.1] Final model configuration involves
sigma=0.02458 and C=183
[C.2] Cross-Validation ROC Curve AUC =
0.96213
[C.3] Test ROC Curve AUC = 0.93541
[D] The model does not allow for ranking of predictors
in terms of variable importance.
##################################
# Using an automated random search
##################################
# The seed is fixed immediately before training for reproducibility.
set.seed(12345678)
# tuneLength = 30 requests 30 candidate sigma and C combinations.
# preProcess is spelled out in full instead of relying on partial argument
# matching. returnResamp is a trainControl() option; when given to train()
# it is only forwarded to the underlying model function, so it is not
# passed here.
SVM_R_Tune_RandomSearch_Auto <- train(
  x = PMA_PreModelling_Train_SVM_R[, !names(PMA_PreModelling_Train_SVM_R) %in% c("Class")],
  y = PMA_PreModelling_Train_SVM_R$Class,
  method = "svmRadial",
  tuneLength = 30,
  metric = "ROC",
  preProcess = c("center", "scale"),
  trControl = KFold_Control_RandomSearch
)
SVM_R_Tune_RandomSearch_Auto$finalModel
## Support Vector Machine object of class "ksvm"
##
## SV type: C-svc (classification)
## parameter : cost C = 183.359327063186
##
## Gaussian Radial Basis kernel function.
## Hyperparameter : sigma = 0.0245815648934837
##
## Number of Support Vectors : 145
##
## Objective Function Value : -57.5767
## Training error : 0
## Probability model included.
##################################
# Listing the resampled performance
# of every sampled sigma and C pair
##################################
SVM_R_Tune_RandomSearch_Auto[["results"]]
## sigma C ROC Sens Spec ROCSD SensSD
## 1 0.003062763 0.07968017 0.8188988 0.5736111 0.8357143 0.12762632 0.14404771
## 2 0.003161972 557.27174973 0.8962550 0.8555556 0.7803571 0.08481918 0.16604824
## 3 0.003197104 19.09397071 0.8890873 0.8888889 0.7446429 0.09068061 0.10475656
## 4 0.003260497 311.39561843 0.9031994 0.8666667 0.7785714 0.07868373 0.17213259
## 5 0.003720532 2.43055400 0.8789683 0.9111111 0.7964286 0.09924602 0.10210406
## 6 0.003771907 1.36632284 0.8684524 0.8666667 0.7946429 0.11078013 0.12614360
## 7 0.004808965 0.58526382 0.8428571 0.8444444 0.7428571 0.12813221 0.14054567
## 8 0.004911297 0.20061724 0.8218502 0.6763889 0.7839286 0.13322750 0.16744289
## 9 0.005629841 1.35019752 0.8930556 0.8777778 0.7964286 0.09020180 0.12227833
## 10 0.005642254 185.24975110 0.9180804 0.8875000 0.8214286 0.08155714 0.11720372
## 11 0.006545177 0.09296935 0.8190724 0.6069444 0.8232143 0.13421268 0.18239213
## 12 0.007471523 41.17996347 0.9240327 0.8875000 0.8214286 0.08203642 0.11720372
## 13 0.007619948 0.66682930 0.8696429 0.8666667 0.7821429 0.10902894 0.10210406
## 14 0.008007937 412.07218822 0.9226438 0.8875000 0.8214286 0.08071429 0.11720372
## 15 0.010394362 0.08678718 0.8303819 0.6527778 0.8089286 0.13758504 0.17531571
## 16 0.011821589 13.07907674 0.9438740 0.9208333 0.8464286 0.06384653 0.10598718
## 17 0.012384606 0.35581028 0.8587302 0.8333333 0.7678571 0.12842887 0.14103284
## 18 0.013580611 40.75126495 0.9522073 0.9208333 0.8464286 0.05614662 0.11822337
## 19 0.014508407 16.69657765 0.9535962 0.9430556 0.8607143 0.05648576 0.10886030
## 20 0.014591810 116.40343712 0.9535962 0.9097222 0.8464286 0.05648576 0.11513732
## 21 0.014725783 38.54782847 0.9535962 0.9208333 0.8464286 0.05648576 0.10598718
## 22 0.014997325 1.29929841 0.9232391 0.9097222 0.8232143 0.06686697 0.11513732
## 23 0.015734712 0.10535004 0.8474454 0.6527778 0.8089286 0.13659952 0.18297290
## 24 0.021462739 481.87920593 0.9577629 0.9652778 0.8607143 0.04921559 0.07670390
## 25 0.023003220 0.06568199 0.8655010 0.6986111 0.8500000 0.12355617 0.18026449
## 26 0.023225821 0.25030006 0.8684772 0.7875000 0.8107143 0.12076345 0.12108938
## 27 0.023466522 0.30388775 0.8787946 0.8208333 0.7982143 0.11241867 0.09269670
## 28 0.024581565 183.35932706 0.9621280 0.9541667 0.8607143 0.04223332 0.07912469
## 29 0.026812304 0.32643705 0.8934772 0.8430556 0.8107143 0.09835957 0.09306592
## 30 0.027283954 1.12493978 0.9579613 0.9430556 0.8607143 0.05006799 0.07977216
## SpecSD
## 1 0.17693696
## 2 0.12628310
## 3 0.17578178
## 4 0.16042743
## 5 0.19852774
## 6 0.18884157
## 7 0.18038759
## 8 0.19695996
## 9 0.19852774
## 10 0.12371791
## 11 0.16824297
## 12 0.12371791
## 13 0.18706014
## 14 0.12371791
## 15 0.19029940
## 16 0.10316697
## 17 0.19795582
## 18 0.10316697
## 19 0.09213701
## 20 0.10316697
## 21 0.10316697
## 22 0.20541321
## 23 0.19029940
## 24 0.09213701
## 25 0.21889876
## 26 0.19709483
## 27 0.21346569
## 28 0.09213701
## 29 0.19709483
## 30 0.12423234
##################################
# Extracting the cross-validated ROC curve AUC
# of the selected sigma and C combination
##################################
# The results row whose tuning values equal bestTune holds the reported AUC;
# the outer parentheses print the value while assigning it.
(SVM_R_Train_RandomSearch_Auto_ROCCurveAUC <- with(
  SVM_R_Tune_RandomSearch_Auto,
  results[results$sigma == bestTune$sigma & results$C == bestTune$C, "ROC"]
))
## [1] 0.962128
##################################
# Scoring the test set with the tuned model
# and pairing the observed classes
# with the predicted class probabilities
##################################
SVM_R_Test_RandomSearch_Auto <- data.frame(
  SVM_R_Observed = PMA_PreModelling_Test_SVM_R$Class,
  # Every column except the response is passed as a predictor.
  SVM_R_Predicted = predict(
    SVM_R_Tune_RandomSearch_Auto,
    newdata = PMA_PreModelling_Test_SVM_R[, names(PMA_PreModelling_Test_SVM_R) != "Class"],
    type = "prob"
  )
)
SVM_R_Test_RandomSearch_Auto
## SVM_R_Observed SVM_R_Predicted.M SVM_R_Predicted.R
## 1 R 0.4461118516 0.5538881484
## 2 R 0.0917810626 0.9082189374
## 3 R 0.3003831045 0.6996168955
## 4 R 0.2113270453 0.7886729547
## 5 R 0.0110686706 0.9889313294
## 6 R 0.7937314777 0.2062685223
## 7 R 0.2302256584 0.7697743416
## 8 R 0.4654496465 0.5345503535
## 9 R 0.0718372374 0.9281627626
## 10 R 0.0017355436 0.9982644564
## 11 R 0.2759847540 0.7240152460
## 12 R 0.0883429855 0.9116570145
## 13 R 0.0003928833 0.9996071167
## 14 R 0.0033157653 0.9966842347
## 15 R 0.0791601264 0.9208398736
## 16 R 0.0344303176 0.9655696824
## 17 R 0.0100423552 0.9899576448
## 18 R 0.0063847677 0.9936152323
## 19 R 0.1556639090 0.8443360910
## 20 M 0.1100785449 0.8899214551
## 21 M 0.5315453448 0.4684546552
## 22 M 0.1856970809 0.8143029191
## 23 M 0.3843546079 0.6156453921
## 24 M 0.6661845595 0.3338154405
## 25 M 0.9900636439 0.0099363561
## 26 M 0.9955404724 0.0044595276
## 27 M 0.9829171642 0.0170828358
## 28 M 0.7646969623 0.2353030377
## 29 M 0.9260418003 0.0739581997
## 30 M 0.9831113603 0.0168886397
## 31 M 0.4164007909 0.5835992091
## 32 M 0.5178765245 0.4821234755
## 33 M 0.9468755388 0.0531244612
## 34 M 0.5335050901 0.4664949099
## 35 M 0.9986015025 0.0013984975
## 36 M 0.9948842061 0.0051157939
## 37 M 0.9994503690 0.0005496310
## 38 M 0.9797647607 0.0202352393
## 39 M 0.9994629054 0.0005370946
## 40 M 0.9994536506 0.0005463494
## 41 M 0.5222730772 0.4777269228
##################################
# Reporting the independent evaluation results
# for the test set
##################################
# roc() reads levels as c(control, case); reversing the factor levels
# therefore treats the last level as the control group. The predictor is
# the predicted probability of class R, so with R as the control the
# controls score higher than the cases. The direction is stated explicitly
# so the AUC is not oriented automatically from the data.
# NOTE(review): assumes levels(Class) is c("M", "R"), as suggested by the
# order of the predicted probability columns - confirm.
SVM_R_Test_RandomSearch_Auto_ROC <- roc(
  response = SVM_R_Test_RandomSearch_Auto$SVM_R_Observed,
  predictor = SVM_R_Test_RandomSearch_Auto$SVM_R_Predicted.R,
  levels = rev(levels(SVM_R_Test_RandomSearch_Auto$SVM_R_Observed)),
  direction = ">"
)
# [1] keeps the bare numeric AUC value.
(SVM_R_Test_RandomSearch_Auto_ROCCurveAUC <- auc(SVM_R_Test_RandomSearch_Auto_ROC)[1])
## [1] 0.9354067
1.5.4 Regularized Discriminant Analysis - Manual Grid Search
(RDA_MGS)
[A] The regularized discriminant analysis model from
the klaR
package was implemented through the
caret
package.
[B] The model contains 2 hyperparameters:
[B.1] gamma =
gamma
[B.2] lambda =
lambda
[C] Performance of the applied manual grid search
method for hyperparameter tuning is summarized as follows :
[C.1] Final model configuration involves
gamma=0.60 and lambda=0.20
[C.2] Cross-Validation ROC Curve AUC =
0.91151
[C.3] Test ROC Curve AUC = 0.87560
[D] The model does not allow for ranking of predictors
in terms of variable importance.
##################################
# Creating a local object
# for the train and test sets
##################################
PMA_PreModelling_Train_RDA <- PMA_PreModelling_Train
PMA_PreModelling_Test_RDA <- PMA_PreModelling_Test

##################################
# Creating consistent fold assignments
# for the 10-Fold Cross Validation process
##################################
# The seed is fixed before the folds are drawn, and the same training-row
# indices are handed to both control objects so that the grid and random
# searches are resampled on identical folds.
set.seed(12345678)
KFold_Indices <- createFolds(PMA_PreModelling_Train_RDA$Class,
                             k = 10,
                             returnTrain = TRUE)

# returnResamp is set here because it is a trainControl() option: "all"
# keeps the resampled metrics of every candidate combination rather than
# only those of the selected one.
KFold_Control_RandomSearch <- trainControl(method = "cv",
                                           index = KFold_Indices,
                                           summaryFunction = twoClassSummary,
                                           classProbs = TRUE,
                                           returnResamp = "all",
                                           search = "random")
KFold_Control_GridSearch <- trainControl(method = "cv",
                                         index = KFold_Indices,
                                         summaryFunction = twoClassSummary,
                                         classProbs = TRUE,
                                         returnResamp = "all")
##################################
# Setting the conditions
# for hyperparameter tuning
##################################
# Full factorial grid: 6 gamma values x 6 lambda values (0 to 1 in steps
# of 0.20) = 36 candidate combinations.
RDA_Grid <- expand.grid(gamma = seq(0, 1, by = 0.20),
                        lambda = seq(0, 1, by = 0.20))
##################################
# Running the regularized discriminant analysis model
# by setting the caret method to 'rda'
##################################
##################################
# Using a manual grid search
##################################
# The seed is fixed immediately before training for reproducibility.
set.seed(12345678)
# tuneGrid supplies the explicit gamma and lambda candidates.
# returnResamp is a trainControl() option; when given to train() it is only
# forwarded to the underlying model function, so it is not passed here.
RDA_Tune_GridSearch_Manual <- train(
  x = PMA_PreModelling_Train_RDA[, !names(PMA_PreModelling_Train_RDA) %in% c("Class")],
  y = PMA_PreModelling_Train_RDA$Class,
  method = "rda",
  tuneGrid = RDA_Grid,
  metric = "ROC",
  trControl = KFold_Control_GridSearch
)
RDA_Tune_GridSearch_Manual$finalModel
## Call:
## rda.default(x = x, grouping = y, gamma = param$gamma, lambda = param$lambda,
## returnResamp = "all")
##
## Regularization parameters:
## gamma lambda
## 0.6 0.2
##
## Prior probabilities of groups:
## M R
## 0.5329341 0.4670659
##
## Misclassification rate:
## apparent: 1.198 %
##################################
# Listing the resampled performance
# of every evaluated gamma and lambda pair
##################################
RDA_Tune_GridSearch_Manual[["results"]]
## gamma lambda ROC Sens Spec ROCSD SensSD SpecSD
## 1 0.0 0.0 0.7171627 0.9222222 0.4571429 0.06867241 0.07499428 0.1733768
## 2 0.0 0.2 0.8133681 0.8444444 0.6678571 0.08674264 0.10734353 0.1307358
## 3 0.0 0.4 0.8363343 0.8222222 0.6803571 0.08956975 0.14054567 0.1025642
## 4 0.0 0.6 0.8492063 0.8222222 0.6785714 0.10624090 0.14054567 0.1240040
## 5 0.0 0.8 0.8196429 0.8333333 0.7267857 0.14048419 0.14103284 0.1925207
## 6 0.0 1.0 0.7682540 0.8000000 0.6750000 0.16285642 0.17213259 0.1911817
## 7 0.2 0.0 0.8772569 0.8777778 0.7339286 0.09411413 0.11049210 0.1905599
## 8 0.2 0.2 0.8867560 0.8333333 0.7714286 0.07884393 0.10798059 0.1944890
## 9 0.2 0.4 0.8819692 0.8222222 0.7553571 0.07297162 0.13042087 0.1653969
## 10 0.2 0.6 0.8773810 0.8333333 0.7821429 0.08398065 0.14103284 0.1189881
## 11 0.2 0.8 0.8648810 0.8444444 0.7446429 0.10567719 0.14998857 0.1656110
## 12 0.2 1.0 0.8486111 0.8666667 0.7428571 0.12071166 0.14628458 0.1897680
## 13 0.4 0.0 0.9031994 0.8666667 0.8250000 0.08263177 0.08764563 0.1787301
## 14 0.4 0.2 0.8970486 0.8444444 0.8232143 0.07794668 0.10734353 0.1461549
## 15 0.4 0.4 0.8934772 0.8444444 0.8232143 0.07570235 0.13042087 0.1337499
## 16 0.4 0.6 0.8863095 0.8555556 0.7964286 0.08527926 0.14861039 0.1597635
## 17 0.4 0.8 0.8734127 0.8555556 0.7964286 0.10487258 0.14861039 0.1985277
## 18 0.4 1.0 0.8527778 0.8666667 0.7428571 0.13214875 0.15537909 0.1803876
## 19 0.6 0.0 0.9075645 0.8555556 0.8500000 0.08292834 0.07499428 0.1419116
## 20 0.6 0.2 0.9115079 0.8555556 0.8232143 0.08101892 0.10540926 0.1337499
## 21 0.6 0.4 0.8980159 0.8777778 0.8107143 0.08518765 0.11049210 0.1786111
## 22 0.6 0.6 0.8894841 0.8555556 0.8232143 0.09272973 0.14861039 0.2054132
## 23 0.6 0.8 0.8634921 0.8444444 0.7964286 0.11629234 0.16728281 0.1985277
## 24 0.6 1.0 0.8503968 0.8666667 0.8089286 0.13213021 0.13658584 0.1992137
## 25 0.8 0.0 0.8888889 0.8222222 0.7964286 0.10146460 0.11944086 0.2153084
## 26 0.8 0.2 0.8837302 0.8444444 0.7964286 0.10085144 0.13042087 0.2070881
## 27 0.8 0.4 0.8748016 0.8333333 0.7839286 0.10354892 0.15930232 0.2055856
## 28 0.8 0.6 0.8664683 0.8555556 0.7839286 0.11154351 0.13907395 0.2055856
## 29 0.8 0.8 0.8549603 0.8444444 0.7839286 0.11584255 0.14054567 0.2055856
## 30 0.8 1.0 0.8418651 0.8333333 0.7714286 0.12604040 0.15044516 0.2032196
## 31 1.0 0.0 0.8085813 0.6986111 0.6928571 0.13201497 0.15577173 0.2070197
## 32 1.0 0.2 0.8113591 0.7208333 0.6928571 0.13044117 0.16620951 0.2070197
## 33 1.0 0.4 0.8099702 0.7319444 0.6928571 0.13302295 0.15741422 0.2070197
## 34 1.0 0.6 0.8099702 0.7319444 0.7053571 0.13302295 0.15741422 0.2144923
## 35 1.0 0.8 0.8115575 0.7430556 0.7053571 0.13163130 0.15621141 0.2144923
## 36 1.0 1.0 0.8115575 0.7541667 0.7053571 0.13163130 0.13514237 0.2144923
##################################
# Extracting the cross-validated ROC curve AUC
# of the selected gamma and lambda combination
##################################
# The results row whose tuning values equal bestTune holds the reported AUC;
# the outer parentheses print the value while assigning it.
(RDA_Train_GridSearch_Manual_ROCCurveAUC <- with(
  RDA_Tune_GridSearch_Manual,
  results[results$lambda == bestTune$lambda & results$gamma == bestTune$gamma, "ROC"]
))
## [1] 0.9115079
##################################
# Scoring the test set with the tuned model
# and pairing the observed classes
# with the predicted class probabilities
##################################
RDA_Test_GridSearch_Manual <- data.frame(
  RDA_Observed = PMA_PreModelling_Test_RDA$Class,
  # Every column except the response is passed as a predictor.
  RDA_Predicted = predict(
    RDA_Tune_GridSearch_Manual,
    newdata = PMA_PreModelling_Test_RDA[, names(PMA_PreModelling_Test_RDA) != "Class"],
    type = "prob"
  )
)
RDA_Test_GridSearch_Manual
## RDA_Observed RDA_Predicted.M RDA_Predicted.R
## 1 R 9.939995e-01 6.000456e-03
## 2 R 8.730765e-02 9.126924e-01
## 3 R 3.181359e-02 9.681864e-01
## 4 R 1.527510e-01 8.472490e-01
## 5 R 2.466817e-03 9.975332e-01
## 6 R 9.772080e-01 2.279197e-02
## 7 R 9.486356e-01 5.136444e-02
## 8 R 8.394090e-01 1.605910e-01
## 9 R 1.672424e-01 8.327576e-01
## 10 R 1.587608e-06 9.999984e-01
## 11 R 1.833434e-01 8.166566e-01
## 12 R 3.039097e-02 9.696090e-01
## 13 R 2.978790e-04 9.997021e-01
## 14 R 4.653232e-03 9.953468e-01
## 15 R 5.380004e-02 9.462000e-01
## 16 R 2.995551e-02 9.700445e-01
## 17 R 3.556569e-02 9.644343e-01
## 18 R 6.449323e-03 9.935507e-01
## 19 R 7.450271e-04 9.992550e-01
## 20 M 1.813621e-01 8.186379e-01
## 21 M 9.661042e-01 3.389583e-02
## 22 M 5.517503e-02 9.448250e-01
## 23 M 7.867602e-01 2.132398e-01
## 24 M 5.603520e-01 4.396480e-01
## 25 M 8.893416e-01 1.106584e-01
## 26 M 9.962381e-01 3.761856e-03
## 27 M 8.889273e-01 1.110727e-01
## 28 M 6.322747e-01 3.677253e-01
## 29 M 9.999874e-01 1.264440e-05
## 30 M 1.000000e+00 3.973935e-08
## 31 M 1.451264e-01 8.548736e-01
## 32 M 2.717538e-01 7.282462e-01
## 33 M 9.985120e-01 1.488016e-03
## 34 M 3.199744e-01 6.800256e-01
## 35 M 9.999998e-01 1.507811e-07
## 36 M 9.997433e-01 2.566687e-04
## 37 M 9.996882e-01 3.118405e-04
## 38 M 9.952482e-01 4.751830e-03
## 39 M 9.989721e-01 1.027896e-03
## 40 M 9.999744e-01 2.561338e-05
## 41 M 5.232949e-01 4.767051e-01
##################################
# Reporting the independent evaluation results
# for the test set
##################################
# roc() reads levels as c(control, case); reversing the factor levels
# therefore treats the last level as the control group. The predictor is
# the predicted probability of class R, so with R as the control the
# controls score higher than the cases. The direction is stated explicitly
# so the AUC is not oriented automatically from the data.
# NOTE(review): assumes levels(Class) is c("M", "R"), as suggested by the
# order of the printed group priors and probability columns - confirm.
RDA_Test_GridSearch_Manual_ROC <- roc(
  response = RDA_Test_GridSearch_Manual$RDA_Observed,
  predictor = RDA_Test_GridSearch_Manual$RDA_Predicted.R,
  levels = rev(levels(RDA_Test_GridSearch_Manual$RDA_Observed)),
  direction = ">"
)
# [1] keeps the bare numeric AUC value.
(RDA_Test_GridSearch_Manual_ROCCurveAUC <- auc(RDA_Test_GridSearch_Manual_ROC)[1])
## [1] 0.8755981
1.5.5 Regularized Discriminant Analysis - Automated Grid Search
(RDA_AGS)
[A] The regularized discriminant analysis model from
the klaR
package was implemented through the
caret
package.
[B] The model contains 2 hyperparameters:
[B.1] gamma =
gamma
[B.2] lambda =
lambda
[C] Performance of the applied automated grid search
method for hyperparameter tuning is summarized as follows :
[C.1] Final model configuration involves
gamma=0.57 and lambda=0.14
[C.2] Cross-Validation ROC Curve AUC =
0.91014
[C.3] Test ROC Curve AUC = 0.87081
[D] The model does not allow for ranking of predictors
in terms of variable importance.
##################################
# Using an automated grid search
##################################
# The seed is fixed immediately before training for reproducibility.
set.seed(12345678)
# tuneLength = 8 lets caret build the grid itself; the printed results show
# 8 gamma values crossed with 8 lambda values (64 combinations).
# returnResamp is a trainControl() option; when given to train() it is only
# forwarded to the underlying model function, so it is not passed here.
RDA_Tune_GridSearch_Auto <- train(
  x = PMA_PreModelling_Train_RDA[, !names(PMA_PreModelling_Train_RDA) %in% c("Class")],
  y = PMA_PreModelling_Train_RDA$Class,
  method = "rda",
  tuneLength = 8,
  metric = "ROC",
  trControl = KFold_Control_GridSearch
)
RDA_Tune_GridSearch_Auto$finalModel
## Call:
## rda.default(x = x, grouping = y, gamma = param$gamma, lambda = param$lambda,
## returnResamp = "all")
##
## Regularization parameters:
## gamma lambda
## 0.5714286 0.1428571
##
## Prior probabilities of groups:
## M R
## 0.5329341 0.4670659
##
## Misclassification rate:
## apparent: 1.198 %
##################################
# Listing the resampled performance
# of every evaluated gamma and lambda pair
##################################
RDA_Tune_GridSearch_Auto[["results"]]
## gamma lambda ROC Sens Spec ROCSD SensSD
## 1 0.0000000 0.0000000 0.7171627 0.9222222 0.4571429 0.06867241 0.07499428
## 2 0.0000000 0.1428571 0.8037698 0.8444444 0.6303571 0.08966800 0.10734353
## 3 0.0000000 0.2857143 0.8246776 0.8555556 0.6928571 0.08393174 0.10540926
## 4 0.0000000 0.4285714 0.8349454 0.8222222 0.6803571 0.08996755 0.14054567
## 5 0.0000000 0.5714286 0.8535714 0.8222222 0.6928571 0.09877685 0.14054567
## 6 0.0000000 0.7142857 0.8442460 0.8444444 0.7160714 0.10832942 0.14054567
## 7 0.0000000 0.8571429 0.8033730 0.8333333 0.6982143 0.15380205 0.14103284
## 8 0.0000000 1.0000000 0.7682540 0.8000000 0.6750000 0.16285642 0.17213259
## 9 0.1428571 0.0000000 0.8730903 0.8777778 0.7339286 0.09736862 0.11049210
## 10 0.1428571 0.1428571 0.8750744 0.8666667 0.7321429 0.07161015 0.11475506
## 11 0.1428571 0.2857143 0.8792163 0.8444444 0.7696429 0.07041525 0.13042087
## 12 0.1428571 0.4285714 0.8776042 0.8333333 0.7678571 0.07485983 0.13094570
## 13 0.1428571 0.5714286 0.8716270 0.8111111 0.7428571 0.08441315 0.13907395
## 14 0.1428571 0.7142857 0.8601190 0.8444444 0.7821429 0.09937926 0.13042087
## 15 0.1428571 0.8571429 0.8587302 0.8555556 0.7428571 0.10951231 0.13907395
## 16 0.1428571 1.0000000 0.8523810 0.8444444 0.7285714 0.11916587 0.17529125
## 17 0.2857143 0.0000000 0.8972718 0.8555556 0.7714286 0.08916942 0.10540926
## 18 0.2857143 0.1428571 0.8914931 0.8444444 0.7982143 0.08791021 0.09369712
## 19 0.2857143 0.2857143 0.8912946 0.8333333 0.7964286 0.07678601 0.13094570
## 20 0.2857143 0.4285714 0.8905010 0.8333333 0.8089286 0.07040285 0.13094570
## 21 0.2857143 0.5714286 0.8875000 0.8444444 0.8089286 0.07974474 0.14998857
## 22 0.2857143 0.7142857 0.8734127 0.8444444 0.7714286 0.09318499 0.14998857
## 23 0.2857143 0.8571429 0.8676587 0.8555556 0.7571429 0.10970885 0.15757072
## 24 0.2857143 1.0000000 0.8500000 0.8555556 0.7553571 0.13000432 0.15757072
## 25 0.4285714 0.0000000 0.9018105 0.8666667 0.8250000 0.08480293 0.08764563
## 26 0.4285714 0.1428571 0.9014137 0.8444444 0.8232143 0.07891334 0.10734353
## 27 0.4285714 0.2857143 0.8962550 0.8555556 0.8357143 0.07226627 0.10540926
## 28 0.4285714 0.4285714 0.8934772 0.8555556 0.8107143 0.07765896 0.12883353
## 29 0.4285714 0.5714286 0.8821429 0.8555556 0.8107143 0.08884715 0.14861039
## 30 0.4285714 0.7142857 0.8777778 0.8555556 0.7839286 0.10030346 0.14861039
## 31 0.4285714 0.8571429 0.8662698 0.8555556 0.7839286 0.11559212 0.14861039
## 32 0.4285714 1.0000000 0.8501984 0.8666667 0.7553571 0.13183873 0.15537909
## 33 0.5714286 0.0000000 0.9089534 0.8555556 0.8500000 0.08321354 0.07499428
## 34 0.5714286 0.1428571 0.9101438 0.8555556 0.8357143 0.07894354 0.10540926
## 35 0.5714286 0.2857143 0.9037698 0.8666667 0.8107143 0.07659547 0.10210406
## 36 0.5714286 0.4285714 0.8982143 0.8777778 0.8107143 0.08393506 0.11049210
## 37 0.5714286 0.5714286 0.8894841 0.8555556 0.7982143 0.09062560 0.14861039
## 38 0.5714286 0.7142857 0.8809524 0.8555556 0.8232143 0.10067259 0.14861039
## 39 0.5714286 0.8571429 0.8575397 0.8555556 0.7964286 0.12395034 0.14861039
## 40 0.5714286 1.0000000 0.8501984 0.8777778 0.7964286 0.13216348 0.16101530
## 41 0.7142857 0.0000000 0.9065476 0.8430556 0.8625000 0.09452560 0.05634090
## 42 0.7142857 0.1428571 0.9043651 0.8555556 0.8232143 0.08854929 0.10540926
## 43 0.7142857 0.2857143 0.8952381 0.8555556 0.8232143 0.08927739 0.10540926
## 44 0.7142857 0.4285714 0.8950397 0.8444444 0.8107143 0.08920215 0.14054567
## 45 0.7142857 0.5714286 0.8775794 0.8555556 0.7839286 0.10226352 0.12883353
## 46 0.7142857 0.7142857 0.8690476 0.8444444 0.7839286 0.11144916 0.17529125
## 47 0.7142857 0.8571429 0.8591270 0.8444444 0.7964286 0.12080203 0.18294947
## 48 0.7142857 1.0000000 0.8450397 0.8333333 0.7964286 0.13314348 0.17568209
## 49 0.8571429 0.0000000 0.8740079 0.8111111 0.8089286 0.11105697 0.16604824
## 50 0.8571429 0.1428571 0.8738095 0.8222222 0.7964286 0.10755929 0.16728281
## 51 0.8571429 0.2857143 0.8724206 0.8111111 0.7964286 0.10643033 0.16604824
## 52 0.8571429 0.4285714 0.8634921 0.8222222 0.7964286 0.10501013 0.17529125
## 53 0.8571429 0.5714286 0.8591270 0.8111111 0.7964286 0.10688125 0.17411347
## 54 0.8571429 0.7142857 0.8535714 0.8333333 0.7964286 0.11949797 0.16769232
## 55 0.8571429 0.8571429 0.8442460 0.8333333 0.7714286 0.12244652 0.16769232
## 56 0.8571429 1.0000000 0.8386905 0.8333333 0.7839286 0.12595708 0.16769232
## 57 1.0000000 0.0000000 0.8085813 0.6986111 0.6928571 0.13201497 0.15577173
## 58 1.0000000 0.1428571 0.8113591 0.7097222 0.6928571 0.13044117 0.14821324
## 59 1.0000000 0.2857143 0.8113591 0.7208333 0.6928571 0.13044117 0.16620951
## 60 1.0000000 0.4285714 0.8099702 0.7319444 0.6928571 0.13302295 0.15741422
## 61 1.0000000 0.5714286 0.8099702 0.7319444 0.7053571 0.13302295 0.15741422
## 62 1.0000000 0.7142857 0.8115575 0.7430556 0.7053571 0.13163130 0.15621141
## 63 1.0000000 0.8571429 0.8115575 0.7541667 0.7053571 0.13163130 0.13514237
## 64 1.0000000 1.0000000 0.8115575 0.7541667 0.7053571 0.13163130 0.13514237
## SpecSD
## 1 0.1733768
## 2 0.1666773
## 3 0.1026850
## 4 0.1025642
## 5 0.1183910
## 6 0.1466389
## 7 0.2088344
## 8 0.1911817
## 9 0.1905599
## 10 0.1592749
## 11 0.1534860
## 12 0.1590523
## 13 0.1599851
## 14 0.1189881
## 15 0.1326193
## 16 0.1855387
## 17 0.1853476
## 18 0.1679479
## 19 0.1484997
## 20 0.1494154
## 21 0.1373053
## 22 0.1655575
## 23 0.1805839
## 24 0.1943523
## 25 0.1787301
## 26 0.1461549
## 27 0.1319765
## 28 0.1465785
## 29 0.1579794
## 30 0.1879388
## 31 0.1969600
## 32 0.1852042
## 33 0.1419116
## 34 0.1319765
## 35 0.1465785
## 36 0.1786111
## 37 0.1965278
## 38 0.2054132
## 39 0.1985277
## 40 0.1895812
## 41 0.1608355
## 42 0.1877502
## 43 0.1877502
## 44 0.2139879
## 45 0.2055856
## 46 0.2055856
## 47 0.1985277
## 48 0.2070881
## 49 0.1992137
## 50 0.1985277
## 51 0.1985277
## 52 0.1985277
## 53 0.1985277
## 54 0.1985277
## 55 0.2032196
## 56 0.1969600
## 57 0.2070197
## 58 0.2070197
## 59 0.2070197
## 60 0.2070197
## 61 0.2144923
## 62 0.2144923
## 63 0.2144923
## 64 0.2144923
##################################
# Extracting the cross-validated ROC curve AUC
# of the selected gamma and lambda combination
##################################
# The results row whose tuning values equal bestTune holds the reported AUC;
# the outer parentheses print the value while assigning it.
(RDA_Train_GridSearch_Auto_ROCCurveAUC <- with(
  RDA_Tune_GridSearch_Auto,
  results[results$lambda == bestTune$lambda & results$gamma == bestTune$gamma, "ROC"]
))
## [1] 0.9101438
##################################
# Scoring the test set with the tuned model
# and pairing the observed classes
# with the predicted class probabilities
##################################
RDA_Test_GridSearch_Auto <- data.frame(
  RDA_Observed = PMA_PreModelling_Test_RDA$Class,
  # Every column except the response is passed as a predictor.
  RDA_Predicted = predict(
    RDA_Tune_GridSearch_Auto,
    newdata = PMA_PreModelling_Test_RDA[, names(PMA_PreModelling_Test_RDA) != "Class"],
    type = "prob"
  )
)
RDA_Test_GridSearch_Auto
## RDA_Observed RDA_Predicted.M RDA_Predicted.R
## 1 R 9.973229e-01 2.677145e-03
## 2 R 6.705582e-02 9.329442e-01
## 3 R 1.573949e-02 9.842605e-01
## 4 R 1.929045e-01 8.070955e-01
## 5 R 1.876099e-03 9.981239e-01
## 6 R 9.861147e-01 1.388529e-02
## 7 R 9.626076e-01 3.739240e-02
## 8 R 8.787244e-01 1.212756e-01
## 9 R 1.764799e-01 8.235201e-01
## 10 R 9.506597e-07 9.999990e-01
## 11 R 1.250982e-01 8.749018e-01
## 12 R 2.167116e-02 9.783288e-01
## 13 R 3.972024e-04 9.996028e-01
## 14 R 5.146713e-03 9.948533e-01
## 15 R 5.584792e-02 9.441521e-01
## 16 R 2.447285e-02 9.755271e-01
## 17 R 3.005016e-02 9.699498e-01
## 18 R 4.542990e-03 9.954570e-01
## 19 R 5.643707e-04 9.994356e-01
## 20 M 1.981419e-01 8.018581e-01
## 21 M 9.664446e-01 3.355538e-02
## 22 M 4.726127e-02 9.527387e-01
## 23 M 8.819351e-01 1.180649e-01
## 24 M 6.380720e-01 3.619280e-01
## 25 M 9.078489e-01 9.215112e-02
## 26 M 9.968674e-01 3.132565e-03
## 27 M 8.782702e-01 1.217298e-01
## 28 M 6.556653e-01 3.443347e-01
## 29 M 9.999961e-01 3.904439e-06
## 30 M 1.000000e+00 5.817384e-09
## 31 M 1.141914e-01 8.858086e-01
## 32 M 2.175458e-01 7.824542e-01
## 33 M 9.988325e-01 1.167511e-03
## 34 M 3.347904e-01 6.652096e-01
## 35 M 9.999999e-01 5.034955e-08
## 36 M 9.998912e-01 1.087956e-04
## 37 M 9.998089e-01 1.910645e-04
## 38 M 9.965992e-01 3.400819e-03
## 39 M 9.993342e-01 6.657531e-04
## 40 M 9.999922e-01 7.787154e-06
## 41 M 5.640104e-01 4.359896e-01
##################################
# Reporting the independent evaluation results
# for the test set
##################################
# roc() reads levels as c(control, case); reversing the factor levels
# therefore treats the last level as the control group. The predictor is
# the predicted probability of class R, so with R as the control the
# controls score higher than the cases. The direction is stated explicitly
# so the AUC is not oriented automatically from the data.
# NOTE(review): assumes levels(Class) is c("M", "R"), as suggested by the
# order of the printed group priors and probability columns - confirm.
RDA_Test_GridSearch_Auto_ROC <- roc(
  response = RDA_Test_GridSearch_Auto$RDA_Observed,
  predictor = RDA_Test_GridSearch_Auto$RDA_Predicted.R,
  levels = rev(levels(RDA_Test_GridSearch_Auto$RDA_Observed)),
  direction = ">"
)
# [1] keeps the bare numeric AUC value.
(RDA_Test_GridSearch_Auto_ROCCurveAUC <- auc(RDA_Test_GridSearch_Auto_ROC)[1])
## [1] 0.8708134
1.5.6 Regularized Discriminant Analysis - Automated Random Search
(RDA_ARS)
[A] The regularized discriminant analysis model from
the klaR
package was implemented through the
caret
package.
[B] The model contains 2 hyperparameters:
[B.1] gamma =
gamma
[B.2] lambda =
lambda
[C] Performance of the applied automated random search
method for hyperparameter tuning is summarized as follows :
[C.1] Final model configuration involves
gamma=0.48 and lambda=0.10
[C.2] Cross-Validation ROC Curve AUC =
0.90578
[C.3] Test ROC Curve AUC = 0.86124
[D] The model does not allow for ranking of predictors
in terms of variable importance.
##################################
# Using an automated random search
##################################
# The seed is fixed immediately before training for reproducibility.
set.seed(12345678)
# tuneLength = 36 requests 36 candidate gamma and lambda combinations,
# the same number of candidates as the 6 x 6 manual grid.
# returnResamp is a trainControl() option; when given to train() it is only
# forwarded to the underlying model function, so it is not passed here.
RDA_Tune_RandomSearch_Auto <- train(
  x = PMA_PreModelling_Train_RDA[, !names(PMA_PreModelling_Train_RDA) %in% c("Class")],
  y = PMA_PreModelling_Train_RDA$Class,
  method = "rda",
  tuneLength = 36,
  metric = "ROC",
  trControl = KFold_Control_RandomSearch
)
RDA_Tune_RandomSearch_Auto$finalModel
## Call:
## rda.default(x = x, grouping = y, gamma = param$gamma, lambda = param$lambda,
## returnResamp = "all")
##
## Regularization parameters:
## gamma lambda
## 0.47910538 0.09509243
##
## Prior probabilities of groups:
## M R
## 0.5329341 0.4670659
##
## Misclassification rate:
## apparent: 0.599 %
##################################
# Listing the resampled performance
# of every sampled gamma and lambda pair
##################################
RDA_Tune_RandomSearch_Auto[["results"]]
## gamma lambda ROC Sens Spec ROCSD SensSD
## 1 0.008678354 0.422683378 0.8468502 0.8222222 0.6785714 0.08719860 0.14054567
## 2 0.063090573 0.159730318 0.8601935 0.8444444 0.6928571 0.06592393 0.11944086
## 3 0.092448982 0.851543210 0.8541667 0.8444444 0.7535714 0.10672665 0.13042087
## 4 0.224852071 0.014492429 0.8857639 0.8666667 0.7464286 0.09217891 0.10210406
## 5 0.225922084 0.240498578 0.8839534 0.8333333 0.7839286 0.07936822 0.10798059
## 6 0.242685425 0.479121222 0.8843502 0.8333333 0.7839286 0.07331041 0.13094570
## 7 0.285900684 0.818685511 0.8692460 0.8444444 0.7446429 0.10440292 0.14998857
## 8 0.299823386 0.296154688 0.8926835 0.8333333 0.7964286 0.07517150 0.13094570
## 9 0.309769555 0.501567030 0.8905010 0.8444444 0.7964286 0.07483025 0.13042087
## 10 0.371572583 0.530818735 0.8918651 0.8555556 0.8089286 0.07707935 0.14861039
## 11 0.389756622 0.291965252 0.8952629 0.8444444 0.8232143 0.07455911 0.10734353
## 12 0.398705438 0.020711658 0.9004216 0.8666667 0.8250000 0.08738713 0.08764563
## 13 0.398780852 0.126081060 0.8986359 0.8555556 0.8375000 0.08278481 0.09147473
## 14 0.404947380 0.001431841 0.9018105 0.8666667 0.8250000 0.08480293 0.08764563
## 15 0.412378052 0.370807810 0.8948661 0.8444444 0.8357143 0.07713825 0.13042087
## 16 0.479105382 0.095092430 0.9057788 0.8444444 0.8500000 0.07887411 0.10734353
## 17 0.521891946 0.597569945 0.8863095 0.8555556 0.7982143 0.09064008 0.14861039
## 18 0.546598573 0.763725982 0.8718254 0.8555556 0.8232143 0.10726608 0.14861039
## 19 0.562150992 0.294328171 0.9037698 0.8666667 0.8107143 0.07603375 0.10210406
## 20 0.606571331 0.651290618 0.8880952 0.8444444 0.8232143 0.09512412 0.16728281
## 21 0.626348499 0.357598145 0.8966270 0.8777778 0.8107143 0.08646307 0.11049210
## 22 0.694459123 0.453600885 0.8950397 0.8444444 0.8107143 0.08920215 0.14054567
## 23 0.701360101 0.794419097 0.8619048 0.8555556 0.7964286 0.11648175 0.18182130
## 24 0.750428761 0.529016046 0.8759921 0.8555556 0.7839286 0.10313750 0.12883353
## 25 0.813903614 0.401602007 0.8734127 0.8333333 0.7839286 0.10343205 0.15930232
## 26 0.824399956 0.753058456 0.8535714 0.8444444 0.7839286 0.11859777 0.14054567
## 27 0.837353629 0.650854883 0.8577381 0.8333333 0.7964286 0.11107193 0.16769232
## 28 0.864207304 0.669060568 0.8549603 0.8222222 0.7964286 0.11659154 0.16728281
## 29 0.884520706 0.274960286 0.8638889 0.8111111 0.7839286 0.10443957 0.17411347
## 30 0.903947357 0.122033964 0.8636905 0.7777778 0.7839286 0.10432161 0.16563466
## 31 0.933136827 0.875548703 0.8331349 0.7777778 0.7321429 0.12693055 0.18144368
## 32 0.947211719 0.696390216 0.8315724 0.7777778 0.7321429 0.12592242 0.18144368
## 33 0.958416023 0.536026059 0.8244296 0.7777778 0.7321429 0.13030047 0.18144368
## 34 0.960019596 0.213178845 0.8285962 0.7555556 0.7321429 0.12474957 0.19457667
## 35 0.993158175 0.642527189 0.8145089 0.7541667 0.7053571 0.13117580 0.17098946
## 36 0.994644627 0.887776134 0.8115575 0.7541667 0.7053571 0.13163130 0.17098946
## SpecSD
## 1 0.1091089
## 2 0.1596748
## 3 0.1325124
## 4 0.2044363
## 5 0.1684534
## 6 0.1463971
## 7 0.1656110
## 8 0.1484997
## 9 0.1484997
## 10 0.1373053
## 11 0.1461549
## 12 0.1787301
## 13 0.1564582
## 14 0.1787301
## 15 0.1319765
## 16 0.1419116
## 17 0.1965278
## 18 0.2054132
## 19 0.1465785
## 20 0.2054132
## 21 0.1786111
## 22 0.2139879
## 23 0.1985277
## 24 0.2055856
## 25 0.2055856
## 26 0.2055856
## 27 0.1985277
## 28 0.1985277
## 29 0.1879388
## 30 0.1879388
## 31 0.2117910
## 32 0.2117910
## 33 0.2117910
## 34 0.2117910
## 35 0.2144923
## 36 0.2144923
##################################
# Extracting the cross-validated ROC curve AUC
# of the selected gamma and lambda combination
##################################
# The results row whose tuning values equal bestTune holds the reported AUC;
# the outer parentheses print the value while assigning it.
(RDA_Train_RandomSearch_Auto_ROCCurveAUC <- with(
  RDA_Tune_RandomSearch_Auto,
  results[results$lambda == bestTune$lambda & results$gamma == bestTune$gamma, "ROC"]
))
## [1] 0.9057788
##################################
# Scoring the test set with the tuned model
# and pairing the observed classes
# with the predicted class probabilities
##################################
RDA_Test_RandomSearch_Auto <- data.frame(
  RDA_Observed = PMA_PreModelling_Test_RDA$Class,
  # Every column except the response is passed as a predictor.
  RDA_Predicted = predict(
    RDA_Tune_RandomSearch_Auto,
    newdata = PMA_PreModelling_Test_RDA[, names(PMA_PreModelling_Test_RDA) != "Class"],
    type = "prob"
  )
)
RDA_Test_RandomSearch_Auto
## RDA_Observed RDA_Predicted.M RDA_Predicted.R
## 1 R 9.996342e-01 3.657658e-04
## 2 R 5.444014e-02 9.455599e-01
## 3 R 2.402096e-03 9.975979e-01
## 4 R 3.138262e-01 6.861738e-01
## 5 R 1.116578e-03 9.988834e-01
## 6 R 9.916533e-01 8.346719e-03
## 7 R 9.595638e-01 4.043622e-02
## 8 R 9.431910e-01 5.680897e-02
## 9 R 2.228901e-01 7.771099e-01
## 10 R 5.857581e-07 9.999994e-01
## 11 R 4.288120e-02 9.571188e-01
## 12 R 9.111884e-03 9.908881e-01
## 13 R 1.101115e-03 9.988989e-01
## 14 R 6.469385e-03 9.935306e-01
## 15 R 6.034082e-02 9.396592e-01
## 16 R 1.707100e-02 9.829290e-01
## 17 R 2.127312e-02 9.787269e-01
## 18 R 1.747011e-03 9.982530e-01
## 19 R 3.356219e-04 9.996644e-01
## 20 M 2.785323e-01 7.214677e-01
## 21 M 9.570640e-01 4.293596e-02
## 22 M 2.667801e-02 9.733220e-01
## 23 M 9.599714e-01 4.002858e-02
## 24 M 7.331989e-01 2.668011e-01
## 25 M 9.406450e-01 5.935500e-02
## 26 M 9.982936e-01 1.706444e-03
## 27 M 8.784421e-01 1.215579e-01
## 28 M 7.213115e-01 2.786885e-01
## 29 M 9.999994e-01 5.834747e-07
## 30 M 1.000000e+00 1.733133e-10
## 31 M 7.585306e-02 9.241469e-01
## 32 M 1.411484e-01 8.588516e-01
## 33 M 9.991374e-01 8.626115e-04
## 34 M 3.976876e-01 6.023124e-01
## 35 M 1.000000e+00 5.940803e-09
## 36 M 9.999789e-01 2.105140e-05
## 37 M 9.999167e-01 8.334350e-05
## 38 M 9.984505e-01 1.549530e-03
## 39 M 9.997395e-01 2.604779e-04
## 40 M 9.999993e-01 7.294063e-07
## 41 M 7.207425e-01 2.792575e-01
##################################
# Computing the test-set ROC curve and its AUC
# for the random-search RDA model
##################################
# The predicted probability of class R is the score; the observed
# factor levels are supplied to roc() in reversed order
RDA_Test_RandomSearch_Auto_ROC <- roc(
  response = RDA_Test_RandomSearch_Auto$RDA_Observed,
  predictor = RDA_Test_RandomSearch_Auto$RDA_Predicted.R,
  levels = rev(levels(RDA_Test_RandomSearch_Auto$RDA_Observed))
)
# Indexing with [1] keeps just the numeric AUC value
RDA_Test_RandomSearch_Auto_ROCCurveAUC <- auc(RDA_Test_RandomSearch_Auto_ROC)[1]
print(RDA_Test_RandomSearch_Auto_ROCCurveAUC)
## [1] 0.861244
1.6 Hyperparameter Tuning Evaluation Summary
Model performance comparison:
[A] The models which demonstrated the best and most
consistent ROC Curve AUC metrics are as follows:
[A.1] SVM_R_MGS: Support Vector Machine - Radial
Basis Function Kernel with Manual Grid Search-Tuned Hyperparameters
(kernlab package)
[A.1.1] Cross-Validation ROC Curve AUC =
0.96629, Test ROC Curve AUC = 0.93780
[A.2] SVM_R_ARS: Support Vector Machine - Radial
Basis Function Kernel with Automated Random Search-Tuned Hyperparameters
(kernlab package)
[A.2.1] Cross-Validation ROC Curve AUC =
0.96213, Test ROC Curve AUC = 0.93541
##################################
# Stacking the resampling results of the three
# hyperparameter tuning methods evaluated for the
# Support Vector Machine - Radial Basis Function Kernel (SVM_R)
# and tagging every row with the method that produced it
##################################
SVM_R_Tune_GridSearch_Manual_ROC <- SVM_R_Tune_GridSearch_Manual$results
SVM_R_Tune_GridSearch_Auto_ROC <- SVM_R_Tune_GridSearch_Auto$results
SVM_R_Tune_RandomSearch_Auto_ROC <- SVM_R_Tune_RandomSearch_Auto$results

SVM_R_Tune_All <- as.data.frame(
  rbind(SVM_R_Tune_GridSearch_Manual_ROC,
        SVM_R_Tune_GridSearch_Auto_ROC,
        SVM_R_Tune_RandomSearch_Auto_ROC)
)

# One label per stacked row, repeated in the same order as the rbind() above;
# the explicit levels fix the ordering of the methods
SVM_R_Tune_All$Method <- factor(
  rep(c("Manual Grid Search (MGS)",
        "Automated Grid Search (AGS)",
        "Automated Random Search (ARS)"),
      times = c(nrow(SVM_R_Tune_GridSearch_Manual_ROC),
                nrow(SVM_R_Tune_GridSearch_Auto_ROC),
                nrow(SVM_R_Tune_RandomSearch_Auto_ROC))),
  levels = c("Manual Grid Search (MGS)",
             "Automated Grid Search (AGS)",
             "Automated Random Search (ARS)")
)
##################################
# Visualising the ROC Curve AUC at every (sigma, C)
# combination evaluated, one panel per tuning method;
# colour and point size both encode the AUC
##################################
ggplot(SVM_R_Tune_All,
       aes(x = sigma, y = C, color = ROC, size = ROC)) +
  geom_point() +
  facet_grid(. ~ Method) +
  scale_color_gradient(low = "blue", high = "red") +
  scale_x_continuous(name = "Sigma",
                     limits = c(0, 0.03),
                     breaks = seq(0, 0.03, by = 0.01)) +
  scale_y_continuous(name = "C",
                     limits = c(0, 600),
                     breaks = seq(0, 600, by = 100)) +
  ggtitle("Hyperparameter Tuning : Support Vector Machine - Radial Basis Function Kernel (SVM_R)") +
  # theme_bw() must precede theme() so the overrides below are not reset
  theme_bw() +
  theme(legend.position = "top",
        plot.title = element_text(color = "black",
                                  size = 15,
                                  face = "bold",
                                  hjust = 0.50))

##################################
# Stacking the resampling results of the three
# hyperparameter tuning methods evaluated for
# Regularized Discriminant Analysis (RDA)
# and tagging every row with the method that produced it
##################################
RDA_Tune_GridSearch_Manual_ROC <- RDA_Tune_GridSearch_Manual$results
RDA_Tune_GridSearch_Auto_ROC <- RDA_Tune_GridSearch_Auto$results
RDA_Tune_RandomSearch_Auto_ROC <- RDA_Tune_RandomSearch_Auto$results

RDA_Tune_All <- as.data.frame(
  rbind(RDA_Tune_GridSearch_Manual_ROC,
        RDA_Tune_GridSearch_Auto_ROC,
        RDA_Tune_RandomSearch_Auto_ROC)
)

# One label per stacked row, repeated in the same order as the rbind() above;
# the explicit levels fix the ordering of the methods
RDA_Tune_All$Method <- factor(
  rep(c("Manual Grid Search (MGS)",
        "Automated Grid Search (AGS)",
        "Automated Random Search (ARS)"),
      times = c(nrow(RDA_Tune_GridSearch_Manual_ROC),
                nrow(RDA_Tune_GridSearch_Auto_ROC),
                nrow(RDA_Tune_RandomSearch_Auto_ROC))),
  levels = c("Manual Grid Search (MGS)",
             "Automated Grid Search (AGS)",
             "Automated Random Search (ARS)")
)
##################################
# Visualising the ROC Curve AUC at every (gamma, lambda)
# combination evaluated, one panel per tuning method;
# colour and point size both encode the AUC
##################################
ggplot(RDA_Tune_All,
       aes(x = gamma, y = lambda, color = ROC, size = ROC)) +
  geom_point() +
  facet_grid(. ~ Method) +
  scale_color_gradient(low = "blue", high = "red") +
  scale_x_continuous(name = "Gamma",
                     limits = c(0, 1),
                     breaks = seq(0, 1, by = 0.10)) +
  scale_y_continuous(name = "Lambda",
                     limits = c(0, 1),
                     breaks = seq(0, 1, by = 0.10)) +
  ggtitle("Hyperparameter Tuning : Regularized Discriminant Analysis (RDA)") +
  # theme_bw() must precede theme() so the overrides below are not reset
  theme_bw() +
  theme(legend.position = "top",
        plot.title = element_text(color = "black",
                                  size = 15,
                                  face = "bold",
                                  hjust = 0.50))

##################################
# Consolidating all evaluation results
# for the train and test sets
# using the ROC Curve AUC metric
##################################
# Model labels: the six candidates, listed once for the
# cross-validation estimates and once for the test-set estimates
Model <- rep(c('SVM_R_MGS', 'SVM_R_AGS', 'SVM_R_ARS',
               'RDA_MGS', 'RDA_AGS', 'RDA_ARS'),
             times = 2)
Set <- rep(c('Cross-Validation', 'Test'), each = 6)

# AUC values in the same order as Model/Set above
ROCCurveAUC <- c(SVM_R_Train_GridSearch_Manual_ROCCurveAUC,
                 SVM_R_Train_GridSearch_Auto_ROCCurveAUC,
                 SVM_R_Train_RandomSearch_Auto_ROCCurveAUC,
                 RDA_Train_GridSearch_Manual_ROCCurveAUC,
                 RDA_Train_GridSearch_Auto_ROCCurveAUC,
                 RDA_Train_RandomSearch_Auto_ROCCurveAUC,
                 SVM_R_Test_GridSearch_Manual_ROCCurveAUC,
                 SVM_R_Test_GridSearch_Auto_ROCCurveAUC,
                 SVM_R_Test_RandomSearch_Auto_ROCCurveAUC,
                 RDA_Test_GridSearch_Manual_ROCCurveAUC,
                 RDA_Test_GridSearch_Auto_ROCCurveAUC,
                 RDA_Test_RandomSearch_Auto_ROCCurveAUC)

# Guard against a misaligned table: each label needs exactly one AUC value
# (e.g. a best-tune lookup that matched more than one row would break this)
stopifnot(length(ROCCurveAUC) == length(Model),
          length(Set) == length(Model))

# Build the summary with typed columns directly. Binding the columns with
# cbind() first would coerce the numeric AUC values to character, which
# then have to be parsed back and lose precision along the way.
ROCCurveAUC_Summary <- data.frame(
  Model = factor(Model, levels = unique(Model)),
  Set = factor(Set, levels = c("Cross-Validation", "Test")),
  ROCCurveAUC = as.numeric(ROCCurveAUC),
  row.names = NULL
)
print(ROCCurveAUC_Summary, row.names=FALSE)
## Model Set ROCCurveAUC
## SVM_R_MGS Cross-Validation 0.9662946
## SVM_R_AGS Cross-Validation 0.9351438
## SVM_R_ARS Cross-Validation 0.9621280
## RDA_MGS Cross-Validation 0.9115079
## RDA_AGS Cross-Validation 0.9101438
## RDA_ARS Cross-Validation 0.9057788
## SVM_R_MGS Test 0.9377990
## SVM_R_AGS Test 0.9114833
## SVM_R_ARS Test 0.9354067
## RDA_MGS Test 0.8755981
## RDA_AGS Test 0.8708134
## RDA_ARS Test 0.8612440
##################################
# Plotting the consolidated ROC Curve AUC values,
# one row per model, grouped by evaluation set
##################################
ROCCurveAUC_Plot <- dotplot(
  Model ~ ROCCurveAUC,
  data = ROCCurveAUC_Summary,
  groups = Set,
  main = "Classification Model Performance Comparison",
  xlab = "ROC Curve AUC",
  ylab = "Model",
  auto.key = list(adj = 1),
  # Points plus a horizontal segment drawn from the origin to each value
  type = c("p", "h"),
  origin = 0,
  pch = 16,
  cex = 2,
  alpha = 0.45
)
print(ROCCurveAUC_Plot)

1.7 Resampling Distribution Evaluation Summary
Model performance comparison:
[A] Between models, the support vector machine - radial
basis function kernel model demonstrated more stable ROC Curve AUC
metrics, as shown by its narrower interquartile range and 95% confidence
interval, compared to the regularized discriminant analysis
model.
[B] No statistically significant differences in the ROC
Curve AUC metrics were observed between models and hyperparameter tuning
strategies, although the support vector machine - radial basis function
kernel model SVM_R_MGS demonstrated the lowest Bonferroni-adjusted p-values
(between 0.10 and 0.14) when specifically compared to all regularized
discriminant analysis models (RDA_MGS, RDA_AGS, RDA_ARS).
##################################
# Collecting the resampling results of the six
# candidate models into a single object
# (list names become the model labels in later output)
##################################
OverallResampling <- resamples(list(
  SVM_R_MGS = SVM_R_Tune_GridSearch_Manual,
  SVM_R_AGS = SVM_R_Tune_GridSearch_Auto,
  SVM_R_ARS = SVM_R_Tune_RandomSearch_Auto,
  RDA_MGS = RDA_Tune_GridSearch_Manual,
  RDA_AGS = RDA_Tune_GridSearch_Auto,
  RDA_ARS = RDA_Tune_RandomSearch_Auto
))
print(OverallResampling)
##
## Call:
## resamples.default(x = list(SVM_R_MGS = SVM_R_Tune_GridSearch_Manual,
## = SVM_R_Tune_RandomSearch_Auto, RDA_MGS = RDA_Tune_GridSearch_Manual,
## RDA_AGS = RDA_Tune_GridSearch_Auto, RDA_ARS = RDA_Tune_RandomSearch_Auto))
##
## Models: SVM_R_MGS, SVM_R_AGS, SVM_R_ARS, RDA_MGS, RDA_AGS, RDA_ARS
## Number of resamples: 10
## Performance metrics: ROC, Sens, Spec
## Time estimates for: everything, final model fit
# Summarising, per model, the distribution of each performance metric
# (ROC, Sens, Spec) across the resamples
summary(OverallResampling)
##
## Call:
## summary.resamples(object = OverallResampling)
##
## Models: SVM_R_MGS, SVM_R_AGS, SVM_R_ARS, RDA_MGS, RDA_AGS, RDA_ARS
## Number of resamples: 10
##
## ROC
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## SVM_R_MGS 0.9027778 0.9409722 0.9781746 0.9662946 0.9960938 1 0
## SVM_R_AGS 0.8055556 0.8993056 0.9543651 0.9351438 0.9960938 1 0
## SVM_R_ARS 0.9027778 0.9201389 0.9781746 0.9621280 0.9960938 1 0
## RDA_MGS 0.7638889 0.8680556 0.9126984 0.9115079 0.9811508 1 0
## RDA_AGS 0.7638889 0.8680556 0.9206349 0.9101438 0.9776786 1 0
## RDA_ARS 0.7638889 0.8645833 0.9236111 0.9057788 0.9722222 1 0
##
## Sens
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## SVM_R_MGS 0.7777778 0.8784722 1.0000000 0.9319444 1.0000000 1 0
## SVM_R_AGS 0.6666667 0.8020833 0.9444444 0.8986111 1.0000000 1 0
## SVM_R_ARS 0.7777778 0.9166667 1.0000000 0.9541667 1.0000000 1 0
## RDA_MGS 0.6666667 0.7777778 0.8888889 0.8555556 0.8888889 1 0
## RDA_AGS 0.6666667 0.7777778 0.8888889 0.8555556 0.8888889 1 0
## RDA_ARS 0.6666667 0.7777778 0.8333333 0.8444444 0.8888889 1 0
##
## Spec
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## SVM_R_MGS 0.750 0.8616071 0.8750000 0.8857143 0.96875 1 0
## SVM_R_AGS 0.625 0.7500000 0.8750000 0.8339286 0.87500 1 0
## SVM_R_ARS 0.750 0.7767857 0.8750000 0.8607143 0.87500 1 0
## RDA_MGS 0.625 0.7500000 0.8660714 0.8232143 0.87500 1 0
## RDA_AGS 0.625 0.7767857 0.8750000 0.8357143 0.87500 1 0
## RDA_ARS 0.625 0.7812500 0.8750000 0.8500000 0.96875 1 0
##################################
# Exploring the resampling results:
# box-and-whisker plot of every metric per model
##################################
bwplot(
  OverallResampling,
  main = "Model Resampling Performance Comparison (Range)",
  ylab = "Model",
  # Three panels side by side, one per performance metric
  layout = c(3, 1),
  pch = 16,
  cex = 2
)

# Dot plot of every metric per model with its confidence interval
dotplot(
  OverallResampling,
  main = "Model Resampling Performance Comparison (95% Confidence Interval)",
  ylab = "Model",
  # Three panels side by side, one per performance metric
  layout = c(3, 1),
  pch = 16,
  cex = 2
)


##################################
# Computing the pairwise differences in resampled
# performance between all candidate models
##################################
ResamplingDifferences <- diff(OverallResampling)
print(ResamplingDifferences)
##
## Call:
## diff.resamples(x = OverallResampling)
##
## Models: SVM_R_MGS, SVM_R_AGS, SVM_R_ARS, RDA_MGS, RDA_AGS, RDA_ARS
## Metrics: ROC, Sens, Spec
## Number of differences: 15
## p-value adjustment: bonferroni
# Tabulating, per metric, the estimated pairwise differences (upper diagonal)
# and the p-values for H0: difference = 0 (lower diagonal)
summary(ResamplingDifferences)
##
## Call:
## summary.diff.resamples(object = ResamplingDifferences)
##
## p-value adjustment: bonferroni
## Upper diagonal: estimates of the difference
## Lower diagonal: p-value for H0: difference = 0
##
## ROC
## SVM_R_MGS SVM_R_AGS SVM_R_ARS RDA_MGS RDA_AGS RDA_ARS
## SVM_R_MGS 0.031151 0.004167 0.054787 0.056151 0.060516
## SVM_R_AGS 0.74969 -0.026984 0.023636 0.025000 0.029365
## SVM_R_ARS 1.00000 0.57915 0.050620 0.051984 0.056349
## RDA_MGS 0.13881 1.00000 0.18357 0.001364 0.005729
## RDA_AGS 0.09628 1.00000 0.12814 1.00000 0.004365
## RDA_ARS 0.10637 1.00000 0.17953 1.00000 1.00000
##
## Sens
## SVM_R_MGS SVM_R_AGS SVM_R_ARS RDA_MGS RDA_AGS RDA_ARS
## SVM_R_MGS 0.03333 -0.02222 0.07639 0.07639 0.08750
## SVM_R_AGS 1.0000 -0.05556 0.04306 0.04306 0.05417
## SVM_R_ARS 1.0000 1.0000 0.09861 0.09861 0.10972
## RDA_MGS 1.0000 1.0000 0.3158 0.00000 0.01111
## RDA_AGS 1.0000 1.0000 0.3158 NA 0.01111
## RDA_ARS 1.0000 1.0000 0.2507 1.0000 1.0000
##
## Spec
## SVM_R_MGS SVM_R_AGS SVM_R_ARS RDA_MGS RDA_AGS RDA_ARS
## SVM_R_MGS 0.051786 0.025000 0.062500 0.050000 0.035714
## SVM_R_AGS 0.5584 -0.026786 0.010714 -0.001786 -0.016071
## SVM_R_ARS 1.0000 1.0000 0.037500 0.025000 0.010714
## RDA_MGS 0.2243 1.0000 1.0000 -0.012500 -0.026786
## RDA_AGS 0.5518 1.0000 1.0000 1.0000 -0.014286
## RDA_ARS 1.0000 1.0000 1.0000 1.0000 1.0000
# Box-and-whisker plot of the pairwise performance differences
bwplot(
  ResamplingDifferences,
  main = "Model Resampling Performance Comparison (Differences)",
  ylab = "Model",
  # Three panels side by side, one per performance metric
  layout = c(3, 1),
  pch = 16,
  cex = 2
)
