Libraries:
knitr::opts_chunk$set(echo = TRUE, cache = TRUE)
Libraries <- c("readr", "doMC", "caret", "mlbench")
# Install any missing packages, then load each one
for (p in Libraries) {
  if (!require(p, character.only = TRUE)) { install.packages(p) }
  library(p, character.only = TRUE)
}
Import Data:
test_harness_paa <- read_csv("test_harness_paa.csv",
                             col_types = cols(TotalAA = col_skip(),
                                              id = col_skip()))
Convert the numerical Class variable to a factor of the 7 protein classes (Prot_Class):
Class <- as.factor(test_harness_paa$Class)
typeof(test_harness_paa)
## [1] "list"
class(Class)
## [1] "factor"
Find attributes that are highly correlated (ideally > 0.75; here a stricter cutoff of 0.5 is used):
correlation_matrix <- cor(test_harness_paa[,2:21])
#print(correlation_matrix)
highly_correlated <- findCorrelation(correlation_matrix, cutoff = 0.5)
print(highly_correlated)
## [1] 13 10
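The returned indices refer to columns of the matrix passed to cor(), i.e. test_harness_paa[,2:21], so they are offset by one from the full data frame (column 1 is Class). A minimal sketch of dropping the flagged attributes; by the column order shown in the importance table below, indices 13 and 10 appear to correspond to K and Y, but this should be verified against names(test_harness_paa):
# Map subset indices back to the full data frame and drop the
# highly correlated attributes (sketch only; not used below)
reduced_paa <- test_harness_paa[, -(highly_correlated + 1)]
dim(reduced_paa)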
This example loads the dataset and constructs a Learning Vector Quantization (LVQ) model. Variable importance is then estimated with filterVarImp() (a ROC-based, per-class filter), printed, and plotted.
registerDoMC(cores = 3)
start_time <- Sys.time() # Start timer
control <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
# train the LVQ model on centered and scaled predictors
model <- train(Class ~ .,
               data = test_harness_paa,
               method = "lvq",
               preProcess = c("center", "scale"),
               trControl = control)
end_time <- Sys.time() # End timer
end_time - start_time # Display time
## Time difference of 40.90145 secs
model
## Learning Vector Quantization
##
## 3500 samples
## 20 predictors
## 7 classes: 'Ctrl', 'Ery', 'Hcy', 'Hgb', 'Hhe', 'Lgb', 'Mgb'
##
## Pre-processing: centered (20), scaled (20)
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 3150, 3150, 3150, 3150, 3150, 3150, ...
## Resampling results across tuning parameters:
##
## size k Accuracy Kappa
## 45 1 0.7447619 0.7022222
## 45 6 0.7528571 0.7116667
## 45 11 0.7516190 0.7102222
## 67 1 0.7814286 0.7450000
## 67 6 0.7811429 0.7446667
## 67 11 0.7746667 0.7371111
## 90 1 0.7946667 0.7604444
## 90 6 0.7939048 0.7595556
## 90 11 0.7974286 0.7636667
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were size = 90 and k = 11.
importance <- filterVarImp(test_harness_paa[,2:21], Class, nonpara = FALSE)
print(importance)
## Ctrl Ery Hcy Hgb Hhe Lgb Mgb
## G 0.561260 0.561204 0.743992 0.561204 0.569036 0.561204 0.561260
## P 0.649366 0.649366 0.649366 0.742156 0.725268 0.801546 0.625314
## A 0.828370 0.828370 0.828370 0.874302 0.828370 0.909904 0.728732
## V 0.765758 0.765758 0.765758 0.938090 0.765758 0.765758 0.679428
## L 0.655980 0.655980 0.736762 0.655980 0.704156 0.691856 0.618786
## I 0.589428 0.672812 0.577912 0.737436 0.611298 0.576648 0.672812
## M 0.631680 0.700150 0.726036 0.530574 0.534866 0.610546 0.700150
## C 0.827834 0.827834 0.827834 0.873232 0.881722 0.827834 0.753460
## F 0.779568 0.779568 0.779568 0.779568 0.779568 0.779568 0.706268
## Y 0.850392 0.850392 0.850392 0.850392 0.850392 0.921528 0.651028
## W 0.526620 0.526620 0.627568 0.526620 0.710588 0.526620 0.513724
## H 0.698610 0.767506 0.903640 0.658870 0.675610 0.626528 0.767506
## K 0.653680 0.626426 0.574420 0.781182 0.873042 0.713586 0.653680
## R 0.745402 0.745402 0.745402 0.941402 0.936478 0.787402 0.563036
## Q 0.675520 0.675520 0.675520 0.675520 0.675520 0.709466 0.588862
## N 0.555958 0.736364 0.796946 0.821308 0.788922 0.555958 0.736364
## E 0.666678 0.666678 0.888360 0.666678 0.735374 0.666678 0.578442
## D 0.643068 0.539162 0.627074 0.576050 0.654318 0.729038 0.643068
## S 0.592406 0.606496 0.736034 0.657180 0.592406 0.603322 0.606496
## T 0.667570 0.658394 0.763168 0.720108 0.814976 0.592000 0.667570
plot(importance)
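The same ranking can also be pulled from the fitted train object: for methods such as LVQ that have no model-specific importance measure, caret's varImp() falls back to the same ROC-based filter calculation. A minimal sketch:
# ROC-based fallback importance from the train object
importance_lvq <- varImp(model, scale = FALSE)
print(importance_lvq)
plot(importance_lvq)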
names(model)
## [1] "method" "modelInfo" "modelType" "results"
## [5] "pred" "bestTune" "call" "dots"
## [9] "metric" "control" "finalModel" "preProcess"
## [13] "trainingData" "resample" "resampledCM" "perfNames"
## [17] "maximize" "yLimits" "times" "levels"
## [21] "terms" "coefnames" "xlevels"
model$bestTune
## size k
## 9 90 11
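Because preProcess was supplied to train(), the centering and scaling are stored in the model and re-applied automatically at prediction time. A minimal sketch of scoring observations (here the first rows of the training data stand in for genuinely new samples):
# predict() applies the stored center/scale transform before scoring
pred_class <- predict(model, newdata = test_harness_paa[1:5, ])
pred_class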
set.seed(1000)
Class <- as.factor(test_harness_paa$Class)
start_time <- Sys.time() # Start timer
# define the control using a random forest selection function
control <- rfeControl(functions = rfFuncs,
                      method = "cv",
                      number = 10)
# run the RFE algorithm over subset sizes 1 to 19
results <- rfe(x = test_harness_paa[,2:21],
               y = Class,
               sizes = c(1:19),
               rfeControl = control)
end_time <- Sys.time() # End timer
end_time - start_time # Display time
## Time difference of 6.246691 mins
print(results)
##
## Recursive feature selection
##
## Outer resampling method: Cross-Validated (10 fold)
##
## Resampling performance over subset size:
##
## Variables Accuracy Kappa AccuracySD KappaSD Selected
## 1 0.5806 0.5107 0.02094 0.02443
## 2 0.6997 0.6497 0.01685 0.01965
## 3 0.7760 0.7387 0.02414 0.02816
## 4 0.8071 0.7750 0.02105 0.02456
## 5 0.8357 0.8083 0.01606 0.01874
## 6 0.8477 0.8223 0.01914 0.02234
## 7 0.8594 0.8360 0.01979 0.02308
## 8 0.8651 0.8427 0.02019 0.02356
## 9 0.8703 0.8487 0.01629 0.01900
## 10 0.8766 0.8560 0.01865 0.02176
## 11 0.8854 0.8663 0.01652 0.01927
## 12 0.8914 0.8733 0.01374 0.01602
## 13 0.8983 0.8813 0.01439 0.01679
## 14 0.9034 0.8873 0.01050 0.01225
## 15 0.9043 0.8883 0.01168 0.01363
## 16 0.9049 0.8890 0.01271 0.01483
## 17 0.9069 0.8913 0.01368 0.01596
## 18 0.9080 0.8927 0.01587 0.01851
## 19 0.9117 0.8970 0.01460 0.01703 *
## 20 0.9089 0.8937 0.01873 0.02186
##
## The top 5 variables (out of 19):
## E, Y, R, H, A
# list the chosen features
predictors(results)
## [1] "E" "Y" "R" "H" "A" "K" "F" "V" "N" "G" "P" "W" "C" "I" "D" "L" "S"
## [18] "T" "M"
# plot the results
plot(results, type = c("g", "o"))
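The selected subset can be fed straight back into train() for a final fit. A minimal sketch, using a random forest to match the rfFuncs used during RFE:
# refit on the RFE-selected predictors only
selected <- predictors(results)
model_final <- train(x = test_harness_paa[, selected],
                     y = Class,
                     method = "rf",
                     trControl = trainControl(method = "cv", number = 10))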
Machine Settings:
Sys.info()[c(1:3,5)]
## sysname
## "Linux"
## release
## "4.15.0-47-generic"
## version
## "#50~16.04.1-Ubuntu SMP Fri Mar 15 16:06:21 UTC 2019"
## machine
## "x86_64"
sessionInfo()
## R version 3.4.4 (2018-03-15)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Linux Mint 18.3
##
## Matrix products: default
## BLAS: /usr/lib/libblas/libblas.so.3.6.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.6.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] mlbench_2.1-1 caret_6.0-81 ggplot2_3.1.0 lattice_0.20-38
## [5] doMC_1.3.5 iterators_1.0.10 foreach_1.4.4 readr_1.3.1
##
## loaded via a namespace (and not attached):
## [1] tidyselect_0.2.5 xfun_0.4 purrr_0.2.5
## [4] reshape2_1.4.3 splines_3.4.4 colorspace_1.3-2
## [7] generics_0.0.2 stats4_3.4.4 htmltools_0.3.6
## [10] yaml_2.2.0 prodlim_2018.04.18 survival_2.43-3
## [13] rlang_0.3.0.1 e1071_1.7-0.1 ModelMetrics_1.2.2
## [16] pillar_1.3.1 glue_1.3.0 withr_2.1.2
## [19] bindrcpp_0.2.2 bindr_0.1.1 plyr_1.8.4
## [22] lava_1.6.4 stringr_1.3.1 timeDate_3043.102
## [25] munsell_0.5.0 gtable_0.2.0 recipes_0.1.4
## [28] codetools_0.2-16 evaluate_0.12 knitr_1.21
## [31] class_7.3-14 Rcpp_1.0.0 scales_1.0.0
## [34] ipred_0.9-8 hms_0.4.2 digest_0.6.18
## [37] stringi_1.2.4 dplyr_0.7.8 grid_3.4.4
## [40] tools_3.4.4 magrittr_1.5 lazyeval_0.2.1
## [43] tibble_1.4.2 randomForest_4.6-14 crayon_1.3.4
## [46] pkgconfig_2.0.2 MASS_7.3-51.1 Matrix_1.2-15
## [49] data.table_1.11.8 lubridate_1.7.4 gower_0.1.2
## [52] assertthat_0.2.0 rmarkdown_1.11 R6_2.3.0
## [55] rpart_4.1-13 nnet_7.3-12 nlme_3.1-137
## [58] compiler_3.4.4