Title: Feature Selection Using Caret.rmd

Summary:

Three feature-selection approaches from the caret package are applied to a protein amino-acid composition dataset: removing highly correlated predictors with findCorrelation, ranking variable importance with an LVQ model and filterVarImp, and Recursive Feature Elimination (RFE) with a random forest.

Libraries:

knitr::opts_chunk$set(echo = TRUE, cache = TRUE)

Libraries <- c("readr", "doMC", "caret", "mlbench")

# Install if not present
for(p in Libraries){
    if(!require(p, character.only = TRUE)) { install.packages(p) }
    library(p, character.only = TRUE)
}

Import Data:

test_harness_paa <- read_csv("test_harness_paa.csv",
                             col_types = cols(TotalAA = col_skip(),
                                              id = col_skip()))
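
A quick sanity check on the import (a sketch; the expected dimensions are inferred from the model summary further down):

dim(test_harness_paa) # expect 3500 rows x 21 columns: Class + 20 amino acids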

Convert ‘Class’ To Factor:

Convert the Class column to a factor of the 7 protein classes. The conversion is done in place so that train() below treats Class as a classification outcome; a standalone Class vector is kept as well for use with filterVarImp() and rfe().

test_harness_paa$Class <- as.factor(test_harness_paa$Class)
Class <- test_harness_paa$Class

typeof(test_harness_paa)
## [1] "list"
class(Class)
## [1] "factor"

Remove Redundant Features

Calculate the correlation matrix.

Find attributes that are highly correlated (a cutoff of 0.75 is commonly suggested; here 0.5 is used).

correlation_matrix <- cor(test_harness_paa[,2:21])
#print(correlation_matrix)
 
highly_correlated <- findCorrelation(correlation_matrix, cutoff = 0.5)
print(highly_correlated)
## [1] 13 10
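
Assuming the predictor order shown in the importance table further down (G, P, A, V, ...), indices 13 and 10 correspond to K and Y. To drop them before modelling (not done here), a minimal sketch:

# findCorrelation indexes the correlation matrix, i.e. data columns 2:21,
# so offset by 1 when subsetting the full data frame (sketch only).
reduced_paa <- test_harness_paa[, -(highly_correlated + 1)]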

Rank Features By Importance

Learning Vector Quantization (LVQ) model

This example fits a Learning Vector Quantization (LVQ) model to the data. Variable importance is then estimated with filterVarImp, printed, and plotted.

registerDoMC(cores = 3) # register 3 cores for parallel resampling

start_time <- Sys.time() # Start timer

control <- trainControl(method = "repeatedcv", number = 10, repeats = 3)

# train the model
model <- train(Class ~ ., 
               data = test_harness_paa, 
               method = "lvq", 
               preProcess = c("center", "scale"), 
               trControl = control)

end_time <- Sys.time()   # End timer
end_time - start_time    # Display time
## Time difference of 40.90145 secs

Learning Vector Quantization

model
## Learning Vector Quantization 
## 
## 3500 samples
##   20 predictors
##    7 classes: 'Ctrl', 'Ery', 'Hcy', 'Hgb', 'Hhe', 'Lgb', 'Mgb' 
## 
## Pre-processing: centered (20), scaled (20) 
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 3150, 3150, 3150, 3150, 3150, 3150, ... 
## Resampling results across tuning parameters:
## 
##   size  k   Accuracy   Kappa    
##   45     1  0.7447619  0.7022222
##   45     6  0.7528571  0.7116667
##   45    11  0.7516190  0.7102222
##   67     1  0.7814286  0.7450000
##   67     6  0.7811429  0.7446667
##   67    11  0.7746667  0.7371111
##   90     1  0.7946667  0.7604444
##   90     6  0.7939048  0.7595556
##   90    11  0.7974286  0.7636667
## 
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were size = 90 and k = 11.
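
The accuracy profile across the (size, k) tuning grid can also be inspected with caret’s plot method for train objects (a sketch, not part of the original output):

plot(model) # accuracy vs. codebook size and k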

Estimate variable importance using ‘filterVarImp’

With a factor outcome, filterVarImp runs a ROC analysis for each predictor on every pair of classes and reports, per class, the maximum pairwise area under the curve, so each cell below is an AUC.

importance <- filterVarImp(test_harness_paa[,2:21], Class, nonpara = FALSE)
print(importance)
##       Ctrl      Ery      Hcy      Hgb      Hhe      Lgb      Mgb
## G 0.561260 0.561204 0.743992 0.561204 0.569036 0.561204 0.561260
## P 0.649366 0.649366 0.649366 0.742156 0.725268 0.801546 0.625314
## A 0.828370 0.828370 0.828370 0.874302 0.828370 0.909904 0.728732
## V 0.765758 0.765758 0.765758 0.938090 0.765758 0.765758 0.679428
## L 0.655980 0.655980 0.736762 0.655980 0.704156 0.691856 0.618786
## I 0.589428 0.672812 0.577912 0.737436 0.611298 0.576648 0.672812
## M 0.631680 0.700150 0.726036 0.530574 0.534866 0.610546 0.700150
## C 0.827834 0.827834 0.827834 0.873232 0.881722 0.827834 0.753460
## F 0.779568 0.779568 0.779568 0.779568 0.779568 0.779568 0.706268
## Y 0.850392 0.850392 0.850392 0.850392 0.850392 0.921528 0.651028
## W 0.526620 0.526620 0.627568 0.526620 0.710588 0.526620 0.513724
## H 0.698610 0.767506 0.903640 0.658870 0.675610 0.626528 0.767506
## K 0.653680 0.626426 0.574420 0.781182 0.873042 0.713586 0.653680
## R 0.745402 0.745402 0.745402 0.941402 0.936478 0.787402 0.563036
## Q 0.675520 0.675520 0.675520 0.675520 0.675520 0.709466 0.588862
## N 0.555958 0.736364 0.796946 0.821308 0.788922 0.555958 0.736364
## E 0.666678 0.666678 0.888360 0.666678 0.735374 0.666678 0.578442
## D 0.643068 0.539162 0.627074 0.576050 0.654318 0.729038 0.643068
## S 0.592406 0.606496 0.736034 0.657180 0.592406 0.603322 0.606496
## T 0.667570 0.658394 0.763168 0.720108 0.814976 0.592000 0.667570
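
Any column can be sorted to rank predictors for a single class; for example, using the Hgb column from the table above:

# Top predictors for discriminating the Hgb class, by ROC AUC
head(importance[order(-importance$Hgb), "Hgb", drop = FALSE])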

Plot filterVarImp

plot(importance)
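
An alternative is caret’s varImp() on the fitted train object; LVQ has no model-specific importance measure, so caret falls back to the same filter-based ROC approach (a sketch, not run in the original):

lvq_importance <- varImp(model, scale = TRUE)
print(lvq_importance)
plot(lvq_importance)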

names(model)
##  [1] "method"       "modelInfo"    "modelType"    "results"     
##  [5] "pred"         "bestTune"     "call"         "dots"        
##  [9] "metric"       "control"      "finalModel"   "preProcess"  
## [13] "trainingData" "resample"     "resampledCM"  "perfNames"   
## [17] "maximize"     "yLimits"      "times"        "levels"      
## [21] "terms"        "coefnames"    "xlevels"
model$bestTune
##   size  k
## 9   90 11
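
The tuned model scores new observations through caret’s predict method; a minimal sketch, where new_data is a hypothetical data frame with the same 20 predictor columns:

# new_data is hypothetical; caret re-applies the stored center/scale
# pre-processing before predicting.
pred_classes <- predict(model, newdata = new_data)
table(pred_classes)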

Recursive Feature Elimination (RFE) With A Random Forest Selection Function

set.seed(1000)
Class <- as.factor(test_harness_paa$Class)

start_time <- Sys.time() # Start timer

# define the control using a random forest selection function
control <- rfeControl(functions = rfFuncs, 
                      method = "cv", 
                      number = 10)

# run the RFE algorithm
results <- rfe(x = test_harness_paa[,2:21], 
               y = Class, 
               sizes = c(1:19), 
               rfeControl = control)

end_time <- Sys.time()   # End timer
end_time - start_time    # Display time
## Time difference of 6.246691 mins

Summarize RFE results

print(results)
## 
## Recursive feature selection
## 
## Outer resampling method: Cross-Validated (10 fold) 
## 
## Resampling performance over subset size:
## 
##  Variables Accuracy  Kappa AccuracySD KappaSD Selected
##          1   0.5806 0.5107    0.02094 0.02443         
##          2   0.6997 0.6497    0.01685 0.01965         
##          3   0.7760 0.7387    0.02414 0.02816         
##          4   0.8071 0.7750    0.02105 0.02456         
##          5   0.8357 0.8083    0.01606 0.01874         
##          6   0.8477 0.8223    0.01914 0.02234         
##          7   0.8594 0.8360    0.01979 0.02308         
##          8   0.8651 0.8427    0.02019 0.02356         
##          9   0.8703 0.8487    0.01629 0.01900         
##         10   0.8766 0.8560    0.01865 0.02176         
##         11   0.8854 0.8663    0.01652 0.01927         
##         12   0.8914 0.8733    0.01374 0.01602         
##         13   0.8983 0.8813    0.01439 0.01679         
##         14   0.9034 0.8873    0.01050 0.01225         
##         15   0.9043 0.8883    0.01168 0.01363         
##         16   0.9049 0.8890    0.01271 0.01483         
##         17   0.9069 0.8913    0.01368 0.01596         
##         18   0.9080 0.8927    0.01587 0.01851         
##         19   0.9117 0.8970    0.01460 0.01703        *
##         20   0.9089 0.8937    0.01873 0.02186         
## 
## The top 5 variables (out of 19):
##    E, Y, R, H, A
# list the chosen features
predictors(results)
##  [1] "E" "Y" "R" "H" "A" "K" "F" "V" "N" "G" "P" "W" "C" "I" "D" "L" "S"
## [18] "T" "M"
# plot the results
plot(results, type = c("g", "o"))
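
The random forest refit on the winning subset is stored in the rfe object’s fit element; a short sketch for inspecting it:

# Final random forest trained on the selected variables
results$fit
# Importance measures recorded in that forest (MeanDecreaseGini by default)
head(randomForest::importance(results$fit))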

Machine Settings:

Sys.info()[c(1:3,5)]
##                                               sysname 
##                                               "Linux" 
##                                               release 
##                                   "4.15.0-47-generic" 
##                                               version 
## "#50~16.04.1-Ubuntu SMP Fri Mar 15 16:06:21 UTC 2019" 
##                                               machine 
##                                              "x86_64"
sessionInfo()
## R version 3.4.4 (2018-03-15)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Linux Mint 18.3
## 
## Matrix products: default
## BLAS: /usr/lib/libblas/libblas.so.3.6.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.6.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
## [1] mlbench_2.1-1    caret_6.0-81     ggplot2_3.1.0    lattice_0.20-38 
## [5] doMC_1.3.5       iterators_1.0.10 foreach_1.4.4    readr_1.3.1     
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_0.2.5    xfun_0.4            purrr_0.2.5        
##  [4] reshape2_1.4.3      splines_3.4.4       colorspace_1.3-2   
##  [7] generics_0.0.2      stats4_3.4.4        htmltools_0.3.6    
## [10] yaml_2.2.0          prodlim_2018.04.18  survival_2.43-3    
## [13] rlang_0.3.0.1       e1071_1.7-0.1       ModelMetrics_1.2.2 
## [16] pillar_1.3.1        glue_1.3.0          withr_2.1.2        
## [19] bindrcpp_0.2.2      bindr_0.1.1         plyr_1.8.4         
## [22] lava_1.6.4          stringr_1.3.1       timeDate_3043.102  
## [25] munsell_0.5.0       gtable_0.2.0        recipes_0.1.4      
## [28] codetools_0.2-16    evaluate_0.12       knitr_1.21         
## [31] class_7.3-14        Rcpp_1.0.0          scales_1.0.0       
## [34] ipred_0.9-8         hms_0.4.2           digest_0.6.18      
## [37] stringi_1.2.4       dplyr_0.7.8         grid_3.4.4         
## [40] tools_3.4.4         magrittr_1.5        lazyeval_0.2.1     
## [43] tibble_1.4.2        randomForest_4.6-14 crayon_1.3.4       
## [46] pkgconfig_2.0.2     MASS_7.3-51.1       Matrix_1.2-15      
## [49] data.table_1.11.8   lubridate_1.7.4     gower_0.1.2        
## [52] assertthat_0.2.0    rmarkdown_1.11      R6_2.3.0           
## [55] rpart_4.1-13        nnet_7.3-12         nlme_3.1-137       
## [58] compiler_3.4.4
