Title: Initial Scouting - Random Forest of AA Using Test-Harness

Summary: Using the ‘caret’ package by Max Kuhn and the Test-Harness file, I will train a random forest to classify the 7 protein groups.

Libraries:

# Echo the source of every chunk in the knitted report.
knitr::opts_chunk$set(echo = TRUE)

# Packages attached by this analysis: readr (CSV import), doMC (parallel
# backend) and caret (data partitioning and model training).
Libraries <- c("readr", "doMC", "caret")

# Install any package that is missing, then attach it.
# requireNamespace() only tests availability, so library() is the single place
# that attaches a package, and it stops with an error if installation failed.
for (p in Libraries) {
  if (!requireNamespace(p, quietly = TRUE)) {
    install.packages(p)
  }
  library(p, character.only = TRUE)
}

Import Data:

# Read the test-harness table from the working directory; readr guesses the
# column types and reports them in the parse message.
test_harness_paa <- readr::read_csv(file = "test_harness_paa.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Class = col_character(),
##   id = col_character()
## )
## See spec(...) for full column specifications.

Convert ‘Class’ To Factor:

Convert Class (read in as character) to a factor of the 7 protein classes (class_factor)

# Build the outcome factor from the Class column (one level per distinct label).
class_factor <- as.factor(test_harness_paa$Class)

# Store the factor back in the data frame as well, so the later partitioning
# and train() calls receive a factor outcome instead of relying on the
# character column being coerced implicitly.
test_harness_paa$Class <- class_factor

# typeof() reports the storage type of the whole table, not of the Class column.
typeof(test_harness_paa)
## [1] "list"
class(class_factor)
## [1] "factor"

Pre-processing:

# Remove the 2nd and 3rd columns by position; every other column is kept.
# NOTE(review): presumably these are non-feature columns such as `id` --
# confirm against the CSV header.
test_harness_paa <- test_harness_paa[, -(2:3)] # More later

Partition Testing and Training datasets:

# Fix the RNG state so the split below is repeatable.
set.seed(1000)

# Select 80% of the rows for training, sampled within each Class value;
# list = FALSE returns the selected row numbers as a matrix rather than a list.
index <- createDataPartition(
  y = test_harness_paa[["Class"]],
  p = 0.8,
  list = FALSE
)
training_set <- test_harness_paa[index, ]
testing_set <- test_harness_paa[-index, ] # rows not selected for training

RF model 1 training

# Wall-clock timing of the fit; the clock starts before the backend is set up.
start_time <- Sys.time()

# Register the doMC parallel backend with 3 cores.
registerDoMC(cores = 3)

# Random forest on all predictors, with caret defaults for everything not
# given here (the printed fit reports 25 bootstrap resamples and 3 mtry values).
modFit <- train(
  Class ~ .,
  data = training_set,
  method = "rf"
)

end_time <- Sys.time()
difftime(end_time, start_time) # elapsed time, printed
## Time difference of 2.594309 mins

Model 1 fitting output:

# Show the resampling summary of the first fit.
print(modFit)
## Random Forest 
## 
## 2800 samples
##   20 predictors
##    7 classes: 'Ctrl', 'Ery', 'Hcy', 'Hgb', 'Hhe', 'Lgb', 'Mgb' 
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 2800, 2800, 2800, 2800, 2800, 2800, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    2    0.8888620  0.8702890
##   11    0.8710888  0.8495418
##   20    0.8460415  0.8203050
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.

RF model 2 training: add pre-processing (center and scale) to compare with model 1

# Wall-clock timing of the fit; the clock starts before the backend is set up.
start_time <- Sys.time()

# Register the doMC parallel backend with 3 cores.
registerDoMC(cores = 3)

# Same random-forest call on the training set, with the predictors centered
# and scaled by caret before fitting.
modFit2 <- train(
  Class ~ .,
  data = training_set,
  method = "rf",
  preProcess = c("center", "scale")
)

end_time <- Sys.time()
difftime(end_time, start_time) # elapsed time, printed
## Time difference of 2.557944 mins

Model 2 fitting output:

# Show the resampling summary of the pre-processed fit.
print(modFit2)
## Random Forest 
## 
## 2800 samples
##   20 predictors
##    7 classes: 'Ctrl', 'Ery', 'Hcy', 'Hgb', 'Hhe', 'Lgb', 'Mgb' 
## 
## Pre-processing: centered (20), scaled (20) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 2800, 2800, 2800, 2800, 2800, 2800, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    2    0.8890704  0.8705152
##   11    0.8695372  0.8477239
##   20    0.8477447  0.8222883
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.

Machine Settings:

# Operating-system details, selected by field name; the other Sys.info()
# fields (node name and user-related entries) are left out.
Sys.info()[c("sysname", "release", "version", "machine")]
##                                               sysname 
##                                               "Linux" 
##                                               release 
##                                   "4.15.0-46-generic" 
##                                               version 
## "#49~16.04.1-Ubuntu SMP Tue Feb 12 17:45:24 UTC 2019" 
##                                               machine 
##                                              "x86_64"
# R version, platform and package versions used for this run.
utils::sessionInfo()
## R version 3.4.4 (2018-03-15)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Linux Mint 18.3
## 
## Matrix products: default
## BLAS: /usr/lib/libblas/libblas.so.3.6.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.6.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
## [1] caret_6.0-81     ggplot2_3.1.0    lattice_0.20-38  doMC_1.3.5      
## [5] iterators_1.0.10 foreach_1.4.4    readr_1.3.1     
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_0.2.5    xfun_0.4            purrr_0.2.5        
##  [4] reshape2_1.4.3      splines_3.4.4       colorspace_1.3-2   
##  [7] generics_0.0.2      stats4_3.4.4        htmltools_0.3.6    
## [10] yaml_2.2.0          prodlim_2018.04.18  survival_2.43-3    
## [13] rlang_0.3.0.1       e1071_1.7-0.1       ModelMetrics_1.2.2 
## [16] pillar_1.3.1        glue_1.3.0          withr_2.1.2        
## [19] bindrcpp_0.2.2      bindr_0.1.1         plyr_1.8.4         
## [22] lava_1.6.4          stringr_1.3.1       timeDate_3043.102  
## [25] munsell_0.5.0       gtable_0.2.0        recipes_0.1.4      
## [28] codetools_0.2-16    evaluate_0.12       knitr_1.21         
## [31] class_7.3-14        Rcpp_1.0.0          scales_1.0.0       
## [34] ipred_0.9-8         hms_0.4.2           digest_0.6.18      
## [37] stringi_1.2.4       dplyr_0.7.8         grid_3.4.4         
## [40] tools_3.4.4         magrittr_1.5        lazyeval_0.2.1     
## [43] tibble_1.4.2        randomForest_4.6-14 crayon_1.3.4       
## [46] pkgconfig_2.0.2     MASS_7.3-51.1       Matrix_1.2-15      
## [49] data.table_1.11.8   lubridate_1.7.4     gower_0.1.2        
## [52] assertthat_0.2.0    rmarkdown_1.11      R6_2.3.0           
## [55] rpart_4.1-13        nnet_7.3-12         nlme_3.1-137       
## [58] compiler_3.4.4

EOF