Title: Tuning_RF_Using_ntree.rmd

Summary: Parameter tuning (ntree) for a random forest using the caret package

Library 1/2:

# Global knitr chunk defaults: echo the code and cache chunk results.
knitr::opts_chunk$set(echo = TRUE, cache = TRUE)

# Packages attached by this document.
Libraries <- c("readr", "doMC", "caret", "randomForest", "ggplot2")

# Install any package that is not yet available, then attach it.
# requireNamespace() only tests for availability (no attaching, no warning);
# library() does the attaching and stops with an error if the package is
# still missing after the install attempt.
for (p in Libraries) {
  if (!requireNamespace(p, quietly = TRUE)) {
    install.packages(p)
  }
  library(p, character.only = TRUE)
}

Import Data:

cache = TRUE
# Read the CSV through its full path instead of calling setwd(), so the
# session's working directory is left untouched.
data_dir <- "~/Dropbox/Oxy-RF/5_RF_Tuning_w_ntree"
# The TotalAA and id columns are dropped at import time via col_skip().
test_harness_paa <- read_csv(
  file.path(path.expand(data_dir), "test_harness_paa.csv"),
  col_types = cols(TotalAA = col_skip(), id = col_skip())
)

Convert ‘Class’ To Factor:

Convert `Class` (read in as text labels) to a factor whose levels are the 7 protein classes

cache = TRUE
# Write the factor back into the data frame so that the partitioning and
# model-fitting steps, which read test_harness_paa$Class, see a factor.
test_harness_paa$Class <- as.factor(test_harness_paa$Class)
# Stand-alone copy kept under its original name.
Class <- test_harness_paa$Class

typeof(test_harness_paa)
## [1] "list"
class(Class)
## [1] "factor"

Partition Testing and Training datasets:

cache = TRUE
set.seed(1000)

# 80/20 split on the outcome column; list = FALSE returns the selected row
# positions as a matrix instead of a list.
index <- createDataPartition(y = test_harness_paa[["Class"]],
                             p = 0.8,
                             list = FALSE)
training_set <- test_harness_paa[index, ]
testing_set <- test_harness_paa[-index, ]

# Estimate the centering/scaling parameters from the training rows ...
preProcValues <- preProcess(x = training_set, method = c("center", "scale"))

# ... and apply them to those same training rows.
train_transformed <- predict(preProcValues, newdata = training_set)

RF grid tuning:

cache = TRUE
# Parallel backend used by train() for the resampling iterations.
registerDoMC(cores = 3)

start_time <- Sys.time() # Start timer

# mtry: number of predictors randomly sampled as split candidates at each
# tree node. It is held at a single value, so the grid has exactly one row.
mtry_def <- 2
t_grid <- expand.grid(mtry = mtry_def)

# Seed once, immediately before train(), so the resampling is reproducible.
set.seed(1234)
model.rf <- train(Class ~ .,
                  data = train_transformed,
                  method = "rf",
                  ntree = 50, # Number of trees grown in each forest
                  tuneGrid = t_grid)

end_time <- Sys.time()   # End timer
end_time - start_time    # Display time
## Time difference of 5.529183 secs
print(model.rf)
## Random Forest 
## 
## 2800 samples
##   20 predictors
##    7 classes: 'Ctrl', 'Ery', 'Hcy', 'Hgb', 'Hhe', 'Lgb', 'Mgb' 
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 2800, 2800, 2800, 2800, 2800, 2800, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.8757229  0.8549459
## 
## Tuning parameter 'mtry' was held constant at a value of 2

Machine Settings:

# Operating system name, release, version and hardware type
# (the node name, field 4, is deliberately left out).
Sys.info()[c("sysname", "release", "version", "machine")]
##                                               sysname 
##                                               "Linux" 
##                                               release 
##                                   "4.15.0-47-generic" 
##                                               version 
## "#50~16.04.1-Ubuntu SMP Fri Mar 15 16:06:21 UTC 2019" 
##                                               machine 
##                                              "x86_64"
# R version, platform, locale and attached/loaded packages.
sessionInfo()
## R version 3.4.4 (2018-03-15)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Linux Mint 18.3
## 
## Matrix products: default
## BLAS: /usr/lib/libblas/libblas.so.3.6.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.6.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
## [1] randomForest_4.6-14 caret_6.0-81        ggplot2_3.1.0      
## [4] lattice_0.20-38     doMC_1.3.5          iterators_1.0.10   
## [7] foreach_1.4.4       readr_1.3.1        
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_0.2.5   xfun_0.4           purrr_0.2.5       
##  [4] reshape2_1.4.3     splines_3.4.4      colorspace_1.3-2  
##  [7] generics_0.0.2     stats4_3.4.4       htmltools_0.3.6   
## [10] yaml_2.2.0         prodlim_2018.04.18 survival_2.43-3   
## [13] rlang_0.3.0.1      e1071_1.7-0.1      ModelMetrics_1.2.2
## [16] pillar_1.3.1       glue_1.3.0         withr_2.1.2       
## [19] bindrcpp_0.2.2     bindr_0.1.1        plyr_1.8.4        
## [22] lava_1.6.4         stringr_1.3.1      timeDate_3043.102 
## [25] munsell_0.5.0      gtable_0.2.0       recipes_0.1.4     
## [28] codetools_0.2-16   evaluate_0.12      knitr_1.21        
## [31] class_7.3-14       Rcpp_1.0.0         scales_1.0.0      
## [34] ipred_0.9-8        hms_0.4.2          digest_0.6.18     
## [37] stringi_1.2.4      dplyr_0.7.8        grid_3.4.4        
## [40] tools_3.4.4        magrittr_1.5       lazyeval_0.2.1    
## [43] tibble_1.4.2       crayon_1.3.4       pkgconfig_2.0.2   
## [46] MASS_7.3-51.1      Matrix_1.2-15      data.table_1.11.8 
## [49] lubridate_1.7.4    gower_0.1.2        assertthat_0.2.0  
## [52] rmarkdown_1.11     R6_2.3.0           rpart_4.1-13      
## [55] nnet_7.3-12        nlme_3.1-137       compiler_3.4.4

EOF