cache <- TRUE
knitr::opts_chunk$set(echo = TRUE)

# Packages attached for this analysis. "tidyverse" is included because the
# plotting helper further down (my_plot) uses the pipe plus dplyr/tidyr verbs,
# which the other four packages do not attach.
Libraries <- c("readr", "doMC", "caret", "randomForest", "tidyverse")

# Install any package that is not yet available, then attach it.
# requireNamespace() only checks availability (no attach, no warning), so
# library() below is the single place where a package gets attached and where
# a failed installation surfaces as an error.
for (p in Libraries) {
  if (!requireNamespace(p, quietly = TRUE)) {
    install.packages(p)
  }
  library(p, character.only = TRUE)
}
cache <- TRUE

# Directory holding the input data. The file is read through an explicit path
# instead of setwd(): changing the working directory is a global side effect
# (and knitr resets it after each chunk anyway).
data_dir <- path.expand("~/Dropbox/Oxy-RF/4_RF_Tuning_w_mtry")

# Import the data set, dropping the TotalAA and id columns at read time.
test_harness_paa <- read_csv(
  file.path(data_dir, "test_harness_paa.csv"),
  col_types = cols(
    TotalAA = col_skip(),
    id = col_skip()
  )
)
# Convert Class (numerical) to a factor of the 7 protein classes (Prot_Class)
cache <- TRUE

# Convert the Class column to a factor.
class_factor <- as.factor(test_harness_paa$Class)

# Store the factor back in the data frame. Previously class_factor was created
# but never used, so the split and the model fit below received the
# unconverted column and relied on implicit coercion.
test_harness_paa$Class <- class_factor

# Quick inspection of the container type and of the converted column.
typeof(test_harness_paa)
## [1] "list"
class(class_factor)
## [1] "factor"
cache <- TRUE
set.seed(1000)

# Draw the row indices of the training portion (80% of the data), using Class
# as the outcome passed to createDataPartition(); the remaining rows form the
# held-out test portion.
index <- createDataPartition(
  y = test_harness_paa$Class,
  p = 0.8,
  list = FALSE
)
training_set <- test_harness_paa[index, ]
testing_set <- test_harness_paa[-index, ]

# Centering and scaling parameters are estimated on the training rows only and
# then applied unchanged to both portions.
preProcValues <- preProcess(
  x = training_set,
  method = c("center", "scale")
)
train_transformed <- predict(preProcValues, newdata = training_set)
test_transformed <- predict(preProcValues, newdata = testing_set)
cache <- TRUE
set.seed(1000)

# Resampling set-up handed to train(): 10-fold cross-validation repeated
# 5 times, multi-class summary metrics, progress output, and permission to run
# resamples on a registered parallel backend.
train_control <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 5,
  summaryFunction = multiClassSummary,
  allowParallel = TRUE,
  verboseIter = TRUE
)
cache <- TRUE
set.seed(1000)

# Register the doMC parallel backend with 3 cores.
registerDoMC(cores = 3)

start_time <- Sys.time() # Start timer

# Candidate values of mtry to evaluate: 1, 2, 3 and 4.
my_grid1 <- expand.grid(mtry = seq_len(4))

# Fit a random forest on the centered/scaled training data, tuning mtry over
# the grid above and choosing the best value by resampled Accuracy.
rf1 <- train(
  Class ~ .,
  data = train_transformed,
  method = "rf",
  metric = "Accuracy",
  tuneGrid = my_grid1,
  trControl = train_control
)
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 1 on full training set
end_time <- Sys.time() # End timer

# Elapsed wall-clock time for grid construction plus model tuning.
end_time - start_time # Display time
## Time difference of 3.787912 mins

# Print the tuning summary.
rf1
## Random Forest
##
## 2800 samples
## 20 predictors
## 7 classes: 'Ctrl', 'Ery', 'Hcy', 'Hgb', 'Hhe', 'Lgb', 'Mgb'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 2520, 2520, 2520, 2520, 2520, 2520, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa Mean_F1 Mean_Sensitivity
## 1 0.9082857 0.8930000 0.9081804 0.9082857
## 2 0.9082143 0.8929167 0.9078751 0.9082143
## 3 0.9067857 0.8912500 0.9064189 0.9067857
## 4 0.9043571 0.8884167 0.9039998 0.9043571
## Mean_Specificity Mean_Pos_Pred_Value Mean_Neg_Pred_Value
## 0.9847143 0.9116104 0.9848154
## 0.9847024 0.9102942 0.9847951
## 0.9844643 0.9085748 0.9845494
## 0.9840595 0.9063132 0.9841469
## Mean_Precision Mean_Recall Mean_Detection_Rate Mean_Balanced_Accuracy
## 0.9116104 0.9082857 0.1297551 0.9465000
## 0.9102942 0.9082143 0.1297449 0.9464583
## 0.9085748 0.9067857 0.1295408 0.9456250
## 0.9063132 0.9043571 0.1291939 0.9442083
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 1.
cache <- TRUE

#' Plot resampled performance metrics against mtry
#'
#' Reshapes the `results` table of a fitted caret model to long form and draws
#' one line-and-point panel per metric, each panel with its own axis scales.
#'
#' Calls are namespace-qualified so the function works without dplyr, tidyr or
#' magrittr being attached, and the minimal theme is added to the returned plot
#' rather than installed globally with `theme_set()`.
#'
#' @param model A fitted model object whose `results` element is a data frame
#'   containing an `mtry` column and the requested metric columns.
#' @param metrics Character vector naming the columns of `model$results` to
#'   plot. Defaults to the ten metrics this function has always shown.
#' @return A ggplot object.
my_plot <- function(model,
                    metrics = c("Accuracy", "Kappa", "Mean_F1",
                                "Mean_Sensitivity", "Mean_Specificity",
                                "Mean_Pos_Pred_Value", "Mean_Neg_Pred_Value",
                                "Mean_Precision", "Mean_Recall",
                                "Mean_Detection_Rate")) {
  results <- as.data.frame(model$results)
  wanted <- c("mtry", metrics)

  # Fail early with a readable message when a requested column is absent.
  absent <- setdiff(wanted, names(results))
  if (length(absent) > 0) {
    stop("model$results lacks column(s): ", paste(absent, collapse = ", "),
         call. = FALSE)
  }

  # Long form: one row per (mtry, metric) pair; `a` holds the metric name and
  # `b` its value.
  long <- tidyr::gather(results[, wanted, drop = FALSE],
                        key = "a", value = "b", -mtry)

  ggplot2::ggplot(long, ggplot2::aes(mtry, b)) +
    ggplot2::geom_line() +
    ggplot2::geom_point() +
    ggplot2::facet_wrap(~ a, scales = "free") +
    ggplot2::labs(
      x = "Number of mtry", y = NULL,
      title = "The Relationship between Model Performance and mtry"
    ) +
    ggplot2::theme_minimal()
}

my_plot(rf1)
# Machine settings:
# Report the operating-system fields sysname, release, version and machine,
# selected by name rather than by position.
Sys.info()[c("sysname", "release", "version", "machine")]
## sysname
## "Linux"
## release
## "4.15.0-46-generic"
## version
## "#49~16.04.1-Ubuntu SMP Tue Feb 12 17:45:24 UTC 2019"
## machine
## "x86_64"

# Record the R version and the attached/loaded packages of this session.
sessionInfo()
## R version 3.4.4 (2018-03-15)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Linux Mint 18.3
##
## Matrix products: default
## BLAS: /usr/lib/libblas/libblas.so.3.6.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.6.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] forcats_0.3.0 stringr_1.3.1 purrr_0.2.5
## [4] tidyr_0.8.2 tibble_1.4.2 tidyverse_1.2.1
## [7] dplyr_0.7.8 magrittr_1.5 randomForest_4.6-14
## [10] caret_6.0-81 ggplot2_3.1.0 lattice_0.20-38
## [13] doMC_1.3.5 iterators_1.0.10 foreach_1.4.4
## [16] readr_1.3.1
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.0 lubridate_1.7.4 class_7.3-14
## [4] assertthat_0.2.0 digest_0.6.18 ipred_0.9-8
## [7] cellranger_1.1.0 R6_2.3.0 plyr_1.8.4
## [10] backports_1.1.3 stats4_3.4.4 evaluate_0.12
## [13] e1071_1.7-0.1 httr_1.4.0 pillar_1.3.1
## [16] rlang_0.3.0.1 readxl_1.2.0 lazyeval_0.2.1
## [19] rstudioapi_0.8 data.table_1.11.8 rpart_4.1-13
## [22] Matrix_1.2-15 rmarkdown_1.11 labeling_0.3
## [25] splines_3.4.4 gower_0.1.2 munsell_0.5.0
## [28] broom_0.5.1 modelr_0.1.2 compiler_3.4.4
## [31] xfun_0.4 pkgconfig_2.0.2 htmltools_0.3.6
## [34] nnet_7.3-12 tidyselect_0.2.5 prodlim_2018.04.18
## [37] codetools_0.2-16 crayon_1.3.4 withr_2.1.2
## [40] MASS_7.3-51.1 recipes_0.1.4 ModelMetrics_1.2.2
## [43] grid_3.4.4 jsonlite_1.6 nlme_3.1-137
## [46] gtable_0.2.0 scales_1.0.0 cli_1.0.1
## [49] stringi_1.2.4 reshape2_1.4.3 bindrcpp_0.2.2
## [52] timeDate_3043.102 xml2_1.2.0 generics_0.0.2
## [55] lava_1.6.4 tools_3.4.4 glue_1.3.0
## [58] hms_0.4.2 survival_2.43-3 yaml_2.2.0
## [61] colorspace_1.3-2 rvest_0.3.2 haven_2.0.0
## [64] knitr_1.21 bindr_0.1.1
EOF