This script preprocesses the predictors with preProcess = c("center", "scale") before training a random forest model with caret.
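For reference, this is what centering and scaling do in caret; a minimal standalone sketch on the built-in iris data (not the protein data used below):

library(caret)
# Fit a centering/scaling transform on numeric predictors (built-in iris data)
pp <- preProcess(iris[, 1:4], method = c("center", "scale"))
# Apply it: each transformed column now has mean 0 and standard deviation 1
scaled <- predict(pp, iris[, 1:4])
round(colMeans(scaled), 10)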
knitr::opts_chunk$set(echo = TRUE)
Libraries <- c("readr", "doMC", "caret")
# Install any missing packages, then load them
for (p in Libraries) {
  if (!require(p, character.only = TRUE)) {
    install.packages(p)
  }
  library(p, character.only = TRUE)
}
test_harness_paa <- read_csv("test_harness_paa.csv")
## Parsed with column specification:
## cols(
## .default = col_double(),
## Class = col_character(),
## id = col_character()
## )
## See spec(...) for full column specifications.
Convert the Class column (read in as character) to a factor with the 7 protein classes (Prot_Class):
class_factor <- as.factor(test_harness_paa$Class)
typeof(test_harness_paa)
## [1] "list"
class(class_factor)
## [1] "factor"
test_harness_paa <- test_harness_paa[ , -c(2:3)]  # drop columns 2 and 3, which are not used as predictors
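Note that class_factor is built above but never written back into the data frame; train() coerces a character outcome to a factor on its own, so the results below are unaffected, but if the factor were wanted in the data itself, a one-line sketch would be:

# Optional (not done in this script): store the factor outcome in the data frame
test_harness_paa$Class <- as.factor(test_harness_paa$Class)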
set.seed(1000)
index <- createDataPartition(test_harness_paa$Class, p = 0.8, list = FALSE)  # stratified 80/20 split on Class
training_set <- test_harness_paa[ index,]
testing_set <- test_harness_paa[-index,]
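As an optional sanity check (not part of the original script), the class proportions in the two splits should be nearly identical, because createDataPartition samples within each class:

# Class proportions should match closely between the 80% and 20% splits
round(prop.table(table(training_set$Class)), 3)
round(prop.table(table(testing_set$Class)), 3)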
registerDoMC(cores=3)
start_time <- Sys.time() # Start timer
# Setting the parameters for model preprocessing and tuning from the caret package:
fitControl <- trainControl(method = "repeatedcv",
                           number = 10,             # 10-fold cross-validation
                           repeats = 5,             # repeated five times
                           verboseIter = FALSE,
                           preProcOptions = "pca",  # note: this does not enable PCA (see the note after this chunk)
                           allowParallel = TRUE)    # with parallel backend
RF_model <- train(Class ~ .,
                  data = training_set,
                  method = "rf",
                  preProcess = c("center", "scale"),
                  trControl = fitControl)
end_time <- Sys.time() # End timer
end_time - start_time # Display time
## Time difference of 3.837104 mins
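A note on preProcOptions: caret expects a list of options for preProcess() there (for example a PCA variance threshold), so the string "pca" above has no PCA effect, and the model summary below confirms that only centering and scaling were applied. If PCA were actually wanted, a hypothetical variant (not run here) would list it in preProcess instead:

# Hypothetical variant: add PCA after centering and scaling,
# keeping enough components to explain 95% of the variance
fitControl_pca <- trainControl(method = "repeatedcv", number = 10, repeats = 5,
                               preProcOptions = list(thresh = 0.95),
                               allowParallel = TRUE)
RF_model_pca <- train(Class ~ ., data = training_set, method = "rf",
                      preProcess = c("center", "scale", "pca"),
                      trControl = fitControl_pca)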
RF_model
## Random Forest
##
## 2800 samples
## 20 predictors
## 7 classes: 'Ctrl', 'Ery', 'Hcy', 'Hgb', 'Hhe', 'Lgb', 'Mgb'
##
## Pre-processing: centered (20), scaled (20)
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 2520, 2520, 2520, 2520, 2520, 2520, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9086429 0.8934167
## 11 0.8908571 0.8726667
## 20 0.8712143 0.8497500
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
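caret tuned mtry over three default values (2, 11, 20) and kept mtry = 2; a finer grid can be supplied explicitly with tuneGrid. A sketch, with the grid values chosen here only for illustration:

# Hypothetical finer tuning grid for mtry
rf_grid <- expand.grid(mtry = c(2, 4, 6, 8, 10))
RF_model_grid <- train(Class ~ ., data = training_set, method = "rf",
                       preProcess = c("center", "scale"),
                       tuneGrid = rf_grid,
                       trControl = fitControl)
plot(RF_model_grid)   # resampled accuracy versus mtry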
RF_model$finalModel
##
## Call:
## randomForest(x = x, y = y, mtry = param$mtry)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 8.86%
## Confusion matrix:
## Ctrl Ery Hcy Hgb Hhe Lgb Mgb class.error
## Ctrl 361 2 13 15 7 0 2 0.0975
## Ery 1 381 2 2 10 0 4 0.0475
## Hcy 5 0 366 18 10 1 0 0.0850
## Hgb 13 17 9 316 39 4 2 0.2100
## Hhe 2 3 8 18 367 1 1 0.0825
## Lgb 0 0 0 1 0 398 1 0.0050
## Mgb 2 9 4 13 9 0 363 0.0925
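The OOB confusion matrix shows Hgb as the hardest class (21% error). To see which of the 20 predictors drive the splits, variable importance can be inspected; a short optional sketch:

# Importance scores scaled to 0-100 by caret
varImp(RF_model)
plot(varImp(RF_model), top = 10)   # ten most important predictors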
# Hold-out evaluation on the testing set (left commented out; not run in this report):
# testing <- as.factor(testing_set$Class)
# predictRF <- predict(RF_model, newdata = testing_set)
# confusionMatrix(predictRF, testing)
# levels(predictRF)
# levels(testing)
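If that block were run, the resulting confusionMatrix object would hold the hold-out accuracy and per-class statistics; a sketch of pulling them out (not executed here, so no results are shown):

cm <- confusionMatrix(predict(RF_model, newdata = testing_set),
                      as.factor(testing_set$Class))
cm$overall["Accuracy"]                           # overall hold-out accuracy
cm$byClass[, c("Sensitivity", "Specificity")]    # per-class performance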
Machine Settings:
Sys.info()[c(1:3,5)]
## sysname
## "Linux"
## release
## "4.15.0-46-generic"
## version
## "#49~16.04.1-Ubuntu SMP Tue Feb 12 17:45:24 UTC 2019"
## machine
## "x86_64"
sessionInfo()
## R version 3.4.4 (2018-03-15)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Linux Mint 18.3
##
## Matrix products: default
## BLAS: /usr/lib/libblas/libblas.so.3.6.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.6.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] caret_6.0-81 ggplot2_3.1.0 lattice_0.20-38 doMC_1.3.5
## [5] iterators_1.0.10 foreach_1.4.4 readr_1.3.1
##
## loaded via a namespace (and not attached):
## [1] tidyselect_0.2.5 xfun_0.4 purrr_0.2.5
## [4] reshape2_1.4.3 splines_3.4.4 colorspace_1.3-2
## [7] generics_0.0.2 stats4_3.4.4 htmltools_0.3.6
## [10] yaml_2.2.0 prodlim_2018.04.18 survival_2.43-3
## [13] rlang_0.3.0.1 e1071_1.7-0.1 ModelMetrics_1.2.2
## [16] pillar_1.3.1 glue_1.3.0 withr_2.1.2
## [19] bindrcpp_0.2.2 bindr_0.1.1 plyr_1.8.4
## [22] lava_1.6.4 stringr_1.3.1 timeDate_3043.102
## [25] munsell_0.5.0 gtable_0.2.0 recipes_0.1.4
## [28] codetools_0.2-16 evaluate_0.12 knitr_1.21
## [31] class_7.3-14 Rcpp_1.0.0 scales_1.0.0
## [34] ipred_0.9-8 hms_0.4.2 digest_0.6.18
## [37] stringi_1.2.4 dplyr_0.7.8 grid_3.4.4
## [40] tools_3.4.4 magrittr_1.5 lazyeval_0.2.1
## [43] tibble_1.4.2 randomForest_4.6-14 crayon_1.3.4
## [46] pkgconfig_2.0.2 MASS_7.3-51.1 Matrix_1.2-15
## [49] data.table_1.11.8 lubridate_1.7.4 gower_0.1.2
## [52] assertthat_0.2.0 rmarkdown_1.11 R6_2.3.0
## [55] rpart_4.1-13 nnet_7.3-12 nlme_3.1-137
## [58] compiler_3.4.4