This goal of this R markdown document is to explore the data derived from a larger set of proteins. The data set consists of 13562 prote ins. The final goal of this work is to determine what impact machine-leanring on the classification methods such as Random-Forest and CART.
library(corrplot) # produces correlation plots of variables## corrplot 0.84 loaded
library(caret) #For Random Forest & CART & partioning of data## Loading required package: lattice
## Loading required package: ggplot2
library(rpart) #For CART
library(rpart.plot)
library(rattle) #To plot decission plots## Rattle: A free graphical interface for data science with R.
## Version 5.1.0 Copyright (c) 2006-2017 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
setwd("~/Dropbox/git_projects/random_forest/2_single_aa")
total_aa <- read.csv("~/Dropbox/git_projects/random_forest/2_single_aa/total_aa.csv")
# View(total_aa)
# View the top few rows of the data in R console
# head(total_aa,5)set.seed(1234)
# Loading the Caret package (above) allows us to partition the data
# using 'createDataPartition'.
# We use the dataset to create a partition (80% training 20% testing)
index <- createDataPartition(total_aa$Class, p = 0.80, list = FALSE)
# select 80% of data to train the models
trainset <- total_aa[ index,]
# select 20% of the data for testing
testset <- total_aa[-index,]
dim(testset)## [1] 3390 22
# Dimensions of the data
dim(trainset)## [1] 13562 22
trainset$Class <- factor(trainset$Class)
levels(trainset$Class) <- c("erythrocruorin", "hemerythrin",
"hemocyanin","hemoglobin",
"homo_sapiens", "leghemoglobin",
"myoglobin")
levels(trainset$Class)## [1] "erythrocruorin" "hemerythrin" "hemocyanin" "hemoglobin"
## [5] "homo_sapiens" "leghemoglobin" "myoglobin"
str(trainset)## 'data.frame': 13562 obs. of 22 variables:
## $ Class : Factor w/ 7 levels "erythrocruorin",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ TotalAA: int 781 2351 2527 920 34350 1002 2009 1863 431 480 ...
## $ G : num 0.0717 0.0549 0.0471 0.1043 0.0601 ...
## $ P : num 0.0448 0.0544 0.0372 0.0804 0.0733 ...
## $ A : num 0.1012 0.0468 0.0503 0.088 0.0607 ...
## $ V : num 0.0691 0.0532 0.0633 0.0435 0.0927 ...
## $ L : num 0.1204 0.0944 0.1409 0.0946 0.0616 ...
## $ I : num 0.0423 0.0468 0.0633 0.025 0.06 ...
## $ M : num 0.0384 0.0259 0.0277 0.0239 0.0116 ...
## $ C : num 0.0141 0.0111 0.0253 0.0293 0.0149 ...
## $ F : num 0.0166 0.0464 0.038 0.0304 0.0264 ...
## $ Y : num 0.0218 0.0336 0.0214 0.0359 0.0291 ...
## $ W : num 0.00896 0.01574 0.01029 0.0087 0.01357 ...
## $ H : num 0.0346 0.0319 0.0313 0.0207 0.0139 ...
## $ K : num 0.0333 0.0672 0.0696 0.0435 0.0857 ...
## $ R : num 0.0499 0.0442 0.0416 0.0446 0.0477 ...
## $ Q : num 0.0627 0.0468 0.0416 0.0772 0.0274 ...
## $ N : num 0.0397 0.0532 0.0507 0.0217 0.0323 ...
## $ E : num 0.0615 0.063 0.0728 0.0598 0.093 ...
## $ D : num 0.0538 0.0515 0.0487 0.0402 0.0501 ...
## $ S : num 0.0538 0.0932 0.0811 0.088 0.0717 ...
## $ T : num 0.0615 0.0659 0.038 0.0402 0.0741 ...
tail(trainset)## Class TotalAA G P A V
## 16946 myoglobin 142 0.03521127 0.04929577 0.11971830 0.06338028
## 16948 myoglobin 156 0.03846154 0.03846154 0.06410256 0.05769231
## 16949 myoglobin 147 0.08843537 0.03401361 0.09523810 0.09523810
## 16950 myoglobin 143 0.06993007 0.04895105 0.10489510 0.04895105
## 16951 myoglobin 146 0.09589041 0.03424658 0.09589041 0.12328770
## 16952 myoglobin 141 0.10638300 0.04255319 0.06382979 0.09929078
## L I M C F Y
## 16946 0.1408451 0.007042254 0.021126760 0.014084510 0.05633803 0.02816901
## 16948 0.1217949 0.032051280 0.025641030 0.012820510 0.05128205 0.02564103
## 16949 0.1156463 0.040816330 0.020408160 0.013605440 0.04761905 0.02040816
## 16950 0.1608392 0.006993007 0.013986010 0.013986010 0.05594406 0.02097902
## 16951 0.1301370 0.000000000 0.006849315 0.006849315 0.05479452 0.02054795
## 16952 0.1063830 0.014184400 0.021276600 0.021276600 0.04964539 0.02127660
## W H K R Q N
## 16946 0.014084510 0.06338028 0.05633803 0.03521127 0.04225352 0.01408451
## 16948 0.019230770 0.05128205 0.06410256 0.07692308 0.03205128 0.01923077
## 16949 0.013605440 0.06122449 0.08163265 0.01360544 0.02721088 0.04761905
## 16950 0.006993007 0.06293706 0.04195804 0.03496503 0.04195804 0.02097902
## 16951 0.013698630 0.06164384 0.07534247 0.02739726 0.02739726 0.04794521
## 16952 0.014184400 0.03546099 0.09929078 0.02127660 0.02836879 0.04255319
## E D S T
## 16946 0.02112676 0.07042254 0.09154930 0.05633803
## 16948 0.05769231 0.05128205 0.10897440 0.05128205
## 16949 0.04081633 0.04761905 0.06122449 0.03401361
## 16950 0.02097902 0.06993007 0.09090909 0.06293706
## 16951 0.06849315 0.04794521 0.04109589 0.02054795
## 16952 0.06382979 0.04964539 0.05673759 0.04255319
summary(trainset)## Class TotalAA G
## erythrocruorin:13065 Min. : 4 Min. :0.00000
## hemerythrin : 13 1st Qu.: 254 1st Qu.:0.05056
## hemocyanin : 7 Median : 434 Median :0.06475
## hemoglobin : 55 Mean : 595 Mean :0.06767
## homo_sapiens : 7 3rd Qu.: 708 3rd Qu.:0.08045
## leghemoglobin : 109 Max. :34350 Max. :0.46474
## myoglobin : 306
## P A V L
## Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.04131 1st Qu.:0.05512 1st Qu.:0.04851 1st Qu.:0.08068
## Median :0.05464 Median :0.06977 Median :0.06028 Median :0.09783
## Mean :0.06094 Mean :0.07343 Mean :0.06145 Mean :0.09907
## 3rd Qu.:0.07355 3rd Qu.:0.08755 3rd Qu.:0.07269 3rd Qu.:0.11679
## Max. :0.38060 Max. :0.40000 Max. :0.33333 Max. :0.32323
##
## I M C F
## Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.02899 1st Qu.:0.01523 1st Qu.:0.01177 1st Qu.:0.02641
## Median :0.04240 Median :0.02113 Median :0.01880 Median :0.03608
## Mean :0.04284 Mean :0.02244 Mean :0.02290 Mean :0.03756
## 3rd Qu.:0.05556 3rd Qu.:0.02778 3rd Qu.:0.02761 3rd Qu.:0.04724
## Max. :0.21538 Max. :0.13836 Max. :0.36747 Max. :0.17391
##
## Y W H K
## Min. :0.00000 Min. :0.000000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.01842 1st Qu.:0.006613 1st Qu.:0.01770 1st Qu.:0.03968
## Median :0.02651 Median :0.011494 Median :0.02439 Median :0.05614
## Mean :0.02765 Mean :0.012832 Mean :0.02640 Mean :0.05891
## 3rd Qu.:0.03541 3rd Qu.:0.017337 3rd Qu.:0.03192 3rd Qu.:0.07445
## Max. :0.24194 Max. :0.232877 Max. :0.30000 Max. :0.31250
##
## R Q N E
## Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.04263 1st Qu.:0.03350 1st Qu.:0.02462 1st Qu.:0.05164
## Median :0.05482 Median :0.04356 Median :0.03487 Median :0.06655
## Mean :0.05750 Mean :0.04562 Mean :0.03547 Mean :0.06897
## 3rd Qu.:0.06925 3rd Qu.:0.05458 3rd Qu.:0.04505 3rd Qu.:0.08295
## Max. :0.47059 Max. :0.98750 Max. :0.14286 Max. :0.31532
##
## D S T
## Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.03774 1st Qu.:0.06106 1st Qu.:0.04154
## Median :0.04792 Median :0.07514 Median :0.05111
## Mean :0.04810 Mean :0.07779 Mean :0.05231
## 3rd Qu.:0.05758 3rd Qu.:0.09146 3rd Qu.:0.06111
## Max. :0.20000 Max. :0.41660 Max. :0.34949
##
par(mfrow=c(1,5))
for(i in 2:22) {
boxplot(trainset[,i], main = names(trainset)[i])
}# Reduce total_aa data set to amino acids only
corrSet <- total_aa[,2:22]
#str(corrSet)
corrMatrix <- cor(corrSet) # Calculate correlation coefficients
#corrMatrix
# Determine Max and Min
#which(abs(corrMatrix) > 0.4, arr.ind = T)
#Check for corrlation coefficients |corrMatrix(i,j)| > 0.4
MinCorrelation <- min(corrMatrix)
MinCorrelation## [1] -0.3673635
MaxCorrelation <- corrMatrix[ corrMatrix > 0.35 & corrMatrix < 1 ]
MaxCorrelation## [1] 0.3607724 0.3607724
which(corrMatrix < -0.36)## [1] 49 129
which(corrMatrix > 0.35 & corrMatrix < 1) ## [1] 143 343
corrplot(corrMatrix, method = "color", type = "upper")Parallel processing in caret can be accomplished with the parallel and doParallel packages. The following code loads the required libraries (note, these libraries also depend on the iterators and foreach libraries).
library(parallel)
library(doParallel)## Loading required package: foreach
## Loading required package: iterators
cluster <- makeCluster(detectCores() - 1)
# leave 1 core for OS
registerDoParallel(cluster)set.seed(1234)
# Time
start_RF <- Sys.time()
# Train model with CART algorithm
DecTreeModel <- factor(trainset$Class,
levels = 0:6,
labels = c("erythrocruorin", "hemerythrin",
"hemocyanin","hemoglobin",
"homo_sapiens", "leghemoglobin",
"myoglobin"))
DecTreeModel <- rpart(Class ~ ., data = trainset, method = 'class')
# Time
end_RF <- Sys.time()
RF_time = end_RF - start_RF
RF_time## Time difference of 1.458672 secs
After processing the data, we explicitly shut down the cluster by calling the stopCluster() function.
stopCluster(cluster)# Let's check the model
DecTreeModel## n= 13562
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 13562 497 erythrocruorin (0.96 0.00096 0.00052 0.0041 0.00052 0.008 0.023)
## 2) R>=0.02130497 12966 165 erythrocruorin (0.99 0.00077 0.00046 0.0029 0 0.0018 0.0068)
## 4) V< 0.1087696 12787 99 erythrocruorin (0.99 0.00078 0.00047 0.0027 0 0.0017 0.0021)
## 8) M>=0.000514668 12766 83 erythrocruorin (0.99 0.00047 0.00047 0.0018 0 0.0017 0.002) *
## 9) M< 0.000514668 21 10 hemoglobin (0.24 0.19 0 0.52 0 0 0.048) *
## 5) V>=0.1087696 179 66 erythrocruorin (0.63 0 0 0.022 0 0.0056 0.34)
## 10) I>=0.01414293 108 2 erythrocruorin (0.98 0 0 0.0093 0 0.0093 0) *
## 11) I< 0.01414293 71 10 myoglobin (0.099 0 0 0.042 0 0 0.86)
## 22) Q>=0.0361354 7 1 erythrocruorin (0.86 0 0 0.14 0 0 0) *
## 23) Q< 0.0361354 64 3 myoglobin (0.016 0 0 0.031 0 0 0.95) *
## 3) R< 0.02130497 596 332 erythrocruorin (0.44 0.005 0.0017 0.029 0.012 0.14 0.37)
## 6) H< 0.0452498 297 44 erythrocruorin (0.85 0.01 0.0034 0.037 0.024 0.047 0.027)
## 12) TotalAA>=34.5 281 31 erythrocruorin (0.89 0.011 0.0036 0 0.025 0.05 0.021) *
## 13) TotalAA< 34.5 16 5 hemoglobin (0.19 0 0 0.69 0 0 0.12) *
## 7) H>=0.0452498 299 89 myoglobin (0.037 0 0 0.02 0 0.24 0.7)
## 14) V< 0.06331586 81 10 leghemoglobin (0.099 0 0 0.012 0 0.88 0.012)
## 28) K< 0.1127894 10 3 erythrocruorin (0.7 0 0 0.1 0 0.1 0.1) *
## 29) K>=0.1127894 71 1 leghemoglobin (0.014 0 0 0 0 0.99 0) *
## 15) V>=0.06331586 218 9 myoglobin (0.014 0 0 0.023 0 0.0046 0.96) *
rattle::fancyRpartPlot(DecTreeModel, main="CART model")drawTreeNodes(DecTreeModel, cex = 1, pch = par(19),
size = 3, col = NULL, nodeinfo = FALSE,
units = "", cases = "obs", digits = getOption("digits"),
decimals = 2, print.levels = TRUE, new = TRUE) rpart.plot(DecTreeModel, extra = "auto",
box.palette = "auto", branch.lty = 3)sessionInfo()## R version 3.4.2 (2017-09-28)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Linux Mint 18.2
##
## Matrix products: default
## BLAS: /usr/lib/libblas/libblas.so.3.6.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.6.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] doParallel_1.0.11 iterators_1.0.9 foreach_1.4.4
## [4] rattle_5.1.0 rpart.plot_2.1.2 rpart_4.1-11
## [7] caret_6.0-78 ggplot2_2.2.1 lattice_0.20-35
## [10] corrplot_0.84
##
## loaded via a namespace (and not attached):
## [1] Rcpp_0.12.14 lubridate_1.7.1 RGtk2_2.20.33
## [4] tidyr_0.7.2 class_7.3-14 assertthat_0.2.0
## [7] rprojroot_1.2 digest_0.6.13 ipred_0.9-6
## [10] psych_1.7.8 R6_2.2.2 plyr_1.8.4
## [13] backports_1.1.2 stats4_3.4.2 evaluate_0.10.1
## [16] rlang_0.1.4 lazyeval_0.2.1 kernlab_0.9-25
## [19] Matrix_1.2-12 rmarkdown_1.8 splines_3.4.2
## [22] CVST_0.2-1 ddalpha_1.3.1 gower_0.1.2
## [25] stringr_1.2.0 foreign_0.8-69 munsell_0.4.3
## [28] broom_0.4.3 compiler_3.4.2 pkgconfig_2.0.1
## [31] mnormt_1.5-5 dimRed_0.1.0 htmltools_0.3.6
## [34] nnet_7.3-12 tidyselect_0.2.3 tibble_1.3.4
## [37] prodlim_1.6.1 DRR_0.0.2 codetools_0.2-15
## [40] RcppRoll_0.2.2 dplyr_0.7.4 withr_2.1.0
## [43] MASS_7.3-47 recipes_0.1.1 ModelMetrics_1.1.0
## [46] grid_3.4.2 nlme_3.1-131 gtable_0.2.0
## [49] magrittr_1.5 scales_0.5.0 stringi_1.1.6
## [52] reshape2_1.4.3 bindrcpp_0.2 timeDate_3042.101
## [55] robustbase_0.92-8 lava_1.5.1 RColorBrewer_1.1-2
## [58] tools_3.4.2 glue_1.2.0 DEoptimR_1.0-8
## [61] purrr_0.2.4 sfsmisc_1.1-1 survival_2.41-3
## [64] yaml_2.1.16 colorspace_1.3-2 knitr_1.17
## [67] bindr_0.1