Use of Machine Learning (CART) wrt Oxygen-Binding Proteins

This goal of this R markdown document is to explore the data derived from a larger set of proteins. The data set consists of 13562 prote ins. The final goal of this work is to determine what impact machine-leanring on the classification methods such as Random-Forest and CART.

Load Libraries

library(corrplot) # produces correlation plots of variables
## corrplot 0.84 loaded
library(caret) #For Random Forest & CART & partioning of data
## Loading required package: lattice

## Loading required package: ggplot2
library(rpart) #For CART
library(rpart.plot)
library(rattle) #To plot decission plots
## Rattle: A free graphical interface for data science with R.
## Version 5.1.0 Copyright (c) 2006-2017 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.

Load / Inspect Data

setwd("~/Dropbox/git_projects/random_forest/2_single_aa")
total_aa <- read.csv("~/Dropbox/git_projects/random_forest/2_single_aa/total_aa.csv")
# View(total_aa)

# View the top few rows of the data in R console
# head(total_aa,5)

Partition data into 80/20: trainset & testset

Dimensions of the testset

set.seed(1234)
# Loading the Caret package (above) allows us to partition the data
# using 'createDataPartition'.
# We use the dataset to create a partition (80% training 20% testing)
index <- createDataPartition(total_aa$Class, p = 0.80, list = FALSE)
# select 80% of data to train the models
trainset <- total_aa[ index,]
# select 20% of the data for testing
testset <-  total_aa[-index,]

dim(testset)
## [1] 3390   22

Dimensions of the trainset

# Dimensions of the data
dim(trainset)
## [1] 13562    22

Structure of data, Convert Class to Factor w/ 7 Levels

trainset$Class <- factor(trainset$Class)
levels(trainset$Class) <- c("erythrocruorin", "hemerythrin",
                            "hemocyanin","hemoglobin", 
                            "homo_sapiens", "leghemoglobin",
                            "myoglobin")

levels(trainset$Class)
## [1] "erythrocruorin" "hemerythrin"    "hemocyanin"     "hemoglobin"    
## [5] "homo_sapiens"   "leghemoglobin"  "myoglobin"
str(trainset)
## 'data.frame':    13562 obs. of  22 variables:
##  $ Class  : Factor w/ 7 levels "erythrocruorin",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ TotalAA: int  781 2351 2527 920 34350 1002 2009 1863 431 480 ...
##  $ G      : num  0.0717 0.0549 0.0471 0.1043 0.0601 ...
##  $ P      : num  0.0448 0.0544 0.0372 0.0804 0.0733 ...
##  $ A      : num  0.1012 0.0468 0.0503 0.088 0.0607 ...
##  $ V      : num  0.0691 0.0532 0.0633 0.0435 0.0927 ...
##  $ L      : num  0.1204 0.0944 0.1409 0.0946 0.0616 ...
##  $ I      : num  0.0423 0.0468 0.0633 0.025 0.06 ...
##  $ M      : num  0.0384 0.0259 0.0277 0.0239 0.0116 ...
##  $ C      : num  0.0141 0.0111 0.0253 0.0293 0.0149 ...
##  $ F      : num  0.0166 0.0464 0.038 0.0304 0.0264 ...
##  $ Y      : num  0.0218 0.0336 0.0214 0.0359 0.0291 ...
##  $ W      : num  0.00896 0.01574 0.01029 0.0087 0.01357 ...
##  $ H      : num  0.0346 0.0319 0.0313 0.0207 0.0139 ...
##  $ K      : num  0.0333 0.0672 0.0696 0.0435 0.0857 ...
##  $ R      : num  0.0499 0.0442 0.0416 0.0446 0.0477 ...
##  $ Q      : num  0.0627 0.0468 0.0416 0.0772 0.0274 ...
##  $ N      : num  0.0397 0.0532 0.0507 0.0217 0.0323 ...
##  $ E      : num  0.0615 0.063 0.0728 0.0598 0.093 ...
##  $ D      : num  0.0538 0.0515 0.0487 0.0402 0.0501 ...
##  $ S      : num  0.0538 0.0932 0.0811 0.088 0.0717 ...
##  $ T      : num  0.0615 0.0659 0.038 0.0402 0.0741 ...
tail(trainset)
##           Class TotalAA          G          P          A          V
## 16946 myoglobin     142 0.03521127 0.04929577 0.11971830 0.06338028
## 16948 myoglobin     156 0.03846154 0.03846154 0.06410256 0.05769231
## 16949 myoglobin     147 0.08843537 0.03401361 0.09523810 0.09523810
## 16950 myoglobin     143 0.06993007 0.04895105 0.10489510 0.04895105
## 16951 myoglobin     146 0.09589041 0.03424658 0.09589041 0.12328770
## 16952 myoglobin     141 0.10638300 0.04255319 0.06382979 0.09929078
##               L           I           M           C          F          Y
## 16946 0.1408451 0.007042254 0.021126760 0.014084510 0.05633803 0.02816901
## 16948 0.1217949 0.032051280 0.025641030 0.012820510 0.05128205 0.02564103
## 16949 0.1156463 0.040816330 0.020408160 0.013605440 0.04761905 0.02040816
## 16950 0.1608392 0.006993007 0.013986010 0.013986010 0.05594406 0.02097902
## 16951 0.1301370 0.000000000 0.006849315 0.006849315 0.05479452 0.02054795
## 16952 0.1063830 0.014184400 0.021276600 0.021276600 0.04964539 0.02127660
##                 W          H          K          R          Q          N
## 16946 0.014084510 0.06338028 0.05633803 0.03521127 0.04225352 0.01408451
## 16948 0.019230770 0.05128205 0.06410256 0.07692308 0.03205128 0.01923077
## 16949 0.013605440 0.06122449 0.08163265 0.01360544 0.02721088 0.04761905
## 16950 0.006993007 0.06293706 0.04195804 0.03496503 0.04195804 0.02097902
## 16951 0.013698630 0.06164384 0.07534247 0.02739726 0.02739726 0.04794521
## 16952 0.014184400 0.03546099 0.09929078 0.02127660 0.02836879 0.04255319
##                E          D          S          T
## 16946 0.02112676 0.07042254 0.09154930 0.05633803
## 16948 0.05769231 0.05128205 0.10897440 0.05128205
## 16949 0.04081633 0.04761905 0.06122449 0.03401361
## 16950 0.02097902 0.06993007 0.09090909 0.06293706
## 16951 0.06849315 0.04794521 0.04109589 0.02054795
## 16952 0.06382979 0.04964539 0.05673759 0.04255319

Five number summary of data variables

summary(trainset)
##             Class          TotalAA            G          
##  erythrocruorin:13065   Min.   :    4   Min.   :0.00000  
##  hemerythrin   :   13   1st Qu.:  254   1st Qu.:0.05056  
##  hemocyanin    :    7   Median :  434   Median :0.06475  
##  hemoglobin    :   55   Mean   :  595   Mean   :0.06767  
##  homo_sapiens  :    7   3rd Qu.:  708   3rd Qu.:0.08045  
##  leghemoglobin :  109   Max.   :34350   Max.   :0.46474  
##  myoglobin     :  306                                    
##        P                 A                 V                 L          
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.04131   1st Qu.:0.05512   1st Qu.:0.04851   1st Qu.:0.08068  
##  Median :0.05464   Median :0.06977   Median :0.06028   Median :0.09783  
##  Mean   :0.06094   Mean   :0.07343   Mean   :0.06145   Mean   :0.09907  
##  3rd Qu.:0.07355   3rd Qu.:0.08755   3rd Qu.:0.07269   3rd Qu.:0.11679  
##  Max.   :0.38060   Max.   :0.40000   Max.   :0.33333   Max.   :0.32323  
##                                                                         
##        I                 M                 C                 F          
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.02899   1st Qu.:0.01523   1st Qu.:0.01177   1st Qu.:0.02641  
##  Median :0.04240   Median :0.02113   Median :0.01880   Median :0.03608  
##  Mean   :0.04284   Mean   :0.02244   Mean   :0.02290   Mean   :0.03756  
##  3rd Qu.:0.05556   3rd Qu.:0.02778   3rd Qu.:0.02761   3rd Qu.:0.04724  
##  Max.   :0.21538   Max.   :0.13836   Max.   :0.36747   Max.   :0.17391  
##                                                                         
##        Y                 W                  H                 K          
##  Min.   :0.00000   Min.   :0.000000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.01842   1st Qu.:0.006613   1st Qu.:0.01770   1st Qu.:0.03968  
##  Median :0.02651   Median :0.011494   Median :0.02439   Median :0.05614  
##  Mean   :0.02765   Mean   :0.012832   Mean   :0.02640   Mean   :0.05891  
##  3rd Qu.:0.03541   3rd Qu.:0.017337   3rd Qu.:0.03192   3rd Qu.:0.07445  
##  Max.   :0.24194   Max.   :0.232877   Max.   :0.30000   Max.   :0.31250  
##                                                                          
##        R                 Q                 N                 E          
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.04263   1st Qu.:0.03350   1st Qu.:0.02462   1st Qu.:0.05164  
##  Median :0.05482   Median :0.04356   Median :0.03487   Median :0.06655  
##  Mean   :0.05750   Mean   :0.04562   Mean   :0.03547   Mean   :0.06897  
##  3rd Qu.:0.06925   3rd Qu.:0.05458   3rd Qu.:0.04505   3rd Qu.:0.08295  
##  Max.   :0.47059   Max.   :0.98750   Max.   :0.14286   Max.   :0.31532  
##                                                                         
##        D                 S                 T          
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.03774   1st Qu.:0.06106   1st Qu.:0.04154  
##  Median :0.04792   Median :0.07514   Median :0.05111  
##  Mean   :0.04810   Mean   :0.07779   Mean   :0.05231  
##  3rd Qu.:0.05758   3rd Qu.:0.09146   3rd Qu.:0.06111  
##  Max.   :0.20000   Max.   :0.41660   Max.   :0.34949  
## 

Box plots to understand how the distribution varies by % Amino Acid

par(mfrow=c(1,5))
for(i in 2:22) {
    boxplot(trainset[,i], main = names(trainset)[i])
    }

Compute the correlation matrix for the 21 variables

# Reduce total_aa data set to amino acids only
corrSet <- total_aa[,2:22]

#str(corrSet)

corrMatrix <- cor(corrSet) # Calculate correlation coefficients
#corrMatrix

# Determine Max and Min
#which(abs(corrMatrix) > 0.4, arr.ind = T) 
#Check for corrlation coefficients |corrMatrix(i,j)| > 0.4

MinCorrelation <- min(corrMatrix)
MinCorrelation
## [1] -0.3673635
MaxCorrelation <- corrMatrix[ corrMatrix > 0.35 & corrMatrix < 1 ]
MaxCorrelation
## [1] 0.3607724 0.3607724

Which samples give min/max correlations

which(corrMatrix < -0.36)
## [1]  49 129
which(corrMatrix > 0.35 & corrMatrix < 1) 
## [1] 143 343

Generate Correlation plot of Amino Acids

corrplot(corrMatrix, method = "color", type = "upper")

Configure parallel processing

Parallel processing in caret can be accomplished with the parallel and doParallel packages. The following code loads the required libraries (note, these libraries also depend on the iterators and foreach libraries).

library(parallel)
library(doParallel)
## Loading required package: foreach

## Loading required package: iterators
cluster <- makeCluster(detectCores() - 1) 
# leave 1 core for OS
registerDoParallel(cluster)

Develop training model

set.seed(1234)
# Time
start_RF <- Sys.time()

# Train model with CART algorithm
DecTreeModel <- factor(trainset$Class, 
                   levels = 0:6, 
                   labels = c("erythrocruorin", "hemerythrin",
                              "hemocyanin","hemoglobin",
                              "homo_sapiens", "leghemoglobin",
                              "myoglobin"))
DecTreeModel <- rpart(Class ~ ., data = trainset, method = 'class')

# Time
end_RF <- Sys.time()
RF_time = end_RF - start_RF
RF_time
## Time difference of 1.458672 secs

De-register parallel processing cluster

After processing the data, we explicitly shut down the cluster by calling the stopCluster() function.

stopCluster(cluster)

Investigate Fit:

# Let's check the model
DecTreeModel
## n= 13562 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 13562 497 erythrocruorin (0.96 0.00096 0.00052 0.0041 0.00052 0.008 0.023)  
##    2) R>=0.02130497 12966 165 erythrocruorin (0.99 0.00077 0.00046 0.0029 0 0.0018 0.0068)  
##      4) V< 0.1087696 12787  99 erythrocruorin (0.99 0.00078 0.00047 0.0027 0 0.0017 0.0021)  
##        8) M>=0.000514668 12766  83 erythrocruorin (0.99 0.00047 0.00047 0.0018 0 0.0017 0.002) *
##        9) M< 0.000514668 21  10 hemoglobin (0.24 0.19 0 0.52 0 0 0.048) *
##      5) V>=0.1087696 179  66 erythrocruorin (0.63 0 0 0.022 0 0.0056 0.34)  
##       10) I>=0.01414293 108   2 erythrocruorin (0.98 0 0 0.0093 0 0.0093 0) *
##       11) I< 0.01414293 71  10 myoglobin (0.099 0 0 0.042 0 0 0.86)  
##         22) Q>=0.0361354 7   1 erythrocruorin (0.86 0 0 0.14 0 0 0) *
##         23) Q< 0.0361354 64   3 myoglobin (0.016 0 0 0.031 0 0 0.95) *
##    3) R< 0.02130497 596 332 erythrocruorin (0.44 0.005 0.0017 0.029 0.012 0.14 0.37)  
##      6) H< 0.0452498 297  44 erythrocruorin (0.85 0.01 0.0034 0.037 0.024 0.047 0.027)  
##       12) TotalAA>=34.5 281  31 erythrocruorin (0.89 0.011 0.0036 0 0.025 0.05 0.021) *
##       13) TotalAA< 34.5 16   5 hemoglobin (0.19 0 0 0.69 0 0 0.12) *
##      7) H>=0.0452498 299  89 myoglobin (0.037 0 0 0.02 0 0.24 0.7)  
##       14) V< 0.06331586 81  10 leghemoglobin (0.099 0 0 0.012 0 0.88 0.012)  
##         28) K< 0.1127894 10   3 erythrocruorin (0.7 0 0 0.1 0 0.1 0.1) *
##         29) K>=0.1127894 71   1 leghemoglobin (0.014 0 0 0 0 0.99 0) *
##       15) V>=0.06331586 218   9 myoglobin (0.014 0 0 0.023 0 0.0046 0.96) *

Plot CART model

rattle::fancyRpartPlot(DecTreeModel, main="CART model")

drawTreeNodes(DecTreeModel, cex = 1, pch = par(19),
              size = 3, col = NULL, nodeinfo = FALSE,
              units = "", cases = "obs", digits = getOption("digits"),
              decimals = 2, print.levels = TRUE, new = TRUE) 

rpart.plot(DecTreeModel, extra = "auto",
           box.palette = "auto", branch.lty = 3)

Machine Set up

sessionInfo()
## R version 3.4.2 (2017-09-28)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Linux Mint 18.2
## 
## Matrix products: default
## BLAS: /usr/lib/libblas/libblas.so.3.6.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.6.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] doParallel_1.0.11 iterators_1.0.9   foreach_1.4.4    
##  [4] rattle_5.1.0      rpart.plot_2.1.2  rpart_4.1-11     
##  [7] caret_6.0-78      ggplot2_2.2.1     lattice_0.20-35  
## [10] corrplot_0.84    
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.14       lubridate_1.7.1    RGtk2_2.20.33     
##  [4] tidyr_0.7.2        class_7.3-14       assertthat_0.2.0  
##  [7] rprojroot_1.2      digest_0.6.13      ipred_0.9-6       
## [10] psych_1.7.8        R6_2.2.2           plyr_1.8.4        
## [13] backports_1.1.2    stats4_3.4.2       evaluate_0.10.1   
## [16] rlang_0.1.4        lazyeval_0.2.1     kernlab_0.9-25    
## [19] Matrix_1.2-12      rmarkdown_1.8      splines_3.4.2     
## [22] CVST_0.2-1         ddalpha_1.3.1      gower_0.1.2       
## [25] stringr_1.2.0      foreign_0.8-69     munsell_0.4.3     
## [28] broom_0.4.3        compiler_3.4.2     pkgconfig_2.0.1   
## [31] mnormt_1.5-5       dimRed_0.1.0       htmltools_0.3.6   
## [34] nnet_7.3-12        tidyselect_0.2.3   tibble_1.3.4      
## [37] prodlim_1.6.1      DRR_0.0.2          codetools_0.2-15  
## [40] RcppRoll_0.2.2     dplyr_0.7.4        withr_2.1.0       
## [43] MASS_7.3-47        recipes_0.1.1      ModelMetrics_1.1.0
## [46] grid_3.4.2         nlme_3.1-131       gtable_0.2.0      
## [49] magrittr_1.5       scales_0.5.0       stringi_1.1.6     
## [52] reshape2_1.4.3     bindrcpp_0.2       timeDate_3042.101 
## [55] robustbase_0.92-8  lava_1.5.1         RColorBrewer_1.1-2
## [58] tools_3.4.2        glue_1.2.0         DEoptimR_1.0-8    
## [61] purrr_0.2.4        sfsmisc_1.1-1      survival_2.41-3   
## [64] yaml_2.1.16        colorspace_1.3-2   knitr_1.17        
## [67] bindr_0.1