A: Introduction

The data set consists of 13562 proteins. The percent composition of the amino acids was determined for every protein.

The proteins can be broken down into 6 categories: “erythrocruorin”, “hemerythrin”, “hemocyanin”, “hemoglobin”, “homo_sapien proteins NOT O2-binding”, “leghemoglobin”, “myoglobin”.

2: Load / Inspect Data

setwd("~/Dropbox/git_projects/random_forest/2_single_aa")
total_aa <- read.csv("~/Dropbox/git_projects/random_forest/2_single_aa/total_aa.csv")

head(total_aa,5)
##   Class TotalAA          G          P          A          V          L
## 1     0     393 0.05852417 0.11450380 0.06106870 0.04580153 0.08142494
## 2     0    2016 0.05803571 0.05257937 0.06746032 0.05902778 0.11259920
## 3     0    1210 0.07024793 0.06198347 0.05950413 0.05785124 0.09173554
## 4     0    2871 0.10693140 0.06095437 0.03343783 0.03761755 0.04911181
## 5     0     770 0.04935065 0.04545455 0.08181818 0.08441558 0.07272727
##            I          M          C          F          Y           W
## 1 0.02035623 0.03053435 0.02544529 0.02798982 0.02290076 0.010178120
## 2 0.06051587 0.03422619 0.02083333 0.06150794 0.02678571 0.016369050
## 3 0.05702479 0.02066116 0.04958678 0.02975207 0.02975207 0.010743800
## 4 0.05189829 0.01811216 0.12608850 0.02925810 0.03239289 0.004528039
## 5 0.03116883 0.03116883 0.02337662 0.02727273 0.02597403 0.011688310
##            H          K          R          Q          N          E
## 1 0.03053435 0.05089059 0.06615776 0.03816794 0.03562341 0.07633588
## 2 0.01339286 0.04117063 0.05158730 0.03521825 0.03819444 0.06696429
## 3 0.02561983 0.05454545 0.04958678 0.04049587 0.05454545 0.06363636
## 4 0.01671891 0.03866249 0.04562870 0.03483107 0.06548241 0.07001045
## 5 0.03246753 0.05324675 0.04805195 0.04675325 0.04025974 0.11948050
##            D          S          T
## 1 0.05089059 0.09669211 0.05597964
## 2 0.04662698 0.07837302 0.05853175
## 3 0.05041322 0.06942149 0.05289256
## 4 0.06025775 0.06025775 0.05781958
## 5 0.06493506 0.04545455 0.06493506

3: Identify variable types in ‘total_aa’

str(total_aa) # View 'total_aa' data structure
## 'data.frame':    16952 obs. of  22 variables:
##  $ Class  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ TotalAA: int  393 2016 1210 2871 770 781 2351 2527 920 34350 ...
##  $ G      : num  0.0585 0.058 0.0702 0.1069 0.0494 ...
##  $ P      : num  0.1145 0.0526 0.062 0.061 0.0455 ...
##  $ A      : num  0.0611 0.0675 0.0595 0.0334 0.0818 ...
##  $ V      : num  0.0458 0.059 0.0579 0.0376 0.0844 ...
##  $ L      : num  0.0814 0.1126 0.0917 0.0491 0.0727 ...
##  $ I      : num  0.0204 0.0605 0.057 0.0519 0.0312 ...
##  $ M      : num  0.0305 0.0342 0.0207 0.0181 0.0312 ...
##  $ C      : num  0.0254 0.0208 0.0496 0.1261 0.0234 ...
##  $ F      : num  0.028 0.0615 0.0298 0.0293 0.0273 ...
##  $ Y      : num  0.0229 0.0268 0.0298 0.0324 0.026 ...
##  $ W      : num  0.01018 0.01637 0.01074 0.00453 0.01169 ...
##  $ H      : num  0.0305 0.0134 0.0256 0.0167 0.0325 ...
##  $ K      : num  0.0509 0.0412 0.0545 0.0387 0.0532 ...
##  $ R      : num  0.0662 0.0516 0.0496 0.0456 0.0481 ...
##  $ Q      : num  0.0382 0.0352 0.0405 0.0348 0.0468 ...
##  $ N      : num  0.0356 0.0382 0.0545 0.0655 0.0403 ...
##  $ E      : num  0.0763 0.067 0.0636 0.07 0.1195 ...
##  $ D      : num  0.0509 0.0466 0.0504 0.0603 0.0649 ...
##  $ S      : num  0.0967 0.0784 0.0694 0.0603 0.0455 ...
##  $ T      : num  0.056 0.0585 0.0529 0.0578 0.0649 ...
# library(dplyr)
# glimpse(total_aa) # An alternative to 'str', data strcuture

4: Summarize ‘total_aa’

summary(total_aa)
##      Class           TotalAA              G                 P          
##  Min.   :0.0000   Min.   :    4.0   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.0000   1st Qu.:  253.0   1st Qu.:0.05056   1st Qu.:0.04158  
##  Median :0.0000   Median :  432.5   Median :0.06466   Median :0.05483  
##  Mean   :0.1925   Mean   :  591.9   Mean   :0.06751   Mean   :0.06105  
##  3rd Qu.:0.0000   3rd Qu.:  707.0   3rd Qu.:0.08023   3rd Qu.:0.07380  
##  Max.   :6.0000   Max.   :34350.0   Max.   :0.46474   Max.   :0.39241  
##        A                 V                 L                 I          
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.05503   1st Qu.:0.04847   1st Qu.:0.08071   1st Qu.:0.02894  
##  Median :0.06964   Median :0.06042   Median :0.09804   Median :0.04247  
##  Mean   :0.07346   Mean   :0.06148   Mean   :0.09912   Mean   :0.04286  
##  3rd Qu.:0.08768   3rd Qu.:0.07282   3rd Qu.:0.11678   3rd Qu.:0.05556  
##  Max.   :0.40000   Max.   :0.33333   Max.   :0.32323   Max.   :0.21538  
##        M                 C                 F                 Y          
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.01523   1st Qu.:0.01181   1st Qu.:0.02637   1st Qu.:0.01852  
##  Median :0.02111   Median :0.01881   Median :0.03611   Median :0.02649  
##  Mean   :0.02241   Mean   :0.02305   Mean   :0.03753   Mean   :0.02767  
##  3rd Qu.:0.02772   3rd Qu.:0.02778   3rd Qu.:0.04725   3rd Qu.:0.03542  
##  Max.   :0.13836   Max.   :0.36816   Max.   :0.17391   Max.   :0.24194  
##        W                  H                 K                 R          
##  Min.   :0.000000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.006623   1st Qu.:0.01772   1st Qu.:0.03955   1st Qu.:0.04261  
##  Median :0.011468   Median :0.02431   Median :0.05621   Median :0.05479  
##  Mean   :0.012820   Mean   :0.02638   Mean   :0.05883   Mean   :0.05735  
##  3rd Qu.:0.017285   3rd Qu.:0.03193   3rd Qu.:0.07434   3rd Qu.:0.06897  
##  Max.   :0.232877   Max.   :0.30000   Max.   :0.31250   Max.   :0.47059  
##        Q                 N                 E                 D          
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.03358   1st Qu.:0.02465   1st Qu.:0.05147   1st Qu.:0.03763  
##  Median :0.04360   Median :0.03485   Median :0.06643   Median :0.04787  
##  Mean   :0.04558   Mean   :0.03550   Mean   :0.06873   Mean   :0.04808  
##  3rd Qu.:0.05457   3rd Qu.:0.04514   3rd Qu.:0.08279   3rd Qu.:0.05756  
##  Max.   :0.98750   Max.   :0.14286   Max.   :0.38235   Max.   :0.27273  
##        S                 T          
##  Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.06122   1st Qu.:0.04157  
##  Median :0.07533   Median :0.05104  
##  Mean   :0.07809   Mean   :0.05235  
##  3rd Qu.:0.09184   3rd Qu.:0.06109  
##  Max.   :0.41660   Max.   :0.34949

5: Box plots of % Amino Acid composition data

boxplot(total_aa[3:7], ylim = c(0, 0.5), main = "1. AA vs % Composition")

boxplot(total_aa[8:12], ylim = c(0, 0.5), main = "2. AA vs % Composition")

boxplot(total_aa[13:17], ylim = c(0, 0.5), main = "3. AA vs % Composition")

boxplot(total_aa[18:22], ylim = c(0, 0.5), main = "4. AA vs % Composition")

6: Compute the correlation matrix for the 20 AA variables NOT TotalAA

library(corrplot) # produces correlation plots of variables
## corrplot 0.84 loaded
# Reduce total_aa data set to amino acids only
corrSet <- total_aa[,3:22]

#str(corrSet)

corrMatrix <- cor(corrSet) # Calculate correlation coefficients
#corrMatrix

7: Generate (Upper) Correlation plot of Amino Acids

  • Combine correlogram with the significance test
  • Mark the insignificant coefficients according to the specified p-value significance level with ‘x’.
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units
cor_5 <- rcorr(as.matrix(corrMatrix))
M <- cor_5$r
p_mat <- cor_5$P

corrplot(corrMatrix, method = "color", type = "upper", order = "hclust", 
         p.mat = p_mat, sig.level = 0.01)
text(9, 1.7, "Correlation Plot (R^2) of AA", cex = 2)
text(9, 4, "'X' indicate insignificant p-value (p=0.01) for Correlation")

8: Display the Sigificant Correlation Coefficients Only:

While leaving the insigificant values blank

corrplot(corrMatrix, method = "number", type = "upper", order = "hclust",
         p.mat = p_mat, sig.level = 0.01, insig = "blank")
text(9, 1.7, "Correlation Plot (R^2) of AA", cex = 2)
text(9, 3, "BLANK squares indicate insignificant p-value (p=0.01) for Correlation")

Preleminary Tests for NORMALITY

  1. Are the data from each of the 2 variables (x, y) following a normal distribution?
  2. Use Shapiro-Wilk normality test → R function: shapiro.test()
  3. Look at the normality plot → R function: ggpubr::ggqqplot()

Shapiro-Wilk test can be performed as follow:

  1. Null hypothesis: the data are normally distributed
  2. Alternative hypothesis: the data are not normally distributed
x <- order(total_aa$G)
aa_columns = c("Class", "G", "P", "A", "V", "L", "I", "M", "C", "F", "Y",
               "W", "H", "K", "R", "Q", "N", "E", "D", "S", "T")
for (n in seq_along(aa_columns)) {
    main_title = paste("Amino Acid = ", aa_columns[n], sep = "")
    qqplot(x = x, y = total_aa[,n], 
           ylab = main_title, xlab = "order",
           main = main_title)
    # Add text to QQ Plots showing Skew and Kurtosis
    text(8000, 30000, "expression(hat(beta) == (X^t * X)^{-1} * X^t * y)", cex = 1)
}

9: Univariate tests of normality plots for all variables in the dataset

# univariate normality tests: Shapiro Wilk's test;
# desc => descriptive stats = size, mean, sd, q5, skew, kurtosis
library(nortest)
Anderson_Darling_Test = ad.test(total_aa[,3])
#names(Anderson_Darling_Test)
#Anderson_Darling_Test


aa_columns = c("Class", "G", "P", "A", "V", "L", "I", "M", "C", "F", "Y",
               "W", "H", "K", "R", "Q", "N", "E", "D", "S", "T")
for (n in seq_along(aa_columns)) {
    Anderson_Darling_Test = ad.test(total_aa[,n])
    paste("Amino Acid = ", aa_columns[n], Anderson_Darling_Test, sep = "")
}

By using the ‘nortest’ package of R, these tests can be conducted:

https://www.r-bloggers.com/normality-tests-for-continuous-data/

-Perform Anderson-Darling normality test ad. test(data1) -Perform Cramér-von Mises test for normality cvm. test(data1) -Perform Pearson chi-square test for normality pearson. test(data1) -Perform Shapiro-Francia test for normality sf. test(data1)

10: System Information

Sys.info()[c(1:3,5)]
##                                              sysname 
##                                              "Linux" 
##                                              release 
##                                  "4.10.0-42-generic" 
##                                              version 
## "#46~16.04.1-Ubuntu SMP Mon Dec 4 15:57:59 UTC 2017" 
##                                              machine 
##                                             "x86_64"
sessionInfo()
## R version 3.4.2 (2017-09-28)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Linux Mint 18.2
## 
## Matrix products: default
## BLAS: /usr/lib/libblas/libblas.so.3.6.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.6.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] nortest_1.0-4   Hmisc_4.0-3     ggplot2_2.2.1   Formula_1.2-2  
## [5] survival_2.41-3 lattice_0.20-35 corrplot_0.84  
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.14        compiler_3.4.2      pillar_1.0.1       
##  [4] RColorBrewer_1.1-2  plyr_1.8.4          base64enc_0.1-3    
##  [7] tools_3.4.2         rpart_4.1-11        digest_0.6.13      
## [10] checkmate_1.8.5     evaluate_0.10.1     tibble_1.4.1       
## [13] gtable_0.2.0        htmlTable_1.11.1    rlang_0.1.6        
## [16] Matrix_1.2-12       rstudioapi_0.7      yaml_2.1.16        
## [19] gridExtra_2.3       stringr_1.2.0       knitr_1.18         
## [22] cluster_2.0.6       htmlwidgets_0.9     rprojroot_1.3-1    
## [25] grid_3.4.2          nnet_7.3-12         data.table_1.10.4-3
## [28] foreign_0.8-69      rmarkdown_1.8       latticeExtra_0.6-28
## [31] magrittr_1.5        backports_1.1.2     scales_0.5.0       
## [34] htmltools_0.3.6     splines_3.4.2       colorspace_1.3-2   
## [37] stringi_1.1.6       acepack_1.4.1       lazyeval_0.2.1     
## [40] munsell_0.4.3

EOF