The data set consists of 13562 proteins. The percent composition of the amino acids was determined for every protein.
The proteins can be broken down into 6 categories: “erythrocruorin”, “hemerythrin”, “hemocyanin”, “hemoglobin”, “homo_sapien proteins NOT O2-binding”, “leghemoglobin”, “myoglobin”.
setwd("~/Dropbox/git_projects/random_forest/2_single_aa")
total_aa <- read.csv("~/Dropbox/git_projects/random_forest/2_single_aa/total_aa.csv")
head(total_aa,5)
## Class TotalAA G P A V L
## 1 0 393 0.05852417 0.11450380 0.06106870 0.04580153 0.08142494
## 2 0 2016 0.05803571 0.05257937 0.06746032 0.05902778 0.11259920
## 3 0 1210 0.07024793 0.06198347 0.05950413 0.05785124 0.09173554
## 4 0 2871 0.10693140 0.06095437 0.03343783 0.03761755 0.04911181
## 5 0 770 0.04935065 0.04545455 0.08181818 0.08441558 0.07272727
## I M C F Y W
## 1 0.02035623 0.03053435 0.02544529 0.02798982 0.02290076 0.010178120
## 2 0.06051587 0.03422619 0.02083333 0.06150794 0.02678571 0.016369050
## 3 0.05702479 0.02066116 0.04958678 0.02975207 0.02975207 0.010743800
## 4 0.05189829 0.01811216 0.12608850 0.02925810 0.03239289 0.004528039
## 5 0.03116883 0.03116883 0.02337662 0.02727273 0.02597403 0.011688310
## H K R Q N E
## 1 0.03053435 0.05089059 0.06615776 0.03816794 0.03562341 0.07633588
## 2 0.01339286 0.04117063 0.05158730 0.03521825 0.03819444 0.06696429
## 3 0.02561983 0.05454545 0.04958678 0.04049587 0.05454545 0.06363636
## 4 0.01671891 0.03866249 0.04562870 0.03483107 0.06548241 0.07001045
## 5 0.03246753 0.05324675 0.04805195 0.04675325 0.04025974 0.11948050
## D S T
## 1 0.05089059 0.09669211 0.05597964
## 2 0.04662698 0.07837302 0.05853175
## 3 0.05041322 0.06942149 0.05289256
## 4 0.06025775 0.06025775 0.05781958
## 5 0.06493506 0.04545455 0.06493506
str(total_aa) # View 'total_aa' data structure
## 'data.frame': 16952 obs. of 22 variables:
## $ Class : int 0 0 0 0 0 0 0 0 0 0 ...
## $ TotalAA: int 393 2016 1210 2871 770 781 2351 2527 920 34350 ...
## $ G : num 0.0585 0.058 0.0702 0.1069 0.0494 ...
## $ P : num 0.1145 0.0526 0.062 0.061 0.0455 ...
## $ A : num 0.0611 0.0675 0.0595 0.0334 0.0818 ...
## $ V : num 0.0458 0.059 0.0579 0.0376 0.0844 ...
## $ L : num 0.0814 0.1126 0.0917 0.0491 0.0727 ...
## $ I : num 0.0204 0.0605 0.057 0.0519 0.0312 ...
## $ M : num 0.0305 0.0342 0.0207 0.0181 0.0312 ...
## $ C : num 0.0254 0.0208 0.0496 0.1261 0.0234 ...
## $ F : num 0.028 0.0615 0.0298 0.0293 0.0273 ...
## $ Y : num 0.0229 0.0268 0.0298 0.0324 0.026 ...
## $ W : num 0.01018 0.01637 0.01074 0.00453 0.01169 ...
## $ H : num 0.0305 0.0134 0.0256 0.0167 0.0325 ...
## $ K : num 0.0509 0.0412 0.0545 0.0387 0.0532 ...
## $ R : num 0.0662 0.0516 0.0496 0.0456 0.0481 ...
## $ Q : num 0.0382 0.0352 0.0405 0.0348 0.0468 ...
## $ N : num 0.0356 0.0382 0.0545 0.0655 0.0403 ...
## $ E : num 0.0763 0.067 0.0636 0.07 0.1195 ...
## $ D : num 0.0509 0.0466 0.0504 0.0603 0.0649 ...
## $ S : num 0.0967 0.0784 0.0694 0.0603 0.0455 ...
## $ T : num 0.056 0.0585 0.0529 0.0578 0.0649 ...
# library(dplyr)
# glimpse(total_aa) # An alternative to 'str', data strcuture
summary(total_aa)
## Class TotalAA G P
## Min. :0.0000 Min. : 4.0 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.: 253.0 1st Qu.:0.05056 1st Qu.:0.04158
## Median :0.0000 Median : 432.5 Median :0.06466 Median :0.05483
## Mean :0.1925 Mean : 591.9 Mean :0.06751 Mean :0.06105
## 3rd Qu.:0.0000 3rd Qu.: 707.0 3rd Qu.:0.08023 3rd Qu.:0.07380
## Max. :6.0000 Max. :34350.0 Max. :0.46474 Max. :0.39241
## A V L I
## Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.05503 1st Qu.:0.04847 1st Qu.:0.08071 1st Qu.:0.02894
## Median :0.06964 Median :0.06042 Median :0.09804 Median :0.04247
## Mean :0.07346 Mean :0.06148 Mean :0.09912 Mean :0.04286
## 3rd Qu.:0.08768 3rd Qu.:0.07282 3rd Qu.:0.11678 3rd Qu.:0.05556
## Max. :0.40000 Max. :0.33333 Max. :0.32323 Max. :0.21538
## M C F Y
## Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.01523 1st Qu.:0.01181 1st Qu.:0.02637 1st Qu.:0.01852
## Median :0.02111 Median :0.01881 Median :0.03611 Median :0.02649
## Mean :0.02241 Mean :0.02305 Mean :0.03753 Mean :0.02767
## 3rd Qu.:0.02772 3rd Qu.:0.02778 3rd Qu.:0.04725 3rd Qu.:0.03542
## Max. :0.13836 Max. :0.36816 Max. :0.17391 Max. :0.24194
## W H K R
## Min. :0.000000 Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.006623 1st Qu.:0.01772 1st Qu.:0.03955 1st Qu.:0.04261
## Median :0.011468 Median :0.02431 Median :0.05621 Median :0.05479
## Mean :0.012820 Mean :0.02638 Mean :0.05883 Mean :0.05735
## 3rd Qu.:0.017285 3rd Qu.:0.03193 3rd Qu.:0.07434 3rd Qu.:0.06897
## Max. :0.232877 Max. :0.30000 Max. :0.31250 Max. :0.47059
## Q N E D
## Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.03358 1st Qu.:0.02465 1st Qu.:0.05147 1st Qu.:0.03763
## Median :0.04360 Median :0.03485 Median :0.06643 Median :0.04787
## Mean :0.04558 Mean :0.03550 Mean :0.06873 Mean :0.04808
## 3rd Qu.:0.05457 3rd Qu.:0.04514 3rd Qu.:0.08279 3rd Qu.:0.05756
## Max. :0.98750 Max. :0.14286 Max. :0.38235 Max. :0.27273
## S T
## Min. :0.00000 Min. :0.00000
## 1st Qu.:0.06122 1st Qu.:0.04157
## Median :0.07533 Median :0.05104
## Mean :0.07809 Mean :0.05235
## 3rd Qu.:0.09184 3rd Qu.:0.06109
## Max. :0.41660 Max. :0.34949
boxplot(total_aa[3:7], ylim = c(0, 0.5), main = "1. AA vs % Composition")
boxplot(total_aa[8:12], ylim = c(0, 0.5), main = "2. AA vs % Composition")
boxplot(total_aa[13:17], ylim = c(0, 0.5), main = "3. AA vs % Composition")
boxplot(total_aa[18:22], ylim = c(0, 0.5), main = "4. AA vs % Composition")
library(corrplot) # produces correlation plots of variables
## corrplot 0.84 loaded
# Reduce total_aa data set to amino acids only
corrSet <- total_aa[,3:22]
#str(corrSet)
corrMatrix <- cor(corrSet) # Calculate correlation coefficients
#corrMatrix
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units
cor_5 <- rcorr(as.matrix(corrMatrix))
M <- cor_5$r
p_mat <- cor_5$P
corrplot(corrMatrix, method = "color", type = "upper", order = "hclust",
p.mat = p_mat, sig.level = 0.01)
text(9, 1.7, "Correlation Plot (R^2) of AA", cex = 2)
text(9, 4, "'X' indicate insignificant p-value (p=0.01) for Correlation")
While leaving the insigificant values blank
corrplot(corrMatrix, method = "number", type = "upper", order = "hclust",
p.mat = p_mat, sig.level = 0.01, insig = "blank")
text(9, 1.7, "Correlation Plot (R^2) of AA", cex = 2)
text(9, 3, "BLANK squares indicate insignificant p-value (p=0.01) for Correlation")
x <- order(total_aa$G)
aa_columns = c("Class", "G", "P", "A", "V", "L", "I", "M", "C", "F", "Y",
"W", "H", "K", "R", "Q", "N", "E", "D", "S", "T")
for (n in seq_along(aa_columns)) {
main_title = paste("Amino Acid = ", aa_columns[n], sep = "")
qqplot(x = x, y = total_aa[,n],
ylab = main_title, xlab = "order",
main = main_title)
# Add text to QQ Plots showing Skew and Kurtosis
text(8000, 30000, "expression(hat(beta) == (X^t * X)^{-1} * X^t * y)", cex = 1)
}
# univariate normality tests: Shapiro Wilk's test;
# desc => descriptive stats = size, mean, sd, q5, skew, kurtosis
library(nortest)
Anderson_Darling_Test = ad.test(total_aa[,3])
#names(Anderson_Darling_Test)
#Anderson_Darling_Test
aa_columns = c("Class", "G", "P", "A", "V", "L", "I", "M", "C", "F", "Y",
"W", "H", "K", "R", "Q", "N", "E", "D", "S", "T")
for (n in seq_along(aa_columns)) {
Anderson_Darling_Test = ad.test(total_aa[,n])
paste("Amino Acid = ", aa_columns[n], Anderson_Darling_Test, sep = "")
}
By using the ‘nortest’ package of R, these tests can be conducted:
https://www.r-bloggers.com/normality-tests-for-continuous-data/
-Perform Anderson-Darling normality test ad. test(data1) -Perform Cramér-von Mises test for normality cvm. test(data1) -Perform Pearson chi-square test for normality pearson. test(data1) -Perform Shapiro-Francia test for normality sf. test(data1)
Sys.info()[c(1:3,5)]
## sysname
## "Linux"
## release
## "4.10.0-42-generic"
## version
## "#46~16.04.1-Ubuntu SMP Mon Dec 4 15:57:59 UTC 2017"
## machine
## "x86_64"
sessionInfo()
## R version 3.4.2 (2017-09-28)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Linux Mint 18.2
##
## Matrix products: default
## BLAS: /usr/lib/libblas/libblas.so.3.6.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.6.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] nortest_1.0-4 Hmisc_4.0-3 ggplot2_2.2.1 Formula_1.2-2
## [5] survival_2.41-3 lattice_0.20-35 corrplot_0.84
##
## loaded via a namespace (and not attached):
## [1] Rcpp_0.12.14 compiler_3.4.2 pillar_1.0.1
## [4] RColorBrewer_1.1-2 plyr_1.8.4 base64enc_0.1-3
## [7] tools_3.4.2 rpart_4.1-11 digest_0.6.13
## [10] checkmate_1.8.5 evaluate_0.10.1 tibble_1.4.1
## [13] gtable_0.2.0 htmlTable_1.11.1 rlang_0.1.6
## [16] Matrix_1.2-12 rstudioapi_0.7 yaml_2.1.16
## [19] gridExtra_2.3 stringr_1.2.0 knitr_1.18
## [22] cluster_2.0.6 htmlwidgets_0.9 rprojroot_1.3-1
## [25] grid_3.4.2 nnet_7.3-12 data.table_1.10.4-3
## [28] foreign_0.8-69 rmarkdown_1.8 latticeExtra_0.6-28
## [31] magrittr_1.5 backports_1.1.2 scales_0.5.0
## [34] htmltools_0.3.6 splines_3.4.2 colorspace_1.3-2
## [37] stringi_1.1.6 acepack_1.4.1 lazyeval_0.2.1
## [40] munsell_0.4.3
EOF