One important question which I would like to answer with an exploratory data analysis is whether this data set is sufficient to successfully classify the seven classes of protein using a Random Forest machine learning approach. This should become clearer as we test and further describe the data.
| Code Key | Protein Class |
|---|---|
| 0 / Ctrl | Controls consist of human proteins which do not bind oxygen |
| 1 / Ery | Erythrocruorin |
| 2 / Hcy | Hemocyanin |
| 3 / Hgb | Hemoglobin |
| 4 / Hhe | Hemerythrin |
| 5 / Lgb | Leghemoglobin |
| 6 / Mgb | Myoglobin |
Libraries:
# Echo the source of every chunk in the rendered document.
knitr::opts_chunk$set(echo = TRUE)
# Packages attached by this analysis.
Libraries <- c("readr", "knitr")
# Install if not present, then attach.
for (p in Libraries) {
  # requireNamespace() only tests availability; unlike require() it does not
  # attach the package or warn, so attaching is left to library() below.
  if (!requireNamespace(p, quietly = TRUE)) {
    install.packages(p)
  }
  # library() errors if the package is still unavailable after the install
  # attempt, so a failed install is not silently ignored.
  library(p, character.only = TRUE)
}
Import Single Amino Acid percent composition for first round of Exploratory Data Analysis.
# Load the composition table from the working directory.
complete_aa <- readr::read_csv("complete_aa.csv")
# Count how many rows fall into each value of the Class column and render
# the counts as a markdown table.
class_table <- table(complete_aa[["Class"]])
knitr::kable(class_table)
| Var1 | Freq |
|---|---|
| 0 | 16328 |
| 1 | 14 |
| 2 | 10 |
| 3 | 74 |
| 4 | 11 |
| 5 | 127 |
| 6 | 388 |
# Variance of each of columns 3 to 22 (one column per amino acid), taken
# column by column; the result is a numeric vector named by column.
aa_columns <- complete_aa[, 3:22]
aa_vars <- vapply(aa_columns, var, numeric(1))
# Scale by 10^5 for readability and order from largest to smallest.
aa_vars_sorted <- sort(1e5 * aa_vars, decreasing = TRUE)
aa_vars_sorted
## P L K E G A S
## 95.226839 84.037862 81.083671 79.140643 76.434660 75.886119 70.742432
## R C Q I V T D
## 57.402145 52.582790 41.112459 40.287289 39.498074 33.466338 30.276197
## F N H Y M W
## 26.999361 24.318450 20.881076 19.575014 11.396720 8.149657
# Tick labels for the variance plot. They are taken from the names of the
# sorted vector instead of being typed by hand, so the labels always follow
# the plotted order even if the underlying data change.
aa_names <- names(aa_vars_sorted)
# Plot the scaled variances in descending order; the default numeric x axis
# is suppressed (xaxt = "n") and replaced by amino acid letters below.
plot(aa_vars_sorted,
     main = "Plot of Variances (x 10^5) Vs Amino Acid Type",
     ylab = "Variances (x 10^5)",
     xlab = "Amino Acid Type",
     ylim = c(0, 100),
     xaxt = "n")
axis(1, at = seq_along(aa_names), labels = aa_names)
# Mean of each of columns 3 to 22 (one column per amino acid), taken
# column by column; the result is a numeric vector named by column.
aa_fraction_cols <- complete_aa[, 3:22]
aa_means <- vapply(aa_fraction_cols, mean, numeric(1))
# Multiply by 100 to express as percent and order from largest to smallest.
aa_means_sorted <- sort(100 * aa_means, decreasing = TRUE)
aa_means_sorted
## L S A E G V P K
## 9.911609 7.808515 7.345727 6.873065 6.751433 6.148256 6.105231 5.882587
## R T D Q I F N Y
## 5.735130 5.234892 4.808019 4.557671 4.286352 3.753443 3.550393 2.766902
## H C M W
## 2.637827 2.304834 2.240912 1.281975
# Plot the mean percent composition per amino acid in descending order; the
# default numeric x axis is suppressed (xaxt = "n") and replaced below.
plot(aa_means_sorted,
     main = "Plot of Means (% of Total) Vs Amino Acid Type",
     ylab = "Means (% of Total)",
     xlab = "Amino Acid Type",
     ylim = c(0, 10),
     xaxt = "n")
# Label each tick with the name carried by the sorted means themselves.
# The means are ordered differently from the variances, so reusing the
# variance-ordered label vector would attach the wrong letter to most points.
axis(1, at = seq_along(aa_means_sorted), labels = names(aa_means_sorted))
# Record the operating system name, release, version and machine type used
# to build this document (the remaining Sys.info() fields are left out).
Sys.info()[c("sysname", "release", "version", "machine")]
## sysname
## "Linux"
## release
## "4.15.0-46-generic"
## version
## "#49~16.04.1-Ubuntu SMP Tue Feb 12 17:45:24 UTC 2019"
## machine
## "x86_64"
# Print the R version, platform, locale and attached/loaded package versions
# so the environment that produced this document is on record.
sessionInfo()
## R version 3.4.4 (2018-03-15)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Linux Mint 18.3
##
## Matrix products: default
## BLAS: /usr/lib/libblas/libblas.so.3.6.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.6.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] knitr_1.21 readr_1.3.1
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.0 crayon_1.3.4 digest_0.6.18 R6_2.3.0
## [5] magrittr_1.5 evaluate_0.12 highr_0.7 pillar_1.3.1
## [9] rlang_0.3.0.1 stringi_1.2.4 rmarkdown_1.11 tools_3.4.4
## [13] stringr_1.3.1 hms_0.4.2 xfun_0.4 yaml_2.2.0
## [17] compiler_3.4.4 pkgconfig_2.0.2 htmltools_0.3.6 tibble_1.4.2
EOF