EDA-Part2-Oxygen-binding.rmd

File Name: EDA-Part2-Oxygen-binding.rmd

Summary:

One important question I would like to answer with this exploratory data analysis is: is this data set sufficient to successfully classify the seven classes of protein using a Random Forest machine learning approach? This should become clearer as we test and further describe the data.

| Code / Key | Protein Class |
|:-----------|:--------------|
| 0 / Ctrl   | Controls consist of human proteins which do not bind oxygen |
| 1 / Ery    | Erythrocruorin |
| 2 / Hcy    | Hemocyanin |
| 3 / Hgb    | Hemoglobin |
| 4 / Hhe    | Hemerythrin |
| 5 / Lgb    | Leghemoglobin |
| 6 / Mgb    | Myoglobin |

Libraries:

# Echo the source of every chunk in the rendered document.
knitr::opts_chunk$set(echo = TRUE)

# Packages attached by this analysis: readr for read_csv(), knitr for kable().
Libraries <- c("readr", "knitr")

# Install any package that is missing, then attach it. requireNamespace()
# only checks availability (no attach, no warning), so library() is the single
# place a package is attached and it errors if the install did not succeed.
for (p in Libraries) {
  if (!requireNamespace(p, quietly = TRUE)) {
    install.packages(p)
  }
  library(p, character.only = TRUE)
}

A. Load Data

Import Single Amino Acid percent composition for first round of Exploratory Data Analysis.

# Read the amino-acid composition table; the relative path is resolved
# against the current working directory.
complete_aa <- readr::read_csv(file = "complete_aa.csv")

B. Number of Proteins Per Class

# Count how many rows fall into each value of the Class column,
# then render the counts as a formatted table.
class_table <- table(complete_aa[["Class"]])
knitr::kable(x = class_table)

Var1	Freq
0	16328
1	14
2	10
3	74
4	11
5	127
6	388

C. Variances (x 10^5) by Amino Acid

# Sample variance of each composition column.
# vapply() iterates over the columns directly (apply() would first coerce the
# whole table to a matrix) and enforces exactly one numeric value per column.
# NOTE(review): assumes columns 3-22 are the 20 amino-acid columns — confirm
# against the layout of complete_aa.csv.
aa_vars <- vapply(complete_aa[, 3:22], var, numeric(1))

# Scale by 10^5 for readability and order from most to least variable.
aa_vars_sorted <- sort(10^5 * aa_vars, decreasing = TRUE)
aa_vars_sorted

##         P         L         K         E         G         A         S 
## 95.226839 84.037862 81.083671 79.140643 76.434660 75.886119 70.742432 
##         R         C         Q         I         V         T         D 
## 57.402145 52.582790 41.112459 40.287289 39.498074 33.466338 30.276197 
##         F         N         H         Y         M         W 
## 26.999361 24.318450 20.881076 19.575014 11.396720  8.149657

D. Plot of Variances

# Tick labels come from the sorted variance vector itself, so they always
# match the plotted order even if the underlying data (and therefore the
# ranking) changes. A hand-typed copy of the ordering would silently go stale.
aa_names <- names(aa_vars_sorted)

# Draw the points without a default x-axis (xaxt = "n"), then add one tick
# per amino acid labelled with its one-letter code.
plot(aa_vars_sorted,
     main = "Plot of Variances (x 10^5) Vs Amino Acid Type",
     ylab = "Variances (x 10^5)",
     xlab = "Amino Acid Type",
     ylim = c(0, 100),
     xaxt = "n")
axis(1, at = seq_along(aa_vars_sorted), labels = aa_names)

E. % Means vs Amino Acids

# Mean of each composition column.
# vapply() iterates over the columns directly (apply() would first coerce the
# whole table to a matrix) and enforces exactly one numeric value per column.
# NOTE(review): assumes columns 3-22 are the 20 amino-acid columns — confirm
# against the layout of complete_aa.csv.
aa_means <- vapply(complete_aa[, 3:22], mean, numeric(1))

# Express as a percentage and order from most to least abundant.
aa_means_sorted <- sort(100 * aa_means, decreasing = TRUE)
aa_means_sorted

##        L        S        A        E        G        V        P        K 
## 9.911609 7.808515 7.345727 6.873065 6.751433 6.148256 6.105231 5.882587 
##        R        T        D        Q        I        F        N        Y 
## 5.735130 5.234892 4.808019 4.557671 4.286352 3.753443 3.550393 2.766902 
##        H        C        M        W 
## 2.637827 2.304834 2.240912 1.281975

F. Plot of Means (% of Total) Vs Amino Acid Type

# Draw the sorted mean percentages without a default x-axis (xaxt = "n").
plot(aa_means_sorted,
     main = "Plot of Means (% of Total) Vs Amino Acid Type",
     ylab = "Means (% of Total)",
     xlab = "Amino Acid Type",
     ylim = c(0, 10),
     xaxt = "n")

# Take the tick labels from the means vector itself so each point is labelled
# with its own amino acid. A label vector built for a different ordering
# (e.g. one sorted by variance) would attach the wrong letters to the points.
axis(1, at = seq_along(aa_means_sorted), labels = names(aa_means_sorted))

G. Machine Settings:

# Report OS name, release, version and machine architecture only;
# the remaining Sys.info() fields are not shown.
Sys.info()[c("sysname", "release", "version", "machine")]

##                                               sysname 
##                                               "Linux" 
##                                               release 
##                                   "4.15.0-46-generic" 
##                                               version 
## "#49~16.04.1-Ubuntu SMP Tue Feb 12 17:45:24 UTC 2019" 
##                                               machine 
##                                              "x86_64"

# Record the R version, platform, locale and package versions used for this
# run so the analysis can be reproduced.
utils::sessionInfo()

## R version 3.4.4 (2018-03-15)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Linux Mint 18.3
## 
## Matrix products: default
## BLAS: /usr/lib/libblas/libblas.so.3.6.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.6.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] knitr_1.21  readr_1.3.1
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.0      crayon_1.3.4    digest_0.6.18   R6_2.3.0       
##  [5] magrittr_1.5    evaluate_0.12   highr_0.7       pillar_1.3.1   
##  [9] rlang_0.3.0.1   stringi_1.2.4   rmarkdown_1.11  tools_3.4.4    
## [13] stringr_1.3.1   hms_0.4.2       xfun_0.4        yaml_2.2.0     
## [17] compiler_3.4.4  pkgconfig_2.0.2 htmltools_0.3.6 tibble_1.4.2

EOF