library(readr)

# Protein category labels in class order: element 1 names class 0, element 7
# names class 6 (the same order the box plots below use for their labels).
# The original line used typographic quotes, which R cannot parse.
protein_class <- c("homosapiens", "erythrocruorin", "hemerythrin",
                   "hemocyanin", "leghemoglobin", "myoglobin", "hemoglobin")

# Class 0: read the per-protein table (Class, TotalAA, then one fraction
# column per amino-acid letter, as the summary output shows).
protein_class_0_aa <- readr::read_csv(
  file = "~/Dropbox/git_projects/random_forest/4_exploratory/mono/protein_class_0_aa.csv"
)
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Class = col_integer(),
##   TotalAA = col_integer()
## )
## See spec(...) for full column specifications.
# Column-wise five-number summary plus mean for every column.
summary(object = protein_class_0_aa)
##      Class      TotalAA              G                 P          
##  Min.   :0   Min.   :    4.0   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0   1st Qu.:  272.0   1st Qu.:0.05031   1st Qu.:0.04245  
##  Median :0   Median :  448.0   Median :0.06435   Median :0.05575  
##  Mean   :0   Mean   :  608.1   Mean   :0.06731   Mean   :0.06200  
##  3rd Qu.:0   3rd Qu.:  721.0   3rd Qu.:0.07968   3rd Qu.:0.07477  
##  Max.   :0   Max.   :34350.0   Max.   :0.46474   Max.   :0.39241  
##        A                 V                 L                 I          
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.05464   1st Qu.:0.04830   1st Qu.:0.08023   1st Qu.:0.02981  
##  Median :0.06887   Median :0.06008   Median :0.09709   Median :0.04280  
##  Mean   :0.07232   Mean   :0.06064   Mean   :0.09839   Mean   :0.04358  
##  3rd Qu.:0.08599   3rd Qu.:0.07200   3rd Qu.:0.11528   3rd Qu.:0.05577  
##  Max.   :0.30723   Max.   :0.18852   Max.   :0.32323   Max.   :0.21538  
##        M                 C                 F                 Y          
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.01556   1st Qu.:0.01231   1st Qu.:0.02597   1st Qu.:0.01873  
##  Median :0.02135   Median :0.01930   Median :0.03556   Median :0.02681  
##  Mean   :0.02270   Mean   :0.02362   Mean   :0.03699   Mean   :0.02792  
##  3rd Qu.:0.02794   3rd Qu.:0.02821   3rd Qu.:0.04600   3rd Qu.:0.03571  
##  Max.   :0.13836   Max.   :0.36816   Max.   :0.17391   Max.   :0.24194  
##        W                  H                 K                 R          
##  Min.   :0.000000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.006515   1st Qu.:0.01750   1st Qu.:0.03893   1st Qu.:0.04398  
##  Median :0.011364   Median :0.02391   Median :0.05522   Median :0.05556  
##  Mean   :0.012872   Mean   :0.02520   Mean   :0.05774   Mean   :0.05863  
##  3rd Qu.:0.017467   3rd Qu.:0.03102   3rd Qu.:0.07248   3rd Qu.:0.06962  
##  Max.   :0.232877   Max.   :0.30000   Max.   :0.31250   Max.   :0.47059  
##        Q                 N                 E                 D          
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.03425   1st Qu.:0.02481   1st Qu.:0.05208   1st Qu.:0.03727  
##  Median :0.04405   Median :0.03488   Median :0.06693   Median :0.04750  
##  Mean   :0.04616   Mean   :0.03552   Mean   :0.06942   Mean   :0.04766  
##  3rd Qu.:0.05491   3rd Qu.:0.04501   3rd Qu.:0.08309   3rd Qu.:0.05714  
##  Max.   :0.98750   Max.   :0.13000   Max.   :0.38235   Max.   :0.20000  
##        S                 T          
##  Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.06192   1st Qu.:0.04196  
##  Median :0.07568   Median :0.05117  
##  Mean   :0.07877   Mean   :0.05253  
##  3rd Qu.:0.09232   3rd Qu.:0.06103  
##  Max.   :0.41660   Max.   :0.34949
# Class 1: read the per-protein table (Class, TotalAA, then one fraction
# column per amino-acid letter, as the summary output shows).
protein_class_1_aa <- readr::read_csv(
  file = "~/Dropbox/git_projects/random_forest/4_exploratory/mono/protein_class_1_aa.csv"
)
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Class = col_integer(),
##   TotalAA = col_integer()
## )
## See spec(...) for full column specifications.
# Column-wise five-number summary plus mean for every column.
summary(object = protein_class_1_aa)
##      Class      TotalAA            G                 P          
##  Min.   :1   Min.   :136.0   Min.   :0.03974   Min.   :0.01863  
##  1st Qu.:1   1st Qu.:141.2   1st Qu.:0.05617   1st Qu.:0.02778  
##  Median :1   Median :147.0   Median :0.06784   Median :0.03126  
##  Mean   :1   Mean   :148.4   Mean   :0.06591   Mean   :0.03044  
##  3rd Qu.:1   3rd Qu.:151.0   3rd Qu.:0.07496   3rd Qu.:0.03497  
##  Max.   :1   Max.   :170.0   Max.   :0.08276   Max.   :0.03797  
##        A                 V                 L                 I          
##  Min.   :0.06383   Min.   :0.05517   Min.   :0.04317   Min.   :0.03797  
##  1st Qu.:0.07178   1st Qu.:0.06349   1st Qu.:0.08019   1st Qu.:0.04301  
##  Median :0.10190   Median :0.06733   Median :0.09711   Median :0.05334  
##  Mean   :0.10598   Mean   :0.07161   Mean   :0.09424   Mean   :0.05552  
##  3rd Qu.:0.12247   3rd Qu.:0.08019   3rd Qu.:0.10685   3rd Qu.:0.06581  
##  Max.   :0.20497   Max.   :0.09317   Max.   :0.16471   Max.   :0.07914  
##        M                 C                  F                 Y           
##  Min.   :0.00000   Min.   :0.000000   Min.   :0.02941   Min.   :0.005882  
##  1st Qu.:0.00000   1st Qu.:0.008488   1st Qu.:0.04945   1st Qu.:0.008387  
##  Median :0.00959   Median :0.014337   Median :0.06185   Median :0.013975  
##  Mean   :0.01160   Mean   :0.015320   Mean   :0.06679   Mean   :0.015654  
##  3rd Qu.:0.01784   3rd Qu.:0.020654   3rd Qu.:0.08556   3rd Qu.:0.020654  
##  Max.   :0.03311   Max.   :0.026846   Max.   :0.10596   Max.   :0.028777  
##        W                  H                 K                 R          
##  Min.   :0.006623   Min.   :0.01266   Min.   :0.04027   Min.   :0.01863  
##  1st Qu.:0.015203   1st Qu.:0.03174   1st Qu.:0.04977   1st Qu.:0.02718  
##  Median :0.019427   Median :0.04667   Median :0.05920   Median :0.04818  
##  Mean   :0.018245   Mean   :0.04758   Mean   :0.06085   Mean   :0.04734  
##  3rd Qu.:0.021017   3rd Qu.:0.06714   3rd Qu.:0.07153   3rd Qu.:0.06196  
##  Max.   :0.027397   Max.   :0.07857   Max.   :0.09494   Max.   :0.09272  
##        Q                 N                  E                 D          
##  Min.   :0.01342   Min.   :0.006897   Min.   :0.01899   Min.   :0.04969  
##  1st Qu.:0.02722   1st Qu.:0.016597   1st Qu.:0.03403   1st Qu.:0.06676  
##  Median :0.03454   Median :0.030255   Median :0.05577   Median :0.07671  
##  Mean   :0.03843   Mean   :0.030074   Mean   :0.05217   Mean   :0.07642  
##  3rd Qu.:0.04904   3rd Qu.:0.040473   3rd Qu.:0.06662   3rd Qu.:0.08452  
##  Max.   :0.06475   Max.   :0.050633   Max.   :0.08054   Max.   :0.10596  
##        S                 T          
##  Min.   :0.03546   Min.   :0.01351  
##  1st Qu.:0.04145   1st Qu.:0.02691  
##  Median :0.04914   Median :0.04079  
##  Mean   :0.05493   Mean   :0.04090  
##  3rd Qu.:0.06453   3rd Qu.:0.05598  
##  Max.   :0.09317   Max.   :0.06618
# Class 2: read the per-protein table (Class, TotalAA, then one fraction
# column per amino-acid letter, as the summary output shows).
protein_class_2_aa <- readr::read_csv(
  file = "~/Dropbox/git_projects/random_forest/4_exploratory/mono/protein_class_2_aa.csv"
)
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Class = col_integer(),
##   TotalAA = col_integer()
## )
## See spec(...) for full column specifications.
# Column-wise five-number summary plus mean for every column.
summary(object = protein_class_2_aa)
##      Class      TotalAA            G                 P          
##  Min.   :2   Min.   :113.0   Min.   :0.03676   Min.   :0.02206  
##  1st Qu.:2   1st Qu.:113.2   1st Qu.:0.05162   1st Qu.:0.02824  
##  Median :2   Median :117.0   Median :0.05646   Median :0.03419  
##  Mean   :2   Mean   :117.7   Mean   :0.06162   Mean   :0.03414  
##  3rd Qu.:2   3rd Qu.:117.0   3rd Qu.:0.06142   3rd Qu.:0.03532  
##  Max.   :2   Max.   :136.0   Max.   :0.10619   Max.   :0.05128  
##        A                 V                 L                 I          
##  Min.   :0.04386   Min.   :0.02655   Min.   :0.06667   Min.   :0.02941  
##  1st Qu.:0.05478   1st Qu.:0.03441   1st Qu.:0.07033   1st Qu.:0.03449  
##  Median :0.06943   Median :0.04349   Median :0.07386   Median :0.04637  
##  Mean   :0.07529   Mean   :0.04502   Mean   :0.07702   Mean   :0.05066  
##  3rd Qu.:0.08494   3rd Qu.:0.05096   3rd Qu.:0.08333   3rd Qu.:0.06592  
##  Max.   :0.14159   Max.   :0.08824   Max.   :0.09559   Max.   :0.07965  
##        M                 C                  F                 Y          
##  Min.   :0.00885   Min.   :0.000000   Min.   :0.02941   Min.   :0.02564  
##  1st Qu.:0.01102   1st Qu.:0.002083   1st Qu.:0.06838   1st Qu.:0.03419  
##  Median :0.02353   Median :0.008547   Median :0.07697   Median :0.04276  
##  Mean   :0.02450   Mean   :0.007845   Mean   :0.07379   Mean   :0.04145  
##  3rd Qu.:0.03419   3rd Qu.:0.008830   3rd Qu.:0.07965   3rd Qu.:0.05088  
##  Max.   :0.04274   Max.   :0.017699   Max.   :0.10256   Max.   :0.05882  
##        W                 H                 K                 R          
##  Min.   :0.01667   Min.   :0.03676   Min.   :0.04274   Min.   :0.00000  
##  1st Qu.:0.02295   1st Qu.:0.04601   1st Qu.:0.07506   1st Qu.:0.02581  
##  Median :0.02564   Median :0.05646   Median :0.09402   Median :0.03419  
##  Mean   :0.02560   Mean   :0.05299   Mean   :0.08518   Mean   :0.03292  
##  3rd Qu.:0.02632   3rd Qu.:0.05983   3rd Qu.:0.09713   3rd Qu.:0.03510  
##  Max.   :0.03540   Max.   :0.06195   Max.   :0.10833   Max.   :0.09559  
##        Q                 N                 E                 D          
##  Min.   :0.01471   Min.   :0.02655   Min.   :0.03540   Min.   :0.04274  
##  1st Qu.:0.03419   1st Qu.:0.04487   1st Qu.:0.05275   1st Qu.:0.05983  
##  Median :0.03524   Median :0.05286   Median :0.06410   Median :0.08149  
##  Mean   :0.03699   Mean   :0.05124   Mean   :0.07144   Mean   :0.07723  
##  3rd Qu.:0.04387   3rd Qu.:0.05702   3rd Qu.:0.09252   3rd Qu.:0.09627  
##  Max.   :0.05310   Max.   :0.07692   Max.   :0.11966   Max.   :0.10619  
##        S                 T          
##  Min.   :0.01709   Min.   :0.01667  
##  1st Qu.:0.02295   1st Qu.:0.03449  
##  Median :0.03037   Median :0.04031  
##  Mean   :0.02896   Mean   :0.04611  
##  3rd Qu.:0.03486   3rd Qu.:0.05752  
##  Max.   :0.04167   Max.   :0.08547
# Class 3: read the per-protein table (Class, TotalAA, then one fraction
# column per amino-acid letter, as the summary output shows).
protein_class_3_aa <- readr::read_csv(
  file = "~/Dropbox/git_projects/random_forest/4_exploratory/mono/protein_class_3_aa.csv"
)
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Class = col_integer(),
##   TotalAA = col_integer()
## )
## See spec(...) for full column specifications.
# Column-wise five-number summary plus mean for every column.
summary(object = protein_class_3_aa)
##      Class      TotalAA              G                 P          
##  Min.   :3   Min.   :   7.00   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:3   1st Qu.:  20.25   1st Qu.:0.03253   1st Qu.:0.03253  
##  Median :3   Median :  26.50   Median :0.05701   Median :0.04758  
##  Mean   :3   Mean   : 337.62   Mean   :0.05230   Mean   :0.04388  
##  3rd Qu.:3   3rd Qu.: 564.50   3rd Qu.:0.06742   3rd Qu.:0.05533  
##  Max.   :3   Max.   :3408.00   Max.   :0.18182   Max.   :0.10000  
##        A                 V                 L                 I          
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.04212   1st Qu.:0.04936   1st Qu.:0.07461   1st Qu.:0.03483  
##  Median :0.05510   Median :0.06469   Median :0.08951   Median :0.04718  
##  Mean   :0.05893   Mean   :0.07263   Mean   :0.10708   Mean   :0.04610  
##  3rd Qu.:0.07143   3rd Qu.:0.08371   3rd Qu.:0.14123   3rd Qu.:0.05735  
##  Max.   :0.25000   Max.   :0.33333   Max.   :0.25000   Max.   :0.21429  
##        M                 C                  F                 Y          
##  Min.   :0.00000   Min.   :0.000000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.00000   1st Qu.:0.000000   1st Qu.:0.00000   1st Qu.:0.00000  
##  Median :0.00000   Median :0.000000   Median :0.04799   Median :0.04047  
##  Mean   :0.01061   Mean   :0.006165   Mean   :0.03852   Mean   :0.03516  
##  3rd Qu.:0.02170   3rd Qu.:0.013300   3rd Qu.:0.05964   3rd Qu.:0.04747  
##  Max.   :0.09091   Max.   :0.040000   Max.   :0.13636   Max.   :0.10000  
##        W                  H                 K                 R          
##  Min.   :0.000000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.000000   1st Qu.:0.00000   1st Qu.:0.04750   1st Qu.:0.00000  
##  Median :0.000000   Median :0.04881   Median :0.06230   Median :0.04374  
##  Mean   :0.005594   Mean   :0.04519   Mean   :0.07243   Mean   :0.03714  
##  3rd Qu.:0.013459   3rd Qu.:0.06367   3rd Qu.:0.08665   3rd Qu.:0.05301  
##  Max.   :0.024000   Max.   :0.15789   Max.   :0.19048   Max.   :0.14286  
##        Q                 N                 E                 D          
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.02929   1st Qu.:0.03497   1st Qu.:0.00000   1st Qu.:0.04821  
##  Median :0.05000   Median :0.04702   Median :0.05772   Median :0.07796  
##  Mean   :0.06447   Mean   :0.04728   Mean   :0.04752   Mean   :0.08086  
##  3rd Qu.:0.08333   3rd Qu.:0.05825   3rd Qu.:0.07151   3rd Qu.:0.09821  
##  Max.   :0.16667   Max.   :0.14286   Max.   :0.14286   Max.   :0.27273  
##        S                 T          
##  Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.04410   1st Qu.:0.02833  
##  Median :0.05612   Median :0.04867  
##  Mean   :0.06130   Mean   :0.04845  
##  3rd Qu.:0.07692   3rd Qu.:0.06092  
##  Max.   :0.16667   Max.   :0.18182
# Class 4: read the per-protein table (Class, TotalAA, then one fraction
# column per amino-acid letter, as the summary output shows).
protein_class_4_aa <- readr::read_csv(
  file = "~/Dropbox/git_projects/random_forest/4_exploratory/mono/protein_class_4_aa.csv"
)
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Class = col_integer(),
##   TotalAA = col_integer()
## )
## See spec(...) for full column specifications.
# Column-wise five-number summary plus mean for every column.
summary(object = protein_class_4_aa)
##      Class      TotalAA            G                 P          
##  Min.   :4   Min.   :144.0   Min.   :0.04167   Min.   :0.02069  
##  1st Qu.:4   1st Qu.:145.0   1st Qu.:0.05337   1st Qu.:0.02759  
##  Median :4   Median :148.0   Median :0.06757   Median :0.03247  
##  Mean   :4   Mean   :215.8   Mean   :0.06655   Mean   :0.03036  
##  3rd Qu.:4   3rd Qu.:154.0   3rd Qu.:0.07188   3rd Qu.:0.03314  
##  Max.   :4   Max.   :523.0   Max.   :0.10134   Max.   :0.03472  
##        A                 V                 L                 I          
##  Min.   :0.08987   Min.   :0.07534   Min.   :0.07457   Min.   :0.01351  
##  1st Qu.:0.12094   1st Qu.:0.09253   1st Qu.:0.08997   1st Qu.:0.02759  
##  Median :0.13636   Median :0.10811   Median :0.09091   Median :0.04138  
##  Mean   :0.13959   Mean   :0.10076   Mean   :0.09262   Mean   :0.04032  
##  3rd Qu.:0.16493   3rd Qu.:0.11039   3rd Qu.:0.09895   3rd Qu.:0.05844  
##  Max.   :0.18621   Max.   :0.11090   Max.   :0.11644   Max.   :0.06119  
##        M                  C                  F                 Y          
##  Min.   :0.006849   Min.   :0.000000   Min.   :0.02677   Min.   :0.01299  
##  1st Qu.:0.006920   1st Qu.:0.000000   1st Qu.:0.04638   1st Qu.:0.02027  
##  Median :0.012987   Median :0.000000   Median :0.04828   Median :0.02083  
##  Mean   :0.014701   Mean   :0.001912   Mean   :0.04714   Mean   :0.02186  
##  3rd Qu.:0.020552   3rd Qu.:0.000000   3rd Qu.:0.05300   3rd Qu.:0.02486  
##  Max.   :0.028681   Max.   :0.011472   Max.   :0.06207   Max.   :0.03448  
##        W                  H                 K                 R           
##  Min.   :0.005736   Min.   :0.01370   Min.   :0.08031   Min.   :0.006494  
##  1st Qu.:0.010318   1st Qu.:0.01384   1st Qu.:0.08220   1st Qu.:0.010205  
##  Median :0.013514   Median :0.02027   Median :0.09459   Median :0.013699  
##  Mean   :0.012277   Mean   :0.02095   Mean   :0.09057   Mean   :0.014851  
##  3rd Qu.:0.013841   3rd Qu.:0.02637   3rd Qu.:0.09689   3rd Qu.:0.013889  
##  Max.   :0.019481   Max.   :0.03247   Max.   :0.09740   Max.   :0.030593  
##        Q                 N                 E                 D          
##  Min.   :0.02103   Min.   :0.02083   Min.   :0.05479   Min.   :0.03247  
##  1st Qu.:0.03003   1st Qu.:0.02486   1st Qu.:0.06100   1st Qu.:0.04564  
##  Median :0.03448   Median :0.03448   Median :0.06250   Median :0.04795  
##  Mean   :0.03466   Mean   :0.03456   Mean   :0.06528   Mean   :0.04914  
##  3rd Qu.:0.04110   3rd Qu.:0.04545   3rd Qu.:0.06890   3rd Qu.:0.05536  
##  Max.   :0.04795   Max.   :0.04795   Max.   :0.08442   Max.   :0.06250  
##        S                 T          
##  Min.   :0.04828   Min.   :0.02703  
##  1st Qu.:0.06287   1st Qu.:0.04152  
##  Median :0.07457   Median :0.04861  
##  Mean   :0.07265   Mean   :0.04926  
##  3rd Qu.:0.08211   3rd Qu.:0.05657  
##  Max.   :0.09028   Max.   :0.06757
# Class 5: read the per-protein table (Class, TotalAA, then one fraction
# column per amino-acid letter, as the summary output shows).
protein_class_5_aa <- readr::read_csv(
  file = "~/Dropbox/git_projects/random_forest/4_exploratory/mono/protein_class_5_aa.csv"
)
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Class = col_integer(),
##   TotalAA = col_integer()
## )
## See spec(...) for full column specifications.
# Column-wise five-number summary plus mean for every column.
summary(object = protein_class_5_aa)
##      Class      TotalAA            G                 P           
##  Min.   :5   Min.   : 74.0   Min.   :0.04027   Min.   :0.006803  
##  1st Qu.:5   1st Qu.:153.0   1st Qu.:0.07792   1st Qu.:0.025974  
##  Median :5   Median :154.0   Median :0.09091   Median :0.025974  
##  Mean   :5   Mean   :150.8   Mean   :0.08522   Mean   :0.028144  
##  3rd Qu.:5   3rd Qu.:154.0   3rd Qu.:0.09524   3rd Qu.:0.032468  
##  Max.   :5   Max.   :378.0   Max.   :0.14151   Max.   :0.053691  
##        A                 V                 L                 I          
##  Min.   :0.04698   Min.   :0.02649   Min.   :0.04828   Min.   :0.01961  
##  1st Qu.:0.08442   1st Qu.:0.03961   1st Qu.:0.10390   1st Qu.:0.04339  
##  Median :0.09091   Median :0.04545   Median :0.11565   Median :0.05195  
##  Mean   :0.10066   Mean   :0.04878   Mean   :0.10985   Mean   :0.05130  
##  3rd Qu.:0.11039   3rd Qu.:0.05299   3rd Qu.:0.11688   3rd Qu.:0.05844  
##  Max.   :0.19728   Max.   :0.11966   Max.   :0.15625   Max.   :0.08966  
##        M                 C                  F                 Y          
##  Min.   :0.00000   Min.   :0.000000   Min.   :0.02381   Min.   :0.00000  
##  1st Qu.:0.01948   1st Qu.:0.000000   1st Qu.:0.04545   1st Qu.:0.01299  
##  Median :0.02041   Median :0.000000   Median :0.04545   Median :0.01299  
##  Mean   :0.02443   Mean   :0.002208   Mean   :0.04812   Mean   :0.01395  
##  3rd Qu.:0.02597   3rd Qu.:0.006494   3rd Qu.:0.05162   3rd Qu.:0.01299  
##  Max.   :0.05882   Max.   :0.032680   Max.   :0.10417   Max.   :0.03774  
##        W                 H                  K                 R          
##  Min.   :0.00000   Min.   :0.006803   Min.   :0.01709   Min.   :0.00000  
##  1st Qu.:0.01299   1st Qu.:0.050767   1st Qu.:0.11688   1st Qu.:0.01299  
##  Median :0.01299   Median :0.058442   Median :0.12987   Median :0.01361  
##  Mean   :0.01249   Mean   :0.056105   Mean   :0.12025   Mean   :0.01746  
##  3rd Qu.:0.01299   3rd Qu.:0.071429   3rd Qu.:0.12987   3rd Qu.:0.01948  
##  Max.   :0.02759   Max.   :0.090909   Max.   :0.15625   Max.   :0.03896  
##        Q                 N                 E                 D          
##  Min.   :0.01361   Min.   :0.00000   Min.   :0.01911   Min.   :0.01835  
##  1st Qu.:0.02597   1st Qu.:0.01299   1st Qu.:0.07792   1st Qu.:0.04707  
##  Median :0.03896   Median :0.01948   Median :0.08442   Median :0.05195  
##  Mean   :0.03947   Mean   :0.02383   Mean   :0.07879   Mean   :0.05364  
##  3rd Qu.:0.04545   3rd Qu.:0.02597   3rd Qu.:0.09091   3rd Qu.:0.05844  
##  Max.   :0.07143   Max.   :0.07438   Max.   :0.11688   Max.   :0.10828  
##        S                 T          
##  Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.03896   1st Qu.:0.02597  
##  Median :0.04545   Median :0.03247  
##  Mean   :0.04584   Mean   :0.03838  
##  3rd Qu.:0.05195   3rd Qu.:0.04545  
##  Max.   :0.12500   Max.   :0.10256
# Class 6: read the per-protein table (Class, TotalAA, then one fraction
# column per amino-acid letter, as the summary output shows).
protein_class_6_aa <- readr::read_csv(
  file = "~/Dropbox/git_projects/random_forest/4_exploratory/mono/protein_class_6_aa.csv"
)
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Class = col_integer(),
##   TotalAA = col_integer()
## )
## See spec(...) for full column specifications.
# Column-wise five-number summary plus mean for every column.
summary(object = protein_class_6_aa)
##      Class      TotalAA             G                 P          
##  Min.   :6   Min.   :  15.0   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:6   1st Qu.: 141.0   1st Qu.:0.06338   1st Qu.:0.02740  
##  Median :6   Median : 145.0   Median :0.07586   Median :0.04082  
##  Mean   :6   Mean   : 143.7   Mean   :0.07339   Mean   :0.03793  
##  3rd Qu.:6   3rd Qu.: 146.0   3rd Qu.:0.08844   3rd Qu.:0.04930  
##  Max.   :6   Max.   :1156.0   Max.   :0.11159   Max.   :0.10000  
##        A                 V                 L                 I           
##  Min.   :0.02941   Min.   :0.03333   Min.   :0.06667   Min.   :0.000000  
##  1st Qu.:0.09589   1st Qu.:0.07857   1st Qu.:0.12057   1st Qu.:0.000000  
##  Median :0.10959   Median :0.09220   Median :0.12329   Median :0.006803  
##  Mean   :0.11209   Mean   :0.09778   Mean   :0.12544   Mean   :0.008851  
##  3rd Qu.:0.12766   3rd Qu.:0.11724   3rd Qu.:0.13014   3rd Qu.:0.014085  
##  Max.   :0.40000   Max.   :0.13333   Max.   :0.18182   Max.   :0.111111  
##        M                  C                  F                 Y          
##  Min.   :0.000000   Min.   :0.000000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.006849   1st Qu.:0.006897   1st Qu.:0.04965   1st Qu.:0.01418  
##  Median :0.013605   Median :0.007092   Median :0.05479   Median :0.02055  
##  Mean   :0.012146   Mean   :0.010325   Mean   :0.05435   Mean   :0.02022  
##  3rd Qu.:0.014184   3rd Qu.:0.013699   3rd Qu.:0.05674   3rd Qu.:0.02128  
##  Max.   :0.066667   Max.   :0.064014   Max.   :0.07092   Max.   :0.05263  
##        W                  H                 K                 R          
##  Min.   :0.000000   Min.   :0.00000   Min.   :0.03521   Min.   :0.00000  
##  1st Qu.:0.007092   1st Qu.:0.05479   1st Qu.:0.07534   1st Qu.:0.02055  
##  Median :0.013605   Median :0.06164   Median :0.07801   Median :0.02128  
##  Mean   :0.011644   Mean   :0.06137   Mean   :0.08010   Mean   :0.02275  
##  3rd Qu.:0.013699   3rd Qu.:0.07092   3rd Qu.:0.08451   3rd Qu.:0.02721  
##  Max.   :0.094118   Max.   :0.08571   Max.   :0.26667   Max.   :0.07692  
##        Q                  N                 E                 D          
##  Min.   :0.000000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.007092   1st Qu.:0.02128   1st Qu.:0.02817   1st Qu.:0.04795  
##  Median :0.020548   Median :0.03546   Median :0.04255   Median :0.05674  
##  Mean   :0.020088   Mean   :0.03614   Mean   :0.04110   Mean   :0.05599  
##  3rd Qu.:0.027397   3rd Qu.:0.04795   3rd Qu.:0.05479   3rd Qu.:0.06383  
##  Max.   :0.088235   Max.   :0.08772   Max.   :0.10000   Max.   :0.08511  
##        S                 T          
##  Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.04762   1st Qu.:0.03425  
##  Median :0.06863   Median :0.04828  
##  Mean   :0.06536   Mean   :0.05075  
##  3rd Qu.:0.08451   3rd Qu.:0.06383  
##  Max.   :0.11348   Max.   :0.10563

Boxplots of Total AA Versus Protein Category

# Directory that receives the saved figure. An explicit path is used instead
# of setwd() so that drawing the plot does not change the working directory.
untrimmed_plot_dir <- path.expand("~/Dropbox/git_projects/random_forest/data")

# Protein lengths (TotalAA) per class, in class order 0-6. The list names are
# used by boxplot() as the box labels.
untrimmed_total_aa <- list(
  homosapiens    = protein_class_0_aa$TotalAA,
  erythrocruorin = protein_class_1_aa$TotalAA,
  hemerythrin    = protein_class_2_aa$TotalAA,
  hemocyanin     = protein_class_3_aa$TotalAA,
  leghemoglobin  = protein_class_4_aa$TotalAA,
  myoglobin      = protein_class_5_aa$TotalAA,
  hemoglobin     = protein_class_6_aa$TotalAA
)

# Draw the log-scale box plot of protein length per category on whichever
# graphics device is currently active, so the on-screen and saved versions
# cannot drift apart.
plot_untrimmed_total_aa <- function() {
  invisible(boxplot(untrimmed_total_aa,
                    log = "y", horizontal = FALSE,
                    main = "Total AA Versus Protein Category",
                    ylab = "Total Amino Acids per Protein"))
}

plot_untrimmed_total_aa()

# Save copy to file. The file name carries an "_untrimmed" suffix because the
# trimmed plot further down is saved as
# "boxplot_total_aa_vs_protein_category.svg"; sharing that name meant this
# copy was always overwritten.
svg(filename = file.path(untrimmed_plot_dir,
                         "boxplot_total_aa_vs_protein_category_untrimmed.svg"))
plot_untrimmed_total_aa()
dev.off()
## png 
##   2

Because very short, low-molecular-weight proteins drastically skew the percent amino acid composition of some protein categories, I decided to trim proteins of 27 AA or fewer (the code keeps TotalAA > 27) from three categories:

  • homosapiens, class = 0
  • hemocyanin, class = 3
  • hemoglobin, class = 6
# Directory that receives the trimmed per-class tables. Files are written via
# explicit paths rather than by switching the working directory.
single_aa_dir <- path.expand("~/Dropbox/git_projects/random_forest/2_single_aa")

# Keep only proteins whose TotalAA exceeds `min_total_aa`.
#
# aa_table      data frame / tibble with a TotalAA column (column 2 of the
#               per-class tables, which the original filter addressed as [, 2]).
# min_total_aa  cut-off; rows with TotalAA <= min_total_aa are dropped.
#
# Returns the filtered table. Rows with a missing TotalAA are dropped too,
# matching what subset() does.
trim_short_proteins <- function(aa_table, min_total_aa = 27) {
  aa_table[which(aa_table$TotalAA > min_total_aa), ]
}

trimmed_homo_sapien_class_0 <- trim_short_proteins(protein_class_0_aa)
write.csv(x = trimmed_homo_sapien_class_0,
          file = file.path(single_aa_dir, "trimmed_homo_sapien_class_0.csv"),
          row.names = FALSE)

# Hemocyanin is class 3 in the class order used for the box-plot labels
# (homosapiens = 0, ..., hemocyanin = 3, leghemoglobin = 4, ...). Class 3 is
# also the table that contains very short entries (minimum TotalAA of 7 in
# its summary), so this is the table that needs trimming.
trimmed_hemocyanin_class_3 <- trim_short_proteins(protein_class_3_aa)
write.csv(x = trimmed_hemocyanin_class_3,
          file = file.path(single_aa_dir, "trimmed_hemocyanin_class_3.csv"),
          row.names = FALSE)

# NOTE(review): legacy object and file. Despite the name, this filters the
# class 4 table (leghemoglobin by label order), whose minimum TotalAA is 144
# in its summary, so the filter removes nothing. It is kept unchanged so that
# later references to `trimmed_hemocyanin_class_4` and any reader of the CSV
# continue to work; consider renaming once those consumers are updated.
trimmed_hemocyanin_class_4 <- trim_short_proteins(protein_class_4_aa)
write.csv(x = trimmed_hemocyanin_class_4,
          file = file.path(single_aa_dir, "trimmed_hemocyanin_class_4.csv"),
          row.names = FALSE)

trimmed_hemoglobin_class_6 <- trim_short_proteins(protein_class_6_aa)
write.csv(x = trimmed_hemoglobin_class_6,
          file = file.path(single_aa_dir, "trimmed_hemoglobin_class_6.csv"),
          row.names = FALSE)

# Retained from the original script: the figure below is saved relative to
# this directory, and code after this point may rely on it being current.
setwd("~/Dropbox/git_projects/random_forest/data")

# Protein lengths per class after trimming, in class order 0-6. The list
# names are used by boxplot() as the box labels. The hemocyanin box now uses
# the trimmed class 3 table; the leghemoglobin box keeps the same class 4
# data the original plot drew in that position.
trimmed_total_aa <- list(
  homosapiens    = trimmed_homo_sapien_class_0$TotalAA,
  erythrocruorin = protein_class_1_aa$TotalAA,
  hemerythrin    = protein_class_2_aa$TotalAA,
  hemocyanin     = trimmed_hemocyanin_class_3$TotalAA,
  leghemoglobin  = trimmed_hemocyanin_class_4$TotalAA,
  myoglobin      = protein_class_5_aa$TotalAA,
  hemoglobin     = trimmed_hemoglobin_class_6$TotalAA
)

# Draw the log-scale box plot on whichever graphics device is active, so the
# on-screen and saved versions are produced by the same call.
plot_trimmed_total_aa <- function() {
  invisible(boxplot(trimmed_total_aa,
                    log = "y", horizontal = FALSE,
                    main = "Total AA Versus Protein Category",
                    ylab = "Total Amino Acids per Protein"))
}

plot_trimmed_total_aa()

# Save copy to file.
svg(filename = "boxplot_total_aa_vs_protein_category.svg")
plot_trimmed_total_aa()
dev.off()
## png 
##   2
# Column-wise summary of the class 0 table after trimming.
summary(object = trimmed_homo_sapien_class_0)
##      Class      TotalAA              G                 P          
##  Min.   :0   Min.   :   31.0   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0   1st Qu.:  272.0   1st Qu.:0.05032   1st Qu.:0.04245  
##  Median :0   Median :  448.0   Median :0.06435   Median :0.05575  
##  Mean   :0   Mean   :  608.5   Mean   :0.06730   Mean   :0.06199  
##  3rd Qu.:0   3rd Qu.:  721.0   3rd Qu.:0.07968   3rd Qu.:0.07476  
##  Max.   :0   Max.   :34350.0   Max.   :0.46474   Max.   :0.39241  
##        A                 V                 L                 I          
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.05464   1st Qu.:0.04831   1st Qu.:0.08024   1st Qu.:0.02982  
##  Median :0.06886   Median :0.06008   Median :0.09712   Median :0.04281  
##  Mean   :0.07231   Mean   :0.06066   Mean   :0.09841   Mean   :0.04359  
##  3rd Qu.:0.08597   3rd Qu.:0.07200   3rd Qu.:0.11529   3rd Qu.:0.05578  
##  Max.   :0.30723   Max.   :0.18852   Max.   :0.32323   Max.   :0.21538  
##        M                 C                 F                 Y          
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.01557   1st Qu.:0.01232   1st Qu.:0.02597   1st Qu.:0.01874  
##  Median :0.02135   Median :0.01931   Median :0.03556   Median :0.02682  
##  Mean   :0.02270   Mean   :0.02362   Mean   :0.03700   Mean   :0.02793  
##  3rd Qu.:0.02794   3rd Qu.:0.02820   3rd Qu.:0.04600   3rd Qu.:0.03571  
##  Max.   :0.13836   Max.   :0.36816   Max.   :0.17391   Max.   :0.24194  
##        W                  H                 K                 R          
##  Min.   :0.000000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.006523   1st Qu.:0.01750   1st Qu.:0.03894   1st Qu.:0.04399  
##  Median :0.011364   Median :0.02392   Median :0.05521   Median :0.05556  
##  Mean   :0.012874   Mean   :0.02518   Mean   :0.05772   Mean   :0.05860  
##  3rd Qu.:0.017467   3rd Qu.:0.03102   3rd Qu.:0.07246   3rd Qu.:0.06960  
##  Max.   :0.232877   Max.   :0.13725   Max.   :0.31250   Max.   :0.47059  
##        Q                 N                 E                 D          
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.03425   1st Qu.:0.02484   1st Qu.:0.05208   1st Qu.:0.03729  
##  Median :0.04405   Median :0.03488   Median :0.06693   Median :0.04750  
##  Mean   :0.04618   Mean   :0.03554   Mean   :0.06942   Mean   :0.04764  
##  3rd Qu.:0.05492   3rd Qu.:0.04502   3rd Qu.:0.08309   3rd Qu.:0.05714  
##  Max.   :0.98750   Max.   :0.13000   Max.   :0.38235   Max.   :0.19908  
##        S                 T          
##  Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.06195   1st Qu.:0.04197  
##  Median :0.07568   Median :0.05117  
##  Mean   :0.07879   Mean   :0.05254  
##  3rd Qu.:0.09232   3rd Qu.:0.06103  
##  Max.   :0.41660   Max.   :0.34949
# Column-wise summary of the table stored as `trimmed_hemocyanin_class_4`.
summary(object = trimmed_hemocyanin_class_4)
##      Class      TotalAA            G                 P          
##  Min.   :4   Min.   :144.0   Min.   :0.04167   Min.   :0.02069  
##  1st Qu.:4   1st Qu.:145.0   1st Qu.:0.05337   1st Qu.:0.02759  
##  Median :4   Median :148.0   Median :0.06757   Median :0.03247  
##  Mean   :4   Mean   :215.8   Mean   :0.06655   Mean   :0.03036  
##  3rd Qu.:4   3rd Qu.:154.0   3rd Qu.:0.07188   3rd Qu.:0.03314  
##  Max.   :4   Max.   :523.0   Max.   :0.10134   Max.   :0.03472  
##        A                 V                 L                 I          
##  Min.   :0.08987   Min.   :0.07534   Min.   :0.07457   Min.   :0.01351  
##  1st Qu.:0.12094   1st Qu.:0.09253   1st Qu.:0.08997   1st Qu.:0.02759  
##  Median :0.13636   Median :0.10811   Median :0.09091   Median :0.04138  
##  Mean   :0.13959   Mean   :0.10076   Mean   :0.09262   Mean   :0.04032  
##  3rd Qu.:0.16493   3rd Qu.:0.11039   3rd Qu.:0.09895   3rd Qu.:0.05844  
##  Max.   :0.18621   Max.   :0.11090   Max.   :0.11644   Max.   :0.06119  
##        M                  C                  F                 Y          
##  Min.   :0.006849   Min.   :0.000000   Min.   :0.02677   Min.   :0.01299  
##  1st Qu.:0.006920   1st Qu.:0.000000   1st Qu.:0.04638   1st Qu.:0.02027  
##  Median :0.012987   Median :0.000000   Median :0.04828   Median :0.02083  
##  Mean   :0.014701   Mean   :0.001912   Mean   :0.04714   Mean   :0.02186  
##  3rd Qu.:0.020552   3rd Qu.:0.000000   3rd Qu.:0.05300   3rd Qu.:0.02486  
##  Max.   :0.028681   Max.   :0.011472   Max.   :0.06207   Max.   :0.03448  
##        W                  H                 K                 R           
##  Min.   :0.005736   Min.   :0.01370   Min.   :0.08031   Min.   :0.006494  
##  1st Qu.:0.010318   1st Qu.:0.01384   1st Qu.:0.08220   1st Qu.:0.010205  
##  Median :0.013514   Median :0.02027   Median :0.09459   Median :0.013699  
##  Mean   :0.012277   Mean   :0.02095   Mean   :0.09057   Mean   :0.014851  
##  3rd Qu.:0.013841   3rd Qu.:0.02637   3rd Qu.:0.09689   3rd Qu.:0.013889  
##  Max.   :0.019481   Max.   :0.03247   Max.   :0.09740   Max.   :0.030593  
##        Q                 N                 E                 D          
##  Min.   :0.02103   Min.   :0.02083   Min.   :0.05479   Min.   :0.03247  
##  1st Qu.:0.03003   1st Qu.:0.02486   1st Qu.:0.06100   1st Qu.:0.04564  
##  Median :0.03448   Median :0.03448   Median :0.06250   Median :0.04795  
##  Mean   :0.03466   Mean   :0.03456   Mean   :0.06528   Mean   :0.04914  
##  3rd Qu.:0.04110   3rd Qu.:0.04545   3rd Qu.:0.06890   3rd Qu.:0.05536  
##  Max.   :0.04795   Max.   :0.04795   Max.   :0.08442   Max.   :0.06250  
##        S                 T          
##  Min.   :0.04828   Min.   :0.02703  
##  1st Qu.:0.06287   1st Qu.:0.04152  
##  Median :0.07457   Median :0.04861  
##  Mean   :0.07265   Mean   :0.04926  
##  3rd Qu.:0.08211   3rd Qu.:0.05657  
##  Max.   :0.09028   Max.   :0.06757
# Column-wise summary of the class 6 table after trimming.
summary(object = trimmed_hemoglobin_class_6)
##      Class      TotalAA             G                 P          
##  Min.   :6   Min.   :  30.0   Min.   :0.00000   Min.   :0.01370  
##  1st Qu.:6   1st Qu.: 141.0   1st Qu.:0.06230   1st Qu.:0.02740  
##  Median :6   Median : 145.0   Median :0.07586   Median :0.04082  
##  Mean   :6   Mean   : 144.4   Mean   :0.07343   Mean   :0.03796  
##  3rd Qu.:6   3rd Qu.: 146.0   3rd Qu.:0.08844   3rd Qu.:0.04916  
##  Max.   :6   Max.   :1156.0   Max.   :0.11159   Max.   :0.10000  
##        A                 V                 L                 I           
##  Min.   :0.02941   Min.   :0.03333   Min.   :0.07353   Min.   :0.000000  
##  1st Qu.:0.09589   1st Qu.:0.07829   1st Qu.:0.12057   1st Qu.:0.000000  
##  Median :0.10959   Median :0.09220   Median :0.12329   Median :0.006803  
##  Mean   :0.11111   Mean   :0.09759   Mean   :0.12557   Mean   :0.008896  
##  3rd Qu.:0.12766   3rd Qu.:0.11724   3rd Qu.:0.13014   3rd Qu.:0.014085  
##  Max.   :0.16901   Max.   :0.13103   Max.   :0.18182   Max.   :0.111111  
##        M                  C                  F                 Y          
##  Min.   :0.000000   Min.   :0.000000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.006849   1st Qu.:0.006897   1st Qu.:0.04965   1st Qu.:0.01418  
##  Median :0.013605   Median :0.007092   Median :0.05479   Median :0.02055  
##  Mean   :0.012036   Mean   :0.010378   Mean   :0.05463   Mean   :0.02033  
##  3rd Qu.:0.014184   3rd Qu.:0.013699   3rd Qu.:0.05674   3rd Qu.:0.02128  
##  Max.   :0.066667   Max.   :0.064014   Max.   :0.07092   Max.   :0.05263  
##        W                  H                 K                 R          
##  Min.   :0.000000   Min.   :0.00000   Min.   :0.03521   Min.   :0.00000  
##  1st Qu.:0.007092   1st Qu.:0.05479   1st Qu.:0.07534   1st Qu.:0.02055  
##  Median :0.013605   Median :0.06164   Median :0.07801   Median :0.02128  
##  Mean   :0.011705   Mean   :0.06152   Mean   :0.07948   Mean   :0.02287  
##  3rd Qu.:0.013699   3rd Qu.:0.07092   3rd Qu.:0.08363   3rd Qu.:0.02721  
##  Max.   :0.094118   Max.   :0.08571   Max.   :0.09929   Max.   :0.07692  
##        Q                  N                 E                 D          
##  Min.   :0.000000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.007092   1st Qu.:0.02128   1st Qu.:0.02817   1st Qu.:0.04795  
##  Median :0.020548   Median :0.03546   Median :0.04706   Median :0.05674  
##  Mean   :0.020192   Mean   :0.03632   Mean   :0.04132   Mean   :0.05628  
##  3rd Qu.:0.027397   3rd Qu.:0.04795   3rd Qu.:0.05479   3rd Qu.:0.06383  
##  Max.   :0.088235   Max.   :0.08772   Max.   :0.10000   Max.   :0.08511  
##        S                 T           
##  Min.   :0.02041   Min.   :0.006803  
##  1st Qu.:0.04762   1st Qu.:0.034247  
##  Median :0.07042   Median :0.049296  
##  Mean   :0.06553   Mean   :0.051013  
##  3rd Qu.:0.08451   3rd Qu.:0.063830  
##  Max.   :0.11348   Max.   :0.105634