library("seqinr")
# Locate the UniProt FASTA downloads and load the erythrocruorin sequences.
getwd()
## [1] "/home/mcc/Dropbox/git_projects/random_forest_dk_project/code"
# NOTE(review): setwd() ties the script to one machine's directory layout. It
# is kept here because code further down may rely on the working directory;
# consider building full paths with file.path() instead.
setwd("~/Dropbox/git_projects/random_forest_dk_project/uniprot_1")
getwd()
## [1] "/home/mcc/Dropbox/git_projects/random_forest_dk_project/uniprot_1"

# List only FASTA files so that stray files in the directory (notes, exported
# tables, ...) cannot end up in `o2_binders` and shift the file order.
o2_binders <- list.files(pattern = "\\.fasta$")

# Pick the erythrocruorin file by name instead of by position, and fail loudly
# if it is missing or ambiguous rather than silently reading another file.
erythrocruorin_file <- grep(
  "erythrocruorin",
  o2_binders,
  fixed = TRUE,
  value = TRUE
)
if (length(erythrocruorin_file) != 1L) {
  stop(
    "Expected exactly one erythrocruorin FASTA file in ", getwd(),
    ", found ", length(erythrocruorin_file), ".",
    call. = FALSE
  )
}

# With seqonly = TRUE and as.string = TRUE the result is a list holding one
# sequence string per FASTA record (see the recorded output below), not a
# data frame, despite the `df_` prefix.
df_erythrocruorin <- read.fasta(
  file = erythrocruorin_file,
  seqtype = "AA",
  seqonly = TRUE,
  as.string = TRUE
)

#head(df_erythrocruorin)

# Number of sequences read
length(df_erythrocruorin)
## [1] 14
# First sequence (still wrapped in a length-1 list)
df_erythrocruorin[1]
## [[1]]
## [1] "MKFLILALCFAAASALSADQISTVQASFDKVKGDPVGILYAVFKADPSIMAKFTQFAGKDLESIKGTAPFEIHANRIVGFFSKIIGELPNIEADVNTFVASHKPRGVTHDQLNNFRAGFVSYMKAHTDFAGAEAAWGATLDTFFGMIFSKM"
o2_binders
## [1] "uniprot-name%3Aerythrocruorin+AND+existence%3A-evidence+at+protein+level-.fasta"       
## [2] "uniprot-name%3Ahemerythrin+AND+existence%3A-evidence+at+protein+level-.fasta"          
## [3] "uniprot-name%3Ahemocyanin+AND+existence%3A-evidence+at+protein+level-.fasta"           
## [4] "uniprot-name%3Aleghemoglobin+AND+existence%3A-evidence+at+protein+level-.fasta"        
## [5] "uniprot-name%3Amyoglobin+AND+existence%3A-evidence+at+protein+level-.fasta"            
## [6] "uniprot-taxonomy%3A-Homo+sapiens+%28Human%29+%5B9606%5D-+AND+reviewed%3Ayes+--.fasta"  
## [7] "uniprot-taxonomy%3Amammalia+AND+name%3Ahemoglobin+AND+existence%3A-evidence+at--.fasta"

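# Protein categories:
# 1. erythrocruorin
# 2. hemerythrin
# 3. hemocyanin
# 4. leghemoglobin
# 5. myoglobin
# 6. hemoglobin
# 7. Homo sapiens reviewed
# Note: categories 6 and 7 are in the opposite order from the corresponding
# files in `o2_binders` (file [6] is Homo sapiens, file [7] is hemoglobin).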

library(stringr)

# o2_binders

## Produce AA matrix: one row per erythrocruorin protein, 22 columns
# Col. 1    = protein category (erythrocruorin = 1)
# Col. 2    = total number of residues in the protein
# Col. 3:22 = fraction of the protein made up by each residue, in the order
#             G, P, A, V, L, I, M, C, F, Y, W, H, K, R, Q, N, E, D, S, T
amino_acids <- c(
  "G", "P", "A", "V", "L", "I", "M", "C", "F", "Y",
  "W", "H", "K", "R", "Q", "N", "E", "D", "S", "T"
)

# Flatten the list of sequence strings into a plain character vector so that
# nchar() and str_count() work on every protein at once. as.character() keeps
# this a zero-length character vector when no sequences were read.
erythrocruorin_seqs <- as.character(
  unlist(df_erythrocruorin, use.names = FALSE)
)

# Sequence length of every protein
aa_totals <- nchar(erythrocruorin_seqs)

# Length of the first sequence. Kept under its original name and as a single
# value in case later code still refers to `total_aa`.
total_aa <- aa_totals[1]

# For each residue, count its occurrences in every sequence and divide by the
# sequence length. An empty sequence would give 0 / 0 = NaN.
aa_fractions <- lapply(amino_acids, function(aa) {
  str_count(erythrocruorin_seqs, aa) / aa_totals
})
names(aa_fractions) <- amino_acids

# Assemble the table; the named list expands into one column per residue.
df_erythrocruorin_aa <- data.frame(
  category = rep(1L, length(erythrocruorin_seqs)),
  total_aa = aa_totals,
  aa_fractions
)

###############################################
# Composition of the first protein. Previously recorded fractions for it:
#   G 0.07284768   P 0.03311258   A 0.1456954    V 0.05960265
#   L 0.05960265   I 0.07284768   M 0.03311258   C 0.006622517
#   F 0.1059603    Y 0.01324503   W 0.006622517  H 0.02649007
#   K 0.07284768   R 0.01986755   Q 0.02649007   N 0.03311258
#   E 0.03311258   D 0.05960265   S 0.06622517   T 0.05298013
df_erythrocruorin_aa[1, ]
###############################################