library("seqinr")
# Show the starting directory, then move into the folder that holds the
# UniProt FASTA downloads so they can be listed and read by bare file name.
getwd()
## [1] "/home/mcc/Dropbox/git_projects/random_forest_dk_project/code"
setwd("~/Dropbox/git_projects/random_forest_dk_project/uniprot_1")
getwd()
## [1] "/home/mcc/Dropbox/git_projects/random_forest_dk_project/uniprot_1"

# Every entry in the working directory is taken as one FASTA download;
# the listing is printed further down.
o2_binders <- list.files()

# Read the first listed file (the erythrocruorin download) as amino-acid
# sequences. `seqonly = TRUE` keeps just the sequences, without annotations.
df_erythrocruorin <- read.fasta(
  file = o2_binders[1],
  seqtype = "AA",
  seqonly = TRUE,
  as.string = TRUE
)
#head(df_erythrocruorin)

# Number of sequences read from the file.
length(df_erythrocruorin)
## [1] 14

# First entry, printed as a one-element list holding the sequence string.
df_erythrocruorin[1]
## [[1]]
## [1] "MKFLILALCFAAASALSADQISTVQASFDKVKGDPVGILYAVFKADPSIMAKFTQFAGKDLESIKGTAPFEIHANRIVGFFSKIIGELPNIEADVNTFVASHKPRGVTHDQLNNFRAGFVSYMKAHTDFAGAEAAWGATLDTFFGMIFSKM"

# Files found in the data directory, in the order used for indexing.
o2_binders
## [1] "uniprot-name%3Aerythrocruorin+AND+existence%3A-evidence+at+protein+level-.fasta"
## [2] "uniprot-name%3Ahemerythrin+AND+existence%3A-evidence+at+protein+level-.fasta"
## [3] "uniprot-name%3Ahemocyanin+AND+existence%3A-evidence+at+protein+level-.fasta"
## [4] "uniprot-name%3Aleghemoglobin+AND+existence%3A-evidence+at+protein+level-.fasta"
## [5] "uniprot-name%3Amyoglobin+AND+existence%3A-evidence+at+protein+level-.fasta"
## [6] "uniprot-taxonomy%3A-Homo+sapiens+%28Human%29+%5B9606%5D-+AND+reviewed%3Ayes+--.fasta"
## [7] "uniprot-taxonomy%3Amammalia+AND+name%3Ahemoglobin+AND+existence%3A-evidence+at--.fasta"
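Protein categories (numeric labels stored in the first column of each amino-acid table):
1. erythrocruorin
2. hemerythrin
3. hemocyanin
4. leghemoglobin
5. myoglobin
6. hemoglobin
7. Homo sapiens reviewed

Note: labels 6 and 7 are in the opposite order to files [6] and [7] in the `o2_binders` listing, where file [6] is the reviewed Homo sapiens set and file [7] is mammalian hemoglobin.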
library(stringr)
# o2_binders

## Produce AA matrix ----
# Intended layout of the composition table (one row per protein, 22 columns):
#   Col. 1    = protein category (df_erythrocruorin = 1)
#   Col. 2    = total AA per protein
#   Col. 3:22 = fraction of each residue, in the order of `amino_acids`
# The table is only preallocated with NA here; nothing in this section
# fills it.
df_erythrocruorin_aa <- data.frame(
  matrix(NA, nrow = length(df_erythrocruorin), ncol = 22)
)

# The 20 residues that are counted, in the column order given above.
amino_acids <- c(
  "G", "P", "A", "V", "L", "I", "M", "C", "F", "Y",
  "W", "H", "K", "R", "Q", "N", "E", "D", "S", "T"
)

# `[[1]]` extracts the sequence string itself. `[1]` would hand a one-element
# list to nchar() / str_count() and rely on implicit coercion to character.
first_sequence <- df_erythrocruorin[[1]]

# SECOND COLUMN IS TOTAL AA
total_aa <- nchar(first_sequence, keepNA = FALSE)

###############################################
# str_count() is vectorised over `pattern`, so a single call counts all 20
# residues; dividing by the sequence length turns the counts into fractions.
aa_fraction <- str_count(first_sequence, amino_acids) / total_aa
names(aa_fraction) <- amino_acids
aa_fraction
## Fractions recorded for the first erythrocruorin sequence:
##   G 0.07284768    P 0.03311258    A 0.1456954     V 0.05960265
##   L 0.05960265    I 0.07284768    M 0.03311258    C 0.006622517
##   F 0.1059603     Y 0.01324503    W 0.006622517   H 0.02649007
##   K 0.07284768    R 0.01986755    Q 0.02649007    N 0.03311258
##   E 0.03311258    D 0.05960265    S 0.06622517    T 0.05298013
###############################################