rm(list = ls())
#install.packages("protr")
library("protr")
## Warning: package 'protr' was built under R version 4.2.3
##############################input data
dir_path <- "D:\\2024\\20231218_kinase\\"
dir_path_name <- list.files(pattern = ".*fasta",dir_path,full.names = T, recursive = F)
dir_path_name
## [1] "D:\\2024\\20231218_kinase\\Human_kinase_domain.fasta"
## [2] "D:\\2024\\20231218_kinase\\Human_kinase_protein.fasta"
## [3] "D:\\2024\\20231218_kinase\\protr_protein_descriptor_fasta.R"
## [4] "D:\\2024\\20231218_kinase\\protr_protein_descriptor_fasta.spin.R"
## [5] "D:\\2024\\20231218_kinase\\protr_protein_descriptor_fasta.spin.Rmd"
#Data Sample
data_f <- readFASTA(grep("Human_kinase_domain.fasta",dir_path_name,value = T))
length(data_f) #516
## [1] 516
data_f[1]
## $TTBK2_Hsap
## [1] "WKVLRKIGGGGFGEIYDALDMLTRENVALKVESAQQPKQVLKMEVAVLKKLQGKDHVCRFIGCGRNDRFNYVVMQLQGRNLADLRRSQSRGTFTISTTLRLGRQILESIESIHSVGFLHRDIKPSNFAMGRFPSTCRKCYMLDFGLARQFTNSCGDVRPPRAVAGFRGTVRYASINAHRNREMGRHDDLWSLFYMLVEFVVGQLPWRKIKDKEQVGSIKERYDHRLMLKHLPPEFSIFLDHISSLDYFTKPDYQLL"
class(data_f) #[1] "list"
## [1] "list"
extracell <- data_f[(sapply(data_f, protcheck))] #to do the amino acid type sanity check and remove the non-standard sequences
length(extracell) #516
## [1] 516
# calculate Amino acid composition descriptors
ls("package:protr")
## [1] "AA2DACOR" "AA3DMoRSE" "AAACF"
## [4] "AABLOSUM100" "AABLOSUM45" "AABLOSUM50"
## [7] "AABLOSUM62" "AABLOSUM80" "AABurden"
## [10] "AAConn" "AAConst" "AACPSA"
## [13] "AADescAll" "AAEdgeAdj" "AAEigIdx"
## [16] "AAFGC" "AAGeom" "AAGETAWAY"
## [19] "AAindex" "AAInfo" "AAMetaInfo"
## [22] "AAMOE2D" "AAMOE3D" "AAMolProp"
## [25] "AAPAM120" "AAPAM250" "AAPAM30"
## [28] "AAPAM40" "AAPAM70" "AARandic"
## [31] "AARDF" "AATopo" "AATopoChg"
## [34] "AAWalk" "AAWHIM" "acc"
## [37] "crossSetSim" "extractAAC" "extractAPAAC"
## [40] "extractBLOSUM" "extractCTDC" "extractCTDCClass"
## [43] "extractCTDD" "extractCTDDClass" "extractCTDT"
## [46] "extractCTDTClass" "extractCTriad" "extractCTriadClass"
## [49] "extractDC" "extractDescScales" "extractFAScales"
## [52] "extractGeary" "extractMDSScales" "extractMoran"
## [55] "extractMoreauBroto" "extractPAAC" "extractProtFP"
## [58] "extractProtFPGap" "extractPSSM" "extractPSSMAcc"
## [61] "extractPSSMFeature" "extractQSO" "extractScales"
## [64] "extractScalesGap" "extractSOCN" "extractTC"
## [67] "getUniProt" "parGOSim" "parSeqSim"
## [70] "parSeqSimDisk" "protcheck" "protseg"
## [73] "readFASTA" "readPDB" "removeGaps"
## [76] "twoGOSim" "twoSeqSim"
func_ls <- ls("package:protr")[c(38, 49, 66, 55, 54, 52, 41, 45, 43, 47, 65, 62, 56, 39)]
func_ls
## [1] "extractAAC" "extractDC" "extractTC"
## [4] "extractMoreauBroto" "extractMoran" "extractGeary"
## [7] "extractCTDC" "extractCTDT" "extractCTDD"
## [10] "extractCTriad" "extractSOCN" "extractQSO"
## [13] "extractPAAC" "extractAPAAC"
data_list <- list()
for (i in 1:length(func_ls)) {
#i =1
data_1 <- data.frame(t(sapply(extracell, func_ls[i])))
print(dim(data_1))
data_list[[i]] <- data_1
}
## [1] 516 20
## [1] 516 400
## [1] 516 8000
## [1] 516 240
## [1] 516 240
## [1] 516 240
## [1] 516 21
## [1] 516 21
## [1] 516 105
## [1] 516 343
## [1] 516 60
## [1] 516 100
## [1] 516 50
## [1] 516 80
data_2 <- do.call("cbind", data_list)
dim(data_2) #[1] 516 9920
## [1] 516 9920
head(data_2[1:10, 1:10])
## A R N D C E
## TTBK2_Hsap 0.04296875 0.09765625 0.03125000 0.05859375 0.01953125 0.04296875
## TTBK1_Hsap 0.05078125 0.07812500 0.03515625 0.04687500 0.01171875 0.05468750
## TSSK4_Hsap 0.05947955 0.04089219 0.03717472 0.04089219 0.02230483 0.04832714
## TSSK3_Hsap 0.05859375 0.04296875 0.01562500 0.05859375 0.03125000 0.06640625
## TSSK2_Hsap 0.04214559 0.06513410 0.02681992 0.08045977 0.03065134 0.04980843
## TSSK1_Hsap 0.05363985 0.05363985 0.03065134 0.07279693 0.03065134 0.05747126
## Q G H I
## TTBK2_Hsap 0.04687500 0.07812500 0.03125000 0.05078125
## TTBK1_Hsap 0.04687500 0.07812500 0.03515625 0.04296875
## TSSK4_Hsap 0.04460967 0.05204461 0.02602230 0.07063197
## TSSK3_Hsap 0.05078125 0.07421875 0.03125000 0.05468750
## TSSK2_Hsap 0.03065134 0.04980843 0.02681992 0.09578544
## TSSK1_Hsap 0.02298851 0.04597701 0.04214559 0.08429119
write.csv(data_2, paste0(dir_path,Sys.Date(),"-","kinase_descriptor_9920.csv"),row.names = FALSE)
######################
#Amino acid composition
#extractAAC() - Amino acid composition
#extractDC() - Dipeptide composition
#extractTC() - Tripeptide composition
#Autocorrelation
#extractMoreauBroto() - Normalized Moreau-Broto autocorrelation
#extractMoran() - Moran autocorrelation
#extractGeary() - Geary autocorrelation
#CTD descriptors
#extractCTDC() - Composition
#extractCTDT() - Transition
#extractCTDD() - Distribution
#Conjoint triad descriptors
#extractCTriad() - Conjoint triad descriptors
#Quasi-sequence-order descriptors
#extractSOCN() - Sequence-order-coupling number
#extractQSO() - Quasi-sequence-order descriptors
#Pseudo-amino acid composition
#extractPAAC() - Pseudo-amino acid composition (PseAAC)
#extractAPAAC() - Amphiphilic pseudo-amino acid composition (APseAAC)
#Profile-based descriptors
#extractPSSM()
#extractPSSMAcc()
#extractPSSMFeature()
##ref https://cran.r-project.org/web/packages/protr/vignettes/protr.html
##ref https://cran.r-project.org/web/packages/protr/vignettes/protr.html