使用Peptides获取序列特征

library(Peptides) # 多肽和蛋白质的理化性质分析
library(seqinr) # 序列数据的输入、输出

# 定义所有序列
sequences <- list(
    Exenatide = "HGEGTFTSDLSKQMEEEAVRLFIEWLKNGGPSSGAPPPS",
    Liraglutide = "HAEGTFTSDVSSYLEGQAAKEEFIAWLVRGRG",
    Lixisenatide = "HGEGTFTSDLSKQMEEEAVRLFIEWLKNGGPSSGAPPSKKKKKK",
    Tirzepatide = "YXEGTFTSDYSIXLDKIAQKAFVQWLIAGGPSSGAPPPS"
)

# 创建结果存储列表
all_results <- list()

# 遍历并分析每个序列
for (name in names(sequences)) {
    seq <- sequences[[name]]
    
    # 1. 基础理化性质
    mw <- mw(seq)                  # 分子量
    pI <- pI(seq)                  # 等电点
    aacomp <- aaComp(seq)          # 氨基酸组成
    instab <- instaIndex(seq)      # 稳定性指数
    hydro <- hydrophobicity(seq)   # 平均疏水性
    
    # 2. 酶切模拟
    # Trypsin: 切割K/R之后,不切割KP/RP
    trypsin_cut <- unlist(strsplit(seq, "(?<=[KR])(?!P)", perl=TRUE))
    
    # DPP-IV: 识别N端的Xaa-Pro或Xaa-Ala二肽
    dppiv_cleavage <- ifelse(substr(seq, 2, 2) %in% c("P", "A"),
                                                    substr(seq, 3, nchar(seq)),
                                                    "无切割")
    
    # 保存结果
    all_results[[name]] <- list(
        Sequence = seq,
        MW = mw,
        pI = pI,
        InstabilityIndex = instab,
        Hydrophobicity = hydro,
        AminoAcidComp = aacomp,
        TrypsinFragments = trypsin_cut,
        DPPIV_Cleavage = dppiv_cleavage
    )
    
    # 输出每个序列的基本信息
    cat("\n===", name, "===\n")
    cat("序列:", seq, "\n")
    cat("分子量:", mw, "\n")
    cat("等电点:", pI, "\n")
    cat("氨基酸组成:\n")
    print(aacomp[[1]])
    cat("\n稳定性指数:", instab, "\n")
    cat("平均疏水性:", hydro, "\n")
    cat("Trypsin切割片段数:", length(trypsin_cut), "\n")
    cat("DPP-IV 预测切割结果:", dppiv_cleavage, "\n")
}
## 
## === Exenatide ===
## 序列: HGEGTFTSDLSKQMEEEAVRLFIEWLKNGGPSSGAPPPS 
## 分子量: 4187.607 
## 等电点: 4.41179 
## 氨基酸组成:
##           Number  Mole%
## Tiny          14 35.897
## Small         21 53.846
## Aliphatic      7 17.949
## Aromatic       4 10.256
## NonPolar      20 51.282
## Polar         19 48.718
## Charged       10 25.641
## Basic          4 10.256
## Acidic         6 15.385
## 
## 稳定性指数: 64.09231 
## 平均疏水性: -0.6923077 
## Trypsin切割片段数: 4 
## DPP-IV 预测切割结果: 无切割 
## 
## === Liraglutide ===
## 序列: HAEGTFTSDVSSYLEGQAAKEEFIAWLVRGRG 
## 分子量: 3512.84 
## 等电点: 4.69885 
## 氨基酸组成:
##           Number  Mole%
## Tiny          13 40.625
## Small         16 50.000
## Aliphatic      9 28.125
## Aromatic       5 15.625
## NonPolar      17 53.125
## Polar         15 46.875
## Charged        9 28.125
## Basic          4 12.500
## Acidic         5 15.625
## 
## 稳定性指数: 25.64688 
## 平均疏水性: -0.35625 
## Trypsin切割片段数: 4 
## DPP-IV 预测切割结果: EGTFTSDVSSYLEGQAAKEEFIAWLVRGRG 
## 
## === Lixisenatide ===
## 序列: HGEGTFTSDLSKQMEEEAVRLFIEWLKNGGPSSGAPPSKKKKKK 
## 分子量: 4859.535 
## 等电点: 10.32631 
## 氨基酸组成:
##           Number  Mole%
## Tiny          14 31.818
## Small         20 45.455
## Aliphatic      7 15.909
## Aromatic       4  9.091
## NonPolar      19 43.182
## Polar         25 56.818
## Charged       16 36.364
## Basic         10 22.727
## Acidic         6 13.636
## 
## 稳定性指数: 53.56818 
## 平均疏水性: -1.109091 
## Trypsin切割片段数: 9 
## DPP-IV 预测切割结果: 无切割
## 
## === Tirzepatide ===
## 序列: YXEGTFTSDYSIXLDKIAQKAFVQWLIAGGPSSGAPPPS 
## 分子量: 3900.356 
## 等电点: 4.370415 
## 氨基酸组成:
##           Number  Mole%
## Tiny          15 38.462
## Small         22 56.410
## Aliphatic     10 25.641
## Aromatic       5 12.821
## NonPolar      23 58.974
## Polar         14 35.897
## Charged        5 12.821
## Basic          2  5.128
## Acidic         3  7.692
## 
## 稳定性指数: 39.13333 
## 平均疏水性: -0.1051282 
## Trypsin切割片段数: 3 
## DPP-IV 预测切割结果: 无切割
# 比较分析
cat("\n=== 多肽药物比较 ===\n")
## 
## === 多肽药物比较 ===
mws <- sapply(all_results, function(x) x$MW)
pIs <- sapply(all_results, function(x) x$pI)
instabs <- sapply(all_results, function(x) x$InstabilityIndex)
hydros <- sapply(all_results, function(x) x$Hydrophobicity)

# 创建比较数据框
comparison <- data.frame(
    Peptide = names(sequences),
    MW = mws,
    pI = pIs,
    InstabilityIndex = instabs,
    Hydrophobicity = hydros,
    Length = sapply(sequences, nchar)
)

print(comparison)
##                   Peptide       MW        pI InstabilityIndex Hydrophobicity
## Exenatide       Exenatide 4187.607  4.411790         64.09231     -0.6923077
## Liraglutide   Liraglutide 3512.840  4.698850         25.64688     -0.3562500
## Lixisenatide Lixisenatide 4859.535 10.326311         53.56818     -1.1090909
## Tirzepatide   Tirzepatide 3900.356  4.370415         39.13333     -0.1051282
##              Length
## Exenatide        39
## Liraglutide      32
## Lixisenatide     44
## Tirzepatide      39
# 绘制比较图表
library(ggplot2)
library(reshape2)

# 转换为长格式用于绘图
comp_melt <- melt(comparison, id.vars = "Peptide")

# 创建比较柱状图
p <- ggplot(comp_melt, aes(x = Peptide, y = value, fill = Peptide)) +
    geom_bar(stat = "identity") +
    facet_wrap(~variable, scales = "free_y") +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))

print(p)

使用ftrCOOL获取序列特征

library(ftrCOOL) 
# Convert sequence list to a character vector for ftrCOOL processing
seq_vector <- unlist(sequences[-4])
names(seq_vector) <- names(sequences[-4])

# Create a temporary FASTA file for ftrCOOL
temp_fasta <- tempfile(fileext = ".fasta")
write.fasta(sequences = strsplit(seq_vector, ""), names = names(seq_vector), file.out = temp_fasta)

# Extract various features using ftrCOOL
# APAAC 在普通氨基酸组成的基础上加入 序列顺序效应 
# 常用于下游聚类、分类、活性预测
aa_comp <- APAAC(temp_fasta)
datatable(aa_comp)
# AAindex properties 上千种氨基酸理化性质(如疏水性、极性、体积、二级结构倾向、β-折叠倾向等)。
# 研究稳定性、溶解度、膜透性、受体结合
# aaindex_props <- AAindex(temp_fasta, outFormat = "txt")


# CTD (Composition-Transition-Distribution) 
# 将序列转化为 组成 (Composition)、转移 (Transition)、分布 (Distribution) 三个层次
# 用于 抗菌肽、细胞穿透肽、免疫肽 的活性预测
ctd_features <- CTD(temp_fasta)
datatable(ctd_features)

使用ProGen生成功能序列

Large language models generate functional protein sequences across diverse families

Nature Biotechnology volume 41, pages1099–1106 (2023)

python sample.py --model ${model} --t 0.8 --p 0.9 --max-length 120 --num-samples 2 --context "HAEGTFTSDVSSYLEGQAAKEFI"
loading parameters
loading parameters took 12.82s
loading tokenizer
loading tokenizer took 0.00s
sanity cross-entropy
1.9 1.9180001020431519 0.018000102043151944
sanity cross-entropy took 0.06s
sampling
HAEGTFTSDVSSYLEGQAAKEFI

0
HAEGTFTSDVSSYLEGQAAKEFISKEQSFLGVSFALMLALLLTKM1

python likelihood.py --model ${model} --context "1HGEGTFTSDLSKQMEEEAVRLFIEWLKNGGPSSGAPPPS2"
loading parameters
loading parameters took 12.60s
loading tokenizer
loading tokenizer took 0.00s
log-likelihood (left-to-right, right-to-left)
ll_sum=-108.58344268798828
ll_mean=-2.7841907739639282
log-likelihood (left-to-right, right-to-left) took 0.11s
done.

ll_mean = -2.78
每个氨基酸残基的平均 log-likelihood。
这个值更适合比较不同长度的序列。
越接近 0 越好,说明每个残基出现的概率更合理。
典型范围:
-1 ~ -2:模型认为很自然(高置信度)。
-3 ~ -4:一般(存在些不常见模式)。
< -5:很不自然,可能模型没见过这种序列。

python likelihood.py --model ${model} --context "1YXEGTFTSDYSIXLDKIAQKAFVQWLIAGGPSSGAPPPS2"
loading parameters
loading parameters took 12.63s
loading tokenizer
loading tokenizer took 0.00s
log-likelihood (left-to-right, right-to-left)
ll_sum=-121.2396011352539
ll_mean=-3.1087077856063843
log-likelihood (left-to-right, right-to-left) took 0.11s
done.