library(Peptides) # 多肽和蛋白质的理化性质分析
library(seqinr) # 序列数据的输入、输出
# 定义所有序列
sequences <- list(
Exenatide = "HGEGTFTSDLSKQMEEEAVRLFIEWLKNGGPSSGAPPPS",
Liraglutide = "HAEGTFTSDVSSYLEGQAAKEEFIAWLVRGRG",
Lixisenatide = "HGEGTFTSDLSKQMEEEAVRLFIEWLKNGGPSSGAPPSKKKKKK",
Tirzepatide = "YXEGTFTSDYSIXLDKIAQKAFVQWLIAGGPSSGAPPPS"
)
# 创建结果存储列表
all_results <- list()
# 遍历并分析每个序列
for (name in names(sequences)) {
seq <- sequences[[name]]
# 1. 基础理化性质
mw <- mw(seq) # 分子量
pI <- pI(seq) # 等电点
aacomp <- aaComp(seq) # 氨基酸组成
instab <- instaIndex(seq) # 稳定性指数
hydro <- hydrophobicity(seq) # 平均疏水性
# 2. 酶切模拟
# Trypsin: 切割K/R之后,不切割KP/RP
trypsin_cut <- unlist(strsplit(seq, "(?<=[KR])(?!P)", perl=TRUE))
# DPP-IV: 识别N端的Xaa-Pro或Xaa-Ala二肽
dppiv_cleavage <- ifelse(substr(seq, 2, 2) %in% c("P", "A"),
substr(seq, 3, nchar(seq)),
"无切割")
# 保存结果
all_results[[name]] <- list(
Sequence = seq,
MW = mw,
pI = pI,
InstabilityIndex = instab,
Hydrophobicity = hydro,
AminoAcidComp = aacomp,
TrypsinFragments = trypsin_cut,
DPPIV_Cleavage = dppiv_cleavage
)
# 输出每个序列的基本信息
cat("\n===", name, "===\n")
cat("序列:", seq, "\n")
cat("分子量:", mw, "\n")
cat("等电点:", pI, "\n")
cat("氨基酸组成:\n")
print(aacomp[[1]])
cat("\n稳定性指数:", instab, "\n")
cat("平均疏水性:", hydro, "\n")
cat("Trypsin切割片段数:", length(trypsin_cut), "\n")
cat("DPP-IV 预测切割结果:", dppiv_cleavage, "\n")
}
##
## === Exenatide ===
## 序列: HGEGTFTSDLSKQMEEEAVRLFIEWLKNGGPSSGAPPPS
## 分子量: 4187.607
## 等电点: 4.41179
## 氨基酸组成:
## Number Mole%
## Tiny 14 35.897
## Small 21 53.846
## Aliphatic 7 17.949
## Aromatic 4 10.256
## NonPolar 20 51.282
## Polar 19 48.718
## Charged 10 25.641
## Basic 4 10.256
## Acidic 6 15.385
##
## 稳定性指数: 64.09231
## 平均疏水性: -0.6923077
## Trypsin切割片段数: 4
## DPP-IV 预测切割结果: 无切割
##
## === Liraglutide ===
## 序列: HAEGTFTSDVSSYLEGQAAKEEFIAWLVRGRG
## 分子量: 3512.84
## 等电点: 4.69885
## 氨基酸组成:
## Number Mole%
## Tiny 13 40.625
## Small 16 50.000
## Aliphatic 9 28.125
## Aromatic 5 15.625
## NonPolar 17 53.125
## Polar 15 46.875
## Charged 9 28.125
## Basic 4 12.500
## Acidic 5 15.625
##
## 稳定性指数: 25.64688
## 平均疏水性: -0.35625
## Trypsin切割片段数: 4
## DPP-IV 预测切割结果: EGTFTSDVSSYLEGQAAKEEFIAWLVRGRG
##
## === Lixisenatide ===
## 序列: HGEGTFTSDLSKQMEEEAVRLFIEWLKNGGPSSGAPPSKKKKKK
## 分子量: 4859.535
## 等电点: 10.32631
## 氨基酸组成:
## Number Mole%
## Tiny 14 31.818
## Small 20 45.455
## Aliphatic 7 15.909
## Aromatic 4 9.091
## NonPolar 19 43.182
## Polar 25 56.818
## Charged 16 36.364
## Basic 10 22.727
## Acidic 6 13.636
##
## 稳定性指数: 53.56818
## 平均疏水性: -1.109091
## Trypsin切割片段数: 9
## DPP-IV 预测切割结果: 无切割
##
## === Tirzepatide ===
## 序列: YXEGTFTSDYSIXLDKIAQKAFVQWLIAGGPSSGAPPPS
## 分子量: 3900.356
## 等电点: 4.370415
## 氨基酸组成:
## Number Mole%
## Tiny 15 38.462
## Small 22 56.410
## Aliphatic 10 25.641
## Aromatic 5 12.821
## NonPolar 23 58.974
## Polar 14 35.897
## Charged 5 12.821
## Basic 2 5.128
## Acidic 3 7.692
##
## 稳定性指数: 39.13333
## 平均疏水性: -0.1051282
## Trypsin切割片段数: 3
## DPP-IV 预测切割结果: 无切割
##
## === 多肽药物比较 ===
mws <- sapply(all_results, function(x) x$MW)
pIs <- sapply(all_results, function(x) x$pI)
instabs <- sapply(all_results, function(x) x$InstabilityIndex)
hydros <- sapply(all_results, function(x) x$Hydrophobicity)
# 创建比较数据框
comparison <- data.frame(
Peptide = names(sequences),
MW = mws,
pI = pIs,
InstabilityIndex = instabs,
Hydrophobicity = hydros,
Length = sapply(sequences, nchar)
)
print(comparison)
## Peptide MW pI InstabilityIndex Hydrophobicity
## Exenatide Exenatide 4187.607 4.411790 64.09231 -0.6923077
## Liraglutide Liraglutide 3512.840 4.698850 25.64688 -0.3562500
## Lixisenatide Lixisenatide 4859.535 10.326311 53.56818 -1.1090909
## Tirzepatide Tirzepatide 3900.356 4.370415 39.13333 -0.1051282
## Length
## Exenatide 39
## Liraglutide 32
## Lixisenatide 44
## Tirzepatide 39
# 绘制比较图表
library(ggplot2)
library(reshape2)
# 转换为长格式用于绘图
comp_melt <- melt(comparison, id.vars = "Peptide")
# 创建比较柱状图
p <- ggplot(comp_melt, aes(x = Peptide, y = value, fill = Peptide)) +
geom_bar(stat = "identity") +
facet_wrap(~variable, scales = "free_y") +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(p)
library(ftrCOOL)
# Convert sequence list to a character vector for ftrCOOL processing
seq_vector <- unlist(sequences[-4])
names(seq_vector) <- names(sequences[-4])
# Create a temporary FASTA file for ftrCOOL
temp_fasta <- tempfile(fileext = ".fasta")
write.fasta(sequences = strsplit(seq_vector, ""), names = names(seq_vector), file.out = temp_fasta)
# Extract various features using ftrCOOL
# APAAC 在普通氨基酸组成的基础上加入 序列顺序效应
# 常用于下游聚类、分类、活性预测
aa_comp <- APAAC(temp_fasta)
datatable(aa_comp)
# AAindex properties 上千种氨基酸理化性质(如疏水性、极性、体积、二级结构倾向、β-折叠倾向等)。
# 研究稳定性、溶解度、膜透性、受体结合
# aaindex_props <- AAindex(temp_fasta, outFormat = "txt")
# CTD (Composition-Transition-Distribution)
# 将序列转化为 组成 (Composition)、转移 (Transition)、分布 (Distribution) 三个层次
# 用于 抗菌肽、细胞穿透肽、免疫肽 的活性预测
ctd_features <- CTD(temp_fasta)
datatable(ctd_features)
Large language models generate functional protein sequences across diverse families
Nature Biotechnology volume 41, pages1099–1106 (2023)
python sample.py --model ${model} --t 0.8 --p 0.9 --max-length 120 --num-samples 2 --context "HAEGTFTSDVSSYLEGQAAKEFI"
loading parameters
loading parameters took 12.82s
loading tokenizer
loading tokenizer took 0.00s
sanity cross-entropy
1.9 1.9180001020431519 0.018000102043151944
sanity cross-entropy took 0.06s
sampling
HAEGTFTSDVSSYLEGQAAKEFI
0
HAEGTFTSDVSSYLEGQAAKEFISKEQSFLGVSFALMLALLLTKM1
python likelihood.py --model ${model} --context "1HGEGTFTSDLSKQMEEEAVRLFIEWLKNGGPSSGAPPPS2"
loading parameters
loading parameters took 12.60s
loading tokenizer
loading tokenizer took 0.00s
log-likelihood (left-to-right, right-to-left)
ll_sum=-108.58344268798828
ll_mean=-2.7841907739639282
log-likelihood (left-to-right, right-to-left) took 0.11s
done.
ll_mean = -2.78
每个氨基酸残基的平均 log-likelihood。
这个值更适合比较不同长度的序列。
越接近 0 越好,说明每个残基出现的概率更合理。
典型范围:
-1 ~ -2:模型认为很自然(高置信度)。
-3 ~ -4:一般(存在些不常见模式)。
< -5:很不自然,可能模型没见过这种序列。
python likelihood.py --model ${model} --context "1YXEGTFTSDYSIXLDKIAQKAFVQWLIAGGPSSGAPPPS2"
loading parameters
loading parameters took 12.63s
loading tokenizer
loading tokenizer took 0.00s
log-likelihood (left-to-right, right-to-left)
ll_sum=-121.2396011352539
ll_mean=-3.1087077856063843
log-likelihood (left-to-right, right-to-left) took 0.11s
done.