get longest transcript seq
提取最长转录本
导入模块,设置目录
import os
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
new_path = "E:/code/get_longest_trans_test/"
os.chdir(new_path)读取fa和gff文件
# 读取fa文件
fa = "Homo_sapiens.chr1.chr2.fa"
fa_index = SeqIO.index(fa, "fasta")
chr_seq = {k:str(v.seq) for k,v in fa_index.items()}
# 读取gff文件
gff = "Homo_sapiens.GRCh38.110.chr1.chr2.gff"
gff_fi = open(gff,'r')写入四个字典:gene_dict、txID_cdsID_dict、cds_length_dict和cds_position_dict
gene_dict = {}
txID_cdsID_dict = {}
cds_length_dict = {}
cds_position_dict = {}
# 循环写入字典
gff_fi.seek(0)
for line in gff_fi:
content = line.strip().split('\t')
if len(content) <= 8:
continue
if content[2] == "gene":
# 存储geneID
gene_id = content[8].split(';')[0].split('=')[1]
gene_dict[gene_id] = []
if content[2] == "mRNA" :
# 取tx_id和tx_parent(即gengID)
tx_id = content[8].split(';')[0].split('=')[1]
tx_parent = content[8].split(';')[1]
tx_parent = tx_parent.strip().split('=')[1]
# gene_dict键为geneID,值为[mRNA_ID]
if tx_parent in gene_dict:
gene_dict[tx_parent].append(tx_id)
else:
gene_dict[tx_parent] = [tx_id]
txID_cdsID_dict[tx_id] = []
cds_length_dict[tx_id] = []
if content[2] == "CDS":
# 取cds_ID cds_parent(即mRNA_ID)以及cds_length
cds_id = content[8].split(';')[0].split('=')[1]
cds_parent = content[8].split(';')[1].split('=')[1]
cds_length = int(content[4]) - int(content[3]) + 1
# cds_length_dict键为mRNA_ID,值为[length]
if cds_parent in cds_length_dict:
cds_length_dict[cds_parent].append(cds_length)
else:
cds_length_dict[cds_parent] = [cds_length]
# 存储CDS 的 content
cds_position_dict[cds_id] = content
# txID_cdsID_dict键为mRNA_ID,值为[cds_ID]
if cds_parent in txID_cdsID_dict:
txID_cdsID_dict[cds_parent].append(cds_id)
else:
txID_cdsID_dict[cds_parent] = [cds_id]
提取最长转录本序列
# 提取最长转录本ID
for gene_id in gene_dict.keys():
temp_length = 0 # 用于存储当前最长转录本的长度
longest_tx_id = None # 用于存储当前最长转录本的ID
for tx_id in gene_dict[gene_id]:
sum_tx_length = sum(cds_length_dict[tx_id]) # 计算转录本的总长度
# 如果当前转录本的总长度大于已知的最长长度,则更新
if sum_tx_length >= temp_length:
longest_tx_id = tx_id
temp_length = sum_tx_length
# 更新gene_dict,使其值为最长转录本的ID
gene_dict[gene_id] = longest_tx_id
# 去除值为None的键值对
gene_dict = {k: v for k, v in gene_dict.items() if v is not None}写入tx_seq_fa文件
# 以上操作得到的此时gene_dict这个字典的键为最长转录本的geneID,值为最长转录本的mRNAID
# 开始将序列写入fa文件
gff_fi.seek(0)
with open("tx_seq_fa", 'w') as tx_seq_fa:
for line in gff_fi:
content = line.strip().split('\t')
if content[2] == "gene":
geneid = content[8].split(';')[0].split('=')[1]
if geneid in gene_dict.keys():
# 如果geneid存在于gene_dict的键,就写入tx_seq_fa
gene_name = content[8].split(';')[1].split('=')[1]
tx_seq_fa.write(">" + geneid + '|' + gene_name + '\n')
if content[2] == "mRNA":
tx_id = content[8].split(';')[0].split('=')[1]
if tx_id in gene_dict.values():
# 如果mRNAID存在于gene_dict的值,就写入tx_seq_fa
seq_mRNA = ""
for cds_id in txID_cdsID_dict[tx_id]:
chrom = cds_position_dict[cds_id][0]
start = int(cds_position_dict[cds_id][3])
end = int(cds_position_dict[cds_id][4])
seq = chr_seq[chrom][(start-1):end]
#seq_reverse = str(Seq(seq).reverse_complement())
strand = cds_position_dict[cds_id][6]
if strand == "-":
seq = str(Seq(seq).reverse_complement())
seq_mRNA = seq + seq_mRNA
if strand == "+":
seq_mRNA += seq
tx_seq_fa.write(seq_mRNA + '\n')