提取最长转录本 | Zhang Lab Forum

By Gengxin, 25 September, 2025

Forums

非最长转录本的蛋白质示意图

提取最长转录本

from collections import defaultdict
from Bio import SeqIO

# Input and output file names.
input_fasta = "proteins.fa"  # <- change this to your real file name
output_fasta = "longest_protein_per_gene.fa"

# gene_id -> longest record seen so far, and gene_id -> the effective length
# that record was selected with. The length is stored separately so every
# comparison uses the same measure (length with "X" removed) on both sides;
# comparing a stripped length against the raw stored sequence would bias the
# selection towards records containing many "X" residues.
longest_proteins = {}
longest_lengths = {}

for record in SeqIO.parse(input_fasta, "fasta"):
    # Find the first whitespace-separated "gene=XXX" token in the header.
    gene_id = next(
        (
            token.split("=", 1)[1]
            for token in record.description.split()
            if token.startswith("gene=")
        ),
        None,
    )
    if gene_id is None:
        continue  # skip entries without a gene= annotation

    # "X" characters are excluded from the length, as in the original script
    # (which treats X as the stop marker in this data set).
    # NOTE(review): the conventional stop symbol in protein FASTA is "*";
    # confirm which one your input actually uses.
    effective_length = len(str(record.seq).replace("X", ""))

    # Strict ">" keeps the first record encountered when lengths tie.
    if gene_id not in longest_lengths or effective_length > longest_lengths[gene_id]:
        longest_lengths[gene_id] = effective_length
        longest_proteins[gene_id] = record

# Write the selected records unchanged (the "X" stripping only affects the
# length used for ranking, not the sequence that is written out).
with open(output_fasta, "w") as out:
    SeqIO.write(longest_proteins.values(), out, "fasta")

print(f"✅ 已输出每个基因的最长蛋白到: {output_fasta}")

原始ID去掉版本号

from Bio import SeqIO

# Input and output file names.
input_fasta = "proteins.fa"  # <- replace with your real file name
output_fasta = "cleaned_proteins.fa"

with open(output_fasta, "w") as out:
    for record in SeqIO.parse(input_fasta, "fasta"):
        # Remove only a trailing numeric version suffix (e.g. "ABC123.1" ->
        # "ABC123"). Splitting at the FIRST dot would truncate IDs that contain
        # dots themselves (e.g. "Potri.001G000100.1" -> "Potri") and could
        # collapse many records onto the same ID.
        base, _, suffix = record.id.rpartition(".")
        new_id = base if base and suffix.isdigit() else record.id

        # Replace both the ID and the description so the FASTA header written
        # below consists of the cleaned ID only.
        record.id = new_id
        record.description = new_id

        SeqIO.write(record, out, "fasta")

print(f"✅ 已输出去除版本号后的转录本：{output_fasta}")

修改后文件