TransDecoder转录组CDS预测多物种批量流程

By yangyulei, 25 May, 2026

Forums

一、流程说明

批量对多个物种的转录组组装 fasta 文件依次进行长开放阅读框预测（TransDecoder.LongOrfs）和编码区/蛋白序列预测（TransDecoder.Predict），最终为每个样本输出蛋白 pep 序列。

二、环境配置

conda install -c bioconda transdecoder
conda install -c bioconda biopython
conda install -c anaconda pathlib  # 可省略：pathlib 自 Python 3.4 起已是标准库，无需单独安装

三、核心代码

import subprocess
from pathlib import Path
import shutil

# Directory that holds the raw transcriptome assemblies (<sample>.fasta).
DATA_DIR: Path = Path("/data/data")
# Root directory for every result produced by this pipeline.
RESULT_DIR: Path = Path("/data/result")
# TransDecoder output is collected in its own sub-directory of the result root.
TD_DIR: Path = RESULT_DIR.joinpath("transdecoder")

# Create the whole output tree up front so later steps can write into it.
TD_DIR.mkdir(parents=True, exist_ok=True)

# Species name -> IDs of the samples that belong to that species.
species_dict = {
    "fargesii": ["TF001", "TF002", "TF003", "TF004", "TF005", "TF006"],
    "nucifera": ["TN001", "TN002", "TN003", "TN004", "TN005", "TN006"],
    "taxifolia": ["JS2-1", "JS2-2", "JS2-3", "JS2-4"],
    "californica": ["JS2-6", "JS2-7", "JS2-8", "JS2-9", "JS2-11"],
    "jackii": ["TJ001", "TJ002", "TJ003", "TJ004", "TJ005", "TJ006"],
}

def _find_predicted_pep(sample_out_dir, fasta_file, final_pep):
    """Return the peptide FASTA that TransDecoder.Predict wrote for one sample.

    Args:
        sample_out_dir: Working directory the TransDecoder commands ran in.
        fasta_file: Input transcript FASTA that was passed via ``-t``.
        final_pep: Path the result will be renamed to; a file already sitting
            there comes from an earlier run and is never returned.

    Returns:
        Path of the freshly produced ``*.transdecoder.pep`` file.

    Raises:
        FileNotFoundError: If no fresh peptide file exists in the directory.
    """
    # TransDecoder.Predict normally names its output after the input file:
    # "<input file name>.transdecoder.pep".
    expected = sample_out_dir / f"{fasta_file.name}.transdecoder.pep"
    if expected.is_file():
        return expected

    # Fall back to a pattern search, but skip the renamed file of a previous
    # run and sort so the choice does not depend on directory listing order.
    candidates = sorted(
        path
        for path in sample_out_dir.glob("*.transdecoder.pep")
        if path != final_pep
    )
    if not candidates:
        raise FileNotFoundError(f"未找到TransDecoder蛋白输出文件：{expected}")
    return candidates[0]


def run_transdecoder_batch():
    """Run TransDecoder.LongOrfs and TransDecoder.Predict for every sample.

    Samples are taken from the module-level ``species_dict``; the input of each
    sample is ``DATA_DIR/<sample>.fasta`` and its results are written to
    ``TD_DIR/<sample>/``. All inputs are checked before the first TransDecoder
    call, so a missing file is reported immediately instead of after earlier
    samples have already been processed.

    Returns:
        dict mapping each sample ID to the ``Path`` of its final
        ``<sample>.transdecoder.pep`` file.

    Raises:
        FileNotFoundError: If an input FASTA is missing, or if
            TransDecoder.Predict left no peptide file behind.
        subprocess.CalledProcessError: If a TransDecoder command exits with a
            non-zero status.
    """
    # Validate every input first; the species names are not needed here.
    jobs = []
    missing = []
    for samples in species_dict.values():
        for sample in samples:
            # Absolute path: the commands below run with a different cwd.
            fasta_file = (DATA_DIR / f"{sample}.fasta").absolute()
            if fasta_file.exists():
                jobs.append((sample, fasta_file))
            else:
                missing.append(str(fasta_file))
    if missing:
        raise FileNotFoundError(f"缺失文件：{', '.join(missing)}")

    pep_files = {}
    for sample, fasta_file in jobs:
        # Per-sample working directory; TransDecoder writes into its cwd.
        sample_out_dir = TD_DIR / sample
        sample_out_dir.mkdir(parents=True, exist_ok=True)
        final_pep = sample_out_dir / f"{sample}.transdecoder.pep"

        # Step 1: extract long open reading frames.
        print(f"\n[运行] {sample} - 长ORF预测")
        subprocess.run(
            ["TransDecoder.LongOrfs", "-t", str(fasta_file)],
            cwd=str(sample_out_dir), check=True
        )

        # Step 2: predict coding regions and translate them to proteins.
        # NOTE(review): confirm that the installed TransDecoder.Predict accepts
        # --cpu; the option may not exist in every release.
        print(f"[运行] {sample} - CDS/蛋白预测")
        subprocess.run(
            ["TransDecoder.Predict", "-t", str(fasta_file), "--cpu", "4"],
            cwd=str(sample_out_dir), check=True
        )

        # Rename the peptide file to the short "<sample>.transdecoder.pep" form.
        pep_result = _find_predicted_pep(sample_out_dir, fasta_file, final_pep)
        shutil.move(str(pep_result), str(final_pep))
        pep_files[sample] = final_pep
        print(f"[完成] {sample} 蛋白序列已生成")

    return pep_files

# Script entry point: the batch only runs when the file is executed directly.
if __name__ == "__main__":
    start_banner = "===== TransDecoder 多物种批量预测开始 ====="
    done_banner = "\n===== 全部任务完成！====="

    print(start_banner)
    # Holds whatever run_transdecoder_batch() returns for the processed samples.
    pep_result_dict = run_transdecoder_batch()
    print(done_banner)

四、结果输出

/data/result/transdecoder/[样本名]/[样本名].transdecoder.pep

【金山文档 | WPS云文档】 TransDecoder转录组CDS预测多物种批量流程 https://www.kdocs.cn/l/ckkQ8raW8jaK