一、流程说明
批量对多个物种的转录组组装 fasta 文件进行长开放阅读框(TransDecoder.LongOrfs)预测与蛋白质序列翻译,输出蛋白 pep 序列。
二、环境配置
conda install -c bioconda transdecoder
conda install -c bioconda biopython
conda install -c anaconda pathlib  # 注:Python 3.4 及以上版本已内置 pathlib,此步通常可省略
三、核心代码
import subprocess
from pathlib import Path
import shutil
# Directory that stores the raw transcriptome FASTA files.
DATA_DIR = Path("/data/data")
# Root directory for all results.
RESULT_DIR = Path("/data/result")
# Directory that receives the TransDecoder output.
TD_DIR = RESULT_DIR.joinpath("transdecoder")
# Create the output directory (and any missing parents) up front.
TD_DIR.mkdir(parents=True, exist_ok=True)
# Sample IDs grouped by species name.
species_dict = {
    "fargesii": ["TF001", "TF002", "TF003", "TF004", "TF005", "TF006"],
    "nucifera": ["TN001", "TN002", "TN003", "TN004", "TN005", "TN006"],
    "taxifolia": ["JS2-1", "JS2-2", "JS2-3", "JS2-4"],
    "californica": ["JS2-6", "JS2-7", "JS2-8", "JS2-9", "JS2-11"],
    "jackii": ["TJ001", "TJ002", "TJ003", "TJ004", "TJ005", "TJ006"],
}
# Batch-run TransDecoder over every sample of every species.
def run_transdecoder_batch(*, cpu: int = 4) -> dict:
    """Predict long ORFs and protein sequences for every sample in ``species_dict``.

    For each sample ID ``<sample>`` the input ``DATA_DIR/<sample>.fasta`` is
    processed with ``TransDecoder.LongOrfs`` and then ``TransDecoder.Predict``,
    both executed with ``TD_DIR/<sample>`` as the working directory. The
    ``*.transdecoder.pep`` file produced there is then renamed to
    ``TD_DIR/<sample>/<sample>.transdecoder.pep``.

    Args:
        cpu: Value passed to ``TransDecoder.Predict`` via ``--cpu``.

    Returns:
        A dict mapping each sample ID to the ``Path`` of its final ``.pep`` file.

    Raises:
        ValueError: If ``cpu`` is smaller than 1.
        FileNotFoundError: If an input FASTA file is missing, or if no
            ``*.transdecoder.pep`` file exists after prediction.
        subprocess.CalledProcessError: If a TransDecoder command exits with a
            non-zero status.
    """
    if cpu < 1:
        raise ValueError(f"cpu must be >= 1, got {cpu}")

    # The species name is not needed for processing, only the sample IDs.
    # ``absolute()`` keeps the input path valid after the subprocess changes
    # its working directory, without resolving symlinks (which could change
    # the file's base name and therefore the output file names).
    fasta_by_sample = {
        sample: (DATA_DIR / f"{sample}.fasta").absolute()
        for samples in species_dict.values()
        for sample in samples
    }

    # Check every input before starting, so a missing file is reported
    # immediately rather than after earlier samples have been processed.
    missing = [str(path) for path in fasta_by_sample.values() if not path.exists()]
    if missing:
        raise FileNotFoundError(f"缺失文件:{', '.join(missing)}")

    pep_files = {}
    for sample, fasta_file in fasta_by_sample.items():
        sample_out_dir = TD_DIR / sample
        # parents=True: do not rely on TD_DIR having been created elsewhere.
        sample_out_dir.mkdir(parents=True, exist_ok=True)
        final_pep = sample_out_dir / f"{sample}.transdecoder.pep"

        # Step 1: long ORF prediction.
        print(f"\n[运行] {sample} - 长ORF预测")
        subprocess.run(
            ["TransDecoder.LongOrfs", "-t", str(fasta_file)],
            cwd=str(sample_out_dir),
            check=True,
        )

        # Step 2: CDS / protein prediction.
        # NOTE(review): ``--cpu`` is kept from the original command; confirm
        # that the installed TransDecoder release accepts this option.
        print(f"[运行] {sample} - CDS/蛋白预测")
        subprocess.run(
            ["TransDecoder.Predict", "-t", str(fasta_file), "--cpu", str(cpu)],
            cwd=str(sample_out_dir),
            check=True,
        )

        # Step 3: collect the result. The renamed file from a previous run
        # also matches the glob pattern, so it is excluded here; otherwise a
        # rerun could pick the stale file and leave the fresh one unmoved.
        produced = sorted(
            path
            for path in sample_out_dir.glob("*.transdecoder.pep")
            if path != final_pep
        )
        if produced:
            shutil.move(str(produced[0]), str(final_pep))
        elif not final_pep.exists():
            # Nothing was written at all: fail with a clear message instead
            # of an IndexError from indexing an empty list.
            raise FileNotFoundError(
                f"未找到TransDecoder蛋白结果文件:{sample_out_dir}"
            )

        pep_files[sample] = final_pep
        print(f"[完成] {sample} 蛋白序列已生成")
    return pep_files
# Script entry point: run the whole batch only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    print("===== TransDecoder 多物种批量预测开始 =====")
    pep_result_dict = run_transdecoder_batch()
print("\n===== 全部任务完成!=====")
四、结果输出
/data/result/transdecoder/[样本名]/[样本名].transdecoder.pep
【金山文档 | WPS云文档】 TransDecoder转录组CDS预测多物种批量流程 https://www.kdocs.cn/l/ckkQ8raW8jaK