一、流程说明
基于上一步TransDecoder预测的蛋白序列,下一步:
①同物种多转录组蛋白序列合并
②CD‑HIT聚类并筛选
注:我们需要保留至少2个转录组数据支持的高质性度蛋白质组数据,每个聚类簇仅保留最长蛋白异构体,最终输出高质量且去冗余的物种蛋白质组数据。
二、环境配置
conda install -c bioconda cd-hit
conda install -c bioconda biopython
conda install -c conda-forge pathlib三、核心代码
from pathlib import Path
from Bio import SeqIO
from collections import defaultdict
import subprocess
RESULT_DIR = Path("/data/result")
TD_DIR = RESULT_DIR / "transdecoder"
MERGED_DIR = RESULT_DIR / "merged_peptide" # 合并后序列
CLUSTER_DIR = RESULT_DIR / "clustering" # 聚类中间文件
FINAL_DIR = RESULT_DIR / "species_proteome" # 最终高质量蛋白组
# 自动创建目录
for d in [MERGED_DIR, CLUSTER_DIR, FINAL_DIR]:
d.mkdir(parents=True, exist_ok=True)
# 物种列表
species_dict = {
'fargesii': ['TF001','TF002','TF003','TF004','TF005','TF006'],
'nucifera': ['TN001','TN002','TN003','TN004','TN005','TN006'],
'taxifolia': ['JS2-1','JS2-2','JS2-3','JS2-4'],
'californica': ['JS2-6','JS2-7','JS2-8','JS2-9','JS2-11'],
'jackii': ['TJ001','TJ002','TJ003','TJ004','TJ005','TJ006']
}
# 同物种蛋白序列合并
def merge_species_proteins():
species_merged = {}
for species, samples in species_dict.items():
out_fa = MERGED_DIR / f"{species}.all_samples.raw.pep.fa"
species_merged[species] = out_fa
if out_fa.exists() and out_fa.stat().st_size > 0:
print(f"[跳过] 已合并:{species}")
continue
with open(out_fa, 'w') as out:
for sample in samples:
pep_file = TD_DIR / sample / f"{sample}.transdecoder.pep"
for rec in SeqIO.parse(pep_file, 'fasta'):
# ID 标准化:样本|原始ID
new_id = f"{sample}|{rec.id.split()[0]}"
rec.id, rec.name, rec.description = new_id, new_id, ""
SeqIO.write(rec, out, 'fasta')
print(f"[完成] 合并:{species}")
return species_merged
# CD-HIT 聚类与筛选
def cluster_and_filter(species_merged):
for species, merged_fa in species_merged.items():
sp_dir = FINAL_DIR / species
sp_dir.mkdir(exist_ok=True)
final_fa = sp_dir / f"{species}.final.filtered.pep.fa"
if final_fa.exists() and final_fa.stat().st_size > 0:
print(f"[跳过] 已筛选:{species}")
continue
# CD-HIT聚类,95%相似度
cluster_fa = CLUSTER_DIR / f"{species}.clustered.fa"
clstr_file = CLUSTER_DIR / f"{species}.clustered.fa.clstr"
if not cluster_fa.exists():
cmd = [
'cd-hit', '-i', str(merged_fa), '-o', str(cluster_fa),
'-c', '0.95', '-n', '5', '-d', '0', '-T', '8', '-M', '60000'
]
subprocess.run(cmd, check=True)
# 解析聚类结果
clusters = defaultdict(list)
with open(clstr_file) as f:
for line in f:
if line.startswith('>Cluster'):
current = line.strip()
else:
seq_id = line.split('>')[1].split('...')[0]
clusters[current].append(seq_id)
# 高质量序列筛选:≥2 样本支持 + 最长序列 + ≥50aa
seq_db = SeqIO.to_dict(SeqIO.parse(merged_fa, 'fasta'))
high_qual = []
for members in clusters.values():
samples = {m.split('|')[0] for m in members}
if len(samples) >= 2:
best = max((seq_db[m] for m in members), key=lambda x: len(x.seq))
if len(best.seq) >= 50:
high_qual.append(best)
SeqIO.write(high_qual, final_fa, 'fasta')
print(f"[完成] {species} 高质量蛋白:{len(high_qual)} 条")
# 主程序
if __name__ == "__main__":
print("===== 开始合并多样本蛋白序列 =====")
merged_files = merge_species_proteins()
print("\n===== 开始聚类与高质量筛选 =====")
cluster_and_filter(merged_files)
print("\n===== 全部任务完成!=====")四、结果输出
1.中间输出合并后的序列
result/merged_peptide/{物种}.all_samples.raw.pep.fa
2.最终高质量蛋白组
result/species_proteome/{物种}/{物种}.final.filtered.pep.fa
【金山文档 | WPS云文档】 同物种多转录组蛋白序列合并与CD‑HIT聚类筛选 https://www.kdocs.cn/l/cn3Fn66mxWLz