同物种多转录组蛋白序列合并与CD‑HIT聚类筛选

By yangyulei, 25 May, 2026

Forums

General discussion

一、流程说明

基于上一步TransDecoder预测的蛋白序列，下一步：

①同物种多转录组蛋白序列合并

②CD‑HIT聚类并筛选

注：我们需要保留至少2个转录组数据支持的高质性度蛋白质组数据，每个聚类簇仅保留最长蛋白异构体，最终输出高质量且去冗余的物种蛋白质组数据。

二、环境配置

conda install -c bioconda cd-hit
conda install -c bioconda biopython
conda install -c conda-forge pathlib

三、核心代码

from pathlib import Path
from Bio import SeqIO
from collections import defaultdict
import subprocess

RESULT_DIR = Path("/data/result")
TD_DIR = RESULT_DIR / "transdecoder"
MERGED_DIR = RESULT_DIR / "merged_peptide"  # 合并后序列
CLUSTER_DIR = RESULT_DIR / "clustering"   # 聚类中间文件
FINAL_DIR = RESULT_DIR / "species_proteome"   # 最终高质量蛋白组


# 自动创建目录
for d in [MERGED_DIR, CLUSTER_DIR, FINAL_DIR]:
    d.mkdir(parents=True, exist_ok=True)

# 物种列表
species_dict = {
    'fargesii': ['TF001','TF002','TF003','TF004','TF005','TF006'],
    'nucifera': ['TN001','TN002','TN003','TN004','TN005','TN006'],
    'taxifolia': ['JS2-1','JS2-2','JS2-3','JS2-4'],
    'californica': ['JS2-6','JS2-7','JS2-8','JS2-9','JS2-11'],
    'jackii': ['TJ001','TJ002','TJ003','TJ004','TJ005','TJ006']
}

# 同物种蛋白序列合并
def merge_species_proteins():
    species_merged = {}
    for species, samples in species_dict.items():
        out_fa = MERGED_DIR / f"{species}.all_samples.raw.pep.fa"
        species_merged[species] = out_fa

        if out_fa.exists() and out_fa.stat().st_size > 0:
            print(f"[跳过] 已合并：{species}")
            continue

        with open(out_fa, 'w') as out:
            for sample in samples:
                pep_file = TD_DIR / sample / f"{sample}.transdecoder.pep"
                for rec in SeqIO.parse(pep_file, 'fasta'):
                    # ID 标准化：样本|原始ID
                    new_id = f"{sample}|{rec.id.split()[0]}"
                    rec.id, rec.name, rec.description = new_id, new_id, ""
                    SeqIO.write(rec, out, 'fasta')
        print(f"[完成] 合并：{species}")
    return species_merged

# CD-HIT 聚类与筛选
def cluster_and_filter(species_merged):
    for species, merged_fa in species_merged.items():
        sp_dir = FINAL_DIR / species
        sp_dir.mkdir(exist_ok=True)
        final_fa = sp_dir / f"{species}.final.filtered.pep.fa"

        if final_fa.exists() and final_fa.stat().st_size > 0:
            print(f"[跳过] 已筛选：{species}")
            continue

        # CD-HIT聚类，95%相似度
        cluster_fa = CLUSTER_DIR / f"{species}.clustered.fa"
        clstr_file = CLUSTER_DIR / f"{species}.clustered.fa.clstr"

        if not cluster_fa.exists():
            cmd = [
                'cd-hit', '-i', str(merged_fa), '-o', str(cluster_fa),
                '-c', '0.95', '-n', '5', '-d', '0', '-T', '8', '-M', '60000'
            ]
            subprocess.run(cmd, check=True)

        # 解析聚类结果
        clusters = defaultdict(list)
        with open(clstr_file) as f:
            for line in f:
                if line.startswith('>Cluster'):
                    current = line.strip()
                else:
                    seq_id = line.split('>')[1].split('...')[0]
                    clusters[current].append(seq_id)

        # 高质量序列筛选：≥2 样本支持 + 最长序列 + ≥50aa
        seq_db = SeqIO.to_dict(SeqIO.parse(merged_fa, 'fasta'))
        high_qual = []
        for members in clusters.values():
            samples = {m.split('|')[0] for m in members}
            if len(samples) >= 2:
                best = max((seq_db[m] for m in members), key=lambda x: len(x.seq))
                if len(best.seq) >= 50:
                    high_qual.append(best)

        SeqIO.write(high_qual, final_fa, 'fasta')
        print(f"[完成] {species} 高质量蛋白：{len(high_qual)} 条")

# 主程序
if __name__ == "__main__":
    print("===== 开始合并多样本蛋白序列 =====")
    merged_files = merge_species_proteins()
    print("\n===== 开始聚类与高质量筛选 =====")
    cluster_and_filter(merged_files)
    print("\n===== 全部任务完成！=====")

四、结果输出

1.中间输出合并后的序列

result/merged_peptide/{物种}.all_samples.raw.pep.fa

2.最终高质量蛋白组

result/species_proteome/{物种}/{物种}.final.filtered.pep.fa

【金山文档 | WPS云文档】同物种多转录组蛋白序列合并与CD‑HIT聚类筛选 https://www.kdocs.cn/l/cn3Fn66mxWLz