根据基因ID批量提取CDS文件中的序列

By Tingting, 10 July, 2025

Forums

gwas分析后得到csv文件，首先利用excel，筛选gene，然后分列，将名字形成新的文件，如下。

（1）分列后得到的文件

（2）得到新的文件

cds文件如下

利用python进行提取cds文件后面gene=id一致的序列

def extract_sequences_by_gene_ids(gene_id_file, cds_file, output_file):
    try:
        # 读取基因 ID 文件，将基因 ID 存储到集合中，方便快速查找
        with open(gene_id_file, 'r', encoding='utf-8') as f:
            gene_ids = set(line.strip() for line in f)
        print(f"基因 ID 文件中的 ID 数量: {len(gene_ids)}")
        print("基因 ID 文件中的部分 ID（前 5 个）:", list(gene_ids)[:5])

        current_id = None
        current_sequence = ""
        extracted_sequences = {}
        matched_ids = []

        # 读取 CDS 文件
        with open(cds_file, 'r', encoding='utf-8') as f:
            for line in f:
                if line.startswith('>'):
                    # 提取 gene= 后面的基因 ID
                    parts = line.split('gene=')
                    if len(parts) > 1:
                        cds_gene_id = parts[1].split()[0]
                        print(f"CDS 文件中读取到的基因 ID: {cds_gene_id}")
                        if current_id in gene_ids:
                            extracted_sequences[current_id] = current_sequence
                            matched_ids.append(current_id)
                        current_id = cds_gene_id
                        current_sequence = ""
                    else:
                        print(f"警告：无法从该行提取基因 ID: {line}")
                else:
                    current_sequence += line.strip()

        # 处理最后一个序列
        if current_id in gene_ids:
            extracted_sequences[current_id] = current_sequence
            matched_ids.append(current_id)

        print(f"匹配到的基因 ID 数量: {len(matched_ids)}")
        print("匹配到的基因 ID:", matched_ids)

        # 将提取的序列写入输出文件
        with open(output_file, 'w', encoding='utf-8') as f:
            for gene_id, sequence in extracted_sequences.items():
                f.write(f">{gene_id}\n{sequence}\n")
    except FileNotFoundError:
        print("文件未找到，请检查文件路径。")
    except Exception as e:
        print(f"发生错误: {e}")

# 替换为你的实际文件路径
gene_id_file = 'C:/Users/27928/Desktop/刘起睿 论文 GS 20250521/3个新表型GWAS的结果/3_new_result/petalL.gene.id'
cds_file = 'E:/Rp-data/Rhododendron_genome/Rhododendron_delavayi_chr.cds.fa/Rhododendron_delavayi.cds.fa'
output_file = 'C:/Users/27928/Desktop/刘起睿 论文 GS 20250521/3个新表型GWAS的结果/3_new_result/extracted_cds.fasta'

# 调用函数进行序列提取
extract_sequences_by_gene_ids(gene_id_file, cds_file, output_file)