gwas分析后得到csv文件,首先利用excel,筛选gene,然后分列,将名字形成新的文件,如下。
(1)分列后得到的文件
(2)得到新的文件
cds文件如下
利用python进行提取cds文件后面gene=id一致的序列
def extract_sequences_by_gene_ids(gene_id_file, cds_file, output_file):
try:
# 读取基因 ID 文件,将基因 ID 存储到集合中,方便快速查找
with open(gene_id_file, 'r', encoding='utf-8') as f:
gene_ids = set(line.strip() for line in f)
print(f"基因 ID 文件中的 ID 数量: {len(gene_ids)}")
print("基因 ID 文件中的部分 ID(前 5 个):", list(gene_ids)[:5])
current_id = None
current_sequence = ""
extracted_sequences = {}
matched_ids = []
# 读取 CDS 文件
with open(cds_file, 'r', encoding='utf-8') as f:
for line in f:
if line.startswith('>'):
# 提取 gene= 后面的基因 ID
parts = line.split('gene=')
if len(parts) > 1:
cds_gene_id = parts[1].split()[0]
print(f"CDS 文件中读取到的基因 ID: {cds_gene_id}")
if current_id in gene_ids:
extracted_sequences[current_id] = current_sequence
matched_ids.append(current_id)
current_id = cds_gene_id
current_sequence = ""
else:
print(f"警告:无法从该行提取基因 ID: {line}")
else:
current_sequence += line.strip()
# 处理最后一个序列
if current_id in gene_ids:
extracted_sequences[current_id] = current_sequence
matched_ids.append(current_id)
print(f"匹配到的基因 ID 数量: {len(matched_ids)}")
print("匹配到的基因 ID:", matched_ids)
# 将提取的序列写入输出文件
with open(output_file, 'w', encoding='utf-8') as f:
for gene_id, sequence in extracted_sequences.items():
f.write(f">{gene_id}\n{sequence}\n")
except FileNotFoundError:
print("文件未找到,请检查文件路径。")
except Exception as e:
print(f"发生错误: {e}")
# 替换为你的实际文件路径
gene_id_file = 'C:/Users/27928/Desktop/刘起睿 论文 GS 20250521/3个新表型GWAS的结果/3_new_result/petalL.gene.id'
cds_file = 'E:/Rp-data/Rhododendron_genome/Rhododendron_delavayi_chr.cds.fa/Rhododendron_delavayi.cds.fa'
output_file = 'C:/Users/27928/Desktop/刘起睿 论文 GS 20250521/3个新表型GWAS的结果/3_new_result/extracted_cds.fasta'
# 调用函数进行序列提取
extract_sequences_by_gene_ids(gene_id_file, cds_file, output_file)