香榧基因组太大了,跑EDTA老是报错(Out of memory),将香榧拆成每条染色体之后也还是再运行,也还是报错(Out of memory),最后将单条染色体拆分成400或500Mb再运行,以下是脚本:
将每条染色体拆分成400-500Mb,然后批量运行
# 将每个序列(每条染色体)拆分为单独的文件
seqkit split -i -O split_by_chr genome.fasta
python chaifen.py Tgrandis.Chr.only.part_Chr3.fa 500000000 #chaifen.py脚本在下面
然后运行一下脚本:nohup bash piliang.sh &
cat piliang.sh
#!/bin/bash
# 遍历所有以 Chr1.part_ 开头的 fa 文件
for fa in Chr2.part_*.fa; do
# 生成日志名
log=${fa%.fa}.log
echo "开始运行: $fa"
echo "日志: $log"
# 运行 EDTA
docker run \
-v $PWD:/in \
-w /in \
quay.io/biocontainers/edta:1.9.6--hdfd78af_2 \
EDTA.pl \
--genome $fa \
--species others \
--sensitive 0 \
--anno 0 \
--threads 10 \
--cds Tgrandis.cds \
> $log 2>&1
echo "完成: $fa"
echo "-----------------------------------------"
done
下面是chaifen.py脚本
cat chaifen.py
#!/usr/bin/env python3
import sys
def split_fasta_into_chunks(input_fasta, chunk_size_bp):
"""将单条序列的FASTA文件按指定长度切分"""
with open(input_fasta, 'r') as infile:
# 读取第一行作为序列标题
header = infile.readline().strip()
# 移除标题行开头的 '>'
if header.startswith('>'):
header = header[1:]
else:
print("错误:输入文件不是有效的FASTA格式")
return
# 读取剩余的序列内容,并移除换行符
sequence_lines = [line.strip() for line in infile]
full_sequence = ''.join(sequence_lines)
seq_len = len(full_sequence)
num_chunks = (seq_len + chunk_size_bp - 1) // chunk_size_bp
print(f"总长度: {seq_len:,} bp")
print(f"将切分为 {num_chunks} 个片段 (~{chunk_size_bp:,} bp/片段)\n")
for i in range(num_chunks):
start = i * chunk_size_bp
end = min((i+1) * chunk_size_bp, seq_len)
chunk_seq = full_sequence[start:end]
# 按80字符一行进行换行,保持FASTA格式整洁
fasta_lines = [chunk_seq[j:j+80] for j in range(0, len(chunk_seq), 80)]
# 构建新序列的标题
new_header = f">{header}_part_{i+1} [{start+1}-{end}]"
out_filename = f"{header}.part_{i+1:03d}.fa"
with open(out_filename, 'w') as outfile:
outfile.write(new_header + '\n')
outfile.write('\n'.join(fasta_lines))
if i < num_chunks - 1:
outfile.write('\n') # 确保文件间有换行分隔
print(f"已创建: {out_filename}")
if __name__ == "__main__":
if len(sys.argv) != 3:
print("Usage: python script.py input.fasta chunk_size_bp")
print("例如: python split_fasta.py Tgrandis.Chr1.fa 400000000")
sys.exit(1)
input_file = sys.argv[1]
try:
chunk_size = int(sys.argv[2])
except ValueError:
print("错误: chunk_size_bp 必须是一个整数。")
sys.exit(1)
split_fasta_into_chunks(input_file, chunk_size)