#!/usr/bin/env bash
#
# Extract the upstream (promoter) sequence of every gene listed in GENE_LIST.
#
# Inputs (paths are relative to the working directory):
#   GENOME     genome FASTA
#   GFF        GFF annotation; "gene" features carry an ID= attribute
#   GENE_LIST  one gene ID per line
#
# Outputs:
#   all_genes.bed         BED6 record of every requested gene
#   promoter_regions.bed  BED6 record of each promoter interval
#   target_promoters.fa   promoter sequences, reverse-complemented for
#                         minus-strand genes
#
# Coordinate handling: GFF is 1-based inclusive while BED is 0-based
# half-open, so the GFF start is decremented by one when writing BED. Both
# BED files use the six-column layout (chrom, start, end, name, score,
# strand) because bedtools reads the strand from column 6.
#
# Requires: bash, awk, cut, mktemp, bedtools.

set -euo pipefail

# Input files
readonly GENOME="genome.fa"
readonly GFF="annotation.gff"
readonly GENE_LIST="TgSBT_gene_id.txt"

# Number of bases upstream of the gene start that count as the promoter.
readonly PROMOTER_LEN=2000

die() {
  printf 'Error: %s\n' "$*" >&2
  exit 1
}

command -v bedtools >/dev/null || die "bedtools not found in PATH"
for input_file in "$GENOME" "$GFF" "$GENE_LIST"; do
  [[ -r "$input_file" ]] || die "cannot read input file: $input_file"
done

# Step 1: collect the position of every requested gene.
# Genes are selected by an exact match on the ID= attribute, so a listed ID
# can never pull in a longer ID that merely contains it.
echo "Extracting gene positions..."
awk -F'\t' '
  # Tolerate CRLF line endings in either input.
  { sub(/\r$/, "") }

  # First input (gene list): remember each non-empty, trimmed ID.
  FILENAME == ARGV[1] {
    gsub(/^[ \t]+|[ \t]+$/, "")
    if ($0 != "") wanted[$0] = 1
    next
  }

  # Second input (GFF): skip comments and every non-gene feature.
  /^#/ || $3 != "gene" { next }

  {
    # Reset per record so a line without ID= cannot reuse the previous ID.
    gene_id = ""
    n = split($9, attrs, ";")
    for (i = 1; i <= n; i++) {
      # Anchored match: keys that merely end in "ID=" must not qualify.
      if (attrs[i] ~ /^[ \t]*ID=/) {
        gene_id = attrs[i]
        sub(/^[ \t]*ID=/, "", gene_id)
        break
      }
    }
    if (gene_id != "" && (gene_id in wanted)) {
      found[gene_id] = 1
      # BED6: chrom, 0-based start, end, gene ID, score placeholder, strand
      printf "%s\t%d\t%d\t%s\t.\t%s\n", $1, $4 - 1, $5, gene_id, $7
    }
  }

  END {
    for (id in wanted) {
      if (!(id in found)) {
        print "Warning: gene ID not found in GFF: " id > "/dev/stderr"
      }
    }
  }
' "$GENE_LIST" "$GFF" > all_genes.bed

[[ -s all_genes.bed ]] \
  || die "no ID in $GENE_LIST matched the ID= attribute of a gene in $GFF"

# Step 2: compute the promoter interval of each gene.
# Chromosome lengths are needed to keep minus-strand promoters inside the
# sequence; take them from the FASTA index when present, otherwise count
# the bases directly.
echo "Calculating promoter regions..."
sizes_file=$(mktemp) || die "mktemp failed"
trap 'rm -f -- "$sizes_file"' EXIT

if [[ -s "${GENOME}.fai" ]]; then
  cut -f1,2 "${GENOME}.fai" > "$sizes_file"
else
  awk '
    /^>/ { name = substr($1, 2); next }
    { sub(/\r$/, ""); len[name] += length($0) }
    END { for (name in len) printf "%s\t%d\n", name, len[name] }
  ' "$GENOME" > "$sizes_file"
fi

awk -F'\t' -v flank="$PROMOTER_LEN" '
  # First input: chromosome name and length.
  FILENAME == ARGV[1] { chrom_len[$1] = $2 + 0; next }

  {
    chrom = $1
    gene_start = $2 + 0   # 0-based
    gene_end = $3 + 0     # exclusive
    gene_id = $4
    strand = $6

    if (strand == "+") {
      # Upstream lies to the left: the flank bases just before the gene.
      promoter_start = gene_start - flank
      promoter_end = gene_start
    } else if (strand == "-") {
      # Upstream lies to the right: the flank bases just after the gene.
      promoter_start = gene_end
      promoter_end = gene_end + flank
    } else {
      print "Warning: skipping " gene_id ": strand is \"" strand "\"" > "/dev/stderr"
      next
    }

    # Keep the interval inside the chromosome.
    if (promoter_start < 0) promoter_start = 0
    if ((chrom in chrom_len) && promoter_end > chrom_len[chrom]) {
      promoter_end = chrom_len[chrom]
    }
    if (promoter_end <= promoter_start) {
      print "Warning: skipping " gene_id ": no upstream sequence available" > "/dev/stderr"
      next
    }

    # BED6: chrom, promoter start, promoter end, gene ID, score, strand
    printf "%s\t%d\t%d\t%s\t.\t%s\n", chrom, promoter_start, promoter_end, gene_id, strand
  }
' "$sizes_file" all_genes.bed > promoter_regions.bed

[[ -s promoter_regions.bed ]] || die "no promoter region could be computed"

# Step 3: extract the sequences.
# -name labels each record with the gene ID (column 4); -s reverse-complements
# minus-strand intervals using the strand in column 6.
echo "Extracting promoter sequences..."
bedtools getfasta \
  -fi "$GENOME" \
  -bed promoter_regions.bed \
  -name \
  -s \
  -fo target_promoters.fa

echo "Done! Results saved to target_promoters.fa"