1. 准备数据与环境
1.1 目录结构建议
Jbrowse/
├── data/
│ ├── female_rename.gff3 # GFF3注释文件
│ ├── female_chr_rename.fna # 参考基因组FASTA
│ ├── female_chr_rename.fna.fai # fai文件
│ └── expression_transcript_ba/ # 表达量矩阵目录
│ ├── gene_fpkm_matrix.csv
│ ├── gene_fpkm_matrix_fruit.csv
│ └── gene_fpkm_matrix_other.csv
├── output/
│ ├── bedgraph_samples/
│ ├── bigwig_files/
│ └── trackList.json
└── scripts/
├── generate_bedgraph.py
├── convert_to_bigwig.sh
└── generate_track_config.py
mkdir -p output/bedgraph_samples
mkdir -p output/bigwig_files
mkdir -p output/track_config
1.2 环境依赖
- Python 3(推荐3.7及以上)
- pip(Python包管理器)
- pandas(Python数据处理库)
- samtools(FASTA索引工具)
- bedGraphToBigWig(UCSC工具,BedGraph转BigWig)
- JBrowse(已集成到Drupal7,或单独安装)
# 安装samtools
apt-get update
apt-get install -y samtools
# 安装UCSC工具(包含bedGraphToBigWig)
wget http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/bedGraphToBigWig
chmod +x bedGraphToBigWig
mv bedGraphToBigWig /usr/local/bin/
1.3 脚本准备
1.3.1 scripts/generate_bedgraph.py
import pandas as pd
import os
import re
import glob
def process_gff(gff_file):
    """Parse a GFF3 file and extract mRNA coordinates.

    Args:
        gff_file: Path to the GFF3 annotation file.

    Returns:
        DataFrame with columns ['mrna_id', 'seqid', 'start', 'end'],
        one row per mRNA feature that carries an ``ID=`` attribute.
    """
    print("正在处理GFF3文件...")
    gff = pd.read_csv(gff_file, sep='\t', comment="#", header=None,
                      names=['seqid', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes'])
    # Keep only mRNA features
    gff_mrna = gff[gff['type'] == 'mRNA'].copy()
    # Vectorized attribute parsing. Rows without an ID= tag yield NaN and
    # are skipped with a warning, instead of crashing with AttributeError
    # as re.search(...).group(1) did on a missing match.
    gff_mrna['mrna_id'] = gff_mrna['attributes'].str.extract(r'ID=([^;]+)', expand=False)
    missing = int(gff_mrna['mrna_id'].isna().sum())
    if missing:
        print(f"警告: {missing} 条mRNA记录缺少ID属性,已跳过")
        gff_mrna = gff_mrna.dropna(subset=['mrna_id'])
    # Keep only the columns downstream code needs
    gff_mrna = gff_mrna[['mrna_id', 'seqid', 'start', 'end']]
    print(f"从GFF文件中提取了 {len(gff_mrna)} 个mRNA记录")
    return gff_mrna
def read_csv_file(file_path):
    """Read a CSV file, falling back through common Chinese encodings.

    UTF-8 is tried first, then GBK; if both raise a decode error the
    final GB2312 attempt is left unguarded so its error propagates.
    """
    for encoding in ('utf-8', 'gbk'):
        try:
            return pd.read_csv(file_path, encoding=encoding)
        except UnicodeDecodeError:
            continue
    return pd.read_csv(file_path, encoding='gb2312')
def process_expression_matrices(expr_dir):
    """Load and merge all FPKM expression matrices found in *expr_dir*.

    Every ``gene_fpkm*.csv`` file in the directory is read (with encoding
    fallback via read_csv_file) and outer-merged on ``mrna_id`` so that
    sample columns from all files end up side by side.

    Args:
        expr_dir: Directory containing the FPKM matrix CSV files.

    Returns:
        Dict mapping matrix type ('fpkm') to the merged DataFrame;
        empty dict when no matrix file could be read.
    """
    print("正在处理表达量矩阵文件...")
    # 存储表达量矩阵
    expr_matrices = {}
    combined_matrix = None
    # BUG FIX: `fpkm_files` was referenced below but never defined,
    # which made this function fail with NameError. Discover the matrix
    # files from the directory instead (sorted for a stable merge order).
    fpkm_files = sorted(os.path.basename(p)
                        for p in glob.glob(os.path.join(expr_dir, "gene_fpkm*.csv")))
    print("\n处理FPKM类型矩阵...")
    for file_name in fpkm_files:
        file_path = os.path.join(expr_dir, file_name)
        if not os.path.exists(file_path):
            print(f"警告: 文件不存在 {file_path}")
            continue
        print(f"\n处理文件: {file_name}")
        df = read_csv_file(file_path)
        # 打印列名(样本名)
        print("文件中的样本列:")
        for col in df.columns:
            if col != 'mrna_id':
                print(f"- {col}")
        print(f"矩阵大小: {df.shape}")
        # 合并矩阵
        if combined_matrix is None:
            combined_matrix = df
        else:
            # Outer merge keeps transcripts present in only one file
            common_cols = ['mrna_id']
            new_cols = [col for col in df.columns if col not in common_cols]
            combined_matrix = pd.merge(
                combined_matrix,
                df[common_cols + new_cols],
                on='mrna_id',
                how='outer'
            )
    if combined_matrix is not None:
        expr_matrices['fpkm'] = combined_matrix
        print(f"\n合并后的FPKM矩阵大小: {combined_matrix.shape}")
        print("所有样本列表:")
        for col in combined_matrix.columns:
            if col != 'mrna_id':
                print(f"- {col}")
    return expr_matrices
def merge_overlapping_regions(df):
    """Merge overlapping intervals, keeping the maximum value.

    Rows are sorted by (chrom, start, end); consecutive rows on the same
    chromosome whose start falls at or before the running end are
    collapsed into a single interval carrying the maximum value seen.
    Expects columns ['chrom', 'start', 'end', 'value'].
    """
    ordered = df.sort_values(['chrom', 'start', 'end'])
    merged = []
    current = None
    for row in ordered.itertuples(index=False):
        if (current is not None
                and row.chrom == current['chrom']
                and row.start <= current['end']):
            # Overlap: extend the running interval and keep the larger value.
            current['end'] = max(current['end'], row.end)
            current['value'] = max(current['value'], row.value)
        else:
            # New chromosome or a gap: flush the running interval first.
            if current is not None:
                merged.append(current)
            current = {'chrom': row.chrom, 'start': row.start,
                       'end': row.end, 'value': row.value}
    # Flush the final interval, if any rows were seen.
    if current is not None:
        merged.append(current)
    return pd.DataFrame(merged)
def generate_bedgraph_files(gff_mrna, expr_matrices, output_dir):
    """Write one BedGraph file per sample from the FPKM matrix.

    Args:
        gff_mrna: DataFrame ['mrna_id', 'seqid', 'start', 'end'] from process_gff().
        expr_matrices: dict from process_expression_matrices(); must contain 'fpkm'.
        output_dir: root directory; files go to <output_dir>/fpkm/<sample>.bedgraph.
    """
    print("正在生成BedGraph文件...")
    os.makedirs(output_dir, exist_ok=True)
    # 只处理FPKM表达量矩阵
    matrix_type = 'fpkm'
    expr_matrix = expr_matrices[matrix_type]
    print(f"\n处理{matrix_type}表达量矩阵...")
    # Attach genomic coordinates to each transcript's expression values
    merged = pd.merge(gff_mrna, expr_matrix, on='mrna_id')
    print(f"合并后的数据大小: {merged.shape}")
    # Every remaining column is a sample
    samples = [col for col in merged.columns if col not in ['mrna_id', 'seqid', 'start', 'end']]
    print(f"样本数量: {len(samples)}")
    print("样本列表:")
    for sample in samples:
        print(f"- {sample}")
    # Create the per-type directory once (was re-created on every loop pass).
    type_output_dir = os.path.join(output_dir, matrix_type)
    os.makedirs(type_output_dir, exist_ok=True)
    for sample in samples:
        print(f"处理样本: {sample}")
        # .copy() + column rename avoids pandas' SettingWithCopy hazard
        # when assigning .columns on a sliced frame.
        df = merged[['seqid', 'start', 'end', sample]].copy()
        df.columns = ['chrom', 'start', 'end', 'value']
        # BUG FIX: drop missing values BEFORE merging regions. The outer
        # merge in process_expression_matrices can introduce NaN, and NaN
        # previously flowed through max() and could poison merged intervals.
        df = df.dropna(subset=['value'])
        # 合并重叠区域
        df = merge_overlapping_regions(df)
        output_file = os.path.join(type_output_dir, f"{sample}.bedgraph")
        df.to_csv(output_file, sep='\t', index=False, header=False)
        print(f"已生成: {output_file}")
def main():
    """Script entry point: parse GFF, load FPKM matrices, emit BedGraph files."""
    # Fixed pipeline paths (relative to the Jbrowse/ project root)
    gff_file = "data/female_rename.gff3"
    expr_dir = "data/expression_transcript_ba"
    output_dir = "output/bedgraph_samples"
    try:
        mrna_table = process_gff(gff_file)
        matrices = process_expression_matrices(expr_dir)
        generate_bedgraph_files(mrna_table, matrices, output_dir)
        print("\n处理完成!")
        print(f"BedGraph文件已生成在目录: {output_dir}")
    except Exception as e:
        # Report the failure with a full traceback instead of crashing silently.
        print(f"发生错误: {str(e)}")
        import traceback
        print(traceback.format_exc())
if __name__ == "__main__":
    main()
1.3.2 generate_track_config.py
import json
import os
import glob
def generate_track_config(bigwig_dir="output/bigwig_files/fpkm",
                          output_file="output/trackList.json",
                          url_prefix="bigwig_files/fpkm"):
    """Generate a JBrowse trackList.json with one XYPlot track per BigWig file.

    Args:
        bigwig_dir: Directory scanned for ``*.bw`` files.
        output_file: Path of the JSON configuration file to write.
        url_prefix: URL path, relative to the JBrowse data directory,
            under which the BigWig files will be served.
    """
    print("正在生成轨道配置文件...")
    # 收集所有BigWig文件
    bigwig_files = glob.glob(os.path.join(bigwig_dir, "*.bw"))
    print(f"找到 {len(bigwig_files)} 个BigWig文件")
    # 创建轨道配置 (sorted for a deterministic track order)
    tracks = []
    for bw_file in sorted(bigwig_files):
        filename = os.path.basename(bw_file)
        sample_name = os.path.splitext(filename)[0]
        tracks.append({
            "label": sample_name,
            "key": sample_name,
            "type": "JBrowse/View/Track/Wiggle/XYPlot",
            "storeClass": "JBrowse/Store/SeqFeature/BigWig",
            # BUG FIX: the template previously contained the literal text
            # "(unknown)" instead of the BigWig file name, so every track
            # pointed at a nonexistent file.
            "urlTemplate": f"{url_prefix}/{filename}",
            "autoscale": "local",
            "style": {
                "pos_color": "#FFA07A",
                "neg_color": "#4682B4",
                "height": 100
            }
        })
    # 创建完整的配置
    config = {
        "formatVersion": 1,
        "tracks": tracks
    }
    # Make sure the output directory exists before writing.
    out_dir = os.path.dirname(output_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(output_file, 'w') as f:
        json.dump(config, f, indent=2)
    print(f"轨道配置文件已生成: {output_file}")
1.3.3 convert_to_bigwig.sh
#!/bin/bash
# Convert per-sample BedGraph files into BigWig format for JBrowse.

# Index the reference genome; the resulting .fai file supplies the
# chromosome sizes that bedGraphToBigWig requires.
echo "生成染色体大小信息..."
samtools faidx data/female_chr_rename.fna

# Ensure the output directory exists.
mkdir -p output/bigwig_files/fpkm

echo "处理FPKM数据..."
for bg_file in output/bedgraph_samples/fpkm/*.bedgraph; do
    # Skip the unexpanded glob pattern when no BedGraph files exist.
    [ -f "$bg_file" ] || continue
    base=$(basename "$bg_file")
    sample="${base%.bedgraph}"
    echo "处理样本: $sample"
    # Convert to BigWig, validating names/coordinates against the .fai index.
    bedGraphToBigWig "$bg_file" data/female_chr_rename.fna.fai "output/bigwig_files/fpkm/${sample}.bw"
done
echo "转换完成!"
2 生成 BedGraph 文件
# 生成 BedGraph
python scripts/generate_bedgraph.py
2.1 BedGraph 转 BigWig
2.1.1 BedGraph转换为BigWig格式的原理和步骤:
BedGraph格式:
BedGraph是一种文本格式,每行包含4列:染色体名称、起始位置、结束位置、值
例如:Chr01 1000 2000 5.2
这个格式记录了基因组上每个区域的表达量值
BigWig格式:
BigWig是一种二进制格式,用于高效存储和查询基因组数据
它需要两个关键信息:
染色体大小信息(.fai文件):记录每条染色体的长度
BedGraph数据:记录具体的表达量值
转换过程:
BedGraph文件 -> 染色体大小文件 -> BigWig文件
- 首先需要生成染色体大小文件(.fai)
- 然后使用bedGraphToBigWig工具进行转换
- 转换时会检查BedGraph中的染色体名称是否在.fai文件中存在
- 同时检查坐标是否超出染色体长度
3 检查参考基因组索引
samtools faidx data/female_chr_rename.fna
4 运行转换脚本
bash scripts/convert_to_bigwig.sh
输出目录:output/bigwig_files/fpkm/
每个样本一个 .bw 文件
5 生成 JBrowse 轨道配置
python scripts/generate_track_config.py
输出文件:output/trackList.json
6 部署到 Drupal7 的 JBrowse
6.1 复制文件到 JBrowse 数据目录
假设 JBrowse 数据目录为 /var/www/html/sites/default/files/jbrowse/torreya_grandis_ba/data/
# 复制 BigWig 文件
cp -r output/bigwig_files /var/www/html/sites/default/files/jbrowse/torreya_grandis_ba/data/
# 复制 trackList.json
cp output/trackList.json /var/www/html/sites/default/files/jbrowse/torreya_grandis_ba/data/
6.2 设置权限
chown -R www-data:www-data /var/www/html/sites/default/files/jbrowse/
chmod -R 755 /var/www/html/sites/default/files/jbrowse/