By chenyi, 30 June, 2025

1. 准备数据与环境

1.1 目录结构建议

Jbrowse/
├── data/
│   ├── female_rename.gff3            # GFF3注释文件
│   ├── female_chr_rename.fna         # 参考基因组FASTA
│   ├── female_chr_rename.fna.fai     # fai文件
│   └── expression_transcript_ba/     # 表达量矩阵目录
│       ├── gene_fpkm_matrix.csv
│       ├── gene_fpkm_matrix_fruit.csv
│       └── gene_fpkm_matrix_other.csv
├── output/
│   ├── bedgraph_samples/
│   ├── bigwig_files/
│   └── trackList.json
└── scripts/
    ├── generate_bedgraph.py
    ├── convert_to_bigwig.sh
    └── generate_track_config.py

mkdir -p output/bedgraph_samples
mkdir -p output/bigwig_files
mkdir -p output/track_config

1.2 环境依赖

  • Python 3(推荐3.7及以上)
  • pip(Python包管理器)
  • pandas(Python数据处理库)
  • samtools(FASTA索引工具)
  • bedGraphToBigWig(UCSC工具,BedGraph转BigWig)
  • JBrowse(已集成到Drupal7,或单独安装)
# 安装samtools
apt-get update
apt-get install -y samtools

# 安装UCSC工具(包含bedGraphToBigWig)
wget http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/bedGraphToBigWig
chmod +x bedGraphToBigWig
mv bedGraphToBigWig /usr/local/bin/

1.3 脚本准备

1.3.1 scripts/generate_bedgraph.py

import pandas as pd
import os
import re
import glob

def process_gff(gff_file):
    """Parse a GFF3 file and extract mRNA coordinates.

    Parameters
    ----------
    gff_file : str
        Path to the GFF3 annotation file.

    Returns
    -------
    pandas.DataFrame
        One row per mRNA with columns: mrna_id, seqid, start, end
        (start/end are the 1-based GFF coordinates, unchanged).
    """
    print("正在处理GFF3文件...")
    gff = pd.read_csv(gff_file, sep='\t', comment="#", header=None,
                     names=['seqid', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes'])

    # Keep only the mRNA feature rows
    gff_mrna = gff[gff['type'] == 'mRNA'].copy()

    # Extract the ID attribute.  BUG FIX: the original called
    # re.search(...).group(1) directly, which raises AttributeError when a
    # row has no "ID=" attribute; such rows are now skipped with a warning.
    id_pattern = re.compile(r'ID=([^;]+)')

    def _extract_id(attributes):
        # Return the ID value, or None when the attribute is absent.
        match = id_pattern.search(attributes)
        return match.group(1) if match else None

    gff_mrna['mrna_id'] = gff_mrna['attributes'].apply(_extract_id)
    n_missing = int(gff_mrna['mrna_id'].isna().sum())
    if n_missing:
        print(f"警告: {n_missing} 条mRNA记录缺少ID属性,已跳过")
        gff_mrna = gff_mrna.dropna(subset=['mrna_id'])

    # Keep only the columns needed downstream
    gff_mrna = gff_mrna[['mrna_id', 'seqid', 'start', 'end']]

    print(f"从GFF文件中提取了 {len(gff_mrna)} 个mRNA记录")
    return gff_mrna

def read_csv_file(file_path):
    """Read a CSV file, falling back through common Chinese encodings.

    Tries utf-8 first, then gbk, then gb2312.  If the final attempt also
    fails, the UnicodeDecodeError propagates to the caller, exactly as the
    innermost reader would.
    """
    encodings = ('utf-8', 'gbk', 'gb2312')
    last_index = len(encodings) - 1
    for position, encoding in enumerate(encodings):
        try:
            return pd.read_csv(file_path, encoding=encoding)
        except UnicodeDecodeError:
            if position == last_index:
                raise

def process_expression_matrices(expr_dir, fpkm_files=None):
    """Read every FPKM expression matrix under ``expr_dir`` and merge them.

    Parameters
    ----------
    expr_dir : str
        Directory containing the CSV expression matrices.
    fpkm_files : list[str] | None
        File names (relative to ``expr_dir``) to load.  When None, every
        ``*.csv`` file in ``expr_dir`` is used, in sorted order.

    Returns
    -------
    dict
        ``{'fpkm': <merged DataFrame>}``, or an empty dict when no file
        could be loaded.  Matrices are outer-merged on the ``mrna_id``
        column so transcripts missing from one file are still kept.
    """
    print("正在处理表达量矩阵文件...")

    # BUG FIX: the original loop iterated over ``fpkm_files`` without ever
    # defining it, so every call raised NameError.  Default to all CSV
    # files in expr_dir; callers may still pass an explicit list.
    if fpkm_files is None:
        fpkm_files = sorted(
            os.path.basename(p)
            for p in glob.glob(os.path.join(expr_dir, '*.csv'))
        )

    # Collected matrices, keyed by type (only 'fpkm' for now)
    expr_matrices = {}
    combined_matrix = None

    print("\n处理FPKM类型矩阵...")

    for file_name in fpkm_files:
        file_path = os.path.join(expr_dir, file_name)
        if not os.path.exists(file_path):
            print(f"警告: 文件不存在 {file_path}")
            continue

        print(f"\n处理文件: {file_name}")
        df = read_csv_file(file_path)

        # List the sample columns (everything except the ID column)
        print("文件中的样本列:")
        for col in df.columns:
            if col != 'mrna_id':
                print(f"- {col}")

        print(f"矩阵大小: {df.shape}")

        # Merge this file's sample columns into the combined matrix
        if combined_matrix is None:
            combined_matrix = df
        else:
            common_cols = ['mrna_id']
            new_cols = [col for col in df.columns if col not in common_cols]
            combined_matrix = pd.merge(
                combined_matrix,
                df[common_cols + new_cols],
                on='mrna_id',
                how='outer'
            )

    if combined_matrix is not None:
        expr_matrices['fpkm'] = combined_matrix
        print(f"\n合并后的FPKM矩阵大小: {combined_matrix.shape}")
        print("所有样本列表:")
        for col in combined_matrix.columns:
            if col != 'mrna_id':
                print(f"- {col}")

    return expr_matrices

def merge_overlapping_regions(df):
    """Collapse overlapping intervals per chromosome, keeping the max value.

    Expects a DataFrame with columns chrom/start/end/value.  Rows are
    sorted by (chrom, start, end); any interval whose start lies at or
    before the end of the currently open interval on the same chromosome
    is fused into it, extending the end and taking the maximum value.

    Returns a new DataFrame of the merged intervals in the same column
    order.
    """
    ordered = df.sort_values(['chrom', 'start', 'end'])

    merged_rows = []
    open_region = None  # the interval currently being extended, or None

    for rec in ordered.itertuples(index=False):
        overlaps = (
            open_region is not None
            and rec.chrom == open_region['chrom']
            and rec.start <= open_region['end']
        )
        if overlaps:
            # Same chromosome and touching/overlapping: extend in place
            open_region['end'] = max(open_region['end'], rec.end)
            open_region['value'] = max(open_region['value'], rec.value)
        else:
            # Disjoint (or first record): flush the open interval and start anew
            if open_region is not None:
                merged_rows.append(open_region)
            open_region = {
                'chrom': rec.chrom,
                'start': rec.start,
                'end': rec.end,
                'value': rec.value,
            }

    # Flush the final open interval, if any
    if open_region is not None:
        merged_rows.append(open_region)

    return pd.DataFrame(merged_rows)

def generate_bedgraph_files(gff_mrna, expr_matrices, output_dir):
    """Write one BedGraph file per sample from the merged FPKM matrix.

    Parameters
    ----------
    gff_mrna : pandas.DataFrame
        mRNA coordinates (mrna_id, seqid, start, end) from process_gff;
        start/end are 1-based GFF coordinates.
    expr_matrices : dict
        Must contain an 'fpkm' DataFrame keyed on the mrna_id column.
    output_dir : str
        Root output directory; files go to <output_dir>/fpkm/<sample>.bedgraph.
    """
    print("正在生成BedGraph文件...")

    os.makedirs(output_dir, exist_ok=True)

    # Only the FPKM matrix is processed here
    matrix_type = 'fpkm'
    expr_matrix = expr_matrices[matrix_type]
    print(f"\n处理{matrix_type}表达量矩阵...")

    # Attach genomic coordinates to expression values
    merged = pd.merge(gff_mrna, expr_matrix, on='mrna_id')
    print(f"合并后的数据大小: {merged.shape}")

    # Every non-coordinate column is a sample
    samples = [col for col in merged.columns if col not in ['mrna_id', 'seqid', 'start', 'end']]
    print(f"样本数量: {len(samples)}")
    print("样本列表:")
    for sample in samples:
        print(f"- {sample}")

    # Create the per-type output directory once, not once per sample
    type_output_dir = os.path.join(output_dir, matrix_type)
    os.makedirs(type_output_dir, exist_ok=True)

    for sample in samples:
        print(f"处理样本: {sample}")
        # .copy() avoids pandas' SettingWithCopyWarning when renaming columns
        df = merged[['seqid', 'start', 'end', sample]].copy()
        df.columns = ['chrom', 'start', 'end', 'value']

        # BUG FIX: drop missing values BEFORE merging.  The original called
        # dropna() after merge_overlapping_regions, which let NaN propagate
        # through max() and then silently discarded whole merged intervals.
        df = df.dropna(subset=['value'])

        # BUG FIX: GFF coordinates are 1-based inclusive, while BedGraph is
        # 0-based half-open — bedGraphToBigWig expects the latter, so the
        # start must be shifted down by one.
        df['start'] = df['start'] - 1

        # Collapse overlapping transcript intervals, keeping the max value
        df = merge_overlapping_regions(df)

        output_file = os.path.join(type_output_dir, f"{sample}.bedgraph")
        df.to_csv(output_file, sep='\t', index=False, header=False)
        print(f"已生成: {output_file}")

def main():
    """Script entry point: parse GFF, load FPKM matrices, emit BedGraphs."""
    # Input/output locations (relative to the Jbrowse/ project root)
    gff_file = "data/female_rename.gff3"
    expr_dir = "data/expression_transcript_ba"
    output_dir = "output/bedgraph_samples"

    try:
        gff_mrna = process_gff(gff_file)
        expr_matrices = process_expression_matrices(expr_dir)
        generate_bedgraph_files(gff_mrna, expr_matrices, output_dir)
    except Exception as e:
        # Report the failure with a full traceback instead of crashing
        print(f"发生错误: {str(e)}")
        import traceback
        print(traceback.format_exc())
    else:
        print("\n处理完成!")
        print(f"BedGraph文件已生成在目录: {output_dir}")


if __name__ == "__main__":
    main()

1.3.2 generate_track_config.py

import json
import os
import glob

def generate_track_config():
    """Generate the JBrowse trackList.json for all FPKM BigWig tracks.

    Scans output/bigwig_files/fpkm/*.bw and emits one Wiggle/XYPlot track
    per sample into output/trackList.json.  urlTemplate paths are relative
    to the JBrowse data directory the files are deployed into.
    """
    print("正在生成轨道配置文件...")

    # Collect all BigWig files produced by convert_to_bigwig.sh
    bigwig_files = glob.glob("output/bigwig_files/fpkm/*.bw")
    print(f"找到 {len(bigwig_files)} 个BigWig文件")

    # One track definition per sample
    tracks = []
    for bw_file in bigwig_files:
        filename = os.path.basename(bw_file)
        sample_name = os.path.splitext(filename)[0]

        track = {
            "label": sample_name,
            "key": sample_name,
            "type": "JBrowse/View/Track/Wiggle/XYPlot",
            "storeClass": "JBrowse/Store/SeqFeature/BigWig",
            # BUG FIX: the template previously contained the literal string
            # "(unknown)" instead of the per-sample BigWig file name, so
            # every track pointed at a non-existent file.
            "urlTemplate": f"bigwig_files/fpkm/{filename}",
            "autoscale": "local",
            "style": {
                "pos_color": "#FFA07A",
                "neg_color": "#4682B4",
                "height": 100
            }
        }
        tracks.append(track)

    # Full JBrowse track configuration
    config = {
        "formatVersion": 1,
        "tracks": tracks
    }

    # Write the configuration, creating the output directory if needed
    output_file = "output/trackList.json"
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    with open(output_file, 'w') as f:
        json.dump(config, f, indent=2)

    print(f"轨道配置文件已生成: {output_file}")


if __name__ == "__main__":
    generate_track_config()

1.3.3 convert_to_bigwig.sh

#!/bin/bash
# Convert per-sample FPKM BedGraph files into BigWig tracks for JBrowse.
# Requires: samtools (to build the .fai chromosome-size index) and the
# UCSC bedGraphToBigWig binary on PATH.

# Generate the chromosome size index (.fai) for the reference FASTA
echo "生成染色体大小信息..."
samtools faidx data/female_chr_rename.fna

# Create the output directory
mkdir -p output/bigwig_files/fpkm

# Process the FPKM data: one BedGraph -> one BigWig per sample
echo "处理FPKM数据..."
for bedgraph in output/bedgraph_samples/fpkm/*.bedgraph; do
    if [ -f "$bedgraph" ]; then
        filename=$(basename "$bedgraph")
        sample_name="${filename%.bedgraph}"
        echo "处理样本: $sample_name"
        
        # Convert to BigWig format; the .fai file supplies chromosome sizes
        bedGraphToBigWig "$bedgraph" data/female_chr_rename.fna.fai "output/bigwig_files/fpkm/${sample_name}.bw"
    fi
done

echo "转换完成!"

2 生成 BedGraph 文件

# 生成 BedGraph
python scripts/generate_bedgraph.py

2.1 BedGraph 转 BigWig

2.1.1 BedGraph转换为BigWig格式的原理和步骤:

BedGraph格式:

BedGraph是一种文本格式,每行包含4列:染色体名称、起始位置、结束位置、值
例如:Chr01 1000 2000 5.2
这个格式记录了基因组上每个区域的表达量值

BigWig格式:

BigWig是一种二进制格式,用于高效存储和查询基因组数据
它需要两个关键信息:
染色体大小信息(.fai文件):记录每条染色体的长度
BedGraph数据:记录具体的表达量值

转换过程:

BedGraph文件 -> 染色体大小文件 -> BigWig文件

  • 首先需要生成染色体大小文件(.fai)
  • 然后使用bedGraphToBigWig工具进行转换
  • 转换时会检查BedGraph中的染色体名称是否在.fai文件中存在
  • 同时检查坐标是否超出染色体长度

3 检查参考基因组索引

samtools faidx data/female_chr_rename.fna

4 运行转换脚本

bash scripts/convert_to_bigwig.sh

输出目录:output/bigwig_files/fpkm/
每个样本一个 .bw 文件

5 生成 JBrowse 轨道配置

python scripts/generate_track_config.py

输出文件:output/trackList.json

6 部署到 Drupal7 的 JBrowse

6.1 复制文件到 JBrowse 数据目录

假设 JBrowse 数据目录为 /var/www/html/sites/default/files/jbrowse/torreya_grandis_ba/data/

# 复制 BigWig 文件
cp -r output/bigwig_files /var/www/html/sites/default/files/jbrowse/torreya_grandis_ba/data/

# 复制 trackList.json
cp output/trackList.json /var/www/html/sites/default/files/jbrowse/torreya_grandis_ba/data/

6.2 设置权限

chown -R www-data:www-data /var/www/html/sites/default/files/jbrowse/
chmod -R 755 /var/www/html/sites/default/files/jbrowse/