转录组数据的处理——由于ncc版本id的更新

By chenyi, 6 December, 2024

Forums

一、需要的数据

转录组的数据、删除的id名称统计、替换的id名称

二、在92的wzz_python容器中进行的

三、步骤

1、在/home/replace_geneid目录下，先根据鹏哥给的abandoned.gene.id.txt文件，剔除转录组中所有已经被删掉的id。python代码为split_genes.py，运行命令为python split_genes.py。运行后会生成included_genes和excluded_genes两个文件夹：included_genes中保存的是被删掉的id所对应的行，excluded_genes中保存的是剔除这些id之后保留下来的转录组数据。

import os
import pandas as pd
# Default locations, all relative to the directory the script is run from.
ABANDONED_IDS_FILE = "abandoned.gene.id.txt"
FPKM_FOLDER = "FPKM.TG"
INCLUDED_FOLDER = "included_genes"
EXCLUDED_FOLDER = "excluded_genes"


def _load_abandoned_ids(path):
    """Return the set of IDs listed one per line in *path*.

    Surrounding whitespace is stripped and blank lines are ignored, so the
    empty string can never end up in the set as a bogus "ID".
    """
    with open(path, "r") as handle:
        stripped_lines = (line.strip() for line in handle)
        return {gene_id for gene_id in stripped_lines if gene_id}


def _split_csv(file_path, abandoned_ids, included_path, excluded_path):
    """Split one comma-separated file by membership of its ``gene_id`` column.

    Rows whose ``gene_id`` is in *abandoned_ids* are written to
    *included_path*; all remaining rows are written to *excluded_path*.

    Returns:
        False when the file has no ``gene_id`` column (nothing is written),
        True otherwise.
    """
    # The expression files are comma-separated.
    df = pd.read_csv(file_path, sep=",")
    if "gene_id" not in df.columns:
        return False

    # Compare on a stripped string view of the column: pandas parses an
    # all-numeric ID column as integers, which would never match the string
    # IDs read from the text file. The data that gets written is untouched.
    is_abandoned = df["gene_id"].astype(str).str.strip().isin(abandoned_ids)
    df[is_abandoned].to_csv(included_path, index=False)
    df[~is_abandoned].to_csv(excluded_path, index=False)
    return True


def main(
    abandoned_file=ABANDONED_IDS_FILE,
    fpkm_folder=FPKM_FOLDER,
    included_folder=INCLUDED_FOLDER,
    excluded_folder=EXCLUDED_FOLDER,
):
    """Split every ``*.csv`` in *fpkm_folder* into abandoned and retained rows.

    Note the folder naming: *included_folder* receives the rows whose
    ``gene_id`` IS in the abandoned list, while *excluded_folder* receives the
    rows that remain after the abandoned IDs are removed (the data to keep).

    Args:
        abandoned_file: Text file with one abandoned gene ID per line.
        fpkm_folder: Folder containing the input CSV files.
        included_folder: Output folder for rows with an abandoned ID.
        excluded_folder: Output folder for rows without an abandoned ID.

    A failure on one file is reported and does not stop the remaining files.
    """
    abandoned_ids = _load_abandoned_ids(abandoned_file)

    os.makedirs(included_folder, exist_ok=True)
    os.makedirs(excluded_folder, exist_ok=True)

    # Sorted so that files are processed (and messages printed) in a stable order.
    for file_name in sorted(os.listdir(fpkm_folder)):
        file_path = os.path.join(fpkm_folder, file_name)
        if not (os.path.isfile(file_path) and file_name.endswith(".csv")):
            continue
        try:
            has_gene_id = _split_csv(
                file_path,
                abandoned_ids,
                os.path.join(included_folder, file_name),
                os.path.join(excluded_folder, file_name),
            )
        except Exception as e:  # best effort: keep going with the other files
            print(f"处理文件 {file_name} 时出错：{e}")
            continue
        if not has_gene_id:
            print(f"文件 {file_name} 中未找到 'gene_id' 列，跳过。")

    print(f"分离完成：生成的结果保存在文件夹 {included_folder} 和 {excluded_folder} 中。")


if __name__ == "__main__":
    main()

2、excluded_genes文件夹中的所有文件已经剔除了被删除的id，再写一个python代码，根据Tgra_comparison.tsv对应表，用ncc版的id替换老版的id。python代码为replace_geneid.py。最后生成的output文件夹就是修改过后的转录组数据。

运行命令：

python replace_geneid.py /home/replace_geneid/Tgra_comparison.tsv /home/replace_geneid/excluded_genes/ /home/replace_geneid/output

代码如下

import csv
import os
import argparse
import sys
def _build_parser():
    """Build the command-line parser (three positional paths plus column/delimiter options)."""
    parser = argparse.ArgumentParser(description="Replace gene_id in the specified column of all CSV files in a folder with corresponding GeneID_GFF2 based on the Tgra_comparison.tsv file")
    parser.add_argument('comparison_file', help="Path to the Tgra_comparison.tsv file")
    parser.add_argument('input_folder', help="Path to the folder containing input CSV files")
    parser.add_argument('output_folder', help="Path to the folder for saving output CSV files")
    parser.add_argument('--input_delimiter', default=',', help="Delimiter for the input CSV files (default: ',')")
    parser.add_argument('--output_delimiter', default=',', help="Delimiter for the output CSV files (default: ',')")
    parser.add_argument('--key_column', type=int, default=0, help="Index of the column to replace in input CSV (default: 0)")
    parser.add_argument('--map_key_column', type=int, default=0, help="Index of the key column in comparison file (default: 0)")
    parser.add_argument('--map_value_column', type=int, default=1, help="Index of the value column in comparison file (default: 1)")
    return parser


def _load_id_mapping(comparison_file, key_column, value_column):
    """Read the tab-separated comparison file into an ``{old_id: new_id}`` dict.

    The first line is treated as a header and skipped. Keys and values are
    whitespace-stripped. Rows with too few columns are reported and skipped.
    If a key occurs more than once, the last occurrence wins.

    Raises:
        ValueError: if the file is completely empty.
        OSError: if the file cannot be opened or read.
    """
    mapping = {}
    # newline='' is what the csv module expects of files handed to it.
    with open(comparison_file, 'r', newline='') as handle:
        reader = csv.reader(handle, delimiter='\t')
        if next(reader, None) is None:
            raise ValueError("mapping file is empty")
        for row in reader:
            if len(row) > max(key_column, value_column):
                mapping[row[key_column].strip()] = row[value_column].strip()
            else:
                print(f"警告: 跳过无效行（列不足）: {row}")
    return mapping


def _replace_ids_in_file(input_path, output_path, mapping, key_column,
                         input_delimiter, output_delimiter):
    """Copy one delimited file, translating the ID in *key_column* via *mapping*.

    The header row is copied unchanged. An ID without a mapping entry is
    written back exactly as it was read. Rows too short to contain
    *key_column* are reported and omitted from the output.

    Returns:
        A ``(replaced, unmapped)`` tuple of row counts.

    Raises:
        ValueError: if the input file is completely empty.
    """
    filename = os.path.basename(input_path)
    replaced = 0
    unmapped = 0
    with open(input_path, 'r', newline='') as infile, \
            open(output_path, 'w', newline='') as outfile:
        reader = csv.reader(infile, delimiter=input_delimiter)
        writer = csv.writer(outfile, delimiter=output_delimiter)

        header = next(reader, None)
        if header is None:
            raise ValueError("input file is empty")
        writer.writerow(header)

        for row in reader:
            if len(row) <= key_column:
                print(f"警告: 文件 {filename} 中的行列不足，跳过行: {row}")
                continue
            original_id = row[key_column]
            # Mapping keys were stripped on load, so look up the stripped ID;
            # otherwise an ID padded with whitespace would silently stay
            # unreplaced.
            lookup_id = original_id.strip()
            if lookup_id in mapping:
                row[key_column] = mapping[lookup_id]
                replaced += 1
            else:
                unmapped += 1
            writer.writerow(row)
    return replaced, unmapped


def main(argv=None):
    """Replace gene IDs in every ``*.csv`` of the input folder.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Each input ``NAME.csv`` produces ``modified_NAME.csv`` in the output
    folder. Exits via ``sys.exit`` with a message when the mapping file or the
    input folder is missing, or when the mapping file cannot be read. A
    failure on a single input file is reported and the remaining files are
    still processed.
    """
    args = _build_parser().parse_args(argv)

    if not os.path.exists(args.comparison_file):
        sys.exit(f"错误: 找不到映射文件 {args.comparison_file}，请检查路径是否正确。")
    if not os.path.isdir(args.input_folder):
        sys.exit(f"错误: 输入文件夹 {args.input_folder} 不存在或不是文件夹，请检查路径是否正确。")
    os.makedirs(args.output_folder, exist_ok=True)

    print("开始读取对应关系文件...")
    try:
        gene_to_gff2 = _load_id_mapping(
            args.comparison_file, args.map_key_column, args.map_value_column
        )
    except Exception as e:
        sys.exit(f"错误: 无法读取映射文件 {args.comparison_file}。详情: {e}")
    print("对应关系文件读取完毕。")

    # Sorted so that files are processed (and messages printed) in a stable order.
    for filename in sorted(os.listdir(args.input_folder)):
        if not filename.endswith(".csv"):
            continue
        input_file_path = os.path.join(args.input_folder, filename)
        output_file_path = os.path.join(args.output_folder, f"modified_{filename}")

        print(f"正在处理文件: {filename}")
        try:
            replaced, unmapped = _replace_ids_in_file(
                input_file_path,
                output_file_path,
                gene_to_gff2,
                args.key_column,
                args.input_delimiter,
                args.output_delimiter,
            )
        except Exception as e:  # best effort: keep going with the other files
            print(f"错误: 无法处理文件 {filename}。详情: {e}")
        else:
            print(f"文件处理完成，生成文件: {output_file_path}")
            # Surface leftovers so old IDs that slipped through are noticed
            # without having to grep the output afterwards.
            print(f"已替换 {replaced} 个 ID，{unmapped} 个 ID 在对应表中未找到（保持原样）。")

    print("所有文件处理完成。")


if __name__ == "__main__":
    main()

root@aee56b84d678:/home/replace_geneid# ls
FPKM.TG             abandoned.gene.id.txt  included_genes  replace_geneid.py
Tgra_comparison.tsv  excluded_genes        output        split_genes.py
root@aee56b84d678:/home/replace_geneid# grep -r "evm" output
root@aee56b84d678:/home/replace_geneid#