一、需要的数据
转录组数据(FPKM.TG 文件夹中的 CSV 文件)、已删除的基因 id 列表(abandoned.gene.id.txt)、新旧 id 对应表(Tgra_comparison.tsv)
二、运行环境:在92的wzz_python容器中运行
三、步骤
1、在/home/replace_geneid目录下,先根据鹏哥给的abandoned.gene.id.txt文件,剔除转录组数据中所有已经被删掉的id。python代码为split_genes.py,运行命令:python split_genes.py。运行后生成included_genes和excluded_genes两个文件夹:included_genes中是id已被删除的行,excluded_genes中是剔除这些id之后保留下来的行。
import os
import pandas as pd
# split_genes.py
#
# Split every FPKM table in FPKM.TG into two CSV files with the same name:
#   included_genes/<file>  rows whose gene_id IS listed in abandoned.gene.id.txt
#   excluded_genes/<file>  rows whose gene_id is NOT listed (the data to keep)
# All paths are relative to the current working directory.
ABANDONED_IDS_FILE = "abandoned.gene.id.txt"
FPKM_FOLDER = "FPKM.TG"
INCLUDED_FOLDER = "included_genes"
EXCLUDED_FOLDER = "excluded_genes"


def _load_abandoned_ids(path):
    """Return the set of IDs listed one per line in *path*.

    Surrounding whitespace is stripped and blank lines are ignored, so an
    empty string never ends up in the set.
    """
    with open(path, "r", encoding="utf-8") as handle:
        stripped = (line.strip() for line in handle)
        return {gene_id for gene_id in stripped if gene_id}


def _split_file(file_name, abandoned_ids):
    """Split one comma-separated table from FPKM_FOLDER by abandoned IDs.

    Returns True when both output files were written, and False when the
    table has no ``gene_id`` column and was skipped.
    """
    source_path = os.path.join(FPKM_FOLDER, file_name)

    # Look at the header only, so a table without gene_id is skipped before
    # the full file is parsed.
    header = pd.read_csv(source_path, sep=",", nrows=0)
    if "gene_id" not in header.columns:
        print(f"文件 {file_name} 中未找到 'gene_id' 列,跳过。")
        return False

    # Read gene_id as text: IDs are then compared and written back exactly as
    # they appear in the file instead of being coerced to numbers.
    df = pd.read_csv(source_path, sep=",", dtype={"gene_id": str})

    # Missing IDs become NaN here and are therefore never treated as abandoned.
    is_abandoned = df["gene_id"].str.strip().isin(abandoned_ids)

    df[is_abandoned].to_csv(os.path.join(INCLUDED_FOLDER, file_name), index=False)
    df[~is_abandoned].to_csv(os.path.join(EXCLUDED_FOLDER, file_name), index=False)
    return True


def main():
    """Split all CSV files found directly inside FPKM_FOLDER."""
    abandoned_ids = _load_abandoned_ids(ABANDONED_IDS_FILE)

    os.makedirs(INCLUDED_FOLDER, exist_ok=True)
    os.makedirs(EXCLUDED_FOLDER, exist_ok=True)

    for file_name in sorted(os.listdir(FPKM_FOLDER)):
        file_path = os.path.join(FPKM_FOLDER, file_name)
        if not (os.path.isfile(file_path) and file_name.endswith(".csv")):
            continue
        try:
            _split_file(file_name, abandoned_ids)
        except (OSError, ValueError) as e:
            # pandas parser/empty-file errors and decoding errors are all
            # ValueError subclasses; one bad table must not stop the batch.
            print(f"处理文件 {file_name} 时出错:{e}")

    print(f"分离完成:生成的结果保存在文件夹 {INCLUDED_FOLDER} 和 {EXCLUDED_FOLDER} 中。")


if __name__ == "__main__":
    main()
2、excluded_genes文件夹中的所有文件已经剔除了被删除的id,再写一个python代码,根据Tgra_comparison.tsv对应表,用ncc版的id替换老版的id。python代码为replace_geneid.py。最后生成的output文件夹(其中的文件名带有modified_前缀)就是修改过后的转录组数据。
运行命令:
python replace_geneid.py /home/replace_geneid/Tgra_comparison.tsv /home/replace_geneid/excluded_genes/ /home/replace_geneid/output
代码如下
import csv
import os
import argparse
import sys
# replace_geneid.py
#
# For every *.csv file in the input folder, replace the ID in one column with
# the value it maps to in a tab-separated comparison table, and write the
# result to <output_folder>/modified_<name>. IDs without a mapping are kept.


def _delimiter(text):
    """argparse type for delimiter options.

    Accepts any single character, plus the spellings ``\\t`` and ``tab`` for a
    tab, because a shell passes ``'\\t'`` as two characters and the csv module
    only accepts a one-character delimiter.
    """
    value = {"\\t": "\t", "tab": "\t"}.get(text, text)
    if len(value) != 1:
        raise argparse.ArgumentTypeError(
            f"delimiter must be a single character, got {text!r}"
        )
    return value


def _build_parser():
    """Return the command-line parser (same arguments and defaults as before)."""
    parser = argparse.ArgumentParser(description="Replace gene_id in the specified column of all CSV files in a folder with corresponding GeneID_GFF2 based on the Tgra_comparison.tsv file")
    parser.add_argument('comparison_file', help="Path to the Tgra_comparison.tsv file")
    parser.add_argument('input_folder', help="Path to the folder containing input CSV files")
    parser.add_argument('output_folder', help="Path to the folder for saving output CSV files")
    parser.add_argument('--input_delimiter', type=_delimiter, default=',', help="Delimiter for the input CSV files (default: ',')")
    parser.add_argument('--output_delimiter', type=_delimiter, default=',', help="Delimiter for the output CSV files (default: ',')")
    parser.add_argument('--key_column', type=int, default=0, help="Index of the column to replace in input CSV (default: 0)")
    parser.add_argument('--map_key_column', type=int, default=0, help="Index of the key column in comparison file (default: 0)")
    parser.add_argument('--map_value_column', type=int, default=1, help="Index of the value column in comparison file (default: 1)")
    return parser


def _has_column(row, index):
    """Return True when *index* (positive or negative) is valid for *row*."""
    return -len(row) <= index < len(row)


def _load_mapping(path, key_column, value_column):
    """Build the old-ID -> new-ID dict from the tab-separated file at *path*.

    The first line is treated as a header and skipped. Keys and values are
    whitespace-stripped; rows lacking either column are reported and skipped.

    Raises:
        ValueError: if the file is empty (no header line).
        OSError, csv.Error: on read or parse failures.
    """
    mapping = {}
    # newline='' is what the csv module expects for files it reads.
    with open(path, 'r', newline='', encoding='utf-8') as handle:
        reader = csv.reader(handle, delimiter='\t')
        if next(reader, None) is None:
            raise ValueError("映射文件为空,缺少表头")
        for row in reader:
            if _has_column(row, key_column) and _has_column(row, value_column):
                mapping[row[key_column].strip()] = row[value_column].strip()
            else:
                print(f"警告: 跳过无效行(列不足): {row}")
    return mapping


def _replace_ids_in_file(filename, input_path, output_path, mapping,
                         key_column, input_delimiter, output_delimiter):
    """Copy one CSV file to *output_path*, replacing IDs in *key_column*.

    The header row is copied unchanged. Rows that do not have *key_column* are
    reported and dropped. Output is written to a temporary file next to
    *output_path* and renamed only on success, so a failure never leaves a
    truncated result behind.

    Raises:
        ValueError: if the input file is empty (no header line).
        OSError, csv.Error: on read, write or parse failures.
    """
    temp_path = output_path + ".tmp"
    try:
        # Open the input first so a missing input never creates an output.
        with open(input_path, 'r', newline='', encoding='utf-8') as infile:
            with open(temp_path, 'w', newline='', encoding='utf-8') as outfile:
                reader = csv.reader(infile, delimiter=input_delimiter)
                writer = csv.writer(outfile, delimiter=output_delimiter)

                header = next(reader, None)
                if header is None:
                    raise ValueError("文件为空,缺少表头")
                writer.writerow(header)

                for row in reader:
                    if not _has_column(row, key_column):
                        print(f"警告: 文件 {filename} 中的行列不足,跳过行: {row}")
                        continue
                    original_id = row[key_column]
                    # Mapping keys are stripped, so look up the stripped ID;
                    # an unmapped ID is written back untouched.
                    row[key_column] = mapping.get(original_id.strip(), original_id)
                    writer.writerow(row)
        os.replace(temp_path, output_path)
    finally:
        # Still present only if something failed before the rename.
        if os.path.exists(temp_path):
            os.remove(temp_path)


def main(argv=None):
    """Run the replacement for every ``.csv`` file in the input folder.

    Args:
        argv: argument list to parse; ``None`` means ``sys.argv[1:]``.

    Exits with an error message when the comparison file or input folder is
    missing, or when the comparison file cannot be read. A failure on a single
    input file is reported and the remaining files are still processed.
    """
    args = _build_parser().parse_args(argv)

    if not os.path.exists(args.comparison_file):
        sys.exit(f"错误: 找不到映射文件 {args.comparison_file},请检查路径是否正确。")
    if not os.path.isdir(args.input_folder):
        sys.exit(f"错误: 输入文件夹 {args.input_folder} 不存在或不是文件夹,请检查路径是否正确。")
    os.makedirs(args.output_folder, exist_ok=True)

    print("开始读取对应关系文件...")
    try:
        gene_to_gff2 = _load_mapping(
            args.comparison_file, args.map_key_column, args.map_value_column
        )
    except (OSError, csv.Error, ValueError) as e:
        sys.exit(f"错误: 无法读取映射文件 {args.comparison_file}。详情: {e}")
    print("对应关系文件读取完毕。")

    for filename in sorted(os.listdir(args.input_folder)):
        if not filename.endswith(".csv"):
            continue
        input_file_path = os.path.join(args.input_folder, filename)
        output_file_path = os.path.join(args.output_folder, f"modified_{filename}")
        print(f"正在处理文件: {filename}")
        try:
            _replace_ids_in_file(
                filename,
                input_file_path,
                output_file_path,
                gene_to_gff2,
                args.key_column,
                args.input_delimiter,
                args.output_delimiter,
            )
        except (OSError, csv.Error, ValueError) as e:
            # Decoding errors are ValueError subclasses; keep going with the
            # remaining files.
            print(f"错误: 无法处理文件 {filename}。详情: {e}")
        else:
            print(f"文件处理完成,生成文件: {output_file_path}")

    print("所有文件处理完成。")


if __name__ == "__main__":
    main()
root@aee56b84d678:/home/replace_geneid# ls
FPKM.TG abandoned.gene.id.txt included_genes replace_geneid.py
Tgra_comparison.tsv excluded_genes output split_genes.py
root@aee56b84d678:/home/replace_geneid# grep -r "evm" output
root@aee56b84d678:/home/replace_geneid#