cafe5软件应用-计算基因家族扩张与收缩

By Tingting, 25 December, 2025

Forums

文件共需要2个：1是orthofinder的结果文件Orthogroups.GeneCount.csv；另一个就是超度量树。

一、软件安装

#使用ubuntu_conda镜像
conda create -n cafe5_env python=3.8
conda activate cafe5_env
conda install -c conda-forge mamba
mamba install cafe
cafe5 --help

二、过滤orthofinder的结果文件

#转换文件格式
perl -F'\t' -lane 'if($.==1){$flag=qq{Desc}}else{$flag=null}pop @F;print join qq{\t},$flag,@F;' Orthogroups.GeneCount.csv > gene.counts.tab
#awk命令来筛除3-38列中基因家族数量大于等于100或200的行，可以根据自己的数据更改。
awk 'NR==1 || $3<100 && $4<100 && $5<100 && $6<100 && $7<100 && $8<100 && $9<100 && $10<100 && $11<100 && $12<100 && $13<100  && $14<100 && $15<100 && $16<100 && $17<100  && $18<100 && $19<100 && $20<100  && $21<100  && $22<100 && $22<100 && $23<100 && $24<100 && $25<100 && $26<100 && $27<100 && $28<100 && $29<100 && $30<100 && $31<100 && $32<100 && $33<100 && $34<100 && $35<100 && $36<100 && $37<100 && $38<100 {print $0}' gene.counts.tab >gene.counts.tab_filter.txt
过滤之后的文件：这里是过滤掉大于100的

三、构建超度树

#目录Single_Copy_Orthologue_Sequences
#将单拷贝基因的序列文件放到这个目录下
conda create -n mafft
apt-get install mafft #版本v7.490 (2021/Oct/30)
mafft --maxiterate 1000 --localpair OG0011250.fa > OG0011250.fa.aln
conda create -n gblocks_env
conda activate gblocks_env
conda config --add channels conda-forge
conda config --add channels bioconda
conda install gblocks
conda list --full-name gblock
Gblocks OG0011238.fa.aln -b4=5 -b5=h -t=p -e=.gb
seqkit seq OG0011238.fa.aln.gb -w 0 > OG0011238.new.aln
#提取
awk '{if (NR%2==1) print substr($1,1,4); else print $0}' OG0011238.new.aln > OG0011238.final.aln
#串联所有结果
seqkit concat *.final.aln > all.fa
#准备all.tre文件，就是在普通树文件中添加分化时间
#获取分化时间
，数据库TimeTree（http://www.timetree.org/）
#使用mcmctree分析
#对mcmctree.ctl根据情况进行修改
vi mcmctree.ctl
#脚本
seed = -1
seqfile = all.fa
treefile = all.tre
outfile = out
# -----------------------------
# Step 1: Likelihood Calculation
# -----------------------------
ndata = 1 #

seqtype = 2  * 0: nucleotides; 1:codons; 2:AAs ##序列格式，蛋白就选择AA
usedata = 3    * 0: no data; 1:seq like; 2:use in.BV; 3: out.BV ##此步骤先设为3
clock = 3    * 1: global clock; 2: independent rates; 3: correlated rates
RootAge = <336.8  * * 根节点最大时间（若无化石约束则必须设置）
model = 0                * 0=Poisson (蛋白用这个即可)
alpha = 0                * 形状参数alpha（0=自动估计）
ncatG = 5                * 伽马分布分类数 (建议 5)
cleandata = 0    * remove sites with ambiguity data (1:yes, 0:no)?
# -----------------------------
# Birth–death process priors
# -----------------------------
BDparas = 1 1 0 C   * birth, death, sampling  + C/M (条件或乘法)
kappa_gamma = 6 2      * gamma prior for kappa
alpha_gamma = 1 1      * gamma prior for alpha
rgene_gamma = 2 2   * gamma prior for overall rates for genes
sigma2_gamma = 1 10    * gamma prior for sigma^2     (for clock=2 or 3)
# -----------------------------
# MCMC settings
# -----------------------------
finetune = 1: 0.1  0.1  0.1  0.01 .5  * auto (0 or 1) : times, musigma2, rates, mixing, paras, FossilErr
print = 1
burnin = 20000
sampfreq = 2
nsample = 100000
#脚本结束
#再进行一次计算
cp out.BV in.BV
#修改配置文件
cp mcmctree.ctl mcmctree_2.ctl
sed -i 's/usedata = 3/usedata = 2/' mcmctree_2.ctl
#下面三个参数根据情况进行修改
sed -i 's/^burnin = .*/burnin = 800000/' mcmctree_2.ctl
sed -i 's/^sampfreq = .*/sampfreq = 40/' mcmctree_2.ctl
sed -i 's/^nsample = .*/nsample = 6500000/' mcmctree_2.ctl
#随后运行，会生成Figtree
mcmctree mcmctree_2.ctl
#修改树
grep  "UTREE 1 =" FigTree.tre | sed -E -e "s/\[[^]]*\]//g" -e "s/[ \t]//g" -e "/^$/d" -e "s/UTREE1=//" > tree.txt
#路径
/home/genome_data/protein/Results_Nov08_1/Orthologues_Nov15/Single_Copy_Orthologue_Sequences/tree.txt
#cafe5计算
#基础命令
cafe5 -i gene_family_filter.txt -t /home/genome_data/protein/Results_Nov08_1/Orthologues_Nov15/Single_Copy_Orthologue_Sequences/tree.txt -o out -c 16
#使用GAMMA模型，注意两个文件的物种名称要一致
cafe5 -i gene.counts.tab_filter.txt -t /home/genome_data/protein/Results_Nov08_1/Orthologues_Nov15/Single_Copy_Orthologue_Sequences/tree.txt -o out -c 16 -k 2 -p