By Liu Qirui, 26 January, 2026
Forums

EMMAX能够高效的进行大样本、大基因组的GWAS,速度快,节约内存

安装

wget http://csg.sph.umich.edu/kang/emmax/download/emmax-beta-07Mar2010.tar.gz
gunzip emmax-beta-07Mar2010.tar.gz
tar xvf emmax-beta-07Mar2010.tar
cd emmax-beta-07Mar2010
#设置为全局环境变量
export PATH="$PATH:/root/home/liuqirui/Tools/emmax-beta-07Mar2010"

数据准备

基因型数据
combine_all.pass.recode.bed
combine_all.pass.recode.bim
combine_all.pass.recode.fam
需要转为tfam格式
plink --bfile combine_all.pass.recode --threads 60 \
     --recode 12 --output-missing-genotype 0 \
     --allow-extra-chr --allow-no-sex --keep-allele-order \
     --transpose \
     --out combine_all.pass.recode.plink
awk '{print $1, $2}' combine_all.pass.recode.plink.tfam > sample_order.txt
 
表型数据
260sample_styleL.txt
文件数据结构
H-R100 39
H-R101 42
H-R102 37.46
确保顺序与基因型数据完全一致
awk '{print $1, $1, $2}' 260sample_styleL.txt > 260sample_styleL_emmax
sed -i '1s/^\xef\xbb\xbf//' 260sample_styleL_emmax
awk 'NR==FNR{pheno[$1]=$3; next} ($1 in pheno){print $1, $1, pheno[$1]}' 260sample_styleL_emmax sample_order.txt > 260sample_styleL_emmax_fixed

协变量文件
PCA(群体关系)
基于 PCA.clear.result.eigenvec 进行处理
awk '{print $1, $2, "1", $3, $4, $5}' PCA.clear.result.eigenvec > pca_emmax_cov.txt
KINSHIP矩阵(亲缘关系)
使用EMMAX直接计算得到BN矩阵
emmax-kin -v -d 10 combine_all.pass.recode.plink

GWAS by EMMAX
emmax -v -d 10 \
     -t combine_all.pass.recode.plink \
     -p 260sample_styleL_emmax_fixed \
     -k combine_all.pass.recode.plink.BN.kinf \
     -c pca_emmax_cov.txt \
     -o result_styleL_emmax_gwas
结果数据位于 result_styleL_emmax_gwas.ps 文件中

可视化(R)
library('qqman')
#读取数据
emmax_results <- read.table("result_styleL_emmax_gwas.ps", header =FALSE)
#读取基因型文件
tped <- read.table("combine_all.pass.recode.plink.tped", header = FALSE, colClasses = c("character", "character", "NULL", "numeric"))
#对照补充染色体及位点信息
emmax_results <- data.frame(SNP = paste0("SNP_", 1:nrow(emmax_results)), CHR = as.numeric(gsub("[^0-9]", "", tped$V1)), BP  = tped$V4, P   = emmax_results$V3)
#结果数据过滤
emmax_results <- na.omit(emmax_results)
emmax_results <- emmax_results[emmax_results$P > 0 & emmax_results$P <= 1, ]
emmax_results <- emmax_results[order(emmax_results$CHR, emmax_results$BP), ]
emmax_filtered <- emmax_results[emmax_results$CHR %in% 1:13, ]
#设置颜色和绘制参数
mycol1 <- c("#1F78B4", "#33A02C", "#E31A1C", "#FF7F00", "#6A3D9A", "#A6CEE3", "#B2DF8A", "#FB9A99", "#FDBF6F", "#CAB2D6", "#FFFF99", "#B15928", "#8DD3C7")
mycol2 <-c("grey","skyblue")
par(mgp =c(1.5,0.5,0), mar =c(4,4,2,2))
#Bonferroni校正选择
n_snps <- 5328800
bonferroni_thresh <- 0.05 / n_snps
#Lambda-GC
p_values <- na.omit(emmax_filtered$P)
chisq <- qchisq(1 - p_values, 1)
lambda_gc <- median(chisq) / qchisq(0.5, 1)
print(paste("Lambda-GC:", round(lambda_gc, 4)))
#曼哈顿图
png("manhattan_plot_SL.png", width =2000, height =1200, res =150)
manhattan(emmax_filtered, 
         chr = "CHR", bp = "BP", p = "P", snp = "SNP", 
         col = mycol1, 
         cex = 0.5, cex.axis = 0.7, cex.lab = 0.8, 
         genomewideline = -log10(5e-8), 
         suggestiveline = FALSE,
         main = "EMMAX GWAS Manhattan Plot")
abline(h =c(8,10), col = mycol2[1:2], lty =2, lwd =1.5)
dev.off()
#QQ-plot图
png("qq_plot_SL.png", width =1200, height =1200, res =150)
qq(emmax_filtered$P, tck =-0.02, col ="skyblue")
dev.off()