金山文档链接
董老师的原始脚本,我硬编码了其中的一部分参数
(fst_plot) root@4632f56a8ada:/project1/data# cat fst_pi
#!/bin/bash
clean_vcf=$1
prefix=$2
window=10000000
step=500000
RES=/project1/data
female_pop=/project1/data/female_pop_list
male_pop=/project1/data/male_pop_list
# NC那篇文章上的参考基因组
FAI=/project1/data/Tgra.chr.fa.fai
fst=${RES}/${prefix}_${window}_${step}_fst
fst_manhattan_plot=/project1/data/fst_manhattan_plot.r
vcftools --gzvcf $clean_vcf --weir-fst-pop $female_pop --weir-fst-pop $male_pop --out $fst --fst-window-size $window --fst-window-step $step
fst_manhattan=${fst}.manhattan.fst
windowed_weir_fst=${fst}.windowed.weir.fst
fst_smooth=${fst}.smooth.fst
## 计算pi
pi_male=${RES}/${prefix}_male_${window}_${step}_pi
vcftools --gzvcf $clean_vcf --window-pi $window --window-pi-step $step --keep $male_pop --out $pi_male
pi_female=${RES}/${prefix}_female_${window}_${step}_pi
vcftools --gzvcf $clean_vcf --window-pi $window --window-pi-step $step --keep $female_pop --out $pi_female虚线代表Top 1%的点
绘图脚本也是董川老师的,实际上从韩之刚老师那边薅来的脚本。
10号染色体上张仁纲注释的着丝丽位置,具体是在967MB的位置
目前10号染色体上的fst的高信号区间离这个区间已经非常接近了。
(fst_plot) root@4632f56a8ada:/project1/data# awk 'NR==1 || $5 > 0.30' fst_11_23_10000000_500000_fst.windowed.weir.fst
CHROM BIN_START BIN_END N_VARIANTS WEIGHTED_FST MEAN_FST
Chr10 825000001 835000000 181311 0.301281 0.125553
Chr10 825500001 835500000 181039 0.30208 0.126121
Chr10 829000001 839000000 191597 0.305019 0.128358
Chr10 829500001 839500000 190923 0.32746 0.13365
Chr10 830000001 840000000 192220 0.336751 0.136287
Chr10 830500001 840500000 191772 0.342324 0.139134
Chr10 831000001 841000000 190001 0.357229 0.145652
Chr10 831500001 841500000 191684 0.356205 0.14549
Chr10 832000001 842000000 190310 0.35799 0.146519
Chr10 832500001 842500000 188494 0.359426 0.14702
Chr10 833000001 843000000 190522 0.358523 0.146878
Chr10 833500001 843500000 198695 0.348364 0.142337
Chr10 907500001 917500000 188814 0.303194 0.134047
Chr10 908000001 918000000 188023 0.30784 0.134735
Chr10 908500001 918500000 177218 0.325972 0.141704
Chr10 909000001 919000000 172557 0.330306 0.143429
Chr10 909500001 919500000 170225 0.330501 0.143302
Chr10 910000001 920000000 167175 0.331445 0.143473
Chr10 910500001 920500000 161173 0.339927 0.146942
Chr10 911000001 921000000 166810 0.326099 0.141194
Chr10 911500001 921500000 178310 0.303117 0.132816
Chr10 913000001 923000000 165485 0.315415 0.135838
Chr10 913500001 923500000 166015 0.312369 0.134471
Chr10 914000001 924000000 164948 0.300647 0.129295
Chr5 339500001 349500000 359338 0.303806 0.225534
Chr5 340000001 350000000 372311 0.30161 0.223115
Chr5 340500001 350500000 376857 0.305078 0.225052
Chr5 341000001 351000000 372535 0.309838 0.226312
Chr5 341500001 351500000 370578 0.313484 0.230478
Chr5 342000001 352000000 367784 0.32019 0.234927
Chr5 342500001 352500000 353695 0.328046 0.236537
Chr5 343000001 353000000 335937 0.344081 0.242521
Chr5 343500001 353500000 321851 0.349278 0.244561
Chr5 344000001 354000000 312717 0.349583 0.242606
Chr5 344500001 354500000 306113 0.346459 0.238858
Chr5 345000001 355000000 300994 0.344326 0.236792
Chr5 345500001 355500000 296987 0.337126 0.230382
Chr5 346000001 356000000 290487 0.326257 0.224837
Chr5 346500001 356500000 280104 0.32328 0.22359但是目前还不足以下结论,必须要结合这个区域的Pi值才行
至于说5号染色体上的位置,似乎向右偏移了,这是为什么呢?好奇怪啊。
fst小区间是296.5MB-306MB,目前应该是向右偏移了40MB。
让子弹飞一会,结合Pi值吧
Pi 图 也已经出来了。
在Male的中间位置有一个巨大的收缩
Pi计算这个东西,我发现董老师的代码绘图结果有问题,最终纵坐标显示成了绝对数(2e+5,4e+5,6e+5),应该是读列出问题了,我在这个基础上再改一下,当然是AI改的:
是的,这个程序统计的是N_VARIANTS
#!/usr/bin/env Rscript
library(argparse)
library(ggplot2)
parser <- ArgumentParser(description = "Plot genome-wide Pi (nucleotide diversity)")
parser$add_argument(
"-i","--input", required=TRUE, type="character",
help="windowed-pi input file (must contain CHROM, BIN_START, BIN_END, PI)"
)
parser$add_argument(
"-F","--fai", required=TRUE, type="character",
help="reference genome .fai file (for chr order and length)"
)
parser$add_argument(
"-o","--outdir", default=getwd(), type="character",
help="output directory"
)
parser$add_argument(
"-p","--prefix", default="pi_plot", type="character",
help="output file prefix"
)
parser$add_argument(
"-b","--blank", default=0.005, type="double",
help="gap ratio between chromosomes"
)
parser$add_argument(
"--smooth", action="store_true",
help="use geom_line instead of points (useful for Pi trends)"
)
parser$add_argument(
"--threshold", type="double", default=NULL,
help="draw a horizontal line at this Pi value (optional)"
)
parser$add_argument(
"--point_size", type="double", default=0.6,
help="point size (used when not smoothing)"
)
parser$add_argument(
"--line_size", type="double", default=0.3,
help="line width (when --smooth)"
)
parser$add_argument(
"--ylim_max", type="double", default=NULL,
help="set fixed y-axis max (optional)"
)
args <- parser$parse_args()
# --------------------------------------------------------
# 1. 读取数据
# --------------------------------------------------------
raw <- read.table(args$input, header=TRUE, comment.char="")
# 自动处理常见结构:CHROM START END ... PI
if (ncol(raw) >= 6) {
colnames(raw)[1:6] <- c("chr","start","end","n_var","n_mono","pi")
} else {
colnames(raw)[1:4] <- c("chr","start","end","pi")
}
raw$pi[raw$pi < 0] <- 0
# 增加窗口中点位置
raw$mid <- (raw$start + raw$end) / 2
# --------------------------------------------------------
# 2. 读取 fai 确定染色体顺序与长度
# --------------------------------------------------------
fai <- read.table(args$fai, header=FALSE)
colnames(fai) <- c("chr","length","a","b","c")
fai <- fai[,c("chr","length")]
raw <- raw[raw$chr %in% fai$chr,]
# --------------------------------------------------------
# 3. 构建 chr offset
# --------------------------------------------------------
chrlen <- fai$length
names(chrlen) <- fai$chr
blank <- args$blank * sum(chrlen)
chrpos <- cumsum(c(0, head(chrlen + blank, -1)))
names(chrpos) <- fai$chr
raw$pos <- raw$mid + chrpos[raw$chr]
# --------------------------------------------------------
# 4. 设置 chr 因子
# --------------------------------------------------------
raw$chr <- factor(raw$chr, levels=fai$chr, ordered=TRUE)
# --------------------------------------------------------
# 5. 绘图
# --------------------------------------------------------
p <- ggplot(raw, aes(x=pos, y=pi, color=chr))
if (args$smooth) {
p <- p + geom_line(size=args$line_size)
} else {
p <- p + geom_point(size=args$point_size)
}
# 色板循环使用
default_colors <- c("#4197d8", "#f8c120", "#d60b6f",
"#4abb6b", "#e66519", "#413496")
p <- p + scale_color_manual(values=rep(default_colors, 100))
# --------------------------------------------------------
# 6. X轴(染色体中点)
# --------------------------------------------------------
xmid <- tapply(raw$pos, raw$chr, function(x) mean(range(x)))
xmid <- unlist(xmid)
p <- p +
scale_x_continuous(
breaks = xmid,
labels = names(xmid),
expand = c(0.001, 0)
) +
labs(x="Chromosome", y="Pi (nucleotide diversity)")
# --------------------------------------------------------
# 7. 添加阈值线
# --------------------------------------------------------
if (!is.null(args$threshold)) {
p <- p + geom_hline(
yintercept=args$threshold,
linetype=2,
size=0.3
)
}
# --------------------------------------------------------
# 8. y 轴范围
# --------------------------------------------------------
if (!is.null(args$ylim_max)) {
p <- p + ylim(0, args$ylim_max)
}
# --------------------------------------------------------
# 9. 样式优化
# --------------------------------------------------------
p <- p +
theme_classic() +
theme(
legend.position = "none",
axis.text.x = element_text(angle=65, hjust=1, vjust=1)
)
# --------------------------------------------------------
# 10. 输出 PDF
# --------------------------------------------------------
out_pdf <- file.path(args$outdir, paste0(args$prefix, "_pi_plot.pdf"))
pdf(out_pdf, width=11, height=4)
print(p)
dev.off()
cat("Pi plot saved to: ", out_pdf, "\n")GWAS
gwas的脚本我是分两边准备的,一边是服务器,一边是工作站,首先是服务器端的脚本
(fst_plot) root@4632f56a8ada:/project1/data# cat gwas_script.R
#!/usr/bin/env Rscript
# =========================
# GWAS 脚本
# 输入参数:
# 1. VCF 文件 (.vcf.gz)
# 2. 表型文件 (两列:样本ID \t 表型值)
# =========================
args <- commandArgs(trailingOnly = TRUE)
if(length(args) != 2){
stop("Usage: Rscript gwas_script.R <genotypes.vcf.gz> <phenotype.txt>")
}
vcf_file <- args[1]
pheno_file <- args[2]
# =========================
# 1. 读取表型文件
# =========================
phenotype <- read.table(pheno_file, header=FALSE, stringsAsFactors=FALSE)
colnames(phenotype) <- c("sample", "pheno")
# =========================
# 2. 读取 VCF 并转换为 GDS 文件
# =========================
vcf <- vcfR::read.vcfR(vcf_file)
# 转换为 SNPRelate GDS 格式
temp_gds <- tempfile(fileext = ".gds")
SNPRelate::vcfR2gds(vcf, temp_gds, verbose = FALSE)
genofile <- SNPRelate::snpgdsOpen(temp_gds)
# =========================
# 3. 确保样本顺序一致
# =========================
geno_samples <- read.gdsn(index.gdsn(genofile, "sample.id"))
phenotype <- phenotype[match(geno_samples, phenotype$sample), ]
# =========================
# 4. GWAS 分析(单标记线性模型)
# =========================
# 输出 GWAS 结果文件
gwas_outfile <- "gwas_results.txt"
assoc <- snpgdsAssocTest(
genofile,
phenotype = phenotype$pheno,
snp.id = read.gdsn(index.gdsn(genofile, "snp.id")),
method = "linear"
)
# 保存结果
write.table(assoc, gwas_outfile, quote=FALSE, sep="\t", row.names=FALSE)
# =========================
# 5. 绘制 Manhattan plot
# =========================
# 准备数据
manhattan_data <- data.frame(
SNP = assoc$snp.id,
CHR = assoc$chromosome,
BP = assoc$position,
P = assoc$p.value
)
# 绘制
pdf("manhattan_plot.pdf", width=10, height=5)
qqman::manhattan(manhattan_data, genomewideline=-log10(5e-8),
suggestiveline=FALSE, main="GWAS Manhattan Plot")
dev.off()
# =========================
# 关闭 GDS 文件
# =========================
snpgdsClose(genofile)
cat("GWAS 分析完成!结果保存在 gwas_results.txt,曼哈顿图保存在 manhattan_plot.pdf\n")但是内存开销太大了,服务器似乎顶不住。
SNPs总数达到了676MB
工作站端跑通了一条染色体:Chr10
工作站的程序,目前放的是完整版本
### 1. VCF → PLINK BED
echo "[1/3] 生成 PLINK 二进制文件 (.bed/.bim/.fam)..."
plink --vcf "$VCF_FILE" \
--make-bed \
--out "$BED_PREFIX" \
--double-id \
--allow-no-sex \
--pheno "$PHENO_FILE"
### 2. 生成 kinship
KINSHIP_FILE="$OUT_DIR/GEMMA_results/${CHR}_kinship.cXX.txt"
echo "[2/3] 生成 kinship 相关性矩阵..."
gemma -bfile "$BED_PREFIX" -gk 1 -o "${CHR}_kinship"
mv output/${CHR}_kinship.cXX.txt "$KINSHIP_FILE"
### 3. GEMMA GWAS
echo "[3/3] 运行 GEMMA GLM 和 MLM GWAS..."
# GLM
GLM_FILE="$OUT_DIR/GEMMA_results/${CHR}_glm.assoc.txt"
gemma -bfile "$BED_PREFIX" -lm 1 -o "${CHR}_glm"
mv output/${CHR}_glm.assoc.txt "$GLM_FILE"
# MLM
MLM_FILE="$OUT_DIR/GEMMA_results/${CHR}_mlm.assoc.txt"
gemma -bfile "$BED_PREFIX" -k "$KINSHIP_FILE" -lmm 1 -o "${CHR}_mlm"
mv output/${CHR}_mlm.assoc.txt "$MLM_FILE"
### ============= R 绘图部分(Bash 内嵌) ================
echo ""
echo "===== 绘制曼哈顿图和 QQ 图 ====="
plot_file() {
local assoc_file=$1
local prefix=$2
local manhattan_png="$OUT_DIR/plots/${prefix}_manhattan.png"
local qq_png="$OUT_DIR/plots/${prefix}_qq.png"
# 如果绘图已经完成,则跳过
if [[ -f "$manhattan_png" && -f "$qq_png" ]]; then
echo "[✔] R 绘图文件已存在,跳过绘图: $prefix"
return
fi
Rscript - <<EOF
library(data.table)
library(qqman)
# 读取 GEMMA 输出
res <- fread("$assoc_file")
# 识别 P 值列
p_col <- if ("p_wald" %in% colnames(res)) "p_wald" else if ("P" %in% colnames(res)) "P" else stop("无法识别 P 列")
chr_col <- if ("chr" %in% colnames(res)) "chr" else "CHR"
bp_col <- if ("ps" %in% colnames(res)) "ps" else "BP"
snp_col <- if ("rs" %in% colnames(res)) "rs" else "SNP"
# 删除 P 值为 NA 或 <=0 的行
res <- res[!is.na(get(p_col)) & get(p_col) > 0]
res[[chr_col]] <- as.integer(res[[chr_col]])
# 曼哈顿图
png("$manhattan_png", width=2000, height=1200, res=150)
manhattan(res, chr=chr_col, bp=bp_col, p=p_col, snp=snp_col,
col=c("#3399ff","#99cc33","#ffcc00"),
cex=0.5, cex.axis=0.7, cex.lab=0.8,
genomewideline=FALSE, suggestiveline=FALSE)
dev.off()
# QQ 图
png("$qq_png", width=1200, height=1200, res=150)
qq(res[[p_col]], tck=-0.02, col="skyblue", main="$prefix QQ Plot")
dev.off()
EOF
echo "[✔] R 绘图完成: $prefix"
}
# 绘制 GLM 和 MLM
plot_file "$GLM_FILE" "${CHR}_glm"
plot_file "$MLM_FILE" "${CHR}_mlm"
echo ""
echo "===== 🎉 单染色体 GWAS 分析完成! ====="
echo "PLINK + GEMMA 结果在: $OUT_DIR/GEMMA_results/"
echo "曼哈顿图 & QQ 图在: $OUT_DIR/plots/"
echo "============================================="这是MLM的模型结果,显然它过早地上抬了,需要跑完全部染色体的结果才知道到底是怎么一回事。
既然如此,让子弹飞一会,我要去睡觉了。
又跑了两条染色体,仍然是这样的趋势,所以这个模型的适配不太行。
0.1的阈值得到的Fst结果
从趋势来看,还算不错
这是Pi的结果,上面是female,下面是male,5号染色体的核苷酸多样性还是有差别的,但是除了五号以外,其余的染色体也有差别,所以这个结果目前不支持五号是性染色体。
基于EXON的GWAS分析
先把阿鹏的注释给更正了,我们vcf.gz文件里的染色体命名是Chr1,而李俞鹏的注释命名是Chr01
sed -E 's/Chr0?([1-9])/Chr\1/' female_ba_last.gff3 > female_ba_last.fix.gff3
awk '$3=="exon"{print $1"\t"$4-1"\t"$5"\t"$9}' female_ba_last.fix.gff3 > exons.raw.bed
sort -k1,1 -k2,2n exons.raw.bed | \
cut -f1-3 | \
bedtools merge -i - > exons.merged.bed
#需要先对vcf.gz文件构建索引哈这样来看,好像10号更强一点,但是强的有限。
好像离专利的那三个SNP比较近。
全外显子的GWAS结果,SNP的很少,但是这个qq plot图的质量不错,比之前的乱七八糟的要强多了。
基于基因的GWAS分析
awk '$3=="gene"{print $1"\t"$4-1"\t"$5"\t"$9}' female_ba_last.fix.gff3 > exons.raw.bed
sort -k1,1 -k2,2n exons.raw.bed | \
cut -f1-3 | \
bedtools merge -i - > genes.merged.bed
bcftools view -R genes.merged.bed -Oz -o 11_27_merged_exon_vcf.gz 11_27_merged_vcf.gz目前来看,基于外显子的GWAS分析的确是没有强烈的信号的。
好了,基因的提取已经挂上了。