By jiangchenhao, 31 December, 2025
Forums

金山文档链接
董老师的原始脚本,我硬编码了其中的一部分参数

(fst_plot) root@4632f56a8ada:/project1/data# cat fst_pi
#!/bin/bash
clean_vcf=$1
prefix=$2
window=10000000
step=500000

RES=/project1/data
female_pop=/project1/data/female_pop_list
male_pop=/project1/data/male_pop_list
# NC那篇文章上的参考基因组
FAI=/project1/data/Tgra.chr.fa.fai
fst=${RES}/${prefix}_${window}_${step}_fst

fst_manhattan_plot=/project1/data/fst_manhattan_plot.r

vcftools --gzvcf $clean_vcf --weir-fst-pop $female_pop --weir-fst-pop $male_pop --out $fst --fst-window-size $window --fst-window-step $step

fst_manhattan=${fst}.manhattan.fst
windowed_weir_fst=${fst}.windowed.weir.fst
fst_smooth=${fst}.smooth.fst

## 计算pi
pi_male=${RES}/${prefix}_male_${window}_${step}_pi
vcftools --gzvcf $clean_vcf --window-pi $window --window-pi-step $step --keep $male_pop --out $pi_male 
pi_female=${RES}/${prefix}_female_${window}_${step}_pi
vcftools --gzvcf $clean_vcf --window-pi $window --window-pi-step $step --keep $female_pop --out $pi_female

虚线代表Top 1%的点

绘图脚本也是董川老师的,实际上从韩之刚老师那边薅来的脚本。

10号染色体上张仁纲注释的着丝丽位置,具体是在967MB的位置

目前10号染色体上的fst的高信号区间离这个区间已经非常接近了。

(fst_plot) root@4632f56a8ada:/project1/data# awk 'NR==1 || $5 > 0.30' fst_11_23_10000000_500000_fst.windowed.weir.fst
CHROM    BIN_START    BIN_END    N_VARIANTS    WEIGHTED_FST    MEAN_FST
Chr10    825000001    835000000    181311    0.301281    0.125553
Chr10    825500001    835500000    181039    0.30208    0.126121
Chr10    829000001    839000000    191597    0.305019    0.128358
Chr10    829500001    839500000    190923    0.32746    0.13365
Chr10    830000001    840000000    192220    0.336751    0.136287
Chr10    830500001    840500000    191772    0.342324    0.139134
Chr10    831000001    841000000    190001    0.357229    0.145652
Chr10    831500001    841500000    191684    0.356205    0.14549
Chr10    832000001    842000000    190310    0.35799    0.146519
Chr10    832500001    842500000    188494    0.359426    0.14702
Chr10    833000001    843000000    190522    0.358523    0.146878
Chr10    833500001    843500000    198695    0.348364    0.142337
Chr10    907500001    917500000    188814    0.303194    0.134047
Chr10    908000001    918000000    188023    0.30784    0.134735
Chr10    908500001    918500000    177218    0.325972    0.141704
Chr10    909000001    919000000    172557    0.330306    0.143429
Chr10    909500001    919500000    170225    0.330501    0.143302
Chr10    910000001    920000000    167175    0.331445    0.143473
Chr10    910500001    920500000    161173    0.339927    0.146942
Chr10    911000001    921000000    166810    0.326099    0.141194
Chr10    911500001    921500000    178310    0.303117    0.132816
Chr10    913000001    923000000    165485    0.315415    0.135838
Chr10    913500001    923500000    166015    0.312369    0.134471
Chr10    914000001    924000000    164948    0.300647    0.129295
Chr5    339500001    349500000    359338    0.303806    0.225534
Chr5    340000001    350000000    372311    0.30161    0.223115
Chr5    340500001    350500000    376857    0.305078    0.225052
Chr5    341000001    351000000    372535    0.309838    0.226312
Chr5    341500001    351500000    370578    0.313484    0.230478
Chr5    342000001    352000000    367784    0.32019    0.234927
Chr5    342500001    352500000    353695    0.328046    0.236537
Chr5    343000001    353000000    335937    0.344081    0.242521
Chr5    343500001    353500000    321851    0.349278    0.244561
Chr5    344000001    354000000    312717    0.349583    0.242606
Chr5    344500001    354500000    306113    0.346459    0.238858
Chr5    345000001    355000000    300994    0.344326    0.236792
Chr5    345500001    355500000    296987    0.337126    0.230382
Chr5    346000001    356000000    290487    0.326257    0.224837
Chr5    346500001    356500000    280104    0.32328    0.22359

但是目前还不足以下结论,必须要结合这个区域的Pi值才行

至于说5号染色体上的位置,似乎向右偏移了,这是为什么呢?好奇怪啊。

fst小区间是296.5MB-306MB,目前应该是向右偏移了40MB。

让子弹飞一会,结合Pi值吧

Pi 图 也已经出来了。

在Male的中间位置有一个巨大的收缩

Pi计算这个东西,我发现董老师的代码绘图结果有问题,最终纵坐标显示成了绝对数(2e+5,4e+5,6e+5),应该是读列出问题了,我在这个基础上再改一下,当然是AI改的

是的,这个程序统计的是N_VARIANTS

#!/usr/bin/env Rscript

library(argparse)
library(ggplot2)

parser <- ArgumentParser(description = "Plot genome-wide Pi (nucleotide diversity)")

parser$add_argument(
    "-i","--input", required=TRUE, type="character",
    help="windowed-pi input file (must contain CHROM, BIN_START, BIN_END, PI)"
)

parser$add_argument(
    "-F","--fai", required=TRUE, type="character",
    help="reference genome .fai file (for chr order and length)"
)

parser$add_argument(
    "-o","--outdir", default=getwd(), type="character",
    help="output directory"
)

parser$add_argument(
    "-p","--prefix", default="pi_plot", type="character",
    help="output file prefix"
)

parser$add_argument(
    "-b","--blank", default=0.005, type="double",
    help="gap ratio between chromosomes"
)

parser$add_argument(
    "--smooth", action="store_true",
    help="use geom_line instead of points (useful for Pi trends)"
)

parser$add_argument(
    "--threshold", type="double", default=NULL,
    help="draw a horizontal line at this Pi value (optional)"
)

parser$add_argument(
    "--point_size", type="double", default=0.6,
    help="point size (used when not smoothing)"
)

parser$add_argument(
    "--line_size", type="double", default=0.3,
    help="line width (when --smooth)"
)

parser$add_argument(
    "--ylim_max", type="double", default=NULL,
    help="set fixed y-axis max (optional)"
)

args <- parser$parse_args()

# --------------------------------------------------------
# 1. 读取数据
# --------------------------------------------------------
raw <- read.table(args$input, header=TRUE, comment.char="")

# 自动处理常见结构:CHROM START END ... PI
if (ncol(raw) >= 6) {
    colnames(raw)[1:6] <- c("chr","start","end","n_var","n_mono","pi")
} else {
    colnames(raw)[1:4] <- c("chr","start","end","pi")
}

raw$pi[raw$pi < 0] <- 0

# 增加窗口中点位置
raw$mid <- (raw$start + raw$end) / 2

# --------------------------------------------------------
# 2. 读取 fai 确定染色体顺序与长度
# --------------------------------------------------------
fai <- read.table(args$fai, header=FALSE)
colnames(fai) <- c("chr","length","a","b","c")
fai <- fai[,c("chr","length")]

raw <- raw[raw$chr %in% fai$chr,]

# --------------------------------------------------------
# 3. 构建 chr offset
# --------------------------------------------------------
chrlen <- fai$length
names(chrlen) <- fai$chr

blank <- args$blank * sum(chrlen)
chrpos <- cumsum(c(0, head(chrlen + blank, -1)))
names(chrpos) <- fai$chr

raw$pos <- raw$mid + chrpos[raw$chr]

# --------------------------------------------------------
# 4. 设置 chr 因子
# --------------------------------------------------------
raw$chr <- factor(raw$chr, levels=fai$chr, ordered=TRUE)

# --------------------------------------------------------
# 5. 绘图
# --------------------------------------------------------
p <- ggplot(raw, aes(x=pos, y=pi, color=chr))

if (args$smooth) {
    p <- p + geom_line(size=args$line_size)
} else {
    p <- p + geom_point(size=args$point_size)
}

# 色板循环使用
default_colors <- c("#4197d8", "#f8c120", "#d60b6f",
                    "#4abb6b", "#e66519", "#413496")
p <- p + scale_color_manual(values=rep(default_colors, 100))

# --------------------------------------------------------
# 6. X轴(染色体中点)
# --------------------------------------------------------
xmid <- tapply(raw$pos, raw$chr, function(x) mean(range(x)))
xmid <- unlist(xmid)

p <- p +
    scale_x_continuous(
        breaks = xmid,
        labels = names(xmid),
        expand = c(0.001, 0)
    ) +
    labs(x="Chromosome", y="Pi (nucleotide diversity)")

# --------------------------------------------------------
# 7. 添加阈值线
# --------------------------------------------------------
if (!is.null(args$threshold)) {
    p <- p + geom_hline(
        yintercept=args$threshold,
        linetype=2,
        size=0.3
    )
}

# --------------------------------------------------------
# 8. y 轴范围
# --------------------------------------------------------
if (!is.null(args$ylim_max)) {
    p <- p + ylim(0, args$ylim_max)
}

# --------------------------------------------------------
# 9. 样式优化
# --------------------------------------------------------
p <- p +
    theme_classic() +
    theme(
        legend.position = "none",
        axis.text.x = element_text(angle=65, hjust=1, vjust=1)
    )

# --------------------------------------------------------
# 10. 输出 PDF
# --------------------------------------------------------
out_pdf <- file.path(args$outdir, paste0(args$prefix, "_pi_plot.pdf"))

pdf(out_pdf, width=11, height=4)
print(p)
dev.off()

cat("Pi plot saved to: ", out_pdf, "\n")

GWAS

gwas的脚本我是分两边准备的,一边是服务器,一边是工作站,首先是服务器端的脚本

(fst_plot) root@4632f56a8ada:/project1/data# cat gwas_script.R 
#!/usr/bin/env Rscript

# =========================
# GWAS 脚本
# 输入参数:
#   1. VCF 文件 (.vcf.gz)
#   2. 表型文件 (两列:样本ID \t 表型值)
# =========================

args <- commandArgs(trailingOnly = TRUE)
if(length(args) != 2){
  stop("Usage: Rscript gwas_script.R <genotypes.vcf.gz> <phenotype.txt>")
}

vcf_file <- args[1]
pheno_file <- args[2]
# =========================
# 1. 读取表型文件
# =========================
phenotype <- read.table(pheno_file, header=FALSE, stringsAsFactors=FALSE)
colnames(phenotype) <- c("sample", "pheno")

# =========================
# 2. 读取 VCF 并转换为 GDS 文件
# =========================
vcf <- vcfR::read.vcfR(vcf_file)
# 转换为 SNPRelate GDS 格式
temp_gds <- tempfile(fileext = ".gds")
SNPRelate::vcfR2gds(vcf, temp_gds, verbose = FALSE)
genofile <- SNPRelate::snpgdsOpen(temp_gds)

# =========================
# 3. 确保样本顺序一致
# =========================
geno_samples <- read.gdsn(index.gdsn(genofile, "sample.id"))
phenotype <- phenotype[match(geno_samples, phenotype$sample), ]

# =========================
# 4. GWAS 分析(单标记线性模型)
# =========================
# 输出 GWAS 结果文件
gwas_outfile <- "gwas_results.txt"

assoc <- snpgdsAssocTest(
  genofile,
  phenotype = phenotype$pheno,
  snp.id = read.gdsn(index.gdsn(genofile, "snp.id")),
  method = "linear"
)

# 保存结果
write.table(assoc, gwas_outfile, quote=FALSE, sep="\t", row.names=FALSE)

# =========================
# 5. 绘制 Manhattan plot
# =========================
# 准备数据
manhattan_data <- data.frame(
  SNP = assoc$snp.id,
  CHR = assoc$chromosome,
  BP = assoc$position,
  P = assoc$p.value
)

# 绘制
pdf("manhattan_plot.pdf", width=10, height=5)
qqman::manhattan(manhattan_data, genomewideline=-log10(5e-8),
                 suggestiveline=FALSE, main="GWAS Manhattan Plot")
dev.off()

# =========================
# 关闭 GDS 文件
# =========================
snpgdsClose(genofile)

cat("GWAS 分析完成!结果保存在 gwas_results.txt,曼哈顿图保存在 manhattan_plot.pdf\n")

但是内存开销太大了,服务器似乎顶不住。

SNPs总数达到了676MB

工作站端跑通了一条染色体:Chr10

工作站的程序,目前放的是完整版本

### 1. VCF → PLINK BED
echo "[1/3] 生成 PLINK 二进制文件 (.bed/.bim/.fam)..."
plink --vcf "$VCF_FILE" \
      --make-bed \
      --out "$BED_PREFIX" \
      --double-id \
      --allow-no-sex \
      --pheno "$PHENO_FILE"

### 2. 生成 kinship
KINSHIP_FILE="$OUT_DIR/GEMMA_results/${CHR}_kinship.cXX.txt"
echo "[2/3] 生成 kinship 相关性矩阵..."
gemma -bfile "$BED_PREFIX" -gk 1 -o "${CHR}_kinship"
mv output/${CHR}_kinship.cXX.txt "$KINSHIP_FILE"

### 3. GEMMA GWAS
echo "[3/3] 运行 GEMMA GLM 和 MLM GWAS..."
# GLM
GLM_FILE="$OUT_DIR/GEMMA_results/${CHR}_glm.assoc.txt"
gemma -bfile "$BED_PREFIX" -lm 1 -o "${CHR}_glm"
mv output/${CHR}_glm.assoc.txt "$GLM_FILE"

# MLM
MLM_FILE="$OUT_DIR/GEMMA_results/${CHR}_mlm.assoc.txt"
gemma -bfile "$BED_PREFIX" -k "$KINSHIP_FILE" -lmm 1 -o "${CHR}_mlm"
mv output/${CHR}_mlm.assoc.txt "$MLM_FILE"

### ============= R 绘图部分(Bash 内嵌) ================
echo ""
echo "===== 绘制曼哈顿图和 QQ 图 ====="

plot_file() {
    local assoc_file=$1
    local prefix=$2
    local manhattan_png="$OUT_DIR/plots/${prefix}_manhattan.png"
    local qq_png="$OUT_DIR/plots/${prefix}_qq.png"

    # 如果绘图已经完成,则跳过
    if [[ -f "$manhattan_png" && -f "$qq_png" ]]; then
        echo "[✔] R 绘图文件已存在,跳过绘图: $prefix"
        return
    fi

    Rscript - <<EOF
library(data.table)
library(qqman)

# 读取 GEMMA 输出
res <- fread("$assoc_file")
# 识别 P 值列
p_col <- if ("p_wald" %in% colnames(res)) "p_wald" else if ("P" %in% colnames(res)) "P" else stop("无法识别 P 列")
chr_col <- if ("chr" %in% colnames(res)) "chr" else "CHR"
bp_col <- if ("ps" %in% colnames(res)) "ps" else "BP"
snp_col <- if ("rs" %in% colnames(res)) "rs" else "SNP"

# 删除 P 值为 NA 或 <=0 的行
res <- res[!is.na(get(p_col)) & get(p_col) > 0]
res[[chr_col]] <- as.integer(res[[chr_col]])

# 曼哈顿图
png("$manhattan_png", width=2000, height=1200, res=150)
manhattan(res, chr=chr_col, bp=bp_col, p=p_col, snp=snp_col,
          col=c("#3399ff","#99cc33","#ffcc00"),
          cex=0.5, cex.axis=0.7, cex.lab=0.8,
          genomewideline=FALSE, suggestiveline=FALSE)
dev.off()

# QQ 图
png("$qq_png", width=1200, height=1200, res=150)
qq(res[[p_col]], tck=-0.02, col="skyblue", main="$prefix QQ Plot")
dev.off()
EOF
    echo "[✔] R 绘图完成: $prefix"
}

# 绘制 GLM 和 MLM
plot_file "$GLM_FILE" "${CHR}_glm"
plot_file "$MLM_FILE" "${CHR}_mlm"

echo ""
echo "===== 🎉 单染色体 GWAS 分析完成! ====="
echo "PLINK + GEMMA 结果在: $OUT_DIR/GEMMA_results/"
echo "曼哈顿图 & QQ 图在:   $OUT_DIR/plots/"
echo "============================================="

这是MLM的模型结果,显然它过早地上抬了,需要跑完全部染色体的结果才知道到底是怎么一回事。

既然如此,让子弹飞一会,我要去睡觉了。

又跑了两条染色体,仍然是这样的趋势,所以这个模型的适配不太行。

0.1的阈值得到的Fst结果

从趋势来看,还算不错

这是Pi的结果,上面是female,下面是male,5号染色体的核苷酸多样性还是有差别的,但是除了五号以外,其余的染色体也有差别,所以这个结果目前不支持五号是性染色体。

基于EXON的GWAS分析

先把阿鹏的注释给更正了,我们vcf.gz文件里的染色体命名是Chr1,而李俞鹏的注释命名是Chr01

sed -E 's/Chr0?([1-9])/Chr\1/' female_ba_last.gff3 > female_ba_last.fix.gff3
awk '$3=="exon"{print $1"\t"$4-1"\t"$5"\t"$9}' female_ba_last.fix.gff3 > exons.raw.bed
sort -k1,1 -k2,2n exons.raw.bed | \
  cut -f1-3 | \
  bedtools merge -i - > exons.merged.bed
#需要先对vcf.gz文件构建索引哈

这样来看,好像10号更强一点,但是强的有限。

好像离专利的那三个SNP比较近。

全外显子的GWAS结果,SNP的很少,但是这个qq plot图的质量不错,比之前的乱七八糟的要强多了。

基于基因的GWAS分析

awk '$3=="gene"{print $1"\t"$4-1"\t"$5"\t"$9}' female_ba_last.fix.gff3 > exons.raw.bed
sort -k1,1 -k2,2n exons.raw.bed | \
  cut -f1-3 | \
  bedtools merge -i - > genes.merged.bed
bcftools view -R genes.merged.bed -Oz -o 11_27_merged_exon_vcf.gz 11_27_merged_vcf.gz

目前来看,基于外显子的GWAS分析的确是没有强烈的信号的。

好了,基因的提取已经挂上了。