超几何分布检验用于检验你感兴趣的基因集合中,落在某条染色体上的基因数量,是否显著多于随机水平。
# 输入你的数字
N <- 28000 # 全基因组总基因数(香榧大约这个量级)
K <- 1800 # 某条染色体总基因数
n <- 120 # 你的目标基因集大小
k <- 25 # 你的基因中落在这条染色体的数量
# 超几何检验
p_value <- phyper(k - 1, K, N - K, n, lower.tail = FALSE)
p_value进阶版
library(ggplot2)
library(dplyr)
# ===================== 数据 =====================
chr_data <- data.frame(
Chr = paste0("Chr", 1:11),
Observed = c(257,258,236,253,268,421,257,305,277,333,274),
# Observed = c(,,,,,,,,,,),
Total_genes = c(3873,3354,3509,3692,4313,6526,3988,4566,3872,4717,3893)
)
N_total_genome <- 46303
n_query <- 3139
# ===================== 计算 =====================
chr_data <- chr_data %>%
mutate(
Expected = n_query * Total_genes / N_total_genome,
Fold = Observed / Expected,
P_value = phyper(Observed - 1, Total_genes, N_total_genome - Total_genes, n_query, lower.tail = FALSE),
sig = case_when(
P_value < 0.001 ~ "***",
P_value < 0.01 ~ "**",
P_value < 0.05 ~ "*",
TRUE ~ ""
)
)
# 染色体按 1-11 顺序
chr_data$Chr <- factor(chr_data$Chr, levels = paste0("Chr", 1:11))
# ===================== ✅ 导出结果表格(Excel可打开)=====================
write.csv(
chr_data,
file = "E:/博士课题/性别偏向基因的鉴定/Mfuzz/染色体富集结果/cluster12_染色体富集结果.csv",
row.names = FALSE
)
# ===================== 绘图 =====================
ggplot(chr_data, aes(x = Chr, y = Fold)) +
geom_col(aes(fill = -log10(P_value)), width = 0.7) +
geom_hline(yintercept = 1, linetype = "dashed", color = "gray30") +
geom_text(aes(label = sig), size = 6, fontface = "bold", vjust = -0.8) +
scale_fill_gradient(low = "#D5D5D5", high = "#F3C8B7") +
# 顶部自动留空,保证星号显示
scale_y_continuous(breaks = seq(0, 2.0, 0.1), expand = expansion(mult = c(0, 0.15))) +
labs(x = "Chromosome", y = "Enrichment fold", fill = "-log10(P)") +
theme_bw() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1)
)
# 导出高清图片
ggsave("E:/博士课题/性别偏向基因的鉴定/Mfuzz/染色体富集结果/cluster12_chromosome_enrichment.pdf", width = 10, height = 6, dpi = 600)