表型数据分析 -直方图以及相关性批量分析

By Tingting, 20 March, 2026

Forums

做GWAS表型很多，不可能一个一个作图，所以对表型数据进行批量分析，利用AI写了一个R代码

用的表型数据形式如下：

第一列为样本为，之后每一列是一个表型，中间用制表符分隔开。

一、直方图分析


# 1. 加载所需包 ----------------------------------------------------------------
if (!require("tidyverse")) install.packages("tidyverse")
if (!require("ggpubr")) install.packages("ggpubr")
library(tidyverse)
library(ggpubr)

# 2. 设置工作路径并读取数据 ----------------------------------------------------
setwd("D:/1、博士期间/9、博士-实验以及项目/12、根腐病GWAS/基因型填补")
df_raw <- read.delim("表型作直方图文件20260318.txt", 
                     sep = "\t", 
                     row.names = 1,
                     check.names = FALSE,
                     stringsAsFactors = FALSE)

# 获取所有性状名称
all_traits <- colnames(df_raw)
cat("共有", length(all_traits), "个性状：\n")
print(all_traits)

# 创建输出文件夹
output_dir <- "histograms"
if (!dir.exists(output_dir)) dir.create(output_dir)

# 3. 定义绘图函数（单个性状的直方图 + 正态曲线 + Shapiro-Wilk p值）-----------
plot_histogram <- function(data, trait, out_dir) {
  # 提取性状数据并删除缺失值
  vec <- data[[trait]]
  vec <- vec[!is.na(vec)]
  n <- length(vec)
  
  if (n < 3) {
    cat("跳过", trait, "：样本数不足", n, "\n")
    return(NULL)
  }
  
  # 计算Shapiro-Wilk正态性检验p值（当样本量在3~5000之间）
  sw_test <- shapiro.test(vec)
  p_value <- sw_test$p.value
  p_label <- ifelse(p_value < 0.001, "p < 0.001", 
                    paste("p =", formatC(p_value, format = "f", digits = 3)))
  
  # 计算均值和标准差用于正态曲线
  mean_val <- mean(vec)
  sd_val <- sd(vec)
  
  # 生成安全文件名
  safe_name <- gsub("[^A-Za-z0-9]", "_", trait)
  filename <- file.path(out_dir, paste0(safe_name, "_histogram.pdf"))
  
  # 绘制直方图 + 密度曲线 + 理论正态曲线
  p <- ggplot(data.frame(x = vec), aes(x = x)) +
    geom_histogram(aes(y = after_stat(density)), 
                   bins = 20, fill = "skyblue", color = "black", alpha = 0.7) +
    geom_density(color = "red", linewidth = 1, linetype = "solid") +
    stat_function(fun = dnorm, args = list(mean = mean_val, sd = sd_val),
                  color = "blue", linewidth = 1, linetype = "dashed") +
    annotate("text", x = Inf, y = Inf, 
             label = paste0("Shapiro-Wilk: ", p_label), 
             hjust = 1.1, vjust = 1.5, size = 4, color = "black") +
    labs(title = trait, x = trait, y = "Density") +
    theme_bw() +
    theme(
      plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
      axis.text = element_text(face = "bold", size = 10),
      axis.title = element_text(face = "bold", size = 12),
      panel.border = element_rect(color = "black", linewidth = 0.8, fill = NA),
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank()
    )
  
  # 保存
  ggsave(filename = filename, plot = p, width = 6, height = 5)
  cat("已保存:", filename, "样本数:", n, "\n")
}

# 4. 批量生成所有性状的直方图 -------------------------------------------------
for (trait in all_traits) {
  plot_histogram(df_raw, trait, output_dir)
}

cat("全部完成！共生成", length(all_traits), "个PDF文件，保存在", output_dir, "文件夹中。\n")

二、相关性分析

# 1. 安装并加载所需包 --------------------------------------------------------
if (!require("tidyverse")) install.packages("tidyverse")
if (!require("ggpubr")) install.packages("ggpubr")
library(tidyverse)
library(ggpubr)

# 2. 设置工作路径并读取数据 ----------------------------------------------------
setwd("D:/1、博士期间/9、博士-实验以及项目/12、根腐病GWAS/基因型填补")
df_raw <- read.delim("表型作直方图文件20260318.txt", 
                     sep = "\t", 
                     row.names = 1,
                     check.names = FALSE,
                     stringsAsFactors = FALSE)

# 查看列名
all_traits <- colnames(df_raw)
cat("共有", length(all_traits), "个性状：\n")
print(all_traits)

# 创建输出文件夹
output_dir <- "pairwise_correlations"
if (!dir.exists(output_dir)) dir.create(output_dir)

# 3. 定义绘图函数（使用ggplot2 + .data[[ ]] 避免特殊字符问题）-----------------
plot_pair <- function(data, trait1, trait2, out_dir) {
  # 提取数据并删除缺失值
  df_plot <- data[, c(trait1, trait2)]
  df_plot <- df_plot[complete.cases(df_plot), ]
  n <- nrow(df_plot)
  
  # 如果样本数太少（如<3），跳过绘图
  if (n < 3) {
    cat("跳过", trait1, "vs", trait2, "：样本数不足", n, "\n")
    return(NULL)
  }
  
  # 生成安全文件名（替换空格和特殊字符）
  safe_name1 <- gsub("[^A-Za-z0-9]", "_", trait1)
  safe_name2 <- gsub("[^A-Za-z0-9]", "_", trait2)
  filename <- file.path(out_dir, paste0(safe_name1, "_vs_", safe_name2, ".pdf"))
  
  # 绘制图形
  p <- ggplot(df_plot, aes(x = .data[[trait1]], y = .data[[trait2]])) +
    geom_point(color = "#a6d38d", size = 3, alpha = 0.7) +
    geom_smooth(method = "lm", se = FALSE, color = "black", linetype = "solid", linewidth = 0.8) +
    stat_cor(method = "pearson", 
             label.x.npc = "left",
             label.y.npc = "top",
             size = 5,
             digits = 2) +
    labs(x = trait1, y = trait2) +
    theme_bw() +
    theme(
      panel.border = element_rect(color = "black", linewidth = 0.8, fill = NA),
      axis.text = element_text(face = "bold", size = 12),
      axis.title = element_text(face = "bold", size = 14),
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank()
    )
  
  # 保存
  ggsave(filename = filename, plot = p, width = 6, height = 5)
  cat("已保存:", filename, "样本数:", n, "\n")
}

# 4. 生成所有两两组合并绘图 ---------------------------------------------------
trait_combinations <- combn(all_traits, 2, simplify = FALSE)

for (i in seq_along(trait_combinations)) {
  trait_pair <- trait_combinations[[i]]
  trait1 <- trait_pair[1]
  trait2 <- trait_pair[2]
  plot_pair(df_raw, trait1, trait2, output_dir)
}

cat("全部完成！共生成", length(trait_combinations), "个PDF文件，保存在", output_dir, "文件夹中。\n")

示意图片: