这个批量火山图的批量原理是读取目录下的所有.txt文件,随后通过一个循环逐个处理这些文件,再输出到相应的_result文件夹下。并且代码循环中自带分组检测,能够识别列名中的字母来实现分组,因此只需要准备好表达矩阵即可,组内的同学可以使用学长学姐整理好的数据🥰,不过需要注意的是样本名不能重复,有重复的话可以手动编号一下。如果遇到很难处理的数据,可以先让ai写出一个小代码,将数据转化为输入数据的格式,再运行代码。
代码主体
1、数据读取和输入
这里的Normal和Tumor变量用于分组,比如说在这里Normal的功能就是将所用行名中带N的归位正常组,Tumor将所有行名中带T的归为肿瘤组。如果使用TCGA的数据可以把这段代码改为:
Normal <- "01A"
Tumor <- "11A"
# 1. 安装和加载必要的包 --------------------------------------------------------
required_packages <- c("limma", "ggplot2", "pheatmap", "dplyr", "tidyr", "RColorBrewer", "stringr")
# 检查并安装缺失的包
new_packages <- required_packages[!(required_packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) {
message("正在安装缺失的包: ", paste(new_packages, collapse = ", "))
install.packages(new_packages, dependencies = TRUE)
}
# 加载所有需要的包
suppressPackageStartupMessages({
library(limma)
library(ggplot2)
library(pheatmap)
library(dplyr)
library(tidyr)
library(RColorBrewer)
library(stringr)
})
# 输入
setwd("/Users/blastora/Documents/Bioinformation_PROJECT/Volcano_Batch")
Normal <- "N|NORMAL|NORM"
Tumor <- "T|TUMOR|TUMOUR|TUM"
# 2. 主处理流程 --------------------------------------------------------------
# 获取当前目录下所有txt文件
input_files <- list.files(pattern = "\\.txt$", full.names = TRUE)
if(length(input_files) == 0) {
stop("没有找到任何.txt文件")
}
2、循环主体
代码运行的主体由一个for循囊括起来,达到自动化的目的
# 处理每个文件
for(input_file in input_files) {
message("正在处理文件: ", basename(input_file))
# 创建输出目录
output_dir <- file.path(dirname(input_file),
paste0(tools::file_path_sans_ext(basename(input_file)), "_results"))
dir.create(output_dir, showWarnings = FALSE)
# 1. 读取数据 --------------------------------------------------------------
tryCatch({
expr_data <- as.matrix(read.delim(input_file, row.names = 1, check.names = FALSE))
message("数据维度: ", nrow(expr_data), " 基因 × ", ncol(expr_data), " 样本")
# 2. 自动分组 ------------------------------------------------------------
message("\n2. 自动检测样本分组...")
# 转换为大写以防大小写不一致
upper_names <- toupper(colnames(expr_data))
# 检测N和T标记
is_normal <- str_detect(upper_names, Normal)
is_tumor <- str_detect(upper_names, Tumor)
# 验证分组结果
if(sum(is_normal) == 0 || sum(is_tumor) == 0) {
stop("无法自动检测分组:请检查样本命名是否包含N/T标记")
}
if(any(is_normal & is_tumor)) {
warning("部分样本同时包含N和T标记,请检查样本命名")
}
# 创建分组因子
group_info <- factor(ifelse(is_normal, "Normal", "Tumor"),
levels = c("Normal", "Tumor"))
message("自动分组结果:")
message(" Normal样本: ", sum(group_info == "Normal"))
message(" Tumor样本: ", sum(group_info == "Tumor"))
# 3. 差异表达分析 -------------------------------------------------------
message("\n3. 执行差异表达分析...")
# 确保输入有效
if(ncol(expr_data) != length(group_info)) {
stop("样本数量与分组因子长度不匹配")
}
# 创建设计矩阵
design <- model.matrix(~0 + group_info)
colnames(design) <- levels(group_info)
# 线性模型拟合
fit <- lmFit(expr_data, design)
# 设置对比矩阵
contrast <- paste("Tumor", "Normal", sep = "-")
contrast.matrix <- makeContrasts(contrasts = contrast, levels = design)
# 执行对比分析
fit2 <- contrasts.fit(fit, contrast.matrix)
fit2 <- eBayes(fit2)
# 获取完整结果
deg_results <- topTable(fit2, number = Inf, adjust.method = "BH")
# 添加基因ID列
deg_results$Gene <- rownames(deg_results)
# 4. 保存结果 -----------------------------------------------------------
message("\n4. 保存分析结果...")
write.csv(deg_results, file.path(output_dir, "differential_expression_results.csv"),
row.names = FALSE)
# 保存显著差异基因列表
sig_genes <- deg_results %>%
filter(adj.P.Val < 0.05, abs(logFC) > 1)
write.csv(sig_genes, file.path(output_dir, "significant_genes.csv"),
row.names = FALSE)
# 5. 可视化 -------------------------------------------------------------
message("\n5. 生成可视化结果...")
# 火山图
message(" 生成火山图...")
# 添加显著性标记
deg_results <- deg_results %>%
mutate(
Significance = case_when(
adj.P.Val < 0.05 & logFC > 1 ~ "Up (Tumor)",
adj.P.Val < 0.05 & logFC < -1 ~ "Down (Tumor)",
TRUE ~ "Not significant"
)
)
# 计算显示标签的基因(最显著的前10个上调和下调基因)
label_data <- deg_results %>%
filter(adj.P.Val < 0.05) %>%
group_by(Significance) %>%
arrange(desc(abs(logFC))) %>%
slice_head(n = 10) %>%
ungroup()
rownames(label_data) <- label_data$Gene
# 绘制火山图
volcano_plot <- ggplot(deg_results, aes(x = logFC, y = -log10(P.Value), color = Significance)) +
geom_point(alpha = 0.6, size = 1.5) +
scale_color_manual(values = c("Up (Tumor)" = "#E64B35",
"Down (Tumor)" = "#3182bd",
"Not significant" = "#bdbdbd")) +
geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "grey50") +
geom_vline(xintercept = c(-1, 1), linetype = "dashed", color = "grey50") +
labs(title = paste("Volcano Plot:", basename(input_file)),
x = expression(log[2]~fold~change~(Tumor/Normal)),
y = expression(-log[10]~adjusted~P~value),
color = "Differential Expression") +
theme_minimal(base_size = 12) +
theme(legend.position = "bottom",
plot.title = element_text(hjust = 0.5, face = "bold"))
# 添加基因标签
if(nrow(label_data) > 0) {
volcano_plot <- volcano_plot + ggrepel::geom_text_repel(
data = label_data,
aes(label = rownames(label_data)),
size = 3,
box.padding = 0.5,
max.overlaps = Inf
)
}
# 保存火山图
ggsave(file.path(output_dir, "volcano_plot.pdf"), volcano_plot,
width = 10, height = 8)
ggsave(file.path(output_dir, "volcano_plot.png"), volcano_plot,
width = 10, height = 8, dpi = 300)
# 热图
message(" 生成热图...")
# 筛选显著差异基因
sig_genes <- deg_results %>%
filter(adj.P.Val < 0.05, abs(logFC) > 1) %>%
arrange(adj.P.Val) %>%
head(50)
if(nrow(sig_genes) > 0) {
# 提取表达数据
plot_data <- expr_data[rownames(sig_genes), , drop = FALSE]
# 标准化数据(按行)
plot_data <- t(scale(t(plot_data)))
# 设置分组注释
annotation_col <- data.frame(Group = group_info)
rownames(annotation_col) <- colnames(expr_data)
# 设置颜色
color_palette <- colorRampPalette(rev(brewer.pal(n = 7, name = "RdYlBu")))(100)
# 绘制热图
pdf(file.path(output_dir, "heatmap.pdf"), width = 12, height = 10)
pheatmap(plot_data,
color = color_palette,
annotation_col = annotation_col,
show_rownames = ifelse(nrow(sig_genes) <= 30, TRUE, FALSE),
fontsize_row = 8,
fontsize_col = 8,
main = paste("Top", nrow(sig_genes), "Differentially Expressed Genes"),
cluster_cols = TRUE,
cluster_rows = TRUE,
border_color = NA)
dev.off()
# 保存热图为PNG
png(file.path(output_dir, "heatmap.png"), width = 1200, height = 1000, res = 150)
pheatmap(plot_data,
color = color_palette,
annotation_col = annotation_col,
show_rownames = ifelse(nrow(sig_genes) <= 30, TRUE, FALSE),
fontsize_row = 8,
fontsize_col = 8,
main = paste("Top", nrow(sig_genes), "Differentially Expressed Genes"),
cluster_cols = TRUE,
cluster_rows = TRUE,
border_color = NA)
dev.off()
} else {
message("没有满足条件的差异表达基因,跳过热图生成")
}
# 6. 保存会话信息 -------------------------------------------------------
writeLines(capture.output(sessionInfo()),
file.path(output_dir, "session_info.txt"))
message("\n分析完成! 结果已保存到: ", output_dir)
message(strrep("=", 50))
}, error = function(e) {
message("\n处理文件 ", basename(input_file), " 时出错: ", e$message)
# 保存错误信息
writeLines(paste("Error:", e$message), file.path(output_dir, "error_log.txt"))
})
}
下面是这个循环中关键环节的拆解,首先是火山图,tryCatch()函数用于输出报错,方便调试。
tryCatch()
ggplot()
是 ggplot2
包中的核心函数,用于创建一个新的图形对象,简洁的来说就是构建绘图的框架,deg_results对应输入数据,aes()定义坐标轴,color定义颜色。
ggplot(deg_results, aes(x = logFC, y = -log10(P.Value), color = Significance))
geom_point()用于添加散点,用法基本与上文提到的geom_jitter()函数用法一致,alpha调节透明度,size调整散点大小。scale_color_manual()
用于手动设置颜色映射,values 参数指定了不同 Significance 级别的颜色,这里为上调基因("Up (Tumor)")设置为红色,下调基因("Down (Tumor)")设置为蓝色,非显著基因("Not significant")设置为灰色。
geom_point(alpha = 0.6, size = 1.5) +
scale_color_manual(values = c("Up (Tumor)" = "#E64B35",
"Down (Tumor)" = "#3182bd",
"Not significant" = "#bdbdbd"))
geom_hline() 用于在图形中添加水平线条。yintercept设置水平线条的 y 轴截距,这里表示 p 值为 0.05 的阈值线。linetype = "dashed" 设置线条类型为虚线,color = "grey50" 设置线条颜色为灰色。
geom_vline() 用于在图形中添加垂直线条。xintercept设置垂直线条的 x 轴截距,这里表示 logFC 为 -1 和 1 的阈值线。linetype = "dashed" 设置线条类型为虚线。color = "grey50" 设置线条颜色为灰色。
geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "grey50") +
geom_vline(xintercept = c(-1, 1), linetype = "dashed", color = "grey50")
theme_minimal(base_size = 12 ) 设置主题的基本字体大小为 12。theme() 用于进一步自定义图形的主题。legend.position = "bottom" 将图例放置在图形的底部。plot.title = element_text(hjust = 0.5, face = "bold") 设置图形标题的水平对齐方式为居中(hjust = 0.5),字体加粗(face = "bold")。
theme_minimal(base_size = 12) +
theme(legend.position = "bottom",
plot.title = element_text(hjust = 0.5, face = "bold"))
ggrepel::geom_text_repel() 用于在图形中添加文本标签,并自动调整位置以避免重叠。
aes(label = rownames(label_data)) 设置标签的内容为 label_data 的行名。size = 3 设置标签文本的大小为 3。box.padding = 0.5 设置标签周围的空白区域。max.overlaps = Inf 设置最大重叠数为无穷大,确保所有标签都能显示。
volcano_plot <- volcano_plot + ggrepel::geom_text_repel(
data = label_data,
aes(label = rownames(label_data)),
size = 3,
box.padding = 0.5,
max.overlaps = Inf)
热图的绘图参数比较简单,大家可以豆包或者deepseek一下了解其参数构成