setwd("/Volumes/MacData/1_SynologyDrive/1_Job/05课程/03-课程-公选课-生命大数据与R语言——数据驱动的生命科学探索/文件/")
# 安装 BiocManager(如果尚未安装)
if (!requireNamespace("BiocManager", quietly = TRUE)) {
install.packages("BiocManager", quiet = TRUE)
}
# 静默检查并安装 CRAN 包
cran_pkgs <- c("ggplot2", "ggrepel", "dplyr")
for (pkg in cran_pkgs) {
if (!requireNamespace(pkg, quietly = TRUE)) {
install.packages(pkg, quiet = TRUE)
}
suppressPackageStartupMessages(library(pkg, character.only = TRUE))
}
# 静默安装并加载 Bioconductor 包
if (!requireNamespace("DESeq2", quietly = TRUE)) {
BiocManager::install("DESeq2", quiet = TRUE, ask = FALSE)
}
suppressPackageStartupMessages(library(DESeq2))
#这里的75仅仅是我项目中的一个count的序号的标记,代码中还是使用count,便于函数的书写
Count75 <- readRDS("Count75.RDS")
print(Count75[1:5,1:5])
## ENSEMBL SRR23104045 SRR23104046 SRR23104047 SRR23104048
## 1 ENSMUSG00000102628 0 0 0 0
## 2 ENSMUSG00000100595 0 0 1 0
## 3 ENSMUSG00000097426 0 0 0 0
## 4 ENSMUSG00000104478 0 0 0 0
## 5 ENSMUSG00000104385 0 0 1 0
Meta75 <- readRDS("Meta75.RDS")
print(Meta75[1:5,1:2])
## Run Group
## 1 SRR23104057 Control
## 2 SRR23104058 Control
## 3 SRR23104059 Control
## 4 SRR23104060 Control
## 5 SRR23104053 ICB
#geneid_df1.mus <- readRDS("~/1_home_gaoy/ICBS_r/xy/geneid_df1.mus.RDS")
geneid_df1.mus <- readRDS("geneid_df1.mus.RDS")
count <- Count75
count.use <- dplyr::left_join(geneid_df1.mus,count,by="ENSEMBL")
rownames(count.use) <- count.use$SYMBOL
meta <- Meta75
meta5<-meta[meta$Group %in% c("ICB", "Combination"),]
#根据列名取需要的count列,例如count.use[,"SRR23104053"]
count5<-as.data.frame(count.use[,meta5$Run])
#判断是否一致,避免出错,一些同学会漏了meta5中的“5”,导致False
identical(colnames(count5),meta5$Run)
## [1] TRUE
meta <- meta5
#count5 <- count5[rowSums(count5) >= 20, ]
CountFilter=function(count,meta){
Group<-meta$Group[!duplicated(meta$Group)]
#unique(meta$Group)
n1<-nrow(meta[meta$Group==Group[1],])
# length(which(meta$Group==Group[1]))
#n1 <- meta %>% dplyr::filter(., Group == Group[1]) %>% dim() %>% .[1]
n2<-nrow(meta[meta$Group==Group[2],])
n<-ifelse(n1<n2,n1,n2)
count = count[rowSums(count >=10) >= n,]
return(count)
}
count5<-CountFilter(count5,meta5)
library(DESeq2)
count <- count5
treatment <- "Combination"
Control <- "ICB"
dds <- DESeqDataSetFromMatrix(count, colData=meta, design= ~ Group)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res <- results(dds, contrast=c("Group",treatment,Control))
res$SYMBOL<-row.names(res)
res.df <- data.frame(res, stringsAsFactors = FALSE, check.names = FALSE)
res.df <- res.df[!is.na(res.df$padj), ]
# library (ggplot2)
# library(ggrepel)
# library(dplyr)
#(res.df$padj < 0.05) & abs(res.df$log2FoldChange) >=1
res.df$change <- ifelse((res.df$padj < 0.05) & (abs(res.df$log2FoldChange) >= 1),
ifelse(res.df$log2FoldChange> 1 ,'Up','Down'),
'Stable')
ifelse(条件, 为真时的值, 为假时的值) 是 R 的向量化条件判断函数。
外层判断:
(res.df$padj < 0.05) & (abs(res.df$log2FoldChange) >= 1)
👉 这是判断基因是否差异显著且表达变化足够大的标准
• 统计学显著性(FDR校正后):
padj < 0.05:
• 表达倍数差异至少是 2 倍(log2FoldChange ≥ 1 或 ≤ -1):
abs(log2FoldChange) >= 1
✅ 如果这两个条件都满足,就进入下一步判断。
内层判断:
ifelse(res.df$log2FoldChange > 1, 'Up', 'Down')
在差异显著的前提下:
• 如果 log2FoldChange > 1 → 表达上调,标注为 “Up”
• 否则(即 ≤ -1)→ 表达下调,标注为 “Down”
否则(不显著或变化太小):表示无显著表达变化,标注为 “Stable”
“Up” → 表达上调(显著&倍数大于2)
“Down” → 表达下调(显著&倍数小于1/2)
“Stable” → 不显著或倍数变化小
# 自定义差异基因颜色标注
# https://colorhunt.co/
# 提取 res.df$change(表示基因是 Up / Down / Stable)中的 唯一值,并去除 NA,以确定当前结果中到底有哪些类型的变化。
#uniq_change <- res.df$change[!duplicated(res.df$change)]
#uniq_change <- uniq_change[!is.na(uniq_change)] # 去除 NA
uniq_change <- res.df$change %>% unique()
# 如果三种状态都有(Up、Down、Stable),就给三种颜色:
# Down: #0072B5 蓝
# Stable: grey 灰
# Up: #BC3C28 红
# 如果只有 Up 和 Stable:灰 + 红
# 如果只有 Down 和 Stable:蓝 + 灰
if (length(uniq_change) == 3) {
colorlist <- c("#0072B5", "grey", "#BC3C28")
} else if ("Up" %in% uniq_change) {
colorlist <- c("grey", "#BC3C28")
} else {
colorlist <- c("#0072B5", "grey")
}
p <- ggplot( #设置数据
res.df,
aes(x = log2FoldChange,
y = -log10(padj),
colour=change)) +
geom_point(alpha=0.6, size=3) +
scale_color_manual(values=colorlist,na.translate = FALSE )+
# "#0072B5","grey","#BC3C28"
# "#546de5", "#d2dae2","#ff4757"
geom_vline(xintercept=c(-1,1),lty=3,col="grey",lwd=1) +
geom_hline(yintercept = -log10(0.05),lty=3,col="grey",lwd=1) +
# 坐标轴
labs(x="log2FoldChange",
# y="-log10(Adjusted P-Value)")+
y=expression(-log[10](adjusted~italic(p)*"-value")))+
theme_bw()+
theme(legend.position='bottom',
legend.title = element_blank(),
panel.background = element_blank(),
text=element_text(family="sans"),
axis.title = element_text(size = 14),
legend.text = element_text(size = 14),
axis.text = element_text(size = 10))
print(p)
lfcBlow0 <- res.df %>% dplyr::filter(log2FoldChange < -1,padj<0.05)
lfcAbove0 <- res.df %>% dplyr::filter(log2FoldChange > 1,padj<0.05)
p+geom_label_repel(data = lfcBlow0, #指定用于添加标签的数据集为 lfcBlow0,通常是差异表达显著下调的基因子集
aes( # 映射美学属性(aes):
x = log2FoldChange, #x = log2FoldChange:x 轴是 log2 倍变化
y = -log10(padj), # y 轴是 padj 的负对数,padj 越小越靠上
label = SYMBOL), #每个点对应的基因符号显示为标签
xlim=c(NA,-1),#限制标签只出现在 x 轴小于 -1 的区域。
fill="#daeff8", # 标签框的背景颜色为淡蓝色(你用于下调基因的色调)
)
## Warning: ggrepel: 11 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## 标签要分两块,一个是上调的基因标签,一个是下调的基因标签
lfcBlow0 <- res.df %>% dplyr::filter(log2FoldChange < -1,padj<0.05)
lfcAbove0 <- res.df %>% dplyr::filter(log2FoldChange > 1,padj<0.05)
p+geom_label_repel(data = lfcBlow0, #指定用于添加标签的数据集为 lfcBlow0,通常是差异表达显著下调的基因子集
aes( # 映射美学属性(aes):
x = log2FoldChange, #x = log2FoldChange:x 轴是 log2 倍变化
y = -log10(padj), # y 轴是 padj 的负对数,padj 越小越靠上
#fill=(factor(change)),
alpha=0.9, #透明度
label = SYMBOL), #每个点对应的基因符号显示为标签
xlim=c(NA,-1),#限制标签只出现在 x 轴小于 -1 的区域。
fill="#daeff8", # 标签框的背景颜色为淡蓝色(你用于下调基因的色调)
size = 3, #标签文字大小为 3
max.overlaps=25, # 最多允许重叠标签的数量为 30,避免太多标签造成混乱
fontface="bold", # 标签文字加粗显示
color="black", # 标签文字颜色为黑色
box.padding=unit(0.35, "lines"), # 标签框与周围标签之间的最小距离为 0.35 行高
point.padding=unit(0.5, "lines"), # 标签与其对应点之间的最小间距为 0.5 行高。
segment.colour = "grey50", # 标签与点之间的连线颜色为中灰色(灰度为 50%)。
min.segment.length = 0.01,#连线的最小长度为 0.01 NPC 单位(图形空间中的比例单位),防止太短的线让箭头堆在一起。
max.iter = 8000, #尝试优化排布的最大迭代次数,越大越精准,越慢,默认是 2000,这里设置为 8000,适用于标签密集场景。
force = 2, #标签排斥力,控制标签远离重叠的程度。值越大越分散,适合标签密集的情况。
segment.curvature = 0,#线条弯曲程度,设置为 0 表示直线。
# 连线箭头设置:
# • length = 0.01 npc:箭头的长度(单位是图形比例单位)。
# • type = "closed":箭头为实心。
# • ends = "first":箭头在连线起点(靠近点的一端)。
# • angle = 10:箭头张角为 10°。
arrow = arrow(length = unit(0.01,"npc"), type = "closed",
ends = "first", angle = 10))+
geom_label_repel(data = lfcAbove0,
aes(
x = log2FoldChange,
y = -log10(padj),
#fill=(factor(change)),
#alpha=0.9,
label = SYMBOL),
xlim=c(1,NA),
fill="#fde2e7",
size = 2.5,max.overlaps=20,fontface="bold",
color="black", box.padding=unit(0.35, "lines"), point.padding=unit(0.5, "lines"),
segment.colour = "grey50",
min.segment.length = 0.01,
max.iter = 8000,
force = 2,
segment.curvature = 0,
arrow = arrow(length = unit(0.01,"npc"), type = "closed",
ends = "first", angle = 10))
## Warning: ggrepel: 587 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
#✅ force = 1
# • 作用:控制标签之间互相排斥的“斥力”大小,防止重叠。
# • 值越大:标签之间的间距越大,更不容易重叠,但布局可能更松散。
# • 默认值:1
# • 建议调节范围:0.1 ~ 5
#
# ✅ 用于:让标签分得更开。
# ✅ max.iter = 3000
# • 作用:最大迭代次数,控制排布算法的尝试次数。
# • 值越大:布局计算更精细,可能避免重叠,但也可能导致绘图变慢。
# • 默认值:2000
#
# ✅ 用于:在复杂图中提高标签排布质量。
#
# ✅ min.segment.length = 0.1
# • 作用:设置引导线(segment)的最短长度(以“npc”为单位,图像的相对长度)。
# • 值越小:允许更短的引导线,从而标签可以更靠近点。
# • 默认值:0.5
#
# ✅ 用于:让标签贴得更近,不让引导线太长。
fun_label <- function(data,color.lable,a,b,max.overlaps){
geom_label_repel(data = data,
aes(
x = log2FoldChange,
y = -log10(padj),
#fill=(factor(change)),
#alpha=0.9,
label = SYMBOL),
xlim=c(a,b),
fill= color.lable,
size = 2.5,max.overlaps=max.overlaps,fontface="bold",
color="black", box.padding=unit(0.35, "lines"), point.padding=unit(0.5, "lines"),
segment.colour = "grey50",
min.segment.length = 0.01,
max.iter = 8000,
force = 2,
segment.curvature = 0,
arrow = arrow(length = unit(0.01,"npc"), type = "closed",
ends = "first", angle = 10))
}
#
lfcBlow0 <- res.df %>% dplyr::filter(log2FoldChange < -1,padj<0.05)
lfcAbove0 <- res.df %>% dplyr::filter(log2FoldChange > 1,padj<0.05)
p+fun_label(lfcBlow0,"#daeff8",NA,-1,20)+fun_label(lfcAbove0,"#fde2e7",1,NA,20)
## Warning: ggrepel: 587 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## 写成总函数
volcano_plot<-function(res.df,cut_off_logFC,cut_off_padj,lab){
res.df$change = ifelse((res.df$padj < cut_off_padj) & (abs(res.df$log2FoldChange) >= cut_off_logFC),
ifelse(res.df$log2FoldChange> cut_off_logFC ,'Up','Down'),
'Stable')
res.df$label = ifelse(res.df$padj < cut_off_padj & abs(res.df$log2FoldChange) >=cut_off_logFC, as.character(res.df$SYMBOL),"")
uniq_change <- res.df$change %>% unique()
if (length(uniq_change) == 3) {
colorlist <- c("#0072B5", "grey", "#BC3C28")
} else if ("Up" %in% uniq_change) {
colorlist <- c("grey", "#BC3C28")
} else {
colorlist <- c("#0072B5", "grey")
}
p <- ggplot( #设置数据
res.df,
aes(x = log2FoldChange,
y = -log10(padj),
colour=change)) +
geom_point(alpha=0.6, size=3) +
scale_color_manual(values=colorlist,na.translate = FALSE )+
# "#0072B5","grey","#BC3C28"
# "#546de5", "#d2dae2","#ff4757"
geom_vline(xintercept=c(-1,1),lty=3,col="grey",lwd=1) +
geom_hline(yintercept = -log10(0.05),lty=3,col="grey",lwd=1) +
# 坐标轴
labs(x="log2FoldChange",
# y="-log10(Adjusted P-Value)")+
y=expression(-log[10](adjusted~italic(p)*"-value")))+
theme_bw()+
theme(legend.position='bottom',
legend.title = element_blank(),
panel.background = element_blank(),
text=element_text(family="sans"),
axis.title = element_text(size = 14),
legend.text = element_text(size = 14),
axis.text = element_text(size = 10))
#return(p)
fun_label <- function(data,color.lable,a,b,max.overlaps){
geom_label_repel(data = data,
aes(
x = log2FoldChange,
y = -log10(padj),
#fill=(factor(change)),
#alpha=0.9,
label = SYMBOL),
xlim=c(a,b),
fill= color.lable,
size = 2.5,max.overlaps=max.overlaps,fontface="bold",
color="black", box.padding=unit(0.35, "lines"), point.padding=unit(0.5, "lines"),
segment.colour = "grey50",
min.segment.length = 0.01,
max.iter = 8000,
force = 2,
segment.curvature = 0,
arrow = arrow(length = unit(0.01,"npc"), type = "closed",
ends = "first", angle = 10))
}
lfcBlow0 <- res.df %>% dplyr::filter(log2FoldChange < -cut_off_logFC,padj<cut_off_padj)
lfcAbove0 <- res.df %>% dplyr::filter(log2FoldChange > cut_off_logFC,padj<cut_off_padj)
p.final <-p+fun_label(lfcBlow0,"#daeff8",NA,-1,lab)+fun_label(lfcAbove0,"#fde2e7",1,NA,lab)
return(p.final)
}
volcano_plot<-function(res.df,cut_off_logFC,cut_off_padj){
res.df$change = ifelse((res.df$padj < cut_off_padj) & (abs(res.df$log2FoldChange) >= cut_off_logFC),
ifelse(res.df$log2FoldChange> cut_off_logFC ,'Up','Down'),
'Stable')
res.df$label = ifelse(res.df$padj < cut_off_padj & abs(res.df$log2FoldChange) >=cut_off_logFC, as.character(res.df$SYMBOL),"")
N_up<-res.df %>% dplyr::filter(padj < cut_off_padj & log2FoldChange > cut_off_logFC) %>% arrange(desc(log2FoldChange))
FC_up<-ifelse(nrow(N_up)>45,N_up$log2FoldChange[45],N_up$log2FoldChange[nrow(N_up)])
N_down<-res.df %>% dplyr::filter((padj < cut_off_padj) & (log2FoldChange < (-cut_off_logFC))) %>% arrange(log2FoldChange)
FC_down<-ifelse(nrow(N_down)>45,N_down$log2FoldChange[45],N_down$log2FoldChange[nrow(N_down)])
res.df$label = ifelse((res.df$padj < cut_off_padj & (res.df$log2FoldChange >=FC_up | res.df$log2FoldChange <= FC_down)), as.character(res.df$SYMBOL),"")
res.df<-res.df %>% dplyr::mutate(color=ifelse(label=="","",ifelse(change=="Up","#fde2e7","#daeff8")))
# if(res.df$change[!duplicated(res.df$change)] %>% length() ==3){
# colorlist<-c("#0072B5","grey","#BC3C28")
# }else if("Up" %in% res.df$change[!duplicated(res.df$change)]){
# colorlist<-c("grey","#BC3C28")
# }else{
# colorlist<-c("#0072B5","grey")
# }
uniq_change <- res.df$change[!duplicated(res.df$change)]
uniq_change <- uniq_change[!is.na(uniq_change)] # 去除 NA
#https://colorhunt.co/
if (length(uniq_change) == 3) {
colorlist <- c("#0072B5", "grey", "#BC3C28")
} else if ("Up" %in% uniq_change) {
colorlist <- c("grey", "#BC3C28")
} else {
colorlist <- c("#0072B5", "grey")
}
# 火山图主图绘制
# x 轴:log2FoldChange(对数倍数变化)
# y 轴:-log10(padj)(显著性)
# 用 color=change 控制点的颜色(Up/Down/Stable)
# 设置虚线阈值:|log2FC| > 1,padj < 0.05
p <- ggplot( #设置数据
res.df,
aes(x = log2FoldChange,
y = -log10(padj),
colour=change)) +
geom_point(alpha=0.6, size=3) +
#ggplot(res.df,aes(x=log2FoldChange,y= -log10(padj),colour=change))+geom_point(size=3)+scale_color_manual(values=(c("red","grey","blue")))
scale_color_manual(values=colorlist,na.translate = FALSE )+
# "#0072B5","grey","#BC3C28"
# "#546de5", "#d2dae2","#ff4757"
# geom_vline() 添加竖直参考线(vertical line)
# xintercept
# 设置竖线的位置,这里是 正负cut_off_10gFC
# (即 log2FoldChange 的阈值)
# lty = 3 虚线线型 (line type 3= 点划线)
# col = "grey" 线的颜色为灰色
# lwd = 1 线的粗细为 1
geom_vline(xintercept=c(-1,1),lty=3,col="grey",lwd=1) +
geom_hline(yintercept = -log10(0.05),lty=3,col="grey",lwd=1) +
# 坐标轴
labs(x="log2FoldChange",
y="-log10 (Adjusted P-Value)")+
#这是 ggplot 的一种内置主题(“Black & White” 风格)
# • 白底、黑色边框
# • 没有背景网格
# 👉 很适合科学图、文章配图使用
theme_bw()+
#把图例(legend)放在图的下方(而不是默认的右边)
theme(legend.position='bottom',
#去掉图例的标题,只保留颜色对应的类别(例如 Up / Down / Stable),让图例更简洁。
legend.title = element_blank(),
#清除坐标区域的背景色,配合 theme_bw() 进一步 让背景透明、纯净。设置全图的字体为无衬线字体(如 Arial / Helvetica)
panel.background = element_blank(),text=element_text(family="sans"),
axis.title = element_text(size = 14),legend.text = element_text(size = 14),axis.text = element_text(size = 10))
lfcBlow0 <- res.df %>% dplyr::filter(log2FoldChange < 0,padj<0.05)
lfcAbove0 <- res.df %>% dplyr::filter(log2FoldChange > 0,padj<0.05)
vol<-p+geom_label_repel(data = lfcBlow0,
aes(
x = log2FoldChange,
y = -log10(padj),
fill=(factor(change)),
#alpha=0.9,
label = label),
xlim=c(NA,-1),
fill=lfcBlow0$color,
size = 2.5,max.overlaps=45,fontface="bold",
color="black", box.padding=unit(0.35, "lines"), point.padding=unit(0.5, "lines"),
segment.colour = "grey50",
arrow = arrow(length = unit(0.01,"npc"), type = "closed",
ends = "first", angle = 10),force = 1.5)+
geom_label_repel(data = lfcAbove0,
aes(
x = log2FoldChange,
y = -log10(padj),
fill=(factor(change)),
#alpha=0.9,
label = label),
xlim=c(1,NA),
fill=lfcAbove0$color,
size = 2.5,max.overlaps=45,fontface="bold",
color="black", box.padding=unit(0.35, "lines"), point.padding=unit(0.5, "lines"),
segment.colour = "grey50",
arrow = arrow(length = unit(0.01,"npc"), type = "closed",
ends = "first", angle = 10),force = 1.5)
return(vol)
}
# 函数说明
# 下面的 CountFilter 函数用于在差异表达分析之前,依据样本分组信息和表达量阈值过滤低表达基因。
#
# CountFilter <- function(count, meta) {
# # 提取不重复的分组名
# Group <- meta$Group[!duplicated(meta$Group)]
#
# # 统计每组的样本数
# n1 <- nrow(meta[meta$Group == Group[1], ])
# n2 <- nrow(meta[meta$Group == Group[2], ])
#
# # 取较小组的样本数作为过滤基准
# n <- ifelse(n1 < n2, n1, n2)
#
# # 保留在至少 n 个样本中表达量大于等于 10 的基因
# count <- count[rowSums(count >= 10) >= n, ]
#
# return(count)
# }
#count <- count[rowSums(count) >= 20, ]