setwd("/Volumes/MacData/1_SynologyDrive/1_Job/05课程/03-课程-公选课-生命大数据与R语言——数据驱动的生命科学探索/文件/")

# 安装 BiocManager(如果尚未安装)
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager", quiet = TRUE)
}

# 静默检查并安装 CRAN 包
cran_pkgs <- c("ggplot2", "ggrepel", "dplyr")
for (pkg in cran_pkgs) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg, quiet = TRUE)
  }
  suppressPackageStartupMessages(library(pkg, character.only = TRUE))
}

# 静默安装并加载 Bioconductor 包
if (!requireNamespace("DESeq2", quietly = TRUE)) {
  BiocManager::install("DESeq2", quiet = TRUE, ask = FALSE)
}
suppressPackageStartupMessages(library(DESeq2))

#这里的75仅仅是我项目中的一个count的序号的标记,代码中还是使用count,便于函数的书写
Count75 <- readRDS("Count75.RDS")
print(Count75[1:5,1:5])
##              ENSEMBL SRR23104045 SRR23104046 SRR23104047 SRR23104048
## 1 ENSMUSG00000102628           0           0           0           0
## 2 ENSMUSG00000100595           0           0           1           0
## 3 ENSMUSG00000097426           0           0           0           0
## 4 ENSMUSG00000104478           0           0           0           0
## 5 ENSMUSG00000104385           0           0           1           0
Meta75 <- readRDS("Meta75.RDS")
print(Meta75[1:5,1:2])
##           Run   Group
## 1 SRR23104057 Control
## 2 SRR23104058 Control
## 3 SRR23104059 Control
## 4 SRR23104060 Control
## 5 SRR23104053     ICB
#geneid_df1.mus <- readRDS("~/1_home_gaoy/ICBS_r/xy/geneid_df1.mus.RDS")
geneid_df1.mus <- readRDS("geneid_df1.mus.RDS")
count <- Count75
count.use <- dplyr::left_join(geneid_df1.mus,count,by="ENSEMBL")
rownames(count.use) <- count.use$SYMBOL

meta <- Meta75
meta5<-meta[meta$Group %in% c("ICB", "Combination"),]

#根据列名取需要的count列,例如count.use[,"SRR23104053"]
count5<-as.data.frame(count.use[,meta5$Run])  
#判断是否一致,避免出错,一些同学会漏了meta5中的“5”,导致False
identical(colnames(count5),meta5$Run)
## [1] TRUE
meta <- meta5
#count5 <- count5[rowSums(count5) >= 20, ]

CountFilter=function(count,meta){
  Group<-meta$Group[!duplicated(meta$Group)] 
  #unique(meta$Group)
  n1<-nrow(meta[meta$Group==Group[1],]) 
  # length(which(meta$Group==Group[1]))
  #n1 <- meta %>% dplyr::filter(., Group == Group[1]) %>% dim() %>% .[1]
  n2<-nrow(meta[meta$Group==Group[2],])
  n<-ifelse(n1<n2,n1,n2)
  count = count[rowSums(count >=10) >= n,]
  return(count)
}  

count5<-CountFilter(count5,meta5)
library(DESeq2)

count <- count5
treatment <- "Combination"
Control <- "ICB"
dds <- DESeqDataSetFromMatrix(count, colData=meta, design= ~ Group)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res <- results(dds, contrast=c("Group",treatment,Control))
res$SYMBOL<-row.names(res)

res.df <- data.frame(res, stringsAsFactors = FALSE, check.names = FALSE)

res.df <- res.df[!is.na(res.df$padj), ]
# library (ggplot2)
# library(ggrepel)
# library(dplyr)


#(res.df$padj  < 0.05) & abs(res.df$log2FoldChange) >=1 


res.df$change <-  ifelse((res.df$padj  < 0.05) & (abs(res.df$log2FoldChange) >= 1),
                         ifelse(res.df$log2FoldChange> 1 ,'Up','Down'),
                         'Stable')

分步解释:

ifelse(条件, 为真时的值, 为假时的值) 是 R 的向量化条件判断函数。

外层判断:

(res.df$padj < 0.05) & (abs(res.df$log2FoldChange) >= 1)

👉 这是判断基因是否差异显著且表达变化足够大的标准

• 统计学显著性(FDR校正后):

padj < 0.05:

• 表达倍数差异至少是 2 倍(log2FoldChange ≥ 1 或 ≤ -1):

abs(log2FoldChange) >= 1

✅ 如果这两个条件都满足,就进入下一步判断。

内层判断:

ifelse(res.df$log2FoldChange > 1, 'Up', 'Down')

在差异显著的前提下:

• 如果 log2FoldChange > 1 → 表达上调,标注为 “Up”

• 否则(即 ≤ -1)→ 表达下调,标注为 “Down”

否则(不显著或变化太小):表示无显著表达变化,标注为 “Stable”

总结:

“Up” → 表达上调(显著&倍数大于2)

“Down” → 表达下调(显著&倍数小于1/2)

“Stable” → 不显著或倍数变化小

# 自定义差异基因颜色标注  
# https://colorhunt.co/
# 提取 res.df$change(表示基因是 Up / Down / Stable)中的 唯一值,并去除 NA,以确定当前结果中到底有哪些类型的变化。
#uniq_change <- res.df$change[!duplicated(res.df$change)]
#uniq_change <- uniq_change[!is.na(uniq_change)]  # 去除 NA
uniq_change <- res.df$change %>% unique()
  # 如果三种状态都有(Up、Down、Stable),就给三种颜色:
    # Down: #0072B5 蓝
    # Stable: grey 灰
    # Up: #BC3C28 红
    # 如果只有 Up 和 Stable:灰 + 红
    # 如果只有 Down 和 Stable:蓝 + 灰  
  if (length(uniq_change) == 3) {
    colorlist <- c("#0072B5", "grey", "#BC3C28")
  } else if ("Up" %in% uniq_change) {
    colorlist <- c("grey", "#BC3C28")
  } else {
    colorlist <- c("#0072B5", "grey")
}
  
  p <- ggplot( #设置数据
    res.df, 
    aes(x = log2FoldChange, 
        y = -log10(padj), 
        colour=change)) +
    geom_point(alpha=0.6, size=3) +
    scale_color_manual(values=colorlist,na.translate = FALSE )+
    #  "#0072B5","grey","#BC3C28"
    #   "#546de5", "#d2dae2","#ff4757"
    geom_vline(xintercept=c(-1,1),lty=3,col="grey",lwd=1) +
    geom_hline(yintercept = -log10(0.05),lty=3,col="grey",lwd=1) +
    # 坐标轴
    labs(x="log2FoldChange",
         # y="-log10(Adjusted P-Value)")+
         y=expression(-log[10](adjusted~italic(p)*"-value")))+
    theme_bw()+
    theme(legend.position='bottom', 
          legend.title = element_blank(),
          panel.background = element_blank(),
          text=element_text(family="sans"),
          axis.title = element_text(size = 14),
          legend.text = element_text(size = 14),
          axis.text = element_text(size = 10))
  print(p)

添加标签-简单参数

lfcBlow0 <- res.df %>% dplyr::filter(log2FoldChange < -1,padj<0.05)
lfcAbove0 <- res.df %>% dplyr::filter(log2FoldChange > 1,padj<0.05)
p+geom_label_repel(data = lfcBlow0, #指定用于添加标签的数据集为 lfcBlow0,通常是差异表达显著下调的基因子集
                   aes( # 映射美学属性(aes):
                     x = log2FoldChange, #x = log2FoldChange:x 轴是 log2 倍变化
                     y = -log10(padj), # y 轴是 padj 的负对数,padj 越小越靠上
                     label = SYMBOL), #每个点对应的基因符号显示为标签
                   xlim=c(NA,-1),#限制标签只出现在 x 轴小于 -1 的区域。
                   fill="#daeff8", # 标签框的背景颜色为淡蓝色(你用于下调基因的色调)
)
## Warning: ggrepel: 11 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

添加标签-详细调整

## 标签要分两块,一个是上调的基因标签,一个是下调的基因标签
lfcBlow0 <- res.df %>% dplyr::filter(log2FoldChange < -1,padj<0.05)
lfcAbove0 <- res.df %>% dplyr::filter(log2FoldChange > 1,padj<0.05)
p+geom_label_repel(data = lfcBlow0, #指定用于添加标签的数据集为 lfcBlow0,通常是差异表达显著下调的基因子集
                   aes( # 映射美学属性(aes):
                     x = log2FoldChange, #x = log2FoldChange:x 轴是 log2 倍变化
                     y = -log10(padj), # y 轴是 padj 的负对数,padj 越小越靠上
                     #fill=(factor(change)),
                     alpha=0.9, #透明度
                     label = SYMBOL), #每个点对应的基因符号显示为标签
                   xlim=c(NA,-1),#限制标签只出现在 x 轴小于 -1 的区域。
                   fill="#daeff8", # 标签框的背景颜色为淡蓝色(你用于下调基因的色调)
                   size = 3, #标签文字大小为 3
                   max.overlaps=25, # 最多允许重叠标签的数量为 30,避免太多标签造成混乱
                   fontface="bold", # 标签文字加粗显示
                   color="black", # 标签文字颜色为黑色
                   box.padding=unit(0.35, "lines"), # 标签框与周围标签之间的最小距离为 0.35 行高
                   point.padding=unit(0.5, "lines"), # 标签与其对应点之间的最小间距为 0.5 行高。
                   segment.colour = "grey50", # 标签与点之间的连线颜色为中灰色(灰度为 50%)。
                   min.segment.length = 0.01,#连线的最小长度为 0.01 NPC 单位(图形空间中的比例单位),防止太短的线让箭头堆在一起。
                   max.iter = 8000, #尝试优化排布的最大迭代次数,越大越精准,越慢,默认是 2000,这里设置为 8000,适用于标签密集场景。
                   force = 2, #标签排斥力,控制标签远离重叠的程度。值越大越分散,适合标签密集的情况。
                   segment.curvature = 0,#线条弯曲程度,设置为 0 表示直线。
                   #                    连线箭头设置:
                   #    •   length = 0.01 npc:箭头的长度(单位是图形比例单位)。
                   #    •   type = "closed":箭头为实心。
                   #    •   ends = "first":箭头在连线起点(靠近点的一端)。
                   #    •   angle = 10:箭头张角为 10°。
                   arrow = arrow(length = unit(0.01,"npc"), type = "closed",  
                                 ends = "first", angle = 10))+
  geom_label_repel(data = lfcAbove0,
                   aes(
                     x = log2FoldChange,
                     y = -log10(padj),
                     #fill=(factor(change)),
                     #alpha=0.9,
                     label = SYMBOL),
                   xlim=c(1,NA),
                   fill="#fde2e7",
                   size = 2.5,max.overlaps=20,fontface="bold",
                   color="black", box.padding=unit(0.35, "lines"), point.padding=unit(0.5, "lines"),
                   segment.colour = "grey50",
                   min.segment.length = 0.01,
                   max.iter = 8000,
                   force = 2,
                   segment.curvature = 0,
                   arrow = arrow(length = unit(0.01,"npc"), type = "closed", 
                                 ends = "first", angle = 10))
## Warning: ggrepel: 587 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

#✅ force = 1
# • 作用:控制标签之间互相排斥的“斥力”大小,防止重叠。
# • 值越大:标签之间的间距越大,更不容易重叠,但布局可能更松散。
# • 默认值:1
# • 建议调节范围:0.1 ~ 5
# 
# ✅ 用于:让标签分得更开。

# ✅ max.iter = 3000
# • 作用:最大迭代次数,控制排布算法的尝试次数。
# • 值越大:布局计算更精细,可能避免重叠,但也可能导致绘图变慢。
# • 默认值:2000
# 
# ✅ 用于:在复杂图中提高标签排布质量。
# 
# ✅ min.segment.length = 0.1
# • 作用:设置引导线(segment)的最短长度(以“npc”为单位,图像的相对长度)。
# • 值越小:允许更短的引导线,从而标签可以更靠近点。
# • 默认值:0.5
# 
# ✅ 用于:让标签贴得更近,不让引导线太长。

封装标签函数

fun_label <- function(data,color.lable,a,b,max.overlaps){
  geom_label_repel(data = data,
                   aes(
                     x = log2FoldChange,
                     y = -log10(padj),
                     #fill=(factor(change)),
                     #alpha=0.9,
                     label = SYMBOL),
                   xlim=c(a,b),
                   fill= color.lable,
                   size = 2.5,max.overlaps=max.overlaps,fontface="bold",
                   color="black", box.padding=unit(0.35, "lines"), point.padding=unit(0.5, "lines"),
                   segment.colour = "grey50",
                   min.segment.length = 0.01,
                   max.iter = 8000,
                   force = 2,
                   segment.curvature = 0,
                   arrow = arrow(length = unit(0.01,"npc"), type = "closed", 
                                 ends = "first", angle = 10))
  
}

用上面的封装函数

# 
lfcBlow0 <- res.df %>% dplyr::filter(log2FoldChange < -1,padj<0.05)
lfcAbove0 <- res.df %>% dplyr::filter(log2FoldChange > 1,padj<0.05)
p+fun_label(lfcBlow0,"#daeff8",NA,-1,20)+fun_label(lfcAbove0,"#fde2e7",1,NA,20)
## Warning: ggrepel: 587 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

## 写成总函数

volcano_plot<-function(res.df,cut_off_logFC,cut_off_padj,lab){


  res.df$change = ifelse((res.df$padj  < cut_off_padj) & (abs(res.df$log2FoldChange) >= cut_off_logFC),
                         ifelse(res.df$log2FoldChange> cut_off_logFC ,'Up','Down'),
                         'Stable')
                         
  res.df$label = ifelse(res.df$padj < cut_off_padj & abs(res.df$log2FoldChange) >=cut_off_logFC, as.character(res.df$SYMBOL),"")    


  uniq_change <- res.df$change %>% unique()   
  if (length(uniq_change) == 3) {
      colorlist <- c("#0072B5", "grey", "#BC3C28")
    } else if ("Up" %in% uniq_change) {
      colorlist <- c("grey", "#BC3C28")
    } else {
      colorlist <- c("#0072B5", "grey")
  }
  
  
  p <- ggplot( #设置数据
    res.df, 
    aes(x = log2FoldChange, 
        y = -log10(padj), 
        colour=change)) +
    geom_point(alpha=0.6, size=3) +
    scale_color_manual(values=colorlist,na.translate = FALSE )+
    #  "#0072B5","grey","#BC3C28"
    #   "#546de5", "#d2dae2","#ff4757"
    geom_vline(xintercept=c(-1,1),lty=3,col="grey",lwd=1) +
    geom_hline(yintercept = -log10(0.05),lty=3,col="grey",lwd=1) +
    # 坐标轴
    labs(x="log2FoldChange",
         # y="-log10(Adjusted P-Value)")+
         y=expression(-log[10](adjusted~italic(p)*"-value")))+
    theme_bw()+
    theme(legend.position='bottom', 
          legend.title = element_blank(),
          panel.background = element_blank(),
          text=element_text(family="sans"),
          axis.title = element_text(size = 14),
          legend.text = element_text(size = 14),
          axis.text = element_text(size = 10))
#return(p)
  
fun_label <- function(data,color.lable,a,b,max.overlaps){
  geom_label_repel(data = data,
                   aes(
                     x = log2FoldChange,
                     y = -log10(padj),
                     #fill=(factor(change)),
                     #alpha=0.9,
                     label = SYMBOL),
                   xlim=c(a,b),
                   fill= color.lable,
                   size = 2.5,max.overlaps=max.overlaps,fontface="bold",
                   color="black", box.padding=unit(0.35, "lines"), point.padding=unit(0.5, "lines"),
                   segment.colour = "grey50",
                   min.segment.length = 0.01,
                   max.iter = 8000,
                   force = 2,
                   segment.curvature = 0,
                   arrow = arrow(length = unit(0.01,"npc"), type = "closed", 
                                 ends = "first", angle = 10))
  
}
  

lfcBlow0 <- res.df %>% dplyr::filter(log2FoldChange < -cut_off_logFC,padj<cut_off_padj)
lfcAbove0 <- res.df %>% dplyr::filter(log2FoldChange > cut_off_logFC,padj<cut_off_padj)

p.final <-p+fun_label(lfcBlow0,"#daeff8",NA,-1,lab)+fun_label(lfcAbove0,"#fde2e7",1,NA,lab)

return(p.final)

}

写成函数

volcano_plot<-function(res.df,cut_off_logFC,cut_off_padj){
  res.df$change = ifelse((res.df$padj  < cut_off_padj) & (abs(res.df$log2FoldChange) >= cut_off_logFC),
                         ifelse(res.df$log2FoldChange> cut_off_logFC ,'Up','Down'),
                         'Stable')
  res.df$label = ifelse(res.df$padj < cut_off_padj & abs(res.df$log2FoldChange) >=cut_off_logFC, as.character(res.df$SYMBOL),"")
  
  N_up<-res.df %>% dplyr::filter(padj < cut_off_padj & log2FoldChange > cut_off_logFC) %>% arrange(desc(log2FoldChange))
  FC_up<-ifelse(nrow(N_up)>45,N_up$log2FoldChange[45],N_up$log2FoldChange[nrow(N_up)])
  N_down<-res.df %>% dplyr::filter((padj < cut_off_padj) & (log2FoldChange < (-cut_off_logFC))) %>% arrange(log2FoldChange)
  FC_down<-ifelse(nrow(N_down)>45,N_down$log2FoldChange[45],N_down$log2FoldChange[nrow(N_down)])
  res.df$label = ifelse((res.df$padj < cut_off_padj & (res.df$log2FoldChange >=FC_up | res.df$log2FoldChange <= FC_down)), as.character(res.df$SYMBOL),"")
  res.df<-res.df %>% dplyr::mutate(color=ifelse(label=="","",ifelse(change=="Up","#fde2e7","#daeff8")))
  # if(res.df$change[!duplicated(res.df$change)] %>% length() ==3){
  #   colorlist<-c("#0072B5","grey","#BC3C28")
  # }else if("Up" %in% res.df$change[!duplicated(res.df$change)]){
  #   colorlist<-c("grey","#BC3C28")
  # }else{
  #   colorlist<-c("#0072B5","grey")
  # }
  uniq_change <- res.df$change[!duplicated(res.df$change)]
  uniq_change <- uniq_change[!is.na(uniq_change)]  # 去除 NA
  
  
  #https://colorhunt.co/
  
  
  if (length(uniq_change) == 3) {
    colorlist <- c("#0072B5", "grey", "#BC3C28")
  } else if ("Up" %in% uniq_change) {
    colorlist <- c("grey", "#BC3C28")
  } else {
    colorlist <- c("#0072B5", "grey")
  }
  # 火山图主图绘制
#   x 轴:log2FoldChange(对数倍数变化)
#   y 轴:-log10(padj)(显著性)
#   用 color=change 控制点的颜色(Up/Down/Stable)
#   设置虚线阈值:|log2FC| > 1,padj < 0.05
  p <- ggplot( #设置数据
    res.df, 
    aes(x = log2FoldChange, 
        y = -log10(padj), 
        colour=change)) +
    geom_point(alpha=0.6, size=3) +
    #ggplot(res.df,aes(x=log2FoldChange,y= -log10(padj),colour=change))+geom_point(size=3)+scale_color_manual(values=(c("red","grey","blue")))
    scale_color_manual(values=colorlist,na.translate = FALSE )+
    #  "#0072B5","grey","#BC3C28"
    #   "#546de5", "#d2dae2","#ff4757"
    
    
# geom_vline() 添加竖直参考线(vertical line)
# xintercept
# 设置竖线的位置,这里是 正负cut_off_10gFC
# (即 log2FoldChange 的阈值)
# lty = 3 虚线线型 (line type 3= 点划线)
# col = "grey" 线的颜色为灰色
# lwd = 1 线的粗细为 1
    geom_vline(xintercept=c(-1,1),lty=3,col="grey",lwd=1) +
    geom_hline(yintercept = -log10(0.05),lty=3,col="grey",lwd=1) +
    # 坐标轴
    labs(x="log2FoldChange",
         y="-log10 (Adjusted P-Value)")+
#这是 ggplot 的一种内置主题(“Black & White” 风格)
#   •   白底、黑色边框
#   •   没有背景网格
# 👉 很适合科学图、文章配图使用
    theme_bw()+
#把图例(legend)放在图的下方(而不是默认的右边)
     theme(legend.position='bottom', 
#去掉图例的标题,只保留颜色对应的类别(例如 Up / Down / Stable),让图例更简洁。
           legend.title = element_blank(),
#清除坐标区域的背景色,配合 theme_bw() 进一步 让背景透明、纯净。设置全图的字体为无衬线字体(如 Arial / Helvetica)
           panel.background = element_blank(),text=element_text(family="sans"),
          axis.title = element_text(size = 14),legend.text = element_text(size = 14),axis.text = element_text(size = 10))
  lfcBlow0 <- res.df %>% dplyr::filter(log2FoldChange < 0,padj<0.05)
  lfcAbove0 <- res.df %>% dplyr::filter(log2FoldChange > 0,padj<0.05)
  vol<-p+geom_label_repel(data = lfcBlow0,
                          aes(
                            x = log2FoldChange,
                            y = -log10(padj),
                            fill=(factor(change)),
                            #alpha=0.9,
                            label = label),
                          xlim=c(NA,-1),
                          fill=lfcBlow0$color,
                          size = 2.5,max.overlaps=45,fontface="bold",
                          color="black", box.padding=unit(0.35, "lines"), point.padding=unit(0.5, "lines"),
                          segment.colour = "grey50",
                          arrow = arrow(length = unit(0.01,"npc"), type = "closed", 
                                        ends = "first", angle = 10),force = 1.5)+
    geom_label_repel(data = lfcAbove0,
                     aes(
                       x = log2FoldChange,
                       y = -log10(padj),
                       fill=(factor(change)),
                       #alpha=0.9,
                       label = label),
                     xlim=c(1,NA),
                     fill=lfcAbove0$color,
                     size = 2.5,max.overlaps=45,fontface="bold",
                     color="black", box.padding=unit(0.35, "lines"), point.padding=unit(0.5, "lines"),
                     segment.colour = "grey50",
                     arrow = arrow(length = unit(0.01,"npc"), type = "closed", 
                                   ends = "first", angle = 10),force = 1.5)
  return(vol)
}





# 函数说明
# 下面的 CountFilter 函数用于在差异表达分析之前,依据样本分组信息和表达量阈值过滤低表达基因。
# 
# CountFilter <- function(count, meta) {
#   # 提取不重复的分组名
#   Group <- meta$Group[!duplicated(meta$Group)]
#   
#   # 统计每组的样本数
#   n1 <- nrow(meta[meta$Group == Group[1], ])
#   n2 <- nrow(meta[meta$Group == Group[2], ])
#   
#   # 取较小组的样本数作为过滤基准
#   n <- ifelse(n1 < n2, n1, n2)
#   
#   # 保留在至少 n 个样本中表达量大于等于 10 的基因
#   count <- count[rowSums(count >= 10) >= n, ]
#   
#   return(count)
# }
#count <- count[rowSums(count) >= 20, ]