zl程序教程

您现在的位置是:首页 >  其它

当前栏目

转录组差异分析这样做能行吗?

分析 差异 这样 转录 能行
2023-06-13 09:14:17 时间

缘起

前段时间,我们分享了转录组三种常见差异分析的推文以及单样本1V1进行差异分析的推文。对单个样本进行差异分析时,我们能获得相应的差异基因。在转录组三种常见差异分析的推文中,我们利用取交集的方式看了下三种方法获得共同差异基因的交集情况。曾老师提出了一个有趣的猜想,试想如果我们将3V3的样本拆分成3次1V1进行差异分析,是否会出现什么有趣的现象呢。为了让结果可比,我们就用上次的数据集GSE190114吧。此次,我们除了关注3次1V1差异分析上调与下调差异基因分别共同的交集情况之外,还将关注3种常见分析方法的上调与下调差异基因分别与拆分成3次1V1差异分析的上调与下调差异基因的共同交集情况,「用于探究是否能够拆分成3次1V1后进行差异分析」。话不多说,由于此次所使用的数据与上次一样,对此次的探究描述与数据集介绍感兴趣的小伙伴,请移驾至三种转录组差异分析方法及区别你会了吗?

0.清控环境,加载包

rm(list = ls())
getwd()
## [1] "C:/Users/lj/Desktop/公众号推文/第九周"
library(limma)
library(EnhancedVolcano)
library(edgeR)
library(FactoMineR)
library(factoextra)
library(clusterProfiler)
library(org.Mm.eg.db)
library(stringr)
library(stringi)
library(tidyverse)
library(ggplot2)
library(patchwork)
library(pheatmap)
library(VennDiagram)
library(RColorBrewer)
library(patchwork)
library(ggplotify)
library(ggvenn)

1.读取数据,获得拆分后3组1V1的基因表达矩阵以及cpm矩阵

rawcount <- read.table("./GSE190114_RNA_Seq_CountMatrix.txt",header = T,sep="\t",row.names = 1)
colnames(rawcount)
## [1] "WT"     "WT.2"   "WT.3"   "KI2.1"  "KI2.2"  "KI2.3"  "KIKI.1" "KIKI.2"
## [9] "KIKI.3"
rawcount[1:4,1:4]
##           WT WT.2 WT.3 KI2.1
## A1BG     176  292  151   235
## A1BG-AS1 242  366  179   361
## A1CF       2    5    2    18
## A2M        5    1    3    18
# 本身就是基因表达矩阵(无需降重与ID转换)
# 选择二分组的样本
rawcount=rawcount[,1:6]
colnames(rawcount)
## [1] "WT"    "WT.2"  "WT.3"  "KI2.1" "KI2.2" "KI2.3"
## 拆分成3个分组;获取基因表达矩阵与cpm矩阵,准备进行三个样本的单样本分析
rawcount1=rawcount[,c(1,4)]
colnames(rawcount1)
## [1] "WT"    "KI2.1"
rawcount2=rawcount[,c(2,5)]
rawcount3=rawcount[,c(3,6)]

keep1 <- rowSums(rawcount1>0) >= floor(0.75*ncol(rawcount1))
filter_count1 <- rawcount1[keep1,] #获得filter_count矩阵
keep2 <- rowSums(rawcount2>0) >= floor(0.75*ncol(rawcount2))
filter_count2 <- rawcount2[keep2,] #获得filter_count矩阵
keep3 <- rowSums(rawcount3>0) >= floor(0.75*ncol(rawcount3))
filter_count3 <- rawcount3[keep3,] #获得filter_count矩阵

express_cpm1 <- log2(cpm(filter_count1)+ 1)
express_cpm1[1:2,1:2] #获得cpm矩阵
##                WT    KI2.1
## A1BG     1.717622 1.747681
## A1BG-AS1 2.052169 2.208695
express_cpm2 <- log2(cpm(filter_count2)+ 1)
express_cpm2[1:2,1:2] #获得cpm矩阵
##              WT.2    KI2.2
## A1BG     1.733224 1.887287
## A1BG-AS1 1.968583 2.072286
express_cpm3 <- log2(cpm(filter_count3)+ 1)
express_cpm3[1:2,1:2] #获得cpm矩阵
##              WT.3    KI2.3
## A1BG     1.620961 1.633474
## A1BG-AS1 1.791070 2.339318

2.第一组1V1单样本的差异分析(重点模拟4V4;limma具有欺骗功能)

exprSet <- filter_count1
exprSet1=cbind(
  exprSet[,1:2],
  exprSet[,1:2],
  exprSet[,1:2],
  exprSet[,1:2]
) 

# 查看分组信息和表达矩阵数据
group1=rep(c('WT1','KI2_1'),4)
group_list1=factor(group1,levels = c('WT1','KI2_1'))
table(group_list1)#检查一下组别数量
## group_list1
##   WT1 KI2_1 
##     4     4
#创建设计矩阵和对比:假设数据符合分布,构建线性模型
design1 <- model.matrix(~0+factor(group_list1))
colnames(design1) <- levels(factor(group_list1))
rownames(design1) <- colnames(exprSet1)
#设置需要进行对比的分组
comp1 <- 'KI2_1-WT1'
cont.matrix1 <- makeContrasts(contrasts=c(comp1),levels = design1)
#进行差异表达分析
dge1 <- DGEList(counts=exprSet1)
v1 <- voom(dge1,design1,plot=TRUE, normalize="quantile")
fit1 <- lmFit(v1, design1)
fit1 <- contrasts.fit(fit1,cont.matrix1)
fit1 <- eBayes(fit1)
tmp1 <- topTable(fit1, coef=comp1, n=Inf,adjust.method="BH")
DEG_limma_voom1 <- na.omit(tmp1)
head(DEG_limma_voom1)
##            logFC     AveExpr             t       P.Value     adj.P.Val        B
## ZNF302 -4.820303 -1.35848446 -7.976083e+18 1.369391e-118 6.607816e-116 260.2507
## MIR675 -4.388997 -0.95178652 -7.355088e+18 2.289856e-118 9.042578e-116 260.1893
## CDH11  -6.096823 -1.83874433 -9.540170e+18 4.398097e-119 3.750684e-116 260.1692
## UNC13A -6.096823 -1.83874433 -9.540170e+18 4.398097e-119 3.750684e-116 260.1692
## RUNX2  -3.041175  0.04715971 -5.752666e+18 1.088205e-117 1.762830e-115 260.1678
## DTX3   -8.288663 -3.30746580 -1.251738e+19 7.853752e-120 2.281684e-116 260.1673
EnhancedVolcano(DEG_limma_voom1,
                lab =rownames(DEG_limma_voom1),
                x = 'logFC',
                y = 'adj.P.Val')
fc <- 2
p <- 0.05
DEG_limma_voom1$regulated <- ifelse(DEG_limma_voom1$logFC>log2(fc)&DEG_limma_voom1$adj.P.Val<p,
                                    "up",ifelse(DEG_limma_voom1$logFC<(-log2(fc))&DEG_limma_voom1$adj.P.Val,"down","normal"))


table(DEG_limma_voom1$regulated)
## 
##   down normal     up 
##   1947  15601   2241

3.第二组1V1单样本的差异分析

exprSet <- filter_count2
exprSet2=cbind(
  exprSet[,1:2],
  exprSet[,1:2],
  exprSet[,1:2],
  exprSet[,1:2]
) 
# 查看分组信息和表达矩阵数据
group2=rep(c('WT2','KI2_2'),4)
group_list2=factor(group2,levels = c('WT2','KI2_2'))
table(group_list2)#检查一下组别数量
## group_list2
##   WT2 KI2_2 
##     4     4
#创建设计矩阵和对比:假设数据符合分布,构建线性模型
design2 <- model.matrix(~0+factor(group_list2))
colnames(design2) <- levels(factor(group_list2))
rownames(design2) <- colnames(exprSet2)
comp2 <- 'KI2_2-WT2'
cont.matrix2 <- makeContrasts(contrasts=c(comp2),levels = design2)
#进行差异表达分析
dge2 <- DGEList(counts=exprSet2)
v2 <- voom(dge2,design2,plot=TRUE, normalize="quantile")
fit2 <- lmFit(v2, design2)
fit2 <- contrasts.fit(fit2,cont.matrix2)
fit2 <- eBayes(fit2)
tmp2 <- topTable(fit2, coef=comp2, n=Inf,adjust.method="BH")
DEG_limma_voom2 <- na.omit(tmp2)
head(DEG_limma_voom2)
##              logFC      AveExpr            t       P.Value     adj.P.Val
## TMEM27    2.843764 -0.416357146 9.490563e+18 3.671715e-119 8.819086e-116
## CYP2J2    3.662103 -0.401951573 1.087962e+19 1.542764e-119 8.819086e-116
## ABCG2     2.907810  0.447487988 9.383159e+18 3.946809e-119 8.819086e-116
## CYP1A1    2.929636  0.584124556 9.250964e+18 4.318793e-119 8.819086e-116
## LINC00319 2.313701  0.005680239 8.177598e+18 9.448536e-119 8.819086e-116
## ABCA13    2.403420 -0.560656423 8.263021e+18 8.845368e-119 8.819086e-116
##                  B
## TMEM27    263.8117
## CYP2J2    263.7216
## ABCG2     263.6639
## CYP1A1    263.5480
## LINC00319 263.4814
## ABCA13    263.4456
EnhancedVolcano(DEG_limma_voom2,
                lab =rownames(DEG_limma_voom2),
                x = 'logFC',
                y = 'adj.P.Val')
fc <- 2
p <- 0.05
DEG_limma_voom2$regulated <- ifelse(DEG_limma_voom2$logFC>log2(fc)&DEG_limma_voom2$adj.P.Val<p,
                                    "up",ifelse(DEG_limma_voom2$logFC<(-log2(fc))&DEG_limma_voom2$adj.P.Val,"down","normal"))
table(DEG_limma_voom2$regulated)
## 
##   down normal     up 
##   1944  15558   2345

4. 第三组1V1单样本差异分析

exprSet <- filter_count3
exprSet3=cbind(
  exprSet[,1:2],
  exprSet[,1:2],
  exprSet[,1:2],
  exprSet[,1:2]
) 
# 查看分组信息和表达矩阵数据
group3=rep(c('WT3','KI3_3'),4)
group_list3=factor(group3,levels = c('WT3','KI3_3'))
table(group_list3)#检查一下组别数量
## group_list3
##   WT3 KI3_3 
##     4     4
#创建设计矩阵和对比:假设数据符合分布,构建线性模型
design3 <- model.matrix(~0+factor(group_list3))
colnames(design3) <- levels(factor(group_list3))
rownames(design3) <- colnames(exprSet3)
design3
##       WT3 KI3_3
## WT.3    1     0
## KI2.3   0     1
## WT.3    1     0
## KI2.3   0     1
## WT.3    1     0
## KI2.3   0     1
## WT.3    1     0
## KI2.3   0     1
## attr(,"assign")
## [1] 1 1
## attr(,"contrasts")
## attr(,"contrasts")$`factor(group_list3)`
## [1] "contr.treatment"
#设置需要进行对比的分组
comp3 <- 'KI3_3-WT3'
cont.matrix3 <- makeContrasts(contrasts=c(comp3),levels = design3)
#进行差异表达分析
dge3 <- DGEList(counts=exprSet3)
v3 <- voom(dge3,design3,plot=TRUE, normalize="quantile")
fit3 <- lmFit(v3, design3)
fit3 <- contrasts.fit(fit3,cont.matrix3)
fit3 <- eBayes(fit3)
tmp3 <- topTable(fit3, coef=comp3, n=Inf,adjust.method="BH")
DEG_limma_voom3 <- na.omit(tmp3)
head(DEG_limma_voom3)
##            logFC       AveExpr             t       P.Value     adj.P.Val
## SNTB1  -2.775061  0.0672863404 -5.938358e+18 5.700051e-118 6.754097e-115
## LBH    -3.372135  0.3162153907 -6.580706e+18 2.968066e-118 6.754097e-115
## AMPH   -3.317054 -0.1086660275 -6.505961e+18 3.191492e-118 6.754097e-115
## TDRD5   3.191238 -0.2396085692  6.186625e+18 4.394062e-118 6.754097e-115
## MCOLN3  2.546560  0.0009733578  5.494929e+18 9.332954e-118 6.754097e-115
## MAGEE1 -2.626896 -0.2414293712 -5.573306e+18 8.529823e-118 6.754097e-115
##               B
## SNTB1  261.1710
## LBH    261.1250
## AMPH   261.1165
## TDRD5  260.9435
## MCOLN3 260.9433
## MAGEE1 260.9405
EnhancedVolcano(DEG_limma_voom3,
                lab =rownames(DEG_limma_voom3),
                x = 'logFC',
                y = 'adj.P.Val')
fc <- 2
p <- 0.05
DEG_limma_voom3$regulated <- ifelse(DEG_limma_voom3$logFC>log2(fc)&DEG_limma_voom3$adj.P.Val<p,
                                    "up",ifelse(DEG_limma_voom3$logFC<(-log2(fc))&DEG_limma_voom3$adj.P.Val,"down","normal"))

table(DEG_limma_voom3$regulated)
## 
##   down normal     up 
##   2054  15379   2217

5.3个1V1分组中上下调差异基因以及整体差异基因的交集情况

# 01先获取第一组比较的上调基因,下调基因与差异基因
deg1 <- rownames(DEG_limma_voom1[DEG_limma_voom1$regulated!="normal",])
up1 <- rownames(DEG_limma_voom1[DEG_limma_voom1$regulated=="up",])
down1 <- rownames(DEG_limma_voom1[DEG_limma_voom1$regulated=="down",])
# 02再获取第二组比较的上调基因,下调基因与差异基因
deg2 <- rownames(DEG_limma_voom2[DEG_limma_voom2$regulated!="normal",])
up2 <- rownames(DEG_limma_voom2[DEG_limma_voom2$regulated=="up",])
down2 <- rownames(DEG_limma_voom2[DEG_limma_voom2$regulated=="down",])
# 03再获取第三组比较的上调基因,下调基因与差异基因
deg3 <- rownames(DEG_limma_voom3[DEG_limma_voom3$regulated!="normal",])
up3 <- rownames(DEG_limma_voom3[DEG_limma_voom3$regulated=="up",])
down3 <- rownames(DEG_limma_voom3[DEG_limma_voom3$regulated=="down",])


# 整体差异基因的交集情况
deg<-list(`deg1`=deg1,
          `deg2`=deg2,
          `deg3`=deg3)
p5 <- ggvenn(deg,
             show_percentage = F,
             stroke_color = "white",
             fill_color = c("#42B540FF","#925E9FFF","#00468BFF"),
             set_name_color = c("#ff0000","#4a9b83","#b2d4ec"))

# 整体上调差异基因的交集情况

up<-list(`up1`=up1,
         `up2`=up2,
         `up3`=up3)
p6 <- ggvenn(up,
             show_percentage = F,
             stroke_color = "white",
             fill_color = c("#8491B4B2","#91D1C2B2","#DC0000B2"),
             set_name_color = c("#ff0000","#4a9b83","#b2d4ec"))

down<-list(`down1`=down1,
           `down2`=down2,
           `down3`=down3)
p7 <- ggvenn(down,
             show_percentage = F,
             stroke_color = "white",
             fill_color = c("#DF8F4499","#00A1D599","#B2474599"),
             set_name_color = c("#ff0000","#4a9b83","#b2d4ec"))
p5+p6+p7

6.三组1V1差异分析与三种差异分析方法获得的上下调共同差异基因的交集

# 01先获取3种差异分析方法的共同上调基因与下调基因(直接读取上一篇推文结果即可;取三种方法的共同交集即可,取法如下面的02)
up_method=read.csv('up_method.csv')
up_method=up_method$x
down_method=read.csv('down_method.csv')
down_method=down_method$x
# 02获取3组1V1样本差异分析的共同上调以及下调基因
deg_sample=intersect(deg1,deg2)
deg_sample=intersect(deg_sample,deg3)
up_sample=intersect(up1,up2)
up_sample= intersect(up_sample,up3) 
down_sample=intersect(down1,down2)
down_sample= intersect(down_sample,down3)


# 03看3差异分析方法
up<-list(`up_method`=up_method,
         `up_sample`=up_sample)
p8 <- ggvenn(up,
             show_percentage = F,
             stroke_color = "white",
             fill_color = c("#AD002AFF","#8491B4B2"),
             set_name_color = c("#ff0000","#4a9b83"))

down<-list(`down_method`=down_method,
           `down_sample`=down_sample)
p9 <- ggvenn(down,
             show_percentage = F,
             stroke_color = "white",
             fill_color = c("#0099B4FF","#FDAF91FF"),
             set_name_color = c("#ff0000","#4a9b83"))

p8+p9

「我们主要关注两个结果」:1. 将3V3的样本拆分成3次1V1差异分析之后,它们之间的共同上调与共同下调差异基因以及共同差异基因之间分别的交集情况怎么样。2. 3次1V1差异分析的共同上调与共同下调基因的交集分别与对应的3种转录组常规差异分析方法获得的共同上调与共同下调基因交集情况怎么样。

从给出的第一张Venn图可知,3次1V1差异分析,其共同上调与共同下调的差异基因数量是比较多的。尤为注意的是,第二张Veen图结果表明三种方法分别取的共同上调基因交集与共同下调基因有将近6成-8成的差异基因在3次1V1单样本差异分析中也获得了。

「猜想:」那是否对于异质性相对不大的细胞实验,如某些药物处理时,做转录组差异分析我们做1V1单样本间的差异分析也可以呢?3次1V1与3种方法获得的共同上调差异基因的交集与共同下调差异基因的交集,即图中的662与576是否更能够反映实验现象,或者说三种常规方法获得的共同差异基因与三次1V1获得的共同差异基因进一步取交集能否有效Zoom in,能否进一步获得真正能够发挥作用的下游作用的关键分子呢?