zl程序教程

您现在的位置是:首页 >  数据库

当前栏目

多个数据集的整合分析

2023-02-19 12:23:53 时间

今天是平平无奇的整合分析,是数据挖掘中经常用到的一部分~

参考文献在这里⬇

  • A robust 6-mRNA signature for prognosis prediction of pancreatic ductal adenocarcinoma
  • DOI: 10.7150/ijbs.32899

文章的故事是从这句话开始的:prognostic gene signature by generating the differentially expressed mRNAs with P-value less than 0.001 and fold-change values of −2 to 2 between tumor and non-tumor samples in GSE15471, GSE28735, and GSE62452 datasets.

After downloading and normalizing the raw data CEL files from GEO and ArrayExpress by a robust multiarray averaging (RMA) method, we used the ‘Affy’ and ‘affycoretools’ packages of R software.DEGs were defined with P< 0.001 and |log2 fold change|> 1 as the cut-off criteria:

作者是直接下载cel格式的原始数据,然后用RMA函数获取表达矩阵,分别对三个数据集进行了差异分析,然后对差异分析取交集作了后续的分析。

我们也试试看吧——

# GSE15471, GSE28735 and GSE62452
rm(list = ls()) 

##全局设置
##下载的数据大小>131072字节,所以需要调整默认连接缓存,以便正常下载
Sys.setenv("VROOM_CONNECTION_SIZE"=99999999)
options(stringsAsFactors = F)
options(timeout = 999999999)

library(affy)
library(GEOquery)
library(oligo)

getwd()

if (F) {
# 1.数据解压到新建的文件夹中 ----------------------------------------------------------
# dir.create("../GSE15471")##创建新的文件夹
samPath <- "../GSE15471"
untar("../Rawdata/GSE15471_RAW.tar", exdir = samPath)##解压原始文件到sampath文件夹中
setwd(samPath)
list.files()##显示文件夹中的文件,及原始文件顺序

# 2.匹配临床信息,读取cel文件 ---------------------------------------------------------------
gse <- "GSE15471"
gset <- getGEO(gse)

###匹配临床信息
celFiles <- list.celfiles(samPath) ##listGzipped参数:Logical. List .CEL.gz files? 
celFiles
dat <- read.celfiles(filenames = celFiles, phenoData = phenoData(gset[[1]]), 
                     sampleNames = rownames(pData(gset[[1]])))
class(dat)  ##这样一来,表型等信息的数据就在这个dat对象中了

eset=rma(dat)
class(eset)
exprSet=exprs(eset)
exprSet[1:10,1:10]

dim(exprSet)
range(exprSet)


# 3.获取分组信息 ----------------------------------------------------------------
library(GEOquery)
pd <- pData(dat)
table(pd$characteristics_ch1.1)
group_list <- ifelse(grepl("normal",pd$characteristics_ch1.1),"normal","tumor")
table(group_list)

##判断一下样本名是否与表达矩阵的列名一一对应
if(!identical(rownames(pd),colnames(exprSet))) exprSet = exprSet[,match(rownames(pd),colnames(exprSet))] 

setwd("../Main/")
getwd()
save(exprSet,group_list,gse,file = paste0(gse,"step1_output.Rdata"))
}

这样我们就获得了表达矩阵和分组信息,接下来就是常规的探针注释和差异分析了。

##因为注释平台不一样,所以annotation这一步要单独做一下,然后就可以直接用source函数解决了
gselist <- c("GSE15471", "GSE28735","GSE62452")
for (i in gselist) {
  gse <- gselist[3] ##改一下这里的数字
  source("step2_check.R")
  source("step4_DEG.R")
  source("step5_degVisualise.R")
}

完事了呢,我们来比较一下我们的差异分析和文章的差异分析结果:

155 VS 153,数量差不多~

其实还有另外一种方法,就是RRA。之前的推文也介绍过这种算法,相较于简单的取交集,RRA会根据logFC值对交集基因再排个序:

rm(list = ls())

library(RobustRankAggreg)
library(clusterProfiler)
library(ggplotify)

##差异基因门槛:
logFC_t=1
P.Value_t = 0.001

# gselist <- c("GSE15471", "GSE28735","GSE62452")
load("GSE15471_deg.Rdata")
GSE15471_deg <- deg
load( "GSE28735_deg.Rdata")
GSE28735_deg <- deg
load("GSE62452_deg.Rdata")
GSE62452_deg <- deg

###上调基因-------------
get_up <- function(df){
    result=df[df$logFC>(logFC_t) & df$P.Value<P.Value_t,]
    result=result[order(result$logFC,decreasing = T),]
    rownames(result)
}
glist=list(get_up(GSE15471_deg),
           get_up(GSE28735_deg),
           get_up(GSE62452_deg))

ups=aggregateRanks(glist = glist, N = length(unique(unlist(glist))))
tmp=as.data.frame(table(unlist(glist)))
ups$Freq=tmp[match(ups$Name,tmp[,1]),2]
head(ups)

##画热图
gs=ups[ups$Score < 0.05,1] ##抽到的次数越多,score越小
gs
updat=data.frame(GSE15471=GSE15471_deg[gs,'logFC'],
                 GSE28735=GSE28735_deg[gs,'logFC'],
                 GSE62452=GSE62452_deg[gs,'logFC']
                 )

rownames(updat)=gs
library(pheatmap)
head(updat)
updat1 <- updat[1:30,]  ##太多了,取部分基因看看
pheatmap(updat,display_numbers=F)

###下调基因-----------
get_down <- function(df){
  result=df[df$logFC<(-logFC_t) & df$P.Value<P.Value_t,]
  result=result[order(result$logFC,decreasing = F),]
  rownames(result)
}
glist1=list(get_down(GSE15471_deg),
           get_down(GSE28735_deg),
           get_down(GSE62452_deg))

downs=aggregateRanks(glist = glist1, N = length(unique(unlist(glist1))))
tmp=as.data.frame(table(unlist(glist1)))
downs$Freq=tmp[match(downs$Name,tmp[,1]),2]
head(downs)
gs1=downs[downs$Score < 0.05,1] ##抽到的次数越多,score越小
gs1

downdat=data.frame(GSE15471=GSE15471_deg[gs1,'logFC'],
                   GSE28735=GSE28735_deg[gs1,'logFC'],
                   GSE62452=GSE62452_deg[gs1,'logFC']
)
rownames(downdat)=gs1
library(pheatmap)
head(downdat)
pheatmap(downdat,display_numbers=F)


####上下调基因一起画------------
union_RRA <- rbind(updat1,downdat)
library(ComplexHeatmap)
library(circlize)
col_fun <- colorRamp2(
  c(-2, 0, 2), 
  c("green", "white", "red")
)
p <- Heatmap(
  as.matrix(union_RRA), 
  name = "expression", 
  border = "black",
  col = col_fun,
  row_names_gp = gpar(fontsize=4),
  rect_gp = gpar(col = "white", lwd = 2)
);p
p1 <- as.ggplot(p)
ggsave("../outputs/RRA_heatmap.png", egg::set_panel_size(p1, width=unit(4.5, "in"), height=unit(5, "in")), 
       width = 8, height = 7, units = 'in', dpi = 300)