您现在的位置是：首页 > 数据库

当前栏目

多个数据集的整合分析

2023-02-19 12:23:53 时间

今天是平平无奇的整合分析，是数据挖掘中经常用到的一部分~

参考文献在这里⬇

A robust 6-mRNA signature for prognosis prediction of pancreatic ductal adenocarcinoma
DOI: 10.7150/ijbs.32899

文章的故事是从这句话开始的：prognostic gene signature by generating the differentially expressed mRNAs with P-value less than 0.001 and fold-change values of −2 to 2 between tumor and non-tumor samples in GSE15471, GSE28735, and GSE62452 datasets.

After downloading and normalizing the raw data CEL files from GEO and ArrayExpress by a robust multiarray averaging (RMA) method, we used the ‘Affy’ and ‘affycoretools’ packages of R software.DEGs were defined with P< 0.001 and |log2 fold change|> 1 as the cut-off criteria：

作者是直接下载cel格式的原始数据，然后用RMA函数获取表达矩阵，分别对三个数据集进行了差异分析，然后对差异分析取交集作了后续的分析。

我们也试试看吧——

# GSE15471, GSE28735 and GSE62452
rm(list = ls()) 

##全局设置
##下载的数据大小>131072字节，所以需要调整默认连接缓存，以便正常下载
Sys.setenv("VROOM_CONNECTION_SIZE"=99999999)
options(stringsAsFactors = F)
options(timeout = 999999999)

library(affy)
library(GEOquery)
library(oligo)

getwd()

if (F) {
# 1.数据解压到新建的文件夹中 ----------------------------------------------------------
# dir.create("../GSE15471")##创建新的文件夹
samPath <- "../GSE15471"
untar("../Rawdata/GSE15471_RAW.tar", exdir = samPath)##解压原始文件到sampath文件夹中
setwd(samPath)
list.files()##显示文件夹中的文件，及原始文件顺序

# 2.匹配临床信息，读取cel文件 ---------------------------------------------------------------
gse <- "GSE15471"
gset <- getGEO(gse)

###匹配临床信息
celFiles <- list.celfiles(samPath) ##listGzipped参数：Logical. List .CEL.gz files? 
celFiles
dat <- read.celfiles(filenames = celFiles, phenoData = phenoData(gset[[1]]), 
                     sampleNames = rownames(pData(gset[[1]])))
class(dat)  ##这样一来，表型等信息的数据就在这个dat对象中了

eset=rma(dat)
class(eset)
exprSet=exprs(eset)
exprSet[1:10,1:10]

dim(exprSet)
range(exprSet)


# 3.获取分组信息 ----------------------------------------------------------------
library(GEOquery)
pd <- pData(dat)
table(pd$characteristics_ch1.1)
group_list <- ifelse(grepl("normal",pd$characteristics_ch1.1),"normal","tumor")
table(group_list)

##判断一下样本名是否与表达矩阵的列名一一对应
if(!identical(rownames(pd),colnames(exprSet))) exprSet = exprSet[,match(rownames(pd),colnames(exprSet))] 

setwd("../Main/")
getwd()
save(exprSet,group_list,gse,file = paste0(gse,"step1_output.Rdata"))
}

这样我们就获得了表达矩阵和分组信息，接下来就是常规的探针注释和差异分析了。

##因为注释平台不一样，所以annotation这一步要单独做一下，然后就可以直接用source函数解决了
gselist <- c("GSE15471", "GSE28735","GSE62452")
for (i in gselist) {
  gse <- gselist[3] ##改一下这里的数字
  source("step2_check.R")
  source("step4_DEG.R")
  source("step5_degVisualise.R")
}

完事了呢，我们来比较一下我们的差异分析和文章的差异分析结果：

155 VS 153，数量差不多~

其实还有另外一种方法，就是RRA。之前的推文也介绍过这种算法，相较于简单的取交集，RRA会根据logFC值对交集基因再排个序：

rm(list = ls())

library(RobustRankAggreg)
library(clusterProfiler)
library(ggplotify)

##差异基因门槛：
logFC_t=1
P.Value_t = 0.001

# gselist <- c("GSE15471", "GSE28735","GSE62452")
load("GSE15471_deg.Rdata")
GSE15471_deg <- deg
load( "GSE28735_deg.Rdata")
GSE28735_deg <- deg
load("GSE62452_deg.Rdata")
GSE62452_deg <- deg

###上调基因-------------
get_up <- function(df){
    result=df[df$logFC>(logFC_t) & df$P.Value<P.Value_t,]
    result=result[order(result$logFC,decreasing = T),]
    rownames(result)
}
glist=list(get_up(GSE15471_deg),
           get_up(GSE28735_deg),
           get_up(GSE62452_deg))

ups=aggregateRanks(glist = glist, N = length(unique(unlist(glist))))
tmp=as.data.frame(table(unlist(glist)))
ups$Freq=tmp[match(ups$Name,tmp[,1]),2]
head(ups)

##画热图
gs=ups[ups$Score < 0.05,1] ##抽到的次数越多，score越小
gs
updat=data.frame(GSE15471=GSE15471_deg[gs,'logFC'],
                 GSE28735=GSE28735_deg[gs,'logFC'],
                 GSE62452=GSE62452_deg[gs,'logFC']
                 )

rownames(updat)=gs
library(pheatmap)
head(updat)
updat1 <- updat[1:30,]  ##太多了，取部分基因看看
pheatmap(updat,display_numbers=F)

###下调基因-----------
get_down <- function(df){
  result=df[df$logFC<(-logFC_t) & df$P.Value<P.Value_t,]
  result=result[order(result$logFC,decreasing = F),]
  rownames(result)
}
glist1=list(get_down(GSE15471_deg),
           get_down(GSE28735_deg),
           get_down(GSE62452_deg))

downs=aggregateRanks(glist = glist1, N = length(unique(unlist(glist1))))
tmp=as.data.frame(table(unlist(glist1)))
downs$Freq=tmp[match(downs$Name,tmp[,1]),2]
head(downs)
gs1=downs[downs$Score < 0.05,1] ##抽到的次数越多，score越小
gs1

downdat=data.frame(GSE15471=GSE15471_deg[gs1,'logFC'],
                   GSE28735=GSE28735_deg[gs1,'logFC'],
                   GSE62452=GSE62452_deg[gs1,'logFC']
)
rownames(downdat)=gs1
library(pheatmap)
head(downdat)
pheatmap(downdat,display_numbers=F)


####上下调基因一起画------------
union_RRA <- rbind(updat1,downdat)
library(ComplexHeatmap)
library(circlize)
col_fun <- colorRamp2(
  c(-2, 0, 2), 
  c("green", "white", "red")
)
p <- Heatmap(
  as.matrix(union_RRA), 
  name = "expression", 
  border = "black",
  col = col_fun,
  row_names_gp = gpar(fontsize=4),
  rect_gp = gpar(col = "white", lwd = 2)
);p
p1 <- as.ggplot(p)
ggsave("../outputs/RRA_heatmap.png", egg::set_panel_size(p1, width=unit(4.5, "in"), height=unit(5, "in")), 
       width = 8, height = 7, units = 'in', dpi = 300)

猜你喜欢

最长无重复子串
写技术博客的一些心得分享
Java 多线程（七）：线程池
Java 多线程（五）：锁（三）
Java 多线程（四）：锁（二）
Java 多线程（三）：锁（一）
Java 多线程（二）：并发编程的三大特性
线性时间非比较类排序
Java 多线程（一）：基础
合并k个已排序的链表
HDFS 高可用分布式环境搭建
合并两个有序数组
连续子数组的最大和
HDFS 分布式环境搭建
容器盛水问题
大数加法
HDFS 伪分布式环境搭建
设计LRU缓存结构
两数之和
使用单调栈来解决的一些问题

zl程序教程

当前栏目

多个数据集的整合分析

相关文章