zl程序教程

您现在的位置是:首页 >  其他

当前栏目

TCGA泛癌的批量基因相关性分析,快上车吧

2023-02-19 12:23:48 时间

通常在肿瘤研究中,我们会挑出肿瘤的某种表型,然后去看一类基因和这类表型(比如免疫相关基因和血管生成)的相关性,开始一个生物学故事。

血管生成相关的marker当然不止一个,如果想再探索这群基因在不同肿瘤中的相关性,就要重复很多次换基因、换癌症的操作,颇为费力。想实现对这群基因的批量化分析吗?速速上车吧~

之前我们是用另外一个包实现的,玩转cgdsr之循环批量相关性,但是这个包11月底被弃用了,┭┮﹏┭┮,它的功能被一个新包继承过来了。

这个包大家可以去了解⬇

它把每一个肿瘤研究的数据都包装为一个MultiAssayExperiment对象,通过不同的函数可以取到其中的临床表型信息、不同的数据矩阵等等。

[cBioPortalData: User Guide https://www.bioconductor.org/packages/release/bioc//vignettes/cBioPortalData/inst/doc/cBioPortalData.html#cbioportaldata-obtain-data-from-the-cbioportal-api)

if (!require("BiocManager", quietly = TRUE))
    install.packages("BiocManager")

BiocManager::install("cBioPortalData")

成功安装以后,你可以通过ls函数来了解这个包能调用的函数类型:

> ##了解cBioPortalData包下面的函数
> ls("package:cBioPortalData")
 [1] "allSamples"            "cBioCache"             "cBioDataPack"         
 [4] "cBioPortal"            "cBioPortalData"        "clinicalData"         
 [7] "downloadStudy"         "genePanelMolecular"    "genePanels"           
[10] "geneTable"             "getDataByGenes"        "getGenePanel"         
[13] "getGenePanelMolecular" "getSampleInfo"         "getStudies"           
[16] "loadStudy"             "molecularData"         "molecularProfiles"    
[19] "mutationData"          "operations"            "queryGeneTable"       
[22] "removeDataCache"       "removePackCache"       "sampleLists"          
[25] "samplesInSampleLists"  "searchOps"             "setCache"             
[28] "untarStudy"     

先挑几个函数试试看喽~

看看通过这个包我们能拿到多少个研究的数据

cbio <- cBioPortal(
  hostname = "www.cbioportal.org",
  protocol = "https",
  api. = "/api/api-docs"
)

all_TCGA_studies <- getStudies(cbio, buildReport = TRUE)
head(all_TCGA_studies)
dim(all_TCGA_studies) 
# [1] 365  15

all.cancer = all_TCGA_studies[grep("tcga$",all_TCGA_studies$studyId),]#筛选出癌症——癌症_tcga这样的形式
dim(all.cancer) #[1] 32 15
all.cancer$studyId[1:5]

#得到TCGA癌症列表
cancer_model = all.cancer$studyId
cancer_model
 [1] "acc_tcga"      "blca_tcga"     "brca_tcga"     "cesc_tcga"    
 [5] "chol_tcga"     "coadread_tcga" "dlbc_tcga"     "esca_tcga"    
 [9] "gbm_tcga"      "hnsc_tcga"     "laml_tcga"     "kirc_tcga"    
[13] "kich_tcga"     "lgg_tcga"      "lihc_tcga"     "kirp_tcga"    
[17] "luad_tcga"     "lusc_tcga"     "meso_tcga"     "ov_tcga"      
[21] "paad_tcga"     "pcpg_tcga"     "prad_tcga"     "skcm_tcga"    
[25] "sarc_tcga"     "stad_tcga"     "tgct_tcga"     "thym_tcga"    
[29] "thca_tcga"     "ucec_tcga"     "ucs_tcga"      "uvm_tcga"  

然后,我们关心的是每种癌症下面有什么数据

sampleLists(cbio, studyId = "brca_tcga") ##以乳腺癌为例

可以看到有mrna/cna/methylation/rppa等多种类型的数据,基本可以满足数据挖掘的要求吧——

如果我想要的是乳腺癌中血管生成相关基因的mrna数据,应该用哪个函数呢?

brca_pub <- cBioPortalData(
  api = cbio,
  studyId = "brca_tcga",
  genes = c("VEGFA","PIGF","MMP2", "MMP9", "EGF" ,"bFGF",
            "F4/80","iNOS","CD86","Arg1","CD206"), by = "hugoGeneSymbol",
  molecularProfileIds = "brca_tcga_mrna"
)              

然后,再灵活运用这几个函数取出对应的信息

## Functionality:
##  experiments() - obtain the ExperimentList instance
##  colData() - the primary/phenotype DataFrame
##  sampleMap() - the sample coordination DataFrame
##  `$`, `[`, `[[` - extract colData columns, subset, or experiment
##  *Format() - convert into a long or wide DataFrame
##  assays() - convert ExperimentList to a SimpleList of matrices
##  exportClass() - save data to flat files

到这里,你就能拿到表型数据和mrna数据了,后续的分析就看你啦!


接下来是对TCGA癌症列表的批量相关性分析,?

我们可以出多少张图呢,可以这么理解:32(TCGA_tumor)×n1(gene list1)×n2(gene list2)

genelist <-  c("VEGFA","PIGF","MMP2", "MMP9", "EGF" ,"FGF2","NOS2",
               "CD86","ARG1","MRC1")df_list <- list()
for (i in 1:length(cancer_model)) 
  {
  cancer=cancer_model[i]
  cancerID <- sampleLists(cbio, studyId = cancer)
  datID <-cancerID$sampleListId[cancerID$category=="all_cases_with_mrna_rnaseq_data"] 
  tmp <- cBioPortalData(api = cbio, by = "hugoGeneSymbol", studyId = cancer,
                        genes =unlist(genelist),
                        molecularProfileIds = datID)
  temp <- try(assay(tmp[[datID]]),silent=FALSE)
  if(class(temp)[1] != 'try-error'  ){  ##有的肿瘤没有mrna数据,所以要判断一下
    exp <- assay(tmp[[datID]])
    df = na.omit(exp)
    range(df)
    df=log2(df+1)
    range(df)
    df1 <- as.data.frame(t(df))
    df2 <- df1[substr(rownames(df1),14,15)=="01",] ##取出肿瘤组织样本
    df_list[[i]] <- df2
  }
}

names(df_list) = cancer_model
save(df_list,genelist,file = "correlation_output.Rdata")

这时候我们得到的是上面的一堆基因在32种肿瘤组织中的表达量矩阵,接下来就可以准备画图了。

整理一下数据:

rm(list = ls())

load("correlation_output.Rdata")

##调出能用的肿瘤矩阵
library(rlist)
cancer_list <-   list.clean(df_list) ##清除列表中的null数据

##散点图
library(ggstatsplot)
genelist
Marker <- c("NOS2","ARG1" , "CD86","MRC1") ##免疫相关基因
mygenelist <-  setdiff(genelist,Marker);mygenelist ##血管生成相关基因

gene1 <- "NOS2" ##这里需要手动换一下基因
# gene1 <- "ARG1"
# gene1 <- "CD86"
# gene1 <- "MRC1"
##接下来是循环的套娃模式
if (T) {
plist<-list()
fig_list <- list()
#循环1
for (i in 1:length(mygenelist)) {
  gene <- mygenelist[i]
    ##循环2
  for (j in 1:length(cancer_list)) {
  cancer <- names(cancer_list[j])
  df = cancer_list[[cancer]]
    print("ok")
        p<-ggscatter(df,x=gene1,y=gene,
                 size=1, 
                 rug=T, 
                 add="reg.line",
                 point = F, ##去掉散点
                 conf.int=T,
                 conf.int.level=0.95,
                 cor.coef=T,
                 cor.method="pearson",
                 ggtheme=theme_pubr(),
                 title = cancer
    )
  fig_list[[j]]<-p ##所有肿瘤
  }
  plist[[i]] <- fig_list  ##所有基因
}

最后是可视化的拼图

library(cowplot)
library(egg)
mygenelist ##"VEGFA" "PIGF"  "MMP2"  "MMP9"  "EGF"   "FGF2" 
##批量拼图
names(plist) <- mygenelist
lapply(names(plist), function(gene){
  # gene <- names(plist)[1]
  print(gene)
  p <- plot_grid(plotlist=plist[[gene]],labels = "AUTO",
            nrow = 5,
            ncol = 6)
  ggsave(p,filename = paste0("../outputs/",gene1,gene,"_cor.png"),
         width = 15, height = 14, units = 'in', dpi = 300)
})

  }

选两幅看看效果,虽略显冗杂,但可以反映这两个基因在多种肿瘤中的相关性的情况:

NOS2和EGF

NOS2和VEGFA

如果你也有感兴趣的两群基因,不妨用这些代码去复现看看~~~~