生物信息数据分析教程视频——16-单样本基因集富集分析(ssGSEA)用于肿瘤相关免疫细胞浸润水平评估 http://mpvideo.qpic.cn/0b2ezeadqaaanaafvdb5cjrvbsodhdeqaoaa.f10002.mp4?dis_k=434c024ce3cb08025f1257e30ff4b4a4&dis_t=1671187731&vid=wxv_2606760913223974912&format_id=10002&support_redirect=0&mmversion=false

参考视频： http://mpvideo.qpic.cn/0bf2nyabqaaa3iabiz7dnnqfa3wddbxaagaa.f10002.mp4?dis_k=dc98b18769e0ad4768ae32e8f35e4b2e&dis_t=1671187731&vid=wxv_1944485866878402561&format_id=10002&support_redirect=0&mmversion=false

实际应用：

【0代码】单基因泛癌分析教程

单基因泛癌分析视频教程补充——基于R语言

estimate代码：


# https://mp.weixin.qq.com/s/X2kybql--KPgjUumWX4z9Q
# setwd("H:/MedBioInfoCloud/analysis/TCGA/new/conventionalAnalysis")
options(stringsAsFactors = F)
library(TCGAbiolinks)
library(clusterProfiler)
library(org.Hs.eg.db)
library(ggplot2)
library(ggpubr)
library(ggrepel)
library(corrplot)
library(pheatmap)
library(estimate)
FilePath <- dir("H:/MedBioInfoCloud/analysis/TCGA/new/processedTCGAdata/TCGA-STAR_Exp/",
                "STARdata.Rdata$",full.names = T)

opt <- "output/009-CellFractions/estimate/"
ifelse(dir.exists(opt),FALSE,dir.create(opt,recursive = T))

record_files <- dir(opt,"RecordInformation.txt$",full.names = T)
ifelse(length(record_files)>0,file.remove(record_files),
       "There is no record file that can be removed")


source("H:/MedBioInfoCloud/analysis/TCGA/new/00-fun/filterGeneTypeExpr.R")
source("H:/MedBioInfoCloud/analysis/TCGA/new/00-fun/del_dup_sample.R")


###TCGA数据库中33中癌症类型
project <- getGDCprojects()$project_id
project <- project[grep("TCGA-",project)]
# proj = "TCGA-LUAD"

for(proj in project[1]){
  message("===============================")
  message(proj)

  load(FilePath[grep(proj,FilePath)])#STARdata
  tpm <- STARdata[["tpm"]]
  tpm <- filterGeneTypeExpr(expr = tpm,
                            fil_col = "gene_type",
                            filter = FALSE)
  ##过滤不表达的基因
  tpm <- tpm[apply(tpm,1,var) !=0,]
  ##正常组织样本ID
  SamN <- TCGAquery_SampleTypes(barcode = colnames(tpm),
                                typesample = c("NT","NB","NBC","NEBV","NBM"))
  
  ##肿瘤组织样本ID
  SamT <- setdiff(colnames(tpm),SamN)
  
  ###去除重复样本
  tur_exp <- del_dup_sample(tpm[,SamT],col_rename = T)
  # ###long2转换
  # tur_exp <- log2(tur_exp + 1)
  # dat=log2(edgeR::cpm(pd_mat)+1)
  # scores = estimate_RNAseq(dat,pro)
  
  diffgene2ent <- bitr(rownames(tur_exp),
                       fromType="SYMBOL", toType="ENTREZID",
                       OrgDb="org.Hs.eg.db")
  tur_exp <- tur_exp[diffgene2ent$SYMBOL,]
  rownames(tur_exp) <- diffgene2ent$ENTREZID
  
  
  colnames(tur_exp) <- gsub("-","_",colnames(tur_exp))
  head(tur_exp)[,1:3]
  ###输出表达数据，作为filterCommonGenes的输入
  input.f <- paste0(opt,"sample_input.txt")
  write.table(tur_exp,file = input.f,row.names = T,sep = "\t",quote = F)
  
  output.f <- paste0(opt,"estimate_input.gct")##定义输出
  filterCommonGenes(input.f= input.f, output.f=output.f, id="EntrezID")#过滤掉细胞中共有的基因

  ### 计算免疫浸润打分 ###
  estimate_path <- paste0(opt,"sample_estimate.gct")
  estimateScore(output.f, estimate_path, platform="illumina")#计算肿瘤微环境的评分
  
  # ### 可视化 ###
  # # 绘制单个样本
  # output.dir <- paste0(opt,"figures/")##定义输出
  # plotPurity(scores= estimate_path, samples=colnames(tur_exp)[1], 
  #            platform="illumina", 
  #            output.dir = opt)
  # # 绘制多个样本
  # plotPurity(scores= estimate_path, 
  #            platform="illumina", output.dir = opt_path)
  
  estimate_scores = read.table(estimate_path,skip = 2,header = T)
  rownames(estimate_scores)=estimate_scores[,1]
  estimate_scores= t(estimate_scores[,3:ncol(estimate_scores)]) %>% as.data.frame()
  head(estimate_scores)
  estimate_scores$Tumour_purity <- cos(0.6049872018+0.0001467884*estimate_scores$ESTIMATEScore)
  save(estimate_scores,file = paste0(opt,proj,"-estimate.Rdata"))
  
  file.remove(dir(opt,"(.gct)|(.txt)$",full.names = T))

}



###============老版本数据
message("===============================")
message(proj)
load("H:/data_analysis/TCGA/old/TCGA/RNASeqTPM/TCGA-LUAD-protein_coding-TPM.Rdata")


tpm <- filterGeneTypeExpr(expr = tpmdata,
                          fil_col = "gene_type",
                          filter = FALSE)
##过滤不表达的基因
tpm <- tpm[apply(tpm,1,var) !=0,]
##正常组织样本ID
SamN <- TCGAquery_SampleTypes(barcode = colnames(tpm),
                              typesample = c("NT","NB","NBC","NEBV","NBM"))

##肿瘤组织样本ID
SamT <- setdiff(colnames(tpm),SamN)

###去除重复样本
tur_exp <- del_dup_sample(tpm[,SamT],col_rename = T)
# ###long2转换
# tur_exp <- log2(tur_exp + 1)

diffgene2ent <- bitr(rownames(tur_exp),
                     fromType="SYMBOL", toType="ENTREZID",
                     OrgDb="org.Hs.eg.db")
tur_exp <- tur_exp[diffgene2ent$SYMBOL,]
rownames(tur_exp) <- diffgene2ent$ENTREZID

colnames(tur_exp) <- gsub("-","_",colnames(tur_exp))
head(tur_exp)[,1:3]
###输出表达数据，作为filterCommonGenes的输入
input.f <- paste0(opt,"sample_input.txt")
write.table(tur_exp,file = input.f,row.names = T,sep = "\t",quote = F)

output.f <- paste0(opt,"estimate_input.gct")##定义输出
filterCommonGenes(input.f= input.f, output.f=output.f, id="EntrezID")#过滤掉细胞中共有的基因

### 计算免疫浸润打分 ###
estimate_path <- paste0(opt,"sample_estimate.gct")
estimateScore(output.f, estimate_path, platform="illumina")#计算肿瘤微环境的评分


estimate_scores = read.table(estimate_path,skip = 2,header = T)
rownames(estimate_scores)=estimate_scores[,1]
estimate_scores= t(estimate_scores[,3:ncol(estimate_scores)]) %>% as.data.frame()
head(estimate_scores)
estimate_scores$Tumour_purity <- cos(0.6049872018+0.0001467884*estimate_scores$ESTIMATEScore)
save(estimate_scores,file = paste0(opt,proj,"-estimate.Rdata"))

file.remove(dir(opt,"(.gct)|(.txt)$",full.names = T))

CIBERSORTx代码：


# https://mp.weixin.qq.com/s/X2kybql--KPgjUumWX4z9Q
# setwd("H:/MedBioInfoCloud/analysis/TCGA/new/conventionalAnalysis")
options(stringsAsFactors = F)
library(TCGAbiolinks)
library(clusterProfiler)
library(org.Hs.eg.db)
library(ggplot2)
library(ggpubr)
library(ggrepel)
library(pheatmap)
#https://rdrr.io/github/singha53/amritr/src/R/supportFunc_cibersort.R
# install.packages("devtools")
# devtools::install_github("BioInfoCloud/csCIBERSORTx")
library(csCIBERSORTx)
FilePath <- dir("H:/MedBioInfoCloud/analysis/TCGA/new/processedTCGAdata/TCGA-STAR_Exp/",
                "STARdata.Rdata$",full.names = T)

opt <- "output/009-CellFractions/CIBERSORT/"
ifelse(dir.exists(opt),FALSE,dir.create(opt,recursive = T))

record_files <- dir(opt,"RecordInformation.txt$",full.names = T)
ifelse(length(record_files)>0,file.remove(record_files),
       "There is no record file that can be removed")


source("H:/MedBioInfoCloud/analysis/TCGA/new/00-fun/filterGeneTypeExpr.R")
source("H:/MedBioInfoCloud/analysis/TCGA/new/00-fun/del_dup_sample.R")


###TCGA数据库中33中癌症类型
project <- getGDCprojects()$project_id
project <- project[grep("TCGA-",project)]
# proj = "TCGA-LUAD"

for(proj in project[2:33]){
  message("===============================")
  message(proj)
  load(FilePath[grep(proj,FilePath)])#STARdata
  tpm <- STARdata[["tpm"]]
  tpm <- filterGeneTypeExpr(expr = tpm,
                            fil_col = "gene_type",
                            filter = FALSE)
  ##过滤不表达的基因
  tpm <- tpm[apply(tpm,1,var) !=0,]
  ##正常组织样本ID
  SamN <- TCGAquery_SampleTypes(barcode = colnames(tpm),
                                typesample = c("NT","NB","NBC","NEBV","NBM"))
  
  ##肿瘤组织样本ID
  SamT <- setdiff(colnames(tpm),SamN)
  
  ###去除重复样本
  tur_exp <- del_dup_sample(tpm[,SamT],col_rename = T)
  # ###long2转换,该方法不需要log转化内部函数会有判断
  # tur_exp <- log2(tur_exp + 1)

  # head(tur_exp)[,1:3]
  
  file <- paste0(opt,proj,"-CIBERSORT-Result.txt")
  cibr <- CIBERSORT2(LM22_matrix = LM22,
                     mixture = tur_exp,
                     opt_path = file,
                     perm=100,
                     QN=TRUE)
  # 提取CIBERSORT的打分结果
  CIBERSORT_Score <- cibr[,1:(ncol(cibr)-3)]#因为绘图不需要最后三列，所以去掉最后三列，取22列
  
  # 生成标准化函数normalize，构建一个function
  normalize <- function(x){# （x-x的最小值）/(X的最大值-X的最小值)
    return((x-min(x))/(max(x)-min(x)))}
  
  # 使用normalize标准化CIBERSORT打分
  CIBERSORT_Score <- normalize(CIBERSORT_Score)
  # 将处理后CIBERSORT的打分写出到文件
  save(CIBERSORT_Score, file = paste0(opt,proj,"-CIBERSORT-Result.Rdata"))
  # # 构建热图注释数据框
  # annotation <- data.frame(rownames(CIBERSORT_Score))
  # colnames(annotation) <- "sample"#设定列名
  # rownames(annotation) <- rownames(CIBERSORT_Score)#设定行名
  # annotation_row <- cbind(annotation, cibr[,(ncol(cibr)-2):ncol(cibr)])#合并列，CIBERSORT_Result的最后三列
  # 
  # # 绘制热图
  # pheatmap(CIBERSORT_Score,
  #          show_colnames = T,
  #          cluster_rows = F,
  #          cluster_cols = F,#cluster_cols = F 对列不进行聚类
  #          annotation_row = annotation_row,#注释数据框放在行
  #          cellwidth=10,
  #          cellheight=40,
  #          fontsize=5,
  #          fontsize_row=5,
  #          scale="row",
  #          fontsize_col=3)
}

MCPcounter代码：


# https://mp.weixin.qq.com/s/X2kybql--KPgjUumWX4z9Q
# setwd("H:/MedBioInfoCloud/analysis/TCGA/new/conventionalAnalysis")
options(stringsAsFactors = F)
library(TCGAbiolinks)
library(clusterProfiler)
library(org.Hs.eg.db)
library(ggplot2)
library(ggpubr)
library(ggrepel)
library(pheatmap)
# devtools::install_github('ebecht/MCPcounter',ref='master', subdir='Source')
library(MCPcounter)
library(preprocessCore)

# https://github.com/ebecht/MCPcounter
FilePath <- dir("H:/MedBioInfoCloud/analysis/TCGA/new/processedTCGAdata/TCGA-STAR_Exp/",
                "STARdata.Rdata$",full.names = T)

opt <- "output/009-CellFractions/MCPcounter/"
ifelse(dir.exists(opt),FALSE,dir.create(opt,recursive = T))

record_files <- dir(opt,"RecordInformation.txt$",full.names = T)
ifelse(length(record_files)>0,file.remove(record_files),
       "There is no record file that can be removed")
source("H:/MedBioInfoCloud/analysis/TCGA/new/00-fun/filterGeneTypeExpr.R")
source("H:/MedBioInfoCloud/analysis/TCGA/new/00-fun/del_dup_sample.R")
###TCGA数据库中33中癌症类型
project <- getGDCprojects()$project_id
project <- project[grep("TCGA-",project)]
# proj = "TCGA-LUAD"
# 读取探针注释结果
probesets = read.table(curl('http://raw.githubusercontent.com/ebecht/MCPcounter/master/Signatures/probesets.txt'),
                       sep='\t',stringsAsFactors=FALSE,colClasses='character')


# 读取基因名注释结果
genes = read.table(curl('http://raw.githubusercontent.com/ebecht/MCPcounter/master/Signatures/genes.txt'),
                   sep='\t',
                   stringsAsFactors=FALSE,
                   header=TRUE,
                   colClasses='character',
                   check.names=FALSE)

library(csGeneset)
# Probe annotation.
probesets <- annoMCPcounter[["probesets"]]
# gene annotation
genes <- annoMCPcounter[["genes"]]
?MCPcounter.estimate


for(proj in project[1]){
  message("===============================")
  message(proj)
  load(FilePath[grep(proj,FilePath)])#STARdata
  tpm <- STARdata[["tpm"]]
  tpm <- filterGeneTypeExpr(expr = tpm,
                            fil_col = "gene_type",
                            filter = FALSE)
  ##过滤不表达的基因
  tpm <- tpm[apply(tpm,1,var) !=0,]
  ##正常组织样本ID
  SamN <- TCGAquery_SampleTypes(barcode = colnames(tpm),
                                typesample = c("NT","NB","NBC","NEBV","NBM"))
  
  ##肿瘤组织样本ID
  SamT <- setdiff(colnames(tpm),SamN)
  
  ###去除重复样本
  tur_exp <- del_dup_sample(tpm[,SamT],col_rename = T)
  # ###long2转换
  # tur_exp <- log2(tur_exp + 1)
  # 
  head(tur_exp)[,1:3]
  
  # 进行MCPcounter分析
  MCPcounterScore <- MCPcounter.estimate(tur_exp, featuresType = "HUGO_symbols",#featuresType就是根据input来变化的
                                         probesets = probesets,genes = genes)
  
  # 生成标准化函数normalize，构建一个function
  normalize <- function(x){# （x-x的最小值）/(X的最大值-X的最小值)
    return((x-min(x))/(max(x)-min(x)))}
  # 使用normalize函数进行标准化
  MCPcounterScore <-  normalize(data.matrix(MCPcounterScore))
  
  # 将结果写出到文件
  write.table(MCPcounterScore, file = "MCPcounterScore.txt",sep="\t", quote=F, col.names=T)
  
  # 构建包含注释信息的数据框
  annotation_col <- data.frame(colnames(MCPcounterScore))
  colnames(annotation_col) <- "sample"
  rownames(annotation_col) <- colnames(MCPcounterScore)  
  # 绘制热图
  pheatmap(MCPcounterScore,show_colnames = F,cluster_rows = F,
           cluster_cols = T,annotation_col = annotation_col,#annotation_col 注释放在列， cluster_cols = T对列也就是样本进行聚类
           cellwidth=15,cellheight=15,fontsize=5,
           filename = 'figures/MCPcounter-heatmap.tiff')
}

xCell代码：


# https://mp.weixin.qq.com/s/X2kybql--KPgjUumWX4z9Q
# setwd("H:/MedBioInfoCloud/analysis/TCGA/new/conventionalAnalysis")
options(stringsAsFactors = F)
library(TCGAbiolinks)
library(clusterProfiler)
library(org.Hs.eg.db)
library(ggplot2)
library(ggpubr)
library(ggrepel)
library(pheatmap)
# devtools::install_github('dviraran/xCell')
library(xCell)
FilePath <- dir("H:/MedBioInfoCloud/analysis/TCGA/new/processedTCGAdata/TCGA-STAR_Exp/",
                "STARdata.Rdata$",full.names = T)
opt <- "output/009-CellFractions/xCell/"
ifelse(dir.exists(opt),FALSE,dir.create(opt,recursive = T))

source("H:/MedBioInfoCloud/analysis/TCGA/new/00-fun/filterGeneTypeExpr.R")
source("H:/MedBioInfoCloud/analysis/TCGA/new/00-fun/del_dup_sample.R")
###TCGA数据库中33中癌症类型
project <- getGDCprojects()$project_id
project <- project[grep("TCGA-",project)]
# proj = "TCGA-LUAD"
for(proj in project){
  message("===============================")
  message(proj)
  load(FilePath[grep(proj,FilePath)])#STARdata
  tpm <- STARdata[["tpm"]]
  tpm <- filterGeneTypeExpr(expr = tpm,
                            fil_col = "gene_type",
                            filter = FALSE)
  ##过滤不表达的基因
  tpm <- tpm[apply(tpm,1,var) !=0,]
  ##正常组织样本ID
  SamN <- TCGAquery_SampleTypes(barcode = colnames(tpm),
                                typesample = c("NT","NB","NBC","NEBV","NBM"))
  
  ##肿瘤组织样本ID
  SamT <- setdiff(colnames(tpm),SamN)
  
  ###去除重复样本
  tur_exp <- del_dup_sample(tpm[,SamT],col_rename = T)

  # 进行xCell分析
  xCell <- xCellAnalysis(tur_exp)
  dim(xCell)
  rownames(xCell)
  ## 如果是RNA-seq数据，则
  xCell_RNAseq <- xCellAnalysis(tur_exp,rnaseq = T)
  dim(xCell_RNAseq)
  
  # 将结果写出到文件
  save(xCell_RNAseq, file = paste0(opt,proj,"-xCell.Rdata"),sep="\t", quote=F, col.names=T) 
}

ImmuCellAI代码：

# https://mp.weixin.qq.com/s/X2kybql--KPgjUumWX4z9Q
# setwd("H:/MedBioInfoCloud/analysis/TCGA/new/conventionalAnalysis")
options(stringsAsFactors = F)
library(TCGAbiolinks)
# devtools::install_github("lydiaMyr/ImmuCellAI@main")
# https://github.com/lydiaMyr/ImmuCellAI
library(ImmuCellAI)
# https://github.com/lydiaMyr/ImmuCellAI-mouse
# devtools::install_github("lydiaMyr/ImmuCellAI-mouse@main")
data("train_data")
data("train_tag")
data("marker_exp")
data("marker_exp_T")
data("paper_marker")
data("train_data")
data("train_tag")
data("compensation_matrix")
data("immune_infiltate_marker")
FilePath <- dir("H:/MedBioInfoCloud/analysis/TCGA/new/processedTCGAdata/TCGA-STAR_Exp/",
                "STARdata.Rdata$",full.names = T)

opt <- "output/009-CellFractions/ImmuCellAI/"
ifelse(dir.exists(opt),FALSE,dir.create(opt,recursive = T))

source("H:/MedBioInfoCloud/analysis/TCGA/new/00-fun/filterGeneTypeExpr.R")
source("H:/MedBioInfoCloud/analysis/TCGA/new/00-fun/del_dup_sample.R")
###TCGA数据库中33中癌症类型
project <- getGDCprojects()$project_id
project <- project[grep("TCGA-",project)]
# proj = "TCGA-LUAD"
for(proj in project){
  message("===============================")
  message(proj)
  load(FilePath[grep(proj,FilePath)])#STARdata
  tpm <- STARdata[["tpm"]]
  tpm <- filterGeneTypeExpr(expr = tpm,
                            fil_col = "gene_type",
                            filter = FALSE)
  ##过滤不表达的基因
  tpm <- tpm[apply(tpm,1,var) !=0,]
  ##正常组织样本ID
  SamN <- TCGAquery_SampleTypes(barcode = colnames(tpm),
                                typesample = c("NT","NB","NBC","NEBV","NBM"))
  
  ##肿瘤组织样本ID
  SamT <- setdiff(colnames(tpm),SamN)
  
  ###去除重复样本
  tur_exp <- del_dup_sample(tpm[,SamT],col_rename = T)
  dim(tur_exp)
  # group <- data.frame(group = rep(c("High","low"),each = 258)) %>% t()
  # colnames(group) <- colnames(tur_exp)
  # tur_exp <- rbind(group,tur_exp)
  #The prediction process of ImmuCellAI is updated, which simulated the flow cytometry process to predict cell type abundance by hierarchical strategy. All 24 cell types were divided into two layers, layer1:DC, B cell, Monocyte, Macrophage, NK, Neutrophil, CD4 T, CD8 T, NKT, Tgd; layer2:CD4 naive, CD8 naive, Tc, Tex, Tr1, nTreg, iTreg, Th1, Th2, Th17, Tfh, Tcm, Tem, MAIT.
  #Parameters
  #sample_expression: Sample expression profile in FPKM, TPM format by RNA-seq or log2-transformed signal by microarray.
  #datatype: One of "rnaseq" and "microarray"
  #group_tag: One of 0 and 1, if there is the need to perform the comparision between different groups. If the value is 1, users need to add a group tag row in the input epxression matrix to explain the group of each sample.
  #response_tag: One of 0 and 1, if there is the need to predict the ICB response of each sample.
  
  ImmuCellAIscorelist <- ImmuCellAI_new(tur_exp,"rnaseq",group_tag =0,0)
  
  ImmuCellAIscore <- ImmuCellAIscorelist[["Sample_abundance"]]
  #output
  # The output of the function is a list with three variables including sample immune cell abundance, group comparison result and ICB response prediction result.
  
  # 将结果写出到文件
  save(ImmuCellAIscore, file = paste0(opt,proj,"-ImmuCellAI.Rdata"),sep="\t", quote=F, col.names=T)  
}

猜你喜欢

基于昇腾CANN的卡通图像生成可在线体验啦！十分钟带你了解CANN应用开发全流程
什么是强化学习？
高可用架构演进之单元化
AOC萌新探索：搭建和体验在线AOC环境
如何将知识引入机器学习模型提升泛化能力？
零代码以“王者荣耀”为例解析设计七原则
Python入门：ChainMap 有效管理多个上下文
高并发中，那些不得不说的线程池与ThreadPoolExecutor类
“互联网+”大赛之智慧校园赛题攻略：你的智慧校园，WeLink帮你来建
云小课 | 网络知识一箩筐——NAT网关，让IP地址华丽变身，轻松实现内外网互通
跟我读论文丨ACL2021 NER 模块化交互网络用于命名实体识别
4种基于像素分割的文本检测算法
U2Net基于ModelArts Notbook的仿真实验
七夕赶上服务器架构升级，女朋友的约会怎么办
在openEuler上做开发？这个大赛拿出30万寻找开源的yyds
5招教你实现多线程场景下的线程安全！
跟我读论文丨ACL2021 NER BERT化隐马尔可夫模型用于多源弱监督命名实体识别
大数据集群跨多版本升级、业务0中断，只因背后有TA
云小课 | 到底什么是区块链？
信创产业已成现象级新风口，快来加入争做“弄潮儿”

zl程序教程

当前栏目

生物信息数据分析教程视频——17-多种算法评估肿瘤免疫细胞浸润水平

相关文章