zl程序教程

您现在的位置是:首页 >  数据库

当前栏目

单细胞转录组分析揭示胃肠道间质瘤和微环境的异质性

2023-04-18 14:38:59 时间

下面11月份学徒的投稿

文章标题:《Single-cell transcriptome analysis revealed the heterogeneity and microenvironment of gastrointestinal stromal tumors》

原文链接:https://pubmed.ncbi.nlm.nih.gov/33393143/

文章背景:

胃肠道间质瘤(GIST)是人类胃肠道最常见的间叶性肿瘤,文章发现 GIST 中肿瘤细胞的异质性及其与其他细胞的相互作用,在 GIST 肿瘤组织中发现了四种主要的细胞类型,包括 T 细胞、巨噬细胞、肿瘤细胞和 NK 细胞。

肿瘤细胞可以分为两组:一组高度增殖并与高转移风险相关,另一组似乎“静止”并与低风险相关。

T 细胞是我们单细胞数据中最大的细胞类型。两组CD8 +效应记忆 (EM) 细胞的克隆扩增最高,细胞毒性最高,但也是所有 T 细胞中最耗竭的。

巨噬细胞极化以具有 M1 和 M2 特征,并随着肿瘤进展而增加。细胞间相互作用分析表明,脂肪内皮细胞与肿瘤细胞具有高度相互作用以促进其进展。巨噬细胞处于肿瘤微环境的中心,将免疫细胞募集到肿瘤部位,并与肿瘤细胞和非肿瘤细胞发生最多的相互作用。

文章链接:https://pubmed.ncbi.nlm.nih.gov/33393143/

文章图标复现:

本片文章的用到数据是GSE162115,包含4个样本:

本次复现的图为Figure4 图A

(1) 数据读取质控过滤:

数据在读取之前需要将每个样本文件整理成如下格式(每个样品一个文件夹,每个文件夹里面就一个h5个文件 ):

image-20211105174714342

# 加载R包

rm(list = ls())
suppressPackageStartupMessages(library(Seurat))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(clustree)) 
suppressPackageStartupMessages(library(cowplot))
suppressPackageStartupMessages(library(dplyr)) 
suppressPackageStartupMessages(library(data.table)) 

# 加载数据
dir = '../data/GSE162115_RAW/'
samples=list.files(dir)
samples
sceList = lapply(samples,function(pro){ 
  # pro=samples[1]
  folder=file.path(dir,pro) 
  print(pro)
  print(folder)
  print(list.files(folder))
  sce=CreateSeuratObject(counts = Read10X_h5(paste0(folder,"/matrix.h5")), #       remotes::install_github("hhoeflin/hdf5r")
                         project =  pro )
  return(sce)
})
names(sceList)  
samples
names(sceList) =  samples
sce.all <- merge(sceList[[1]], y= sceList[ -1 ] ,
                 add.cell.ids=samples) 
# 查看数据
as.data.frame(sce.all@assays$RNA@counts[1:10, 1:2])
head(sce.all@meta.data, 10)
tail(sce.all@meta.data, 10) 
dim(sce.all)
# [1] 33538 52416
table(sce.all$orig.ident)
# G1.I  G1.P  G2.I  G2.P 
# 12017 15942 11029 13428
# 添加分组信息
# sce.all$group <- ifelse(sce.all$orig.ident %in% c(),"a组","b组")
dir.create("../subdata")
save(sce.all,file = "../subdata/sce.all.Rdata")

# 第一次过滤 -------------------------------------------------------------------
#计算线粒体基因比例
# 人和鼠的基因名字稍微不一样 
mito_genes=rownames(sce.all)[grep("^MT-", rownames(sce.all))] 
mito_genes #13个线粒体基因
sce.all=PercentageFeatureSet(sce.all, "^MT-", col.name = "percent_mito")
fivenum(sce.all@meta.data$percent_mito)
#计算核糖体基因比例
ribo_genes=rownames(sce.all)[grep("^RP[sl]", rownames(sce.all),ignore.case = T)]
ribo_genes
sce.all=PercentageFeatureSet(sce.all, "^RP[SL]", col.name = "percent_ribo")
fivenum(sce.all@meta.data$percent_ribo)
#计算红血细胞基因比例
hb_genes <- rownames(sce.all)[grep("^HB[^(p)]", rownames(sce.all),ignore.case = T)]
hb_genes
sce.all=PercentageFeatureSet(sce.all, "^HB[^(P)]", col.name = "percent_hb")
fivenum(sce.all@meta.data$percent_hb)

#根据上述指标,过滤低质量细胞/基因
#过滤指标1:最少表达基因数的细胞&最少表达细胞数的基因
selected_c <- WhichCells(sce.all, expression = nFeature_RNA >300) # 每个细胞中基因表达>300
selected_f <- rownames(sce.all)[Matrix::rowSums(sce.all@assays$RNA@counts > 0 ) > 3] # 至少在3个细胞中表达 
sce.all.filt <- subset(sce.all, features = selected_f, cells = selected_c)

# 第二次过滤 -------------------------------------------------------------------
#过滤指标2:线粒体/核糖体基因比例(根据上面的violin图)
dim(sce.all.filt)
selected_mito <- WhichCells(sce.all.filt, expression = percent_mito < 15)
selected_ribo <- WhichCells(sce.all.filt, expression = percent_ribo > 1)
selected_hb <- WhichCells(sce.all.filt, expression = percent_hb < 0.5)

sce.all.filt <- subset(sce.all.filt, cells = selected_mito)
sce.all.filt <- subset(sce.all.filt, cells = selected_ribo)
sce.all.filt <- subset(sce.all.filt, cells = selected_hb)
dim(sce.all.filt)
# [1] 20760 34588
table(sce.all$orig.ident)
# G1.I  G1.P  G2.I  G2.P 
# 12017 15942 11029 13428 
table(sce.all.filt$orig.ident)
# G1.I  G1.P  G2.I  G2.P 
# 5565  5763 10268 12992 

(2) 标准的 降维聚类分析,去除批次效应

# 降维聚类分析 ------------------------------------------------------------------
rm(list=ls()) 
load("../subdata/sce.all.filt.Rdata")
#检查数据
dim(sce.all.filt)
# [1] 20760 34588
table(sce.all.filt$orig.ident)
# G1.I  G1.P  G2.I  G2.P 
# 5565  5763 10268 12992 
sce <- sce.all.filt
#数据预处理
sce <- NormalizeData(sce)
sce = ScaleData(sce, 
                features = rownames(sce),
                # vars.to.regress = c("nFeature_RNA", "percent_mito")
)  #加上vars.to.regress参数后运行so slow!!!
# 降维处理
sce <- FindVariableFeatures(sce, nfeatures = 3000)
sce <- RunPCA(sce, npcs = 30)
sce <- RunTSNE(sce, dims = 1:30)
sce <- RunUMAP(sce, dims = 1:30)

# 检查批次效应 ------------------------------------------------------------------
p1.compare=plot_grid(ncol = 3,
                     DimPlot(sce, reduction = "pca", group.by = "orig.ident")+NoAxes()+ggtitle("PCA"),
                     DimPlot(sce, reduction = "tsne", group.by = "orig.ident")+NoAxes()+ggtitle("tSNE"),
                     DimPlot(sce, reduction = "umap", group.by = "orig.ident")+NoAxes()+ggtitle("UMAP")
)
p1.compare
ggsave(plot=p1.compare,filename="../picture/2_PCA_UMAP_TSNE/Before_inter_1.pdf")

去除批次效应:

# CCA 去除批次效应 --------------------------------------------------------------
sce.all=sce
#拆分为 个seurat子对象
sce.all.list <- SplitObject(sce.all, split.by = "orig.ident")
sce.all.list
for (i in 1:length(sce.all.list)) {
  print(i)
  sce.all.list[[i]] <- NormalizeData(sce.all.list[[i]], verbose = FALSE)
  sce.all.list[[i]] <- FindVariableFeatures(sce.all.list[[i]], selection.method = "vst", 
                                            nfeatures = 2000, verbose = FALSE)
}
alldata.anchors <- FindIntegrationAnchors(object.list = sce.all.list, dims = 1:30, 
                                          reduction = "cca")
sce.all.int <- IntegrateData(anchorset = alldata.anchors, dims = 1:30, new.assay.name = "CCA")
names(sce.all.int@assays)
#[1] "RNA" "CCA"
sce.all.int@active.assay
#[1] "CCA"
sce.all.int=ScaleData(sce.all.int)
sce.all.int=RunPCA(sce.all.int, npcs = 30)
sce.all.int=RunTSNE(sce.all.int, dims = 1:30)
sce.all.int=RunUMAP(sce.all.int, dims = 1:30)
names(sce.all.int@reductions)
# CCA 比较去除批次前后  --------------------------------------------------------
p5.compare=plot_grid(ncol = 3,
                  DimPlot(sce, reduction = "pca", group.by = "orig.ident")+NoAxes()+ggtitle("PCA"),
                  DimPlot(sce, reduction = "tsne", group.by = "orig.ident")+NoAxes()+ggtitle("tSNE"),
                  DimPlot(sce, reduction = "umap", group.by = "orig.ident")+NoAxes()+ggtitle("UMAP"),
                     
                  DimPlot(sce.all.int, reduction = "pca", group.by = "orig.ident")+NoAxes()+ggtitle("PCA"),
                  DimPlot(sce.all.int, reduction = "tsne", group.by = "orig.ident")+NoAxes()+ggtitle("tSNE"),
                  DimPlot(sce.all.int, reduction = "umap", group.by = "orig.ident")+NoAxes()+ggtitle("UMAP")
)
p5.compare
ggsave(plot=p5.compare,filename="../picture/2_PCA_UMAP_TSNE/umap_by_har_before_after.pdf",units = "cm", width = 40,height = 18)

CCA 聚类分析

sce <- sce.all.int
sce <- FindNeighbors(sce, dims = 1:30)
for (res in c(0.01, 0.05, 0.1, 0.2, 0.3, 0.5,0.8,1)) {
  # res=0.01
  print(res)
  sce <- FindClusters(sce, graph.name = "CCA_snn", resolution = res, algorithm = 1)
}
ls <- apply(sce@meta.data[,grep("CCA_snn_res",colnames(sce@meta.data))],2,table)
ls
# CCA分群数据----
save(sce,file = '../subdata/first_sce_by_CCA.Rdata')
#umap可视化
cluster_umap <- plot_grid(ncol = 4, 
                      DimPlot(sce, reduction = "umap", group.by = "CCA_snn_res.0.01", label = T) & NoAxes(), 
                      DimPlot(sce, reduction = "umap", group.by = "CCA_snn_res.0.05", label = T)& NoAxes(),
                      DimPlot(sce, reduction = "umap", group.by = "CCA_snn_res.0.1", label = T) & NoAxes(),
                      DimPlot(sce, reduction = "umap", group.by = "CCA_snn_res.0.2", label = T)& NoAxes(),
                      DimPlot(sce, reduction = "umap", group.by = "CCA_snn_res.0.3", label = T)& NoAxes(),
                      DimPlot(sce, reduction = "umap", group.by = "CCA_snn_res.0.5", label = T) & NoAxes(),
                      DimPlot(sce, reduction = "umap", group.by = "CCA_snn_res.0.8", label = T) & NoAxes(), 
                      DimPlot(sce, reduction = "umap", group.by = "CCA_snn_res.1", label = T) & NoAxes()
)
ggsave(cluster_umap, filename = "../picture/2_PCA_UMAP_TSNE/cluster_umap_all_res_by_CCA.pdf", width = 16, height = 8)

细胞亚群注释:

genes_to_check = c('PTPRC', 'CD3D', 'CD3E', 'CD4','CD8A','CD19', 'CD79A', 'MS4A1' ,
                   'IGHG1', 'MZB1', 'SDC1',
                   'CD68', 'CD163', 'CD14', 
                   'TPSAB1' , 'TPSB2',  # mast cells,
                   'RCVRN','FPR1' , 'ITGAM' ,
                   'C1QA',  'C1QB',  # mac
                   'S100A9', 'S100A8', 'MMP19',# monocyte
                   'LAMP3', 'IDO1','IDO2',## DC3 
                   'CD1E','CD1C', # DC2
                   'KLRB1','NCR1', # NK 
                   'FGF7','MME', 'ACTA2',
                   'PECAM1', 'VWF', 
                   'MKI67','TOP2A',
                   'EPCAM' , 'KRT19', 'PROM1', 'ALDH1A1',
                   "NKG7",
                   "KIT","PDGFRA","ANO1","CD34", # tumor
                   "AIF1", "C1QC","MS4A6A" ,"CD68","LRP1","BST2")
p_all_markers <- DotPlot(sce.all, features = unique(genes_to_check),
                         group.by = "CCA_snn_res.0.8",assay='RNA'  )  + coord_flip()
p_all_markers
ggsave(plot=p_all_markers, filename="../picture/Annotation/check_all_marker_by_CCA_res_0.8.pdf")
p1 <- plot_grid(p_all_markers,p_umap,rel_widths = c(1.5,1))
ggsave(p1,filename = '../picture/Annotation/all_markers_umap_by_CCA_res.0.8.pdf',units = "cm",width = 35,height = 18)
celltype=data.frame(ClusterID=0:22,
                    celltype='NA')
celltype[celltype$ClusterID %in% c(0,1,2,11),2]='T cells'  
celltype[celltype$ClusterID %in% c(16),2]='NKT cells'  
celltype[celltype$ClusterID %in% c(13,5),2]='NK cells' 
celltype[celltype$ClusterID %in% c(12),2]='Div-NKT cells' 
celltype[celltype$ClusterID %in% c( 21 ),2]='Plasma cell'
celltype[celltype$ClusterID %in% c( 15,20 ),2]='Endothelial'
celltype[celltype$ClusterID %in% c( 4,7,8,9 ),2]='Macrophage'  
celltype[celltype$ClusterID %in% c( 14 ),2]='B cells' 
celltype[celltype$ClusterID %in% c( 8),2]='cDC2'
celltype[celltype$ClusterID %in% c( 18),2]='cDC3'
celltype[celltype$ClusterID %in% c( 10),2]='pDC'
celltype[celltype$ClusterID %in% c( 3,6,17,22),2]='Tumor'
celltype[celltype$ClusterID %in% c(19 ),2]='Monocytes_B cells' # "MZB1","S100A9"
head(celltype)
celltype 
table(celltype$celltype)
sce.all@meta.data$celltype = "NA"
for(i in 1:nrow(celltype)){
  sce.all@meta.data[which(sce.all@meta.data$CCA_snn_res.0.8 == celltype$ClusterID[i]),'celltype'] <- celltype$celltype[i]}
table(sce.all@meta.data$celltype)

对Macrophage进行重新降维聚类分群

# 对macrophage亚群重新降维聚类分群 ---------------------------------------------------
rm(list = ls())
load(paste0("../subcluster/Macrophage.Rdata"))

#数据预处理
sce <- NormalizeData(sce)
sce = ScaleData(sce, 
                features = rownames(sce),
                # vars.to.regress = c("nFeature_RNA", "percent_mito")
)  #加上vars.to.regress参数后运行so slow!!!
# 降维处理
sce <- FindVariableFeatures(sce, nfeatures = 3000)
sce <- RunPCA(sce, npcs = 30)
sce <- RunTSNE(sce, dims = 1:30)
sce <- RunUMAP(sce, dims = 1:30)

# CCA 聚类分析------------------------------------------------------------------

seob <- sce
seob <- FindNeighbors(seob, dims = 1:30)
seob <- FindClusters(seob, graph.name = "RNA_snn", 
                     resolution = c(0.01, 0.05, 0.1, 0.2, 0.3, 0.5,0.8,1),
                     algorithm = 1)
# CCA分群数据----

#聚类树可视化
p2_tree <- clustree(seob@meta.data, prefix = "RNA_snn_res.") 
p2_tree
ggsave(p2_tree, filename = paste0("../picture/2_PCA_UMAP_TSNE/macrophage_cluster_tree_by_CCA.pdf"), width = 10, height = 8)
#tsne可视化
cluster_tsne <- plot_grid(ncol = 4, 
                     DimPlot(seob, reduction = "tsne", group.by = "RNA_snn_res.0.01", label = T) & NoAxes(), 
                     DimPlot(seob, reduction = "tsne", group.by = "RNA_snn_res.0.05", label = T)& NoAxes(),
                     DimPlot(seob, reduction = "tsne", group.by = "RNA_snn_res.0.1", label = T) & NoAxes(),
                     DimPlot(seob, reduction = "tsne", group.by = "RNA_snn_res.0.2", label = T)& NoAxes(),
                     DimPlot(seob, reduction = "tsne", group.by = "RNA_snn_res.0.3", label = T)& NoAxes(),
                     DimPlot(seob, reduction = "tsne", group.by = "RNA_snn_res.0.5", label = T) & NoAxes(),
                     DimPlot(seob, reduction = "tsne", group.by = "RNA_snn_res.0.8", label = T) & NoAxes(), 
                     DimPlot(seob, reduction = "tsne", group.by = "RNA_snn_res.1", label = T) & NoAxes()
)
ggsave(cluster_tsne, filename = paste0("../picture/2_PCA_UMAP_TSNE/macrophage_cluster_tsne_all_res_by_CCA.pdf"), width = 16, height = 8)
sce <- seob
save(sce,file = paste0("../subdata/macrophage_after.Rdata"))

image-20211105135140796

Macrophage 细分亚群方式:

标题为:Single cell RNA sequencing identifies unique inflammatory airspace macrophage subsets

文章链接为:https://insight.jci.org/articles/view/126556/figure/3

不难发现,细分亚群要根据Marker gene的表达分为四种细胞亚型:

高表达M2 marker genes

高表达M1 marker genes

高表达M1,M2 marker genes

低表达M1,M2 marker genes

扩展:

1.巨噬细胞在协调炎症反应和调节组织修复方面的双重作用

2.所有急性炎症组织中,有两个主要的巨噬细胞亚类并存,包括来自胚胎的常驻组织巨噬细胞和来自骨髓的被招募的巨噬细胞。

巨噬细胞的极化现象,大家可以自行前往维基百科进行阅读:

巨噬细胞亚群注释

# 细胞亚群注释 macrophage------------------------------------------------------------------
rm(list = ls())
load("../subdata/macrophage_after.Rdata")
p_umap <- DimPlot(sce,label = T)
p_umap
genes_to_check = c("CLEC7A","EGR2","IL10","IRF4","KIF4","MRC1",#M2
                   "AZIN1","CD38","CD86","CXCL10","IL18","IL1B","IRF5","NFKBIZ","PTGS2","SOCS3","TIR4","TNF",#M1
                   "C1QA","C1QB","S100A8","S100A9","TOP2A","MKI67")

# 该基因名称(大小写转换)----
genes_to_check=str_to_upper(genes_to_check)
genes_to_check
p_all_markers <- DotPlot(sce, features = genes_to_check,
                         group.by = "RNA_snn_res.1",assay='RNA'  )  + coord_flip()
p_all_markers
ggsave(plot=p_all_markers, filename="../picture/Annotation/macrophage_check_all_marker_by_CCA_res_0.8.pdf")
p1 <- plot_grid(p_all_markers,p_umap,rel_widths = c(1.5,1))
p1
ggsave(p1,filename = '../picture/Annotation/macrophage_paper_markers_umap_by_CCA_res.0.8.pdf',units = "cm",width = 35,height = 18)
sce.all <- sce
celltype=data.frame(ClusterID=0:18,
                    celltype='NA')
celltype[celltype$ClusterID %in% c( 1 ),2]='M1'
celltype[celltype$ClusterID %in% c( 2,4,6,7),2]='M2'
celltype[celltype$ClusterID %in% c( 0,3,5,8,9,15,16,17,12,13,14),2]='height-both'
celltype[celltype$ClusterID %in% c( 10,11,18),2]='low-both'
head(celltype)
celltype 
table(celltype$celltype)
sce.all@meta.data$celltype = "NA"
for(i in 1:nrow(celltype)){
  sce.all@meta.data[which(sce.all@meta.data$RNA_snn_res.1 == celltype$ClusterID[i]),'celltype'] <- celltype$celltype[i]}
table(sce.all@meta.data$celltype)
genes_to_check <- c("CLEC7A","MRC1","AZIN1","IL18","IL1B","NFKBIZ","SOCS3","S100A8","S100A9")
p1 <- DimPlot(sce.all,group.by = "celltype",label = T,reduction = "tsne",pt.size = 1.5)
ggsave(p1,filename = "../picture/Annotation/macrophage_tsne.pdf")
p2 <- VlnPlot(sce.all,group.by = "celltype",features = genes_to_check,stack = T,flip = T)
ggsave(p2,filename = "../picture/Annotation/macrophage_vlnplot.pdf")
ps <- p2|p1
ps
ggsave(ps,filename = "../picture/Annotation/macrophage_tsne_vlnplot.pdf",units = "cm",width = 32,height = 16)

至此,文章 macrophage 亚群细分已完成。但是我们的亚群细分跟文章的细分,基本上就是完全不一样的思路了。