zl程序教程

您现在的位置是:首页 >  数据库

当前栏目

单细胞水平看小鼠胰腺导管腺癌进展中的细胞异质性

2023-04-18 14:38:59 时间

下面11月份学徒的投稿

文章标题:《Cellular heterogeneity during mouse pancreatic ductal adenocarcinoma progression at single-cell resolution》

文章链接:https://insight.jci.org/articles/view/129212

文章概述:

胰腺导管腺癌 (PDA) 是癌症相关死亡的主要原因,可用的治疗选择有限。通过单细胞 RNA 测序技术在基因工程小鼠模型的 PDA 进展的不同阶段对细胞异质性进行分析。发现**除了具有增加炎症特征的不同巨噬细胞群外,癌细胞的上皮向间充质转变伴随着肿瘤进展;**正常小鼠胰腺中存在三种不同的成纤维细胞分子亚型,最终在晚期 PDA 中产生两种不同的成纤维细胞群;癌细胞和成纤维细胞可能受到表观遗传机制的动态调节。

文章复现:

此次复现的图为文章Figure 1 的B-E图,是3个单细胞样品各自独立的降维聚类分群并且进行生物学命名:

image-20211104152226887

可以看出文章中的是将Normal,early-KIC,late-KIC分别进行降维聚类分群及细胞亚群注释。接下来进行文章复现:

  1. 数据概览:

文章中处理的数据集是GSE125588,其中包括5个样本:

(1)数据读取质控过滤:

数据读取之前需要将每个样本的标准10X文件整理成如下格式:

image-20211105174422303

# 加载R包 -------------------------------------------------------------------
rm(list = ls())
suppressPackageStartupMessages(library(Seurat))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(clustree)) 
suppressPackageStartupMessages(library(cowplot))
suppressPackageStartupMessages(library(dplyr)) 
suppressPackageStartupMessages(library(data.table)) 
suppressPackageStartupMessages(library(stringr)) 

# 加载数据 -------------------------------------------------------------------
dir = '../data/GSE125588/'
samples=list.files(dir)
samples
sceList = lapply(samples,function(pro){ 
  # pro=samples[1]
  folder=file.path(dir,pro) 
  print(pro)
  print(folder)
  print(list.files(folder))
  sce=CreateSeuratObject(counts = Read10X(folder),
                         project =  pro )
  
  return(sce)
})
names(sceList)  
samples
names(sceList) =  samples
sce.all <- merge(sceList[[1]], y= sceList[ -1 ] ,
                 add.cell.ids=samples) 
# 查看数据
as.data.frame(sce.all@assays$RNA@counts[1:10, 1:2])
head(sce.all@meta.data, 10)
tail(sce.all@meta.data, 10)
dim(sce.all)
# [1] 27998 11636
table(sce.all$orig.ident)
# early.KIC  late.KIC  late.KPC late.KPfC    normal 
# 3534       938      1377      3143      2644 

# 添加分组信息
# sce.all$group <- ifelse(sce.all$orig.ident %in% c(),"a组","b组")
dir.create("../subdata")
save(sce.all,file = "../subdata/sce.all.Rdata")

# 质控和过滤 -------------------------------------------------------------------
# 1. 计算线粒体基因比例
# 人和鼠的基因名字稍微不一样 
mito_genes=rownames(sce.all)[grep("^mt-", rownames(sce.all))] 
mito_genes #13个线粒体基因
sce.all=PercentageFeatureSet(sce.all, "^mt-", col.name = "percent_mito")
fivenum(sce.all@meta.data$percent_mito)
# 2. 计算核糖体基因比例
ribo_genes=rownames(sce.all)[grep("^Rp[sl]", rownames(sce.all),ignore.case = T)]
ribo_genes
sce.all=PercentageFeatureSet(sce.all, "^Rp[sl]", col.name = "percent_ribo")
fivenum(sce.all@meta.data$percent_ribo)
# 3. 计算红血细胞基因比例
hb_genes <- rownames(sce.all)[grep("^Hb[^(p)]", rownames(sce.all),ignore.case = T)]
hb_genes
sce.all=PercentageFeatureSet(sce.all, "^Hb[^(P)]", col.name = "percent_hb")
fivenum(sce.all@meta.data$percent_hb)

# 过滤指标1:最少表达基因数的细胞&最少表达细胞数的基因
selected_c <- WhichCells(sce.all, expression = nFeature_RNA >300) # 每个细胞中基因表达>300
selected_f <- rownames(sce.all)[Matrix::rowSums(sce.all@assays$RNA@counts > 0 ) > 3] # 至少在3个细胞中表达 
sce.all.filt <- subset(sce.all, features = selected_f, cells = selected_c)
# 过滤指标2:线粒体/核糖体基因比例(根据上面的violin图)
selected_mito <- WhichCells(sce.all.filt, expression = percent_mito < 15)
selected_ribo <- WhichCells(sce.all.filt, expression = percent_ribo > 1)
selected_hb <- WhichCells(sce.all.filt, expression = percent_hb < 0.5)
sce.all.filt <- subset(sce.all.filt, cells = selected_mito)
sce.all.filt <- subset(sce.all.filt, cells = selected_ribo)
sce.all.filt <- subset(sce.all.filt, cells = selected_hb)
dim(sce.all.filt)
# [1] 17935  9621
table(sce.all$orig.ident)
# early.KIC  late.KIC  late.KPC late.KPfC    normal 
# 3534       938      1377      3143      2644 
table(sce.all.filt$orig.ident)
# early.KIC  late.KIC  late.KPC late.KPfC    normal 
# 3379       765       917      2632      1928 

(2)拆分细胞亚群:

sce.all.list <- SplitObject(sce, split.by = "orig.ident") # "orig.ident" 中包含了样本信息
sce.all.list 
names(sce.all.list)
for (i in names(sce.all.list)) {
  print(i)
  epi_mat = sce.all.list[[i]]@assays$RNA@counts
  epi_phe = sce.all.list[[i]]@meta.data
  sce=CreateSeuratObject(counts = epi_mat, 
                         meta.data = epi_phe )
  sce
  table(sce@meta.data$orig.ident) 
  save(sce,file = paste0("../subcluster/",i,'.Rdata'))
}

(3)进行降维聚类分群

# 循环进行降维聚类分群 --------------------------------------------------------------

names <- c("normal","early.KIC","late.KIC")
for (name in names) {
  # name <- "normal"
  print(name)
  # 降维聚类分析 ------------------------------------------------------------------
  load(paste0("../subcluster/",name,".Rdata"))
 
  #数据预处理
  sce <- NormalizeData(sce)
  sce = ScaleData(sce, 
                  features = rownames(sce),
                  # vars.to.regress = c("nFeature_RNA", "percent_mito")
  )  #加上vars.to.regress参数后运行so slow!!!
  # 降维处理
  sce <- FindVariableFeatures(sce, nfeatures = 3000)
  sce <- RunPCA(sce, npcs = 30)
  sce <- RunTSNE(sce, dims = 1:30)
  sce <- RunUMAP(sce, dims = 1:30)
  
    # CCA 聚类分析------------------------------------------------------------------
 
  seob <- sce
  seob <- FindNeighbors(seob, dims = 1:30)
  seob <- FindClusters(seob, graph.name = "RNA_snn", 
                         resolution = c(0.01, 0.05, 0.1, 0.2, 0.3, 0.5,0.8,1),
                         algorithm = 1)
  # CCA分群数据----
  
  #聚类树可视化
  p2_tree <- clustree(seob@meta.data, prefix = "RNA_snn_res.") 
  p2_tree
  ggsave(p2_tree, filename = paste0("../picture/2_PCA_UMAP_TSNE/",name,"_cluster_tree_by_CCA.pdf"), width = 10, height = 8)
  
  #umap可视化
  cluster_umap <- plot_grid(ncol = 4, 
                   DimPlot(seob, reduction = "umap", group.by = "RNA_snn_res.0.01", label = T) & NoAxes(), 
                   DimPlot(seob, reduction = "umap", group.by = "RNA_snn_res.0.05", label = T)& NoAxes(),
                   DimPlot(seob, reduction = "umap", group.by = "RNA_snn_res.0.1", label = T) & NoAxes(),
                   DimPlot(seob, reduction = "umap", group.by = "RNA_snn_res.0.2", label = T)& NoAxes(),
                   DimPlot(seob, reduction = "umap", group.by = "RNA_snn_res.0.3", label = T)& NoAxes(),
                   DimPlot(seob, reduction = "umap", group.by = "RNA_snn_res.0.5", label = T) & NoAxes(),
                   DimPlot(seob, reduction = "umap", group.by = "RNA_snn_res.0.8", label = T) & NoAxes(), 
                   DimPlot(seob, reduction = "umap", group.by = "RNA_snn_res.1", label = T) & NoAxes()
  )
  ggsave(cluster_umap, filename = paste0("../picture/2_PCA_UMAP_TSNE/",name,"_cluster_umap_all_res_by_CCA.pdf"), width = 16, height = 8)
  
  cluster_tsne <- plot_grid(ncol = 4, 
                    DimPlot(seob, reduction = "tsne", group.by = "RNA_snn_res.0.01", label = T) & NoAxes(), 
                    DimPlot(seob, reduction = "tsne", group.by = "RNA_snn_res.0.05", label = T)& NoAxes(),
                    DimPlot(seob, reduction = "tsne", group.by = "RNA_snn_res.0.1", label = T) & NoAxes(),
                    DimPlot(seob, reduction = "tsne", group.by = "RNA_snn_res.0.2", label = T)& NoAxes(),
                    DimPlot(seob, reduction = "tsne", group.by = "RNA_snn_res.0.3", label = T)& NoAxes(),
                    DimPlot(seob, reduction = "tsne", group.by = "RNA_snn_res.0.5", label = T) & NoAxes(),
                    DimPlot(seob, reduction = "tsne", group.by = "RNA_snn_res.0.8", label = T) & NoAxes(), 
                    DimPlot(seob, reduction = "tsne", group.by = "RNA_snn_res.1", label = T) & NoAxes()
  )
  ggsave(cluster_tsne, 
         filename = paste0("../picture/2_PCA_UMAP_TSNE/",name,"_cluster_tsne_all_res_by_CCA.pdf"), 
         width = 16, height = 8)
  
  sce <- seob
  save(sce,file = paste0("../subdata/",name,"_after.Rdata"))
}

Normal

elay_KIC

Late-KIC

细胞亚群注释:

# 细胞亚群注释 Normal------------------------------------------------------------------
rm(list = ls())
load("../subdata/normal_after.Rdata")
p_umap <- DimPlot(sce,label = T)
p_umap
# paper -------------------------------------------------------------------
genes_to_check = c("Amy1","Amy2a2","Pyy","Sst","Sox9","Krt18","Col1a1","Col1a2","Cd14","Adgre1","Cd3d","Cd3e","Cd19","Cd79b")
# 该基因名称(大小写转换)----
genes_to_check=str_to_title(genes_to_check)
genes_to_check
p_all_markers <- DotPlot(sce, features = genes_to_check,
                         group.by = "RNA_snn_res.1",assay='RNA'  )  + coord_flip()
p_all_markers
ggsave(plot=p_all_markers, filename="../picture/Annotation/normal_check_all_marker_by_CCA_res_0.8.pdf")
p1 <- plot_grid(p_all_markers,p_umap,rel_widths = c(1.5,1))
p1
ggsave(p1,filename = '../picture/Annotation/normal_paper_markers_umap_by_CCA_res.0.8.pdf',units = "cm",width = 35,height = 18)
# 注释细胞亚群 ------------------------------------------------------------------
###### step6: 细胞类型注释 (根据marker gene) ######  
sce.all <- sce
celltype=data.frame(ClusterID=0:11,
                    celltype='NA')
celltype[celltype$ClusterID %in% c( 0,1,2,11 ),2]='Acinar cells'   
celltype[celltype$ClusterID %in% c( 9 ),2]='Islet & Ductal cell'
celltype[celltype$ClusterID %in% c( 3),2]='Fibroblasts_1'
celltype[celltype$ClusterID %in% c( 4),2]='Fibroblasts_2'  
celltype[celltype$ClusterID %in% c( 10 ),2]='Fibroblasts_3' 
celltype[celltype$ClusterID %in% c( 8 ),2]='Macrophages'
celltype[celltype$ClusterID %in% c( 6,7),2]='T cells' 
celltype[celltype$ClusterID %in% c( 5 ),2]='B cells'   
head(celltype)
celltype 
table(celltype$celltype)
sce.all@meta.data$celltype = "NA"
for(i in 1:nrow(celltype)){
  sce.all@meta.data[which(sce.all@meta.data$RNA_snn_res.1 == celltype$ClusterID[i]),'celltype'] <- celltype$celltype[i]}
table(sce.all@meta.data$celltype)
# 可视化 ---------------------------------------------------------------------
p1 <- DimPlot(sce.all,group.by = "celltype",label = T,reduction = "tsne",pt.size = 1.5)
ggsave(p1,filename = "../picture/Annotation/normal_tsne.pdf")
p2 <- VlnPlot(sce.all,group.by = "celltype",features = genes_to_check,stack = T,flip = T)
ggsave(p2,filename = "../picture/Annotation/normal_vlnplot.pdf")
sce.normal <- sce.all
save(sce.normal,p1,p2,file = "../picture/Annotation/figure.Rdata")

# 细胞亚群注释 early.KIC------------------------------------------------------------------
rm(list = ls())
load("../subdata/early.KIC_after.Rdata")
p_umap <- DimPlot(sce,label = T)
p_umap
# paper -------------------------------------------------------------------
genes_to_check = c("Amy1","Amy2a2","Pyy","Sst","Ins1","Ins2","Sox9","Krt18","Col1a1","Col1a2","Cd52","Cd14","Cdh5","Pecam1")
# 该基因名称(大小写转换)----
genes_to_check=str_to_title(genes_to_check)
genes_to_check
p_all_markers <- DotPlot(sce, features = genes_to_check,
                         group.by = "RNA_snn_res.1",assay='RNA'  )  + coord_flip()
p_all_markers
ggsave(plot=p_all_markers, filename="../picture/Annotation/early.KIC_check_all_marker_by_CCA_res_0.8.pdf")

p1 <- plot_grid(p_all_markers,p_umap,rel_widths = c(1.5,1))
p1
ggsave(p1,filename = '../picture/Annotation/early.KIC_paper_markers_umap_by_CCA_res.0.8.pdf',units = "cm",width = 35,height = 18)
# 注释细胞亚群 ------------------------------------------------------------------
###### step6: 细胞类型注释 (根据marker gene) ######  
sce.all <- sce
celltype=data.frame(ClusterID=0:15,
                    celltype='NA')
celltype[celltype$ClusterID %in% c( 4,15),2]='Acinar cells'   
celltype[celltype$ClusterID %in% c( 11,13),2]='Islet & Ductal cell'
celltype[celltype$ClusterID %in% c( 0,1,5),2]='Fibroblasts_1'
celltype[celltype$ClusterID %in% c( 8),2]='Fibroblasts_2'  
celltype[celltype$ClusterID %in% c( 14 ),2]='Fibroblasts_3' 
celltype[celltype$ClusterID %in% c( 3,7,6,9),2]='Macrophages'
celltype[celltype$ClusterID %in% c( 2),2]='Cancer cell' 
celltype[celltype$ClusterID %in% c( 12 ),2]='Beta cells'   
celltype[celltype$ClusterID %in% c( 10 ),2]='Endothelial cells'
head(celltype)
celltype 
table(celltype$celltype)
sce.all@meta.data$celltype = "NA"
for(i in 1:nrow(celltype)){
  sce.all@meta.data[which(sce.all@meta.data$RNA_snn_res.1 == celltype$ClusterID[i]),'celltype'] <- celltype$celltype[i]}
table(sce.all@meta.data$celltype)

# 可视化 ---------------------------------------------------------------------

p3 <- DimPlot(sce.all,group.by = "celltype",label = T,reduction = "tsne",pt.size = 1.5)
p3
ggsave(p3,filename = "../picture/Annotation/early.KIC_tsne.pdf")
p4 <- VlnPlot(sce.all,group.by = "celltype",features = genes_to_check,stack = T,flip = T)
ggsave(p4,filename = "../picture/Annotation/early.KIC_vlnPlot.pdf")
sce.early <- sce.all
save(sce.early,p3,p4,file = "../picture/Annotation/figure2.Rdata")
# 细胞亚群注释 late.KIC------------------------------------------------------------------
rm(list = ls())
load("../subdata/late.KIC_after.Rdata")
p_umap <- DimPlot(sce,label = T)
p_umap
# paper -------------------------------------------------------------------
genes_to_check = c("Sox9","Krt18","Col1a1","Col1a2","Adgre1","Cd3d","Cd14","Cd79b","Hba-a1","Hba-a2")
# 该基因名称(大小写转换)----
genes_to_check
p_all_markers <- DotPlot(sce, features = genes_to_check,
                         group.by = "RNA_snn_res.1",assay='RNA'  )  + coord_flip()
p_all_markers
ggsave(plot=p_all_markers, filename="../picture/Annotation/late.KIC_check_all_marker_by_CCA_res_0.8.pdf")

p1 <- plot_grid(p_all_markers,p_umap,rel_widths = c(1.5,1))
p1
ggsave(p1,filename = '../picture/Annotation/late.KIC_paper_markers_umap_by_CCA_res.0.8.pdf',units = "cm",width = 35,height = 18)

# 注释细胞亚群 ------------------------------------------------------------------
###### step6: 细胞类型注释 (根据marker gene) ######  
sce.all <- sce
celltype=data.frame(ClusterID=0:11,
                    celltype='NA')
celltype[celltype$ClusterID %in% c( 0,3,6 ),2]='Cancer cells-1'   
celltype[celltype$ClusterID %in% c( 11,13),2]='Cancer cells-2'
celltype[celltype$ClusterID %in% c( 7 ),2]='Fibroblasts_1'
celltype[celltype$ClusterID %in% c( 8 ),2]='Fibroblasts_3' 
celltype[celltype$ClusterID %in% c( 1,2,4,9,10),2]='Macrophages'
celltype[celltype$ClusterID %in% c( 5),2]='Lymphocytes' 
celltype[celltype$ClusterID %in% c( 12 ),2]='Red blood cells'   
head(celltype)
celltype 
table(celltype$celltype)
sce.all@meta.data$celltype = "NA"
for(i in 1:nrow(celltype)){
  sce.all@meta.data[which(sce.all@meta.data$RNA_snn_res.1 == celltype$ClusterID[i]),'celltype'] <- celltype$celltype[i]}
table(sce.all@meta.data$celltype)

# 可视化 ---------------------------------------------------------------------

p5 <- DimPlot(sce.all,group.by = "celltype",label = T,reduction = "tsne",pt.size = 1.5)
p5
ggsave(p5,filename = "../picture/Annotation/late.KIC_tsne.pdf")
p6 <- VlnPlot(sce.all,group.by = "celltype",features = genes_to_check,stack = T,flip = T)
p6
ggsave(p6,filename = "../picture/Annotation/late.KIC_vlnPlot.pdf")
sce.late <- sce.all
save(sce.late,p5,p6,file = "../picture/Annotation/figure3.Rdata")

# 拼图 ----------------------------------------------------------------------
rm(list = ls())
load("../picture/Annotation/figure.Rdata")
load("../picture/Annotation/figure2.Rdata")
load("../picture/Annotation/figure3.Rdata")
ps <- p1/p2|p3/p4|p5/p6
ps
ggsave(ps,filename = "../picture/Annotation/umap_vlnplot.pdf")

至此,这篇文章的复现就完成了。

大家觉得我们的复现咋样呢?

虽然说上面的代码都是复制粘贴即可运行,但是如果要更好地完成上面的图表,通常是需要掌握5个R包,分别是: scater,monocle,Seurat,scran,M3Drop,需要熟练掌握它们的对象,:一些单细胞转录组R包的对象 而且分析流程也大同小异:

  • step1: 创建对象
  • step2: 质量控制
  • step3: 表达量的标准化和归一化
  • step4: 去除干扰因素(多个样本整合)
  • step5: 判断重要的基因
  • step6: 多种降维算法
  • step7: 可视化降维结果
  • step8: 多种聚类算法
  • step9: 聚类后找每个细胞亚群的标志基因
  • step10: 继续分类