单细胞水平看小鼠胰腺导管腺癌进展中的细胞异质性
下面11月份学徒的投稿
文章标题:《Cellular heterogeneity during mouse pancreatic ductal adenocarcinoma progression at single-cell resolution》
文章链接:https://insight.jci.org/articles/view/129212
文章概述:
胰腺导管腺癌 (PDA) 是癌症相关死亡的主要原因,可用的治疗选择有限。通过单细胞 RNA 测序技术在基因工程小鼠模型的 PDA 进展的不同阶段对细胞异质性进行分析。发现**除了具有增加炎症特征的不同巨噬细胞群外,癌细胞的上皮向间充质转变伴随着肿瘤进展;**正常小鼠胰腺中存在三种不同的成纤维细胞分子亚型,最终在晚期 PDA 中产生两种不同的成纤维细胞群;癌细胞和成纤维细胞可能受到表观遗传机制的动态调节。
文章复现:
此次复现的图为文章Figure 1 的B-E图,是3个单细胞样品各自独立的降维聚类分群并且进行生物学命名:
image-20211104152226887
可以看出文章中的是将Normal,early-KIC,late-KIC分别进行降维聚类分群及细胞亚群注释。接下来进行文章复现:
- 数据概览:
文章中处理的数据集是GSE125588,其中包括5个样本:
(1)数据读取质控过滤:
数据读取之前需要将每个样本的标准10X文件整理成如下格式:
image-20211105174422303
# 加载R包 -------------------------------------------------------------------
rm(list = ls())
suppressPackageStartupMessages(library(Seurat))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(clustree))
suppressPackageStartupMessages(library(cowplot))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(stringr))
# 加载数据 -------------------------------------------------------------------
dir = '../data/GSE125588/'
samples=list.files(dir)
samples
sceList = lapply(samples,function(pro){
# pro=samples[1]
folder=file.path(dir,pro)
print(pro)
print(folder)
print(list.files(folder))
sce=CreateSeuratObject(counts = Read10X(folder),
project = pro )
return(sce)
})
names(sceList)
samples
names(sceList) = samples
sce.all <- merge(sceList[[1]], y= sceList[ -1 ] ,
add.cell.ids=samples)
# 查看数据
as.data.frame(sce.all@assays$RNA@counts[1:10, 1:2])
head(sce.all@meta.data, 10)
tail(sce.all@meta.data, 10)
dim(sce.all)
# [1] 27998 11636
table(sce.all$orig.ident)
# early.KIC late.KIC late.KPC late.KPfC normal
# 3534 938 1377 3143 2644
# 添加分组信息
# sce.all$group <- ifelse(sce.all$orig.ident %in% c(),"a组","b组")
dir.create("../subdata")
save(sce.all,file = "../subdata/sce.all.Rdata")
# 质控和过滤 -------------------------------------------------------------------
# 1. 计算线粒体基因比例
# 人和鼠的基因名字稍微不一样
mito_genes=rownames(sce.all)[grep("^mt-", rownames(sce.all))]
mito_genes #13个线粒体基因
sce.all=PercentageFeatureSet(sce.all, "^mt-", col.name = "percent_mito")
fivenum(sce.all@meta.data$percent_mito)
# 2. 计算核糖体基因比例
ribo_genes=rownames(sce.all)[grep("^Rp[sl]", rownames(sce.all),ignore.case = T)]
ribo_genes
sce.all=PercentageFeatureSet(sce.all, "^Rp[sl]", col.name = "percent_ribo")
fivenum(sce.all@meta.data$percent_ribo)
# 3. 计算红血细胞基因比例
hb_genes <- rownames(sce.all)[grep("^Hb[^(p)]", rownames(sce.all),ignore.case = T)]
hb_genes
sce.all=PercentageFeatureSet(sce.all, "^Hb[^(P)]", col.name = "percent_hb")
fivenum(sce.all@meta.data$percent_hb)
# 过滤指标1:最少表达基因数的细胞&最少表达细胞数的基因
selected_c <- WhichCells(sce.all, expression = nFeature_RNA >300) # 每个细胞中基因表达>300
selected_f <- rownames(sce.all)[Matrix::rowSums(sce.all@assays$RNA@counts > 0 ) > 3] # 至少在3个细胞中表达
sce.all.filt <- subset(sce.all, features = selected_f, cells = selected_c)
# 过滤指标2:线粒体/核糖体基因比例(根据上面的violin图)
selected_mito <- WhichCells(sce.all.filt, expression = percent_mito < 15)
selected_ribo <- WhichCells(sce.all.filt, expression = percent_ribo > 1)
selected_hb <- WhichCells(sce.all.filt, expression = percent_hb < 0.5)
sce.all.filt <- subset(sce.all.filt, cells = selected_mito)
sce.all.filt <- subset(sce.all.filt, cells = selected_ribo)
sce.all.filt <- subset(sce.all.filt, cells = selected_hb)
dim(sce.all.filt)
# [1] 17935 9621
table(sce.all$orig.ident)
# early.KIC late.KIC late.KPC late.KPfC normal
# 3534 938 1377 3143 2644
table(sce.all.filt$orig.ident)
# early.KIC late.KIC late.KPC late.KPfC normal
# 3379 765 917 2632 1928
(2)拆分细胞亚群:
sce.all.list <- SplitObject(sce, split.by = "orig.ident") # "orig.ident" 中包含了样本信息
sce.all.list
names(sce.all.list)
for (i in names(sce.all.list)) {
print(i)
epi_mat = sce.all.list[[i]]@assays$RNA@counts
epi_phe = sce.all.list[[i]]@meta.data
sce=CreateSeuratObject(counts = epi_mat,
meta.data = epi_phe )
sce
table(sce@meta.data$orig.ident)
save(sce,file = paste0("../subcluster/",i,'.Rdata'))
}
(3)进行降维聚类分群
# 循环进行降维聚类分群 --------------------------------------------------------------
names <- c("normal","early.KIC","late.KIC")
for (name in names) {
# name <- "normal"
print(name)
# 降维聚类分析 ------------------------------------------------------------------
load(paste0("../subcluster/",name,".Rdata"))
#数据预处理
sce <- NormalizeData(sce)
sce = ScaleData(sce,
features = rownames(sce),
# vars.to.regress = c("nFeature_RNA", "percent_mito")
) #加上vars.to.regress参数后运行so slow!!!
# 降维处理
sce <- FindVariableFeatures(sce, nfeatures = 3000)
sce <- RunPCA(sce, npcs = 30)
sce <- RunTSNE(sce, dims = 1:30)
sce <- RunUMAP(sce, dims = 1:30)
# CCA 聚类分析------------------------------------------------------------------
seob <- sce
seob <- FindNeighbors(seob, dims = 1:30)
seob <- FindClusters(seob, graph.name = "RNA_snn",
resolution = c(0.01, 0.05, 0.1, 0.2, 0.3, 0.5,0.8,1),
algorithm = 1)
# CCA分群数据----
#聚类树可视化
p2_tree <- clustree(seob@meta.data, prefix = "RNA_snn_res.")
p2_tree
ggsave(p2_tree, filename = paste0("../picture/2_PCA_UMAP_TSNE/",name,"_cluster_tree_by_CCA.pdf"), width = 10, height = 8)
#umap可视化
cluster_umap <- plot_grid(ncol = 4,
DimPlot(seob, reduction = "umap", group.by = "RNA_snn_res.0.01", label = T) & NoAxes(),
DimPlot(seob, reduction = "umap", group.by = "RNA_snn_res.0.05", label = T)& NoAxes(),
DimPlot(seob, reduction = "umap", group.by = "RNA_snn_res.0.1", label = T) & NoAxes(),
DimPlot(seob, reduction = "umap", group.by = "RNA_snn_res.0.2", label = T)& NoAxes(),
DimPlot(seob, reduction = "umap", group.by = "RNA_snn_res.0.3", label = T)& NoAxes(),
DimPlot(seob, reduction = "umap", group.by = "RNA_snn_res.0.5", label = T) & NoAxes(),
DimPlot(seob, reduction = "umap", group.by = "RNA_snn_res.0.8", label = T) & NoAxes(),
DimPlot(seob, reduction = "umap", group.by = "RNA_snn_res.1", label = T) & NoAxes()
)
ggsave(cluster_umap, filename = paste0("../picture/2_PCA_UMAP_TSNE/",name,"_cluster_umap_all_res_by_CCA.pdf"), width = 16, height = 8)
cluster_tsne <- plot_grid(ncol = 4,
DimPlot(seob, reduction = "tsne", group.by = "RNA_snn_res.0.01", label = T) & NoAxes(),
DimPlot(seob, reduction = "tsne", group.by = "RNA_snn_res.0.05", label = T)& NoAxes(),
DimPlot(seob, reduction = "tsne", group.by = "RNA_snn_res.0.1", label = T) & NoAxes(),
DimPlot(seob, reduction = "tsne", group.by = "RNA_snn_res.0.2", label = T)& NoAxes(),
DimPlot(seob, reduction = "tsne", group.by = "RNA_snn_res.0.3", label = T)& NoAxes(),
DimPlot(seob, reduction = "tsne", group.by = "RNA_snn_res.0.5", label = T) & NoAxes(),
DimPlot(seob, reduction = "tsne", group.by = "RNA_snn_res.0.8", label = T) & NoAxes(),
DimPlot(seob, reduction = "tsne", group.by = "RNA_snn_res.1", label = T) & NoAxes()
)
ggsave(cluster_tsne,
filename = paste0("../picture/2_PCA_UMAP_TSNE/",name,"_cluster_tsne_all_res_by_CCA.pdf"),
width = 16, height = 8)
sce <- seob
save(sce,file = paste0("../subdata/",name,"_after.Rdata"))
}
Normal
elay_KIC
Late-KIC
细胞亚群注释:
# 细胞亚群注释 Normal------------------------------------------------------------------
rm(list = ls())
load("../subdata/normal_after.Rdata")
p_umap <- DimPlot(sce,label = T)
p_umap
# paper -------------------------------------------------------------------
genes_to_check = c("Amy1","Amy2a2","Pyy","Sst","Sox9","Krt18","Col1a1","Col1a2","Cd14","Adgre1","Cd3d","Cd3e","Cd19","Cd79b")
# 该基因名称(大小写转换)----
genes_to_check=str_to_title(genes_to_check)
genes_to_check
p_all_markers <- DotPlot(sce, features = genes_to_check,
group.by = "RNA_snn_res.1",assay='RNA' ) + coord_flip()
p_all_markers
ggsave(plot=p_all_markers, filename="../picture/Annotation/normal_check_all_marker_by_CCA_res_0.8.pdf")
p1 <- plot_grid(p_all_markers,p_umap,rel_widths = c(1.5,1))
p1
ggsave(p1,filename = '../picture/Annotation/normal_paper_markers_umap_by_CCA_res.0.8.pdf',units = "cm",width = 35,height = 18)
# 注释细胞亚群 ------------------------------------------------------------------
###### step6: 细胞类型注释 (根据marker gene) ######
sce.all <- sce
celltype=data.frame(ClusterID=0:11,
celltype='NA')
celltype[celltype$ClusterID %in% c( 0,1,2,11 ),2]='Acinar cells'
celltype[celltype$ClusterID %in% c( 9 ),2]='Islet & Ductal cell'
celltype[celltype$ClusterID %in% c( 3),2]='Fibroblasts_1'
celltype[celltype$ClusterID %in% c( 4),2]='Fibroblasts_2'
celltype[celltype$ClusterID %in% c( 10 ),2]='Fibroblasts_3'
celltype[celltype$ClusterID %in% c( 8 ),2]='Macrophages'
celltype[celltype$ClusterID %in% c( 6,7),2]='T cells'
celltype[celltype$ClusterID %in% c( 5 ),2]='B cells'
head(celltype)
celltype
table(celltype$celltype)
sce.all@meta.data$celltype = "NA"
for(i in 1:nrow(celltype)){
sce.all@meta.data[which(sce.all@meta.data$RNA_snn_res.1 == celltype$ClusterID[i]),'celltype'] <- celltype$celltype[i]}
table(sce.all@meta.data$celltype)
# 可视化 ---------------------------------------------------------------------
p1 <- DimPlot(sce.all,group.by = "celltype",label = T,reduction = "tsne",pt.size = 1.5)
ggsave(p1,filename = "../picture/Annotation/normal_tsne.pdf")
p2 <- VlnPlot(sce.all,group.by = "celltype",features = genes_to_check,stack = T,flip = T)
ggsave(p2,filename = "../picture/Annotation/normal_vlnplot.pdf")
sce.normal <- sce.all
save(sce.normal,p1,p2,file = "../picture/Annotation/figure.Rdata")
# 细胞亚群注释 early.KIC------------------------------------------------------------------
rm(list = ls())
load("../subdata/early.KIC_after.Rdata")
p_umap <- DimPlot(sce,label = T)
p_umap
# paper -------------------------------------------------------------------
genes_to_check = c("Amy1","Amy2a2","Pyy","Sst","Ins1","Ins2","Sox9","Krt18","Col1a1","Col1a2","Cd52","Cd14","Cdh5","Pecam1")
# 该基因名称(大小写转换)----
genes_to_check=str_to_title(genes_to_check)
genes_to_check
p_all_markers <- DotPlot(sce, features = genes_to_check,
group.by = "RNA_snn_res.1",assay='RNA' ) + coord_flip()
p_all_markers
ggsave(plot=p_all_markers, filename="../picture/Annotation/early.KIC_check_all_marker_by_CCA_res_0.8.pdf")
p1 <- plot_grid(p_all_markers,p_umap,rel_widths = c(1.5,1))
p1
ggsave(p1,filename = '../picture/Annotation/early.KIC_paper_markers_umap_by_CCA_res.0.8.pdf',units = "cm",width = 35,height = 18)
# 注释细胞亚群 ------------------------------------------------------------------
###### step6: 细胞类型注释 (根据marker gene) ######
sce.all <- sce
celltype=data.frame(ClusterID=0:15,
celltype='NA')
celltype[celltype$ClusterID %in% c( 4,15),2]='Acinar cells'
celltype[celltype$ClusterID %in% c( 11,13),2]='Islet & Ductal cell'
celltype[celltype$ClusterID %in% c( 0,1,5),2]='Fibroblasts_1'
celltype[celltype$ClusterID %in% c( 8),2]='Fibroblasts_2'
celltype[celltype$ClusterID %in% c( 14 ),2]='Fibroblasts_3'
celltype[celltype$ClusterID %in% c( 3,7,6,9),2]='Macrophages'
celltype[celltype$ClusterID %in% c( 2),2]='Cancer cell'
celltype[celltype$ClusterID %in% c( 12 ),2]='Beta cells'
celltype[celltype$ClusterID %in% c( 10 ),2]='Endothelial cells'
head(celltype)
celltype
table(celltype$celltype)
sce.all@meta.data$celltype = "NA"
for(i in 1:nrow(celltype)){
sce.all@meta.data[which(sce.all@meta.data$RNA_snn_res.1 == celltype$ClusterID[i]),'celltype'] <- celltype$celltype[i]}
table(sce.all@meta.data$celltype)
# 可视化 ---------------------------------------------------------------------
p3 <- DimPlot(sce.all,group.by = "celltype",label = T,reduction = "tsne",pt.size = 1.5)
p3
ggsave(p3,filename = "../picture/Annotation/early.KIC_tsne.pdf")
p4 <- VlnPlot(sce.all,group.by = "celltype",features = genes_to_check,stack = T,flip = T)
ggsave(p4,filename = "../picture/Annotation/early.KIC_vlnPlot.pdf")
sce.early <- sce.all
save(sce.early,p3,p4,file = "../picture/Annotation/figure2.Rdata")
# 细胞亚群注释 late.KIC------------------------------------------------------------------
rm(list = ls())
load("../subdata/late.KIC_after.Rdata")
p_umap <- DimPlot(sce,label = T)
p_umap
# paper -------------------------------------------------------------------
genes_to_check = c("Sox9","Krt18","Col1a1","Col1a2","Adgre1","Cd3d","Cd14","Cd79b","Hba-a1","Hba-a2")
# 该基因名称(大小写转换)----
genes_to_check
p_all_markers <- DotPlot(sce, features = genes_to_check,
group.by = "RNA_snn_res.1",assay='RNA' ) + coord_flip()
p_all_markers
ggsave(plot=p_all_markers, filename="../picture/Annotation/late.KIC_check_all_marker_by_CCA_res_0.8.pdf")
p1 <- plot_grid(p_all_markers,p_umap,rel_widths = c(1.5,1))
p1
ggsave(p1,filename = '../picture/Annotation/late.KIC_paper_markers_umap_by_CCA_res.0.8.pdf',units = "cm",width = 35,height = 18)
# 注释细胞亚群 ------------------------------------------------------------------
###### step6: 细胞类型注释 (根据marker gene) ######
sce.all <- sce
celltype=data.frame(ClusterID=0:11,
celltype='NA')
celltype[celltype$ClusterID %in% c( 0,3,6 ),2]='Cancer cells-1'
celltype[celltype$ClusterID %in% c( 11,13),2]='Cancer cells-2'
celltype[celltype$ClusterID %in% c( 7 ),2]='Fibroblasts_1'
celltype[celltype$ClusterID %in% c( 8 ),2]='Fibroblasts_3'
celltype[celltype$ClusterID %in% c( 1,2,4,9,10),2]='Macrophages'
celltype[celltype$ClusterID %in% c( 5),2]='Lymphocytes'
celltype[celltype$ClusterID %in% c( 12 ),2]='Red blood cells'
head(celltype)
celltype
table(celltype$celltype)
sce.all@meta.data$celltype = "NA"
for(i in 1:nrow(celltype)){
sce.all@meta.data[which(sce.all@meta.data$RNA_snn_res.1 == celltype$ClusterID[i]),'celltype'] <- celltype$celltype[i]}
table(sce.all@meta.data$celltype)
# 可视化 ---------------------------------------------------------------------
p5 <- DimPlot(sce.all,group.by = "celltype",label = T,reduction = "tsne",pt.size = 1.5)
p5
ggsave(p5,filename = "../picture/Annotation/late.KIC_tsne.pdf")
p6 <- VlnPlot(sce.all,group.by = "celltype",features = genes_to_check,stack = T,flip = T)
p6
ggsave(p6,filename = "../picture/Annotation/late.KIC_vlnPlot.pdf")
sce.late <- sce.all
save(sce.late,p5,p6,file = "../picture/Annotation/figure3.Rdata")
# 拼图 ----------------------------------------------------------------------
rm(list = ls())
load("../picture/Annotation/figure.Rdata")
load("../picture/Annotation/figure2.Rdata")
load("../picture/Annotation/figure3.Rdata")
ps <- p1/p2|p3/p4|p5/p6
ps
ggsave(ps,filename = "../picture/Annotation/umap_vlnplot.pdf")
至此,这篇文章的复现就完成了。
大家觉得我们的复现咋样呢?
虽然说上面的代码都是复制粘贴即可运行,但是如果要更好地完成上面的图表,通常是需要掌握5个R包,分别是: scater,monocle,Seurat,scran,M3Drop,需要熟练掌握它们的对象,:一些单细胞转录组R包的对象 而且分析流程也大同小异:
- step1: 创建对象
- step2: 质量控制
- step3: 表达量的标准化和归一化
- step4: 去除干扰因素(多个样本整合)
- step5: 判断重要的基因
- step6: 多种降维算法
- step7: 可视化降维结果
- step8: 多种聚类算法
- step9: 聚类后找每个细胞亚群的标志基因
- step10: 继续分类
相关文章
- 直接在代码里面对list集合进行分页
- .NET Framework 4.5新特性详解
- 大数据的简要介绍
- 大数据的由来
- 高斯混合模型的自然梯度变量推理
- timing-wheel 仿Kafka实现的时间轮算法
- 使用Navicat软件连接自建数据库(Linux系统)
- 那一天,我被Redis主从架构支配的恐惧
- Redis 深入了解键的过期时间
- C#使用委托调用实现用户端等待闪屏
- 基于流计算 Oceanus 和 Elasticsearch Service 构建百亿级实时监控系统
- GRAND | 转录调控网络预测数据库
- JFreeChart API中文文档
- 临床相关突变查询数据库
- TIGER | 人类胰岛基因变化查询数据库
- 视频边缘计算网关EasyNVR在视频整体监控解决方案中的应用分析
- Apache Arrow - 大数据在数据湖后的下一个风向标
- 常见的电商数据指标体系
- AKShare-艺人数据-艺人流量价值
- MySQL中多表联合查询与子查询的这些区别,你可能不知道!