您现在的位置是：首页 > 数据库

当前栏目

跟着Nature Plants学数据分析：R语言WGCNA分析完整示例

2023-02-19 12:27:46 时间

论文

Plant flavones enrich rhizosphere Oxalobacteraceae to improve maize performance under nitrogen deprivation

https://www.nature.com/articles/s41477-021-00897-y#data-availability

https://github.com/PengYuMaize/Yu2021NaturePlants

https://www.youtube.com/watch?v=dwWhm78j8YU 作者还录了视频介绍

论文中提供的数据有100多个样本，这里我减少数据量，只用19个样本

表达量的count文件

image.png

最后一列是基因长度，用来计算fpkm

样本的分组文件

image.png

样本的表型

image.png

样本的表型论文中提供的数据也有很多，这里我只选取其中的一部分

代码

代码里有很多细节我也不明白，但是能够从前到后运行完，这里记录一下，细节有空再来研究吧

加载需要的R包

library(WGCNA)
enableWGCNAThreads(nThreads = 6)
#BiocManager::install("DESeq2")
library(DESeq2)

这里用到deseq2主要是用来计算fpkm的

读取三个数据

data0<-read.table("my_counts.csv",
                  header=T,
                  row.names=1,
                  sep=",")
head(data0)
sample_metadata <- read.csv(file = "sample_info.csv",
                            header = TRUE)
head(sample_metadata)

bac_traits<-read.csv("input_data/traits.csv",row.names = 1)
head(bac_traits)

利用count值计算fpkm并计算log2然后转置

dataExpr_deseq <- DESeqDataSetFromMatrix(countData = data0[,-20],
                                         colData = sample_metadata,
                                         design = ~ Zone)
mcols(dataExpr_deseq)$basepairs = data0$geneLength
fpkm_matrix = fpkm(dataExpr_deseq)
datExpr = t(log2(fpkm_matrix+1))

dim(datExpr)

对数据进行过滤

nSamples = nrow(datExpr)  # 统计样品数目
variancedatExpr=as.vector(apply(as.matrix(datExpr),2,var, na.rm=T))  #按列（基因）取方差
no.missingdatExpr=as.vector(apply(is.na(as.matrix(datExpr)),2, sum) )#按列（基因）统计缺失数目
KeepGenes= variancedatExpr>0 & no.missingdatExpr<0.1*nSamples  # 保留方差不等于0，且缺失低于10%的基因
table(KeepGenes)# 过滤统计

datExpr=datExpr[, KeepGenes]

这一步原文的代码里没有，可能是我减少了样本的原因，如果没有这一步后面后有报错

第一步是样本聚类树

sampleTree = hclust(dist(datExpr), method = "average")
pdf(file = "1-n-sampleClustering.pdf", width = 12, height = 9);
par(cex = 0.5);
par(mar = c(0,4,2,0))
plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="",
     cex.lab = 1.5,cex.axis = 1.5, cex.main = 2)
dev.off()

image.png

第二步是筛选阈值

powers = c(c(1:20), seq(from = 22, to=30, by=2))
sft = pickSoftThreshold(datExpr, powerVector = powers, verbose = 5) 
# Scale-free topology fit index as a function of the soft-thresholding power
pdf(file = "2-n-sft.pdf", width = 9, height = 5);
par(mfrow = c(1,2));
cex1 = 0.9;
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
     xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
     main = paste("Scale independence"));
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
     labels=powers,cex=cex1,col="red");
# this line corresponds to using an R^2 cut-off of h
abline(h=0.90,col="red") 
# Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[,1], sft$fitIndices[,5],
     xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
     main = paste("Mean connectivity")) 
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
dev.off()

image.png

第三步是基因聚类树

power=sft$powerEstimate
power
#TOM = TOMsimilarityFromExpr(datExpr, power = power) 这行代码要好长时间，需要比较大的内存
## long time big mem
## multiple cpu

## 这一步用linux系统的电脑设置多核心还挺快的 enableWGCNAThreads(nThreads = 4)，
## 我设置12核心很快就得到了结果
## 但是我在自己的win系统上设置多核好像没有作用
## 我这边在linux系统运行完，把结果保存下来

load("../TOM.rdata")
dissTOM = 1-TOM 
# Plot gene tree
geneTree = hclust(as.dist(dissTOM), method = "average");
pdf(file = "3-gene_cluster.pdf", width = 12, height = 9);
plot(geneTree, 
     xlab="", 
     sub="", 
     main = "Gene clustering on TOM-based dissimilarity",
     labels = FALSE, 
     hang = 0.04);
dev.off()

image.png

第四步是划分模块

dynamicMods = cutreeDynamic(dendro = geneTree, 
                            distM = dissTOM,
                            deepSplit = 2, 
                            pamRespectsDendro = FALSE,
                            minClusterSize = 30)

dynamicColors = labels2colors(dynamicMods)

pdf(file = "4-module_tree.pdf", width = 8, height = 6);
plotDendroAndColors(geneTree, 
                    dynamicColors, 
                    "Dynamic Tree Cut",
                    dendroLabels = FALSE,
                    hang = 0.03,
                    addGuide = TRUE, 
                    guideHang = 0.05,
                    main = "Gene dendrogram and module colors")
dev.off()

MEDissThres=0.25

merge = mergeCloseModules(datExpr, 
                          dynamicColors, 
                          cutHeight = MEDissThres, 
                          verbose = 3) 
mergedColors = merge$colors  
mergedMEs = merge$newMEs  
# Plot merged module tree
pdf(file = "5-merged_Module_Tree.pdf", width = 12, height = 9)  
plotDendroAndColors(geneTree, 
                    cbind(dynamicColors, mergedColors), 
                    c("Dynamic Tree Cut", "Merged dynamic"), 
                    dendroLabels = FALSE, 
                    hang = 0.03, 
                    addGuide = TRUE, 
                    guideHang = 0.05)  
dev.off()
write.table(merge$oldMEs,file="oldMEs.txt");
write.table(merge$newMEs,file="newMEs.txt");

image.png

第五步是表型和模块的相关性

nGenes = ncol(datExpr);
nSamples = nrow(datExpr);
# Recalculate MEs with color labels
MEs0 = moduleEigengenes(datExpr, mergedColors)$eigengenes
MEs = orderMEs(MEs0)

table(rownames(MEs) == rownames(bac_traits))
moduleTraitCor = cor(MEs, bac_traits, use = "p");
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples);
write.table(moduleTraitCor,file="moduleTrait_correlation.txt");
write.table(moduleTraitPvalue,file="moduleTrait_pValue.txt");

textMatrix =  paste(signif(moduleTraitCor, 2), "\n(",
                    signif(moduleTraitPvalue, 1), ")", sep = "");
dim(textMatrix) = dim(moduleTraitCor)
pdf("module-traits-bacteria-order.pdf", width = 10, height = 10)
par(mar = c(15, 12, 5, 5));
# Display the correlation values within a heatmap plot
labeledHeatmap(Matrix = moduleTraitCor,
               xLabels = names(bac_traits),
               yLabels = names(MEs),
               ySymbols = names(MEs),
               colorLabels = FALSE,
               colors = greenWhiteRed(50),
               textMatrix = textMatrix,
               setStdMargins = FALSE,
               cex.text = 0.5,
               zlim = c(-1,1),
               main = paste("Module-trait relationships"))
dev.off()

image.png

猜你喜欢

最长无重复子串
写技术博客的一些心得分享
Java 多线程（七）：线程池
Java 多线程（五）：锁（三）
Java 多线程（四）：锁（二）
Java 多线程（三）：锁（一）
Java 多线程（二）：并发编程的三大特性
线性时间非比较类排序
Java 多线程（一）：基础
合并k个已排序的链表
HDFS 高可用分布式环境搭建
合并两个有序数组
连续子数组的最大和
HDFS 分布式环境搭建
容器盛水问题
大数加法
HDFS 伪分布式环境搭建
设计LRU缓存结构
两数之和
使用单调栈来解决的一些问题

zl程序教程

当前栏目

跟着Nature Plants学数据分析：R语言WGCNA分析完整示例

论文

表达量的count文件

样本的分组文件

样本的表型

代码

加载需要的R包

读取三个数据

利用count值计算fpkm并计算log2然后转置

对数据进行过滤

第一步是样本聚类树

第二步是筛选阈值

第三步是基因聚类树

第四步是划分模块

第五步是表型和模块的相关性

相关文章