WGCNA代码

WGCNA

rm(list = ls())

#加载R包

library(WGCNA)

library(tidyr)

library(ggplot2)

library(gridExtra)

setwd("D:\\项目探索\\水稻EDP\\文献\\WGCNA数据结果20190426")

#=================================================================#

# 1.导入表达数据

#=================================================================#

gene_exp <- read.table('D:\\项目探索\\水稻EDP\\文献\\WGCNA数据结果20190426\\DSC.txt',

sep = '\t',

header = T,

stringsAsFactors = F,

#第一列作为行名

row.names = 1)

gene_exp[is.na(gene_exp)] <- 0 # 将NA转换为0

# 根据方差筛选

m.vars=apply(gene_exp,1,var)

expro.upper=gene_exp[which(m.vars>quantile(m.vars, probs = seq(0, 1, 0.25))[4]),]

dim(expro.upper)

datExpr <- t(expro.upper) # 在WGCNA中需要转置表达矩阵

gsg = goodSamplesGenes(datExpr,verbose = 3)

gsg$allOK

#查看行和列

dim(datExpr)

# 聚类分析是否存在离群值

sampleTree = hclust(dist(datExpr),method = "average")

sizeGrWindow(12,9)

par(cex=0.6)

par(mar = c(0,4,2,0))

plot(sampleTree,

main = "Sample clustering to detect outliners",

sub = "",xlab = "",cex.lab=1.5,

cex.axis=1.5,cex.main = 2)

# 存在离群样本时可以剔除

if(F){abline(h = 15,col = "red")

clust = cutreeStatic(sampleTree,cutHeight = 15,minSize = 10)

table(clust)

keepSamples = (clust==1)

datExpr =datExpr0[keepSamples,]

nGenes = ncol(datExpr)

nSamples = nrow(datExpr)

}

#=================================================================#

# 2.寻找最佳β值

#=================================================================#

# 选择并构建一个软阈值向量

powers = c(c(1:10),seq(from = 12,to = 20,by=2)

# 执行TOM功能

sft = pickSoftThreshold(datExpr,powerVector = powers,verbose = 5)

# 绘图

sizeGrWindow(9,5)

par(mfrow = c(1,2)) # 一页多图

cex1 = 0.9

# Scale-free topology fit index as a function of the soft-thresholding power

plot(sft$fitIndices[,1],-sign(sft$fitIndices[,3])*sft$fitIndices[,2],

xlab="Soft Threshold(power)",ylab = "Scale Free Topology Model Fit,signed R^2",type="n",

main = paste("Scale independence"))

text(sft$fitIndices[,1],-sign(sft$fitIndices[,3])*sft$fitIndices[,2],labels=powers,cex=cex1,col="red")

abline(h = 0.90,col = "red")

# Mean connectivity as a function of the soft-thresholding power

plot(sft$fitIndices[,1],sft$fitIndices[,5],xlab="Soft Threshold (power)",ylab ="Mean Connectivity",type = "n",main = paste("Mean connectivity"))

text(sft$fitIndices[,1],sft$fitIndices[,5],labels=powers,cex = cex1,col="red")

#=================================================================#

# 3.一步法构建网络模块

#3.1 minModuleSize指定Module所含基因数最少

#3.2 mergeCutHeight指定合并Module的阈值

#3.3 列表net中包含了许多信息

#3.4 可以通过recutBlockwiseTrees函数来修改模块的一些参数，而不用重新计算聚类树

#=================================================================#

net = blockwiseModules(datExpr,power = sft$powerEstimate,

TOMType = "unsigned",minModuleSize = 30,reassignThreshold = 0,mergeCutHeight = 0.25,

numericLabels =TRUE,pamRespectsDendro = FALSE,saveTOMs = TRUE,saveTOMFileBase = "RiceTOM",

verbose = 3)

table(net$colors)

# 绘图

sizeGrWindow(12,9)

# 将标签转换为颜色

mergedColors = labels2colors(net$colors)

# 绘制树图及模块颜色图

plotDendroAndColors(net$dendrograms[[1]],mergeColors[net$blockGenes[[1]],"Module colors",dendroLabels = FALSE,hang = 0.03,addGuide = TRUE,guideHang = 0.05)

# 保存相关数据

moduleLabels = net$colors

moduleColors = labels2colors(net$colors)

MEs = net$MEs

geneTree = net$dendrograms[[1:2]] # 可能会存在基因较多，默认将基因分成了两部分

#=================================================================#

# 3.步步法构建

# 3.构建网络

# 3.1.计算相关系数

# 3.2.计算邻接矩阵

# 3.3.计算TOM矩阵

# 3.4.聚类并且划分模块

# 3.5.合并相似模块

#=================================================================#

# 计算邻接矩阵

softpower = sft$powerEstimate

adjacency = adjacency(datExpr,power = softPower)

# 通过邻接矩阵计算TOM矩阵

TOM = TOMsimilarity(adjacency)

dissTOM = 1 - TOM # 此为距离矩阵

# 通过距离矩阵进行聚类分析基因

geneTree = hclust(as.dist(dissTOM),method = "average")

sizeGrWindow(12,9)

plot(geneTree,xlab="",sub="",main="Gene clustering on TOM-based dissimilarity",labels=FALSE,hang = 0.04)

# 设置模块中基因最小数

minModuleSize = 30

# 通过动态树剪切鉴定出模块

dynamicMods = cutreeDynamic(dendro = geneTree,disM = dissTOM,deepSplit = 2,pamRespectsDendro = FALSE,minClusterSize = minModuleSize)

table(dynamicMods)

# 绘图

dynamicColors = labels2colors(dynamicMods)

table(dynamicColors)

sizeGrWindow(8,6)

plotDendroAndColors(geneTree,dynamicColors,"Dynamic Tree Cut",dendroLabels = FALSE,hang = 0.03,addGuide = TRUE,guideHang = 0.05,main = " Gene dendrogram and module colors")

# 合并表达近似的Modules

# 计算Modules的特征基因

MEList = moduleEigengenes(datExpr,color = dynamicColors)

MEs = MEList$eigengenes

# 计算Modules之间的相关性

MEDiss = 1 - cor(MEs)

# 聚类特征基因

METree = hclust(as.dist(MEDiss),method = "average")

# 绘图

sizeGrWindow(7,6)

plot(METree,main = "Clustering of eigengenes",xlab="",sub="")

# 自动合并

MEDissThres = 0.25

abline(h=MEDissThres,col = "red")

merge = mergeCloseModules(datExpr,dynamicColors,cutHeight = MEDissThres,verbose = 3)

mergedColors = merge$colors

mergedMEs = merge$newMEs

# 绘图对比合并前后模块

sizeGrWindow(12,9)

plotDendroAndColors(geneTree,cbind(dynamicColors,mergedColors),c("Dynamic Tree Cut","Merged dynamic"),dendroLabels = FALSE,hang =0.03,addGuide = TRUE,guideHang = 0.05)

# 保存相关的变量数据

moduleColors = mergedColors

# 转换为数字标签

colorOrder = c("grey",standardColors(50))

moduleLabels = match(moduleColors,colorOrder)-1

MEs = mergedMEs

#=================================================================#

# 3.大数据集处理

# 基本思想：使用两层聚类

#=================================================================#

# Block-wise network construction and module detection

bwnet <- blockwiseModules(datExpr,maxBlockSize = 2000, # 指定电脑最大可运行的模块

power = sft$powerEstimate,TOMtype ="unsigned",minModuleSize = 30,

reassignThreshold = 0, mergeCutHeight = 0.25,numericLabels = TRUE,

saveTOMs = TRUE,saveTOMFileBase = "RiceTOM-blockwise",

verbose = 3)

bwLabels = matchLabels(bwnet$colors,moduleLabels)

bwModuleColors = labels2colors(bwLabels)

table(bwLabels)

# 绘图

sizeGrWindow(6,6)

# block1绘图

plotDendroAndColors(bwnet$dendrograms[[1]], bwModuleColors[bwnet$blockGenes[[1]]], "Module colors", main = "Gene dendrogram and module colors in block 1", dendroLabels = FALSE, hang = 0.03, addGuide = TRUE, guideHang = 0.05)

# block2绘图

plotDendroAndColors(bwnet$dendrograms[[2]], bwModuleColors[bwnet$blockGenes[[2]]], "Module colors", main = "Gene dendrogram and module colors in block 2", dendroLabels = FALSE, hang = 0.03, addGuide = TRUE, guideHang = 0.05)

# 绘图比较single block和block-wise的区别

sizeGrWindow(12,9)

plotDendroAndColors(geneTree, cbind(moduleColors, bwModuleColors), c("Single block", "2 blocks"), main = "Single block gene dendrogram and module colors", dendroLabels = FALSE, hang = 0.03, addGuide = TRUE, guideHang = 0.05)

# 通过Eigengenes比较single block和block-wise的区别

singleBlockMEs = moduleEigengenes(datExpr, moduleColors)$eigengenes

blockwiseMEs = moduleEigengenes(datExpr, bwModuleColors)$eigengenes

single2blockwise = match(names(singleBlockMEs), names(blockwiseMEs))

signif(diag(cor(blockwiseMEs[, single2blockwise], singleBlockMEs)), 3)

#=================================================================#

# 4.导入性状数据

#=================================================================#

traitData = read.csv("ClinicalTraits.csv")

dim(traitData)

names(traitData)

# 与表达数据进行校准后构建性状数据框

RiceSamples = rownames(datExpr)

traitRows = match(RiceSamples, allTraits$Sample)

datTraits = allTraits[traitRows, -1]

rownames(datTraits) = allTraits[traitRows, 1]

#=================================================================#

# 5.模块与性状的相关关系

#=================================================================#

# 量化Module-trait的关系

nGenes = ncol(datExpr)

nSamples = nrow(datExpr)

MEs0 = moduleEigengenes(datExpr,moduleColors)$eigengenes

MEs = orderMEs(MEs0)

moduleTraitCor = cor(MEs,datTraits,use = "p")

moduleTraitPvalue = corPvalueStudent(moduleTrsitCor,nSamples)

# 绘图

sizeGrWindow(10,6)

textMatrix = paste(signif(moduleTraitCor, 2), "\n(", signif(moduleTraitPvalue, 1), ")", sep = "")

dim(textMatrix) = dim(moduleTraitCor)

par(mar = c(6, 8.5, 3, 3))

labeledHeatmap(Matrix = moduleTraitCor, xLabels = names(datTraits), yLabels = names(MEs), ySymbols = names(MEs), colorLabels = FALSE, colors = greenWhiteRed(50), textMatrix = textMatrix, setStdMargins = FALSE, cex.text = 0.5, zlim = c(-1,1), main = paste("Module-trait relationships"))

#=================================================================#

# 6.基因与模块、性状的相关关系

#=================================================================#

# 确定基因与模块的关系-MM

modNames = substring(names(MEs),3) # 提取模块数字标签

geneModuleMembership = as.data.frame（cor(datExpr,MEs,use = "p"))

MMPvalue = as.data.frame(corPvalueStudent(as.matrix(geneModuleMembership), nSamples)) # 通过计算每个Module的Eigengenes与基因的关系

names(geneModuleMembership) = paste("MM", modNames, sep="")

names(MMPvalue) = paste("p.MM", modNames, sep="")

# 提取某个特定的性状

Treatment = as.data.frame(datTraits$Treatment)

names(weight) = "Treatment"

# 确定基因与性状的关系-GS

geneTraitSignificance = as.data.frame(cor(datExpr, Treatment, use = "p"))

GSPvalue = as.data.frame(corPvalueStudent(as.matrix(geneTraitSignificance), nSamples))

names(geneTraitSignificance) = paste("GS.", names(Treatment), sep="")

names(GSPvalue) = paste("p.GS.", names(Treatment), sep="")

#=================================================================#

# 7.获取与性状关联度大的特定模块中GS高且MM高的基因

#=================================================================#

# 绘制特定模块GS和MM热图

module = "saddlebrown"

column = match(module,modNames)

moduleGenes = moduleColors==module # 获取特定列的数据，只是一种数据提取方法而已

sizeGrWindow(7, 7)

par(mfrow = c(1,1))

verboseScatterplot(abs(geneModuleMembership[moduleGenes, column]), abs(geneTraitSignificance[moduleGenes, 1]), xlab = paste("Module Membership in", module, "module"), ylab = "Gene significance for body weight", main = paste("Module membership vs. gene significance\n"), cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = module)

# 获取特定模块的特定基因

probes = names(datExpr)[moduleColors==module]

# 输出总体结果数据框

geneInfo0 = data.frame(substanceBXH = probes, geneSymbol = annot$gene_symbol[probes2annot], LocusLinkID = annot$LocusLinkID[probes2annot], moduleColor = moduleColors, geneTraitSignificance, GSPvalue)

# 根据与性状显著相关进行排序

modOrder = order(-abs(cor(MEs, weight, use = "p")))

# 添加模块关系信息

for (mod in 1:ncol(geneModuleMembership))

{ oldNames = names(geneInfo0) geneInfo0 = data.frame(geneInfo0, geneModuleMembership[, modOrder[mod]], MMPvalue[, modOrder[mod]]); names(geneInfo0) = c(oldNames, paste("MM.", modNames[modOrder[mod]], sep=""), paste("p.MM.", modNames[modOrder[mod]], sep="")) }

# 第一根据颜色第二根据基因性状显著性进行排序

geneOrder = order(geneInfo0$moduleColor, -abs(geneInfo0$GS.weight))

geneInfo = geneInfo0[geneOrder, ]

#=================================================================#

# 8.进行后续分析GO功能注释等

#=================================================================#

# 选择感兴趣的Modules

intModules = c("brown", "red", "salmon")

for (module in intModules)

{

# Select module probes

modGenes = (moduleColors==module)

# Get their entrez ID codes

modLLIDs = allLLIDs[modGenes]

# Write them into a file

fileName = paste("LocusLinkIDs-", module, ".txt", sep="")

write.table(as.data.frame(modLLIDs), file = fileName, row.names = FALSE, col.names = FALSE)

}

WGCNA代码

推荐阅读更多精彩内容