参考文献
High-resolution spatiotemporal transcriptome mapping of tomato fruit development and ripening
关于文章中WGCNA方法参数
文章中参数主要包括一下内容
① 转录组数据基因输入参数
选取变异系数CV值大于1的基因作为输入数据进行分析
② 计算软阈值参数
设定的软阈值soft power为16,TOM结构类型为signed,也是推荐的结构类型。
③ 设定最小模块基因数目参数
最小模块基因数目为30,模块设定的多少,决定分出来模块的多少
④ 相似模块合并参数
前面设定最小模块数目后,基因会分成不同的模块,计算模块之间的相关性后,相关性高的模块进行合并,文章中的参数设定为0.25,也就是说相关性高于0.75的就合并在一起。
⑤ 模块内hub基因筛选参数
模块内基因的筛选主要从两个参数进行筛选,第一个参数是模块内的基因跟模块的相关性高于0.9,然后利用FDR 相关性计算基因跟性状的相关性。
大麦WGCNA分析流程
一、转录组数据输入和质控
##导入表达量TPM数据
rm(list=ls())
library(dplyr)
library(Mfuzz)
library(WGCNA)
library(impute)
library(preprocessCore)
library(MASS)
library(class)
library(cluster)
library(impute)
library(Hmisc)
library(matrixStats)
library(flashClust)
library(dplyr)
setwd("D:\\mywork\\mywork\\10.大麦转录组\\2022年大麦分析\\")
data <- read.csv("1.TPM_data_aver_use.csv",header=T,row.names = 1,check.names = F)
##表达量数据是三个重复进行矫正过离群值后,离群值用剩余两个重复求平均值后的最终TPM结果
##基因型数据预处理
##由于soft设定R2一直小于阈值0.8,查看问题所在是
# 无向网络在power小于15或有向网络power小于30内,没有一个power值可以使无标度网络图谱结构R^2达到0.8或平均连接度降到100以下,可能是由于部分样品与其他样品差别太大造成的。这可能由批次效应、样品异质性或实验条件对表达影响太大等造成
datExprFemale2 <- log2(data+1)
cor_T <- cor(datExprFemale2)
##绘制热图
library(pheatmap)
pheatmap(cor_T,show_rownames = T,show_colnames = T,cluster_rows=T,cluster_cols = T)
##结果展示相关性好
###数据质控
m.mad <- apply(datExprFemale2,1,mad)
dataExprVar <- datExprFemale2[which(m.mad >
max(quantile(m.mad, probs=seq(0, 1, 0.25))[2],0.01)),]
dim(dataExprVar)
###去除变异系数cv<1的基因
cal_cv=function(x){ # 自定义函数 标准差/平均值
y=na.omit(x)
return(sd(y)/mean(y))
}
m.cv <- apply(dataExprVar, 1, cal_cv)
dataExprVar2 <- dataExprVar[which(m.cv > 1),]
datExprFemale = as.data.frame(t(dataExprVar2 ))
datExprFemale_D_1 = as.data.frame(datExprFemale)
# ##检验数据质量
# # 样本聚类检查离群值(就是树上特别不接近的
gsg = goodSamplesGenes(datExprFemale_D_1, verbose = 3)
head(gsg)
gsg$allOK
## 是true的话说明所有genes都通过了筛选
if (!gsg$allOK){
# Optionally, print the gene and sample names that were removed:
if (sum(!gsg$goodGenes)>0)
printFlush(paste("Removing genes:",
paste(names(datExprFemale_D_1)[!gsg$goodGenes], collapse = ",")));
if (sum(!gsg$goodSamples)>0)
printFlush(paste("Removing samples:",
paste(rownames(datExprFemale_D_1)[!gsg$goodSamples], collapse = ",")));
# Remove the offending genes and samples from the data:
datExprFemale_D_1 =datExprFemale_D_1[gsg$goodSamples, gsg$goodGenes]
}
## 画图来看是否有特别离群的
dim(datExprFemale_D_1)
##共有5210个基因进行WGCNA分析
sampleTree = hclust(dist(datExprFemale_D_1), method = "average")
##设置画板大小
sizeGrWindow(12, 9)
par(cex=0.6)
par(mar=c(0,4,2,0))
plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="")
##没有发现离群值,不需要剔除样本
二、设定软阈值
设定软阈值,
# 清理,释放内存
collectGarbage()
gene <- datExprFemale_D_1
powers = c(c(1:10), seq(from = 12, to=20, by=2))
type = "unsigned" ##自己试验设计的按照unsigned
sft <- pickSoftThreshold(gene, powerVector = powers, verbose=5 )
# 推荐值。如果是NA,就需要画图来自己挑选
setwd("./7.WGCNA_result")
pdf("soft.pdf",width = 15, height = 8)
par(mfrow = c(1, 2))
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2], type = 'n',
xlab = 'Soft Threshold (power)', ylab = 'Scale Free Topology Model Fit,signed R^2',
main = paste('Scale independence'))
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
labels = powers, col = 'red');
abline(h = 0.8, col = 'red')
#平均连通性与 power 值散点图
plot(sft$fitIndices[,1], sft$fitIndices[,5],
xlab = 'Soft Threshold (power)', ylab = 'Mean Connectivity', type = 'n',
main = paste('Mean connectivity'))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels = powers, col = 'red')
dev.off()
##构建网络
#上一步估计的最佳 power 值
write.csv(sft,"1.sft.csv")
sft$powerEstimate
###推荐的软阈值设定为9
三、构建TOM矩阵
获得邻接矩阵:
softPower <- sft$powerEstimate
adjacency = adjacency(gene, power = softPower)
# 将临近矩阵转为 Tom 矩阵
TOM = TOMsimilarity(adjacency)
# 计算基因之间的相异度
dissTOM = 1-TOM
hierTOM = hclust(as.dist(dissTOM),method="average")
ADJ1_cor <- abs(WGCNA::cor( gene,use = "p" ))^softPower
# 基因少(<5000)的时候使用下面的代码:
k <- as.vector(apply(ADJ1_cor,2,sum,na.rm=T))
# 基因多的时候使用下面的代码:
k <- softConnectivity(datE=gene,power=softPower)
sizeGrWindow(10, 5)
par(mfrow=c(1,2))
hist(k)
scaleFreePlot(k,main="Check Scale free topology\n")
# 可以看出k与p(k)成负相关(相关性系数0.84),说明选择的β值能够建立基因无尺度网络。
# 使用相异度来聚类为gene tree(聚类树):
geneTree = hclust(as.dist(dissTOM), method = "average")
# Plot the resulting clustering tree (dendrogram)
windows()
sizeGrWindow(12,9)
plot(geneTree, xlab="", sub="", main = "Gene clustering on TOM-based dissimilarity",
labels = FALSE, hang = 0.04)
# 使用动态剪切树挖掘模块:
minModuleSize = 50;
# 动态切割树:
dynamicMods = cutreeDynamic(dendro = geneTree, distM = dissTOM,
deepSplit = 2, pamRespectsDendro = FALSE,
minClusterSize = minModuleSize);
table(dynamicMods)
dynamicColors <- labels2colors(dynamicMods)
table(dynamicColors)
pdf("Gene dendrogram and module colors.pdf")
#模块颜色指代
dynamicColors <- labels2colors(dynamicMods)
table(dynamicColors)
plotDendroAndColors(geneTree, dynamicColors, 'Dynamic Tree Cut',
dendroLabels = FALSE, addGuide = TRUE, hang = 0.03, guideHang = 0.05,
main = 'Gene dendrogram and module colors')
dev.off()
四、随机挑选400个基因查看TOM结构
# 拓扑热图:
nSelect = 400
# For reproducibility, we set the random seed
set.seed(10);
nGenes
select = sample(5210, size = nSelect);
selectTOM = dissTOM[select, select];
# There's no simple way of restricting a clustering tree to a subset of genes, so we must re-cluster.
selectTree = hclust(as.dist(selectTOM), method = "average")
selectColors = dynamicColors[select];
# Open a graphical window
sizeGrWindow(9,9)
# Taking the dissimilarity to a power, say 10, makes the plot more informative by effectively changing
# the color palette; setting the diagonal to NA also improves the clarity of the plot
plotDiss = selectTOM^softPower;
diag(plotDiss) = NA;
TOMplot(plotDiss,
selectTree,
selectColors,
main = "Network heatmap plot, selected genes")
5、计算根据模块特征向量基因计算模块相异度
MEList = moduleEigengenes(gene, colors = dynamicColors)
MEs = MEList$eigengenes
# 计算根据模块特征向量基因计算模块相异度:
MEDiss = 1-cor(MEs);
# Cluster module eigengenes
METree = hclust(as.dist(MEDiss), method = "average")
# Plot the result
plotEigengeneNetworks(MEs,
"Eigengene adjacency heatmap",
marHeatmap = c(3,4,2,2),
plotDendrograms = FALSE,
xLabelsAngle = 90)
plot(METree,
main = "Clustering of module eigengenes",
xlab = "",
sub = "")
# 在聚类图中画出剪切线
abline(h=MEDissThres, col = "red")
MEDissThres = 0.2
# 在聚类图中画出剪切线
abline(h=MEDissThres, col = "red")
# 合并模块:
merge_modules = mergeCloseModules(gene, dynamicColors, cutHeight = MEDissThres, verbose = 3)
# 合并后的颜色:
mergedColors = merge_modules$colors;
# 新模块的特征向量基因:
mergedMEs = merge_modules$newMEs;
plotDendroAndColors(geneTree, cbind(dynamicColors, mergedColors),
c("Dynamic Tree Cut", "Merged dynamic"),
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05)
probes = colnames(gene)
dimnames(TOM) <- list(probes, probes)
六、计算表型和模块的相关性
trait_data <- read.table("1.trait_input.txt",header=T,sep="\t",row.names = 1)
trait <- trait_data
#使用上一步新组合的共表达模块的结果
module <- mergedMEs
#共表达模块和表型的相关性分析
moduleTraitCor <- cor(module, trait, use = 'p')
moduleTraitCor[1:6,1:3] #相关矩阵
#相关系数的 p 值矩阵
moduleTraitPvalue <- corPvalueStudent(moduleTraitCor, nrow(module))
#输出相关系数矩阵或 p 值矩阵
write.table(moduleTraitCor, 'moduleTraitCor.txt', sep = '\t', col.names = NA, quote = FALSE)
write.table(moduleTraitPvalue, 'moduleTraitPvalue.txt', sep = '\t', col.names = NA, quote = FALSE)
#相关图绘制
textMatrix <- paste(signif(moduleTraitCor, 2), '\n(', signif(moduleTraitPvalue, 1), ')', sep = '')
dim(textMatrix) <- dim(moduleTraitCor)
pdf("Module-trait relationships.pdf")
par(mar = c(5, 10, 1,1))
labeledHeatmap(Matrix = moduleTraitCor, main = paste('Module-trait relationships'),
xLabels = names(trait), yLabels = names(module), ySymbols = names(module),
colorLabels = FALSE, colors = blueWhiteRed(50), cex.text = 1, zlim = c(-1,1),
textMatrix = textMatrix, setStdMargins = FALSE)
dev.off()
gene_module <- data.frame(gene_name = colnames(gene), module = mergedColors, stringsAsFactors = FALSE)
head(gene_module)
write.csv(gene_module,"gene_module.csv")
七、感兴趣模块基因分析
#提取模块内的基因名称
gene_module_select <- subset(gene_module, module == 'yellow')$gene_name
#“black”模块内基因在各样本中的表达值矩阵(基因表达值矩阵的一个子集)
gene_select <- t(gene[,gene_module_select])
#“black”模块内基因的共表达相似度(在 TOM 矩阵中提取子集)
tom_select <- TOM[gene_module_select,gene_module_select]
#输出
write.table(gene_select, 'blue_gene_select.txt', sep = '\t', col.names = NA, quote = FALSE)
write.table(tom_select, 'blue_tom_select.txt', sep = '\t', col.names = NA, quote = FALSE)
##重要基因的获得,以与 TNM 显著相关的 black 模块为例
#选择 black 模块内的基因
gene_yellow <- gene[ ,gene_module_select]
#基因的模块成员度(module membership)计算
#即各基因表达值与相应模块特征基因的相关性,其衡量了基因在全局网络中的位置
#基因的模块成员度(module membership)计算
#即各基因表达值与相应模块特征基因的相关性,其衡量了基因在全局网络中的位置
geneModuleMembership <- signedKME(gene_black, module['MEyellow'], outputColumnName = 'MM')
MMPvalue <- as.data.frame(corPvalueStudent(as.matrix(geneModuleMembership), nrow(module)))
#各基因表达值与临床表型的相关性分析
geneTraitSignificance <- as.data.frame(cor(gene_black, trait['spike_weight'], use = 'p'))
GSPvalue <- as.data.frame(corPvalueStudent(as.matrix(geneTraitSignificance), nrow(trait)))
##选择显著(p<0.01)、高 black 模块成员度(MM>=0.8),与 TNM 表型高度相关(r>=0.8)的基因
geneModuleMembership[geneModuleMembership<0.8 | MMPvalue>0.01] <- 0
geneTraitSignificance[geneTraitSignificance<0.8 | MMPvalue>0.01] <- 0
select <- cbind(geneModuleMembership, geneTraitSignificance)
select <- subset(select, geneModuleMembership>=0.8 & geneTraitSignificance>=0.8)
head(select)
write.csv(select,"2.yellow_select基因模块相关性和表型相关性都高于0.8.csv")
###对模块内选择的基因进行GO分析
library(clusterProfiler)
library(enrichplot)
library(ggplot2)
library(stringr)
#######重新尝试进行GO和KEGG提取
#读取数据
term2gene <- read.csv("D:\\mywork\\mywork\\13.申大麦转录组分析_VRS突变体整穗\\2020.12.24_重新计算Bowman和突变体的转录组DESeq2和GO结果等\\Bowman_V1a\\GO结果\\term2gene.csv",header = T,row.names = 1)
head(term2gene)
go2term <- read.csv("D:\\mywork\\mywork\\13.申大麦转录组分析_VRS突变体整穗\\2020.12.24_重新计算Bowman和突变体的转录组DESeq2和GO结果等\\Bowman_V1a\\GO结果\\go2term.csv",header = T,row.names = 1)
go2ont <- read.csv("D:\\mywork\\mywork\\13.申大麦转录组分析_VRS突变体整穗\\2020.12.24_重新计算Bowman和突变体的转录组DESeq2和GO结果等\\Bowman_V1a\\GO结果\\go2ont.csv",header = T,row.names = 1)
df <- enricher(gene = row.names(select), TERM2GENE = term2gene, TERM2NAME = go2term, pvalueCutoff =1, qvalueCutoff = 1)
dim(df)
head(df)
p1 <- dotplot(df, showCategory=30)+scale_y_discrete(labels=function(x) str_wrap(x, width=50))
go<-as.data.frame(df)
names(go)
go<-as.data.frame(df)
names(go)
# go <- read.csv("D:\\mywork\\mywork\\申大麦转录组分析\\UPset图\\BOWMAN比较\\GO分析\\18.V1a_V5_V4K_V3int.GO_result.csv.csv",header=T,row.names=1)
# go<-as.data.frame(go)
# names(go)
colnames(go) <- c("go_id","Description" ,"GeneRatio" , "BgRatio" , "pvalue" , "p.adjust" , "qvalue" , "geneID","Count")
go1 <- merge(go,go2ont,by="go_id")
go1 <- go1[order(-go1$Count),]
go_MF<-go1[go1$Ontology=="MF",][1:10,]
go_CC<-go1[go1$Ontology=="CC",][1:10,]
go_BP<-go1[go1$Ontology=="BP",][1:10,]
go_enrich_df<-data.frame(ID=c(go_BP$go_id, go_CC$go_id, go_MF$go_id),
Description=c(go_BP$Description, go_CC$Description, go_MF$Description),
GeneNumber=c(go_BP$Count, go_CC$Count, go_MF$Count),
type=factor(c(rep("biological process", 10), rep("cellular component", 10),rep("molecular function",10)),levels=c("molecular function", "cellular component", "biological process")))
# go_enrich_df <- rbind(go_BP,go_CC,go_MF)
# go_enrich_df <-go_enrich_df [,c(1,2,9,10)]
# go_enrich_df$type <-c(rep("biological process", 10), rep("cellular component", 10),rep("molecular function",10))
# ## numbers as data on x axis
go_enrich_df$number <- factor(rev(1:nrow(go_enrich_df)))
## shorten the names of GO terms
shorten_names <- function(x, n_word=4, n_char=40){
if (length(strsplit(x, " ")[[1]]) > n_word || (nchar(x) > 40))
{
if (nchar(x) > 40) x <- substr(x, 1, 40)
x <- paste(paste(strsplit(x, " ")[[1]][1:min(length(strsplit(x," ")[[1]]), n_word)],
collapse=" "),"...", sep="")
return(x)
}
else
{
return(x)
}
}
labels=(sapply(
levels(go_enrich_df$Description)[as.numeric(go_enrich_df$Description)],
shorten_names))
names(labels) = rev(1:nrow(go_enrich_df))
## colors for bar // green, blue, orange
CPCOLS <- c("#8DA1CB", "#FD8D62", "#66C3A5")
library(ggplot2)
p2 <- ggplot(data=go_enrich_df, aes(x=number, y=GeneNumber, fill=type)) +
geom_bar(stat="identity", width=0.8) + coord_flip() +
scale_fill_manual(values = CPCOLS) + theme_test() +
scale_x_discrete(labels=go_enrich_df$Description) +
xlab("GO term") +
theme(axis.text=element_text(face = "bold", color="gray50")) +
labs(title = "The Most Enriched GO Terms")
p2
filename1 <- paste("select_yellow.csv",sep=".")
write.csv(go1,filename1)
filename2 <- paste("select_yellow", "png", sep = ".")
filename3 <- paste("select_yellow", "png", sep = ".")
ggsave(filename2, p2, width = 10, height = 8)
ggsave(filename3, p1, width = 10, height = 8)
八、感兴趣的模块绘制网络图
##所有模块结果输出
#输出各模块子网络的边列表和节点列表,后续可导入 cytoscape 绘制网络图
#其中,边的权重为 TOM 矩阵中的值,记录了基因间共表达相似性
dir.create('cytoscape', recursive = TRUE)
for (mod in 1:nrow(table(mergedColors))) {
modules <- names(table(mergedColors))[mod]
probes <- colnames(gene)
inModule <- (mergedColors == modules)
modProbes <- probes[inModule]
modGenes <- modProbes
modtom_sim <- tom_sim[inModule, inModule]
dimnames(modtom_sim) <- list(modProbes, modProbes)
outEdge <- paste('cytoscape/', modules, '.edge_list.txt',sep = '')
outNode <- paste('cytoscape/', modules, '.node_list.txt', sep = '')
exportNetworkToCytoscape(modtom_sim,
edgeFile = outEdge,
nodeFile = outNode,
weighted = TRUE,
threshold = 0.3, #该参数可控制输出的边数量,过滤低权重的边
nodeNames = modProbes,
altNodeNames = modGenes,
nodeAttr = mergedColors[inModule])
}
参考教程:
加权基因共表达网络分析(WGCNA) - pome24的个人空间 - OSCHINA - 中文开源技术交流社区
WGCNA(二)WGCNA的步骤和原理 - 简书 (jianshu.com)
soft软阈值设定的原理
(领接矩阵) 邻接矩阵diviner_s的博客-CSDN博客邻接矩阵
(23条消息) 邻接矩阵与关联矩阵的转换及实现hukai7190的博客-CSDN博客邻接矩阵转关联矩阵