大麦WGCNA分析

参考文献
High-resolution spatiotemporal transcriptome mapping of tomato fruit development and ripening

image.png

关于文章中WGCNA方法参数

image.png

文章中参数主要包括一下内容
① 转录组数据基因输入参数
选取变异系数CV值大于1的基因作为输入数据进行分析
② 计算软阈值参数
设定的软阈值soft power为16，TOM结构类型为signed，也是推荐的结构类型。
③ 设定最小模块基因数目参数
最小模块基因数目为30，模块设定的多少，决定分出来模块的多少
④ 相似模块合并参数
前面设定最小模块数目后，基因会分成不同的模块，计算模块之间的相关性后，相关性高的模块进行合并，文章中的参数设定为0.25，也就是说相关性高于0.75的就合并在一起。
⑤ 模块内hub基因筛选参数
模块内基因的筛选主要从两个参数进行筛选，第一个参数是模块内的基因跟模块的相关性高于0.9，然后利用FDR 相关性计算基因跟性状的相关性。

大麦WGCNA分析流程

一、转录组数据输入和质控

##导入表达量TPM数据
rm(list=ls())
library(dplyr)
library(Mfuzz)
library(WGCNA)
library(impute)
library(preprocessCore)
library(MASS)
library(class)
library(cluster)
library(impute)
library(Hmisc)
library(matrixStats)
library(flashClust)
library(dplyr)
setwd("D:\\mywork\\mywork\\10.大麦转录组\\2022年大麦分析\\")
data <- read.csv("1.TPM_data_aver_use.csv",header=T,row.names = 1,check.names = F)
##表达量数据是三个重复进行矫正过离群值后，离群值用剩余两个重复求平均值后的最终TPM结果
##基因型数据预处理
##由于soft设定R2一直小于阈值0.8，查看问题所在是
# 无向网络在power小于15或有向网络power小于30内，没有一个power值可以使无标度网络图谱结构R^2达到0.8或平均连接度降到100以下，可能是由于部分样品与其他样品差别太大造成的。这可能由批次效应、样品异质性或实验条件对表达影响太大等造成
datExprFemale2 <- log2(data+1)
cor_T <- cor(datExprFemale2)
##绘制热图
library(pheatmap)
pheatmap(cor_T,show_rownames = T,show_colnames = T,cluster_rows=T,cluster_cols = T)
##结果展示相关性好
###数据质控
m.mad <- apply(datExprFemale2,1,mad)
dataExprVar <- datExprFemale2[which(m.mad > 
                                max(quantile(m.mad, probs=seq(0, 1, 0.25))[2],0.01)),]
dim(dataExprVar)

###去除变异系数cv<1的基因

cal_cv=function(x){  # 自定义函数 标准差/平均值
  y=na.omit(x)
  return(sd(y)/mean(y))
}

m.cv <- apply(dataExprVar, 1, cal_cv)
dataExprVar2 <- dataExprVar[which(m.cv > 1),]


datExprFemale = as.data.frame(t(dataExprVar2 ))


datExprFemale_D_1 = as.data.frame(datExprFemale)

# ##检验数据质量
# # 样本聚类检查离群值（就是树上特别不接近的
gsg = goodSamplesGenes(datExprFemale_D_1, verbose = 3)
head(gsg)
gsg$allOK

## 是true的话说明所有genes都通过了筛选
if (!gsg$allOK){
  # Optionally, print the gene and sample names that were removed:
  if (sum(!gsg$goodGenes)>0) 
    printFlush(paste("Removing genes:", 
                     paste(names(datExprFemale_D_1)[!gsg$goodGenes], collapse = ",")));
  if (sum(!gsg$goodSamples)>0) 
    printFlush(paste("Removing samples:", 
                     paste(rownames(datExprFemale_D_1)[!gsg$goodSamples], collapse = ",")));
  # Remove the offending genes and samples from the data:
  datExprFemale_D_1 =datExprFemale_D_1[gsg$goodSamples, gsg$goodGenes]
}
## 画图来看是否有特别离群的
dim(datExprFemale_D_1)
##共有5210个基因进行WGCNA分析
sampleTree = hclust(dist(datExprFemale_D_1), method = "average")
##设置画板大小

sizeGrWindow(12, 9)
par(cex=0.6)
par(mar=c(0,4,2,0))

plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="")

##没有发现离群值，不需要剔除样本

二、设定软阈值

image.png

设定软阈值，

# 清理，释放内存
collectGarbage()
gene <- datExprFemale_D_1
powers = c(c(1:10), seq(from = 12, to=20, by=2))
type = "unsigned" ##自己试验设计的按照unsigned
sft <- pickSoftThreshold(gene, powerVector = powers, verbose=5 )
# 推荐值。如果是NA，就需要画图来自己挑选
setwd("./7.WGCNA_result")
pdf("soft.pdf",width = 15,  height = 8)
par(mfrow = c(1, 2))
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2], type = 'n', 
     xlab = 'Soft Threshold (power)', ylab = 'Scale Free Topology Model Fit,signed R^2', 
     main = paste('Scale independence'))
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
     labels = powers, col = 'red');
abline(h = 0.8, col = 'red')

#平均连通性与 power 值散点图
plot(sft$fitIndices[,1], sft$fitIndices[,5],
     xlab = 'Soft Threshold (power)', ylab = 'Mean Connectivity', type = 'n',
     main = paste('Mean connectivity'))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels = powers, col = 'red')
dev.off()
##构建网络
#上一步估计的最佳 power 值
write.csv(sft,"1.sft.csv")
sft$powerEstimate
###推荐的软阈值设定为9

image.png

三、构建TOM矩阵

 获得邻接矩阵：
softPower <- sft$powerEstimate
adjacency = adjacency(gene, power = softPower)
# 将临近矩阵转为 Tom 矩阵
TOM = TOMsimilarity(adjacency)
# 计算基因之间的相异度
dissTOM = 1-TOM
hierTOM = hclust(as.dist(dissTOM),method="average")

ADJ1_cor <- abs(WGCNA::cor( gene,use = "p" ))^softPower
# 基因少（<5000）的时候使用下面的代码：
k <- as.vector(apply(ADJ1_cor,2,sum,na.rm=T))
# 基因多的时候使用下面的代码：
k <- softConnectivity(datE=gene,power=softPower) 
sizeGrWindow(10, 5)
par(mfrow=c(1,2))
hist(k)
scaleFreePlot(k,main="Check Scale free topology\n")
# 可以看出k与p(k)成负相关(相关性系数0.84),说明选择的β值能够建立基因无尺度网络。

# 使用相异度来聚类为gene tree(聚类树)：
geneTree = hclust(as.dist(dissTOM), method = "average")
# Plot the resulting clustering tree (dendrogram)
windows()
sizeGrWindow(12,9)
plot(geneTree, xlab="", sub="", main = "Gene clustering on TOM-based dissimilarity",
     labels = FALSE, hang = 0.04)
# 使用动态剪切树挖掘模块：
minModuleSize = 50;
# 动态切割树:
dynamicMods = cutreeDynamic(dendro = geneTree, distM = dissTOM,
                            deepSplit = 2, pamRespectsDendro = FALSE,
                            minClusterSize = minModuleSize);
table(dynamicMods)

dynamicColors <- labels2colors(dynamicMods)
table(dynamicColors)
pdf("Gene dendrogram and module colors.pdf")

#模块颜色指代
dynamicColors <- labels2colors(dynamicMods)
table(dynamicColors)
plotDendroAndColors(geneTree, dynamicColors, 'Dynamic Tree Cut',
                    dendroLabels = FALSE, addGuide = TRUE, hang = 0.03, guideHang = 0.05,
                    main = 'Gene dendrogram and module colors')
dev.off()

image.png

四、随机挑选400个基因查看TOM结构

# 拓扑热图：
nSelect = 400 
# For reproducibility, we set the random seed 
set.seed(10); 
nGenes
select = sample(5210, size = nSelect); 
selectTOM = dissTOM[select, select]; 
# There's no simple way of restricting a clustering tree to a subset of genes, so we must re-cluster. 
selectTree = hclust(as.dist(selectTOM), method = "average") 
selectColors = dynamicColors[select]; 
# Open a graphical window 
sizeGrWindow(9,9) 
# Taking the dissimilarity to a power, say 10, makes the plot more informative by effectively changing 
# the color palette; setting the diagonal to NA also improves the clarity of the plot 
plotDiss = selectTOM^softPower; 
diag(plotDiss) = NA; 
TOMplot(plotDiss, 
        selectTree, 
        selectColors, 
        main = "Network heatmap plot, selected genes")

5、计算根据模块特征向量基因计算模块相异度

MEList = moduleEigengenes(gene, colors = dynamicColors)
MEs = MEList$eigengenes
# 计算根据模块特征向量基因计算模块相异度：
MEDiss = 1-cor(MEs);
# Cluster module eigengenes
METree = hclust(as.dist(MEDiss), method = "average")
# Plot the result
plotEigengeneNetworks(MEs, 
                      "Eigengene adjacency heatmap", 
                      marHeatmap = c(3,4,2,2), 
                      plotDendrograms = FALSE, 
                      xLabelsAngle = 90) 

plot(METree, 
     main = "Clustering of module eigengenes",
     xlab = "", 
     sub = "")
# 在聚类图中画出剪切线
abline(h=MEDissThres, col = "red")
MEDissThres = 0.2
# 在聚类图中画出剪切线
abline(h=MEDissThres, col = "red")
# 合并模块：
merge_modules = mergeCloseModules(gene, dynamicColors, cutHeight = MEDissThres, verbose = 3)
# 合并后的颜色：
mergedColors = merge_modules$colors;
# 新模块的特征向量基因：
mergedMEs = merge_modules$newMEs;
plotDendroAndColors(geneTree, cbind(dynamicColors, mergedColors),
                    c("Dynamic Tree Cut", "Merged dynamic"),
                    dendroLabels = FALSE, hang = 0.03,
                    addGuide = TRUE, guideHang = 0.05)

probes = colnames(gene)
dimnames(TOM) <- list(probes, probes)

六、计算表型和模块的相关性

trait_data <- read.table("1.trait_input.txt",header=T,sep="\t",row.names = 1)
trait <- trait_data 

#使用上一步新组合的共表达模块的结果
module <- mergedMEs

#共表达模块和表型的相关性分析
moduleTraitCor <- cor(module, trait, use = 'p')
moduleTraitCor[1:6,1:3]  #相关矩阵

#相关系数的 p 值矩阵
moduleTraitPvalue <- corPvalueStudent(moduleTraitCor, nrow(module))

#输出相关系数矩阵或 p 值矩阵
write.table(moduleTraitCor, 'moduleTraitCor.txt', sep = '\t', col.names = NA, quote = FALSE)
write.table(moduleTraitPvalue, 'moduleTraitPvalue.txt', sep = '\t', col.names = NA, quote = FALSE)

#相关图绘制
textMatrix <- paste(signif(moduleTraitCor, 2), '\n(', signif(moduleTraitPvalue, 1), ')', sep = '')
dim(textMatrix) <- dim(moduleTraitCor)
pdf("Module-trait relationships.pdf")
par(mar = c(5, 10, 1,1))
labeledHeatmap(Matrix = moduleTraitCor, main = paste('Module-trait relationships'), 
               xLabels = names(trait), yLabels = names(module), ySymbols = names(module),
               colorLabels = FALSE, colors = blueWhiteRed(50), cex.text = 1, zlim = c(-1,1), 
               textMatrix = textMatrix, setStdMargins = FALSE)
dev.off()

gene_module <- data.frame(gene_name = colnames(gene), module = mergedColors, stringsAsFactors = FALSE)
head(gene_module)
write.csv(gene_module,"gene_module.csv")

七、感兴趣模块基因分析

#提取模块内的基因名称
gene_module_select <- subset(gene_module, module == 'yellow')$gene_name

#“black”模块内基因在各样本中的表达值矩阵（基因表达值矩阵的一个子集）
gene_select <- t(gene[,gene_module_select])


#“black”模块内基因的共表达相似度（在 TOM 矩阵中提取子集）
tom_select <- TOM[gene_module_select,gene_module_select]

#输出
write.table(gene_select, 'blue_gene_select.txt', sep = '\t', col.names = NA, quote = FALSE)
write.table(tom_select, 'blue_tom_select.txt', sep = '\t', col.names = NA, quote = FALSE)

##重要基因的获得，以与 TNM 显著相关的 black 模块为例
#选择 black 模块内的基因
gene_yellow <- gene[ ,gene_module_select]

#基因的模块成员度（module membership）计算
#即各基因表达值与相应模块特征基因的相关性，其衡量了基因在全局网络中的位置
#基因的模块成员度（module membership）计算
#即各基因表达值与相应模块特征基因的相关性，其衡量了基因在全局网络中的位置
geneModuleMembership <- signedKME(gene_black, module['MEyellow'], outputColumnName = 'MM')
MMPvalue <- as.data.frame(corPvalueStudent(as.matrix(geneModuleMembership), nrow(module)))

#各基因表达值与临床表型的相关性分析
geneTraitSignificance <- as.data.frame(cor(gene_black, trait['spike_weight'], use = 'p'))
GSPvalue <- as.data.frame(corPvalueStudent(as.matrix(geneTraitSignificance), nrow(trait)))

##选择显著（p<0.01）、高 black 模块成员度（MM>=0.8），与 TNM 表型高度相关（r>=0.8）的基因
geneModuleMembership[geneModuleMembership<0.8 | MMPvalue>0.01] <- 0
geneTraitSignificance[geneTraitSignificance<0.8 | MMPvalue>0.01] <- 0

select <- cbind(geneModuleMembership, geneTraitSignificance)
select <- subset(select, geneModuleMembership>=0.8 & geneTraitSignificance>=0.8)
head(select)
write.csv(select,"2.yellow_select基因模块相关性和表型相关性都高于0.8.csv")

###对模块内选择的基因进行GO分析
library(clusterProfiler)
library(enrichplot)
library(ggplot2)
library(stringr)
#######重新尝试进行GO和KEGG提取
#读取数据
term2gene <- read.csv("D:\\mywork\\mywork\\13.申大麦转录组分析_VRS突变体整穗\\2020.12.24_重新计算Bowman和突变体的转录组DESeq2和GO结果等\\Bowman_V1a\\GO结果\\term2gene.csv",header = T,row.names = 1)
head(term2gene)
go2term <- read.csv("D:\\mywork\\mywork\\13.申大麦转录组分析_VRS突变体整穗\\2020.12.24_重新计算Bowman和突变体的转录组DESeq2和GO结果等\\Bowman_V1a\\GO结果\\go2term.csv",header = T,row.names = 1)
go2ont <- read.csv("D:\\mywork\\mywork\\13.申大麦转录组分析_VRS突变体整穗\\2020.12.24_重新计算Bowman和突变体的转录组DESeq2和GO结果等\\Bowman_V1a\\GO结果\\go2ont.csv",header = T,row.names = 1)

df <- enricher(gene = row.names(select), TERM2GENE = term2gene, TERM2NAME = go2term, pvalueCutoff =1, qvalueCutoff = 1)
dim(df)
head(df)
p1 <- dotplot(df, showCategory=30)+scale_y_discrete(labels=function(x) str_wrap(x, width=50))
go<-as.data.frame(df)
names(go)
go<-as.data.frame(df)
  names(go)
  # go <- read.csv("D:\\mywork\\mywork\\申大麦转录组分析\\UPset图\\BOWMAN比较\\GO分析\\18.V1a_V5_V4K_V3int.GO_result.csv.csv",header=T,row.names=1)
  #  go<-as.data.frame(go)
  # names(go)
  colnames(go) <- c("go_id","Description" ,"GeneRatio"  , "BgRatio"    , "pvalue"  ,    "p.adjust"  ,  "qvalue"   ,   "geneID","Count")
  
  go1 <- merge(go,go2ont,by="go_id")
  
  
  go1 <-  go1[order(-go1$Count),] 
  go_MF<-go1[go1$Ontology=="MF",][1:10,]
  go_CC<-go1[go1$Ontology=="CC",][1:10,]
  go_BP<-go1[go1$Ontology=="BP",][1:10,]
  go_enrich_df<-data.frame(ID=c(go_BP$go_id, go_CC$go_id, go_MF$go_id),
                           Description=c(go_BP$Description, go_CC$Description, go_MF$Description),
                           GeneNumber=c(go_BP$Count, go_CC$Count, go_MF$Count),
                           type=factor(c(rep("biological process", 10), rep("cellular component", 10),rep("molecular function",10)),levels=c("molecular function", "cellular component", "biological process")))
  
  # go_enrich_df <- rbind(go_BP,go_CC,go_MF)
  
  # go_enrich_df <-go_enrich_df [,c(1,2,9,10)]
  # go_enrich_df$type <-c(rep("biological process", 10), rep("cellular component", 10),rep("molecular function",10)) 
  # ## numbers as data on x axis
  go_enrich_df$number <- factor(rev(1:nrow(go_enrich_df)))
  ## shorten the names of GO terms
  shorten_names <- function(x, n_word=4, n_char=40){
    if (length(strsplit(x, " ")[[1]]) > n_word || (nchar(x) > 40))
    {
      if (nchar(x) > 40) x <- substr(x, 1, 40)
      x <- paste(paste(strsplit(x, " ")[[1]][1:min(length(strsplit(x," ")[[1]]), n_word)],
                       collapse=" "),"...", sep="")
      return(x)
    } 
    else
    {
      return(x)
    }
  }
  
  labels=(sapply(
    levels(go_enrich_df$Description)[as.numeric(go_enrich_df$Description)],
    shorten_names))
  names(labels) = rev(1:nrow(go_enrich_df))
  
  ## colors for bar // green, blue, orange
  CPCOLS <- c("#8DA1CB", "#FD8D62", "#66C3A5")
  library(ggplot2)
  
  p2 <- ggplot(data=go_enrich_df, aes(x=number, y=GeneNumber, fill=type)) +
    geom_bar(stat="identity", width=0.8) + coord_flip() + 
    scale_fill_manual(values = CPCOLS) + theme_test() + 
    scale_x_discrete(labels=go_enrich_df$Description) +
    xlab("GO term") + 
    theme(axis.text=element_text(face = "bold", color="gray50")) +
    labs(title = "The Most Enriched GO Terms")
  p2

filename1 <- paste("select_yellow.csv",sep=".")
 
  
  write.csv(go1,filename1)
  filename2 <- paste("select_yellow", "png", sep = ".")
  filename3 <- paste("select_yellow", "png", sep = ".")
  ggsave(filename2, p2, width = 10, height = 8)
  ggsave(filename3, p1, width = 10, height = 8)

八、感兴趣的模块绘制网络图

##所有模块结果输出
#输出各模块子网络的边列表和节点列表，后续可导入 cytoscape 绘制网络图
#其中，边的权重为 TOM 矩阵中的值，记录了基因间共表达相似性
dir.create('cytoscape', recursive = TRUE)
 
for (mod in 1:nrow(table(mergedColors))) {
    modules <- names(table(mergedColors))[mod]
    probes <- colnames(gene)
    inModule <- (mergedColors == modules)
    modProbes <- probes[inModule]
    modGenes <- modProbes
    modtom_sim <- tom_sim[inModule, inModule]
    dimnames(modtom_sim) <- list(modProbes, modProbes)
    outEdge <- paste('cytoscape/', modules, '.edge_list.txt',sep = '')
    outNode <- paste('cytoscape/', modules, '.node_list.txt', sep = '')
    exportNetworkToCytoscape(modtom_sim,
        edgeFile = outEdge,
        nodeFile = outNode,
        weighted = TRUE,
        threshold = 0.3,  #该参数可控制输出的边数量，过滤低权重的边
        nodeNames = modProbes,
        altNodeNames = modGenes,
        nodeAttr = mergedColors[inModule])
}