文章来源:生信技能树
原文:GEO数据挖掘-第四期-肝细胞癌(HCC)
疾病背景
肝细胞癌(HCC),起源于肝细胞,是全球第七大常见癌症。
每年全球约诊断HCC病例50万例,约占所有癌症病例的5.4%,每年因肝细胞癌而死亡得人数约70万人。
绝大多数HCC发生在亚洲和撒哈拉以南非洲,美国和其他发展中国家的HCC发病率正在上升。
GEO数据框:GSE54238
◎ 10个正常样本 (NL)
◎ 10个慢性肝炎 (IL)
◎ 10个肝硬化 (CL)
◎ 13个早期肝细胞癌 (eHCC)
◎ 13个晚期肝细胞癌 (aHCC)
****■ ■ ■****
这一期还是WGCNA。。。。。。
rm( list = ls() )load( './gset.Rdata' )load('./exprSet_for_WGCNA.Rdata')options( stringsAsFactors = F )
◎ gset包含原始数据
◎ exprSet是已经处理并注释好的数据集
第一步 准备样本表型数据和数据集
library( "GEOquery" )## 取表达矩阵和样本信息表{ gset = gset[[1]] pdata = pData( gset )}colnames(pdata)hclust_table = pdata[,38:42]colnames(hclust_table) = c("age", "condition", "gender", "hbeag", "hbsag")hclust_table[hclust_table$gender=="M", 3] = 1## 这样就得到了一个样本特性的数据框## age condition gender hbeag hbsag## GSM1310758 44 normal liver 1 Negative Negative## GSM1310759 41 normal liver 1 Negative Negative## GSM1310760 49 normal liver 1 Negative Negative
exprSet[1:5,1:5]## GSM1310758 GSM1310759 GSM1310760 GSM1310761 GSM1310762## FAM138C 6.944570 6.248560 7.526373 8.255203 8.717276## FLJ39609 6.594915 6.269244 6.801226 6.705848 6.784622## LOC100128842 9.240880 9.793829 9.232292 8.533079 8.549601## LOC148413 10.696339 10.126397 10.274773 9.017882 8.310329## LOC100128003 7.156712 8.209122 7.413724 6.254033 6.980212group_list = as.character( pdata[, 8] )table( group_list )## advanced HCC chronic inflammatory liver cirrhotic livers## 13 10 10## early HCC normal liver ## 13 10colnames( exprSet ) = paste( group_list, 1:ncol( exprSet ), sep = '_' )exprSet[1:5,1:5]## normal liver_1 normal liver_2 normal liver_3 normal liver_4## FAM138C 6.944570 6.248560 7.526373 8.255203## FLJ39609 6.594915 6.269244 6.801226 6.705848## LOC100128842 9.240880 9.793829 9.232292 8.533079## LOC148413 10.696339 10.126397 10.274773 9.017882## LOC100128003 7.156712 8.209122 7.413724 6.254033# 计算数据集的每个基因的平均绝对离差,取排在前面最大的5000个基因。datExpr = t(exprSet[order(apply(exprSet,1,mad), decreasing = T)[1:5000],])# 上面这步实际就是把一个基因在个样本间表达变化最大的基因跳出来,继续下面的分析dim( datExpr )# [1] 56 5000## 接下来就是对这56个样本的5000个基因做分析
第二步 绘制进化树
library("WGCNA"){ # 样本信息不同特征值赋予不同的颜色 status_colors <- numbers2colors(as.numeric(factor(hclust_table$condition)), colors = c("red","white","pink"),signed = FALSE) age_colors <- numbers2colors(as.numeric(factor(hclust_table$age)), colors = c("red","white","pink"),signed = FALSE) sex_colors[,1] <- rep( 'white', 56 ) hbeag_colors <- numbers2colors(as.numeric(factor(hclust_table$hbeag)), colors = c("red","white"),signed = FALSE) hbsag_colors <- numbers2colors(as.numeric(factor(hclust_table$hbsag)), colors = c("red","white"),signed = FALSE) par(mar = c(1,4,3,1),cex=0.8) plotDendroAndColors(datExpr_tree, cbind(status_colors, age_colors, sex_colors, hbeag_colors, hbsag_colors), groupLabels = colnames(hclust_table), cex.dendroLabels = 0.8, marAll = c(1, 4, 3, 1), cex.rowText = 0.01, main = "Sample dendrogram and trait heatmap")}
第三步 确定软阀值
powers = c( c(1:10), seq(12, 20, by = 2) )## [1] 1 2 3 4 5 6 7 8 9 10 12 14 16 18 20sft = pickSoftThreshold(datExpr, powerVector = powers, verbose = 0)str(sft)par(mfrow = c(1,2))cex1 = 0.9## 两种方法挑选软阀值# Scale independenceplot(sft$fitIndices$Power, -sign(sft$fitIndices$slope) * sft$fitIndices$SFT.R.sq, xlab="Soft Threshold (power)", ylab="Scale Free Topology Model Fit,signed R^2", type="n", main = paste("Scale independence"))text(sft$fitIndices$Power, -sign(sft$fitIndices$slope) * sft$fitIndices$SFT.R.sq, labels=powers, cex=cex1, col="red")abline(h=0.85,col="RED")# Mean connectivityplot(sft$fitIndices$Power, sft$fitIndices$mean.k., xlab="Soft Threshold (power)", ylab="Mean Connectivity", type="n", main = paste("Mean connectivity"))text(sft$fitIndices$Power, sft$fitIndices$mean.k., labels=powers, cex=cex1, col="red")best_beta=sft$powerEstimate
第四步 构建共表达矩阵
## blockwiseModules自动网络构建和模块检测,单个基因水平上的net = blockwiseModules(datExpr, power = best_beta, TOMType = "unsigned", minModuleSize = 30, reassignThreshold = 0, mergeCutHeight = 0.25, numericLabels = TRUE, pamRespectsDendro = FALSE, saveTOMs = TRUE, saveTOMFileBase = "HCCTOM", verbose = 3)table(net$colors)# 这里的color是标签,下一步转换为对应的颜色moduleColors <- labels2colors(net$colors)# 计算模块特征基因,为第一主成分MEs0 = moduleEigengenes(datExpr, moduleColors)$eigengenes# 计算根据模块特征基因计算模块相异度MEDiss = 1-cor(MEs0)# 计算模块特征基因间的距离,并聚类METree = hclust(as.dist(MEDiss), method = "average");plot(METree, main = "Clustering of module eigengenes", xlab = "", sub = "")# 在聚类图中画出剪切线MEDissThres = 0.25abline(h=MEDissThres, col = "red")# 合并特征基因相近的模块,特征基因水平上的merge_modules = mergeCloseModules(datExpr, moduleColors, cutHeight = MEDissThres, verbose = 3)mergedColors = merge_modules$colorsmergedMEs = merge_modules$newMEsplotDendroAndColors(net$dendrograms[[1]], cbind(moduleColors, mergedColors), c("Dynamic Tree Cut", "Merged dynamic"), dendroLabels = FALSE, hang = 0.03, addGuide = TRUE, guideHang = 0.05)
第五步 绘制TOM图
# 每个块中基因的层次聚类树状图geneTree = net$dendrograms[[1]]dissTOM = 1-TOMsimilarityFromExpr(datExpr)set.seed(10)select = sample(ncol(datExpr), size = 400)selectTOM = dissTOM[select, select]selectTree = hclust(as.dist(selectTOM), method = "average")selectColors = moduleColors[select]sizeGrWindow(9,9)plotDiss = selectTOM^7diag(plotDiss) = NATOMplot(plotDiss, selectTree, selectColors, main = "Network heatmap plot, selected genes")
第六步 模块与性状的关系
design=model.matrix(~0+ group_list)colnames(design)=levels(factor(group_list))# 根据相关性重新排序MEs = orderMEs(MEs0)# 计算两个相关性值moduleTraitCor = cor(MEs, design , use = "p")moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nrow(datExpr))sizeGrWindow(10,6)textMatrix = paste(signif(moduleTraitCor, 2), "(", signif(moduleTraitPvalue, 1), ")", sep = "");dim(textMatrix) = dim(moduleTraitCor)par(mar = c(6, 8.5, 3, 3))# 可视化labeledHeatmap(Matrix = moduleTraitCor, xLabels = names(hclust_table[,1:5]), yLabels = names(MEs), ySymbols = names(MEs), colorLabels = FALSE, colors = blueWhiteRed(50), textMatrix = textMatrix, setStdMargins = FALSE, cex.text = 0.5, zlim = c(-1,1), main = paste("Module-trait relationships"))
sizeGrWindow(5,7.5);par(cex = 0.9)# 特征基因网络图plotEigengeneNetworks(MEs, "", marDendro = c(0,4,1,2), marHeatmap = c(3,4,1,2), cex.lab = 0.8, xLabelsAngle= 90)# Plot the dendrogramsizeGrWindow(6,6);par(cex = 1.0)## 聚类图plotEigengeneNetworks(MEs, "Eigengene dendrogram", marDendro = c(0,4,2,0), plotHeatmaps = FALSE)# Plot the heatmap matrix (note: this plot will overwrite the dendrogram plot)par(cex = 1.0)## 性状与模块热图plotEigengeneNetworks(MEs, "Eigengene adjacency heatmap", marHeatmap = c(3,4,2,2), plotDendrograms = FALSE, xLabelsAngle = 90)
第七步 对感兴趣模块的进一步分析
<pre style="margin: 0px; padding: 0px; max-width: 100%; box-sizing: border-box !important; overflow-wrap: break-word !important; font-size: inherit; color: inherit; line-height: inherit;">
modNames = substring(names(MEs), 3)geneModuleMembership = as.data.frame(cor(datExpr, MEs, use = "p"));## 算出每个模块跟特征基因的皮尔森相关系数矩阵## MEs是每个模块在每个样本里面的值MMPvalue = as.data.frame(corPvalueStudent(as.matrix(geneModuleMembership), nSamples));names(geneModuleMembership) = paste("MM", modNames, sep="");names(MMPvalue) = paste("p.MM", modNames, sep="");geneTraitSignificance = as.data.frame(cor(datExpr,use = "p"));GSPvalue = as.data.frame(corPvalueStudent(as.matrix(geneTraitSignificance), nSamples));module = "turquoise"column = match(module, modNames);moduleGenes = moduleColors==module;sizeGrWindow(7, 7);par(mfrow = c(1,1));verboseScatterplot(abs(geneModuleMembership[moduleGenes, column]), abs(geneTraitSignificance[moduleGenes, 1]), xlab = paste("Module Membership in", module, "module"), ylab = "Gene significance for condition", main = paste("Module membership vs. gene significance"), cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = module)module = "blue"column = match(module, modNames);moduleGenes = moduleColors==module;sizeGrWindow(7, 7);par(mfrow = c(1,1));verboseScatterplot(abs(geneModuleMembership[moduleGenes, column]), abs(geneTraitSignificance[moduleGenes, 1]), xlab = paste("Module Membership in", module, "module"), ylab = "Gene significance for condition", main = paste("Module membership vs. gene significance"), cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = module)
</pre>