GEO数据挖掘-第四期-肝细胞癌（HCC）

疾病背景

肝细胞癌（HCC），起源于肝细胞，是全球第七大常见癌症。

每年全球约诊断HCC病例50万例，约占所有癌症病例的5.4%，每年因肝细胞癌而死亡得人数约70万人。

绝大多数HCC发生在亚洲和撒哈拉以南非洲，美国和其他发展中国家的HCC发病率正在上升。

GEO数据框：GSE54238

◎ 10个正常样本          (NL)

◎ 10个慢性肝炎          (IL)

◎ 10个肝硬化              (CL)

◎ 13个早期肝细胞癌   (eHCC)

◎ 13个晚期肝细胞癌   (aHCC)

****■ ■ ■****

这一期还是WGCNA。。。。。。

rm( list = ls() )load( './gset.Rdata' )load('./exprSet_for_WGCNA.Rdata')options( stringsAsFactors = F )

◎ gset包含原始数据

◎ exprSet是已经处理并注释好的数据集

第一步准备样本表型数据和数据集

library( "GEOquery" )## 取表达矩阵和样本信息表{  gset = gset[[1]]  pdata = pData( gset )}colnames(pdata)hclust_table = pdata[,38:42]colnames(hclust_table) = c("age", "condition", "gender", "hbeag", "hbsag")hclust_table[hclust_table$gender=="M", 3] = 1## 这样就得到了一个样本特性的数据框##            age    condition gender    hbeag    hbsag## GSM1310758  44 normal liver      1 Negative Negative## GSM1310759  41 normal liver      1 Negative Negative## GSM1310760  49 normal liver      1 Negative Negative

exprSet[1:5,1:5]##              GSM1310758 GSM1310759 GSM1310760 GSM1310761 GSM1310762## FAM138C        6.944570   6.248560   7.526373   8.255203   8.717276## FLJ39609       6.594915   6.269244   6.801226   6.705848   6.784622## LOC100128842   9.240880   9.793829   9.232292   8.533079   8.549601## LOC148413     10.696339  10.126397  10.274773   9.017882   8.310329## LOC100128003   7.156712   8.209122   7.413724   6.254033   6.980212group_list = as.character( pdata[, 8] )table( group_list )## advanced HCC   chronic inflammatory liver   cirrhotic livers##           13                           10                 10## early HCC    normal liver ##        13              10colnames( exprSet ) = paste( group_list, 1:ncol( exprSet ), sep = '_' )exprSet[1:5,1:5]##              normal liver_1 normal liver_2 normal liver_3 normal liver_4## FAM138C            6.944570       6.248560       7.526373       8.255203## FLJ39609           6.594915       6.269244       6.801226       6.705848## LOC100128842       9.240880       9.793829       9.232292       8.533079## LOC148413         10.696339      10.126397      10.274773       9.017882## LOC100128003       7.156712       8.209122       7.413724       6.254033# 计算数据集的每个基因的平均绝对离差，取排在前面最大的5000个基因。datExpr = t(exprSet[order(apply(exprSet,1,mad), decreasing = T)[1:5000],])# 上面这步实际就是把一个基因在个样本间表达变化最大的基因跳出来，继续下面的分析dim( datExpr )# [1]   56 5000## 接下来就是对这56个样本的5000个基因做分析

第二步绘制进化树

library("WGCNA"){  # 样本信息不同特征值赋予不同的颜色  status_colors <- numbers2colors(as.numeric(factor(hclust_table$condition)),                                   colors = c("red","white","pink"),signed = FALSE)  age_colors <- numbers2colors(as.numeric(factor(hclust_table$age)),                                colors = c("red","white","pink"),signed = FALSE)  sex_colors[,1] <- rep( 'white', 56 )  hbeag_colors <- numbers2colors(as.numeric(factor(hclust_table$hbeag)),                                colors = c("red","white"),signed = FALSE)  hbsag_colors <- numbers2colors(as.numeric(factor(hclust_table$hbsag)),                                colors = c("red","white"),signed = FALSE)  par(mar = c(1,4,3,1),cex=0.8)  plotDendroAndColors(datExpr_tree, cbind(status_colors, age_colors, sex_colors, hbeag_colors, hbsag_colors),                      groupLabels = colnames(hclust_table),                      cex.dendroLabels = 0.8,                      marAll = c(1, 4, 3, 1),                      cex.rowText = 0.01,                      main = "Sample dendrogram and trait heatmap")}

image

第三步确定软阀值

powers = c( c(1:10), seq(12, 20, by = 2) )## [1]  1  2  3  4  5  6  7  8  9 10 12 14 16 18 20sft = pickSoftThreshold(datExpr, powerVector = powers, verbose = 0)str(sft)par(mfrow = c(1,2))cex1 = 0.9## 两种方法挑选软阀值# Scale independenceplot(sft$fitIndices$Power,     -sign(sft$fitIndices$slope) * sft$fitIndices$SFT.R.sq,     xlab="Soft Threshold (power)",     ylab="Scale Free Topology Model Fit,signed R^2",     type="n",      main = paste("Scale independence"))text(sft$fitIndices$Power,     -sign(sft$fitIndices$slope) * sft$fitIndices$SFT.R.sq,     labels=powers,     cex=cex1,     col="red")abline(h=0.85,col="RED")# Mean connectivityplot(sft$fitIndices$Power,     sft$fitIndices$mean.k.,     xlab="Soft Threshold (power)",     ylab="Mean Connectivity",     type="n",     main = paste("Mean connectivity"))text(sft$fitIndices$Power,     sft$fitIndices$mean.k.,     labels=powers,     cex=cex1,     col="red")best_beta=sft$powerEstimate

image

第四步构建共表达矩阵

## blockwiseModules自动网络构建和模块检测，单个基因水平上的net = blockwiseModules(datExpr, power = best_beta,                       TOMType = "unsigned", minModuleSize = 30,                       reassignThreshold = 0, mergeCutHeight = 0.25,                       numericLabels = TRUE, pamRespectsDendro = FALSE,                       saveTOMs = TRUE,                       saveTOMFileBase = "HCCTOM",                       verbose = 3)table(net$colors)# 这里的color是标签，下一步转换为对应的颜色moduleColors <- labels2colors(net$colors)# 计算模块特征基因，为第一主成分MEs0 = moduleEigengenes(datExpr, moduleColors)$eigengenes# 计算根据模块特征基因计算模块相异度MEDiss = 1-cor(MEs0)# 计算模块特征基因间的距离，并聚类METree = hclust(as.dist(MEDiss), method = "average");plot(METree,      main = "Clustering of module eigengenes",     xlab = "",      sub = "")# 在聚类图中画出剪切线MEDissThres = 0.25abline(h=MEDissThres, col = "red")# 合并特征基因相近的模块，特征基因水平上的merge_modules = mergeCloseModules(datExpr, moduleColors, cutHeight = MEDissThres, verbose = 3)mergedColors = merge_modules$colorsmergedMEs = merge_modules$newMEsplotDendroAndColors(net$dendrograms[[1]], cbind(moduleColors, mergedColors),                    c("Dynamic Tree Cut", "Merged dynamic"),                    dendroLabels = FALSE, hang = 0.03,                    addGuide = TRUE, guideHang = 0.05)

image

第五步绘制TOM图

# 每个块中基因的层次聚类树状图geneTree = net$dendrograms[[1]]dissTOM = 1-TOMsimilarityFromExpr(datExpr)set.seed(10)select = sample(ncol(datExpr), size = 400)selectTOM = dissTOM[select, select]selectTree = hclust(as.dist(selectTOM), method = "average")selectColors = moduleColors[select]sizeGrWindow(9,9)plotDiss = selectTOM^7diag(plotDiss) = NATOMplot(plotDiss, selectTree, selectColors, main = "Network heatmap plot, selected genes")

image

第六步模块与性状的关系

design=model.matrix(~0+ group_list)colnames(design)=levels(factor(group_list))# 根据相关性重新排序MEs = orderMEs(MEs0)# 计算两个相关性值moduleTraitCor = cor(MEs, design , use = "p")moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nrow(datExpr))sizeGrWindow(10,6)textMatrix = paste(signif(moduleTraitCor, 2), "(",                   signif(moduleTraitPvalue, 1), ")", sep = "");dim(textMatrix) = dim(moduleTraitCor)par(mar = c(6, 8.5, 3, 3))# 可视化labeledHeatmap(Matrix = moduleTraitCor,               xLabels = names(hclust_table[,1:5]),               yLabels = names(MEs),               ySymbols = names(MEs),               colorLabels = FALSE,               colors = blueWhiteRed(50),               textMatrix = textMatrix,               setStdMargins = FALSE,               cex.text = 0.5,               zlim = c(-1,1),               main = paste("Module-trait relationships"))

image

sizeGrWindow(5,7.5);par(cex = 0.9)# 特征基因网络图plotEigengeneNetworks(MEs, "", marDendro = c(0,4,1,2), marHeatmap = c(3,4,1,2), cex.lab = 0.8,                       xLabelsAngle= 90)# Plot the dendrogramsizeGrWindow(6,6);par(cex = 1.0)## 聚类图plotEigengeneNetworks(MEs, "Eigengene dendrogram", marDendro = c(0,4,2,0),                      plotHeatmaps = FALSE)# Plot the heatmap matrix (note: this plot will overwrite the dendrogram plot)par(cex = 1.0)## 性状与模块热图plotEigengeneNetworks(MEs, "Eigengene adjacency heatmap", marHeatmap = c(3,4,2,2),                      plotDendrograms = FALSE, xLabelsAngle = 90)

image

第七步对感兴趣模块的进一步分析

modNames = substring(names(MEs), 3)geneModuleMembership = as.data.frame(cor(datExpr, MEs, use = "p"));## 算出每个模块跟特征基因的皮尔森相关系数矩阵## MEs是每个模块在每个样本里面的值MMPvalue = as.data.frame(corPvalueStudent(as.matrix(geneModuleMembership), nSamples));names(geneModuleMembership) = paste("MM", modNames, sep="");names(MMPvalue) = paste("p.MM", modNames, sep="");geneTraitSignificance = as.data.frame(cor(datExpr,use = "p"));GSPvalue = as.data.frame(corPvalueStudent(as.matrix(geneTraitSignificance), nSamples));module = "turquoise"column = match(module, modNames);moduleGenes = moduleColors==module;sizeGrWindow(7, 7);par(mfrow = c(1,1));verboseScatterplot(abs(geneModuleMembership[moduleGenes, column]),                   abs(geneTraitSignificance[moduleGenes, 1]),                   xlab = paste("Module Membership in", module, "module"),                   ylab = "Gene significance for condition",                   main = paste("Module membership vs. gene significance"),                   cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = module)module = "blue"column = match(module, modNames);moduleGenes = moduleColors==module;sizeGrWindow(7, 7);par(mfrow = c(1,1));verboseScatterplot(abs(geneModuleMembership[moduleGenes, column]),                   abs(geneTraitSignificance[moduleGenes, 1]),                   xlab = paste("Module Membership in", module, "module"),                   ylab = "Gene significance for condition",                   main = paste("Module membership vs. gene significance"),                   cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = module)

image

</pre>

GEO数据挖掘-第四期-肝细胞癌（HCC）

推荐阅读更多精彩内容