GEO数据挖掘-第三期-口腔鳞状细胞癌(OSCC)

文章来源:生信技能树
原文:GEO数据挖掘-第三期-口腔鳞状细胞癌(OSCC)
文章标题

A pplication of weighted gene co-expression network analysis to identify key modules and hub genes in oral squamous cell carcinoma tumorigenesis

关键词

口腔鳞状细胞癌

第一步 安装包

## 整理流程中的软件包bioPackages <- c(   "stringi",  # 处理字符串  "GEOquery", # 下载包  "limma",    # 差异分析  "ggfortify", "ggplot2", "pheatmap", "ggstatsplot", "VennDiagram", # 作图  "clusterProfiler", "org.Hs.eg.db",                                # 注释  "AnnotationDbi", "impute", "GO.db", "preprocessCore", "WGCNA"     # WGCNA)## 设置软件包的下载镜像local({  # CRAN的镜像设置  r <- getOption( "repos" );   r[ "CRAN" ] <- "https://mirrors.tuna.tsinghua.edu.cn/CRAN/";   options( repos = r )  # bioconductor的镜像设置  BioC <- getOption( "BioC_mirror" );   BioC[ "BioC_mirror" ] <- "https://mirrors.ustc.edu.cn/bioc/";   options( BioC_mirror = BioC )})## 安装未安装的软件包#source( "https://bioconductor.org/biocLite.R" ) #第一次使用bioconductor需要运行lapply( bioPackages,   function( bioPackage ){    if( !require( bioPackage, character.only = T ) ){      CRANpackages <- available.packages()      if( bioPackage %in% rownames( CRANpackages) ){        install.packages( bioPackage )      }else{        BiocInstaller::biocLite( bioPackage, suppressUpdates = FALSE, ask = FALSE)      }    }  })

第二步 整理数据集和临床表型

## 数据集和注释平台的下载步骤略过,详见第一期options( stringsAsFactors = F )load( './gset.Rdata' )library( "GEOquery" )## 取表达矩阵和样本信息表{  gset = gset[[1]]  exprSet = exprs( gset )  pdata = pData( gset )  chl = length( colnames( pdata ) )  group_list = as.character( pdata[, chl] )}colnames(pdata)tmp = pdata[,10:12]library(stringr)status= str_split(as.character(tmp$characteristics_ch1),':',simplify = T)[,2]age = str_split(as.character(tmp$characteristics_ch1.1),':',simplify = T)[,2]sex = str_split(as.character(tmp$characteristics_ch1.2),':',simplify = T)[,2]tmp$status = statustmp$age = agetmp$sex = sexhclust_table = tmp[,4:6]group_list = tmp$statusload('./ID2gene.Rdata')## 去除没有注释的数据集{  exprSet = exprSet[ rownames(exprSet) %in% ID2gene[ , 1 ], ]  ID2gene = ID2gene[ match(rownames(exprSet), ID2gene[ , 1 ] ), ]}dim( exprSet )dim( ID2gene )tail( sort( table( ID2gene[ , 2 ] ) ), n = 12L )## 相同基因的表达数据取最大值{  MAX = by( exprSet, ID2gene[ , 2 ],               function(x) rownames(x)[ which.max( rowMeans(x) ) ] )  MAX = as.character(MAX)  exprSet = exprSet[ rownames(exprSet) %in% MAX , ]  rownames( exprSet ) = ID2gene[ match( rownames( exprSet ), ID2gene[ , 1 ] ), 2 ]}exprSet = log(exprSet)dim(exprSet)exprSet[1:5,1:5]save( exprSet, group_list, file = 'exprSet_for_WGCNA.Rdata')

第三步 绘制进化树

library("WGCNA")## WGCNAload('./exprSet_for_WGCNA.Rdata')load('./final_exprSet.Rdata')colnames( exprSet ) = paste( group_list, 1:ncol( exprSet ), sep = '_' )dim( exprSet )exprSet[1:5,1:5]datExpr = t(exprSet[order(apply(exprSet,1,mad), decreasing = T)[1:5000],])dim( datExpr )datExpr[1:4,1:4]{  nGenes = ncol(datExpr)  nSamples = nrow(datExpr)  #系统聚类树  datExpr_tree<-hclust(dist(datExpr), method = "average")  par(mar = c(0,5,2,0))  plot(datExpr_tree, main = "Sample clustering", sub="", xlab="",       cex.axis = 0.9, cex.main = 1.2, cex.lab=1, cex = 0.7)  status_colors <- numbers2colors(as.numeric(factor(hclust_table$status)),                                   colors = c("red","white","pink"),signed = FALSE)  age_colors <- numbers2colors(as.numeric(factor(hclust_table$age)),                                colors = c("red","white","pink"),signed = FALSE)  sex_colors <- numbers2colors(as.numeric(factor(hclust_table$sex)),                                colors = c("red","white"),signed = FALSE)  par(mar = c(1,4,3,1),cex=0.8)  plotDendroAndColors(datExpr_tree, cbind(status_colors, age_colors, sex_colors),                      groupLabels = colnames(hclust_table),                      cex.dendroLabels = 0.8,                      marAll = c(1, 4, 3, 1),                      cex.rowText = 0.01,                      main = "Sample dendrogram and trait heatmap")}
image

第四步 确定软阀值

powers = c(c(1:10), seq(from = 12, to=20, by=2))## [1]  1  2  3  4  5  6  7  8  9 10 12 14 16 18 20sft = pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)# Plot the results:# Scale independencepar(mfrow = c(1,2));cex1 = 0.9;plot(sft$fitIndices$Power,     -sign(sft$fitIndices$slope) * sft$fitIndices$SFT.R.sq,     xlab="Soft Threshold (power)",     ylab="Scale Free Topology Model Fit,signed R^2",     type="n",      main = paste("Scale independence"))text(sft$fitIndices$Power,     -sign(sft$fitIndices$slope) * sft$fitIndices$SFT.R.sq,     labels=powers,     cex=cex1,     col="red")abline(h=0.90,col="RED")# Mean connectivityplot(sft$fitIndices$Power,     sft$fitIndices$mean.k.,     xlab="Soft Threshold (power)",     ylab="Mean Connectivity",     type="n",     main = paste("Mean connectivity"))text(sft$fitIndices$Power,     sft$fitIndices$mean.k.,     labels=powers,     cex=cex1,     col="red")best_beta=sft$powerEstimatebest_beta
image

第五步 构建共表达矩阵

net = blockwiseModules(datExpr, power = 8,                       TOMType = "unsigned", minModuleSize = 30,                       reassignThreshold = 0, mergeCutHeight = 0.25,                       numericLabels = TRUE, pamRespectsDendro = FALSE,                       saveTOMs = TRUE,                       saveTOMFileBase = "femaleMouseTOM",                       verbose = 3)## 这个诡异的问题必须记录一下,第一次运行blockwiseModules很顺利,后来就开始报错了,网上给的解决方案是重启电脑,实在不行## 把WGCNA包重装一次,详情看这里https://www.biostars.org/p/305714/table(net$colors)moduleColors <- labels2colors(net$colors)# Recalculate MEs with color labelsMEs0 = moduleEigengenes(datExpr, moduleColors)$eigengenes# 计算根据模块特征向量基因计算模块相异度:MEDiss = 1-cor(MEs0);# Cluster module eigengenesMETree = hclust(as.dist(MEDiss), method = "average");# Plot the resultplot(METree,      main = "Clustering of module eigengenes",     xlab = "",      sub = "")# 在聚类图中画出剪切线MEDissThres = 0.25abline(h=MEDissThres, col = "red")merge_modules = mergeCloseModules(datExpr, moduleColors, cutHeight = MEDissThres, verbose = 3)mergedColors = merge_modules$colors;mergedMEs = merge_modules$newMEs;plotDendroAndColors(net$dendrograms[[1]], cbind(moduleColors, mergedColors),                    c("Dynamic Tree Cut", "Merged dynamic"),                    dendroLabels = FALSE, hang = 0.03,                    addGuide = TRUE, guideHang = 0.05)
image
image

第六步 TOM图

nGenes = ncol(datExpr)nSamples = nrow(datExpr)geneTree = net$dendrograms[[1]]dissTOM = 1-TOMsimilarityFromExpr(datExpr, power = 6)nSelect = 400set.seed(10)select = sample(nGenes, size = nSelect)selectTOM = dissTOM[select, select]selectTree = hclust(as.dist(selectTOM), method = "average")selectColors = moduleColors[select]sizeGrWindow(9,9)plotDiss = selectTOM^7diag(plotDiss) = NATOMplot(plotDiss, selectTree, selectColors, main = "Network heatmap plot, selected genes")
image

第七步 模块与形状的关系

design=model.matrix(~0+ datTraits$subtype)colnames(design)=levels(datTraits$subtype)moduleColors <- labels2colors(net$colors)MEs0 = moduleEigengenes(datExpr, moduleColors)$eigengenesMEs = orderMEs(MEs0)moduleTraitCor = cor(MEs, design , use = "p")moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)sizeGrWindow(10,6)textMatrix = paste(signif(moduleTraitCor, 2), "
(",                   signif(moduleTraitPvalue, 1), ")", sep = "");dim(textMatrix) = dim(moduleTraitCor)par(mar = c(6, 8.5, 3, 3));# Display the correlation values within a heatmap plotlabeledHeatmap(Matrix = moduleTraitCor,               xLabels = names(hclust_table[,1:3]),               yLabels = names(MEs),               ySymbols = names(MEs),               colorLabels = FALSE,               colors = greenWhiteRed(50),               textMatrix = textMatrix,               setStdMargins = FALSE,               cex.text = 0.5,               zlim = c(-1,1),               main = paste("Module-trait relationships"))
image
MEs = moduleEigengenes(datExpr, moduleColors)$eigengenesMET = orderMEs(MEs)sizeGrWindow(5,7.5);par(cex = 0.9)plotEigengeneNetworks(MET, "", marDendro = c(0,4,1,2), marHeatmap = c(3,4,1,2), cex.lab = 0.8,                       xLabelsAngle= 90)# Plot the dendrogramsizeGrWindow(6,6);par(cex = 1.0)## 模块的聚类图plotEigengeneNetworks(MET, "Eigengene dendrogram", marDendro = c(0,4,2,0),                      plotHeatmaps = FALSE)# Plot the heatmap matrix (note: this plot will overwrite the dendrogram plot)par(cex = 1.0)## 性状与模块热图plotEigengeneNetworks(MET, "Eigengene adjacency heatmap", marHeatmap = c(3,4,2,2),                      plotDendrograms = FALSE, xLabelsAngle = 90)
image

第八步 感兴趣模块的具体基因分析

# names (colors) of the modulesmodNames = substring(names(MEs), 3)geneModuleMembership = as.data.frame(cor(datExpr, MEs, use = "p"));## 算出每个模块跟基因的皮尔森相关系数矩阵## MEs是每个模块在每个样本里面的值## datExpr是每个基因在每个样本的表达量MMPvalue = as.data.frame(corPvalueStudent(as.matrix(geneModuleMembership), nSamples));names(geneModuleMembership) = paste("MM", modNames, sep="");names(MMPvalue) = paste("p.MM", modNames, sep="");geneTraitSignificance = as.data.frame(cor(datExpr,use = "p"));GSPvalue = as.data.frame(corPvalueStudent(as.matrix(geneTraitSignificance), nSamples));module = "turquoise"column = match(module, modNames);moduleGenes = moduleColors==module;sizeGrWindow(7, 7);par(mfrow = c(1,1));verboseScatterplot(abs(geneModuleMembership[moduleGenes, column]),                   abs(geneTraitSignificance[moduleGenes, 1]),                   xlab = paste("Module Membership in", module, "module"),                   ylab = "Gene significance for STATUS",                   main = paste("Module membership vs. gene significance
"),                   cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = module)module = "brown"column = match(module, modNames);moduleGenes = moduleColors==module;sizeGrWindow(7, 7);par(mfrow = c(1,1));verboseScatterplot(abs(geneModuleMembership[moduleGenes, column]),                   abs(geneTraitSignificance[moduleGenes, 1]),                   xlab = paste("Module Membership in", module, "module"),                   ylab = "Gene significance for STATUS",                   main = paste("Module membership vs. gene significance
"),                   cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = module)
image
©著作权归作者所有,转载或内容合作请联系作者
  • 序言:七十年代末,一起剥皮案震惊了整个滨河市,随后出现的几起案子,更是在滨河造成了极大的恐慌,老刑警刘岩,带你破解...
    沈念sama阅读 215,245评论 6 497
  • 序言:滨河连续发生了三起死亡事件,死亡现场离奇诡异,居然都是意外死亡,警方通过查阅死者的电脑和手机,发现死者居然都...
    沈念sama阅读 91,749评论 3 391
  • 文/潘晓璐 我一进店门,熙熙楼的掌柜王于贵愁眉苦脸地迎上来,“玉大人,你说我怎么就摊上这事。” “怎么了?”我有些...
    开封第一讲书人阅读 160,960评论 0 350
  • 文/不坏的土叔 我叫张陵,是天一观的道长。 经常有香客问我,道长,这世上最难降的妖魔是什么? 我笑而不...
    开封第一讲书人阅读 57,575评论 1 288
  • 正文 为了忘掉前任,我火速办了婚礼,结果婚礼上,老公的妹妹穿的比我还像新娘。我一直安慰自己,他们只是感情好,可当我...
    茶点故事阅读 66,668评论 6 388
  • 文/花漫 我一把揭开白布。 她就那样静静地躺着,像睡着了一般。 火红的嫁衣衬着肌肤如雪。 梳的纹丝不乱的头发上,一...
    开封第一讲书人阅读 50,670评论 1 294
  • 那天,我揣着相机与录音,去河边找鬼。 笑死,一个胖子当着我的面吹牛,可吹牛的内容都是我干的。 我是一名探鬼主播,决...
    沈念sama阅读 39,664评论 3 415
  • 文/苍兰香墨 我猛地睁开眼,长吁一口气:“原来是场噩梦啊……” “哼!你这毒妇竟也来了?” 一声冷哼从身侧响起,我...
    开封第一讲书人阅读 38,422评论 0 270
  • 序言:老挝万荣一对情侣失踪,失踪者是张志新(化名)和其女友刘颖,没想到半个月后,有当地人在树林里发现了一具尸体,经...
    沈念sama阅读 44,864评论 1 307
  • 正文 独居荒郊野岭守林人离奇死亡,尸身上长有42处带血的脓包…… 初始之章·张勋 以下内容为张勋视角 年9月15日...
    茶点故事阅读 37,178评论 2 331
  • 正文 我和宋清朗相恋三年,在试婚纱的时候发现自己被绿了。 大学时的朋友给我发了我未婚夫和他白月光在一起吃饭的照片。...
    茶点故事阅读 39,340评论 1 344
  • 序言:一个原本活蹦乱跳的男人离奇死亡,死状恐怖,灵堂内的尸体忽然破棺而出,到底是诈尸还是另有隐情,我是刑警宁泽,带...
    沈念sama阅读 35,015评论 5 340
  • 正文 年R本政府宣布,位于F岛的核电站,受9级特大地震影响,放射性物质发生泄漏。R本人自食恶果不足惜,却给世界环境...
    茶点故事阅读 40,646评论 3 323
  • 文/蒙蒙 一、第九天 我趴在偏房一处隐蔽的房顶上张望。 院中可真热闹,春花似锦、人声如沸。这庄子的主人今日做“春日...
    开封第一讲书人阅读 31,265评论 0 21
  • 文/苍兰香墨 我抬头看了看天上的太阳。三九已至,却和暖如春,着一层夹袄步出监牢的瞬间,已是汗流浃背。 一阵脚步声响...
    开封第一讲书人阅读 32,494评论 1 268
  • 我被黑心中介骗来泰国打工, 没想到刚下飞机就差点儿被人妖公主榨干…… 1. 我叫王不留,地道东北人。 一个月前我还...
    沈念sama阅读 47,261评论 2 368
  • 正文 我出身青楼,却偏偏与公主长得像,于是被迫代替她去往敌国和亲。 传闻我的和亲对象是个残疾皇子,可洞房花烛夜当晚...
    茶点故事阅读 44,206评论 2 352

推荐阅读更多精彩内容