# Build the two GSEA input files from the vendor-supplied Excel workbook:
#   * expforGSEA.txt - tab-separated expression table (symbol, DESCRIPTION,
#                      then one column per sample)
#   * group.cls      - phenotype labels for the same samples
rm(list = ls())  # NOTE: clears every object in the current workspace
options(stringsAsFactors = FALSE)

# Read the Excel workbook supplied by the company.
library(readxl)     # install this package first if it is missing
library(tidyverse)  # install this package first if it is missing
# Remember to change the file name to your own workbook.
eSet <- read_excel("Gene Expression Profiling Data.xls", sheet = 1, na = "NA")
eSet <- arrange(eSet, GeneSymbol)
class(eSet)

# (1) Extract the expression matrix `exp`.
eSet <- as.data.frame(eSet)

# Collapse duplicated gene symbols by averaging their expression values.
# Column 2 must be `GeneSymbol` (it is referenced by name below); columns
# 9-14 are assumed to be the three NC samples followed by the three si
# samples - TODO confirm against the workbook layout.
exprset <- eSet[, c(2, 9, 10, 11, 12, 13, 14)]
exprset <- arrange(exprset, GeneSymbol)
class(exprset)
# `mean` is called without na.rm, so a missing value in any duplicate row
# makes the averaged value for that gene/sample NA.
exprset_symbol <- aggregate(
  x = exprset[, 2:ncol(exprset)],
  by = list(exprset$GeneSymbol),
  FUN = mean
)
head(exprset_symbol)

# Name all seven columns in one step. Assigning a single string to
# colnames() would replace the whole names vector and leave the sample
# columns named NA, so the gene column and the sample columns are set
# together here.
exp <- exprset_symbol
colnames(exp) <- c("symbol", paste0("NC_", 1:3), paste0("si_", 1:3))

# Expression table for GSEA: gene symbol, a placeholder DESCRIPTION column,
# then the sample columns (joined on `symbol`).
expforGSEA <- data.frame(
  symbol = exp$symbol,
  DESCRIPTION = rep("na", nrow(exp))
)
expforGSEA <- merge(expforGSEA, exp)

# Phenotype (.cls) content, three lines:
#   "<n samples> <n classes> 1", "# <class names>", one label per sample.
group <- c(rep("NC", 3), rep("si", 3))
group <- paste(group, collapse = " ")
group <- c(paste(c(3 + 3, 2, 1), collapse = " "), "# NC si", group)

# Open both saved files afterwards and check them for mistakes.
write.table(
  expforGSEA,
  file = "expforGSEA.txt",
  sep = "\t",
  col.names = TRUE,
  row.names = FALSE,
  quote = FALSE
)
write.table(
  file = "group.cls",
  group,
  col.names = FALSE,
  row.names = FALSE,
  quote = FALSE
)