加载包,设置差异倍数logFC和P值
#=======================================================
#=======================================================
setwd('D:\\SCIwork\\F34TNBC\\s5deg')
library(dplyr)
library(tidyr)
library(tibble)
#清除环境变量
rm(list=ls())
padj=0.05
logFC=0.5
读取各个数据集的差异基因
setwd('D:\\SCIwork\\F34TNBC\\s5deg\\GSE18864')
a <- read.csv("allDiff.csv", header = T, row.names = 1)
setwd('D:\\SCIwork\\F34TNBC\\s5deg\\GSE58812')
b <- read.csv("allDiff.csv", header = T, row.names = 1)
setwd('D:\\SCIwork\\F34TNBC\\s5deg\\GSE76124')
c <-read.csv("allDiff.csv", header = T, row.names = 1)
setwd('D:\\SCIwork\\F34TNBC\\s5deg\\GSE76250')
d <- read.csv("allDiff.csv", header = T, row.names = 1)
setwd('D:\\SCIwork\\F34TNBC\\s5deg\\GSE83937')
e <- read.csv("allDiff.csv", header = T, row.names = 1)
setwd('D:\\SCIwork\\F34TNBC\\s5deg\\GSE95700')
f <- read.csv("allDiff.csv", header = T, row.names = 1)
setwd('D:\\SCIwork\\F34TNBC\\s5deg\\GSE106977')
g <- read.csv("allDiff.csv", header = T, row.names = 1)
setwd('D:\\SCIwork\\F34TNBC\\s5deg\\TCGA')
h <- read.csv("allDiff.csv", header = T, row.names = 1)
setwd('D:\\SCIwork\\F34TNBC\\s5deg')
读取各个数据集的差异基因
读取这些差异基因的数据框,其必须含有的列是logfc和p值两类
保存符合我们设置的差异基因标准基因
# --------------------------------------------------------
#
# --------------------------------------------------------
a <- a %>%
rownames_to_column(var='gene')%>%
dplyr::filter((logFC > 0.5 & P.Value < 0.05) | (logFC < -0.5 & P.Value < 0.05))
write.csv(a, file = "allDiff1.csv", row.names = F)
b <- b %>%
rownames_to_column(var='gene')%>%
dplyr::filter((logFC > 0.5 & P.Value < 0.05) | (logFC < -0.5 & P.Value < 0.05))
write.csv(b, file = "allDiff2.csv", row.names = F)
c <- c %>%
rownames_to_column(var='gene')%>%
dplyr::filter((logFC > 0.5 & P.Value < 0.05) | (logFC < -0.5 & P.Value < 0.05))
write.csv(c, file = "allDiff3.csv", row.names = F)
d <- d%>%
rownames_to_column(var='gene')%>%
dplyr::filter((logFC > 0.5 & P.Value < 0.05) | (logFC < -0.5 & P.Value < 0.05))
write.csv(d, file = "allDiff4.csv", row.names = F)
e <- e%>%
rownames_to_column(var='gene')%>%
dplyr::filter((logFC > 0.5 & P.Value < 0.05) | (logFC < -0.5 & P.Value < 0.05))
write.csv(e, file = "allDiff5.csv", row.names = F)
f <- f%>%
rownames_to_column(var='gene')%>%
dplyr::filter((logFC > 0.5 & P.Value < 0.05) | (logFC < -0.5 & P.Value < 0.05))
write.csv(f, file = "allDiff6.csv", row.names = F)
g <- g%>%
rownames_to_column(var='gene')%>%
dplyr::filter((logFC > 0.5 & P.Value < 0.05) | (logFC < -0.5 & P.Value < 0.05))
write.csv(g, file = "allDiff7.csv", row.names = F)
h <- h%>%
rownames_to_column(var='gene')%>%
dplyr::filter((logFC > 0.5 & P.Value < 0.05) | (logFC < -0.5 & P.Value < 0.05))
write.csv(h, file = "allDiff8.csv", row.names = F)
得到合并后的差异基因
# --------------------------------------------------------
#
# --------------------------------------------------------
#读取差异基因分析结果,其必须包含两列:基因名和差异倍数#
files=c("allDiff1.csv","allDiff2.csv",
"allDiff3.csv","allDiff4.csv",
"allDiff5.csv","allDiff6.csv",
"allDiff7.csv", "allDiff8.csv")
upList=list()
downList=list()
allFCList=list()
for(i in 1:length(files)){
inputFile=files[i]
rt=read.csv(inputFile,header=T,sep = ',', row.names = 1) # 注意文件读取
rt$Gene <- rownames(rt)
rt <- rt %>%
distinct(Gene, .keep_all = T)%>%
arrange(logFC)%>%
dplyr::select(Gene, logFC)
header=unlist(strsplit(inputFile,"_"))
downList[[header[1]]]=as.vector(rt[,1])
upList[[header[1]]]=rev(as.vector(rt[,1]))
fcCol=rt[,1:2]
colnames(fcCol)=c("Gene",header[[1]])
allFCList[[header[1]]]=fcCol}
#合并文件
mergeLe=function(x,y){
merge(x,y,by="Gene",all=T)}
newTab=Reduce(mergeLe,allFCList)
rownames(newTab)=newTab[,1]
newTab=newTab[,2:ncol(newTab)]
newTab[is.na(newTab)]=0
#计算共同上调基因
library(RobustRankAggreg)
upMatrix = rankMatrix(upList)
upAR = aggregateRanks(rmat=upMatrix)
colnames(upAR)=c("Name","Pvalue")
upAdj=p.adjust(upAR$Pvalue,method="bonferroni")
upXls=cbind(upAR,adjPvalue=upAdj)
upFC=newTab[as.vector(upXls[,1]),]
upXls=cbind(upXls,logFC=rowMeans(upFC))
write.table(upXls,file="up.xls",sep="\t",quote=F,row.names=F)
upSig=upXls[(upXls$Pvalue<padj & upXls$logFC>logFC),]
write.table(upSig,file="upSig.xls",sep="\t",quote=F,row.names=F)
#计算共同下调基因
downMatrix = rankMatrix(downList)
downAR = aggregateRanks(rmat=downMatrix)
colnames(downAR)=c("Name","Pvalue")
downAdj=p.adjust(downAR$Pvalue,method="bonferroni")
downXls=cbind(downAR,adjPvalue=downAdj)
downFC=newTab[as.vector(downXls[,1]),]
downXls=cbind(downXls,logFC=rowMeans(downFC))
write.table(downXls,file="down.xls",sep="\t",quote=F,row.names=F)
downSig=downXls[(downXls$Pvalue<padj & downXls$logFC< -logFC),]
write.table(downSig,file="downSig.xls",sep="\t",quote=F,row.names=F)
#合并上调与下调基因
allSig = rbind(upSig,downSig)
colnames(allSig)
write.csv(allSig,file = 'allSign.csv')
绘图准备:仅仅保留那些在所有数据存在的基因,以及删除TCGA和GEO趋势不一致的基因
setwd('D:\\SCIwork\\F34TNBC\\s5deg\\GSE18864')
a <- read.csv("allDiff.csv", header = T, row.names = 1) %>% rownames_to_column(var='gene')
setwd('D:\\SCIwork\\F34TNBC\\s5deg\\GSE58812')
b <- read.csv("allDiff.csv", header = T, row.names = 1)%>% rownames_to_column(var='gene')
setwd('D:\\SCIwork\\F34TNBC\\s5deg\\GSE76124')
c <-read.csv("allDiff.csv", header = T, row.names = 1)%>% rownames_to_column(var='gene')
setwd('D:\\SCIwork\\F34TNBC\\s5deg\\GSE76250')
d <- read.csv("allDiff.csv", header = T, row.names = 1)%>% rownames_to_column(var='gene')
setwd('D:\\SCIwork\\F34TNBC\\s5deg\\GSE83937')
e <- read.csv("allDiff.csv", header = T, row.names = 1)%>% rownames_to_column(var='gene')
setwd('D:\\SCIwork\\F34TNBC\\s5deg\\GSE95700')
f <- read.csv("allDiff.csv", header = T, row.names = 1)%>% rownames_to_column(var='gene')
setwd('D:\\SCIwork\\F34TNBC\\s5deg\\GSE106977')
g <- read.csv("allDiff.csv", header = T, row.names = 1)%>% rownames_to_column(var='gene')
setwd('D:\\SCIwork\\F34TNBC\\s5deg\\TCGA')
h <- read.csv("allDiff.csv", header = T, row.names = 1)%>% rownames_to_column(var='gene')
setwd('D:\\SCIwork\\F34TNBC\\s5deg')
target <- allSig$Name
length(target)
a <- subset(a, select =c("gene", "logFC" ))
b <- subset(b, select =c("gene", "logFC" ))
c <- subset(c, select =c("gene", "logFC" ))
d <- subset(d, select =c("gene", "logFC" ))
e <- subset(e, select =c("gene", "logFC" ))
f <- subset(f, select =c("gene", "logFC" ))
g <- subset(g, select =c("gene", "logFC" ))
h <- subset(h, select =c("gene", "logFC" ))
gene <- intersect(a$gene, b$gene)
gene <- intersect(c$gene, gene)
gene <- intersect(d$gene, gene)
gene <- intersect(e$gene, gene)
gene <- intersect(f$gene, gene)
gene <- intersect(g$gene, gene)
gene <- intersect(h$gene, gene)
gene <- intersect(allSig$Name, gene)
dat <- data.frame()
for (i in 1:length(gene)) {
dat[i,1] <- a[which(a$gene %in% gene[i]),'logFC']
dat[i,2] <- b[which(b$gene %in% gene[i]),'logFC']
dat[i,3] <- c[which(c$gene %in% gene[i]),'logFC']
dat[i,4] <- d[which(d$gene %in% gene[i]),'logFC']
dat[i,5] <- e[which(e$gene %in% gene[i]),'logFC']
dat[i,6] <- f[which(f$gene %in% gene[i]),'logFC']
dat[i,7] <- g[which(g$gene %in% gene[i]),'logFC']
dat[i,8] <- h[which(g$gene %in% gene[i]),'logFC']
}
colnames(dat) <- c("GSE18864", "GSE58812",
"GSE76124", "GSE76250",
"GSE83937", "GSE95700",
"GSE106977",'TCGA'
)
rownames(dat) <- gene
dat <- dat[which(rownames(dat) %in% rownames(allSig)),]
hminput <- dat
for (i in 1:dim(hminput)[1]) {
hminput$re[i] <- hminput[i,1]*hminput[i,2]*hminput[i,3]*hminput[i,4]*hminput[i,5]*hminput[i,6]*hminput[i,7]*hminput[i,8]
hminput$re1[i] <- hminput[i,1]*hminput[i,2]*hminput[i,3]*hminput[i,4]*hminput[i,5]*hminput[i,6]*hminput[i,7]
}
table(hminput$re )
hminput <- subset(hminput, hminput$re != 0)
hminput$re <- NULL
hminput$re1 <- hminput$re1*hminput$TCGA
hminput <- subset(hminput, hminput$re1 >0)
hminput$re1 <- NULL
for (x in 1:nrow(hminput )) {
for ( y in 1:ncol(hminput )) {
if(hminput[x,y] > 2){
hminput[x,y] <- 2}
if(hminput[x,y] < -2){
hminput[x,y] <- -2}
}
}
bk = unique(c(seq(-2,2, length=100)))
setwd('D:\\SCIwork\\F34TNBC\\s5deg')
library(pheatmap)
pdf(file="logFC.pdf",width = 4,height = 4)
pheatmap(hminput,display_numbers = F,
cluster_cols = T,
cluster_row = T,
border_color = NA,
breaks = bk,
show_colnames = T,
show_rownames = T,
color = colorRampPalette(c("#20B6E2",
"#020303",
"#F5EB17"))(100),
fontsize_row=1,
fontsize_col=5 )
dev.off()
代码数据私信我