1、数据处理
数据:https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE141295
解压后,将所有文件放到input文件夹
# Build one count table from the per-sample files in ./input.
#
# Every file is read without a header; its first column is the gene ID and its
# second column holds that sample's values. The tables are joined on the gene
# ID, so only IDs present in every file are kept.

# Read one sample file and label its two columns as "geneid" and <sample_id>.
read_sample_counts <- function(path, sample_id) {
  counts <- read.table(file = path, header = FALSE, dec = ".")
  # The merge below relies on exactly one ID column plus one value column.
  stopifnot(ncol(counts) == 2L)
  names(counts) <- c("geneid", sample_id)
  counts
}

# File names inside the input folder, e.g. "GSM4200430_IgAN1.txt".
sample_files <- list.files("input")
if (length(sample_files) == 0) {
  stop("No sample files found in the 'input' folder", call. = FALSE)
}
# Relative paths to those files, e.g. "./input/GSM4200448_CTRL7.txt".
sample_paths <- paste0("./input/", sample_files)
# Sample labels are the first 16 characters of each file name.
# NOTE(review): a longer label (e.g. "..._IgAN10") would be truncated here --
# confirm that all file names fit this pattern.
sample_ids <- substr(sample_files, 1, 16)

count_tables <- Map(read_sample_counts, sample_paths, sample_ids)
# Reduce() also copes with a single file, where no merge is needed at all.
merge.data <- Reduce(
  function(left, right) merge(left, right, by = "geneid"),
  count_tables
)

# Drop the first five rows of the merged (ID-sorted) table.
# NOTE(review): presumably these are the aligner/counter summary rows rather
# than genes -- confirm against the input files.
merge.data <- merge.data[-c(1:5), ]
write.csv(merge.data, file = "merge.data")
exprSet <- merge.data
input:比对过了,导进来的编号都对得上
image.png
2、DESeq2差异化分析
2-1,先整理一下exprSet
library(stringr)
# Start again from the merged table so this step can be re-run on its own.
exprSet <- merge.data
# Keep only the first 15 characters of every gene ID.
# NOTE(review): presumably this strips a version suffix from the IDs -- confirm.
exprSet[["geneid"]] <- substr(exprSet[["geneid"]], 1, 15)
# For IDs that now collide, keep the first row and discard the later ones.
exprSet <- exprSet[!duplicated(exprSet[["geneid"]]), ]
# Move the gene ID into the row names, then drop the leading ID column so that
# only the per-sample columns remain.
rownames(exprSet) <- exprSet[["geneid"]]
exprSet <- exprSet[, -1]
input:
image.png
2-2 DESeq2差异化分析
# Sample sheet
# The sample annotation table was prepared by hand outside of this script.
colData <- read.csv("pdata-肾病.csv", header = TRUE)
# Column X supplies the row names of the sample sheet.
rownames(colData) <- colData$X
# One-column data frame (second column of the sheet), reused later as the
# heatmap column annotation.
coldata2 <- colData[, 2, drop = FALSE]

# DESeq2 differential expression analysis
library(DESeq2)
# NOTE(review): the rows of colData are assumed to be in the same order as the
# columns of exprSet -- confirm before trusting the results.
dds <- DESeqDataSetFromMatrix(
  countData = exprSet,
  colData = colData,
  design = ~ condition
)
dds <- DESeq(dds)
# Compare IgAN against CTRL within the "condition" column.
res <- results(dds, contrast = c("condition", "IgAN", "CTRL"))

# Convert to a plain data frame and sort by ascending raw p-value.
DEG <- as.data.frame(res)
DEG <- DEG[order(DEG[["pvalue"]]), ]
# NOTE(review): the file name says "肾癌" while the contrast above is
# IgAN vs CTRL -- confirm the intended name.
write.csv(DEG, file = "肾癌DEseq差异分析结果.csv")
input:
image.png
image.png
3、可视化-火山图
library(ggplot2)
# 3-4 Visualise the DESeq2 results as a volcano plot.

# Category labels shown in the legend.
volcano_labels <- c(
  down = "down (p.adj < 0.05, log2FC <= -1)",
  none = "no diff",
  up = "rich (p.adj < 0.05, log2FC >= 1)"
)

# Classify every gene. Anything that is not clearly up or down -- including
# genes whose padj or log2FoldChange is NA -- is labelled "no diff", so no row
# is left with a missing category.
is_significant <- !is.na(DEG$padj) & !is.na(DEG$log2FoldChange) &
  DEG$padj < 0.05
DEG$sig <- volcano_labels[["none"]]
DEG$sig[is_significant & DEG$log2FoldChange >= 1] <- volcano_labels[["up"]]
DEG$sig[is_significant & DEG$log2FoldChange <= -1] <- volcano_labels[["down"]]

# Colours are keyed by label so they stay correct even when one category is
# absent from the data (an unnamed vector would be matched by position).
volcano_colors <- c("blue2", "gray30", "red2")
names(volcano_colors) <- unname(volcano_labels[c("down", "none", "up")])

# Draw the plot.
volcano_p2 <- ggplot(DEG, aes(log2FoldChange, -log10(padj))) +
  geom_point(aes(color = sig), alpha = 0.6, size = 1) +
  scale_color_manual(values = volcano_colors) +
  theme(
    panel.grid = element_blank(),
    panel.background = element_rect(color = "black", fill = "transparent"),
    legend.position = c(0.26, 0.92)
  ) +
  theme(
    legend.title = element_blank(),
    legend.key = element_rect(fill = "transparent"),
    legend.background = element_rect(fill = "transparent")
  ) +
  # Guide lines at the fold-change and adjusted p-value thresholds used above.
  geom_vline(xintercept = c(-1, 1), color = "gray", size = 0.25) +
  geom_hline(yintercept = -log10(0.05), color = "gray", size = 0.25) +
  # The y axis shows padj, so label it as the adjusted p-value.
  labs(x = "log2 Fold Change", y = "-log10 adjusted p-value", color = NA) +
  xlim(-5, 5)

# Save the plot object created above (the original referenced an undefined
# `volcano_p`).
ggsave("IgAN-volcano_p.png", volcano_p2, width = 5, height = 6)
input
image.png
4、可视化 MA-plot
# MA plot of the DESeq2 results object `res`.
# NOTE(review): BiocGenerics is attached here presumably to make the plotMA
# generic available -- confirm it is still needed alongside DESeq2.
library(BiocGenerics)
# Restrict the y axis to the range [-5, 5].
plotMA(res,ylim = c(-5,5))
input:差异基因标记为蓝色
Image.png
5、热图(差异基因表达量热图)
# Heatmap of the top differentially expressed genes.

# Remove rows containing NA. (The original called na.fail(DEG) first, which
# stops with an error as soon as DEG contains any NA, so this line was never
# reached.)
DEG <- na.omit(DEG)
library(pheatmap)

# Keep genes with padj < 0.05 and |log2FoldChange| > 1.
is_diff <- DEG$padj < 0.05 & abs(DEG$log2FoldChange) > 1
diff_gene <- DEG[is_diff, , drop = FALSE]
# Sort by adjusted p-value and take the 50 most significant genes.
diff_gene_sort <- diff_gene[order(diff_gene$padj), ]
choose_gene <- head(rownames(diff_gene_sort), 50)

# Expression values of the chosen genes (rows) across all samples (columns).
# NOTE(review): exprSet holds the merged input values as-is; consider using
# normalised or variance-stabilised values for plotting -- confirm intent.
choose_matrix <- exprSet[choose_gene, ]

# Standardise each gene across samples. scale() works column-wise, so the
# matrix is transposed before and after; calling scale() directly would
# standardise each sample across genes instead.
gene_values <- as.matrix(choose_matrix)
# Constant rows would become NaN after scaling and break the clustering.
gene_values <- gene_values[apply(gene_values, 1, sd) > 0, , drop = FALSE]
choose_matrix_scale <- t(scale(t(gene_values)))

pheatmap(
  choose_matrix_scale,
  show_rownames = FALSE,
  show_colnames = FALSE,
  annotation_col = coldata2
)
input:
image.png