原始数据准备
学习生信我认为首先要具有全局观,首先要弄清楚做什么分析,需要准备什么文件,然后把这些文件变成什么样子。当准备工作做好后,直接调用R包进行分析和作图。为了方便看,下面先将原始数据及需要整理好的文件先列出来,然后再进行差异分析及作图。
-
从TCGA下载数据的样式:
-
基因注释文件:
- 整理后的数据样式:
整理成列为样本名,行为基因名
在此期间需要对基因进行的处理:只保留编码蛋白的基因,删除重复基因(在这过程中,需要使用基因注释文件)。
-
基因表达矩阵:
- 样本信息
样本注释信息:data.fram形式,样品名称,组别信息或处理信息。
当然,上面的表述过于简单,具体操作还需要看代码,后面我会列出来,且我运行不会报错的代码。
到这儿数据准备就结束了,下面我将从前到后整体走一遍。
原始数据整理
counts <- read.table("counts.txt",sep = "\t",row.names = 1,check.names = F,stringsAsFactors = F,header = T)
#加载基因注释文件
Ginfo_0 <- read.table("gene_length_Table.txt",sep = "\t",check.names = F,stringsAsFactors = F,header = T,row.names = 1)
#只要编码RNA
Ginfo <- Ginfo_0[which(Ginfo_0$genetype == "protein_coding"),]
#取行名交集
comgene <- intersect(rownames(counts),rownames(Ginfo))
counts <- counts[comgene,]
Ginfo <- Ginfo[comgene,] #为了保证提取的行名和顺序一致
#判断是否一致
a <- rownames(counts)
b <- rownames(Ginfo)
identical(a,b) #返回TRUE代表行名一致
counts$Gene <- as.character(Ginfo$genename) #新增Gene Symbol
counts <- counts[!duplicated(counts$Gene),] #去重复
rownames(counts) <- counts$Gene #将行名变为Gene Symbol
counts <- counts[,-ncol(counts)] #去除最后一列
#数据整理完毕,保存文件。
write.table(counts, file = "LIHC_counts_mRNA_all.txt",sep = "\t",row.names = T,col.names = NA,quote = F)
#保存癌症患者的counts
tumor <- colnames(counts)[substr(colnames(counts),14,16) == "01A"]
counts_01A <- counts[,tumor]
write.table(counts_01A, file = "LIHC_counts_mRNA_01A.txt",sep = "\t",row.names = T,col.names = NA,quote = F)
#对样本信息进行注释
conditions=data.frame(sample=colnames(counts), group=factor(ifelse(substr(colnames(counts),14,16) == "01A","T","N"),levels = c("N","T"))) %>%
column_to_rownames("sample")
DESeq2差异分析
library(tidyverse)
library(DESeq2)
dds <- DESeqDataSetFromMatrix(
countData = counts,
colData = conditions,
design = ~ group)
dds <- DESeq(dds)
resultsNames(dds)
res <- results(dds)
save(res,file = "LIHC_DEG.rda")#一定要保存!
res_deseq2 <- as.data.frame(res)%>%
arrange(padj) %>%
dplyr::filter(abs(log2FoldChange) > 2, padj<0.05)#根据自己需要
#保存差异基因
write.table(res_deseq2, file = "res_deseq2.txt",sep = "\t",row.names = T,col.names = NA,quote = F)
结果输出:
差异分析的可视化
TCGA差异分析热图
library(pheatmap)
#根据差异表达水平进行分组
load("LIHC_DEG.rda")
logFC_cutoff <- 2
DEG <- as.data.frame(res)%>%
arrange(padj) %>%
dplyr::filter(abs(log2FoldChange) > 2, padj < 0.05)
type1 = (DEG$padj < 0.05)&(DEG$log2FoldChange < -logFC_cutoff)
type2 = (DEG$padj < 0.05)&(DEG$log2FoldChange > logFC_cutoff)
DEG$change = ifelse(type1,"DOWN",ifelse(type2,"UP","NOT"))
table(DEG$change)
cg = rownames(DEG)[DEG$change !="NOT"] #提取上调或下调显著的基因列表
counts <-read.table("LIHC_counts_mRNA_all.txt",sep = "\t",row.names = 1,check.names = F,stringsAsFactors = F,header = T)
exp_diff <- counts[cg,]
group_list=factor(ifelse(substr(colnames(counts),14,16) == "01A","T","N"),levels = c("N","T"))
annotation_col=data.frame(group=group_list)
rownames(annotation_col)=colnames(exp_diff) #对样本名标注类别信息
pheatmap(exp_diff,
annotation_col=annotation_col,
scale = "row",
show_rownames = F,
show_colnames =F,
color = colorRampPalette(c("navy", "white", "red"))(50),
cluster_cols =F,
fontsize = 10,
fontsize_row=3,
fontsize_col=3)
输出的结果 (样本太多,基因太多,图太丑了。不过真正的项目分析,还需要精简和打磨,这个可以参考其他优秀的作者,很多人分享相关的代码)
TCGA差异分析热图
library(tidyverse)
library(ggpubr)
library(ggthemes)
load("LIHC_DEG.rda")
DEG <- as.data.frame(res)%>%
arrange(padj) %>%
dplyr::filter(abs(log2FoldChange) > 1, padj < 0.05)
logFC_cutoff <- 1
type1 = (DEG$padj < 0.05)&(DEG$log2FoldChange < -logFC_cutoff)
type2 = (DEG$padj < 0.05)&(DEG$log2FoldChange > logFC_cutoff)
DEG$change = ifelse(type1,"DOWN",ifelse(type2,"UP","NOT"))
table(DEG$change)
DEG$logP <- -log10(DEG$padj)
#添加基因标签信息
DEG$Label = "" #新加一列label
DEG <- DEG[order(DEG$padj), ] #对差异基因的p值进行从小到大的排序
DEG$Gene <- rownames(DEG)
#高表达的基因中,选择fdr值最小的5个(注意排序是按照-log10(padj)排的,不是Foldchange,这部分酌情考虑更改)
up.genes <- head(DEG$Gene[which(DEG$change == "UP")], 5)
#低表达的基因中,选择fdr值最小的5个
down.genes <- head(DEG$Gene[which(DEG$change == "DOWN")], 5)
#将up.genes和down.genes合并,并加入到Label中
DEG.top5.genes <- c(as.character(up.genes), as.character(down.genes))
DEG$Label[match(DEG.top5.genes, DEG$Gene)] <- DEG.top5.genes
#作图
ggscatter(DEG, x = "log2FoldChange", y = "logP",
color = "change",
palette = c("blue", "red", "black"),
size = 1,
label = DEG$Label,
font.label = 8,
repel = T,
xlab = "log2FoldChange",
ylab = "-log10(Adjust P-value)") +
theme_base() +
geom_hline(yintercept = -log10(0.05), linetype = "dashed") +
geom_vline(xintercept = c(-1, 1), linetype = "dashed")
结果输出:
致谢:
B站UP主-- 小陈医生想躺平
他是一位特别优秀的讲师,讲的很细致,逻辑性也很强,强烈推荐生信小白观看。
最近先把分析的框架写出来,后面有时间再慢慢打磨。