软件DESeq2
在R语言中运行
library(DESeq2)
library(readr)
mycounts <- read.csv("gene_count_matrix.csv")
rownames(mycounts)<-mycounts[,1] #将第一列作为行名
mycounts<-mycounts[,-1] #去除第一列
head(mycounts)
coldata<-read.csv("coldata.csv",header = F,row.names = 1)
#coldata.csv中一列为样本名,另一列为分组对照 /处理
colnames(coldata)[1]<- "condition" #命名第一列列名为condition
all(rownames(colData) == colnames(mycounts)) #查看行名是否等于列名
condition <- factor(coldata$condition)
condition = relevel( condition, "ck")
any(is.na(mycounts)) #查看是否有na值
mycounts[is.na(mycounts)] <- 0 #设NA值为0
dds <- DESeqDataSetFromMatrix(mycounts, coldata, design= ~ condition)
dds <- DESeq(dds)
res <- results(dds, contrast = c('condition', 'ht', 'ck'))
res = res[order(res$pvalue),]
head(res)
write.csv(res,file="DEseq2_allresults.csv",quote = FALSE)
res[which(res$log2FoldChange >= 1 & res$padj < 0.05),'sig'] <- 'up'
res[which(res$log2FoldChange <= -1 & res$padj < 0.05),'sig'] <- 'down'
res [which(abs(res$log2FoldChange) <= 1 | res$padj >= 0.05),'sig'] <- 'none'
diff_gene_deseq2 <- subset(res, sig %in% c('up', 'down'))
huoshandata <- subset(res, sig %in% c('up', 'down','none'))
write.csv(diff_gene_deseq2,file="diff_gene_deseq2.csv")
res_up <- subset(res, sig == 'up')
res_down <- subset(res, sig == 'down')
write.csv(res_up, file = 'diff_gene_up.csv')
write.csv(res_down, file = 'diff_gene_down.csv')
软件EdgeR
#安装
BiocManager::install("edgeR")
library(edgeR)
BiocManager::install("limma")
#读取文件
rawdata <- read.csv(file = "gene_count_matrix.csv",header = T,stringsAsFactors = F)
head(rawdata)
ovule_rawdata <- cbind(rawdata$gene_id,rawdata$SRR12606496,rawdata$SRR12606493)
head(ovule_rawdata)
#matrix变为data frame
ovule_rawdata <- data.frame(ovule_rawdata)
colnames(ovule_rawdata) <- c("X","gene_id","96_LM11","93_CML25")
head(ovule_rawdata)
# 转化完转化先存一份csv文件在电脑里,便于之后用电脑查看
write.csv(x = ovule_rawdata,file = "ovule_count.csv")
#存完之后直接从电脑导入你刚存的文件,这样做可以避免出现numeric数据框变成factor形式
ovule_rawdata <- read.table("ovule_count.csv",header = T,sep = ",")
head(ovule_rawdata)
ovule_rawdata <- ovule_rawdata[,-1]
#构建DEGlist
group <- 1:2
y <- DGEList(counts = ovule_rawdata[,3:4],genes = ovule_rawdata[,2],group = group)
#DGEList对象主要有三部分:
1、counts矩阵:包含的是整数counts;
2、samples数据框:包含的是文库(sample)信息。包含 lib.size列 :for the library size (sequencing depth) for each sample,如果不自定义, the library sizes will be computed from the column sums of the counts。其中还有一个group列,用于指定每个sample组信息
3、一个可选的数据框genes:gene的注释信息
#数据过滤 保留在至少在一个样本里有表达的基因(CPM > 1)
keep <- rowSums(cpm(y)>1) >= 1
y <- y[keep, , keep.lib.sizes=FALSE]
#标准化
y <- calcNormFactors(y)
#差异表达分析
不同差异表达分析工具的目标就是预测出dispersion(离散值), 有了离散值就能够计算p值. 那么dispersion怎么计算呢? edgeR给了几个方法
根据经验给定一个值(BCV, square-root-dispersion). edgeR给的建议是, 如果你是人类数据, 且实验做的很好(无过多的其他因素影响), 设置为0.4, 如果是遗传上相似的模式物种(这里为小鼠), 设置为0.1 (查询edgeR的bioconductor包所得)
如果觉得觉得基因较多的话,可以上调bcv的值
y_bcv <- y
bcv <- 0.4
et <- exactTest(y_bcv, dispersion = bcv ^ 2)
gene1 <- decideTestsDGE(et, p.value = 0.05, lfc = 0)
head(gene1)
summary(gene1)
# 改一下gene1的名称
colnames(gene1) <- "Signifi"
#组合将所需要的数据组成一个新的data.frame
results <- cbind(y$genes,y$counts,et$table,gene1)
#将新生成的results数据框写成一个excel数据表
write.csv(x = results,file = "DEresult.csv",row.names = F)