本次采用的是GSE66507的数据文件
前期使用trim进行数据清洗,STAR进行比对,featurecounts进行计数分析
Length 该区间的长度,最后一列的表头是你的输入文件的名称,代表的是这个meta-feature的表达量。
现在进行DESeq2差异分析
library(DESeq2)#载入包
data <- read.table("C:/Users/cendy/Desktop/counts.txt", header=TRUE, quote="\t",skip = 1)#读取数据
#号开头的注释行,记录了运行的命令;Geneid开头的行是表头,Geneid代表统计的meta-features的名称,Chr,Start,End对应染色体上的位置,Strand代表正负链,由于一个基因有多个外显子构成,所以这里的染色体位置信息有多个,和外显子个数一一对应
sampleNames <- c("SRR1825955","SRR1825956","SRR1825957","SRR1825958","SRR1825959","SRR1825960","SRR1825961","SRR1825962","SRR1825963","SRR1825964","SRR1825965","SRR1825966","SRR1825967","SRR1825968","SRR1825969","SRR1825970","SRR2240578","SRR2240579","SRR2240580","SRR2240581","SRR2240582","SRR2240583","SRR2240584","SRR2240585","SRR2240586","SRR2240587","SRR2240588","SRR2240589","SRR2240590","SRR2240591","SRR2240592","SRR2240593","SRR2240594","SRR2240595","SRR2240596","SRR2240597","SRR2240598","SRR2240599","SRR2240600","SRR2240601","SRR2240602","SRR2240603","SRR2240604","SRR2240605","SRR2240606","SRR2240607","SRR2240608","SRR2240609","SRR2240610","SRR2240611","SRR2240612","SRR2240613","SRR2240614","SRR2240615","SRR2240616","SRR2240617","SRR2240618","SRR2240619","SRR2240620","SRR2240621","SRR2240622","SRR2240623","SRR2240624","SRR2240625","SRR2240626","SRR2240627","SRR2240628","SRR2240629","SRR2240630","SRR2240631","SRR2240632","SRR2240633","SRR2240634","SRR2240635","SRR2240636","SRR2240637","SRR2240638","SRR2240639","SRR2240640","SRR2240641","SRR2240642","SRR2240643","SRR2240644","SRR2240645","SRR2240646","SRR2240647")
# 前六列分别是Geneid Chr Start End Strand Length
# 我们要的是count数,所以从第七列开始
names(data)[7:92] <- sampleNames
countData <- as.matrix(data[7:92])
countData <- countData[rowMeans(countData)>1,]
rownames(countData) <- data$Geneid
database <- data.frame(name=sampleNames, condition=c("SRR1825955","SRR1825956","SRR1825957","SRR1825958","SRR1825959","SRR1825960","SRR1825961","SRR1825962","SRR1825963","SRR1825964","SRR1825965","SRR1825966","SRR1825967","SRR1825968","SRR1825969","SRR1825970","SRR2240578","SRR2240579","SRR2240580","SRR2240581","SRR2240582","SRR2240583","SRR2240584","SRR2240585","SRR2240586","SRR2240587","SRR2240588","SRR2240589","SRR2240590","SRR2240591","SRR2240592","SRR2240593","SRR2240594","SRR2240595","SRR2240596","SRR2240597","SRR2240598","SRR2240599","SRR2240600","SRR2240601","SRR2240602","SRR2240603","SRR2240604","SRR2240605","SRR2240606","SRR2240607","SRR2240608","SRR2240609","SRR2240610","SRR2240611","SRR2240612","SRR2240613","SRR2240614","SRR2240615","SRR2240616","SRR2240617","SRR2240618","SRR2240619","SRR2240620","SRR2240621","SRR2240622","SRR2240623","SRR2240624","SRR2240625","SRR2240626","SRR2240627","SRR2240628","SRR2240629","SRR2240630","SRR2240631","SRR2240632","SRR2240633","SRR2240634","SRR2240635","SRR2240636","SRR2240637","SRR2240638","SRR2240639","SRR2240640","SRR2240641","SRR2240642","SRR2240643","SRR2240644","SRR2240645","SRR2240646","SRR2240647"))
rownames(database) <- sampleNames
## 设置分组信息并构建dds对象
dds <- DESeqDataSetFromMatrix(countData, colData=database, design= ~ 1)
我们想要知道treatment的影响,其中sex和age是主要的变异来源,那么我们的公式则应该为design <- ~sex + age + treatment
公式中的波浪线应该在所有的代数式之前,从而告诉DESeq2在进行差异表达分析时,使用后面的公式。而公式中代数的名称应该与数据框中的列名相匹配。
此外,DESeq2还允许我们研究变异之间的交互作用。比如,我们想知道sex对于treatment的影响,那么我们的公式就应该是design <- ~ sex + age + treatment + sex:treatment
此处需要注意,因为我们关注的是sex对于treatment的交互作用,因此sex:treatment应该放在公式的最末尾
dds <- dds[ rowSums(counts(dds)) > 1, ]
## 使用DESeq函数估计离散度,然后差异分析获得res对象
dds <- DESeq(dds)
res <- results(dds)
write.csv(res, "C:/Users/cendy/Desktop/res_des_output.csv")
resdata <- merge(as.data.frame(res), as.data.frame(counts(dds, normalized=TRUE)),by="row.names",sort=FALSE)
write.csv(resdata, "C:/Users/cendy/Desktop/all_des_output.csv", row.names=FALSE)