2019-10-22-火山图，国庆留下的坑，现在来填

具体的题目和前期的倒腾可以看我10-1日前后的记录，这次重新翻出来练习

第一步找到表达矩阵信息和样本信息构建对象

rm(list = ls())
options(stringsAsFactors = F)
library(DESeq2)
library( "biomaRt" ) 
library("pheatmap")
library("RColorBrewer")
library( "genefilter")
#1.获取counts矩阵和样本信息
a=read.table('exprSet/GSM2653819_Counts_notmergedTR_Healthy1_Tissue_1.txt.gz')

tmp=do.call(cbind,lapply(list.files('exprSet/'),function(x){
  return(read.table(file.path('exprSet/',x)) )
}))
b=tmp[,seq(3,42,by=3)]
rownames(b)=tmp[,1]
library(stringr)
tmp2=str_split(list.files('exprSet/'),'_',simplify = T)
colnames(b)=tmp2[,1]
datExpr=b[-1,]
datTraits=as.data.frame(tmp2[,c(1,4:6)])
datTraits$V3=unlist(lapply(datTraits$V3,function(x){
                          str_split(x,"[.]")[[1]][1]
                     }))
datTraits$V4=unlist(lapply(datTraits$V4,function(x){
  str_split(x,"[.]")[[1]][1]
}))
datTraits$V5=paste0(datTraits$V3,datTraits$V4)
newdatTraits=as.data.frame(datTraits[,c(1,2,5)])
colnames(newdatTraits)=c("gsm","group","baches")
datExpr$symbol=a[-1,2]
save(newdatTraits,datExpr, file = "1021rearranged-GSE84073.Rdata")
load(file = "1021rearranged-GSE84073.Rdata")
#法2提取样本信息
library(GEOquery)
a=getGEO('GSE84073')
length(a)
a[[1]]
#需要去的样本名称在a[[1]]里前14个
pd=pData(a[[1]])
pd=pd[c(1:14),]
library(stringr)
datTraits = data.frame(gsm=pd[,2],
                       treat = unlist(lapply(pd$title,function(x){
                         str_split(x," ")[[1]][1]
                       })),
                      gender = unlist(lapply(pd$characteristics_ch1.2,function(x){
                        str_split(x,":")[[1]][2]
                      }))
)
save(datExpr,datTraits,file = 'input.Rdata')

由于这里面2种方式得到的矩阵和表达信息不完全匹配的，但是，分组信息都没有选对，因为任务是做火山图，一般是选2组来比较，我这个数据集里面是有4个组，Healthy,HCC,CHC,CC3共4个组，所有可以做3个图。

那么下面我们需要重新进行分组，然后进行过滤操作，因为原来是counts信息，需要进行筛选归一化等处理。这里又遇到了一些问题因为之前参考的代码有点问题，今天重新阅读说明书来处理
第一步构建DEGList

rm(list = ls())
options(stringsAsFactors = F)
library(DESeq2)
library(biomaRt)
library(pheatmap)
library(RColorBrewer)
library(genefilter)
library(limma)
library(edgeR)
#######LOAD SAMPLES#####
#设置需要操作的目录
directory <- "."

###ALL
utils::untar("GSE84073_RAW.tar", exdir = ".")#解压缩文件

sampleFiles <- grep("Counts",list.files(directory),value=TRUE)#提取文件名称

sampleNames <- sub("(.*)(.gz)", "\\1", sampleFiles)
#用edgeR读入DGEList
files <- sampleNames
for(i in paste(files, ".gz", sep=""))
  R.utils::gunzip(i, overwrite=TRUE)
files[1]#查看第一个文件名
read.delim(files[1], nrow=5)#查看第一个文件内容
x <- readDGE(files, columns=c(1,3))#将所有文件读入对象DEGLlist
class(x)
dim(x)

#DESeq2读入数据
sampleFiles2 <- grep("Counts",list.files(directory),value=TRUE)

sampleCondition <- sapply(sampleFiles2, function(x){
  sample <- unlist(strsplit(x, '_')[[1]][4])
  if (grepl("Healthy1|Healthy2|Healthy3", sample)){
    cond <- "Healthy"
  } else if (grepl("HCC1|HCC3", sample)){
    cond <- "HCC"
  } else if (grepl("CHC1|CHC2", sample)){
    cond <- "CHC"
  } else {
    cond <- "CC"
  } 
  cond
})

library(stringr)
sN <- sub("(.*)(.txt.gz)", "\\1", sampleFiles)
sN <- sub("(.*)(_Counts_notmergedTR)", "\\1", sN)
sampleNames2 <- unlist(lapply(sN,function(x){
  str_split(x,"_")[[1]][1]
}))
sampleTable <- data.frame(sampleName  = sampleNames2,
                          fileName = sampleFiles2,
                          condition = sampleCondition)
ddsHTSeq <- DESeqDataSetFromHTSeqCount(sampleTable = sampleTable,
                                       directory = directory,
                                       design= ~ condition)
#用DESeqDataSetFromHTSeqCount这个函数来读取的时候，需要将这些txt文档合并一个。因此未能成功。
#那么下面的分析用edgeR读入DGEList来操作

x$samples
x$counts[1:4,1:4]
head(x$samples)
colnames(x$samples)
#虽然前面的不能读入，但是前面得到的sampleTable还是可以用来做转换的，现在就达到了目的。
x$samples$files <- sampleTable$sampleName
x$samples$group <- sampleTable$condition
x$samples$files
x$samples$group
ddsHTSeq <- x

花费了很多的时间来构建这个对象，并对其做了深入的了解。
本来打算继续对DEseq2包进行探索的作了如下尝试

#先看能否套用原来GitHub的代码来做过滤
#########FILTERING#######
#Remove genes with zero counts
dds <- ddsHTSeq[rowSums(counts(ddsHTSeq)) > 1, ]

发现这个代码还是不能运行，提示么有counts这个函数

第二步后面就按照`edgeR`包的说明书来进行后续分析

参考http://www.statsci.org/smyth/pubs/edgeRChapterPreprint.pdf

#那就放弃，还是用edgeR包的方法继续
#查阅说明书，A gene is kept in the analysis if it is sufficiently expressed (CPM > 1) in at least two samples:
y=x
CPM <- cpm(y)
keep <- rowSums(CPM > 1) >= 2
y <- y[keep, ]
#过滤后剩下17924个基因，说明书上说，这个种过滤方法没有牵扯到分组等信息，最大量的保留了数据
dim(y)
head(y$counts)
y <- calcNormFactors(y)
y$samples

The calcNormFactors function returns the DGEList data argument back with only the norm.factors changed.
查询了说明书了解到，这个返回值如果小于1，表明，该库中少数高度表达的基因已分配过多片段，这意味着剩余基因的片段较少，测序深度可能不够。在此处作为缩放因子表示样本计数分布的形状之间的组成差异。是归一化样本的参数。

第三步注释过滤后的基因转换为symbol

先找到NCBI数据库的信息数据ftp://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/Mammalia/
找到人类的进行下载：

image.png

下载后读入R

#utils::untar("Homo_sapiens.gene_info.gz", exdir = ".")本来想用这个解压，但是报错说不识别，不知道为啥
#那就自己解压好了
anno <- read.delim(file="Homo_sapiens.gene_info", header=FALSE, skip=1)
anno[1:4,1:4]
m <- match(rownames(y), anno[,2])
#发现无法匹配到，查看了具体信息发现，名称对不上，现在y的行名为ENSG……，也就是ENSEMBL基因名，这时候需要用人的数据包来注释：
library(org.Hs.eg.db)
?org.Hs.eg.db
ls("package:org.Hs.eg.db")
g2s=toTable(org.Hs.egSYMBOL);head(g2s)
g2e=toTable(org.Hs.egENSEMBL);head(g2e)
m <- match(rownames(y), g2e[,2])
y$genes <- g2s[m,]
head(y$genes)

简单数据探索

#绘图
plotMDS(y, col=as.numeric(y$samples$group))

查看样本分组情况是否合理

从plotMDS函数返回的图可以看到CHC的样本分组不太好，但是对照组的比较集中，图中Healthy的样本都聚在了一起，总的来说健康和疾病的区分还是很明显的。

#绘制火山图的探索
fac <- y$samples$group
fac <- factor(fac)
design <- model.matrix(~0+fac)
colnames(design) <- levels(fac)
design
design1=design[,c(1,4)]
y <- estimateDisp(y, design1)
y$common.dispersion
summary(y$tagwise.dispersion)
y$prior.df
plotBCV(y)
fit <- glmFit(y, design1)
lrt <- glmLRT(fit, contrast=c(0,1))
topTags(lrt)
tp <- topTags(lrt, n=Inf)
#画CC与Healthy比较的火山图
library(ggplot2)
DEG=tp$table
logCPM_cutoff <- with(DEG,mean(abs(logCPM)) + sd(abs(logCPM)) ) 
DEG$result = as.factor(ifelse(DEG$PValue < 0.05 & abs(DEG$logCPM) > logCPM_cutoff,
                              ifelse(DEG$logCPM > logCPM_cutoff ,'UP','DOWN'),'NOT'))
this_tile <- paste0('Cutoff for logCPM is ',round(logCPM_cutoff,3), #round保留小数位数
                    '\nThe number of up gene is ',nrow(DEG[DEG$result =='UP',]) ,
                    '\nThe number of down gene is ',nrow(DEG[DEG$result =='DOWN',])
)
ggplot(data=DEG, aes(x=logCPM, y=-log10(PValue), color=result)) +
  geom_point(alpha=0.4, size=1.75) +
  theme_set(theme_set(theme_bw(base_size=20)))+
  xlab("logCPM fold change") + ylab("-log10 p-value") +
  ggtitle( this_tile ) + theme(plot.title = element_text(size=15,hjust = 0.5))+
  scale_colour_manual(values = c('blue','black','red'))
save(CPM,DEG,design,design1,tp, file = "2019-10-22.Rdata")

尝试了很多次，但是这个火山图还是不会画，不知参数设置的问题，还是我对这个东西理解的问题，做流程的的东西还能理解，自己做分析就各种出问题。主要还是对对象的操作函数不熟练，还有统计上也需要补课。
用之前的办法是行不通的，这个计算不太一样,对说明书的不是理解的也是不全面，用了它的代码

DE <- tp$table[tp$table$FDR < 0.01,]$gene_id
plotSmear(lrt, de.tags=DE, cex = 0.4)
abline(h=c(-1, 1), col="blue")

不忍直视的结果