RNA-seq多Run合并、VST标准化、PCA、差异分析

以下代码用于SLURM调配系统的服务器递交任务
如果在个人电脑上的注意安装PEER软件,并配置环境变量,使用R代码部分不能照抄,需要调整代码

Combind_vst_DEGs.sh

#! /bin/bash
#SBATCH --time=3:00:00
#SBATCH --cpus-per-task=10
#SBATCH --mem=10g
ml R/3.5
ml peer
Rscript Combind_vst_DEGs.R counts.tsv sampleinfo.tsv

Combind_vst_DEGs.R

#!/usr/bin/env Rscript
#author: CFJiang 08032019
#should change the count cols
#should chang the DEGs conditions
suppressMessages(library(DESeq2, quietly=T))
suppressMessages(library(argparse))
suppressMessages(library(RColorBrewer))
suppressMessages(library(ggplot2))
suppressMessages(library(ggrepel))
suppressMessages(library(plyr))
combine_columns <- function(df, info) {
  info$collapse <- factor(info$collapse)
  for (f in levels(info$collapse)) {
    samples <- rownames(subset(info, info$collapse == f))
    if (length(samples)==1){
      df[[f]] <- df[,samples]
      df[,samples] <- list(NULL)
    }else{
    df[[f]] <- rowSums(df[,samples])
    df[,samples] <- list(NULL)}
  }
  return(df)
}
#Combind counts files and caculate the cpm
parser <- ArgumentParser(description="")
parser$add_argument('file', metavar="*.tsv", help="input gene counts")
parser$add_argument('info', metavar="*.tsv", help="sample information file; requires sample names to be in first column and experimental condition to be in 'condition' column")
parser$add_argument('-o', '--out', metavar="*", help="output file name prefix")
args <- parser$parse_args()
if (is.null(args$out)) { args$out <- sub('.counts.tsv', '', args$file) }
prefix <- args$out

countdata <- read.table(args$file, sep="\t", header=TRUE, row.names=1)
info<- read.table(args$info, sep="\t", header=TRUE,row.names =1 )

countdata_com<- combine_columns(countdata[,4:ncol(countdata)], info)# Should ajust this
write.csv(countdata_com,paste0(prefix,"_combind_counts.csv"))
cpm <- t(t(countdata_com)/colSums(countdata_com) * 1000000)
row.names(cpm) <- row.names(countdata_com)
write.csv(cpm,paste0(prefix,"_combind_cpm.csv"))
pass <- rowSums(cpm > 1)
pass <- pass[pass > (ncol(countdata_com) / 2)]
countdata_pass <- subset(countdata_com, rownames(countdata_com) %in% names(pass))
write.csv(countdata_pass,paste0(prefix,"_combind_filter_counts.csv"))
cpm_pass <- subset(cpm, rownames(cpm) %in% names(pass))
write.csv(cpm_pass,paste0(prefix,"_combind_filter_cpm.csv"))
#--------------Construct Experiment---------------
#This is for paired analysis
colData<-info[!duplicated(info$collapse),]
row.names(colData)<-as.character(colData$collapse)
countdata_com_new<-countdata_pass[,as.character(colData$collapse)]
ddsOrig <- DESeqDataSetFromMatrix(countData=countdata_com_new, 
                                  colData=colData, 
                                  design=~subject_id + condition)
suppressMessages(ddsOrig <- DESeq(ddsOrig))

#plot PCA
vsd <- vst(ddsOrig, blind=FALSE)
pdf(file=paste0(prefix, '.pca.pdf'))
pcaData <- plotPCA(vsd, intgroup=c('condition', 'subject_id'), returnData=T)
xmin <- min(pcaData$PC1)
xmax <- max(pcaData$PC1)
xmin <- xmin - 0.3*(xmax - xmin)
ggplot(pcaData, aes(PC1, PC2, color=subject_id, shape=condition)) + 
  geom_text_repel(aes(label=name)) +
  geom_point(size=3) +
  xlim(xmin, NA) +
  theme_minimal()
dev.off()

if (args$collapse) {
  vsd <- vst(ddsOrig, blind=FALSE)
  pdf(file=paste0(prefix, '.techreplicates.pca.pdf'))
  pcaData <- plotPCA(vsd, intgroup=c("condition", "collapse"), returnData=T)
  xmin <- min(pcaData$PC1)
  xmax <- max(pcaData$PC1)
  xmin <- xmin - 0.3*(xmax - xmin)
  print(ggplot(pcaData, aes(PC1, PC2, shape=condition, color=collapse)) + 
          geom_text_repel(aes(label=name)) +
          geom_point(size=3) +
          xlim(xmin, NA) +
          theme_minimal()
  )
  dev.off()
}

#DEGs
conditions <- levels(coldata$condition)
combinations <- expand.grid(conditions, conditions)
combinations <- combinations[duplicated(apply(combinations, 1,
                                              function(x) paste(sort(x),collapse=''))), ]

message("Producing pairwise differential expression across conditions...")
testcombo <- function(x, ddsOrig) {
  t1 <- x[1]
  t2 <- x[2]
  if(t1 != t2) {
    name <- paste0(t1, '_v_', t2)
    res <- results(ddsOrig, contrast=c("condition", t1, t2))
    resOrdered <- res[order(res$padj),]
    write.table(res, 
                file=paste0(prefix, '_', name, '.DESeq2.tsv'), 
                quote=FALSE, sep="\t", col.names=NA)
  }
}

x <- apply(combinations, 1, testcombo, dds=ddsOrig)
message("Finished.")

#Normalized in DESeq2
rld <- rlogTransformation(ddsOrig)
exprSet_new<-assay(rld)
write.csv(exprSet_new,paste0(prefix,"_combind_filter_DESeq2_normalized.csv"))
###############VST
tmpfile <- tempfile(fileext=".csv")
tmpdir <- tempdir(check = FALSE)
vsd <- varianceStabilizingTransformation(as.matrix(countdata_pass))
write.table(vsd, file=paste0(args$out, ".vst.tsv"), sep="\t",
            quote=F, col.names=NA)
write.table(t(vsd), file=tmpfile, sep=",",
            quote=F, row.names=F, col.names=F)
command <- paste("peertool -f", tmpfile, "-n 10 -o", tmpdir)
message("Running command: ", command)
system(command)
peerCov <- read.table(file.path(tmpdir, "X.csv"), sep=",", header=F)
unlink(tmpfile)
unlink(tmpdir)
row.names(peerCov) <- colnames(vsd)
colnames(peerCov)

normalizeLM <- function(datarow, covariates) {
  datarow <- unlist(datarow)
  x <- mean(datarow)
  covariates$gene <- datarow
  fit <- lm(gene ~ ., data=covariates)
  return(residuals(fit) + x)
}

normalized <- adply(vsd, 1, normalizeLM, covariates=peerCov)
write.csv(normalized, file=paste0(args$out, ".vst.normalized.csv"))



最后编辑于
©著作权归作者所有,转载或内容合作请联系作者
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。

推荐阅读更多精彩内容