以下代码用于SLURM调配系统的服务器递交任务
如果在个人电脑上的注意安装PEER软件,并配置环境变量,使用R代码部分不能照抄,需要调整代码
Combind_vst_DEGs.sh
#! /bin/bash
#SBATCH --time=3:00:00
#SBATCH --cpus-per-task=10
#SBATCH --mem=10g
ml R/3.5
ml peer
Rscript Combind_vst_DEGs.R counts.tsv sampleinfo.tsv
Combind_vst_DEGs.R
#!/usr/bin/env Rscript
#author: CFJiang 08032019
#should change the count cols
#should chang the DEGs conditions
suppressMessages(library(DESeq2, quietly=T))
suppressMessages(library(argparse))
suppressMessages(library(RColorBrewer))
suppressMessages(library(ggplot2))
suppressMessages(library(ggrepel))
suppressMessages(library(plyr))
combine_columns <- function(df, info) {
info$collapse <- factor(info$collapse)
for (f in levels(info$collapse)) {
samples <- rownames(subset(info, info$collapse == f))
if (length(samples)==1){
df[[f]] <- df[,samples]
df[,samples] <- list(NULL)
}else{
df[[f]] <- rowSums(df[,samples])
df[,samples] <- list(NULL)}
}
return(df)
}
#Combind counts files and caculate the cpm
parser <- ArgumentParser(description="")
parser$add_argument('file', metavar="*.tsv", help="input gene counts")
parser$add_argument('info', metavar="*.tsv", help="sample information file; requires sample names to be in first column and experimental condition to be in 'condition' column")
parser$add_argument('-o', '--out', metavar="*", help="output file name prefix")
args <- parser$parse_args()
if (is.null(args$out)) { args$out <- sub('.counts.tsv', '', args$file) }
prefix <- args$out
countdata <- read.table(args$file, sep="\t", header=TRUE, row.names=1)
info<- read.table(args$info, sep="\t", header=TRUE,row.names =1 )
countdata_com<- combine_columns(countdata[,4:ncol(countdata)], info)# Should ajust this
write.csv(countdata_com,paste0(prefix,"_combind_counts.csv"))
cpm <- t(t(countdata_com)/colSums(countdata_com) * 1000000)
row.names(cpm) <- row.names(countdata_com)
write.csv(cpm,paste0(prefix,"_combind_cpm.csv"))
pass <- rowSums(cpm > 1)
pass <- pass[pass > (ncol(countdata_com) / 2)]
countdata_pass <- subset(countdata_com, rownames(countdata_com) %in% names(pass))
write.csv(countdata_pass,paste0(prefix,"_combind_filter_counts.csv"))
cpm_pass <- subset(cpm, rownames(cpm) %in% names(pass))
write.csv(cpm_pass,paste0(prefix,"_combind_filter_cpm.csv"))
#--------------Construct Experiment---------------
#This is for paired analysis
colData<-info[!duplicated(info$collapse),]
row.names(colData)<-as.character(colData$collapse)
countdata_com_new<-countdata_pass[,as.character(colData$collapse)]
ddsOrig <- DESeqDataSetFromMatrix(countData=countdata_com_new,
colData=colData,
design=~subject_id + condition)
suppressMessages(ddsOrig <- DESeq(ddsOrig))
#plot PCA
vsd <- vst(ddsOrig, blind=FALSE)
pdf(file=paste0(prefix, '.pca.pdf'))
pcaData <- plotPCA(vsd, intgroup=c('condition', 'subject_id'), returnData=T)
xmin <- min(pcaData$PC1)
xmax <- max(pcaData$PC1)
xmin <- xmin - 0.3*(xmax - xmin)
ggplot(pcaData, aes(PC1, PC2, color=subject_id, shape=condition)) +
geom_text_repel(aes(label=name)) +
geom_point(size=3) +
xlim(xmin, NA) +
theme_minimal()
dev.off()
if (args$collapse) {
vsd <- vst(ddsOrig, blind=FALSE)
pdf(file=paste0(prefix, '.techreplicates.pca.pdf'))
pcaData <- plotPCA(vsd, intgroup=c("condition", "collapse"), returnData=T)
xmin <- min(pcaData$PC1)
xmax <- max(pcaData$PC1)
xmin <- xmin - 0.3*(xmax - xmin)
print(ggplot(pcaData, aes(PC1, PC2, shape=condition, color=collapse)) +
geom_text_repel(aes(label=name)) +
geom_point(size=3) +
xlim(xmin, NA) +
theme_minimal()
)
dev.off()
}
#DEGs
conditions <- levels(coldata$condition)
combinations <- expand.grid(conditions, conditions)
combinations <- combinations[duplicated(apply(combinations, 1,
function(x) paste(sort(x),collapse=''))), ]
message("Producing pairwise differential expression across conditions...")
testcombo <- function(x, ddsOrig) {
t1 <- x[1]
t2 <- x[2]
if(t1 != t2) {
name <- paste0(t1, '_v_', t2)
res <- results(ddsOrig, contrast=c("condition", t1, t2))
resOrdered <- res[order(res$padj),]
write.table(res,
file=paste0(prefix, '_', name, '.DESeq2.tsv'),
quote=FALSE, sep="\t", col.names=NA)
}
}
x <- apply(combinations, 1, testcombo, dds=ddsOrig)
message("Finished.")
#Normalized in DESeq2
rld <- rlogTransformation(ddsOrig)
exprSet_new<-assay(rld)
write.csv(exprSet_new,paste0(prefix,"_combind_filter_DESeq2_normalized.csv"))
###############VST
tmpfile <- tempfile(fileext=".csv")
tmpdir <- tempdir(check = FALSE)
vsd <- varianceStabilizingTransformation(as.matrix(countdata_pass))
write.table(vsd, file=paste0(args$out, ".vst.tsv"), sep="\t",
quote=F, col.names=NA)
write.table(t(vsd), file=tmpfile, sep=",",
quote=F, row.names=F, col.names=F)
command <- paste("peertool -f", tmpfile, "-n 10 -o", tmpdir)
message("Running command: ", command)
system(command)
peerCov <- read.table(file.path(tmpdir, "X.csv"), sep=",", header=F)
unlink(tmpfile)
unlink(tmpdir)
row.names(peerCov) <- colnames(vsd)
colnames(peerCov)
normalizeLM <- function(datarow, covariates) {
datarow <- unlist(datarow)
x <- mean(datarow)
covariates$gene <- datarow
fit <- lm(gene ~ ., data=covariates)
return(residuals(fit) + x)
}
normalized <- adply(vsd, 1, normalizeLM, covariates=peerCov)
write.csv(normalized, file=paste0(args$out, ".vst.normalized.csv"))