Swimming downstream: statistical analysis of differential transcript usage following Salmon quantification
Salmon 进行转录本定量
转录组分析:RNA-seq pipeline through kallisto or salmon
tximport 将 Salmon 定量结果导入 DESeq2
提取 genecode的gtf注释信息
我的例子,比较复杂
构建索引
#! /bin/bash
#SBATCH --time=2:00:00
#SBATCH --cpus-per-task=20
#SBATCH --mem=20g
##index
ml salmon
transcript_fa=~/all.human.mouse.isoforms.fa
out_dir=~/humanized
salmon index -t $transcript_fa -i $out_dir/humanized_index \
-k 31 --gencode -p 20 --keepFixedFasta --keepDuplicates
比对
#!/bin/bash
#SBATCH --job-name=QC_trimed
#SBATCH --time=1:00:00
#SBATCH --cpus-per-task=2
#SBATCH --mem=2g
index=~/humanized_index
data_dir=~/short_reads/fastq
out_dir=~/short_reads/salmon
threads=5
log_dir=${out_dir}/log
job_dir=${out_dir}/jobs
mkdir -p $log_dir
mkdir -p $job_dir
for i in $(ls $pwd *.fq.gz | sed s/_[12].fq.gz// | sort -u);do
job_file="${job_dir}/${i}.job"
echo "#!/bin/bash
#SBATCH --job-name=${i}.salmon.job
#SBATCH --output=$log_dir/${i}.salmon.out
#SBATCH --time=24:00:00
#SBATCH --cpus-per-task=${threads}
#SBATCH --mem=20g
ml salmon
new_out=$out_dir/${i}
mkdir -p \$new_out
salmon quant -l A -p ${threads} \
-i $index -g $gtf \
--validateMappings \
-1 ${i}_1.fq.gz -2 ${i}_2.fq.gz \
-o \$new_out
" > $job_file
sbatch $job_file
done
整理数据,分析差异
rm(list = ls())
options(stringsAsFactors=F)
suppressMessages(library(GenomicFeatures))
suppressMessages(library(tximport))
suppressMessages(library(DESeq2))
suppressMessages(library(tidyverse))
gtf <- rtracklayer::import('~/short_reads/salmon_humanized_fast_nanopore/all.human.mouse.isoforms.gtf')
gtf_df=as.data.frame(gtf)
gtf_df<- gtf_df %>% filter(type=="transcript") %>% select(transcript_id,gene_id)
write.csv(gtf_df,"~/short_reads/salmon_humanized_fast_nanopore/transcript_gene_id.csv",row.names = F)
sample_info<-read.csv("~/short_reads/sample_info.csv")
sampleList<-as.character(sample_info$Run)
fileList <- file.path("~/short_reads/salmon_humanized/salmon", sampleList, "quant.genes.sf")
names(fileList) <- sampleList
#gene levels
txi_gene <- tximport(fileList, type="salmon", txOut=TRUE,
countsFromAbundance="no")
txi_gene_counts<-txi_gene$counts
txi_gene_counts <- txi_gene_counts[rowSums(txi_gene_counts) > 0,]
txi_gene_tpm<-txi_gene$abundance
txi_gene_tpm <- txi_gene_tpm[rowSums(txi_gene_tpm) > 0,]
#txi_gene_pick<-txi_gene_counts["ENSMUSG00000096768.8",]
write.csv(txi_gene_counts,"~/salmon_analysis/gene_counts.csv")
write.csv(txi_gene_tpm,"~/salmon_analysis/gene_tpm.csv")
###transcript levels
fileList <- file.path(data_dir, sampleList, "quant.sf")
names(fileList) <- sampleList
txi_transcript <- tximport(fileList, type="salmon", txOut=TRUE,
countsFromAbundance="no")
txi_transcript_counts <- txi_transcript$counts
txi_transcript_counts <- txi_transcript_counts[rowSums(txi_transcript_counts) > 0,]
#txi_transcript_pick<-txi_transcript_counts["Hb681a1a3-59b8-4c32-b42a-215851c2d0f6",]
txi_transcript_tpm<-txi_transcript$abundance
txi_transcript_tpm <- txi_transcript_tpm[rowSums(txi_transcript_tpm) > 0,]
#txi_transcript_pick<-txi_transcript_counts["Hb681a1a3-59b8-4c32-b42a-215851c2d0f6",]
write.csv(txi_transcript_counts,"~/salmon_analysis/transcript_counts.csv")
write.csv(txi_transcript_tpm,"~/salmon_analysis/transcript_tpm.csv")
##degs analysis
dds <- DESeqDataSetFromTximport(txi_gene, colData=sample_info, design= ~ group)
dds <- DESeq(dds)
res <- results(dds)
write.table(res,"deg_result.csv", sep = ",", row.names = TRUE)
##det同理
dds <- DESeqDataSetFromTximport(txi_transcript, colData=sample_info, design= ~ group)
dds <- DESeq(dds)
res <- results(dds)