RNA-seq流程-从SRR下载到得到表达矩阵
1.数据下载
在~/project/new/路径下,将SRR号重定向到一个id里
#(文件夹都是建在~/project/new/路径下)
mkdir sra
cd sra
cat /four/mm/project/new/id |while read id;do (prefetch ${id});done
将SRA数据转成fastq
#在~/project/new/路径下
mkdir fq
cd fq
ls /four/mm/project/new/sra/*sra | while read id; do (fastq-dump --gzip --split-3 -O ./ ../sra/${id}); done
另外:cat ./id |while read id;do (fastq-dump --gzip --split-3 -O ./ ${id}); done
2.质控
#在~/project/new/路径下
mkdir fq.qc
cd fq.qc
###1.data statistics
ls ../fq/*.fastq.gz |while read id ;do (fastqc -t 2 -o ./ ${id});done
multiqc ./ #整合报告结果
###2.filter data
#双端测序
ls ~/fqmm/*_1.fastq.gz >1
ls ~/fqmm/*_2.fastq.gz >2
paste 1 2 > config
cat >qc.sh#下面是要输入的内容
source activate rna
bin_trim_galore=trim_galore
dir='/four/mm/project/new/clean'
cat $1 |while read id
do
arr=($id)
fq1=${arr[0]}
fq2=${arr[1]}
nohup $bin_trim_galore -q 25 --phred33 --length 36 -e 0.1 --stringency 3 --paired -o $dir $fq1 $fq2 &
done
source deactivate
#单端测序(当前路径是~/project/new)(注:此时od不仅仅是SRR号,还包括了.fastq.gz,所以输入文件的循环是${id},而不是${id}.fastq.gz)
mkdir clean
cat ./fq/od |while read id ;do (trim_galore --phred33 -q 25 -e 0.1 --length 36 --stringency 3 -o ./clean/ ./fq/${id}); done
3.比对+bam排序
#双端测序
nohup cat /four/mm/project/new/id |while read id;do #复制一份id到当前路径下
hisat2 -p 5 -x ~/index/grch38/genome #比对
-1 ${id}_1_val_1.fq.gz
-2 ${id}_2_val_2.fq.gz | #管道符,生成的文件进行下一步
samtools sort -@ 5 -o ~/rna.GSE52778/sort.bam/${id}.sort.bam - #bam排序
done &
#单端测序 当前路径~/project/new/clean/
mkdir sort.bam
nohup cat /four/mm/project/new/id |while read id;do (hisat2 -p 5 -x /four/mm/index/hisat/hg38/genome -U ${id}_trimmed.fq.gz|samtools sort -@ 5 -o ../sort.bam/${id}.sort.bam -) done &
4.计数
#双/单端测序
nohup cat /four/mm/project/new/id |while read id;do featureCounts -T 5 -p -t exon -g gene_id -a /four/mm/project/gtf/gencode.v29.annotation.gtf
-o ./featureCounts/all.counts.txt ./sort.bam/${id}.sort.bam; done &
终于完整的能得到表达矩阵了,能显示下面这个图片,真是好漂酿!