索引构建
#!/bin/bash
#SBATCH --job-name="star_index"
fastq_dir=~/hg38/GRCh38.p12.genome.fa #genecode下载,并放在目的目录(自己随意放),截止05072019 最新版本
gtf_dir=~/hg38/gencode.v30.annotation.gtf #(同上)
out_dir=~/STAR/hg38/genecodev30
ml STAR
STAR --runMode genomeGenerate --genomeDir $out_dir --genomeFastaFiles $fastq_dir --sjdbGTFfile $gtf_dir --sjdbOverhang 284 --runThreadN 20
构建好索引后,第一次比对
#!/bin/bash
genome_dir=~/index/STAR/hg38/genecodev30
out_dir=~/STAR_Align_V30/First
data_dir=~/trimed/reads
job_directory=~/trimed/reads
mkdir -p out_dir
for i in $(ls $pwd *.fq.gz | sed s/.trimmed_R[12].fq.gz// | sort -u);do
job_file="${job_directory}/${i}.job"
echo "#!/bin/bash
#SBATCH --job-name=${i}.STAR.job
#SBATCH --output=$out_dir/${i}.out
#SBATCH --time=3:00:00
#SBATCH --cpus-per-task=10
#SBATCH --mem=30g
ml STAR
STAR --runThreadN 10 --genomeDir $genome_dir \
--readFilesCommand zcat \
--readFilesIn $data_dir/${i}.trimmed_R1.fq.gz $data_dir/${i}.trimmed_R2.fq.gz \
--outSAMtype BAM SortedByCoordinate \
--outFileNamePrefix $out_dir/${i}">$job_file
sbatch $job_file
done
构建二次比对的新索引:
#!/bin/bash
fastq_dir=~/hg38/GRCh38.p12.genome.fa
data_dir=~/STAR_Align_V30/First
job_directory=~STAR_Align_V30/First/job
out_dir=~/STAR_Align_V30/First/out
mkdir -p out_dir
mkdir -p job_directory
mkdir -p out_dir
for i in $(ls *SJ.out.tab); do
job_file="${job_directory}/${i%SJ.out.tab*}.job"
echo "#!/bin/bash
#SBATCH --job-name=${i%SJ.out.tab*}.job
#SBATCH --output=$out_dir/${i%SJ.out.tab*}.out
#SBATCH --time=5:00:00
#SBATCH --cpus-per-task=12
#SBATCH --mem=50g
ml STAR
genomeDir=~/STAR_Align_V30/New_index/${i%SJ.out.tab*}
mkdir -p $genomeDir
STAR --runMode genomeGenerate --genomeDir $genomeDir --genomeFastaFiles $fastq_dir \
--sjdbFileChrStartEnd $data_dir/${i} --sjdbOverhang 284 --runThreadN 10">$job_file
sbatch $job_file
done
PS
STAR缓存文件太大,7TB空间run 190个RNA-seq都吃不消,还是用For循环来做。