1. 安装软件
1.1. 下载安装MMAPPR
wget http://yostlab.genetics.utah.edu/wp-content/uploads/2019/02/MMAPPR_083.tar.gz
tar -zxvf MMAPPR_083.tar.gz
cd MMAPPR_083
1.2. 下载安装USeq软件
wget https://github.com/HuntsmanCancerInstitute/USeq/releases/download/USeq_9.3.1/USeq_9.3.1.zip
unzip USeq_9.3.1.zip
2. 构建参考基因组索引
该索引用于后面Hisat2比对
hisat2-build ./B73_v5.fa B73_v5.fa
3. 利用Hisat2进行序列比对
mkdir 01Cleandata 02Alignment
for i in mut WT
do
#去接头
java -jar trimmomatic-0.39.jar PE -phred33 -threads 8 ./${i}_1.fq.gz ./${i}_2.fq.gz ./01Cleandata/${i}_1_cleandata.fq.gz ./01Cleandata/${i}_good_unpaired.fq.gz ./01Cleandata/${i}_2_cleandata.fq.gz ./01Cleandata/${i}_2_unpaired.fq.gz LEADING:20 TRAILING:20 SLIDINGWINDOW:4:15 MINLEN:20
#比对到基因组
hisat2 --dta -x B73_v5 -p 8 -1 ./01Cleandata/${i}_cleandata.fq.gz -2 ./01Cleandata/${i}_2_cleandata.fq.gz -S ./02Alignment/${i}.sam
将sam文件转化为bam文件
samtools view -S ./02Alignment/${i}.sam -b > ./02Alignment/${i}.bam
#为了节约空间,可以将不用的sam文件删除
sudo rm -r ./02Alignment/${i}.sam
samtools sort ./02Alignment/${i}.bam -o ./02Alignment/${i}_sorted.bam
done
4. 准备MMAPPR的注释文件
4.1 将基因组分成不同的染色体
pip install pyfaidx
faidx -x B73_v5.fa
mkdir ${path}/MMAPPR_083/annotation_files/B73_v5
cp chr*.fa ${path}/MMAPPR_083/annotation_files/B73_v5
4.2 给基因组建立索引
该索引是用于MMAPPR软件调用
samtools faidx 能够对fasta 序列建立一个后缀为.fai 的文件,根据这个.fai 文件和原始的fastsa文件, 能够快速的提取任意区域的序列
其他提取序列的方法:
samtools faidx input.fa chr1 > chr1.fa
samtools faidx input.fa chr1:100-200 > chr1.fa
samtools faidx B73_v5.fa
cp ./B73_v5.fa ${path}/MMAPPR_083/annotation_files/
cp ./B73_v5.fa.fai ${path}/MMAPPR_083/annotation_files/
4.3 将gtf文件转化为mammap 可以识别的注释文件
I. 下载gtfToGenePred
wget -c http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/gtfToGenePred
chmod 777 gtfToGenePred
II. 利用gtfToGenePred 将gtf文件转化为UCSC refflat 格式
echo "#geneName\tname\tchrom\tstrand\ttxStart\ttxEnd\tcdsStart\tcdsEnd\texonCount\texonStarts\texonEnds" > B73_v5.genes
./gtfToGenePred -genePredExt -geneNameAsName2 B73_v5.gtf -ignoreGroupsWithoutExons /dev/stdout| awk '{print $1, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10}' >> B73_v5.genes
cp ./B73_v5.genes ${path}/MMAPPR_083/annotation_files/
refFlat 格式如下
table refFlat
"A gene prediction with additional geneName field."
(
string geneName; "Name of gene as it appears in Genome Browser."
string name; "Name of gene"
string chrom; "Chromosome name"
char[1] strand; "+ or - for strand"
uint txStart; "Transcription start position"
uint txEnd; "Transcription end position"
uint cdsStart; "Coding region start"
uint cdsEnd; "Coding region end"
uint exonCount; "Number of exons"
uint[exonCount] exonStarts; "Exon start positions"
uint[exonCount] exonEnds; "Exon end positions"
)
5. 运行MMAPPR软件
cd ${path}/MMAPPR_083/
vim run_mmappr_test.sh
#修改文件内容如下
#! /bin/bash
python3 ./MMAPPR_083.py \
-c ./WT_sorted.bam \ # 经过sort之后的野生型BAM文件
-m ./mut_sorted.bam \ # 经过sort之后的突变体BAM文件
-g B73_v5 \ # 参考基因组文件
-u /The_path_to_where_you_installed/USeq_9.3.1/ \ #Useq 软件的安装路径
--cores 8 # 软件运行所用到的核心数
#保存修改
nohup sh run_mmappr_test.sh &
#运行程序,等待分析结果
MMAPPR 文件夹的数据结构,请勿移动,可将文件替换成自己的文件即可
mmappr/
MMAPPR_083.py # Main python script.
MMAPPR_083.R # R script, invoked by the python script.
run_mmappr_test.sh # Sample bash script to run MMAPPR.
# Needs to be customized to run properly.
annotation_files/
zv9.fa # Zv9 (danRer7) reference genome in fasta format.
zv9.fa.fai # fasta index file for zv9.fa
zv9.genes # Refseq annotation of Zv9 genome in UCSC refflat format.
zv9/ # Folder containing Zv9 split into 1 file per chromosome.
bamfiles/
WTmerged6_STP1N60A.bam # Downsampled (6%) .bam file, wild-type.
MUTmerged6_STP1N60A.bam # Downsampled (6%) .bam file, mutant.
运行结果会在snpmapresults文件夹中显示,包括snp信息等
6. 对于软件的其他修改
若得到的区间不符合预期,或者未能给出预测的区间,则需要更改MAPPR_083.py文件中的power值
more MAPPR_083.py
显示结果如下:
# MMAPPR version 0.83. Copyright 2012-2013, University of Utah.
#===============================================================================
# MMAPPR: Mutation Mapping Analysis Pipeline for Pooled RNA-seq
# Jonathon Hill, Bradley Demarest, Brent Bisgrove, Bushra Gorsi, H. Joseph Yost
# Department of Neurobiology and Anatomy, University of Utah
#===============================================================================
import argparse
import os
import os.path
import re
import subprocess
import sys
import threading
import time
tlast = "0"
def log(outputstring, stamp=True):
'Allows outputs to be printed to Stdout and log file'
outputstring = outputstring.rstrip()
global tlast
t = time.strftime("[%d %b %Y %H:%M:%S] ")
if t == tlast or stamp == False:
tstamp = " " * 23
else:
tstamp = t
print(tstamp+outputstring)
logfile.write(tstamp+outputstring+'\n')
tlast = t
sys.stdout.flush()
def sout(soutfile):
o_file.write('chr\tpos\tref\tcov\tA\tC\tG\tT\n')
#compile regs
test = re.compile('[ACGTacgt]')
starts = re.compile('\^.')
ends = re.compile('[\$]')
indels = re.compile('[\+-](\d+)')
for line in soutfile:
line = line.decode(encoding="utf-8")
if len(line) != 0:
line = line.strip()
linearray = line.split() #0=chr 1=pos 2=ref base 3=coverage 4=read bases 5=base quality
if len(linearray) != 6:
log("Skipped invalid line: "+line)
continue
if (args.mask == True and linearray[2].islower()) or line[0] == '#' or linearray[1] == '*' or not test.search(linearray[4]):
continue
linearray[4] = starts.sub('',linearray[4]) #remove start of read chars
linearray[4] = ends.sub('',linearray[4]) #remove end of read character and gap character
for match in indels.findall(linearray[4]): #remove indel information (in future will modify to map indels as well)
pattern = '[\+-]'+match+'.{'+match+'}'
linearray[4] = re.sub(pattern, '', linearray[4])
i = 0
reads = ''
if len(linearray[4]) != len(linearray[5]):
log("Skipped invalid line: "+line)
continue
for qual in linearray[5]: #remove low quality bases
if ord(qual) - 33 >= args.base_qual and linearray[4][i] != "<" and linearray[4][i] != ">":
reads = reads+linearray[4][i]
i += 1
if reads.count('.')+reads.count(',') != len(reads):
trantab = str.maketrans('.,acgt', linearray[2].upper()+linearray[2].upper()+'ACGT') #translate everything to forward sequence bases
reads=reads.translate(trantab)
basecount = {}
basecount['A'] = reads.count('A') #calculate base occurences
basecount['C'] = reads.count('C')
basecount['G'] = reads.count('G')
basecount['T'] = reads.count('T')
cov = str(basecount['A']+basecount['C']+basecount['G']+basecount['T'])
o_file.write('\t'.join([linearray[0],linearray[1],linearray[2],cov,str(basecount['A']),str(basecount['C']),str(basecount['G']),str(basecount['T'])])+'\n')
print(" "*100+'\t'.join(['\r',linearray[0],linearray[1],linearray[2],cov,str(basecount['A']),str(basecount['C']),str(basecount['G']),str(basecount['T'])]), end="\r")
print(" "*100, end='\r')
def serr(serrfile, source=""):
for err in serrfile:
log(source+err.decode(encoding="utf-8"))
def mpileup(inf):
command = ' '.join(['samtools mpileup', '-q', str(args.map_qual), '-f', os.path.abspath(fasta), inf])
log("Mpileup command: "+command)
mp = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=-1)
t1=threading.Thread(target=sout, args=(mp.stdout,))
t2=threading.Thread(target=serr, args=(mp.stderr,))
t1.start()
t2.start()
mp.wait()
t1.join()
t2.join()
exitCode = mp.returncode
if (exitCode == 0):
return "Conversion complete"
else:
raise Exception(inf, exitCode)
def rscript():
scriptpath = os.path.dirname(os.path.realpath(__file__))+"/MMAPPR_083.R"
command = ' '.join(["Rscript --vanilla", scriptpath, outfilec, outfilem, str(args.power), "aicc", str(args.min_depth), str(args.cores), str(args.chrprefix)])
log("Rscript command: "+command)
rp = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
serr(rp.stdout, "R: ")
rp.wait()
exitCode = rp.returncode
if (exitCode == 0):
return "Rscript complete"
else:
raise Exception(exitCode)
def alleler():
scriptpath = os.path.abspath(os.path.join(args.u, "Apps/Alleler"))
command = ' '.join(['java -Xmx5G -jar', scriptpath,
'-a snpmapresults/putativesnps.txt -g', os.path.abspath(fastafolder), '-u', os.path.abspath(annotation), '-c -d'])
log("Alleler command: "+command)
al = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
serr(al.stdout, "Alleler: ")
al.wait()
exitCode = al.returncode
if (exitCode == 0):
return "Alleler complete"
else:
raise Exception(exitCode)
####################
dirmade = "False"
if not os.path.exists("snpmapresults/"):
os.makedirs("snpmapresults/")
dirmade = "True"
parser = argparse.ArgumentParser(description='MMAPPR version 0.82\nFor complete documentation, see website at')
parser.add_argument('-c', required=True, help='bam or sam file for control pool (required)')
parser.add_argument('-m', required=True, help='bam or sam file for mutant pool (required)')
parser.add_argument('-g', required=True, help='genome prefix (required). Corresponding files for gene and genomic annotations should be in the "annotation_files" folder (required)')
parser.add_argument('-u', required=True, help='full path to USeq directory (required)')
parser.add_argument('--log', default="snpmap.log", help='log file name (default="MMAPPR.log"')
parser.add_argument('--map_qual', default=30, type=int, help='minimum map quality score (default = 30)')
parser.add_argument('--base_qual', default=20, type=int, help='minimum base quality score (default = 20)')
parser.add_argument('--min_depth', default=10, type=int, help='minimum read depth at snp (default = 10)')
parser.add_argument('--mask', action='store_true', help='ignores lines with lowercase letters in FASTA reference file. UCSC uses these to mark repetitive regions.')
parser.add_argument('--power', default=4, type=int, help='power to raise euclidean distances to. (default = 4)')
parser.add_argument('--cores', default=1, type=int, help='number of cores to use (default = 1)')
parser.add_argument('--chrprefix', default='chr', help='chromosome prefix (default = "chr".')
args = parser.parse_args()
params = re.match('Namespace\((.*)\)', str(args))
logfile = open("snpmapresults/"+args.log, 'a')
log("Starting MMAPPR version 0.83")
log("Args: "+params.group(1))
annotation = os.path.dirname(os.path.realpath(__file__))+"/annotation_files/"+args.g+".genes"
fasta = os.path.dirname(os.path.realpath(__file__))+"/annotation_files/"+args.g+".fa"
fastafolder = os.path.dirname(os.path.realpath(__file__))+"/annotation_files/"+args.g
#-------------------------------------------------------------------------------
# Check dependencies.
log('Python version found: ' + sys.version.split('\n')[0])
# Check that R and samtools are on the PATH and display version.
samversion = re.split("\n", subprocess.getoutput("samtools")) #cannot get status to check if exists because samtools always returns error if not running command
rversion = re.split("\n", subprocess.getoutput("Rscript --version"))
samstat = False
rstat = False
for line in samversion:
if 'Version' in line:
log('Samtools found: ' + line)
samstat = True
for line in rversion:
if 'version' in line:
log('Rscript found: ' + line)
rstat = True
if not samstat:
log('Error: Samtools not found.')
if not rstat:
log('Error: Rscript not found.')
# Check that R package 'parallel' is present.
parallel_check = subprocess.getoutput('''echo "res = require(parallel,
quietly=TRUE); if (res) cat('parallel TRUE\n')" | Rscript -''')
parallelstat = (parallel_check == 'parallel TRUE')
if not parallelstat:
log('''Error: R cannot load the 'parallel' package. Install R version 2.14 or higher.''')
# Check that USeq Alleler jar is callable.
useq_path = os.path.abspath(os.path.join(args.u, 'Apps/Alleler'))
useq_check = subprocess.getstatusoutput('java -Xmx4g -jar ' + useq_path)
useqstat = not useq_check[0]
if useqstat:
log('USeq found: ' + useq_path)
else:
log('Error: USeq not found.' + useq_path)
# Check that user specified files are readable.
ufiles = [args.c, args.m, annotation, fasta, fastafolder]
file_access = [os.access(os.path.abspath(ufile), os.R_OK) for ufile in ufiles]
for ufile, filestat in zip(ufiles, file_access):
check_file = log(os.path.abspath(ufile) + (' -- Found!\n' if filestat else
' -- Error: File not found.\n'))
# Exit MMAPPR if any files or dependencies are not found.
check_all = [samstat, rstat, parallelstat] + file_access
if not all(check_all):
log('MMAPPR is quitting...')
sys.exit()
#-------------------------------------------------------------------------------
if dirmade == "False":
log("Snpmap results directory found. Checking if snp files present.")
basenamec = re.match('(.*)/(.*?)\.(.*?)', args.c)
outfilec = "snpmapresults/"+basenamec.group(2)+'mapq'+str(args.map_qual)+'baq'+str(args.base_qual)+'.snps'
basenamem = re.match('(.*)/(.*?)\.(.*?)', args.m)
outfilem = "snpmapresults/"+basenamem.group(2)+'mapq'+str(args.map_qual)+'baq'+str(args.base_qual)+'.snps'
if os.path.isfile(outfilec):
log("Snp file for control found. Skipping conversion step.")
else:
o_file = open(outfilec, 'w')
log("Converting "+args.c+" to snp file")
log(mpileup(os.path.abspath(args.c)))
o_file.close()
if os.path.isfile(outfilem):
log("Snp file for mutant found. Skipping conversion step.")
else:
o_file = open(outfilem, 'w')
log("Converting "+args.m+" to snp file")
log(mpileup(os.path.abspath(args.m)))
o_file.close()
log("Running Rscript to find candidates")
log(rscript())
log("Running Alleler to find functional effect of candidates")
log(alleler())
第157行显示为
parser.add_argument('--power', default=4, type=int, help='power to raise euclidean distances to. (default = 4)')
我们在run_mmappr_test. sh 文件,最后添加一行 -- power = 10 或者其他合适的数值即可
同时第147-158 为可以修改的参数,可以根据自己的需求进行修改
该数值可以尝试多次修改,每次修改后重新运行程序即可,程序会自动跳过Call snp的步骤,直接根据SNP信息预测区间,所用时间会比第一次操作时较短
部分内容来源网络,如侵权,联系删除。
个人学习笔记,如有错误,不吝赐教