1. 安装软件

1.1. 下载安装MMAPPR

wget http://yostlab.genetics.utah.edu/wp-content/uploads/2019/02/MMAPPR_083.tar.gz
tar -zxvf MMAPPR_083.tar.gz
cd MMAPPR_083

1.2. 下载安装USeq软件

wget https://github.com/HuntsmanCancerInstitute/USeq/releases/download/USeq_9.3.1/USeq_9.3.1.zip
unzip USeq_9.3.1.zip

2. 构建参考基因组索引

该索引用于后面Hisat2比对

hisat2-build ./B73_v5.fa B73_v5.fa

3. 利用Hisat2进行序列比对

mkdir 01Cleandata 02Alignment
for i in mut WT
    do 
        #去接头
        java -jar trimmomatic-0.39.jar PE -phred33 -threads 8 ./${i}_1.fq.gz ./${i}_2.fq.gz  ./01Cleandata/${i}_1_cleandata.fq.gz ./01Cleandata/${i}_good_unpaired.fq.gz ./01Cleandata/${i}_2_cleandata.fq.gz ./01Cleandata/${i}_2_unpaired.fq.gz LEADING:20 TRAILING:20 SLIDINGWINDOW:4:15 MINLEN:20
        #比对到基因组
        hisat2 --dta -x B73_v5 -p 8 -1 ./01Cleandata/${i}_cleandata.fq.gz -2 ./01Cleandata/${i}_2_cleandata.fq.gz -S ./02Alignment/${i}.sam
        将sam文件转化为bam文件      
        samtools view  -S ./02Alignment/${i}.sam -b > ./02Alignment/${i}.bam
        #为了节约空间，可以将不用的sam文件删除
        sudo rm -r ./02Alignment/${i}.sam
      samtools sort ./02Alignment/${i}.bam -o ./02Alignment/${i}_sorted.bam
    done

4. 准备MMAPPR的注释文件

4.1 将基因组分成不同的染色体

pip install pyfaidx
faidx -x B73_v5.fa
mkdir ${path}/MMAPPR_083/annotation_files/B73_v5
cp chr*.fa ${path}/MMAPPR_083/annotation_files/B73_v5

4.2 给基因组建立索引

该索引是用于MMAPPR软件调用
samtools faidx 能够对fasta 序列建立一个后缀为.fai 的文件，根据这个.fai 文件和原始的fastsa文件，能够快速的提取任意区域的序列
其他提取序列的方法：
samtools faidx input.fa chr1 > chr1.fa
samtools faidx input.fa chr1:100-200 > chr1.fa

samtools faidx B73_v5.fa 
cp ./B73_v5.fa ${path}/MMAPPR_083/annotation_files/
cp ./B73_v5.fa.fai ${path}/MMAPPR_083/annotation_files/

4.3 将gtf文件转化为mammap 可以识别的注释文件

I. 下载gtfToGenePred

wget -c http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/gtfToGenePred
chmod 777 gtfToGenePred

II. 利用gtfToGenePred 将gtf文件转化为UCSC refflat 格式

echo "#geneName\tname\tchrom\tstrand\ttxStart\ttxEnd\tcdsStart\tcdsEnd\texonCount\texonStarts\texonEnds" > B73_v5.genes
./gtfToGenePred -genePredExt -geneNameAsName2 B73_v5.gtf -ignoreGroupsWithoutExons /dev/stdout| awk '{print $1, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10}' >>  B73_v5.genes
cp ./B73_v5.genes ${path}/MMAPPR_083/annotation_files/

refFlat 格式如下

table refFlat
"A gene prediction with additional geneName field."
(
string geneName; "Name of gene as it appears in Genome Browser."
string name; "Name of gene"
string chrom; "Chromosome name"
char[1] strand; "+ or - for strand"
uint txStart; "Transcription start position"
uint txEnd; "Transcription end position"
uint cdsStart; "Coding region start"
uint cdsEnd; "Coding region end"
uint exonCount; "Number of exons"
uint[exonCount] exonStarts; "Exon start positions"
uint[exonCount] exonEnds; "Exon end positions"
)

5. 运行MMAPPR软件

cd ${path}/MMAPPR_083/
vim run_mmappr_test.sh
#修改文件内容如下
#! /bin/bash
python3 ./MMAPPR_083.py               \
-c ./WT_sorted.bam  \ # 经过sort之后的野生型BAM文件
-m ./mut_sorted.bam \ # 经过sort之后的突变体BAM文件
-g B73_v5                                \ # 参考基因组文件
-u /The_path_to_where_you_installed/USeq_9.3.1/             \  #Useq 软件的安装路径
--cores 8    # 软件运行所用到的核心数

#保存修改
nohup sh run_mmappr_test.sh & 
#运行程序，等待分析结果

MMAPPR 文件夹的数据结构，请勿移动，可将文件替换成自己的文件即可
mmappr/
MMAPPR_083.py # Main python script.
MMAPPR_083.R # R script, invoked by the python script.
run_mmappr_test.sh # Sample bash script to run MMAPPR.
# Needs to be customized to run properly.
annotation_files/
zv9.fa # Zv9 (danRer7) reference genome in fasta format.
zv9.fa.fai # fasta index file for zv9.fa
zv9.genes # Refseq annotation of Zv9 genome in UCSC refflat format.
zv9/ # Folder containing Zv9 split into 1 file per chromosome.
bamfiles/
WTmerged6_STP1N60A.bam # Downsampled (6%) .bam file, wild-type.
MUTmerged6_STP1N60A.bam # Downsampled (6%) .bam file, mutant.
运行结果会在snpmapresults文件夹中显示，包括snp信息等

6. 对于软件的其他修改

若得到的区间不符合预期，或者未能给出预测的区间，则需要更改MAPPR_083.py文件中的power值

more  MAPPR_083.py

显示结果如下：

# MMAPPR version 0.83. Copyright 2012-2013, University of Utah.
#===============================================================================
# MMAPPR: Mutation Mapping Analysis Pipeline for Pooled RNA-seq
# Jonathon Hill, Bradley Demarest, Brent Bisgrove, Bushra Gorsi, H. Joseph Yost
# Department of Neurobiology and Anatomy, University of Utah
#===============================================================================

import argparse
import os
import os.path
import re
import subprocess
import sys
import threading
import time

tlast = "0"

def log(outputstring, stamp=True):
    'Allows outputs to be printed to Stdout and log file'
    outputstring = outputstring.rstrip()
    global tlast
    t = time.strftime("[%d %b %Y %H:%M:%S] ")
    if t == tlast or stamp == False:
        tstamp = " " * 23
    else:
        tstamp = t
    print(tstamp+outputstring)
    logfile.write(tstamp+outputstring+'\n')
    tlast = t
    sys.stdout.flush()

def sout(soutfile):
    o_file.write('chr\tpos\tref\tcov\tA\tC\tG\tT\n')

    #compile regs
    test = re.compile('[ACGTacgt]')
    starts = re.compile('\^.')
    ends = re.compile('[\$]')
    indels = re.compile('[\+-](\d+)')
    for line in soutfile:
        line = line.decode(encoding="utf-8")
        if len(line) != 0:
            line = line.strip()
            linearray = line.split() #0=chr 1=pos 2=ref base 3=coverage 4=read bases 5=base quality
            if len(linearray) != 6:
                log("Skipped invalid line: "+line)
                continue
            if (args.mask == True and linearray[2].islower()) or line[0] == '#' or linearray[1] == '*' or not test.search(linearray[4]):
                continue
            linearray[4] = starts.sub('',linearray[4])       #remove start of read chars
            linearray[4] = ends.sub('',linearray[4])        #remove end of read character and gap character
            for match in indels.findall(linearray[4]): #remove indel information (in future will modify to map indels as well)
                pattern = '[\+-]'+match+'.{'+match+'}'
                linearray[4] = re.sub(pattern, '', linearray[4])
            i = 0
            reads = ''
            if len(linearray[4]) != len(linearray[5]):
                log("Skipped invalid line: "+line)
                continue
            for qual in linearray[5]:                               #remove low quality bases
                if ord(qual) - 33 >= args.base_qual and linearray[4][i] != "<" and linearray[4][i] != ">":
                    reads = reads+linearray[4][i]
                i += 1
            if reads.count('.')+reads.count(',') != len(reads):
                trantab = str.maketrans('.,acgt', linearray[2].upper()+linearray[2].upper()+'ACGT')    #translate everything to forward sequence bases
                reads=reads.translate(trantab)
                basecount = {}
                basecount['A'] = reads.count('A')                           #calculate base occurences
                basecount['C'] = reads.count('C')
                basecount['G'] = reads.count('G')
                basecount['T'] = reads.count('T')
                cov = str(basecount['A']+basecount['C']+basecount['G']+basecount['T'])

                o_file.write('\t'.join([linearray[0],linearray[1],linearray[2],cov,str(basecount['A']),str(basecount['C']),str(basecount['G']),str(basecount['T'])])+'\n')
                print(" "*100+'\t'.join(['\r',linearray[0],linearray[1],linearray[2],cov,str(basecount['A']),str(basecount['C']),str(basecount['G']),str(basecount['T'])]), end="\r")
    print(" "*100, end='\r')


def serr(serrfile, source=""):
    for err in serrfile:
        log(source+err.decode(encoding="utf-8"))


def mpileup(inf):

    command = ' '.join(['samtools mpileup', '-q', str(args.map_qual), '-f', os.path.abspath(fasta), inf])
    log("Mpileup command: "+command)
    mp = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=-1)

    t1=threading.Thread(target=sout, args=(mp.stdout,))
    t2=threading.Thread(target=serr, args=(mp.stderr,))
    t1.start()
    t2.start()

    mp.wait()
    t1.join()
    t2.join()

    exitCode = mp.returncode
    if (exitCode == 0):
        return "Conversion complete"
    else:
        raise Exception(inf, exitCode)


def rscript():
    scriptpath = os.path.dirname(os.path.realpath(__file__))+"/MMAPPR_083.R"
    command = ' '.join(["Rscript --vanilla", scriptpath, outfilec, outfilem, str(args.power), "aicc", str(args.min_depth), str(args.cores), str(args.chrprefix)])
    log("Rscript command: "+command)
    rp = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)

    serr(rp.stdout, "R: ")
    rp.wait()

    exitCode = rp.returncode
    if (exitCode == 0):
        return "Rscript complete"
    else:
        raise Exception(exitCode)

def alleler():
    scriptpath = os.path.abspath(os.path.join(args.u, "Apps/Alleler"))
    command = ' '.join(['java -Xmx5G -jar', scriptpath,
        '-a snpmapresults/putativesnps.txt -g', os.path.abspath(fastafolder), '-u', os.path.abspath(annotation), '-c -d'])
    log("Alleler command: "+command)
    al = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)

    serr(al.stdout, "Alleler: ")
    al.wait()

    exitCode = al.returncode
    if (exitCode == 0):
        return "Alleler complete"
    else:
        raise Exception(exitCode)


####################

dirmade = "False"

if not os.path.exists("snpmapresults/"):
    os.makedirs("snpmapresults/")
    dirmade = "True"

parser = argparse.ArgumentParser(description='MMAPPR version 0.82\nFor complete documentation, see website at')
parser.add_argument('-c', required=True, help='bam or sam file for control pool (required)')
parser.add_argument('-m', required=True, help='bam or sam file for mutant pool (required)')
parser.add_argument('-g', required=True, help='genome prefix (required). Corresponding files for gene and genomic annotations should be in the "annotation_files" folder (required)')
parser.add_argument('-u', required=True, help='full path to USeq directory (required)')
parser.add_argument('--log', default="snpmap.log", help='log file name (default="MMAPPR.log"')
parser.add_argument('--map_qual', default=30, type=int, help='minimum map quality score (default = 30)')
parser.add_argument('--base_qual', default=20, type=int, help='minimum base quality score (default = 20)')
parser.add_argument('--min_depth', default=10, type=int, help='minimum read depth at snp (default = 10)')
parser.add_argument('--mask', action='store_true', help='ignores lines with lowercase letters in FASTA reference file. UCSC uses these to mark repetitive regions.')
parser.add_argument('--power', default=4, type=int, help='power to raise euclidean distances to. (default = 4)')
parser.add_argument('--cores', default=1, type=int, help='number of cores to use (default = 1)')
parser.add_argument('--chrprefix', default='chr', help='chromosome prefix (default = "chr".')



args = parser.parse_args()
params = re.match('Namespace\((.*)\)', str(args))
logfile = open("snpmapresults/"+args.log, 'a')
log("Starting MMAPPR version 0.83")
log("Args: "+params.group(1))
annotation = os.path.dirname(os.path.realpath(__file__))+"/annotation_files/"+args.g+".genes"
fasta = os.path.dirname(os.path.realpath(__file__))+"/annotation_files/"+args.g+".fa"
fastafolder = os.path.dirname(os.path.realpath(__file__))+"/annotation_files/"+args.g

#-------------------------------------------------------------------------------
# Check dependencies.
log('Python version found: ' + sys.version.split('\n')[0])

# Check that R and samtools are on the PATH and display version.
samversion = re.split("\n", subprocess.getoutput("samtools")) #cannot get status to check if exists because samtools always returns error if not running command
rversion = re.split("\n", subprocess.getoutput("Rscript --version"))

samstat = False
rstat = False

for line in samversion:
    if 'Version' in line:
        log('Samtools found: ' + line)
        samstat = True

for line in rversion:
    if 'version' in line:
        log('Rscript found: ' + line)
        rstat = True

if not samstat:
    log('Error: Samtools not found.')

if not rstat:
    log('Error: Rscript not found.')

# Check that R package 'parallel' is present.
parallel_check = subprocess.getoutput('''echo "res = require(parallel,
    quietly=TRUE); if (res) cat('parallel TRUE\n')" | Rscript -''')
parallelstat = (parallel_check == 'parallel TRUE')
if not parallelstat:
    log('''Error: R cannot load the 'parallel' package. Install R version 2.14 or higher.''')

# Check that USeq Alleler jar is callable.
useq_path = os.path.abspath(os.path.join(args.u, 'Apps/Alleler'))
useq_check = subprocess.getstatusoutput('java -Xmx4g -jar ' + useq_path)
useqstat = not useq_check[0]
if useqstat:
    log('USeq found: ' + useq_path)
else:
    log('Error: USeq not found.' + useq_path)

# Check that user specified files are readable.
ufiles = [args.c, args.m, annotation, fasta, fastafolder]
file_access = [os.access(os.path.abspath(ufile), os.R_OK) for ufile in ufiles]

for ufile, filestat in zip(ufiles, file_access):
    check_file = log(os.path.abspath(ufile) + (' -- Found!\n' if filestat else
         ' -- Error: File not found.\n'))

# Exit MMAPPR if any files or dependencies are not found.
check_all = [samstat, rstat, parallelstat] + file_access

if not all(check_all):
    log('MMAPPR is quitting...')
    sys.exit()

#-------------------------------------------------------------------------------

if dirmade == "False":
    log("Snpmap results directory found. Checking if snp files present.")

basenamec = re.match('(.*)/(.*?)\.(.*?)', args.c)
outfilec = "snpmapresults/"+basenamec.group(2)+'mapq'+str(args.map_qual)+'baq'+str(args.base_qual)+'.snps'
basenamem = re.match('(.*)/(.*?)\.(.*?)', args.m)
outfilem = "snpmapresults/"+basenamem.group(2)+'mapq'+str(args.map_qual)+'baq'+str(args.base_qual)+'.snps'

if os.path.isfile(outfilec):
    log("Snp file for control found. Skipping conversion step.")
else:
    o_file = open(outfilec, 'w')
    log("Converting "+args.c+" to snp file")
    log(mpileup(os.path.abspath(args.c)))
    o_file.close()

if os.path.isfile(outfilem):
    log("Snp file for mutant found. Skipping conversion step.")
else:
    o_file = open(outfilem, 'w')
    log("Converting "+args.m+" to snp file")
    log(mpileup(os.path.abspath(args.m)))

    o_file.close()

log("Running Rscript to find candidates")
log(rscript())

log("Running Alleler to find functional effect of candidates")
log(alleler())

第157行显示为

parser.add_argument('--power', default=4, type=int, help='power to raise euclidean distances to. (default = 4)')

我们在run_mmappr_test. sh 文件，最后添加一行 -- power = 10 或者其他合适的数值即可

同时第147-158 为可以修改的参数，可以根据自己的需求进行修改

该数值可以尝试多次修改，每次修改后重新运行程序即可，程序会自动跳过Call snp的步骤，直接根据SNP信息预测区间，所用时间会比第一次操作时较短

部分内容来源网络，如侵权，联系删除。
个人学习笔记，如有错误，不吝赐教

利用MMAPPR进行BSR分析流程