参考之前的文章 hic分析之 hic-pro 介绍与安装
分步工作原理:
先看test文件
/PATH/TO/HiC-Pro_2.11.1/test-op/test_data
.
|-- dixon_2M
| |-- SRR400264_00_R1.fastq.gz
| `-- SRR400264_00_R2.fastq.gz
`-- dixon_2M_2
|-- SRR400264_01_R1.fastq.gz
`-- SRR400264_01_R2.fastq.gz
再看看opt文件
tree
.
|-- bowtie_results
| |-- bwt2
| | |-- dixon_2M
| | | |-- SRR400264_00_R1_hg19.bwt2merged.bam
| | | |-- SRR400264_00_R1_hg19.mapstat
| | | |-- SRR400264_00_R2_hg19.bwt2merged.bam
| | | |-- SRR400264_00_R2_hg19.mapstat
| | | |-- SRR400264_00_hg19.bwt2pairs.bam
| | | `-- SRR400264_00_hg19.bwt2pairs.pairstat
| | `-- dixon_2M_2
| | |-- SRR400264_01_R1_hg19.bwt2merged.bam
| | |-- SRR400264_01_R1_hg19.mapstat
| | |-- SRR400264_01_R2_hg19.bwt2merged.bam
| | |-- SRR400264_01_R2_hg19.mapstat
| | |-- SRR400264_01_hg19.bwt2pairs.bam
| | |-- SRR400264_01_hg19.bwt2pairs.pairstat
| |-- bwt2_global
| | |-- dixon_2M
| | | |-- SRR400264_00_R1_hg19.bwt2glob.bam
| | | |-- SRR400264_00_R1_hg19.bwt2glob.unmap.fastq
| | | |-- SRR400264_00_R2_hg19.bwt2glob.bam
| | | |-- SRR400264_00_R2_hg19.bwt2glob.unmap.fastq
| | |-- dixon_2M_2
| | |-- SRR400264_01_R1_hg19.bwt2glob.bam
| | |-- SRR400264_01_R1_hg19.bwt2glob.unmap.fastq
| | |-- SRR400264_01_R2_hg19.bwt2glob.bam
| | |-- SRR400264_01_R2_hg19.bwt2glob.unmap.fastq
| |-- bwt2_local
| |-- dixon_2M
| | |-- SRR400264_00_R1_hg19.bwt2glob.unmap_bwt2loc.bam
| | |-- SRR400264_00_R1_hg19.bwt2glob.unmap_trimmed.fastq
| | |-- SRR400264_00_R2_hg19.bwt2glob.unmap_bwt2loc.bam
| | |-- SRR400264_00_R2_hg19.bwt2glob.unmap_trimmed.fastq
| |-- dixon_2M_2
| |-- SRR400264_01_R1_hg19.bwt2glob.unmap_bwt2loc.bam
| |-- SRR400264_01_R1_hg19.bwt2glob.unmap_trimmed.fastq
| |-- SRR400264_01_R2_hg19.bwt2glob.unmap_bwt2loc.bam
| |-- SRR400264_01_R2_hg19.bwt2glob.unmap_trimmed.fastq
|-- config_test_latest.txt
|-- hic_results
| |-- data
| | |-- dixon_2M
| | | |-- SRR400264_00_hg19.bwt2pairs.DEPairs
| | | |-- SRR400264_00_hg19.bwt2pairs.DumpPairs
| | | |-- SRR400264_00_hg19.bwt2pairs.FiltPairs
| | | |-- SRR400264_00_hg19.bwt2pairs.REPairs
| | | |-- SRR400264_00_hg19.bwt2pairs.RSstat
| | | |-- SRR400264_00_hg19.bwt2pairs.SCPairs
| | | |-- SRR400264_00_hg19.bwt2pairs.SinglePairs
| | | |-- SRR400264_00_hg19.bwt2pairs.validPairs
| | | |-- SRR400264_00_hg19.bwt2pairs_interaction.bam
| | | `-- dixon_2M.allValidPairs
| | `-- dixon_2M_2
| | |-- SRR400264_01_hg19.bwt2pairs.DEPairs
| | |-- SRR400264_01_hg19.bwt2pairs.DumpPairs
| | |-- SRR400264_01_hg19.bwt2pairs.FiltPairs
| | |-- SRR400264_01_hg19.bwt2pairs.REPairs
| | |-- SRR400264_01_hg19.bwt2pairs.RSstat
| | |-- SRR400264_01_hg19.bwt2pairs.SCPairs
| | |-- SRR400264_01_hg19.bwt2pairs.SinglePairs
| | |-- SRR400264_01_hg19.bwt2pairs.validPairs
| | |-- SRR400264_01_hg19.bwt2pairs_interaction.bam
| | \-- dixon_2M_2.allValidPairs
| |-- matrix
| | |-- dixon_2M
| | | |-- iced
| | | | |-- 500000
| | | |-- raw
| | | |-- 1000000
| | | | |-- dixon_2M_1000000.matrix
| | | | |-- dixon_2M_1000000_abs.bed
| | | |-- 500000
| | | |-- dixon_2M_500000.matrix
| | | |-- dixon_2M_500000_abs.bed
| | |-- dixon_2M_2
| | |-- raw
| | |-- 1000000
| | | |-- dixon_2M_2_1000000.matrix
| | | |-- dixon_2M_2_1000000_abs.bed
| | |-- 500000
| | |-- dixon_2M_2_500000.matrix
| | |-- dixon_2M_2_500000_abs.bed
| |-- pic
| | |-- dixon_2M
| | | |-- plotHiCContactRanges_dixon_2M.pdf
| | | |-- plotHiCFragmentSize_dixon_2M.pdf
| | | |-- plotHiCFragment_dixon_2M.pdf
| | | |-- plotMappingPairing_dixon_2M.pdf
| | | |-- plotMapping_dixon_2M.pdf
| | |-- dixon_2M_2
| | |-- plotHiCContactRanges_dixon_2M_2.pdf
| | |-- plotHiCFragmentSize_dixon_2M_2.pdf
| | |-- plotHiCFragment_dixon_2M_2.pdf
| | |-- plotMappingPairing_dixon_2M_2.pdf
| | |-- plotMapping_dixon_2M_2.pdf
| |-- stats
| |-- dixon_2M
| | |-- dixon_2M.mRSstat
| | |-- dixon_2M.mpairstat
| | |-- dixon_2M_R1.mmapstat
| | |-- dixon_2M_R2.mmapstat
| | |-- dixon_2M_allValidPairs.mergestat
| |-- dixon_2M_2
| |-- dixon_2M_2.mRSstat
| |-- dixon_2M_2.mpairstat
| |-- dixon_2M_2_R1.mmapstat
| |-- dixon_2M_2_R2.mmapstat
| |-- dixon_2M_2_allValidPairs.mergestat
|-- logs
| |-- dixon_2M
| | |-- SRR400264_00_R1_bowtie2.log
| | |-- SRR400264_00_R1_hg19.bwt2glob.unmap_bowtie2.log
| | |-- SRR400264_00_R1_hg19.bwt2glob.unmap_readsTrimming.log
| | |-- SRR400264_00_R2_bowtie2.log
| | |-- SRR400264_00_R2_hg19.bwt2glob.unmap_bowtie2.log
| | |-- SRR400264_00_R2_hg19.bwt2glob.unmap_readsTrimming.log
| | |-- build_raw_maps.log
| | |-- ice_500000.log
| | |-- make_Rplots.log
| | |-- mapped_2hic_fragments.log
| | |-- mapping_combine.log
| | |-- mapping_stats.log
| | |-- mapping_step1.log
| | |-- mapping_step2.log
| | |-- mergeSAM.log
| | |-- merge_stats.log
| | |-- merge_valid_interactions.log
| | |-- plot_hic_contacts.Rout
| | |-- plot_hic_fragment.Rout
| | |-- plot_mapping_portion.Rout
| | |-- plot_pairing_portion.Rout
| |-- dixon_2M_2
| |-- SRR400264_01_R1_bowtie2.log
| |-- SRR400264_01_R1_hg19.bwt2glob.unmap_bowtie2.log
| |-- SRR400264_01_R1_hg19.bwt2glob.unmap_readsTrimming.log
| |-- SRR400264_01_R2_bowtie2.log
| |-- SRR400264_01_R2_hg19.bwt2glob.unmap_bowtie2.log
| |-- SRR400264_01_R2_hg19.bwt2glob.unmap_readsTrimming.log
| |-- build_raw_maps.log
| |-- make_Rplots.log
| |-- mapped_2hic_fragments.log
| |-- mapping_combine.log
| |-- mapping_stats.log
| |-- mapping_step1.log
| |-- mapping_step2.log
| |-- mergeSAM.log
| |-- merge_stats.log
| |-- merge_valid_interactions.log
| |-- plot_hic_contacts.Rout
| |-- plot_hic_fragment.Rout
| |-- plot_mapping_portion.Rout
| |-- plot_pairing_portion.Rout
|-- rawdata -> /PATH/TO/HiC-Pro_2.11.1/test-op/test_data
|-- tmp
工作原理介绍:
1.Reads Mapping
将fastq文件用bowtie2 mapping一次,看log:
cat mapping_step1.log
###mapping fastq to reference genome , 将unmapping的read 另存为 unmap.fastq
/PATH/TO/bowtie2/bowtie2-2.3.3.1/bowtie2 --very-sensitive -L 30 --score-min L,-0.6,-0.2 --end-to-end --reorder --un bowtie_results/bwt2_global/dixon_2M/SRR400264_00_R2_hg19.bwt2glob.unmap.fastq --rg-id BMG --rg SM:SRR400264_00_R2 -p 20 -x /PATH/TO/HiC-Pro_2.11.1/Bowtie2Index/hg19 -U rawdata/dixon_2M/SRR400264_00_R2.fastq.gz 2>> logs/dixon_2M/SRR400264_00_R2_bowtie2.log| /PATH/TO/samtools/samtools-1.6/bin/samtools view -F 4 -bS - > bowtie_results/bwt2_global/dixon_2M/SRR400264_00_R2_hg19.bwt2glob.bam
/PATH/TO/bowtie2/bowtie2-2.3.3.1/bowtie2 --very-sensitive -L 30 --score-min L,-0.6,-0.2 --end-to-end --reorder --un bowtie_results/bwt2_global/dixon_2M/SRR400264_00_R1_hg19.bwt2glob.unmap.fastq --rg-id BMG --rg SM:SRR400264_00_R1 -p 20 -x /PATH/TO/HiC-Pro_2.11.1/Bowtie2Index/hg19 -U rawdata/dixon_2M/SRR400264_00_R1.fastq.gz 2>> logs/dixon_2M/SRR400264_00_R1_bowtie2.log| /PATH/TO/samtools/samtools-1.6/bin/samtools view -F 4 -bS - > bowtie_results/bwt2_global/dixon_2M/SRR400264_00_R1_hg19.bwt2glob.bam
得到这样的文件:
| |-- bwt2_global
| | |-- dixon_2M
| | | |-- SRR400264_00_R1_hg19.bwt2glob.bam
| | | |-- SRR400264_00_R1_hg19.bwt2glob.unmap.fastq
| | | |-- SRR400264_00_R2_hg19.bwt2glob.bam
| | | |-- SRR400264_00_R2_hg19.bwt2glob.unmap.fastq
| | `-- dixon_2M_2
| | |-- SRR400264_01_R1_hg19.bwt2glob.bam
| | |-- SRR400264_01_R1_hg19.bwt2glob.unmap.fastq
| | |-- SRR400264_01_R2_hg19.bwt2glob.bam
| | |-- SRR400264_01_R2_hg19.bwt2glob.unmap.fastq
这里有mapping的,和unmapping的。因为read是嵌合基因。mapping上的是背景基因组。unmapping的是需要下一步处理的。
2.片段分配和过滤
先看log
cat mapping_step2.log
### 酶切
/PATH/TO/HiC-Pro/HiC-Pro_2.11.1/scripts/cutsite_trimming --fastq bowtie_results/bwt2_global/dixon_2M/SRR400264_00_R1_hg19.bwt2glob.unmap.fastq --cutsite AAGCTAGCTT --out bowtie_results/bwt2_local/dixon_2M/SRR400264_00_R1_hg19.bwt2glob.unmap_trimmed.fastq > logs/dixon_2M/SRR400264_00_R1_hg19.bwt2glob.unmap_readsTrimming.log 2>&1
/PATH/TO/HiC-Pro/HiC-Pro_2.11.1/scripts/cutsite_trimming --fastq bowtie_results/bwt2_global/dixon_2M/SRR400264_00_R2_hg19.bwt2glob.unmap.fastq --cutsite AAGCTAGCTT --out bowtie_results/bwt2_local/dixon_2M/SRR400264_00_R2_hg19.bwt2glob.unmap_trimmed.fastq > logs/dixon_2M/SRR400264_00_R2_hg19.bwt2glob.unmap_readsTrimming.log 2>&1
### 再用bowtie2 mapping一次
/PATH/TO/bowtie2/bowtie2-2.3.3.1/bowtie2 --very-sensitive -L 20 --score-min L,-0.6,-0.2 --end-to-end --reorder --rg-id BML --rg SM:SRR400264_00_R1_hg19.bwt2glob.unmap -p 20 -x /PATH/TO/HiC-Pro/HiC-Pro_2.11.1/Bowtie2Index/hg19 -U bowtie_results/bwt2_local/dixon_2M/SRR400264_00_R1_hg19.bwt2glob.unmap_trimmed.fastq 2>> logs/dixon_2M/SRR400264_00_R1_hg19.bwt2glob.unmap_bowtie2.log | /PATH/TO/samtools/samtools-1.6/bin/samtools view -bS - > bowtie_results/bwt2_local/dixon_2M/SRR400264_00_R1_hg19.bwt2glob.unmap_bwt2loc.bam
/PATH/TO/bowtie2/bowtie2-2.3.3.1/bowtie2 --very-sensitive -L 20 --score-min L,-0.6,-0.2 --end-to-end --reorder --rg-id BML --rg SM:SRR400264_00_R2_hg19.bwt2glob.unmap -p 20 -x /PATH/TO/HiC-Pro/HiC-Pro_2.11.1/Bowtie2Index/hg19 -U bowtie_results/bwt2_local/dixon_2M/SRR400264_00_R2_hg19.bwt2glob.unmap_trimmed.fastq 2>> logs/dixon_2M/SRR400264_00_R2_hg19.bwt2glob.unmap_bowtie2.log | /PATH/TO/samtools/samtools-1.6/bin/samtools view -bS - > bowtie_results/bwt2_local/dixon_2M/SRR400264_00_R2_hg19.bwt2glob.unmap_bwt2loc.bam
这一步是将unmapping的结果,用酶切位点分割,然后将分割下来的结果,再mapping一次。得到以下结果
| |-- bwt2_local
| |-- dixon_2M
| | |-- SRR400264_00_R1_hg19.bwt2glob.unmap_bwt2loc.bam
| | |-- SRR400264_00_R1_hg19.bwt2glob.unmap_trimmed.fastq
| | |-- SRR400264_00_R2_hg19.bwt2glob.unmap_bwt2loc.bam
| | |-- SRR400264_00_R2_hg19.bwt2glob.unmap_trimmed.fastq
| |-- dixon_2M_2
| |-- SRR400264_01_R1_hg19.bwt2glob.unmap_bwt2loc.bam
| |-- SRR400264_01_R1_hg19.bwt2glob.unmap_trimmed.fastq
| |-- SRR400264_01_R2_hg19.bwt2glob.unmap_bwt2loc.bam
| |-- SRR400264_01_R2_hg19.bwt2glob.unmap_trimmed.fastq
3、mapping_combine.log
cat mapping_combine.log
### 将global的bam与local的bam用samtools merge
/PATH/TO/samtools/samtools-1.6/bin/samtools merge -@ 20 -n -f bowtie_results/bwt2/dixon_2M/SRR400264_00_R1_hg19.bwt2merged.bam bowtie_results/bwt2_global/dixon_2M/SRR400264_00_R1_hg19.bwt2glob.bam bowtie_results/bwt2_local/dixon_2M/SRR400264_00_R1_hg19.bwt2glob.unmap_bwt2loc.bam
/PATH/TO/samtools/samtools-1.6/bin/samtools merge -@ 20 -n -f bowtie_results/bwt2/dixon_2M/SRR400264_00_R2_hg19.bwt2merged.bam bowtie_results/bwt2_global/dixon_2M/SRR400264_00_R2_hg19.bwt2glob.bam bowtie_results/bwt2_local/dixon_2M/SRR400264_00_R2_hg19.bwt2glob.unmap_bwt2loc.bam
### sort
/PATH/TO/samtools/samtools-1.6/bin/samtools sort -@ 20 -n -T tmp/SRR400264_00_R2_hg19 -o bowtie_results/bwt2/dixon_2M/SRR400264_00_R2_hg19.bwt2merged.sorted.bam bowtie_results/bwt2/dixon_2M/SRR400264_00_R2_hg19.bwt2merged.bam
/PATH/TO/samtools/samtools-1.6/bin/samtools sort -@ 20 -n -T tmp/SRR400264_00_R1_hg19 -o bowtie_results/bwt2/dixon_2M/SRR400264_00_R1_hg19.bwt2merged.sorted.bam bowtie_results/bwt2/dixon_2M/SRR400264_00_R1_hg19.bwt2merged.bam
[bam_sort_core] merging from 0 files and 20 in-memory blocks...
[bam_sort_core] merging from 0 files and 20 in-memory blocks...
### 改名
mv bowtie_results/bwt2/dixon_2M/SRR400264_00_R1_hg19.bwt2merged.sorted.bam bowtie_results/bwt2/dixon_2M/SRR400264_00_R1_hg19.bwt2merged.bam
mv bowtie_results/bwt2/dixon_2M/SRR400264_00_R2_hg19.bwt2merged.sorted.bam bowtie_results/bwt2/dixon_2M/SRR400264_00_R2_hg19.bwt2merged.bam
4、merge
### 将R1与R2 merge为一个文件。
cat mergeSAM.log
/PATH/TO/python/python-2.7.10/bin/python /PATH/TO/HiC-Pro/HiC-Pro_2.11.1/scripts/mergeSAM.py -q 0 -t -v -f bowtie_results/bwt2/dixon_2M/SRR400264_00_R1_hg19.bwt2merged.bam -r bowtie_results/bwt2/dixon_2M/SRR400264_00_R2_hg19.bwt2merged.bam -o bowtie_results/bwt2/dixon_2M/SRR400264_00_hg19.bwt2pairs.bam
## mergeBAM.py
## forward= bowtie_results/bwt2/dixon_2M/SRR400264_00_R1_hg19.bwt2merged.bam
## reverse= bowtie_results/bwt2/dixon_2M/SRR400264_00_R2_hg19.bwt2merged.bam
## output= bowtie_results/bwt2/dixon_2M/SRR400264_00_hg19.bwt2pairs.bam
## min mapq= 0
## report_single= False
## report_multi= False
## verbose= True
## Merging forward and reverse tags ...
5、利用HiCPro的mapped_2hic_fragments.py程序将比对结果转化为Hi-C片段信息
cat mapped_2hic_fragments.log
/PATH/TO/python /PATH/TO/HiC-Pro/HiC-Pro_2.11.1/scripts/mapped_2hic_fragments.py -v -S -t 100 -m 100000 -s 100 -l 600 -a -f /PATH/TO/HiC-Pro_2.11.1/annotation/HindIII_resfrag_hg19.bed -r bowtie_results/bwt2/dixon_2M/SRR400264_00_hg19.bwt2pairs.bam -o hic_results/data/dixon_2M
## overlapMapped2HiCFragments.py
## mappedReadsFile= bowtie_results/bwt2/dixon_2M/SRR400264_00_hg19.bwt2pairs.bam
## fragmentFile= /PATH/TO/HiC-Pro/HiC-Pro_2.11.1/annotation/HindIII_resfrag_hg19.bed
## minInsertSize= 100
## maxInsertSize= 600
## minFragSize= 100
## maxFragSize= 100000
## allOuput= True
## SAM ouput= True
## verbose= True
再对输出的valid pairs文件进行排序:
LANG=en; sort -T tmp -k2,2V -k3,3n -k5,5V -k6,6n -o hic_results/data/sample/sample_sample_genome_ref.bwt2pairs.validPairs hic_results/data/sample/sample_sample_genome_ref.bwt2pairs.validPairs
6 对所有的valid pairs进行合并
LANG=en; sort -T tmp -S 50% -k2,2V -k3,3n -k5,5V -k6,6n -m hic_results/data/dixon_2M_2/SRR400264_01_hg19.bwt2pairs.validPairs | awk -F"\t" 'BEGIN{c1=0;c2=0;s1=0;s2=0}(c1!=$2 || c2!=$5 || s1!=$3 || s2!=$6){print;c1=$2;c2=$5;s1=$3;s2=$6}' > hic_results/data/dixon_2M_2/dixon_2M_2.allValidPairs
这样就成了我们最终的文件,dixon_2M_2.allValidPairs。
这个文件还不能可视化,如果要可视化,一般在jucebox里面查看。我们需要用配套的hicpro2juicebox.sh来转化:
## e.g.1
HICPRO_PATH/bin/utils/hicpro2juicebox.sh -i hicpro_res/hic_results/data/dixon_2M/dixon_2M_allValidPairs -g hg19 -j /usr/local/juicebox/juicebox_clt_1.4.jar
很多人问,这个-j的 jar文件在哪里?
事实上用这个就可以。
/PATH/TO/juicer/juicer/UGER/scripts/juicer_tools.jar
下次,我们再讨论juicer与juicebox的用法。