数据来源:
Li Z, Mao K, Liu L, Xu S, Zeng M, Fu Y, Huang J, Li T, Gao G, Teng ZQ, Sun Q, Chen D, Cheng Y. Nuclear microRNA-mediated transcriptional control determines adult microglial homeostasis and brain function. Cell Rep. 2024 Mar 26;43(3):113964. doi: 10.1016/j.celrep.2024.113964. Epub 2024 Mar 13. PMID: 38489263.
1.原始数据下载
fastq-dump --split-3 --gzip SRR23051404
fastq-dump --split-3 --gzip SRR23051405
fastq-dump --split-3 --gzip SRR23051406
fastq-dump --split-3 --gzip SRR23051407
fastq-dump --split-3 --gzip SRR23051408
fastq-dump --split-3 --gzip SRR23051409
2.过滤低质量读段并生成QC 报告
trim_galore --illumina --fastqc --paired SRR23051404_1.fastq.gz SRR23051404_2.fastq.gz
trim_galore --illumina --fastqc --paired SRR23051405_1.fastq.gz SRR23051405_2.fastq.gz
trim_galore --illumina --fastqc --paired SRR23051406_1.fastq.gz SRR23051406_2.fastq.gz
trim_galore --illumina --fastqc --paired SRR23051407_1.fastq.gz SRR23051407_2.fastq.gz
trim_galore --illumina --fastqc --paired SRR23051408_1.fastq.gz SRR23051408_2.fastq.gz
trim_galore --illumina --fastqc --paired SRR23051409_1.fastq.gz SRR23051409_2.fastq.gz
3 Mapping
Go_Chip_Seq_PE_ZWX.sh mm10_Bowtie2Index/genome 8 cKO1_R1.fq.gz cKO1_R2.fq.gz
Go_Chip_Seq_PE_ZWX.sh mm10_Bowtie2Index/genome 8 cKO2_R1.fq.gz cKO2_R2.fq.gz
Go_Chip_Seq_PE_ZWX.sh mm10_Bowtie2Index/genome 8 cKO3_R1.fq.gz cKO3_R2.fq.gz
Go_Chip_Seq_PE_ZWX.sh mm10_Bowtie2Index/genome 8 cWT1_R1.fq.gz cWT1_R2.fq.gz
Go_Chip_Seq_PE_ZWX.sh mm10_Bowtie2Index/genome 8 cWT2_R1.fq.gz cWT2_R2.fq.gz
Go_Chip_Seq_PE_ZWX.sh mm10_Bowtie2Index/genome 8 cWT3_R1.fq.gz cWT3_R2.fq.gz
其中Go_Chip_Seq_PE脚本内容为:
#!/bin/bash
# creatation: 2024-12-4 By Wenxuan Zhao
# 命令样式:Go_Chip_Seq_PE_ZWX.sh mm10_Bowtie2Index/genome 8 cWT3_R1.fq.gz cWT3_R2.fq.gz
assembly=$1
#/share/Genomes/bwaindex/mm9/mm9.fa
threads=$2
fq1=$3
fq2=$4
fq=${3}_${4}
insert=$5
if test -z $1
then
echo "please input the path of human genome(/public/share/Genomes/hg38_Bowtie2Index/genome)"
echo "please input the path of mouse genome(/public/share/Genomes/mm10_Bowtie2Index/genome)"
exit
fi
if test -z $2
then
echo "please input the number of threads"
exit
fi
if test -z $3
then
echo "please input read1 fastq file"
exit
fi
if test -z $4
then
echo "please input read2 fastq file"
exit
fi
if test -z $5
then
insert=2000
fi
######################################################
echo -e "\n\tWelcom to analyze ChIP-seq for PE-fastq files "
echo -e "\n\tstep1: perform QC for ${fq}"
echo -e "\n\tstep2: bowtie2 -p $threads -X $insert -x $assembly -1 $fq1 -2 $fq2 -S ${fq}tmp.sam"
# -S ${fq}tmp.sam 指定输出到f1_f2tmp.sam文件中 map到基因组上
bowtie2 -p $threads -X $insert -x $assembly -1 $fq1 -2 $fq2 -S ${fq}tmp.sam
# samtools view -Sbq: b将 SAM 文件转换为 BAM 格式,-q 1 表示过滤掉比对质量低的读段,-S是指定输入文件是sam格式,但新版本会自动检测
samtools view -Sbq 1 ${fq}tmp.sam > ${fq}tmp.bam
# 删除sam文件以节省空间
rm -rf ${fq}tmp.sam
echo -e "\n\tstep3: remove PCR-duplicated reads for mapped reads."
# 对bam文件排序 排序的依据是每个读段在基因组中的位置(染色体编号和位置)
samtools sort -o ${fq}tmp1.bam ${fq}tmp.bam
# 去除重复的PCR读段
samtools rmdup ${fq}tmp1.bam ${fq}tmp_rmdup.bam
# 删除中间文件
rm -rf ${fq}tmp.bam
rm -rf ${fq}tmp1.bam
echo -e "\n\tstep4: sort bam file."
samtools sort -o ${fq}_rmdup_sorted.bam ${fq}tmp_rmdup.bam
rm -rf ${fq}tmp_rmdup.bam
# 计算RPKM并标准化
# RPKM 是一种用于 RNA-seq 和 ChIP-seq 数据标准化的度量单位,表示每千碱基转录长度每百万比对的读段数。
# 在 bamCoverage 命令中,-bs 25 表示 bin size 为 25 bp,这意味着将基因组分成大小为 25 碱基对(bp)的窗口,每个窗口内计算一个 RPKM 值
# RPKM = 在该基因或区域中比对到的 reads(读段)*10**9/该基因或区域的长度 * 整个样本中所有比对的 reads 数量)
# -b 输入 -o 输出
# 对${fq}_rmdup_sorted.bam 进行质量控制
# samtools flagstat ${fq}_rmdup_sorted.bam > ./${fq}.raw.stat
# -f 2 选项代表 只保留双端比对的 reads -q 30 选项代表只保留比对质量大于等于30的 reads
echo -e "\n\tstep5: Quality Control"
samtools view -h -f 2 -q 30 ${fq}_rmdup_sorted.bam | grep -v -e "chrM" | samtools sort -O bam -@ 10 -o - > ${fq}.last.bam
samtools index ${fq}.last.bam
samtools flagstat ${fq}.last.bam > ./${fq}.last.stat
rm -rf ${fq}_rmdup_sorted.bam
echo -e "\n\tstep6: calculate RPKM within each bin of 25bp and convert bam to bw/bg file."
bamCoverage -b ${fq}.last.bam -o ${fq}.last.bw --normalizeUsing RPKM --binSize 25 -p 10
bamCoverage -of bedgraph -bs 25 --normalizeUsing RPKM -b ${fq}.last.bam -o ${fq}_RPKMtmp.bg -p 10
awk '$4>0 {printf("%s\t%d\t%d\t%.2f\n",$1,$2,$3,$4)}' ${fq}_RPKMtmp.bg > ${fq}_RPKM25bp_last.bg
# 删除过滤前的文件
rm -rf ${fq}_RPKMtmp.bg
echo -e "\n\tstep7: convert bam into bed file for macs2 callpeak"
# 转换成bed文件格式
bedtools bamtobed -i ${fq}.last.bam > ${fq}.last.bed
echo -e "\n\tProcessing $fq completed."
# 最终的文件:1.${fq}_RPKM25bp_last.bg 2.${fq}.last.bed 3.${fq}.last.bam 4.${fq}.last.bw
最终生成6 个文件,满足大多数情况下的下游分析:
image.png
4.生成可上传 UCSC track 的文件:
Bedgraph2UcscTrack.sh cKO1_R1.fq.gz_cKO1_R2.fq.gz_RPKM25bp_last.bg cKO1_Pu.1 cKO1_Pu.1 200,100,50 mm10_chromInfo.bed 3
Bedgraph2UcscTrack.sh cKO2_R1.fq.gz_cKO2_R2.fq.gz_RPKM25bp_last.bg cKO2_Pu.1 cKO2_Pu.1 200,100,50 mm10_chromInfo.bed 3
Bedgraph2UcscTrack.sh cKO3_R1.fq.gz_cKO3_R2.fq.gz_RPKM25bp_last.bg cKO3_Pu.1 cKO3_Pu.1 200,100,50 mm10_chromInfo.bed 3
Bedgraph2UcscTrack.sh cWT1_R1.fq.gz_cWT1_R2.fq.gz_RPKM25bp_last.bg cWT1_Pu.1 cWT1_Pu.1 200,100,50 mm10_chromInfo.bed 3
Bedgraph2UcscTrack.sh cWT2_R1.fq.gz_cWT2_R2.fq.gz_RPKM25bp_last.bg cWT2_Pu.1 cWT2_Pu.1 200,100,50 mm10_chromInfo.bed 3
Bedgraph2UcscTrack.sh cWT3_R1.fq.gz_cWT3_R2.fq.gz_RPKM25bp_last.bg cWT3_Pu.1 cWT3_Pu.1 200,100,50 mm10_chromInfo.bed 3
其中脚本的详细内容为:
#!/usr/bin/bash
if test -z $1
then
echo ""
echo " please specify a bedgraph file."
echo ""
exit
fi
if test -z $2
then
echo ""
echo " please specify short-name."
echo ""
exit
fi
if test -z $3
then
echo ""
echo " please specify long-name."
echo ""
exit
fi
if test -z $4
then
echo ""
echo " please specify track colour, for example 200,100,50."
echo ""
exit
fi
if test -z $5
then
echo ""
echo " please specify human chromosome-size (for example /home/public/share/Genomes/hg38_chromInfo.bed)."
echo " please specify mouse chromosome-size (for example /home/public/share/Genomes/mm10_chromInfo.bed)."
echo ""
exit
fi
if test -z $6
then
echo ""
echo " please specify mini-value (0)."
echo ""
exit
fi
#=================================================
echo -e "track type=bedGraph name=$2 description=$3 color=$4 autoScale=off alwaysZero=on windowingFunction=maximum smoothingWindow=5 viewLimits=0:100" > $1.ucsc${6}.bg
cat $1 |grep ^chr - | bedtools intersect -b $5 -a - -f 1 | awk -v N="$6" '{printf("%s\t%d\t%d\t%.2f\n",$1,$2,$3,$4-N)}' - | awk '$4>0 {printf("%s\t%d\t%d\t%.2f\n",$1,$2,$3,$4)}' - >> $1.ucsc${6}.bg
gzip $1.ucsc${6}.bg
生成的最终文件:
image.png
结果如图:
image.png
5.使用 deeptools 绘制 heatmap and profilePlot
5.1我的做法是把 cWT 和 cKO pool together:
samtools merge cWT.merge.bam cWT*.last.bam
samtools merge cKO.merge.bam cKO*.last.bam
samtools index cKO.merge.bam
samtools index cWT.merge.bam
5.2将 bam 文件转换成 bw 文件:
# 1. 使用 bamCoverage 将 BAM 文件转换为 BigWig 文件
# 对 cKO 样本
for filename in cKO*.merge.bam; do
bamCoverage -b ${filename} -o ${filename%.bam}.bw --normalizeUsing RPKM --binSize 25 -p 10 --effectiveGenomeSize 2652783500
done
# 对 cWT 样本
for filename in cWT*.merge.bam; do
bamCoverage -b ${filename} -o ${filename%.bam}.bw --normalizeUsing RPKM --binSize 25 -p 10 --effectiveGenomeSize 2652783500
done
# 2. 使用 computeMatrix 生成矩阵文件
# 对 cKO cWT 样本,窗口大小修改为上下游各 3000 bp(即 3kb),参考点为 TSS
computeMatrix reference-point \
--referencePoint center -p 10 -b 3000 -a 3000 \
-R cKO_specific_broad_peaks.bed -S ./*.bw \
--skipZeros -o cKO_cWT_matrix.TSS.gz --outFileSortedRegions cKO_cWT_regions.bed
# 绘制 热图
plotHeatmap -m cKO_cWT_matrix.TSS.gz -o cKO_cWT_heatmap.pdf --colorMap RdBu --perGroup --heatmapHeight 15 --heatmapWidth 5 --whatToShow 'heatmap and colorbar' --plotFileFormat pdf --dpi 720
# 4. 使用 plotProfile 绘制平均信号的剖面图
plotProfile -m cKO_cWT_matrix.TSS.gz -o cKO_cWT_profile.pdf --perGroup --yMax 50 --plotTitle "cKO VS cWT Profile" --plotFileFormat pdf --dpi 720
Note:
1.其中我设置了一个effectiveGenomeSize 2652783500 的参数,其数值大小可在下面网站找到:
https://test-argparse-readoc.readthedocs.io/en/latest/content/feature/effectiveGenomeSize.html
image.png
2.在使用 computeMatrix的时候我是用的是cKO_specific_broad_peaks.bed的参考,是为了看在 cKO_specific的位置上,cWT 的 Pu.1的结合情况,至于这个文件怎么生成的后面我会介绍。
5.3结果图片(经过 AI 修改)
image.png
6.masc2 callpeak and annotation
6.1
#broad mode
ls *.bed | while read id ; do
(macs2 callpeak -t $id -g mm -n ${id%%.*} --broad --outdir ./)
done
6.2 从三个重复中找到共有的 peak
chipr -i cKO1_R1_peaks.broadPeak cKO2_R1_peaks.broadPeak cKO3_R1_peaks.broadPeak -m 3 -o output_cKO_broad
chipr -i cWT1_R1_peaks.broadPeak cWT2_R1_peaks.broadPeak cWT3_R1_peaks.broadPeak -m 3 -o output_cWT_broad
6.2步的结果:
image.png
6.3找到 cKO specific 的 broad peak(对应上文的 computeMatrix)
nohup bedtools subtract -a output_cKO_broad_all.bed -b output_cWT_broad_all.bed > cKO_specific_broad_peaks.bed &
6.4 对 cKO broad peak 进行注释 (获得 PU.1的 target-genes)
annotatePeaks.pl output_cKO_broad_all.bed mm10 > Pu.1_target_broad_annotated.xls
结语:
本文未包括对 peak 的注释,后面需要补充一下,另外,还需要学习一下 diffbind 的使用。
需要注意的是,ATAC-seq 的处理流程和 Chip-seq 基本一致,不过 macs2 callpeak 的参数,需要设置一下,另外还要增加一个 find motif 的内容。
ATAC-seq findMotif(Known motif) 出图:
image.png
可以看到 Pu.1 (a key transcription fator of microglia)排名第一。
#ATAC-seq callpeak的参数设置
ls *.bed | while read id ;do (macs2 callpeak -t $id -g mm --nomodel --shift -100 --extsize 200 -n ${id%%.*} --outdir ./) ;done
#寻找 ATAC-seq中cKO-specific的 motif
awk '{print $4"\t"$1"\t"$2"\t"$3"\t+"}' cKO_specific_peaks.bed > All.homer_peaks.tmp
findMotifsGenome.pl All.homer_peaks.tmp mm10 All_motifDir -len 8,10,12
paper link:
https://www.sciencedirect.com/science/article/pii/S2211124724002924?via%3Dihub
Chip-seq/ATAC-seq中使用的 bed 文件获取方式:
从http://genome.ucsc.edu/cgi-bin/hgTables下载
image.png
数据来源:
image.png
模式图:
image.png