Phylogenetic tree building (July)
#!/bin/bash
##chenbin
/home/hujiaxiang/tools_update/prank_170703/bin/prank -d=./01data/PB2.fa +F \
-DNA -iterate=1000 -o=./02align/PB2_prank_align.fa > PB2.log
/home/hujiaxiang/tools_update/prank_170703/bin/prank -d=./01data/NP.fa +F \
-DNA -iterate=1000 -o=./02align/NP_prank_align.fa > NP.log
/home/hujiaxiang/tools_update/prank_170703/bin/prank -d=./01data/NS.fa +F \
-DNA -iterate=1000 -o=./02align/NS_prank_align.fa > NS.log
/home/hujiaxiang/tools_update/prank_170703/bin/prank -d=./01data/M.fa +F \
-DNA -iterate=1000 -o=./02align/M_prank_align.fa > M.log
mafft --maxiterate 1000 --localpair ./02align/PB2_0814.fas \
> ./02align/mafft_PB2_0814.fas
#muscle -align ./01data/all.fa -output muscle.output.fa
First align with PRANK to obtain the aligned .fas files, then build the trees with iqtree2 to obtain the tree files.
iqtree2 -s ./02align/H3_0813b.fas -st DNA -bb 1000 \
-m TEST -pre ./03tree/H3
iqtree2 -s ./02align/M_0814.fas -st DNA -bb 1000 \
-m TEST -pre ./03tree/M
iqtree2 -s ./02align/N8_0813.fas -st DNA -bb 1000 \
-m TEST -pre ./03tree/N8
iqtree2 -s ./02align/NP_0814.fas -st DNA -bb 1000 \
-m TEST -pre ./03tree/NP
iqtree2 -s ./02align/NS_0814.fas -st DNA -bb 1000 \
-m TEST -pre ./03tree/NS
iqtree2 -s ./02align/PA_0814.fas -st DNA -bb 1000 \
-m TEST -pre ./03tree/PA
iqtree2 -s ./02align/PB1_0814.fas -st DNA -bb 1000 \
-m TEST -pre ./03tree/PB1
iqtree2 -s ./02align/PB2_0814.fas -st DNA -bb 1000 \
-m TEST -pre ./03tree/PB2
iqtree2 -s ./02align/mafft_PB2_0814.fas -st DNA -bb 1000 \
-m TEST -pre ./03tree/mafft_PB2
After the tree files are obtained, visualize them in FigTree and then polish the figures with iTOL.
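The tree to load is the Newick file that IQ-TREE 2 writes under each -pre prefix; for example, from the PB2 run above:
ls ./03tree/PB2.treefile   # Newick tree from IQ-TREE 2; open it in FigTree or upload it to iTOL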
Compared the differences between three alignment methods (PRANK, MAFFT, MUSCLE):
iqtree2 -s ./02align/PA_prank_align.fa.best.fas -st DNA \
-bb 1000 -m TEST -pre ./03tree/PA_prank
iqtree2 -s ./02align/mafft_align.fa -st DNA -bb 1000 \
-m TEST -pre ./03tree/mafft
iqtree2 -s ./muscle.output.fa -st DNA -bb 1000 \
-m TEST -pre ./03tree/muscle
Selected data from two companies for comparison.
#!/bin/bash
## Build references for other species
cellranger mkref \
--genome=GRCg6a \
--nthreads=10 \
--fasta=/home3/chenbin/scRNA/ref/Gallus_gallus.GRCg6a.dna.toplevel.fa \
--genes=/home3/chenbin/scRNA/ref/GRCg6a.filtered.gtf
# Once the reference is built, the transcriptome data can be quantified
cellranger count \
--id=PBMC \
--transcriptome=/home3/chenbin/scRNA/GRCg6a \
--create-bam true \
--fastqs=/home3/chenbin/scRNA/rawdata/PBMC \
--localcores=8 \
--localmem=64
cellranger count \
--id=H9N2 \
--transcriptome=/home3/chenbin/scRNA/GRCg6a \
--create-bam true \
--fastqs=/home3/chenbin/scRNA/rawdata/H9N2 \
--localcores=8 \
--localmem=64
ChIP
#!/bin/bash
# RNA-seq
# from chenb
# Define environment variables for the file paths used below
RAWDATA=/home2/wangyiming/CleanData/
OUT=./mapping
REF=./
# 03 Build the reference genome index
mkdir mapping
/home/wangyiming/biosoft/hisat2-2.2.1/hisat2-build GRCh38.p14.genome.fa $OUT/human
# 04 Alignment, BAM conversion, and indexing
# Paired-end sequencing
for i in `cat ./id.txt`
do
/home/wangyiming/biosoft/hisat2-2.2.1/hisat2 --new-summary \
-p 4 -x $OUT/human \
-1 $RAWDATA/${i}_clean_R1.fq.gz \
-2 $RAWDATA/${i}_clean_R2.fq.gz \
-S $OUT/${i}.sam
samtools view -@ 16 -bh $OUT/${i}.sam -o $OUT/${i}.bam
samtools view -bF 4 -q 10 $OUT/${i}.bam > $OUT/${i}_filtered.bam
samtools sort -@8 $OUT/${i}_filtered.bam -o $OUT/${i}_filtered_sorted.bam
samtools index $OUT/${i}_filtered_sorted.bam # build the index
rm $OUT/${i}.sam
done
# 03 Quantification
mkdir 03Quantification
for i in `cat ./id.txt`
do
featureCounts \
-a ./human.gtf \
-T 4 -p -M --countReadPairs -g gene_name -t exon \
-o ./03Quantification/${i}.count $OUT/${i}_filtered_sorted.bam
done
ChIP data analysis
# Python 3-based software; it will not run in a Python 2 environment;
# supports parallel execution, resuming from breakpoints, memory control, time control, etc.;
###################################
# test snakemake
###################################
# cutadapt -j 1 because of: ERROR: Running in parallel is not supported on Python 2
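The notes do not record how the Snakefile below is launched; a minimal sketch of a typical invocation (the Snakefile name and core count are assumptions, not from the notes):
snakemake -s Snakefile -np                            # dry run: preview the jobs and shell commands
snakemake -s Snakefile --cores 8                      # run with up to 8 parallel jobs
snakemake -s Snakefile --cores 8 --rerun-incomplete   # resume after an interruption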
PICARD = "/home/chenbin/miniconda3/share/picard-2.18.29-0/picard.jar"
REF_INDEX = {"S1_input_V300092278_L01_101","S1_V300092278_L01_97",
"S2_input_V300092278_L01_102","S2_V300092278_L01_98",
"S3_input_V300092278_L01_103","S3_V300092278_L01_99",
"S4_input_V300092278_L01_104","S4_V300092278_L01_100"}
INDEX_BT2 = "/home3/chenbin/chuze/RNA-seq/human"
# Intermediate and final outputs to keep
rule all:
input:
expand("fix.fastq/{rep}_R1.fq.gz",rep=REF_INDEX),
expand("bam/{rep}_sort.bam",rep=REF_INDEX),
expand("bam/{rep}_sort_rmdup.bam",rep=REF_INDEX),
expand("macs2_result/{rep}_peaks.narrowPeak",rep=REF_INDEX)
rule cutadapt:
input:
"01fastq/{rep}_1.fq.gz",
"01fastq/{rep}_2.fq.gz"
output:
"fix.fastq/{rep}_R1.fq.gz",
"fix.fastq/{rep}_R2.fq.gz"
log:
"fix.fastq/{rep}_cutadapt.log"
shell:
"cutadapt -j 1 --times 1 -e 0.1 -O 3 --quality-cutoff 25 \
-m 50 \
-a CTGTCTCTTATACACATCTCCGAGCCCACGAGAC \
-A CTGTCTCTTATACACATCTGACGCTGCCGACGA \
-o {output[0]} -p {output[1]} {input[0]} {input[1]} > {log} 2>&1 "
rule bt2_mapping:
input:
"fix.fastq/{rep}_R1.fq.gz",
"fix.fastq/{rep}_R2.fq.gz"
output:
"bam/{rep}.sam"
log:
"bam/{rep}.log"
shell:
"/opt/biosoft/bowtie2-2.2.6/bowtie2 -x {INDEX_BT2} -p 4 \
-X 2000 -1 {input[0]} -2 {input[1]} \
-S {output} --no-unal > {log} 2>&1"
rule bam_file_filter:
input:
"bam/{rep}.sam"
output:
"bam/{rep}.bam"
log:
"bam/{rep}_filter.log"
shell:
"samtools view -h -b -@ 10 -f 3 -F 12 -F 256 {input} > {output}"
rule bam_file_sort:
input:
"bam/{rep}.bam"
output:
"bam/{rep}_sort.bam"
log:
"bam/{rep}_sort.log"
shell:
"samtools sort -O BAM -o {output} -T {output}.temp -@ 4 -m 2G {input}"
rule remove_duplication:
input:
"bam/{rep}_sort.bam"
output:
"bam/{rep}_sort_rmdup.bam",
"bam/{rep}_sort_rmdup.matrix"
log:
"bam/{rep}_sort_rmdup.log"
shell:
"java -Xms20g -Xmx20g -XX:ParallelGCThreads=10 \
-jar {PICARD} MarkDuplicates \
I={input} O={output[0]} M={output[1]} \
ASO=coordinate REMOVE_DUPLICATES=true >{log} 2>&1"
rule remove_low_qual:
input:
"bam/{rep}_sort_rmdup.bam"
output:
"bam/{rep}_sort_rmdup_q30.bam"
log:
"bam/{rep}_sort_rmdup_q30.log"
shell:
"sambamba view -t 5 -h -f bam -F \"mapping_quality >= 30\" \
-o {output} {input} >{log} 2>&1"
#rule remove_MT:
# input:
# "bam/{rep}_sort_rmdup_q30.bam"
# output:
# "bam/{rep}_sort_rmdup_q30_rmMT.bam"
# log:
# "bam/{rep}_sort_rmdup_q30_rmMT.log"
# shell:
# "samtools view -h {input} | grep -v 'NC_010965.1'
# | samtools view -bS -o {output}"
rule call_peak:
input:
"bam/{rep}_sort_rmdup_q30.bam"
output:
"macs2_result/{rep}_peaks.narrowPeak"
params:
"{rep}",
"macs2_result"
log:
"macs2_result/{rep}.log"
shell:
"macs2 callpeak -t {input} -f BAM -g 1.2e9 \
--keep-dup all -B --SPMR \
--outdir {params[1]} -n {params[0]} \
--nomodel --shift -75 --extsize 150 > {log} 2>&1"
bamCoverage -e 170 -bs 10 -b ./bam/S1_input_V300092278_L01_101_sort_rmdup_q30.bam \
-o ./bw/rmdup/S1_input.bw
bamCoverage -e 170 -bs 10 -b ./bam/S1_V300092278_L01_97_sort_rmdup_q30.bam \
-o ./bw/rmdup/S1.bw
bamCoverage -e 170 -bs 10 -b ./bam/S2_input_V300092278_L01_102_sort_rmdup_q30.bam \
-o ./bw/rmdup/S2_input.bw
bamCoverage -e 170 -bs 10 -b ./bam/S2_V300092278_L01_98_sort_rmdup_q30.bam \
-o ./bw/rmdup/S2.bw
bamCoverage -e 170 -bs 10 -b ./bam/S3_input_V300092278_L01_103_sort_rmdup_q30.bam \
-o ./bw/rmdup/S3_input.bw
bamCoverage -e 170 -bs 10 -b ./bam/S3_V300092278_L01_99_sort_rmdup_q30.bam \
-o ./bw/rmdup/S3.bw
bamCoverage -e 170 -bs 10 -b ./bam/S4_input_V300092278_L01_104_sort_rmdup_q30.bam \
-o ./bw/rmdup/S4_input.bw
bamCoverage -e 170 -bs 10 -b ./bam/S4_V300092278_L01_100_sort_rmdup_q30.bam \
-o ./bw/rmdup/S4.bw
samtools index ./bam/S1_input_V300092278_L01_101_sort.bam
samtools index ./bam/S1_V300092278_L01_97_sort.bam
samtools index ./bam/S2_input_V300092278_L01_102_sort.bam
samtools index ./bam/S2_V300092278_L01_98_sort.bam
samtools index ./bam/S3_input_V300092278_L01_103_sort.bam
samtools index ./bam/S3_V300092278_L01_99_sort.bam
samtools index ./bam/S4_input_V300092278_L01_104_sort.bam
samtools index ./bam/S4_V300092278_L01_100_sort.bam
bamCoverage -e 170 -bs 10 -b ./bam/S1_input_V300092278_L01_101_sort.bam \
-o ./bw/onlysort/S1_input.bw
bamCoverage -e 170 -bs 10 -b ./bam/S1_V300092278_L01_97_sort.bam \
-o ./bw/onlysort/S1.bw
bamCoverage -e 170 -bs 10 -b ./bam/S2_input_V300092278_L01_102_sort.bam \
-o ./bw/onlysort/S2_input.bw
bamCoverage -e 170 -bs 10 -b ./bam/S2_V300092278_L01_98_sort.bam \
-o ./bw/onlysort/S2.bw
bamCoverage -e 170 -bs 10 -b ./bam/S3_input_V300092278_L01_103_sort.bam \
-o ./bw/onlysort/S3_input.bw
bamCoverage -e 170 -bs 10 -b ./bam/S3_V300092278_L01_99_sort.bam \
-o ./bw/onlysort/S3.bw
bamCoverage -e 170 -bs 10 -b ./bam/S4_input_V300092278_L01_104_sort.bam \
-o ./bw/onlysort/S4_input.bw
bamCoverage -e 170 -bs 10 -b ./bam/S4_V300092278_L01_100_sort.bam \
-o ./bw/onlysort/S4.bw
computeMatrix reference-point --referencePoint TSS -b 2000 -a 2000 \
-R /home3/chenbin/chuze/RNA-seq/RIGI_in_GRCh38.bed \
-S /home3/chenbin/chuze/bw/onlysort/S1_input.bw \
/home3/chenbin/chuze/bw/onlysort/S1.bw \
/home3/chenbin/chuze/bw/onlysort/S2_input.bw \
/home3/chenbin/chuze/bw/onlysort/S2.bw \
/home3/chenbin/chuze/bw/onlysort/S3_input.bw \
/home3/chenbin/chuze/bw/onlysort/S3.bw \
-p 25 \
--missingDataAsZero \
--skipZeros \
-o chr9_RIGI2.tss.gz
# S4 excluded: /home3/chenbin/chuze/bw/onlysort/S4_input.bw /home3/chenbin/chuze/bw/onlysort/S4.bw
plotProfile -m chr9_RIGI2.tss.gz \
-out chr9_RIGI.plotProfile2.png \
--plotTitle "RIGI" \
--samplesLabel "S1_input" "S1" "S2_input" "S2" "S3_input" "S3"
options(BIOCONDUCTOR_ONLINE_VERSION_DIAGNOSIS=TRUE)
library(ChIPseeker)
require(GenomicFeatures)
library(ggplot2)
library(plotrix)
BiocManager::install("S4Vectors")
BiocManager::install("ChIPpeakAnno")
library(ChIPpeakAnno)
BiocManager::install("TxDb.Hsapiens.UCSC.hg38.knownGene")
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
library(S4Vectors)
files <- getSampleFiles()
basename(unlist(files))
covplot(files[[5]])
# e.g. plot the 4th and 5th files (the MACS-generated BED files contain the standard 5 columns)
peak=GenomicRanges::GRangesList(CBX6=readPeakFile(files[[4]]),CBX7=readPeakFile(files[[5]]))
covplot(peak, weightCol="V5") + facet_grid(chr ~ .id)
# Read peak data from the specified files
peak_CBX6 <- readPeakFile(files[[4]])
peak_CBX7 <- readPeakFile(files[[5]])
require(TxDb.Hsapiens.UCSC.hg19.knownGene)
setwd("D://chuzhe")
files<-as.list(list.files(path = "D://chuzhe", pattern = "bed$", full.names = TRUE, ignore.case = FALSE))
covplot(files[[1]])
require(GenomicFeatures)
OAS1<-readPeakFile("D://chuzhe//S1_V300092278_L01_97_peaks.narrowPeak")
OAS2<-readPeakFile("D://chuzhe//S2_V300092278_L01_98_peaks.narrowPeak")
OAS3<-readPeakFile("D://chuzhe//S3_V300092278_L01_99_peaks.narrowPeak")
peak=GenomicRanges::GRangesList(OAS1=OAS1,OAS2=OAS2,OAS3=OAS3)
covplot(peak, weightCol="V5") + facet_grid(chr ~ .id)
peak_df <- as.data.frame(peak) # convert to a data frame
covplot(peak_df, weightCol = "V5") + facet_grid(chr ~ .id)
txdb = TxDb.Hsapiens.UCSC.hg38.knownGene
f = getSampleFiles()[[4]]
x = annotatePeak(f, tssRegion=c(-2000, 2000), TxDb=txdb)
oas1_peak <- peak$OAS1
covplot(oas1_peak, weightCol="V5") + facet_grid(chr ~ .id)
peakAnno <- annotatePeak(OAS3,
tssRegion=c(-2000, 2000),
TxDb=txdb, annoDb="org.Hs.eg.db")
plotAnnoPie(peakAnno)
write.table(peakAnno, 'OAS3_peak.txt', sep = '\t', row.names = FALSE, quote = FALSE)
anno<-assignChromosomeRegion(OAS1, nucleotideLevel=FALSE,precedence=c("Promoters","immediateDownstream",
"fiveUTRs", "threeUTRs","Exons", "Introns"), TxDb=txdb)
##find overlap
ol <- findOverlapsOfPeaks(OAS1, OAS2,OAS3)
write.table(ol$peaklist, 'OAS_overlap.txt', sep = '\t', row.names = FALSE, quote = FALSE)
pie3D(as.numeric(anno$percentage), labels = names(anno$percentage))
barplot(anno$percentage, las = 3)
## Find overlaps with transcription factors
TF<-read.table("TF.txt")
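The overlap step itself is not written down here; a minimal sketch, assuming TF.txt lists binding sites with chromosome, start, and end in its first three columns (a hypothetical layout):
TF_gr <- GRanges(seqnames = TF$V1, ranges = IRanges(start = TF$V2, end = TF$V3))  # assumes V1/V2/V3 = chrom/start/end
ol_TF <- findOverlapsOfPeaks(OAS1, TF_gr)
makeVennDiagram(ol_TF)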
For the 2018 Cell paper: first use the transcriptome data to obtain the gene expression matrix.
#!/bin/bash
# RNA-seq (project name)
# from chenb (script author and email)
# Define environment variables for the file paths used below
RAWDATA=/home3/chenbin/chuze/cell_article/rnaseq/01rawdata
TRIM=/home3/chenbin/chuze/cell_article/rnaseq/02trim
OUT=/home3/chenbin/chuze/cell_article/rnaseq/03bam
REF=/home3/chenbin/chuze/RNA-seq
#wget -P anntation/ http://hgdownload.soe.ucsc.edu/goldenPath/mm39/bigZips/genes/mm39.ncbiRefSeq.gtf.gz
#wget -P genome/ http://hgdownload.soe.ucsc.edu/goldenPath/mm39/bigZips/mm39.fa.gz
# 01 Quality control
fastqc -t 4 $RAWDATA/HCT116_wt_RNASeq.fastq.gz -o $RAWDATA
# 02 Remove low-quality sequences and adapters
# Single-end sequencing data
trimmomatic SE -threads 4 \
$RAWDATA/HCT116_wt_RNASeq.fastq.gz \
$TRIM/HCT116_wt_clean.fq.gz \
LEADING:3 TRAILING:3 \
SLIDINGWINDOW:4:20 MINLEN:50 TOPHRED33
fastqc -t 4 $TRIM/HCT116_wt_clean.fq.gz -o $TRIM
# 03 Build the reference genome index
#mkdir 02Mapping
hisat2-build $REF/GRCh38.p14.genome.fa $REF/GRCh38
### Reference genome version information, see: https://www.gencodegenes.org/mouse/
# 04 Alignment, BAM conversion, and indexing
# Single-end alignment (single-end data, hence -U)
hisat2 --new-summary \
-p 4 -x $REF/GRCh38 \
-U $TRIM/HCT116_wt_clean.fq.gz \
-S $OUT/HCT116_wt.sam
samtools view -@ 16 -bh $OUT/HCT116_wt.sam -o $OUT/HCT116_wt.bam
samtools view -bF 4 -q 10 $OUT/HCT116_wt.bam > $OUT/HCT116_wt_filtered.bam
samtools sort -@8 $OUT/HCT116_wt_filtered.bam -o $OUT/HCT116_wt_filtered_sorted.bam
samtools index $OUT/HCT116_wt_filtered_sorted.bam # build the index
#rm $OUT/HCT116_wt.sam
# 03 Quantification
mkdir 04Quantification
featureCounts \
-a $REF/gencode.v47.chr_patch_hapl_scaff.annotation.gff3 \
-T 4 -M -g gene_id -t exon \
-o ./04Quantification/HCT116_wt.count $OUT/HCT116_wt_filtered_sorted.bam
Based on expression level, divide all genes into four classes: the top 10% of expressed genes are "high", those up to the top 50% are "medium", the bottom 50% are "low", and unexpressed genes are "silent".
import pandas as pd
# The data are assumed to be tab-separated (here 'GSM1385719_genes.fpkm_tracking.txt')
data = pd.read_csv('GSM1385719_genes.fpkm_tracking.txt', sep='\t')
# 1. Write the gene_id of genes with FPKM == 0 to silent_ENSG.txt
silent_genes = data[data['FPKM'] == 0]['gene_id']
silent_genes.to_csv('silent_ENSG.txt', index=False, header=False)
# 2. Drop rows with FPKM == 0
data = data[data['FPKM'] > 0]
# 3. Sort by FPKM in descending order
data_sorted = data.sort_values(by='FPKM', ascending=False)
# 4. Compute the quantile cutoffs
total_genes = len(data_sorted)
high_limit = int(0.05 * total_genes) # top 5%
medium_limit = int(0.5 * total_genes) # top 50%
# 5. Write the gene_ids to the corresponding files
high_genes = data_sorted.iloc[:high_limit]['gene_id']
medium_genes = data_sorted.iloc[high_limit:medium_limit]['gene_id']
low_genes = data_sorted.iloc[medium_limit:]['gene_id']
high_genes.to_csv('high_ENSG.txt', index=False, header=False)
medium_genes.to_csv('medium_ENSG.txt', index=False, header=False)
low_genes.to_csv('low_ENSG.txt', index=False, header=False)
#cat ./high_ENSG.txt | while read id;do \
# grep "$id" gencode.v19.chr_patch_hapl_scaff.annotation.gtf >> high.gtf
#done
#cat ./medium_ENSG.txt | while read id;do \
# grep "$id" gencode.v19.chr_patch_hapl_scaff.annotation.gtf >> medium.gtf
#done
#cat ./low_ENSG.txt | while read id;do \
# grep "$id" gencode.v19.chr_patch_hapl_scaff.annotation.gtf >> low.gtf
#done
#cat ./silent_ENSG.txt | while read id;do \
# grep "$id" gencode.v19.chr_patch_hapl_scaff.annotation.gtf >> silent.gtf
#done
cat ./SYMBOL.txt | while read id;do \
grep "$id" gencode.v19.chr_patch_hapl_scaff.annotation.gtf >> RNAP_sort_symbol.gtf
done
Convert the obtained GTF files to BED format.
cat high.gtf | cut -f1,4,5,9 | cut -f1 -d";" | awk '{print $1, $2, $3, $5}' |\
sed -e 's/ /\t/g' | sed -e 's/\"//g' | sed -e 's/transcript\://g' > high.bed
cat low.gtf | cut -f1,4,5,9 | cut -f1 -d";" | awk '{print $1, $2, $3, $5}' | \
sed -e 's/ /\t/g' | sed -e 's/\"//g' | sed -e 's/transcript\://g' > low.bed
cat medium.gtf | cut -f1,4,5,9 | cut -f1 -d";" | awk '{print $1, $2, $3, $5}' | \
sed -e 's/ /\t/g' | sed -e 's/\"//g' | sed -e 's/transcript\://g' > medium.bed
cat silent.gtf | cut -f1,4,5,9 | cut -f1 -d";" | awk '{print $1, $2, $3, $5}' |\
sed -e 's/ /\t/g' | sed -e 's/\"//g' | sed -e 's/transcript\://g' > silent.bed
cat RNAP_ENSM_sort.gtf | cut -f1,4,5,9 | cut -f1 -d";" | awk '{print $1, $2, $3, $5}' |\
sed -e 's/ /\t/g' | sed -e 's/\"//g' | sed -e 's/transcript\://g' > RNAP_ENSM_sort.bed
Plot the signal over 3 kb upstream and downstream of the TSS.
pip install matplotlib pandas pyBigWig
import pyBigWig
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Open the bigWig file
bw_file = 'path_to_your_bw_file.bw'  # replace with the path to your bw file
bw = pyBigWig.open(bw_file)
# Read the BED file
bed_file = 'path_to_your_bed_file.bed'  # replace with the path to your bed file
df = pd.read_csv(bed_file, sep='\t', header=None, names=['chrom', 'start', 'end', 'expression'])
# Extract the per-base signal over the 3 kb windows up- and downstream of the TSS
def get_profile(chrom, tss):
    start = max(0, tss - 3000)
    end = tss + 3000
    try:
        # per-base bigWig values across the window; NaN where there is no data
        values = np.array(bw.values(chrom, start, end), dtype=float)
    except RuntimeError:
        # window falls outside the chromosome bounds (or unknown chromosome)
        return None
    return np.nan_to_num(values)
# Collect a profile for every gene, grouped by its expression class
tpm_data = {}
for _, row in df.iterrows():
    chrom = row['chrom']
    tss = row['start']  # assume the BED start coordinate is the TSS
    expression = row['expression']
    profile = get_profile(chrom, tss)
    if profile is None or len(profile) != 6000:  # skip windows truncated at chromosome ends
        continue
    tpm_data.setdefault(expression, []).append(profile)
# Average the profiles within each expression class (position-wise)
mean_tpm = {key: np.mean(values, axis=0) for key, values in tpm_data.items()}
# Plotting parameters
plt.figure(figsize=(10, 6))
# Plot Top1
plt.subplot(2, 1, 1)
for expr, tpm in mean_tpm.items():
    plt.plot(np.arange(-3000, 3000), tpm, label=expr)  # TSS at 0, window from -3 kb to +3 kb
plt.title('Top1')
plt.xlabel('Distance to TSS (bp)')
plt.ylabel('TPM (tags per million)')
plt.xticks(ticks=[-3000, -2000, -1000, 0, 1000, 2000, 3000],
labels=['-3 kb', '-2 kb', '-1 kb', 'TSS', '1 kb', '2 kb', '3 kb'])
plt.legend()
# Plot RNAPII
plt.subplot(2, 1, 2)
for expr, tpm in mean_tpm.items():
    plt.plot(np.arange(-3000, 3000), tpm, label=expr)  # TSS at 0
plt.title('RNAPII')
plt.xlabel('Distance to TSS (bp)')
plt.ylabel('TPM (tags per million)')
plt.xticks(ticks=[-3000, -2000, -1000, 0, 1000, 2000, 3000],
labels=['-3 kb', '-2 kb', '-1 kb', 'TSS', '1 kb', '2 kb', '3 kb'])
plt.legend()
plt.tight_layout()
plt.show()
computeMatrix reference-point \
-R RNAP_ENSM_sort.bed \
-S ./wig/GSM2058666_HCT116_wt_Top1_ChIPSeq.bw ./wig/GSM2058662_HCT116_wt_PolII_ChIPSeq.bw \
--referencePoint TSS \
-b 2000 -a 2000 \
-p 25 \
--skipZeros \
-o RNAP_sort_TSS.gz
# --outFileSortedRegions regions1_H3K4me3_l2r_genes.bed
#plotProfile -m RNAP_sort_TSS.gz \
# -out TOP1_profile.png \
# --numPlotsPerRow 2 \
# --plotTitle "TOP1 "
plotHeatmap -m RNAP_sort_TSS.gz -out RNAP_sort_all.png
ChIP analysis
PICARD = "/home/chenbin/miniconda3/share/picard-2.18.29-0/picard.jar"
REF_INDEX = {"HCT116_wt_PolII_ChIPSeq","HCT116_wt_Top1_ChIPSeq","HCT116_wt_input_ChIPSeq"}
INDEX_BT2 = "/home3/chenbin/chuze/RNA-seq/human"
# Intermediate and final outputs to keep
rule all:
input:
# expand("fix.fastq/{rep}.fastq.gz",rep=REF_INDEX),
expand("bam/{rep}_sort.bam",rep=REF_INDEX),
expand("bam/{rep}_sort_rmdup.bam",rep=REF_INDEX),
expand("macs2_result/{rep}_peaks.narrowPeak",rep=REF_INDEX)
#rule cutadapt:
# input:
# "01fastq/{rep}.fastq.gz",
# output:
# "fix.fastq/{rep}.fq.gz",
# log:
# "fix.fastq/{rep}_cutadapt.log"
# shell:
# "cutadapt -j 1 --times 1 -e 0.1 -O 3 --quality-cutoff 25 \
# -m 50 \
# -a CTGTCTCTTATACACATCTCCGAGCCCACGAGAC \
# -o {output[0]} {input[0]} > {log} 2>&1 "
rule bt2_mapping:
input:
"rawdata/{rep}.fastq.gz",
output:
"bam/{rep}.sam"
log:
"bam/{rep}.log"
shell:
"bowtie2 -x {INDEX_BT2} -p 4 -X 2000 -U {input} -S {output} --no-unal > {log} 2>&1"
rule bam_file_filter:
input:
"bam/{rep}.sam"
output:
"bam/{rep}.bam"
log:
"bam/{rep}_filter.log"
shell:
"samtools view -h -b -@ 10 -F 4 -F 256 {input} > {output}"
rule bam_file_sort:
input:
"bam/{rep}.bam"
output:
"bam/{rep}_sort.bam"
log:
"bam/{rep}_sort.log"
shell:
"samtools sort -O BAM -o {output} -T {output}.temp -@ 4 -m 2G {input}"
rule remove_duplication:
input:
"bam/{rep}_sort.bam"
output:
"bam/{rep}_sort_rmdup.bam",
"bam/{rep}_sort_rmdup.matrix"
log:
"bam/{rep}_sort_rmdup.log"
shell:
"java -Xms20g -Xmx20g -XX:ParallelGCThreads=10 \
-jar {PICARD} MarkDuplicates \
I={input} O={output[0]} M={output[1]} \
ASO=coordinate REMOVE_DUPLICATES=true >{log} 2>&1"
rule remove_low_qual:
input:
"bam/{rep}_sort_rmdup.bam"
output:
"bam/{rep}_sort_rmdup_q30.bam"
log:
"bam/{rep}_sort_rmdup_q30.log"
shell:
"sambamba view -t 5 -h -f bam -F \"mapping_quality >= 30\" -o {output} {input} >{log} 2>&1"
rule remove_MT:
input:
"bam/{rep}_sort_rmdup_q30.bam"
output:
"bam/{rep}_sort_rmdup_q30_rmMT.bam"
log:
"bam/{rep}_sort_rmdup_q30_rmMT.log"
shell:
"samtools view -h {input} | grep -v 'NC_010965.1' | samtools view -bS -o {output} -"
rule call_peak:
input:
"bam/{rep}_sort_rmdup_q30_rmMT.bam"
output:
"macs2_result/{rep}_peaks.narrowPeak"
params:
"{rep}",
"macs2_result"
log:
"macs2_result/{rep}.log"
shell:
"macs2 callpeak -t {input} -f BAM -g 1.2e9 \
--keep-dup all -B --SPMR \
--outdir {params[1]} -n {params[0]} \
--nomodel --shift -75 --extsize 150 > {log} 2>&1"
library(ChIPseeker)
library(ggplot2)
library(org.Hs.eg.db)
TOP1<-readPeakFile("D://chuzhe//cell_article/HCT116_wt_Top1.bed")
POI<-readPeakFile("D://chuzhe//cell_article/HCT116_wt_PolII.bed")
covplot(peak, weightCol="V5") + facet_grid(chr ~ .id)
peak1 <- readPeakFile(files[[4]])
peaks = lapply(files[4:5], readPeakFile)
covplot(peak2, weightCol = "V5", fill_color = c("turquoise","blue"))+facet_grid(chr ~ .id)
### Pie charts of the different genomic feature distributions
require(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb = TxDb.Hsapiens.UCSC.hg19.knownGene
options(ChIPseeker.ignore_promoter_subcategory=T)
options(ChIPseeker.ignore_1st_intron = T)
options(ChIPseeker.ignore_intron =T)
options(ChIPseeker.ignore_exon =T)
options(ChIPseeker.ignore_1st_exon = T)
options(ChIPseeker.ignore_downstream = T)
peakAnno <- annotatePeak(TOP1,
tssRegion=c(-5000, 5000),
TxDb=txdb, annoDb="org.Hs.eg.db")
anno<-peakAnno@annoStat
anno$Frequency<-round(anno$Frequency,1)
library(plotrix)
layout(matrix(c(1, 2), nrow = 1), widths = c(1, 1))
par(mar = c(1, 1, 1, 1))
color_map <- setNames(c("red","khaki", "cyan", "violet", "blue", "green"),
unique(anno$Feature))
colors_TOP1 <- color_map[anno$Feature]
pie3D(anno$Frequency, labels = paste(anno$Frequency, "%"),
explode = 0, theta = 1,height = 0.15,
main = "TOP1 ", col =colors_TOP1, labelcex = 0.8)
par(mar = c(1, 1, 1, 1))
plot.new()
legend("left", legend = names(color_map), fill = color_map, cex = 0.8)
### Peak binding profile plots
RNAP_peakAnno<-annotatePeak(POI,
tssRegion=c(-2000, 2000),
TxDb=txdb, annoDb="org.Hs.eg.db")
RNAP_anno<-RNAP_peakAnno@anno
RNAP_tmp=as.data.frame(RNAP_peakAnno)
TOP1_peakAnno <- annotatePeak(TOP1,
tssRegion=c(-2000, 2000),
TxDb=txdb, annoDb="org.Hs.eg.db")
TOP1_anno<-TOP1_peakAnno@anno
H3k4me3<-readPeakFile("D://chuzhe//cell_article//GSM945304_hg19_wgEncodeUwHistoneHct116H3k4me3StdPkRep1.narrowPeak")
H3k4me3_peakAnno<-annotatePeak(H3k4me3,
tssRegion=c(-2000, 2000),
TxDb=txdb, annoDb="org.Hs.eg.db")
H3k4me3_anno<-H3k4me3_peakAnno@anno
# Step 1: sort RNAP_anno and filter it
RNAP_sorted <- RNAP_anno[order(RNAP_anno$V5, decreasing = TRUE)]
RNAP_sorted <- RNAP_sorted[grepl("^Promoter", RNAP_sorted$annotation)]
# Step 2: extract the SYMBOL order after sorting
symbol_order <- unique(RNAP_sorted$SYMBOL)
# Then sort TOP1_anno and H3k4me3_anno according to that SYMBOL order
TOP1_sorted <- TOP1_anno[TOP1_anno$SYMBOL %in% symbol_order]
TOP1_sorted <- TOP1_sorted[order(match(TOP1_sorted$SYMBOL, symbol_order))]
TOP1_sorted <- TOP1_sorted[grepl("^Promoter", TOP1_sorted$annotation)]
H3k4me3_sorted <- H3k4me3_anno[H3k4me3_anno$SYMBOL %in% symbol_order]
H3k4me3_sorted <- H3k4me3_sorted[order(match(H3k4me3_sorted$SYMBOL, symbol_order))]
H3k4me3_sorted <- H3k4me3_sorted[grepl("^Promoter", H3k4me3_sorted$annotation)]
all_peak<-c(RNAP_sorted,TOP1_sorted,H3k4me3_sorted)
pdf("all-peak.pdf",width = 20,height = 30)
par(mfrow = c(1,3))
peakHeatmap(RNAP_sorted,TxDb = txdb,upstream=2000, downstream=2000,palette = "Set3",title="RNAPⅡ")
peakHeatmap(TOP1_sorted, TxDb = txdb, upstream = 2000, downstream = 2000,palette = "Set3",title="TOP1")
peakHeatmap(H3k4me3_sorted, TxDb = txdb, upstream = 2000, downstream = 2000,palette = "RdPu",title="H3k4me3")
dev.off()
library(dplyr)
# 1. Keep the records in RNAP_peakAnno whose annotation starts with "Promoter"
rnap_filtered <- RNAP_peakAnno@anno %>%
as.data.frame() %>%
filter(grepl("^Promoter", annotation))
# 2. Sort by V5 in descending order
rnap_sorted <- rnap_filtered %>%
arrange(desc(V5))
# 3. Get the SYMBOL column in sorted order
ordered_symbols <- rnap_sorted$SYMBOL
# 3. Get the unique SYMBOLs in sorted order
ordered_symbols <- unique(rnap_sorted$SYMBOL)
# 4. Apply the same filtering and sorting to TOP1_peakAnno and H3k4me3_peakAnno
top1_filtered <- TOP1_peakAnno@anno %>%
as.data.frame() %>%
filter(SYMBOL %in% ordered_symbols & grepl("^Promoter", annotation))
h3k4me3_filtered <- H3k4me3_peakAnno@anno %>%
as.data.frame() %>%
filter(SYMBOL %in% ordered_symbols & grepl("^Promoter", annotation))
# Re-sort the filtered data according to the order of ordered_symbols
top1_sorted <- top1_filtered %>%
arrange(factor(SYMBOL, levels = ordered_symbols))
h3k4me3_sorted <- h3k4me3_filtered %>%
arrange(factor(SYMBOL, levels = ordered_symbols))
# 5. Merge the three sorted data sets
library(purrr)
merged_data <- reduce(list(rnap_sorted, top1_sorted, h3k4me3_sorted), full_join)
# 6. Convert to GRanges objects
rnap_gr <- GRanges(seqnames = rnap_sorted$seqnames,
ranges = IRanges(start = rnap_sorted$start, end = rnap_sorted$end),
strand = rnap_sorted$strand)
top1_gr <- GRanges(seqnames = top1_sorted$seqnames,
ranges = IRanges(start = top1_sorted$start, end = top1_sorted$end),
strand = top1_sorted$strand)
h3k4me3_gr <- GRanges(seqnames = h3k4me3_sorted$seqnames,
ranges = IRanges(start = h3k4me3_sorted$start, end = h3k4me3_sorted$end),
strand = h3k4me3_sorted$strand)
# 7. Prepare to plot the heatmaps
files <- list(rnap_gr, top1_gr, h3k4me3_gr)
# 8. Plot the heatmap
peakHeatmap(rnap_gr, TxDb = txdb,
upstream = 2000, downstream = 2000)
promoter <- getPromoters(TxDb=txdb, upstream=2000, downstream=2000)
tagMatrix <- getTagMatrix(TOP1, windows=promoter)
data("tagMatrixList")
tagMatrix <- tagMatrixList[[4]]
tagHeatmap(tagMatrix)
peakHeatmap(TOP1,TxDb = txdb,upstream=2000, downstream=2000) +
scale_fill_distiller(palette = "Set3")
peakHeatmap(TOP1, TxDb = txdb, upstream = 2000, downstream = 2000) +
scale_fill_distiller(palette = "Set3", name = "TOP1")
basename(unlist(files))
require(TxDb.Hsapiens.UCSC.hg19.knownGene)
require("TxDb.Hsapiens.UCSC.hg19.lincRNAsTranscripts")
linc_txdb=TxDb.Hsapiens.UCSC.hg19.lincRNAsTranscripts
txdb = TxDb.Hsapiens.UCSC.hg19.knownGene
files<-getSampleFiles()
f<-readPeakFile(files[[4]])
x = annotatePeak(f, tssRegion=c(-2000, 2000), TxDb=txdb)
peakHeatmap(files, TxDb=txdb,
upstream=3000, downstream=3000,
palette=rainbow(length(files)))
#color="red")
options(ChIPseeker.ignore_1st_exon = T)
options(ChIPseeker.ignore_1st_intron = T)
options(ChIPseeker.ignore_downstream = T)
options(ChIPseeker.ignore_promoter_subcategory = T)
x=annotatePeak(f2)
plotAnnoPie(x)
D:\chuzhe\cell_article