传送门:
RNAseq006 转录组入门(6):reads计数
RNAseq005 转录组入门(5):序列比对
RNAseq004 转录组入门(4):参考基因组下载
RNAseq003 转录组入门(3):了解fastq测序数据
RNAseq002 转录组入门(2):数据下载
RNAseq001 转录组入门(1):资源准备
## sratoolkit
## Download and install sratoolkit
## http://www.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?view=software
## http://www.ncbi.nlm.nih.gov/books/NBK158900/
mkdir -p ~/biosoft && cd ~/biosoft
mkdir sratoolkit && cd sratoolkit
wget https://ftp-trace.ncbi.nlm.nih.gov/sra/sdk/2.10.8/sratoolkit.2.10.8-ubuntu64.tar.gz
tar xzvf sratoolkit.2.10.8-ubuntu64.tar.gz
echo "export PATH=\$PATH:/home/cqs/biosoft/sratookit/sratoolkit.2.10.8-ubuntu64/bin" >> ~/.bashrc
source ~/.bashrc
fastq-dump -h
# 如果有报错如下:
# This sra toolkit installation has not been configured.
# Before continuing, please run: vdb-config --interactive
# For more information, see https://www.ncbi.nlm.nih.gov/sra/docs/sra-cloud/
# 参考报错信息运行代码后,退出即可
vdb-config --interactive
# *************************************************************************************************************************************
# CMake
# CMake是一个跨平台的编译自动配置工具,它能够输出各种各样的makefile或者project文件,能测试编译器所支持的C++特性,类似UNIX下的automake。
# CMake可以编译源代码、制作程式库、产生适配器(wrapper)、还可以用任意的顺序建构执行档,CMake是一个比make更高级的编译配置工具。
mkdir -p ~/biosoft/mybin
echo "export PATH=\$PATH:/home/cqs/biosoft/mybin/bin" >> ~/.bashrc
source ~/.bashrc
cd ~/biosoft
mkdir cmake-3.3.2 && cd cmake-3.3.2
wget http://cmake.org/files/v3.3/cmake-3.3.2.tar.gz
tar xvfz cmake-3.3.2.tar.gz
cd cmake-3.3.2
# 首次使用编译需要配置gcc,g++
sudo apt-get update
# build-essential这个包会安装上g++,libc6-dev,linux-libc-dev,libstdc++-dev等必须的软件和头文件
sudo apt-get install build-essential
# prefix选项是配置安装的路径,如果不配置该选项,安装后可执行文件默认放在/usr/local/bin,库文件默认放在/usr/local/lib,配置文件默认放在/usr/local/etc,其它的资源文件放在/usr/local/share,较为凌乱
./configure --prefix=/home/cqs/biosoft/mybin
make
make install
# *************************************************************************************************************************************
## samtools
## Download and install samtools
## http://samtools.sourceforge.net/
## http://www.htslib.org/doc/samtools.html
cd ~/biosoft
mkdir samtools && cd samtools
wget https://github.com/samtools/samtools/archive/1.10.tar.gz
tar xzvf 1.10.tar.gz
cd samtools-1.10/
./configure --prefix=/home/cqs/biosoft/mybin
# ./configure报错解决
# bedidx.c:33:10: fatal error: zlib.h: No such file or directory
sudo apt-get install zlib1g-dev
# bam_tview_curses.c:41:10: fatal error: curses.h: No such file or directory
sudo apt-get install libncurses5-dev
# cram/cram_io.c:53:10: fatal error: bzlib.h: No such file or directory
sudo apt-get install libboost-all-dev
sudo apt-get install libbz2-dev
# cram/cram_io.c:57:10: fatal error: lzma.h: No such file or directory
sudo apt-get install liblzma-dev
# hfile_libcurl.c:47:10: fatal error: curl/curl.h: No such file or directory
libcurl4-openssl-dev
# 重新指定路径
./configure --prefix=/home/cqs/biosoft/mybin
make
make install
# echo "export PATH=\$PATH:/home/cqs/biosoft/samtools-1.10/samtools-1.10" >> ~/.bashrc
# source ~/.bashrc
samtools
# *************************************************************************************************************************************
## FastQC
## 主页 https://www.bioinformatics.babraham.ac.uk/projects/download.html#fastqc
# 判断系统是否安装java
java -version
# 安装java
sudo apt install default-jre
# 验证
java -version
cd ~/biosoft
mkdir fastqc_v0.11.9 && cd fastqc_v0.11.9
wget https://www.bioinformatics.babraham.ac.uk/projects/fastqc/fastqc_v0.11.9.zip
unzip fastqc_v0.11.9.zip
cd FastQC/
chmod u+x fastqc
echo "export PATH=\$PATH:/home/cqs/biosoft/fastQC-0.11.9/FastQC" >> ~/.bashrc
source ~/.bashrc
fastqc -h
# *************************************************************************************************************************************
## multiqc
## 方法一
cd ~/biosoft
mkdir multiqc-1.9 && cd multiqc-1.9
wget https://files.pythonhosted.org/packages/c8/2d/f0a6be15f861c5d165726d7afecd823ca158dff530b566379623a0e4534b/multiqc-1.9.tar.gz
tar zxvf multiqc-1.9.tar.gz
cd multiqc-1.9
python setup.py install
# 报错
# Traceback (most recent call last):
# File "setup.py", line 24, in <module>
# from setuptools import setup, find_packages
# ImportError: No module named setuptools
# python2环境下安装setuptools
sudo apt-get install python-setuptools
# python3环境下安装setuptools
sudo apt-get install python3-setuptools
# 再次执行安装
python setup.py install
-------------------------------------------------------------------------------------
## 方法二
# https://www.runoob.com/w3cnote/python-pip-install-usage.html
cd ~/biosoft
mkdir multiqc-1.9 && cd multiqc-1.9
# -t指定当前安装路径,-i指定清华源
pip install -t ./ -i https://pypi.tuna.tsinghua.edu.cn/simple multiqc
# 接下来就是一连串无法解决的报错了,multiqc不能指定位置安装,尴尬
sudo apt-get update
sudo apt-get install python3-pip
# 默认安装,完美解决
pip3 install -i https://pypi.tuna.tsinghua.edu.cn/simple multiqc
echo "export PATH=\$PATH:/home/cqs/.local/bin" >> ~/.bashrc
source ~/.bashrc
which multiqc
# *************************************************************************************************************************************
## bcftools
## Download and install bcftools
## http://www.htslib.org/download/
cd ~/biosoft
mkdir bcftools-1.10.2 && cd bcftools-1.10.2
wget https://github.com/samtools/bcftools/releases/download/1.10.2/bcftools-1.10.2.tar.bz2
tar jxvf
cd bcftools-1.10.2/
./configure --prefix=/home/cqs/biosoft/mybin
make
make install
# *************************************************************************************************************************************
## tophat2
# Download and install TopHat
# https://ccb.jhu.edu/software/tophat/index.shtml
cd ~/biosoft
mkdir -p tophat-2.1.1 && cd tophat-2.1.1
#### readme: https://ccb.jhu.edu/software/tophat/manual.shtml
wget http://ccb.jhu.edu/software/tophat/downloads/tophat-2.1.1.Linux_x86_64.tar.gz
tar xzvf tophat-2.1.1.Linux_x86_64.tar.gz
ln -s tophat-2.1.1.Linux_x86_64 current
# *************************************************************************************************************************************
## hisat2
## Download and install HISAT
## https://daehwankimlab.github.io/hisat2/
cd ~/biosoft
mkdir hisat2-2.0.4 && cd hisat2-2.0.4
#### readme: https://ccb.jhu.edu/software/hisat2/manual.shtml
wget https://cloud.biohpc.swmed.edu/index.php/s/4pMgDq4oAF9QCfA/download
unzip hisat2-2.0.4-Linux_x86_64.zip
ln -s hisat2-2.0.4 current
## ~/biosoft/HISAT/current/hisat2-build
## ~/biosoft/HISAT/current/hisat2
# *************************************************************************************************************************************
## HTSeq
cd ~/biosoft
mkdir HTSeq && cd HTSeq
wget https://files.pythonhosted.org/packages/c4/04/b9b0c5514dcd09e64481e8ebc242aef162646b6de956ffb44595d1de0f69/HTSeq-0.12.4.tar.gz
chmod u+x HTSeq-0.12.4.tar.gz
tar zxvf HTSeq-0.12.4.tar.gz
ls
cd HTSeq-0.12.4/
python setup.py install
# 如报错如下:
# symlinking folders for python3
# Setup script for HTSeq: Failed to import 'numpy'.
# Please install numpy and then try again to install HTSeq.
# 解决方案:sudo apt-get install build-essential python2.7-dev python-numpy python-matplotlib
# 如果报错
pip3 install -i https://pypi.tuna.tsinghua.edu.cn/simple Cython pysam matplotlib HTseq
sudo python setup.py install
# 找到htseq-count位置
which htseq-count
# /usr/local/bin/htseq-count
/usr/local/bin/htseq-count --help
echo "export PATH=\$PATH:/usr/local/bin/htseq-count" >> ~/.bashrc
source ~/.bashrc
htseq-count --help
## ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_mouse/release_M1/
## http://hgdownload-test.cse.ucsc.edu/goldenPath/mm10/liftOver/
## GRCm38/mm10 (Dec, 2011)
## ls *bam |while read id;do ( ~/.local/bin/htseq-count -f bam $id genecode/mm9/gencode.vM1.annotation.gtf.gz 1>${id%%.*}.gene.counts ) ;done
## ls *bam |while read id;do ( ~/.local/bin/htseq-count -f bam -i exon_id $id genecode/mm9/gencode.vM1.annotation.gtf.gz 1>${id%%.*}.exon.counts ) ;done
# *************************************************************************************************************************************
# subread安装
mkdir -p ~/biosoft/subread && cd ~/biosoft/subread
wget https://nchc.dl.sourceforge.net/project/subread/subread-2.0.1/subread-2.0.1-source.tar.gz
tar zxvf subread-2.0.1-source.tar.gz
cd subread-2.0.1-source
# 查看说明书
cat ReadMe.txt
cd src
make -f Makefile.Linux
cd ~/biosoft/subread/subread-2.0.1-source/bin
./featureCounts
echo "export PATH=\$PATH:/home/cqs/biosoft/subread/subread-2.0.1-source/bin" >> ~/.bashrc
source ~/.bashrc
featureCounts
# *************************************************************************************************************************************
# SRA测序数据下载
# 我的笔记:https://www.jianshu.com/p/6819a16dee7a
# PMID: 27824034
# 文章地址:https://www.nature.com/articles/ncomms13347
# 数据地址:GSE81916
# 获得数据下载地址:
# 下载方式1
srapath SRR3589962
# https://sra-downloadb.be-md.ncbi.nlm.nih.gov/sos2/sra-pub-run-7/SRR3589962/SRR3589962.1
wget https://sra-downloadb.be-md.ncbi.nlm.nih.gov/sos2/sra-pub-run-7/SRR3589962/SRR3589962.1
# 下载方式2 PREFETCH
# 创建下载数据列表,将空格替换为换行符\n
echo SRR35899{56..62} | sed 's/ /\n/g' > SRR_Acc_List.txt
# 查看列表是否创建成功
cat SRR_Acc_List.txt
# 创建一个简单的循环脚本
vim prefetch.sh
# 选择insert模式
i
# 输入脚本内容,注意 #!/bin/bash是脚本的第一行内容,意思是该脚本通过bash运行
# 0、1和2分别表示标准输入、标准输出和标准错误信息输出,默认为标准输入,`1>$id.download.log 2>&1`表示将标准输入重定向到各ID对应的$id.download.log日志文件,并将错误信息也重定向至该文件
#
#!/bin/bash
cat SRR_Acc_List.txt | while read id;do prefetch $id 1>$id.download.log 2>&1;done
# 后台无挂断运行脚本
nohup bash prefetch.sh &
# 下载方式3 ASCP
# ENA地址:https://www.ebi.ac.uk/ena/browser/view/
# 检索关键词:PRJNA323422
# 下载tsv文件
# 获取fastq地址:
# ascp使用绝对路径
/home/caoqiansheng/.aspera/connect/bin/ascp -QT -l 300m -P33001 -i ~/.aspera/connect/etc/asperaweb_id_dsa.openssh era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR358/002/SRR3589962/SRR3589962_2.fastq.gz .
# 批量下载数据脚本
for i in {56..62}
do
a0='/home/caoqiansheng/.aspera/connect/bin/'
a1='ascp -QT -l 300m -P33001 -i ~/.aspera/connect/etc/asperaweb_id_dsa.openssh era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR358/00'
a2=$(($i % 10))
a3='/SRR35899'$i
a4='_1.fastq.gz .'
a5='_2.fastq.gz .'
echo $a0$a1$a2$a3$a3$a4
echo $a0$a1$a2$a3$a3$a5
done >> ascp.command
# 后台运行脚本
nohup bash ascp.command &
# 参考基因组下载 https://www.jianshu.com/p/9f06c3efb000
# genome index下载
# 每个比对软件的index都不同,需要根据genome序列构建索引,也可以在官网进行下载,但是需要注意基因组版本号
# hisat2 index下载网址 http://daehwankimlab.github.io/hisat2/download/
# 找到M. musculus的下载链接如下,迅雷下载会更快
wget https://genome-idx.s3.amazonaws.com/hisat/grcm38_genome.tar.gz
# HISAT2比对
for i in {59..62};do hisat2 -t -x /mnt/e/Work/bioinfo/public/index/mouse/hisat2/grcm38/genome -1 /mnt/e/Work/bioinfo/project/202009_RNAseq/data/SRR35899${i}_1.fastq.gz -2 /mnt/e/Work/bioinfo/project/202009_RNAseq/data/SRR35899${i}_2.fastq.gz -S /mnt/e/Work/bioinfo/project/202009_RNAseq/result/align/20200910mouse/SRR35899${i}.sam;done
# SAM文件转换为BAM
for i in `seq 59 62`
do
samtools view -S SRR35899${i}.sam -b > SRR35899${i}.bam
done
# 对排序后的bam统计flagstat
# 读取bam文件名,".bam"为删除
# ls *.bam | while read id ;do echo basename ${id} ".bam";done
# ls *.bam | while read id ;do echo $(basename ${id} ".bam").flagstat;done
ls *.bam |while read id ;do (samtools flagstat -@ 1 $id > $(basename ${id} ".bam").flagstat );done
mkdir flagstat && mv *.flagstat flagstat && cd flagstat
multiqc ./
# 构建shell文本处理脚本
cat > stat.sh
#!/bin/bash
cat *.flagstat | awk '{print $1}' | paste - - - - - - - - - - - - - > file1
# 77607517 16671207 0 0 75387881 60936310 30468155 30468155 56502696 57494864 1221810 832364 530657
# 134310379 28365145 0 0 130964009 105945234 52972617 52972617 98979648 100621038 1977826 1398380 907493
# 94264829 20737377 0 0 91921243 73527452 36763726 36763726 68525830 69723750 1460116 1023854 644490
# 111681106 24075844 0 0 109169544 87605262 43802631 43802631 82145504 83390620 1703080 1013088 643888
# 取行名
cut -d"+" -f 2 SRR3589959.flagstat | cut -d" " -f 3-90 > file2
# in total (QC-passed reads
# secondary
# supplementary
# duplicates
# mapped (97.14% : N/A)
# paired in sequencing
# read1
# read2
# properly paired (92.72% : N/A)
# with itself and mate mapped
# singletons (2.01% : N/A)
# with mate mapped to a different chr
# with mate mapped to a different chr (mapQ>=5)
# 取列名
ls *.flagstat | while read id ;do echo $(basename ${id} ".flagstat") ;done > file3
# SRR3589959
# SRR3589960
# SRR3589961
# SRR3589962
paste file3 file1 > file4
# 将file4行列转置
awk '{
for (i=1;i<=NF;i++){
if (NR==1){
res[i]=$i
}
else{
res[i]=res[i]" "$i
}
}
}END{
for(j=1;j<=NF;j++){
print res[j]
}
}' file4 > file5
# 在file2首行加入内容
sed '1i Index' file2 > file6
paste file6 file5 > stat.txt
rm file*
# Enter,Ctrl+C后运行脚本
bash stat.sh
# 排序,索引
for i in `seq 59 62`
do
samtools sort SRR35899${i}.bam -o SRR35899${i}_sorted.bam
samtools index SRR35899${i}_sorted.bam
done
# 将SAM转换为BAM,并排序构建索引,随后删除SAM文件
# for i in `seq 59 62`
# do
# samtools view -S SRR35899${i}.sam -b > SRR35899${i}.bam
# samtools sort SRR35899${i}.bam -o SRR35899${i}_sorted.bam
# samtools index SRR35899${i}_sorted.bam
# done
# rm *.sam
# 注释
for i in {59..62}
do
htseq-count -s no -f bam -r pos /mnt/e/Work/bioinfo/project/202009_RNAseq/result/align/20200910mouse/SRR35899${i}_sorted.bam /mnt/e/Work/bioinfo/public/Annotation/mouse/gencode/gencode.vM25.annotation.gff3 > /mnt/e/Work/bioinfo/project/202009_RNAseq/result/annotation/SRR35899${i}.count
done
# 代码运行报错
# Please Install PySam to use the BAM_Reader Class (http://code.google.com/p/pysam/)Error occured when reading beginning of BAM file.
# No module named pysam
# [Exception type: ImportError, raised in __init__.py:1086]
# 解决办法
# 下载pysam源代码
# 下载地址:https://pypi.org/project/pysam/#files
# 复制下载链接放入迅雷:https://files.pythonhosted.org/packages/99/5a/fc440eb5fffb5346e61a38b49991aa552e4b8b31e8493a101d2833ed1e19/pysam-0.16.0.1.tar.gz
cd ~/biosoft
mkdir pysam && cd pysam
wget https://files.pythonhosted.org/packages/99/5a/fc440eb5fffb5346e61a38b49991aa552e4b8b31e8493a101d2833ed1e19/pysam-0.16.0.1.tar.gz
tar zxvf pysam-0.16.0.1.tar.gz
cd pysam-0.16.0.1
python setup.py install
# 报错
# Traceback (most recent call last):
# File "setup.py", line 24, in <module>
# from setuptools import setup, find_packages
# ImportError: No module named setuptools
# python2环境下安装setuptools
sudo apt-get install python-setuptools
# python3环境下安装setuptools
sudo apt-get install python3-setuptools
# 再次执行安装
sudo python setup.py install
# 再次运行注释
# 构建脚本
cat > annotation.sh
#!/bin/bash
for i in {59..62}
do
# .sorted.bam地址
input="/mnt/e/Work/bioinfo/project/202009_RNAseq/result/align/20200910mouse/SRR35899${i}_sorted.bam"
# .gtf地址
annotation="/mnt/e/Work/bioinfo/public/Annotation/mouse/gencode/gencode.vM25.annotation.gff3"
# 输出文件地址
output="/mnt/e/Work/bioinfo/project/202009_RNAseq/result/annotation"
htseq-count -s no -f bam -r pos ${input} ${annotation} > ${output}/SRR35899${i}.count
done
# 运行
bash annotation.sh
# featureCounts计数
featureCounts -p -t exon -g gene_id -a /mnt/e/Work/bioinfo/public/Annotation/mouse/gencode/gencode.vM25.annotation.gff3 -o /mnt/e/Work/bioinfo/project/202009_RNAseq/result/count/all.id.txt /mnt/e/Work/bioinfo/project/202009_RNAseq/result/align/20200910mouse/SRR35899{59..62}_sorted.bam
# 运行后报错
# featurecounts segmentation fault (core dumped)
# 解决办法
# 下载二进制版本subread
rm -rf ~/biosoft/subread
mkdir -p ~/biosoft/subread && cd ~/biosoft/subread
wget https://nchc.dl.sourceforge.net/project/subread/subread-2.0.1/subread-2.0.1-Linux-x86_64.tar.gz
tar zxvf subread-2.0.1-Linux-x86_64.tar.gz
cd subread-2.0.1-Linux-x86_64
cd ~/biosoft/subread/subread-2.0.1-Linux-x86_64/bin
./featureCounts
echo "export PATH=\$PATH:/home/cqs/biosoft/subread/subread-2.0.1-Linux-x86_64/bin" >> ~/.bashrc
source ~/.bashrc
featureCounts
# 再次运行代码
featureCounts -p -t exon -g gene_id -a /mnt/e/Work/bioinfo/public/Annotation/mouse/gencode/gencode.vM25.annotation.gff3 -o /mnt/e/Work/bioinfo/project/202009_RNAseq/result/count/all.id.txt /mnt/e/Work/bioinfo/project/202009_RNAseq/result/align/20200910mouse/SRR35899{59..62}_sorted.bam
# 对all.id.txt.summary进行multiqc,查看Counts质控
multiqc ./all.id.txt.summary
# [INFO ] multiqc : This is MultiQC v1.9
# [INFO ] multiqc : Template : default
# [INFO ] multiqc : Searching : /mnt/e/Work/bioinfo/project/202009_RNAseq/result/count/all.id.txt.summary
# Searching 1 files.. [####################################] 100%
# [INFO ] feature_counts : Found 4 reports
# [INFO ] multiqc : Compressing plot data
# [INFO ] multiqc : Report : multiqc_report.html
# [INFO ] multiqc : Data : multiqc_data
# [INFO ] multiqc : MultiQC complete