将 gff 文件转成 gtf (featurecounts需要使用gtf文件)
gffread coreset.gff -T -o amur_ide.gtf
# -o write the output records into <outfile> instead of stdout
#-T main output will be GTF instead of GFF3
构建参考基因组的索引文件
hisat2-build -p 8 genome.fa amur_ide
hisat2批量比对
for i in 39 40 41 42 43 44
do
nohup hisat2 -x /home/genome_index/amur_ide -1 SRR75089${i}_1.fq -2 SRR75089${i}_2.fq | samtools view -S -b > xx.bam &
done
bam文件排序
samtools sort XX.bam -o xxx_sorted.bam
featurecounts 定量
for i in 39 40 41 42 43 44
do
nohup featureCounts -p -a /home/jiamj/analysis/ref/TAIR10.gtf -o ${i}_counts.txt /home/jiamj/analysis/clean/${i}_sorted.bam &
done
-p If specified, libraries are assumed to contain paired-end reads. For any library that contains paired-end reads, the 'countReadPairs' parameter controls if read pairs or reads should be counted
结果包含有 geneid,染色体位置,基因起始结束的位置以及基因的 count 数
featureCounts进行fpkm标准化
countdata <- read.csv("countdata.csv")
#countdata.csv是提取了上一步的counts数据以及gene length
rownames(countdata) <- countdata[,1]
countdata <- countdata[,-1]
kb <- countdata$length / 1000
count <- countdata[,1:8]
rpk <- count / kb
tpm <- t(t(rpk)/colSums(rpk) * 1000000)
fpkm <- t(t(rpk)/colSums(count) * 10^6)
#想计算数据框中每列的总和,使用colSums函数。
write.table(fpkm,file="eight_tissues_fpkm.xls",sep="\t",quote = F)