1.下载 ensembl 参考基因组和注释:
#历史版本的下载网址
https://www.ensembl.org/info/website/archives/assembly.html
#最好选择primary_assembly及其对应的 gtf
wget ftp://ftp.ensembl.org/pub/release-99/fasta/mus_musculus/dna/Mus_musculus.GRCm38.dna.primary_assembly.fa.gz
wget ftp://ftp.ensembl.org/pub/release-99/gtf/mus_musculus/Mus_musculus.GRCm38.99.gtf.gz
2.STAR 生成索引文件
STAR --runThreadN 40 --runMode genomeGenerate --genomeDir ./Genome_index/ --genomeFastaFiles ~/workspace/STAR_mm10/Mus_musculus.GRCm38.dna.primary_assembly.fa --sjdbGTFfile ~/workspace/STAR_mm10/Mus_musculus.GRCm38.99.gtf
3.STAR Mapping
import subprocess
import os
# 定义输入和输出目录
output_directory = "./STAR_output/"
genome_directory = "/home/u24211510018/workspace/STAR_mm10/Genome_index"
fastq_directory = "./"
# 创建输出目录(如果不存在)
if not os.path.exists(output_directory):
os.mkdir(output_directory)
# 获取所有的 R1 文件,确保处理的是双端测序
fastq_files = [f for f in os.listdir(fastq_directory) if f.endswith('_R1.fq.gz')]
for fastq_R1 in fastq_files:
# 找到对应的 R2 文件
fastq_R2 = fastq_R1.replace('_R1.fq.gz', '_R2.fq.gz')
if fastq_R2 not in os.listdir(fastq_directory):
print(f"Warning: Could not find R2 for {fastq_R1}. Skipping...")
continue
# 获取前缀名(样本名)
prefix = fastq_R1.replace('_R1.fq.gz', '')
# 为当前样本创建输出文件夹
sample_output_dir = os.path.join(output_directory, prefix)
os.mkdir(sample_output_dir)
print(f"Currently mapping: {prefix}")
# 构建 STAR 命令
star_command = (
f"STAR --runThreadN 64 "
f"--genomeDir {genome_directory} "
f"--readFilesCommand zcat "
f"--outFilterType BySJout "
f"--outFilterMismatchNoverLmax 0.04 "
f"--outFilterMismatchNmax 999 "
f"--alignSJDBoverhangMin 1 "
f"--alignSJoverhangMin 8 "
f"--outFilterMultimapNmax 20 "
f"--alignIntronMin 20 "
f"--alignIntronMax 1000000 "
f"--alignMatesGapMax 1000000 "
f"--readFilesIn {os.path.join(fastq_directory, fastq_R1)} {os.path.join(fastq_directory, fastq_R2)} "
f"--outSAMtype BAM SortedByCoordinate "
f"--quantMode GeneCounts "
f"--outFileNamePrefix {os.path.join(sample_output_dir, prefix)}_"
)
# 运行 STAR 命令
subprocess.call(star_command, shell=True)
slurm 脚本:
#!/bin/bash
#SBATCH -J rawdata
#SBATCH -p dna
#SBATCH -N 1
#SBATCH --mem=25G
#SBATCH --cpus-per-task=5
#SBATCH -t 3-00:00:00
#SBATCH -o raw_data.out
#SBATCH --mail-type=ALL
#SBATCH --mail-user=User_mail
# 加载 Conda 环境
source /opt/app/conda/etc/profile.d/conda.sh
conda activate STAR
# 运行 Python 脚本
python Mapping.py
# 退出 Conda 环境
conda deactivate
4.FeatureCount生成 rawcount 文件
4.1使用 RSEQC 判断链特异性(有些文章中没有指出,需要自己判断一下)
conda install -c bioconda rseqc
infer_experiment.py -i cKO1_R1.fq.gz.bam -r ~/workspace/mm10.RefSeq.bed
结果如下:
Reading reference gene model /home/u24211510018/workspace/mm10.RefSeq.bed ... Done
Loading SAM/BAM file ... Total 200000 usable reads were sampled
This is PairEnd Data
Fraction of reads failed to determine: 0.0166
Fraction of reads explained by "1++,1--,2+-,2-+": 0.4946
Fraction of reads explained by "1+-,1-+,2++,2--": 0.4889
由此可以判断是非链特异性
其他情况:
Fraction of reads failed to determine: 0.02
Fraction of reads explained by "1++,1--,2+-,2-+": 0.03
Fraction of reads explained by "1+-,1-+,2++,2--": 0.94
负链特异性
Fraction of reads failed to determine: 0.01
Fraction of reads explained by "1++,1--,2+-,2-+": 0.95
Fraction of reads explained by "1+-,1-+,2++,2--": 0.04
正链特异性
4.2对生成的Aligned.sortedByCoord.out.bam文件进行排序
for bam in cKO*_Aligned.sortedByCoord.out.bam cWT*_Aligned.sortedByCoord.out.bam; do
prefix=$(basename $bam .bam) # 去掉 .bam 后缀
samtools sort -o ${prefix}_sorted.bam $bam
echo "Sorted $bam into ${prefix}_sorted.bam"
done
4.3FeatureCount
#Note: new version of FeatureCounts need a parameter: --countReadPairs
featureCounts -T 40 \
-p \
-B \
-C \
-M \
-Q 10 \
-s 0 \
--countReadPairs \
-t exon \
-g gene_id \
-a ~/workspace/STAR_mm10/Mus_musculus.GRCm38.99.gtf \
-o miR137_gene_counts.txt \
*_sorted.bam
参数解释:
-T 12:使用 40 个线程并行处理。
-p:双端模式(paired-end mode)。
-B:仅计数成对比对成功的读段(R1 和 R2 都必须比对)。
-C:要求成对的读段比对到同一染色体。
-s 0:非链特异性计数。
-t exon:只计数外显子区域。
-g gene_id:以gene_id字段分组计数。
-a annotation.gtf:指定 GTF 格式的基因注释文件。
-o counts.txt:输出文件为counts.txt。
-Q 10:过滤掉比对质量分数(MAPQ)< 10 的读段。
其中关于 s参数的设置,详细介绍一下:
> -s <int or string> Perform strand-specific read counting. A single integer
>
> value (applied to all input files) or a string of comma-
>
> separated values (applied to each corresponding input
>
> file) should be provided. Possible values include:
>
> 0 (unstranded), 1 (stranded) and 2 (reversely stranded).
>
> Default value is 0 (ie. unstranded read counting carried
>
> out for all input files).
0(unstranded):非链特异性,默认值。对所有输入文件执行非链特异性读段计数。
1(stranded):链特异性,读段的方向与转录方向一致。
2(reversely stranded):反向链特异性,读段的方向与转录方向相反。
5.差异表达基因分析
rm(list=ls())
setwd("~/workspace/BI_project/Bulk-RNA-seq/StarMapping/FeatureCount/")
# 转换 ID -------------------------------------------------------------------
# 读取 count 文件
count_data <- read.table("counts.txt", header = TRUE, row.names = 1, sep = "\t",skip = 1)
count_data <- read.table("miR137_gene_counts.txt", header = TRUE, row.names = 1, sep = "\t")
head(count_data)
library(biomaRt)
library(rtracklayer)
## 1.读取 GTF 文件
{
# gtf_file <- "~/workspace/STAR_mm10/Mus_musculus.GRCm38.99.gtf"
#
# # 使用 rtracklayer 读取 GTF 文件
# gtf_data <- import(gtf_file)
#
# # 提取基因信息(仅保留基因相关行)
# gene_annotations <- as.data.frame(gtf_data[gtf_data$type == "gene", c("gene_id", "gene_name")])
#
# # 查看前几行
# head(gene_annotations)
#
# gene_annotations = gene_annotations[,c(6,7)]
}
## 2.use BiomaRt
# -------------------------------------------------------------------------
annotation <- read.table("mart_export.txt", header = TRUE, sep = ",")
# 将行名(ENSEMBL ID)转换为一列,方便合并
count_data$Ensembl_ID <- rownames(count_data)
# 按 Ensembl ID 合并注释信息
merged_data <- merge(count_data, annotation, by.x = "Ensembl_ID", by.y = "Gene.stable.ID", all.x = TRUE)
# 查看结果
head(merged_data)
# -------------------------------------------------------------------------
merged_data = merged_data[,-c(2,3,4,5,6,14)]
head(merged_data)
# -------------------------------------------------------------------------
merged_data_unique <- merged_data %>%
rowwise() %>%
mutate(total_count = sum(c_across(starts_with("cKO")), c_across(starts_with("cWT")))) %>%
group_by(Gene.name) %>%
slice_max(order_by = total_count, n = 1, with_ties = FALSE) %>% # 保留计数最多的行
ungroup() %>%
select(-total_count)
head(merged_data_unique)
##检查重复
# 检查 Gene.name 列中的重复值数量
duplicate_count <- sum(duplicated(merged_data_unique$Gene.name))
cat("Gene.name 中的重复值数量:", duplicate_count, "\n")
# 如果有重复值,列出所有重复的 Gene.name
if (duplicate_count > 0) {
duplicated_genes <- merged_data_unique$Gene.name[duplicated(merged_data_unique$Gene.name)]
cat("重复的 Gene.name 值:\n")
print(unique(duplicated_genes))
# 查看这些重复值对应的行
duplicate_rows <- merged_data_unique[merged_data_unique$Gene.name %in% duplicated_genes, ]
cat("包含重复 Gene.name 的行:\n")
print(duplicate_rows)
}
##检查 NA 值
# 检查 Gene.name 列中 NA 值的数量
na_count <- sum(is.na(merged_data_unique$Gene.name))
cat("Gene.name 中的 NA 值数量:", na_count, "\n")
# 如果有 NA 值,查看包含 NA 的行
if (na_count > 0) {
na_rows <- merged_data_unique[is.na(merged_data_unique$Gene.name), ]
cat("包含 NA 值的行:\n")
print(na_rows)
}
# 如果 Gene.name 中有 NA 值,用 Ensembl_ID 替代
merged_data_unique$Gene.name[is.na(merged_data_unique$Gene.name)] <- merged_data_unique$Ensembl_ID[is.na(merged_data_unique$Gene.name)]
##检查空字符串
# 检查 Gene.name 列中空字符串的数量
empty_count <- sum(merged_data_unique$Gene.name == "")
cat("Gene.name 中的空字符串数量:", empty_count, "\n")
# 如果有空字符串,查看包含空字符串的行
if (empty_count > 0) {
empty_rows <- merged_data_unique[merged_data_unique$Gene.name == "", ]
cat("包含空字符串的行:\n")
print(empty_rows)
}
# 如果 Gene.name 中是空字符串 (""),用 Ensembl_ID 替代
merged_data_unique$Gene.name[merged_data_unique$Gene.name == ""] <- merged_data_unique$Ensembl_ID[merged_data_unique$Gene.name == ""]
head(merged_data_unique)
names(merged_data_unique) = c("Ensembl_ID","cKO1","cKO2","cKO3","cWT1","cWT2","cWT3","Gene_name")
# -------------------------------------------------------------------------
library(DESeq2)
# 导入表达矩阵
count_data <- as.matrix(merged_data_unique[, 2:7]) # 假设前1列是基因ID
rownames(count_data) <- merged_data_unique$Gene_name
# 创建分组信息
col_data <- data.frame(
row.names = colnames(count_data),
condition = factor(c("cKO", "cKO", "cKO", "cWT", "cWT", "cWT"), levels = c("cWT", "cKO")) #第一个因子水平即为参考水平(对照组)
)
dds <- DESeqDataSetFromMatrix(
countData = count_data,
colData = col_data,
design = ~ condition
)
head(dds)
dds <- DESeq(dds)
resultsNames(dds)
res <- results(dds, contrast = c("condition", "cKO", "cWT")) # results(dds, contrast = c("condition", "实验组", "对照组"))
resOrdered <- res[order(res$padj), ]
DEG <- as.data.frame(resOrdered)
DEG_deseq2 <- na.omit(DEG)
head(DEG_deseq2)
#write.csv(DEG_deseq2, file = "differential_expression_cKO_vs_cWT.csv")
plotMA(res, ylim = c(-2, 2))
#其中mart_export.txt文件是在 Ensembl 网站上的BioMart 上下载的,注意下载的 annotation 需要和基因组版本一致!
一些图: