10X单细胞转录组测序—常规流程

文章中其中用到的文件获取方式https://pan.baidu.com/s/1HCsHRXNX9Il8u8RIPXSEug?pwd=2626

1.上游分析

1.1 安装conda、sratoolkit、cellranger 关

#安装conda，最好去官网下载最新版
cd ~
wget https://repo.anaconda.com/archive/Anaconda3-2021.11-Linux-x86_64.sh
bash Anaconda3-2021.11-Linux-x86_64.sh

#安装sratoolkit，用于下载sra数据，同时也会安装fastq-dump
#去官网下载最新版本，conda安装的不好用，https://github.com/ncbi/sra-tools/wiki/01.-Downloading-SRA-Toolkit
#conda install -c daler sratoolkit

#安装cellranger
#官网下载最新版cellranger
https://support.10xgenomics.com/single-cell-gene-expression/software/downloads/latest
#执行以下命令进行安装
tar -xzvf cellranger-6.1.2.tar
#解压完即可使用，但需要添加到环境变量

1.2 使用conda进行常用软件安装注

conda install -y -c bioconda aspera-cli bwa samtools bedtools bowtie2 fasterq-dump hisat2 cutadapt multiqc

1.3 参考基因组下载我

##1.2 mm10
wget https://cf.10xgenomics.com/supp/cell-exp/refdata-gex-mm10-2020-A.tar.gz

md5sum refdata-gex-mm10-2020-A.tar.gz
#886eeddde8731ffb58552d0bb81f533d refdata-gex-mm10-2020-A.tar.gz
tar -xzvf refdata-gex-mm10-2020-A.tar.gz

###1.3 hg38
wget https://cf.10xgenomics.com/supp/cell-exp/refdata-gex-GRCh38-2020-A.tar.gz

md5sum refdata-gex-GRCh38-2020-A.tar.gz
#dfd654de39bff23917471e7fcc7a00cd refdata-gex-GRCh38-2020-A.tar.gz
tar -xzvf refdata-gex-GRCh38-2020-A.tar.gz

1.4 文件夹创建的

mkdir 1.sra 2.raw_fastq 3.cellranger

1.5 数据下载公

https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE112302
#SRA Run Selector里面选择需要的数据下载，在该篇文章中是下载6个tumor数据。选择要下载的数据样本后点击Accession List获取相应的id号。并将其命名为id，保存在1.sra文件夹中。

cd 1.sra
cat id | while read id;do (prefetch $id &);done

1.6 SRA转fastq 众

cd 2.fastq
ln -s ../1.sra/SRR* ./
ls SRR* |while read id;do (nohup fasterq-dump -O ./ --split-files -e 6 ./$id --include-technical & );done

#上一步运行完会非常占用空间，可压缩节省空间。
ls SRR*fastq | while read id;do gzip $id;done

1.7 cellranger count流程号

##修改文件名，改成cellranger可识别的文件名。
cat ../1.sra/id |while read id ;do (mv ${id}_1*.gz 
${id}_S1_L001_I1_001.fastq.gz;mv ${id}_2*.gz ${id}_S1_L001_R1_001.fastq.gz;mv 
${id}_3*.gz ${id}_S1_L001_R2_001.fastq.gz);done

##运行cellranger
#指定参考基因组
ref=/home/data/vip10t17/software_install/10x_refernce/refdata-gex-GRCh38-2020-A
ls *.fastq.gz | cut -d "_" -f 1 |uniq |while read id;do cellranger count --id $id --transcriptome $ref --fastqs 2.raw.fastq/ --sample $id --nosecondary --localcores 10 --localmem 30;done

##参数解读
--id 指定输出文件夹的名字
--transcriptome 指定参考基因组的路径
--sample 指定需要处理的fastq文件的前缀
--expect-cell 指定预期的细胞数目，默认参数是3000个
--localcores 指定计算的核心数
--mempercore 指定内存大小 GB
--nosecondary 不需要进行降维聚类（因为后期会用R可视化）
--chemistry，默认是自动识别chemistry，但是有些时候识别不出chemistry的时候，需要加入参数特别标明

1.8 结果解读 2

web_summary.html：必看，官方说明 summary HTML file ，包括许多QC指标，预估细胞数，比对率等；
metrics_summary.csv：CSV格式数据摘要，可以不看；
possorted_genome_bam.bam：比对文件，用于可视化比对的reads和重新创建FASTQ文件，可以不看；
possorted_genome_bam.bam.bai：索引文件；
filtered_gene_bc_matrices：是重要的一个目录，下面又包含了 barcodes.tsv.gz、features.tsv.gz、matrix.mtx.gz，是下游Seurat、Scater、Monocle等分析的输入文件，是经过Cell Ranger过滤后构建矩阵所需要的所有文件；
filtered_feature_bc_matrix.h5：过滤掉的barcode信息HDF5 format，可以不看；
raw_feature_bc_matrix：原始barcode信息，未过滤的可以用于构建矩阵的文件，可以不看；
raw_feature_bc_matrix.h5：原始barcode信息HDF5 format，可以不看；
analysis：数据分析目录，下面又包含聚类clustering（有graph-based & k-means）、差异分析diffexp、主成分线性降维分析pca、非线性降维tsne，因为我们自己会走Seurat流程，所以不用看；
molecule_info.h5：可用于整合多样本，使用cellranger aggr函数；
cloupe.cloupe：官方可视化工具Loupe Cell Browser 输入文件，无代码分析的情况下使用，会代码的同学通常用不到。
将所有结果中的filtered_gene_bc_matrices文件夹保存在自己电脑output文件夹下，进行下游分析

2. 下游分析

2.1 R包安装、加载 6

library(Seurat)
library(ggplot2)
library(clustree)
library(cowplot)
library(dplyr)
library(harmony)
library(SingleR)
library(celldex)
library(EnhancedVolcano)
library(tidyverse)
library(ggrepel)
library(rgl)
library(fgsea)
library(clustree)
library(patchwork)

2.2 数据读取过滤号

#设置工作路径
setwd('E:/project/test/')

dir='E:/project/test/cellranger' 
#设置样本名
samples=c("100","400")
samples
#获取单细胞数据
sceList = lapply(samples,function(pro){ 
  print(pro) 
  sce=CreateSeuratObject( Read10X(file.path(dir,pro)), 
                          project = pro,
                          min.cells = 5,
                          min.features = 200) 
  return(sce)
})
#会自动读取output文件夹下所有的文件，同时进行数据过滤，
#一个基因最少细胞表达数为5，一个细胞最少基因表达量为300。

#合并数据
PBMC.all=merge(x=sceList[[1]],
              y=sceList[[2]],
              add.cell.ids = c("100","400"))

#计算^MT-线粒体基因所占的比例,人：全大写，小鼠：全小写
PBMC.all[["percent.mt"]] = PercentageFeatureSet(PBMC.all,pattern = "^mt-")
#计算血红细胞相关基因比例，人：全大写，小鼠：首字母大写 其余小写
PBMC.all[["percent.hb"]] = PercentageFeatureSet(PBMC.all,pattern = "^Hb[^(p)]")
#分析核糖体基因。人：全大写，小鼠：首字母大写 其余小写
PBMC.all[["percent.rp"]] = PercentageFeatureSet(PBMC.all,pattern = "^Rp[sl]")

VlnPlot(PBMC.all, features = c("nFeature_RNA", "nCount_RNA", "percent.mt",
                               "percent.hb", "percent.rp"), ncol = 3)
#其中nFeature_RNA：一个细胞中检测到的基因总数，nCount_RNA：一个细胞中检测到的分子数

#根据数据的分布进行过滤，主要过滤极值，其中mt部分一般是小于10，但也不绝对
#肌肉细胞的线粒体基因占比最高可达50%，肿瘤区域的正常细胞线粒体含量有时也在30%以上
#但为了尽可能保留更多的数据可以提高这个数值，HB部分一般是小于3。
PBMC.all = subset(PBMC.all,subset = nFeature_RNA<7500&percent.mt<15&percent.hb<3&percent.rp<50)
#通过质控检测数据过滤的好坏
plot1 = FeatureScatter(PBMC.all,feature1 = "nCount_RNA",
                       feature2 = "percent.mt")+NoLegend()
plot1#会呈现出三角的形成
plot2 = FeatureScatter(PBMC.all,feature1 = "nCount_RNA",
                       feature2 = "percent.hb")+NoLegend()
plot2#会呈现成横线状
plot3 = FeatureScatter(PBMC.all,feature1 = "nCount_RNA",
                       feature2 = "nFeature_RNA")+NoLegend()
plot3#会呈现出圆弧状
#将三张图合并显示
CombinePlots(plots = list(plot1, plot2,plot3))

#计算基因reads数贡献值
C <- PBMC.all@assays$RNA@counts
C <- Matrix::t(Matrix::t(C)/Matrix::colSums(C)) * 100
most_expressed <- order(apply(C, 1, median), decreasing = T)[20:1]

boxplot(t(as.matrix(C[most_expressed, ])), cex = 0.1, las = 1, xlab = "% total count per cell", 
        col = (scales::hue_pal())(20)[20:1], horizontal = TRUE)
#结果中根据占比，删除占比较大的基因，其中MALAT1的高检测可能是一个技术问题

dim(PBMC.all)
# 过滤MALAT1基因
PBMC.all <- PBMC.all[!grepl("Malat1", rownames(PBMC.all)), ]
# 过滤线粒体基因
PBMC.all <- PBMC.all[!grepl("^mt-", rownames(PBMC.all)), ]
# 过滤核糖体基因
PBMC.all <- PBMC.all[!grepl('^Rp[sl]', rownames(PBMC.all)), ]
# 过滤血红蛋白基因
PBMC.all <- PBMC.all[!grepl("^Hb[^(p)]", rownames(PBMC.all)), ]
dim(PBMC.all)

#PBMC.all1用于保存原始数据，方便后续调用
saveRDS(PBMC.all, "E:/project/test/PBMC.all1.rds")

2.3 标准化宇

#标准化 通过总表达值对每个细胞的基因表达值归一化
PBMC.all <- NormalizeData(PBMC.all, 
                     normalization.method = "LogNormalize",
                     scale.factor = 1e4)

2.4 寻找高变基因宙

#寻找差异基因
PBMC.all = FindVariableFeatures(PBMC.all)

# 选择前10个，并将前10的高变异基因在图中标注出来
top10 <- head(VariableFeatures(PBMC.all), 10)
plot4 = VariableFeaturePlot(object = PBMC.all)+NoLegend()
plot4
plot5 <- LabelPoints(plot = plot4, points = top10, repel = TRUE)
plot5
#合并显示
CombinePlots(plots = list(plot4, plot5))

2.5 （可选）可通过以下方法修改组织样本信息

timePoints <-ifelse(PBMC.all@meta.data[["orig.ident"]] == 'SRR7722939', 'PBMC_Pre', 
                    ifelse(PBMC.all@meta.data[["orig.ident"]] == 'SRR7722940', 'PBMC_EarlyD27',
                           ifelse(PBMC.all@meta.data[["orig.ident"]] == 'SRR7722941'|PBMC.all@meta.data[["orig.ident"]] == 'SRR7722942', 'PBMC_RespD376', 'PBMC_ARD614')))

#PBMC <- AddMetaData(object = PBMC.all, metadata = apply(dataPBMC, 2, sum), col.name = 'nUMI_raw')
PBMC.all <- AddMetaData(object = PBMC.all, metadata = timePoints, col.name = 'TimePoints')

2.6 细胞周期分析

#细胞周期分析
mouse_cell_cycle_genes <- readRDS("E:/project/celldex数据库/mouse_cell_cycle_genes.rds")
s.genes=mouse_cell_cycle_genes[[1]]
g2m.genes=mouse_cell_cycle_genes[[2]]
PBMC.all <- CellCycleScoring(PBMC.all, s.features = s.genes, g2m.features = g2m.genes)

mouse_cellmarker=read.csv("E:/project/cellmarker数据库/Cell_marker_Mouse.csv",header = TRUE)
mouse_cellmarker=unique(mouse_cellmarker$Symbol)

2.7 PCA分析

#使用scaleData改变每个基因的表达，使细胞间的平均表达为0；缩放每个基因的表达，
#使细胞间的差异为1，这样高表达基因就不会影响后续分析
#这是pca等降维技术之前的标准预处理步骤，同时去除细胞周期相关基因
PBMC.all = ScaleData(PBMC.all, vars.to.regress = c("S.Score", "G2M.Score"),
                      features = rownames(PBMC.all),split.by = "orig.ident")

PBMC.all <- RunPCA(PBMC.all, npcs=30,features = mouse_cellmarker,verbose = FALSE)
#其中npcs要设置的大一点，在后面主成分分析的时候会受限于这个数值

#使用Harmony进行整合，消除样本间的批次效应
PBMC.all <- RunHarmony(PBMC.all,'orig.ident')

#查看去除批次效应前后的结果
DimPlot(PBMC.all,reduction = "pca",group.by = "orig.ident")+NoLegend()
DimPlot(PBMC.all,reduction = "harmony",group.by = "orig.ident")+NoLegend()

#用热图查看前500个细胞在前18个PCA中的热图,但分群较多，分三组展示
DimHeatmap(PBMC.all, dims = 1:6, cells = 500, balanced = TRUE)
DimHeatmap(PBMC.all, dims = 7:12, cells = 500, balanced = TRUE)
DimHeatmap(PBMC.all, dims = 13:18, cells = 500, balanced = TRUE)
#PBMC.all3用于亚群数分析，确定resolution
saveRDS(PBMC.all, "E:/project/lc_test/PBMC.all3.rds")
#PBMC.all=readRDS("E:/project/lc_test/PBMC.all3.rds")

2.8 非线性降维

#使用确定好的resolution进行后续分析
PBMC.all = FindNeighbors(PBMC.all,reduction = "harmony",dims = 1:30)
PBMC.all <- FindClusters(object = PBMC.all,method = "igraph",resolution = 0.1)
#进行TSNE和UMAP降维
#若前期进行了harmony整合，需要设置reduction = "harmony"，
#若前期没有进行harmony整合，需要设置reduction = "pca"
PBMC.all <- RunTSNE(PBMC.all,dims = 1:30,reduction = "harmony")
PBMC.all <- RunUMAP(PBMC.all,dims = 1:30,reduction = "harmony")

#TSNE和UMAP降维结果可视化
#可以传入split.by = "orig.ident"，按照样品名进行拆分再可视化，
#也可以传入group.by = "orig.ident"，按照样品名进行整体可视化。以查看
#之前进行的harmony整合是否合理
DimPlot(PBMC.all, reduction = "umap",label = T) 
DimPlot(PBMC.all, reduction = "umap", split.by = "orig.ident")
DimPlot(PBMC.all, reduction = "umap", group.by = "orig.ident")

DimPlot(PBMC.all, reduction = "tsne",label = T) 
DimPlot(PBMC.all, reduction = "tsne", split.by = "orig.ident")
DimPlot(PBMC.all, reduction = "tsne", group.by = "orig.ident")

2.9 分析各亚群的marker基因

#确定每个分群相应的marker基因
markers = FindAllMarkers(PBMC.all,only.pos = TRUE,min.pct = 0.1,
                         logfc.threshold = 0.2,return.thresh = 0.01)
saveRDS(markers, "E:/project/lc_test/markers.rds")
top10 <- markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_log2FC)
DoHeatmap(PBMC.all, features = top10$gene)
#可分析特定基因在降维图中的表达情况
FeaturePlot(PBMC.all,features = "GAPDH")

2.10 singleR 预测亚群名称

#为singleR获取可识别的数据集
PBMC.all_for_SingleR <- GetAssayData(PBMC.all, slot="data")
clusters=PBMC.all@meta.data$seurat_clusters
#singleR预测亚群名称
#获取用来自动预测亚群的数据库，可用的数据库在celldex R包里

#开始预测
load("E:/project/celldex数据库/MouseRNAseqData.Rdata")
pred.mouseImmu_M <- SingleR(test = PBMC.all_for_SingleR, ref = MouseRNAseqData, 
                          labels = MouseRNAseqData$label.main,
                          method = "cluster", clusters = clusters, 
                          assay.type.test = "logcounts", assay.type.ref = "logcounts")
print(plotScoreHeatmap(pred.mouseImmu_M,show_colnames =TRUE))

load("E:/project/celldex数据库/ImmGenData.Rdata")
pred.mouseImmu_I <- SingleR(test = PBMC.all_for_SingleR, ref = ImmGenData, 
                          labels = ImmGenData$label.main,
                          method = "cluster", clusters = clusters, 
                          assay.type.test = "logcounts", assay.type.ref = "logcounts")
print(plotScoreHeatmap(pred.mouseImmu_I,show_colnames =TRUE))

 #获取预测结果
cellType=data.frame(ClusterID=levels(PBMC.all@meta.data$seurat_clusters),
                    ref.se=pred.mouseImmu_M$labels)
#可通过 fix(cellType) 对亚群名称进行修改

#将预测到的细胞亚群传给原始数据集
new.cluster.ids <- cellType$ref.se
names(new.cluster.ids) <- levels(PBMC.all)
PBMC.all <- RenameIdents(PBMC.all, new.cluster.ids)
DimPlot(PBMC.all, reduction = 'umap', 
        label = TRUE, pt.size = 0.5)
###若分群不明显，可将所有细胞分为肿瘤细胞和免疫细胞，对免疫细胞重新进行分析，也可通过marker基因的方式，人工确定亚群名称，也可结合数据库预测。
saveRDS(PBMC.all, "E:/project/lc_test/PBMC.all4.rds")