Seurat有V5,V4版本,它们的数据格式略微有点不同,如果想使用不同版本,可放在不同的R包保存路径中,避免冲突。
而且安装这个包前,需要把它相关的依赖包安装上,才能安装成功。
rm(list=ls())
options(stringsAsFactors = F)
# 创建个人库
#dir.create("~/R_libs/SeuratV4", recursive = TRUE)
## 你需要先安装这个
lib = "~/R_libs/SeuratV4"
.libPaths(c("~/R_libs/SeuratV4", .libPaths()))
# namespace ‘Matrix’ 1.6-0 is being loaded, but >= 1.6.4 is required
packageVersion("Matrix")
#https://cran.r-project.org/src/contrib/Archive/Matrix/
install.packages("~/R_libs/SeuratV4/Matrix_1.6-4.tar.gz",repos = NULL, type="source",lib ="~/R_libs/SeuratV4")
install.packages("SeuratObject",lib ="~/R_libs/SeuratV4" )
#Tools for Single Cell Genomics • Seurat https://satijalab.org/seurat/v
remotes::install_version("SeuratObject", "4.1.4", repos = c("https://satijalab.r-universe.dev", getOption("repos")))
remotes::install_version("Seurat", "4.4.0", repos = c("https://satijalab.r-universe.dev", getOption("repos")))
加载包
# 加载必要的包
# 使用时先改库路径
.libPaths(c("~/R_libs/SeuratV4", .libPaths()))
library(Matrix)#1.6-4,一般安装完需要重启下R
library(Seurat)
packageVersion("Seurat") # 应返回 4.4.0 或 4.3.x
library(tidyverse)
library(patchwork)
library(ggplot2)
library(cowplot)
library(data.table)
library(dplyr)
加载数据
###1.加载数据
#单细胞数据保存的目录
dir='/home/dyhscRNA'
#samples=list.files( dir )
samples <-c("GSM6213958","GSM6213959","GSM6213961","GSM6213976","GSM6213980","GSM6213983")
#前面三个是PD-1,后面三个untreated
samples
# dir/pro/outs/filtered_gene_bc_matrices/hg19/
# 里面有 matrix.mtx, features.tsv, barcodes.tsv
###2. 创建Seurat对象
sceList = lapply(samples,function(pro){
# pro=samples[1]
print(pro)
tmp = Read10X(file.path(dir,pro ))
if(length(tmp)==2){
ct = tmp[[1]]
}else{ct = tmp}
sce =CreateSeuratObject(counts = ct ,
project = pro ,
min.cells = 5,
min.features = 300 )
return(sce)
})
do.call(rbind,lapply(sceList, dim))
sce.all=merge(x=sceList[[1]],
y=sceList[ -1 ],
add.cell.ids = samples )
sce.all@assays$RNA@counts[1:5,1:5]
save(sce.all,file = "sce.all")
load("sce.all")
table(sce.all$orig.ident)
sce.all@meta.data[1:3,1:3]
### 3. 数据预处理
#统计一些关键基因的比例,如线粒体基因,核糖体基因,红细胞基因的比例,根据这个比例进一步过滤
{
sce.all[["percent.mt"]] <- PercentageFeatureSet(object = sce.all, pattern = "^MT-") # 人是MT,小鼠是mt
sce.all[["percent.rb"]] <- PercentageFeatureSet(object = sce.all, pattern = "^RP[SL]") # 计算核糖体基因比例
HB.genes <- c("HBA1","HBA2","HBB","HBD","HBE1","HBG1","HBG2","HBM","HBQ1","HBZ") # 计算红细胞基因比例
HB.genes <- CaseMatch(HB.genes, rownames(sce.all))
sce.all[["percent.HB"]] <- PercentageFeatureSet(object = sce.all, features=HB.genes)
# Visualize QC metrics as a violin plot
# 对 pbmc 对象做小提琴图,分别为基因数,细胞数和线粒体占比,核糖体基因占比,红细胞基因占比
VlnPlot(object = sce.all, features = c("nFeature_RNA", "nCount_RNA", "percent.mt","percent.rb","percent.HB"), ncol = 3)
# 接下来,根据图片中基因数和线粒体数,分别设置过滤参数,这里基因数 200-2500,线粒体百分比为小于 5%
sce.all <- subset(x = sce.all, subset = nFeature_RNA > 200 & nFeature_RNA < 7500)
}
#但这里我发现这个数据的这部分基因奇奇怪怪的,且每个样本的差异太大了,不过滤了。
#所以直接进行下一步
#数据归一化处理,LogNormalize目的是让整体的数据服从正态分布
sce.all<-NormalizeData(sce.all,
normalization.method="LogNormalize",
scale.factor=1e4)
#查找高变基因,指一些基因在细胞中表达的浮动比较大,这些往往是我们后续分群的时候需要关注的
# 识别高变基因
sce.all <- FindVariableFeatures(sce.all, selection.method = "vst", nfeatures = 2000)
#有 3 种选择高表达变异基因的方法,可以通过 selection.method参数来选择,它们分别是:vst(默认值), mean.var.plot (mvp) 和 dispersion (disp)。
# Identify the top highly variable genes
top <- head(x = VariableFeatures(object = sce.all), 5)
# plot variable features with and without labels
plot1 <- VariableFeaturePlot(object = sce.all)
plot2 <- LabelPoints(plot = plot1, points = top, repel = TRUE)
plot2
#基因归一化
# 这里设置对所有的基因都做了scale,但是需要知道的是,其实后续的分析都是基于高变基因的,因此,使用默认参数就可以了,而且提升效率
all.genes <- rownames(x = sce.all)
sce.all <- ScaleData(object = sce.all, features = all.genes) # 对所有基因做归一化处理
# 线性PCA降维
sce.all <- RunPCA(object = sce.all, features = VariableFeatures(object = sce.all))
DimPlot(object = sce.all, reduction = "pca")
VizDimLoadings(sce.all, dims = 1:2, reduction = "pca")
DimHeatmap(sce.all, dims = 1:15, cells = 500, balanced = TRUE)
# 鉴定数据集的可用维度,虚线以上的为可用维度
sce.all <- JackStraw(object = sce.all, num.replicate = 100)
sce.all <- ScoreJackStraw(object = sce.all, dims = 1:20)
JackStrawPlot(object = sce.all, dims = 1:20)
JackStrawPlot(object = sce.all, dims = c(1:15))
# 另外一种鉴定手段是绘制所有PC的分布点图,拐点处的pc作为选定数目
ElbowPlot(sce.all)
saveRDS(sce.all,'sc_int.rds')#太大了,内存
rm(list=ls())
merged_seurat <- readRDS('sc_int.rds') # 读取 rds
## 聚类
merged_seurat <- FindNeighbors(merged_seurat, dims = 1:20)
merged_seurat <- FindClusters(merged_seurat, resolution = 1.2)
# UMAP降维
merged_seurat <- RunUMAP(merged_seurat, dims = 1:20)
DimPlot(merged_seurat,reduction = 'umap',label=TRUE)