基本流程:数据导入,QC去除低质量细胞,归一化,由于矩阵过于稀疏所以选取top1000~2000,对每个基因进行zscore,线性降维和非线性降维,聚类(基于线性降维的结果进行KNN,SNN)
rm( list = ls())
library(dplyr)
library(Seurat)
library(patchwork)
# Load the PBMC dataset
pbmc.data <- Read10X(data.dir = "F:/class/test/filtered_gene_bc_matrices/hg19/")
#### Method1对于不规范数据
dir.10x = 'F:/class/test/filtered_gene_bc_matrices/hg19/'
#首先处理genelist文件,考虑到只需要第二行
genes <- read.table(paste0(dir.10x, 'genes.tsv'), stringsAsFactors=F, header=F)$V2
genes <- make.unique(genes, sep = '.')#将重复的以一个点分开
#barcode是当列矩阵
barcodes <- readLines(paste0(dir.10x, 'barcodes.tsv'))
mtx <- Matrix::readMM(paste0(dir.10x, 'matrix.mtx'))
mtx <- as(mtx, 'dgCMatrix')#格式转换
colnames(mtx) = barcodes#赋予列名
rownames(mtx) = genes#赋予行名
pbmc <- CreateSeuratObject(counts = mtx, project = "pbmc3k",min.cells = 3, min.features = 200)
#### Method2
# Initialize the Seurat object with the raw (non-normalized data).
pbmc <- CreateSeuratObject(counts = pbmc.data, project = "pbmc3k", min.cells = 3, min.features = 200)
pbmc
# Lets examine a few genes in the first thirty cells
pbmc.data[c("CD3D", "TCL1A", "MS4A1"), 1:30]
dense.size <- object.size(as.matrix(pbmc.data))
dense.size
sparse.size <- object.size(pbmc.data)
sparse.size
dense.size/sparse.size
#######QC计算线粒体比例,线粒体比例高是死细胞,过滤掉线粒体高的细胞
# The [[ operator can add columns to object metadata. This is a great place to stash QC stats
pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")
# Visualize QC metrics as a violin plot
#三张图每个细胞检测到的基因数量,每个细胞检测到的count数量,每个细胞检测到的线粒体比例
VlnPlot(pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
# FeatureScatter is typically used to visualize feature-feature relationships, but can be used
# for anything calculated by the object, i.e. columns in object metadata, PC scores etc.
#count和线粒体基因的的关系
plot1 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "percent.mt")
#count和基因总数的关系
plot2 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1 + plot2
#根据数据特点选择适合的过滤方式
pbmc <- subset(pbmc, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5)
######NormalizeData去除测序深度对数据分析的影响
#将每一个细胞的count先归一化到10000,再对每一个基因表达值log
pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000)
#pbmc <- NormalizeData(pbmc)
##########Feature selection 1500,3000
pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)
# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(pbmc), 10)
top10
# plot variable features with and without labels
plot1 <- VariableFeaturePlot(pbmc)
plot2 <- LabelPoints(plot = plot1, points = top10, repel=TRUE)
plot2
#FindVariableFeatures选择基因的原理平均表达量,一个细胞的平均表达量在不同的细胞之间的方差有多大,方差越大就是细胞类群分开的依据
plot1 + plot2
all.genes <- rownames(pbmc)
pbmc <- ScaleData(pbmc, features = all.genes)#本质还是zscore,每一个基因在所有细胞中平均细胞表达量为0,方差为1
########降维Reduction
pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc))
# Examine and visualize PCA results a few different ways
DimPlot(pbmc, reduction = "pca")#可视化
# FeaturePlot(pbmc, reduction = 'pca', features = c('CD79A', 'CD14', 'FCGR3A', 'CD4', 'CST3', 'PPBP'))
# FeaturePlot(pbmc, reduction = 'pca', features = c('nCount_RNA', 'nFeature_RNA'))
# loadings_sorted = dplyr::arrange(as.data.frame(loadings), desc(PC_1))
# DimHeatmap(pbmc, dims = 1:2, cells = 200, balanced = TRUE)
# print(pbmc[["pca"]], dims = 1:5, nfeatures = 5)
# VizDimLoadings(pbmc, dims = 1:2, reduction = "pca")
# DimHeatmap(pbmc, dims = 1, cells = 500, balanced = TRUE)
# DimHeatmap(pbmc, dims = 1:15, cells = 500, balanced = TRUE)
ElbowPlot(pbmc,ndims = 50)#做了50个pca可视化,纵轴奇异值
# NOTE: This process can take a long time for big datasets, comment out for expediency. More
# approximate techniques such as those implemented in ElbowPlot() can be used to reduce
# computation time
#pbmc <- JackStraw(pbmc, num.replicate = 100)
#pbmc <- ScoreJackStraw(pbmc, dims = 1:20)
#JackStrawPlot(pbmc, dims = 1:15)
#ElbowPlot(pbmc)
# Look at cluster IDs of the first 5 cells
#head(Idents(pbmc), 5)
# If you haven't installed UMAP, you can do so via reticulate::py_install(packages =
# 'umap-learn')
#选取前10的特征进行非线性降维,两种方式umap,tsne,但是umap更好可以更真实的表现群与群的真实距离
pbmc <- RunUMAP(pbmc, dims = 1:10)
FeaturePlot(pbmc, features = c('FCGR3A', 'CD14'), reduction = 'umap')
#DimPlot(pbmc, reduction = "umap")
# note that you can set `label = TRUE` or use the LabelClusters function to help label
#######cluster分群
pbmc <- FindNeighbors(pbmc, dims = 1:10)# louvain cluster, graph based
pbmc <- FindClusters(pbmc, resolution = 0.5)#resolution越大群越多
DimPlot(pbmc, reduction = "umap", group.by = 'seurat_clusters', label=T)
#根据maker对细胞进行细胞注释
FeaturePlot(pbmc, features = c("MS4A1", "TYROBP", "CD14",'FCGR3A', "FCER1A",
"CCR7", "IL7R", "PPBP", "CD8A"))
new.cluster.ids <- c("Naive CD4 T", "CD14+ Mono", "Memory CD4 T", "B", "CD8 T", "FCGR3A+ Mono",
"NK", "DC", "Platelet")
names(new.cluster.ids) <- levels(pbmc)
pbmc <- RenameIdents(pbmc, new.cluster.ids)
#重新定义的亚群
DimPlot(pbmc, reduction = "umap", label=T)
# ############################################################################################################
# ## DE analysis有很多小群无法判断时
# ############################################################################################################
#利用差异基因的方式,来看所有群表达怎样的marker,来确定每一个群表达什么样的基因,再去推断那个群可能是什么样的细胞亚群
pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)#都所有亚群依次求差异基因
pbmc.markers %>% group_by(cluster) %>% slice_max(n = 2, order_by = avg_log2FC)#每个cluster top2的差异基因看
CD4.mem.DEGS <- FindMarkers(pbmc, ident.1 = 'Memory CD4 T', ident.2 = 'Naive CD4 T', min.pct = 0.25)#要计算哪一群的差异基因,要计算的差异基因对照的是什么
#test.use计算方式
# ############################################################################################################
# ## gene signature analysis
# ############################################################################################################
exhaustion_genes = list(c('PDCD1','CD160','FASLG','CD244','LAG3','TNFRSF1B','CCR5','CCL3',
'CCL4','CXCL10','CTLA4','LGALS1','LGALS3','PTPN13','RGS16','ISG20',
'MX1','IRF4','EOMES','PBX3','NFATC1','NR4A2','CKS2','GAS2','ENTPD1','CA2'))
pbmc = Seurat::AddModuleScore(pbmc, features = exhaustion_genes, name='exhaustion.score')
FeaturePlot(pbmc, features = 'exhaustion.score1', reduction = 'umap')
VlnPlot(pbmc,features = 'exhaustion.score1', group.by = 'seurat_clusters')