最全面的GEO单细胞数据挖掘

单样本官方分析流程：
https://www.jianshu.com/p/2314a472317f
注释网站cellMarker:
http://xteam.xbio.top/CellMarker/search.jsp?cellMarkerSpeciesType=Mouse&cellMarker=CD14
https://panglaodb.se/markers.html?cell_type=%27Pulmonary%20alveolar%20type%20I%20cells%27
单细胞测序数据拆分
https://www.jianshu.com/p/56c550557a9f
========================================
关于单细胞的文件：
matrix文件：
第一行，第一个字段表示基因的类别数，第二个字段代表细胞的个数，第三列代表所有细胞所有基因的总表达量
第一列代表基因编号，比如67号基因
第二列是细胞编号：比如1，代表1号细胞，有独立的barcode；
第三列代表基因的表达量
如果遇到格式和上面对不上的需要自行修改
=========================================
记住用conda安装R和R包（centOS系统要用这个安装最快最简单！）
https://blog.csdn.net/x_yAOTU/article/details/126141223
永远选择最简单有效的方法完成任务！
=========================================
常用的分析流程：

#####chongzuo chongzuo chongzuo########
#setwd('F://danxibaofenxi20230529//GSE151674//analysis//redo230606/')
setwd('F://danxibaofenxi20230529//GSE151674//analysis//674redo20230607')
setwd('F://danxibaofenxi20230529//GSE136831//analysis')
#BiocManager::install('SoupX')
#library(SoupX)
#rm(list=ls())
library(dplyr)
#install.packages('dplyr')
library(Seurat)
#BiocManager::install('lazyeval')
library(patchwork)
## =============1.Load the PBMC dataset
data_dir <- "."
pbmc.data <- Read10X(data.dir = data_dir)
pbmc.data
pbmc <- CreateSeuratObject(counts = pbmc.data, 
                           project = "pbmc3k", 
                           min.cells = 3,
                           min.features = 200)
head(pbmc@meta.data)

#pbmc <- JackStraw(pbmc, num.replicate = 100)

# QC指标
## =============3.QC
# The [[ operator can add columns to object metadata. This is a great place to stash QC stats
grep(pattern = "^mt-", rownames(pbmc), value = T)
pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")


head(pbmc@meta.data, 5)
summary(pbmc@meta.data$nCount_RNA)


# QC指标使用小提琴图可视化
VlnPlot(pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)

# 密度曲线图
data <- pbmc@meta.data
library(ggplot2)
p <- ggplot(data = data, aes(x=nFeature_RNA)) + geom_density()
p

# 两个指标之间的相关性
plot1 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1 + plot2

# 过滤
pbmc <- subset(pbmc, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5)
summary(pbmc@meta.data$nCount_RNA)
#设置分组信息
library(tidyverse) #加载tidyverse
#BiocManager::install('tidyverse')
#install.packages("tidyverse")
rownames(pbmc@meta.data) -> pbmc@meta.data$rowLeo #反向赋值，增加了1列
head(pbmc@meta.data) #展示如下
#str_split(pbmc$rowLeo,"_")  
head(str_split(pbmc$rowLeo,"_",simplify=T) [,3])
str_split(pbmc$rowLeo,"_",simplify=T) [,3] -> pbmc@meta.data$group #反向赋值，实现分组
#str_split(pbmc$rowLeo,"_",simplify=T) [,4] -> pbmc@meta.data$celltype
###
## =============4.标准化
# LogNormalize
pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000)

# 标准化后的值保存在：pbmc[["RNA"]]@data
normalized.data <- pbmc[["RNA"]]@data
normalized.data[1:20,1:4]
dim(normalized.data)
## =============5.鉴定高变基因
# 高变基因：在一些细胞中表达高，另一些细胞中表达低的基因
# 变异指标： mean-variance relationship
# 默认返回两千个高变基因，用于下游如PCA降维分析。
pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)

# 提取前10的高变基因
top10 <- head(VariableFeatures(pbmc), 10)
top10

# 展示高变基因
plot1 <- VariableFeaturePlot(pbmc)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
plot1 + plot2

## =============6.Scaling the data
# 归一化处理：每一个基因在所有细胞中的均值变为0，方差标为1，对于降维来说是必需步骤
# 归一化后的值保存在：pbmc[["RNA"]]@scale.data
pbmc <- ScaleData(pbmc)
scale.data <- pbmc[["RNA"]]@scale.data
dim(scale.data)
scale.data[1:10,1:4]


# 可以选择全部基因归一化
all.genes <- rownames(pbmc)
pbmc <- ScaleData(pbmc, features = all.genes)
## =============7.降维
# PCA降维，默认使用前面2000个高变基因，可以使用features改变用于降维的基因集
pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc))

# 可视化
VizDimLoadings(pbmc, dims = 1:2, reduction = "pca")
DimPlot(pbmc, reduction = "pca")
DimHeatmap(pbmc, dims = 1, cells = 500, balanced = TRUE)
DimHeatmap(pbmc, dims = 1:15, cells = 500, balanced = TRUE)
## =============8.确定使用PC个数
# each PC essentially representing a ‘metafeature’
pbmc <- JackStraw(pbmc, num.replicate = 100)
pbmc <- ScoreJackStraw(pbmc, dims = 1:20)
JackStrawPlot(pbmc, dims = 1:15)
ElbowPlot(pbmc)
## =============9.对细胞聚类
# 首先基于PCA空间构建一个基于欧氏距离的KNN图
pbmc <- FindNeighbors(pbmc, dims = 1:10)

# 聚类并最优化
# resolution参数：值越大，细胞分群数越多，
# 0.4-1.2 typically returns good results for single-cell datasets of around 3K cells
# Optimal resolution often increases for larger datasets. 
pbmc <- FindClusters(pbmc, resolution = 0.5)

# 查看聚类数ID
head(Idents(pbmc), 5)

# 查看每个类别多少个细胞
head(pbmc@meta.data)
table(pbmc@meta.data$seurat_clusters)
## =============10.将细胞在低维空间可视化UMAP/tSNE
pbmc <- RunUMAP(pbmc, dims = 1:10)
pbmc <- RunTSNE(pbmc, dims = 1:10)
p1 <- DimPlot(immune.combined, reduction = "umap", group.by = "group")
p2 <- DimPlot(immune.combined, reduction = "umap", label = TRUE, repel = TRUE)
p1 + p2
DimPlot(immune.combined, reduction = "umap", split.by = "group")
####

# For performing differential expression after integration, we switch back to the original
# data

pbmc <- JackStraw(pbmc, num.replicate = 100, dims=40)
pbmc <- ScoreJackStraw(pbmc, dims = 1:40)

DefaultAssay(immune.combined) <- "RNA"
#BiocManager::install('multtest')
#install.packages('tmvnsim')
library(metap)
#nk.markers <- FindConservedMarkers(immune.combined, ident.1 = 6, grouping.var = "group", verbose = FALSE)
#head(nk.markers)

saveRDS(immune.combined, 'immune.combined674-final.rds')
saveRDS(pbmc, 'pbmc0621yiJiangwei-1.rds')


#######
pbmc.markers <- FindAllMarkers(immune.combined, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
pbmc.markers %>% group_by(cluster) %>% top_n(n = 2, wt = avg_log2FC) 
head(pbmc.markers)
write.csv(pbmc.markers, 'p0615.markers.csv')

top10 <- pbmc.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_log2FC)
#DoHeatmap(immune.combined, features = top10$gene[1:10])
VlnPlot(immune.combined, features = top10$gene[1:20],pt.size=0)
bfreaname.pbmc <- immune.combined

new.cluster.ids <- c("SLC16A7+ cell","SLC16A7+ cell","Alveolar macrophages",
                     "Alveolar macrophages","SLC16A7+ cell","CD8+ cytotoxic T cell",
                     "Alveolar macrophages","Natural killer cell","Ionocyte cell",
                     "FOXN4+ cell","Macrophage","B cell","Alveolar macrophages",
                     "Ionocytes","Alveolar macrophages","Brush cell (Tuft cell)",
                     "Ciliated cell","B cell","Ionocyte cell","SLC16A7+ cell",
                     "Clara cells","Ionocyte cell","Basal cell","Basal cell",
                     "Endothelial cell","Alveolar macrophages","T cell",
                     "Ciliated cell","Natural killer cell","AT2",
                     "Endothelial cell","Real Ionocytes","Ionocytes","SLC16A7+ cell",
                     "Neuroendocrine cell","Epithelial cell","Ciliated cells",
                     "Ciliated cell","Neuroendocrine cell","Ciliated cell",
                     "Ciliated cells","AT1","Myeloid dendritic cell",
                     "Alveolar macrophages","Basal cell")
names(new.cluster.ids) <- levels(immune.combined)
immune.combined <- RenameIdents(immune.combined, new.cluster.ids)
  DimPlot(immune.combined, reduction = "umap", label = TRUE, pt.size = 0.5) + NoLegend()
##

#DimPlot(immune.combined,split.by = 'group', cols = c('blue','yellow','pink', 'green', 'orange','black', 'purple', 'red'))

DotPlot(immune.combined,features = 'Ripk1',cols = c('red','blue'),)
DotPlot(immune.combined,features = 'Ripk1',dot.scale = 8,split.by ='group',cols=c('blue','blue','blue'))

new.idents <- seurat@meta.data$cell_type_singler
Idents(object = seurat) <- new.idents #add cell type as idents
seurat$celltype.sample_WT_P21<- paste(Idents(seurat),seurat$sampleNOrep_WT_P21, sep = "_") #add sample subgruops to the cell type idents
new.idents <- seurat@meta.data$celltype.sample_WT_P21
Idents(object = seurat) <- new.idents # now just add it to the seurat object as new idents
library(ggplot2)
DotPlot(immune.combined, split.by ='group', cols = c("red","blue"), features ='Ripk1') + RotatedAxis() + 
  theme(axis.text = element_text(size = 14))+scale_colour_gradient(low =c("white"), high =c("red")) +
  guides(color = guide_colorbar(title = 'Average Expression'))

mydeg <- FindMarkers(immune.combined,ident.1 = 'Immune cell_SMK-MOCP',ident.2 = 'Immune cell_FA', verbose = FALSE, test.use = 'wilcox',min.pct = 0.1)
head(mydeg)
write.csv(mydeg,'Immunecell-mydeg.csv')

mydeg <- FindMarkers(pbmc,ident.1 = 'AT2_COPD',ident.2 = 'AT2_Control', verbose = FALSE, test.use = 'wilcox',min.pct = 0.1)
head(mydeg)
write.csv(mydeg,'mydeg-immune.csv')
#附加
######查看不同细胞在各组中的表达水平代码
myref$celltype <- Idents(myref)
mymatrix <- as.data.frame(pbmc@assays$RNA@data)
mymatrix2<-t(mymatrix)%>%as.data.frame()
mymatrix2[,1]<-pbmc$celltype
colnames(mymatrix2)[1] <- "celltype"

mymatrix2[,ncol(mymatrix2)+1]<-pbmc$group
colnames(mymatrix2)[ncol(mymatrix2)] <- "group"

#绘图
library(ggplot2)
p1<- ggplot2::ggplot(mymatrix2,aes(x=celltype,y=Thbs1,fill=group))+
  geom_boxplot(alpha=0.7)+
  scale_y_continuous(name = "Expression")+
  scale_x_discrete(name="Celltype")+


##################附加：数据合并，去除批次效应###############
Xuan <- SplitObject(pbmc,split.by='group')
head(Xuan$FA@meta.data)
FA=Xuan$FA
SMK = Xuan$`SMK-MOCP`
ifnb.list <- list(FA, SMK)

ifnb.list <- lapply(X = ifnb.list, FUN = function(x) {
  x <- NormalizeData(x, normalization.method = "LogNormalize", scale.factor = 10000)
  x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 2000)
})

features <- SelectIntegrationFeatures(object.list = ifnb.list)
immune.anchors <- FindIntegrationAnchors(object.list = ifnb.list, anchor.features = features)

# this command creates an 'integrated' data assay
immune.combined <- IntegrateData(anchorset = immune.anchors)

immune.combined <- readRDS('immune.combined674-final.rds')
##
##

DefaultAssay(immune.combined) <- "integrated"

# Run the standard workflow for visualization and clustering

immune.combined <- ScaleData(immune.combined, verbose = FALSE)

immune.combined <- RunPCA(immune.combined, npcs = 30, verbose = FALSE)
immune.combined <- RunUMAP(immune.combined, reduction = "pca", dims = 1:30)
immune.combined <- FindNeighbors(immune.combined, reduction = "pca", dims = 1:30)
immune.combined <- FindClusters(immune.combined, resolution = 0.5)
head(immune.combined@meta.data)

最全面的GEO单细胞数据挖掘

推荐阅读更多精彩内容