GEO单细胞测序数据挖掘全在这里

1. 首先GEO下载的数据可能会有问题:

1.1 genes.tsv文件只有一列,缺第一列和第三列,遇到这种情况第一列ENSG123433这种字符串补上即可,第三列全部写Gene Expression;

1.2 .mtx文件也可能有问题:主要是第一列和第二列顺序错误,需要调换。第一列是基因编号,第二列是细胞编号才对;另外第三列count必须是整数,如果不是整数要改为整数

1.3 下载的数据的分组信息在xxx_metadata.tsv里面,需要自己写代码整合到barcodes.tsv文件中,详见后面的代码

上述问题解决后,把文件改为barcodes.tsv.gz,features.tsv.gz,matrix.mtx.gz就可以使用Seurat包的Read10X正常读取。

这里以GSE151674的数据为例进行分析,代码如下:

```

############################################################################

#####chongzuo chongzuo chongzuo########

setwd('F://danxibaofenxi20230529//GSE151674//analysis')

#BiocManager::install('SoupX')

rm(list=ls())

library(dplyr)

#install.packages('dplyr')

library(Seurat)

#BiocManager::install('lazyeval')

library(patchwork)

## =============1.Load the PBMC dataset

data_dir <- "."

pbmc.data <- Read10X(data.dir = data_dir)

pbmc.data

pbmc <- CreateSeuratObject(counts = pbmc.data,

                          project = "pbmc3k",

                          min.cells = 3,

                          min.features = 200)

head(pbmc@meta.data)

library(tidyverse) #加载tidyverse

#BiocManager::install('tidyverse')

#nstall.packages("tidyverse")

rownames(pbmc@meta.data) -> pbmc@meta.data$rowLeo #反向赋值,增加了1列

head(pbmc@meta.data) #展示如下

str_split(pbmc$rowLeo,"_") 

head(str_split(pbmc$rowLeo,"_",simplify=T) [,3])

str_split(pbmc$rowLeo,"_",simplify=T) [,3] -> pbmc@meta.data$group #反向赋值,实现分组

head(pbmc@meta.data)

names(pbmc@meta.data)

unique(pbmc$group)

####

## =============3.QC

# The [[ operator can add columns to object metadata. This is a great place to stash QC stats

grep(pattern = "^mt-", rownames(pbmc), value = T)

pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^mt-")

##

# QC指标

head(pbmc@meta.data, 5)

summary(pbmc@meta.data$nCount_RNA)

# QC指标使用小提琴图可视化

#VlnPlot(pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)

# 密度曲线图

data <- pbmc@meta.data

library(ggplot2)

p <- ggplot(data = data, aes(x=nFeature_RNA)) + geom_density()

p

# 两个指标之间的相关性

plot1 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "percent.mt")

plot2 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")

plot1 + plot2

# 过滤

pbmc <- subset(pbmc, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5)

## =============4.标准化

# LogNormalize

pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000)

# 标准化后的值保存在:pbmc[["RNA"]]@data

normalized.data <- pbmc[["RNA"]]@data

normalized.data[1:20,1:4]

dim(normalized.data)

##5、选择高变基因

## =============5.鉴定高变基因

# 高变基因:在一些细胞中表达高,另一些细胞中表达低的基因

# 变异指标: mean-variance relationship

# 默认返回两千个高变基因,用于下游如PCA降维分析。

pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)

# 提取前10的高变基因

top10 <- head(VariableFeatures(pbmc), 10)

top10

all <- head(VariableFeatures(pbmc), 2000)

write.csv(all,'allGaoBianGene.csv')

# 展示高变基因

plot1 <- VariableFeaturePlot(pbmc)

plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)

plot1 + plot2

#6、Scaling the data 归一化数据

## =============6.Scaling the data

# 归一化处理:每一个基因在所有细胞中的均值变为0,方差标为1,对于降维来说是必需步骤

# 归一化后的值保存在:pbmc[["RNA"]]@scale.data

pbmc <- ScaleData(pbmc)

scale.data <- pbmc[["RNA"]]@scale.data

dim(scale.data)

scale.data[1:10,1:4]

# 可以选择全部基因归一化 !注意! !注意!

#all.genes <- rownames(pbmc)

#pbmc <- ScaleData(pbmc, features = all.genes)

## =============7.降维

# PCA降维,默认使用前面2000个高变基因,可以使用features改变用于降维的基因集

pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc))

# 可视化

VizDimLoadings(pbmc, dims = 1:2, reduction = "pca")

DimPlot(pbmc, reduction = "pca")

DimHeatmap(pbmc, dims = 1, cells = 500, balanced = TRUE)

DimHeatmap(pbmc, dims = 1:6, cells = 500, balanced = TRUE)

## =============8.确定使用PC个数

# each PC essentially representing a ‘metafeature’

pbmc <- JackStraw(pbmc, num.replicate = 100)

pbmc <- ScoreJackStraw(pbmc, dims = 1:20)

JackStrawPlot(pbmc, dims = 1:20)

ElbowPlot(pbmc)

## =============9.对细胞聚类

# 首先基于PCA空间构建一个基于欧氏距离的KNN图

pbmc <- FindNeighbors(pbmc, dims = 1:20)

# 聚类并最优化

# resolution参数:值越大,细胞分群数越多,

# 0.4-1.2 typically returns good results for single-cell datasets of around 3K cells

# Optimal resolution often increases for larger datasets.

pbmc <- FindClusters(pbmc, resolution = 0.5)

# 查看聚类数ID

head(Idents(pbmc), 5)

# 查看每个类别多少个细胞

head(pbmc@meta.data)

table(pbmc@meta.data$seurat_clusters)

##

## =============10.将细胞在低维空间可视化UMAP/tSNE

pbmc <- RunUMAP(pbmc, dims = 1:20)

pbmc <- RunTSNE(pbmc, dims = 1:20)

DimPlot(pbmc,split.by = 'group')

# 可视化

#DimPlot(pbmc, reduction = "umap", label = T, label.size = 5)

#DimPlot(pbmc, reduction = "tsne", label = T, label.size = 5)

saveRDS(pbmc, file = "pbmc_tutorial.rds")

####

###1.细胞注释

############细胞注释2

# 清空当前环境变量

#rm(list=ls())

options(stringsAsFactors = F)

# 加载包

library(Seurat)

library(SingleR)

#BiocManager::install('celldex')

refdata <- HumanPrimaryCellAtlasData()

save(refdata,file = "refdata.RData")

load("./refdata.RData")

getwd()

refdata

head(colnames(refdata))

head(rownames(refdata))

# 查看共有多少种细胞类型

unique(refdata@colData@listData[["label.main"]])

# 加载pbmc结果

#pbmc <- readRDS("pbmc_tutorial.rds")

# 使用的数据为标化后的数据

testdata <- GetAssayData(pbmc, slot="data")

dim(testdata)

testdata[1:30,1:4]

clusters <- pbmc@meta.data$seurat_clusters

table(clusters)

cellpred <- SingleR(test = testdata, 

                    ref = refdata,

                    labels = refdata$label.main,

                    method = "cluster",

                    clusters = clusters,

                    assay.type.test = "logcounts",

                    assay.type.ref = "logcounts")

str(cellpred,max.level = 3)

metadata <- cellpred@metadata

head(metadata)

celltype = data.frame(ClusterID = rownames(cellpred),

                      celltype = cellpred$labels,

                      stringsAsFactors = F)

celltype

write.csv(celltype, "celltype_anno_SingleR.csv")

levels(pbmc)

new.cluster.ids <- c("Neutrophils","Neutrophils","DC",

                    "Neutrophils","Neutrophils","Neutrophils",

                    "Epithelial_cells","Monocyte","Neutrophils",

                    "HSC_-G-CSF","Fibroblasts","DC","Neutrophils",

                    "Neutrophils","DC","Neutrophils","Neutrophils",

                    "Neutrophils","Neutrophils","Macrophage","Neutrophils",

                    "Gametocytes","HSC_-G-CSF")

names(new.cluster.ids) <- levels(pbmc)

new.cluster.ids

pbmc <- RenameIdents(pbmc, new.cluster.ids)

###

VlnPlot(pbmc,features = c("Irak4", "Cdk9", 'Tyk2', 'Jak1', 'Jak2', 'Jak3'),idents = '0', split.by = 'group',split.plot = TRUE)

DotPlot(pbmc,features = "Irak4",split.by ='group',cols = c('blue','yellow','pink', 'green', 'orange'))

DimPlot(pbmc,split.by = 'group', reduction = "umap", cols = c('blue','yellow','pink', 'green', 'orange','black', 'purple', 'red'))

DimPlot(pbmc,split.by = 'group', reduction = "tsne", cols = c('blue','yellow','pink', 'green', 'orange','black', 'purple', 'red'))

###其他图

VlnPlot(pbmc,features = 'Jak2',split.by = 'group', cols = c('blue','yellow','pink'))

names(pbmc@meta.data)

```

最后编辑于
©著作权归作者所有,转载或内容合作请联系作者
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。

推荐阅读更多精彩内容