#参考生信开荒牛
#将文件处理成10xgenomics的标准输入形式
> fs = list.files('./GSE136001_RAW/',pattern = '^GSM')
> fs
[1] "GSM4039241_f-ctrl-1-filtered-barcodes.tsv.gz" "GSM4039241_f-ctrl-1-filtered-features.tsv.gz"
[3] "GSM4039241_f-ctrl-1-filtered-matrix.mtx.gz" "GSM4039242_f-ctrl-2-filtered-barcodes.tsv.gz"
[5] "GSM4039242_f-ctrl-2-filtered-features.tsv.gz" "GSM4039242_f-ctrl-2-filtered-matrix.mtx.gz"
[7] "GSM4039243_f-tumor-1-filtered-barcodes.tsv.gz" "GSM4039243_f-tumor-1-filtered-features.tsv.gz"
[9] "GSM4039243_f-tumor-1-filtered-matrix.mtx.gz" "GSM4039244_f-tumor-2-filtered-barcodes.tsv.gz"
[11] "GSM4039244_f-tumor-2-filtered-features.tsv.gz" "GSM4039244_f-tumor-2-filtered-matrix.mtx.gz"
[13] "GSM4039245_m-ctrl-1-filtered-barcodes.tsv.gz" "GSM4039245_m-ctrl-1-filtered-features.tsv.gz"
[15] "GSM4039245_m-ctrl-1-filtered-matrix.mtx.gz" "GSM4039246_m-ctrl-2-filtered-barcodes.tsv.gz"
[17] "GSM4039246_m-ctrl-2-filtered-features.tsv.gz" "GSM4039246_m-ctrl-2-filtered-matrix.mtx.gz"
[19] "GSM4039247_m-tumor-1-filtered-barcodes.tsv.gz" "GSM4039247_m-tumor-1-filtered-features.tsv.gz"
[21] "GSM4039247_m-tumor-1-filtered-matrix.mtx.gz" "GSM4039248_m-tumor-2-filtered-barcodes.tsv.gz"
[23] "GSM4039248_m-tumor-2-filtered-features.tsv.gz" "GSM4039248_m-tumor-2-filtered-matrix.mtx.gz"
> samples <- substr(fs,1,10)
> unique(samples)
[1] "GSM4039241" "GSM4039242" "GSM4039243" "GSM4039244" "GSM4039245" "GSM4039246" "GSM4039247" "GSM4039248"
> x="GSM4039241"
> #x="GSM4039241"
> y = fs[grepl(x,fs)]
> y
[1] "GSM4039241_f-ctrl-1-filtered-barcodes.tsv.gz" "GSM4039241_f-ctrl-1-filtered-features.tsv.gz"
[3] "GSM4039241_f-ctrl-1-filtered-matrix.mtx.gz"
> folder = paste0('./GSE136001_RAW/',strsplit(y[1],split = '-f')[[1]][1])
> folder
[1] "./GSE136001_RAW/GSM4039241_f-ctrl-1"
> paste0('./GSE136001_RAW/',y[1])
[1] "./GSE136001_RAW/GSM4039241_f-ctrl-1-filtered-barcodes.tsv.gz"
> file.path(folder,"barcodes.tsv.gz")
[1] "./GSE136001_RAW/GSM4039241_f-ctrl-1/barcodes.tsv.gz"
rm(list = ls())
getwd()
setwd("./Rdata/胶质瘤/")
#将文件处理成10xgenomics的标准输入形式
fs = list.files('./GSE136001_RAW/',pattern = '^GSM')
fs
samples <- substr(fs,1,10)
unique(samples)
lapply( unique(samples), function(x){
#x="GSM4039241"
y = fs[grepl(x,fs)]
folder = paste0('./GSE136001_RAW/',strsplit(y[1],split = '-f')[[1]][1])
#创建文件夹
dir.create(folder,recursive = T)
#重命名子文件夹并移动到相应的文件夹中
file.rename(paste0('./GSE136001_RAW/',y[1]),file.path(folder,"barcodes.tsv.gz"))
file.rename(paste0('./GSE136001_RAW/',y[2]),file.path(folder,"features.tsv.gz"))
file.rename(paste0('./GSE136001_RAW/',y[3]),file.path(folder,"matrix.mtx.gz"))
})
if(!require(BiocManager)) install.packages("BiocManager",update = F,ask = F)
if(!require(Seurat))install.packages("Seurat")
if(!require(Matrix))install.packages("Matrix")
if(!require(ggplot2))install.packages("ggplot2")
if(!require(cowplot))install.packages("cowplot")
if(!require(magrittr))install.packages("magrittr")
if(!require(dplyr))install.packages("dplyr")
if(!require(purrr))install.packages("purrr")
samples <- dir(path="./GSE136001_RAW/", pattern="^GSM")
for (i in samples){
assign(paste0("samples_raw_data", i), Read10X(data.dir = paste0("./GSE136001_RAW/", i)))
}
samples_raw_data=list(
`samples_raw_dataGSM4039241_f-ctrl-1`,`samples_raw_dataGSM4039242_f-ctrl-2`,
`samples_raw_dataGSM4039243_f-tumor-1`,`samples_raw_dataGSM4039244_f-tumor-2`,
`samples_raw_dataGSM4039245_m-ctrl-1`,`samples_raw_dataGSM4039246_m-ctrl-2`,
`samples_raw_dataGSM4039247_m-tumor-1`,`samples_raw_dataGSM4039248_m-tumor-2`)
names(samples_raw_data) <- samples
samples_objects <- lapply(seq_along(samples_raw_data), function(i) {
seurat_object <- CreateSeuratObject(counts = samples_raw_data[[i]],
project = paste0("mm.gam.gender.", names(samples_raw_data)[i]),
min.cells = 5)
# 通过样本名称添加condition
if(grepl("ctrl", names(samples_raw_data)[i])) {
seurat_object$condition <- "ctrl"
} else {
seurat_object$condition <- "tumor"
}
seurat_object
})
names(samples_objects) <- names(samples_raw_data)
dim(samples_objects$`GSM4039241_f-ctrl-1`)
#[1] 12520 5223
#二、质控
#计算线粒体基因比例,注意这里是小鼠样本,线粒体基因以mt-开头;
#查看一下线粒体基因
mt.gene = rownames(samples_objects$`GSM4039241_f-ctrl-1`)[grep("^mt-",rownames(samples_objects$`GSM4039241_f-ctrl-1`))]
mt.gene
#[1] "mt-Nd1" "mt-Nd2" "mt-Co1" "mt-Co2" "mt-Atp8" "mt-Atp6" "mt-Co3"
#[8] "mt-Nd3" "mt-Nd4l" "mt-Nd4" "mt-Nd5" "mt-Nd6" "mt-Cytb"
#批量计算线粒体基因比例
for (i in seq_along(samples_objects)){
samples_objects[[i]] <- PercentageFeatureSet(samples_objects[[i]],
pattern = "^mt-",
col.name = "percent_mito")
}
names(samples_objects) <- names(samples_raw_data)
head(samples_objects[[1]]$percent_mito)
#AAACCTGAGAGGTTAT-1 AAACCTGCAATGGATA-1 AAACCTGCACAGATTC-1
# 2.2007515 0.9986505 0.4046945
#AAACGGGAGGTGACCA-1 AAACGGGCATCGGACC-1 AAACGGGTCAGGATCT-1
# 2.2660099 2.3625844 2.6066351
#画个图可视化一下线粒体基因比例
VlnPlot(samples_objects[[1]],features = c("nFeature_RNA", "nCount_RNA", "percent_mito"),
pt.size = 0, ncol = 3)
#质控过滤(基因200-3000,线粒体基因占比<5%)
samples_objects <- lapply(seq_along(samples_objects), function(i) {
samples_objects[[i]] <- subset(x = samples_objects[[i]],
subset = nFeature_RNA > 200 & nFeature_RNA < 3000 & percent_mito < 5)
samples_objects[[i]] <- NormalizeData(object = samples_objects[[i]], verbose = FALSE)
})
names(samples_objects) <- names(samples_raw_data)
dim(samples_objects$`GSM4039241_f-ctrl-1`)
#[1] 12520 5167