DepMap抑癌基因的寻找

复现的图片如下:


(A) Identification of essential genes in CCA through a druggable kinase library screen in 22 CCA cell lines from DepMap

图片来自于《Targeting Aurorakinase B regulates cholesterol metabolism and enhances chemoimmunotherapy in cholangiocarcinoma》

询问AI之后,参考提供的代码并一步一步修改的代码如下:
前期基础包的安装

#文章复现1
#从DepMap获取CCA细胞系CRISPR筛选数据 https://depmap.org/portal/

# Step 0:一键安装依赖
if (!requireNamespace("BiocManager", quietly = TRUE))
  install.packages("BiocManager")

needed <- c("depmap", "tidyverse", "RobustRankAggreg", "experimenthub")
for (pkg in needed) {
  if (!requireNamespace(pkg, quietly = TRUE))
    BiocManager::install(pkg, ask = FALSE, update = FALSE)
}
library(tidyverse)
library(depmap)#This is depmap release 22Q2
library(RobustRankAggreg)
library(ExperimentHub)

数据下载:这里数据解释可以查看depmap包以及ExperimentHub包的官网介绍,非常全面。
ExperimentHub: Access the ExperimentHub Web Service https://bioconductor.org/packages/release/bioc/vignettes/ExperimentHub/inst/doc/ExperimentHub.html
The depmap data https://bioconductor.org/packages/release/data/experiment/vignettes/depmap/inst/doc/depmap.html

#Step 1:连接 ExperimentHub 并拿到 22Q4 数据
eh <- ExperimentHub::ExperimentHub()
query(eh, "depmap")
#depmap::depmap_crispr(),最新的数据
# 获取CRISPR数据和元数
#crispr数据集包含选定基因和癌细胞系的(批量校正的CERES推断基因效应)CRISPR-Cas9敲除数据
#metadata数据集包含所有癌细胞系的元数据。它对应于depmap_19Q1_cell_lines.csv文件
#最新的数据
crispr_data <-depmap::depmap_crispr()
metadata <-depmap::depmap_metadata()

head(crispr_data)
head(metadata)

#Step 2:过滤 CCA 细胞系
# 筛选CCA细胞系
colnames(metadata)
table(metadata$primary_disease)
# Bile Duct Cancer,36
cca_metadata <- metadata %>%
  filter(primary_disease == "Bile Duct Cancer") %>%
  select(depmap_id, cell_line)

# 提取CERES分数
colnames(crispr_data)
#[1] "depmap_id"  "gene"       "dependency" "entrez_id"  "gene_name"  "cell_line" 
#dependency: CERES分数(基因依赖性分数)
head(crispr_data)
head(cca_metadata)
cca_crispr <- crispr_data %>%
  filter(depmap_id %in% cca_metadata$depmap_id) %>%
  pivot_wider(names_from = depmap_id, values_from = dependency)

# 提取CERES分数
cca_crispr <- crispr_data %>%
  filter(depmap_id %in% cca_metadata$depmap_id) %>%
  select(gene_name, depmap_id, dependency) %>%
  pivot_wider(names_from = depmap_id, values_from = dependency) %>%
  column_to_rownames("gene_name")

colnames(cca_crispr)
head(cca_crispr)
#Step 3:只保留 KinBase 人类激酶基因
# KinBase 人类激酶列表(自动缓存)
kin_tbl <- read_csv("kinaseprotein.csv",col_names = TRUE)
kin_tbl<-kin_tbl[-1,]
kin_genes <-kin_tbl$HGN
kin_genes<-na.omit(kin_genes)

# 取交集
# 提取激酶基因数据
#这个列表来源于【干货】518个激酶的分类及列表 https://mp.weixin.qq.com/s?src=11&timestamp=1755181353&ver=6174&signature=0-onti55N8GeRguZyCvNi1xdU0PDgOpSH3Gb0PdfhWBVCJWSKqAdcCVNFHYbkm7JPDISdFCc5-FnsW4qzdIb*nC0n*Q7-1Xvi3HeDGx8M13P60aD1e8p3vvK17AxWcQI&new=1
#自己拷贝到excel里做成kinaseprotein.csv
###数据转换为矩阵形式,其中行是基因(使用基因名),列是细胞系,每个值为dependency。
kinase_ceres <- cca_crispr[rownames(cca_crispr) %in% kin_genes, ]

开始处理数据,准备绘图

###根据平均值,绘制点图###
# 行名为基因,列名为细胞系ID
ceres_matrix<-kinase_ceres
# 1. 计算每个基因的平均CERES分数(所有细胞系的平均值)
gene_means <- apply(ceres_matrix, 1, mean, na.rm = TRUE)

# 2. 创建排名列表(每个细胞系内的基因排序)
rank_list <- apply(ceres_matrix, 2, function(cell_scores) {
  # 按CERES分数升序排列(低分数=高依赖性)
  rownames(ceres_matrix)[order(cell_scores)]
})

# 3. 执行Robust Rank聚合分析
rra_result <- aggregateRanks(
  glist = rank_list,
  N = nrow(ceres_matrix)  # 总基因数
)
colnames(rra_result)

# 4. 准备绘图数据
p_values <- apply(ceres_matrix, 1, function(gene_scores) {t.test(gene_scores, mu = 0)$p.value})
#对每个基因进行单样本t检验

plot_data <- data.frame(
  gene = rownames(ceres_matrix),
  mean_ceres = gene_means,
  p_values = p_values,
  log10_pvalues=-log10(p_values),
  rra_score = rra_result$Score[match(rownames(ceres_matrix), rra_result$Name)]
) %>%
  # 按平均CERES分数升序排列(低分数=高依赖性)
  arrange(mean_ceres) %>%
  mutate(
    rank = 1:n(),  # 创建排名位置
    log10_rra = -log10(rra_score)  # 转换为-log10(RRA Score)用于纵坐标
  )
colnames(plot_data)


# 5. 识别Top 6基因(按平均CERES分数排序)
top_genes <- head(plot_data$gene, 6)
#[1] "PLK1"  "WEE1"  "CHEK1" "CDK1"  "AURKB" "CDC7"
#试出只有平均值能做出一样的基因,如果是中位数会有点不一样,没有CDC7,而是CDK7

# 6. 创建符合文献描述的图表
colnames(plot_data)
library(ggplot2)
library(ggrepel)

ggplot(plot_data, aes(x = mean_ceres, y = log10_pvalues)) +
  # 绘制所有基因点
  geom_point(aes(size = abs(mean_ceres)),  # 点大小表示平均依赖性强度
             color = "grey60", alpha = 0.6) +
  
  # 高亮Top 6基因
  geom_point(
    data = filter(plot_data, gene %in% top_genes),
    aes(fill = gene, size = abs(mean_ceres)),
    shape = 21, color = "black", stroke = 1
  ) +
  
  # 添加Top 6基因标签
  geom_label_repel(
    data = filter(plot_data, gene %in% top_genes),
    aes(label = gene, fill = gene),
    color = "white",
    fontface = "bold",
    box.padding = 0.5,
    segment.color = "grey50",
    size = 4,
    max.overlaps = 20,
    direction = "y"  # 优先垂直方向避免重叠
  ) +
  
  # 添加参考线(标记前6个基因)
  #geom_vline(xintercept = 6.5, linetype = "dashed", color = "grey40", alpha = 0.7) +
  
  # 设置坐标轴
  scale_x_continuous(
    name = "Gene Rank (Average of CERES Score)",
    #breaks = seq(1, nrow(plot_data), by = 5),  # 每5个基因一个刻度
    expand = expansion(mult = c(0.02, 0.05))   # 左右留白
  ) +
  scale_y_continuous(
    name = "-log10(RRA Score)",
    expand = expansion(mult = c(0, 0.1))       # 上方留白
  ) +
  
  # 设置点大小范围(根据平均CERES分数)
  scale_size_continuous(
    name = "|Mean CERES|",
    range = c(2, 8),  # 点大小范围
    breaks = c(0.1, 0.5, 1.0)
  ) +
  
  # 设置颜色
  scale_fill_brewer(palette = "Set1") +
  
  # 添加标题和主题
  labs(title = "Essential Kinases in CCA Cell Lines",
       subtitle = "CRISPR knockout screen (DepMap) - Rank Aggregation Analysis",
       caption = "Top 6 genes identified by mean CERES score") +
  theme_minimal(base_size = 14) +
  theme(
    legend.position = "right",
    panel.grid.major = element_line(color = "grey90"),
    panel.grid.minor = element_blank(),
    panel.border = element_rect(color = "grey80", fill = NA, size = 0.5),
    plot.title = element_text(face = "bold", size = 16, hjust = 0.5),
    plot.subtitle = element_text(color = "grey40", hjust = 0.5),
    axis.title = element_text(face = "bold"),
    axis.title.x = element_text(margin = margin(t = 10)),
    axis.title.y = element_text(margin = margin(r = 10)),
    axis.text.x = element_text(angle = 45, hjust = 1)  # 旋转X轴标签
  )

# 7. 保存结果
ggsave("CCA_essential_kinases_rank_average_pval_plot.png", width = 12, height = 8, dpi = 300)
write_csv(plot_data, "CCA_kinase_rank_data.csv")

最后生成图片如下


虽然没有百分百一样,但感觉差不多系列

后续想了想为什么不是一模一样,可能原因如下:(1)文章是用了22个CCA细胞系,而我用的是最新的depmap版本是Bile Duct Cancer,36,可能需要剔除一些非CCA的,如GBC细胞系;(2)图片美观程度可能自己后续具体调整,deepseek生成只能给个大概,有空再仔细研究怎么调整。

但觉得用这个方法去寻找靶标,实验验证,还是很值的参考学习的。

©著作权归作者所有,转载或内容合作请联系作者
【社区内容提示】社区部分内容疑似由AI辅助生成,浏览时请结合常识与多方信息审慎甄别。
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。

相关阅读更多精彩内容

友情链接更多精彩内容