复现的图片如下:

(A) Identification of essential genes in CCA through a druggable kinase library screen in 22 CCA cell lines from DepMap
图片来自于《Targeting Aurorakinase B regulates cholesterol metabolism and enhances chemoimmunotherapy in cholangiocarcinoma》
询问AI之后,参考提供的代码并一步一步修改的代码如下:
前期基础包的安装
#文章复现1
#从DepMap获取CCA细胞系CRISPR筛选数据 https://depmap.org/portal/
# Step 0:一键安装依赖
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
needed <- c("depmap", "tidyverse", "RobustRankAggreg", "experimenthub")
for (pkg in needed) {
if (!requireNamespace(pkg, quietly = TRUE))
BiocManager::install(pkg, ask = FALSE, update = FALSE)
}
library(tidyverse)
library(depmap)#This is depmap release 22Q2
library(RobustRankAggreg)
library(ExperimentHub)
数据下载:这里数据解释可以查看depmap包以及ExperimentHub包的官网介绍,非常全面。
ExperimentHub: Access the ExperimentHub Web Service https://bioconductor.org/packages/release/bioc/vignettes/ExperimentHub/inst/doc/ExperimentHub.html
The depmap data https://bioconductor.org/packages/release/data/experiment/vignettes/depmap/inst/doc/depmap.html
#Step 1:连接 ExperimentHub 并拿到 22Q4 数据
eh <- ExperimentHub::ExperimentHub()
query(eh, "depmap")
#depmap::depmap_crispr(),最新的数据
# 获取CRISPR数据和元数
#crispr数据集包含选定基因和癌细胞系的(批量校正的CERES推断基因效应)CRISPR-Cas9敲除数据
#metadata数据集包含所有癌细胞系的元数据。它对应于depmap_19Q1_cell_lines.csv文件
#最新的数据
crispr_data <-depmap::depmap_crispr()
metadata <-depmap::depmap_metadata()
head(crispr_data)
head(metadata)
#Step 2:过滤 CCA 细胞系
# 筛选CCA细胞系
colnames(metadata)
table(metadata$primary_disease)
# Bile Duct Cancer,36
cca_metadata <- metadata %>%
filter(primary_disease == "Bile Duct Cancer") %>%
select(depmap_id, cell_line)
# 提取CERES分数
colnames(crispr_data)
#[1] "depmap_id" "gene" "dependency" "entrez_id" "gene_name" "cell_line"
#dependency: CERES分数(基因依赖性分数)
head(crispr_data)
head(cca_metadata)
cca_crispr <- crispr_data %>%
filter(depmap_id %in% cca_metadata$depmap_id) %>%
pivot_wider(names_from = depmap_id, values_from = dependency)
# 提取CERES分数
cca_crispr <- crispr_data %>%
filter(depmap_id %in% cca_metadata$depmap_id) %>%
select(gene_name, depmap_id, dependency) %>%
pivot_wider(names_from = depmap_id, values_from = dependency) %>%
column_to_rownames("gene_name")
colnames(cca_crispr)
head(cca_crispr)
#Step 3:只保留 KinBase 人类激酶基因
# KinBase 人类激酶列表(自动缓存)
kin_tbl <- read_csv("kinaseprotein.csv",col_names = TRUE)
kin_tbl<-kin_tbl[-1,]
kin_genes <-kin_tbl$HGN
kin_genes<-na.omit(kin_genes)
# 取交集
# 提取激酶基因数据
#这个列表来源于【干货】518个激酶的分类及列表 https://mp.weixin.qq.com/s?src=11×tamp=1755181353&ver=6174&signature=0-onti55N8GeRguZyCvNi1xdU0PDgOpSH3Gb0PdfhWBVCJWSKqAdcCVNFHYbkm7JPDISdFCc5-FnsW4qzdIb*nC0n*Q7-1Xvi3HeDGx8M13P60aD1e8p3vvK17AxWcQI&new=1
#自己拷贝到excel里做成kinaseprotein.csv
###数据转换为矩阵形式,其中行是基因(使用基因名),列是细胞系,每个值为dependency。
kinase_ceres <- cca_crispr[rownames(cca_crispr) %in% kin_genes, ]
开始处理数据,准备绘图
###根据平均值,绘制点图###
# 行名为基因,列名为细胞系ID
ceres_matrix<-kinase_ceres
# 1. 计算每个基因的平均CERES分数(所有细胞系的平均值)
gene_means <- apply(ceres_matrix, 1, mean, na.rm = TRUE)
# 2. 创建排名列表(每个细胞系内的基因排序)
rank_list <- apply(ceres_matrix, 2, function(cell_scores) {
# 按CERES分数升序排列(低分数=高依赖性)
rownames(ceres_matrix)[order(cell_scores)]
})
# 3. 执行Robust Rank聚合分析
rra_result <- aggregateRanks(
glist = rank_list,
N = nrow(ceres_matrix) # 总基因数
)
colnames(rra_result)
# 4. 准备绘图数据
p_values <- apply(ceres_matrix, 1, function(gene_scores) {t.test(gene_scores, mu = 0)$p.value})
#对每个基因进行单样本t检验
plot_data <- data.frame(
gene = rownames(ceres_matrix),
mean_ceres = gene_means,
p_values = p_values,
log10_pvalues=-log10(p_values),
rra_score = rra_result$Score[match(rownames(ceres_matrix), rra_result$Name)]
) %>%
# 按平均CERES分数升序排列(低分数=高依赖性)
arrange(mean_ceres) %>%
mutate(
rank = 1:n(), # 创建排名位置
log10_rra = -log10(rra_score) # 转换为-log10(RRA Score)用于纵坐标
)
colnames(plot_data)
# 5. 识别Top 6基因(按平均CERES分数排序)
top_genes <- head(plot_data$gene, 6)
#[1] "PLK1" "WEE1" "CHEK1" "CDK1" "AURKB" "CDC7"
#试出只有平均值能做出一样的基因,如果是中位数会有点不一样,没有CDC7,而是CDK7
# 6. 创建符合文献描述的图表
colnames(plot_data)
library(ggplot2)
library(ggrepel)
ggplot(plot_data, aes(x = mean_ceres, y = log10_pvalues)) +
# 绘制所有基因点
geom_point(aes(size = abs(mean_ceres)), # 点大小表示平均依赖性强度
color = "grey60", alpha = 0.6) +
# 高亮Top 6基因
geom_point(
data = filter(plot_data, gene %in% top_genes),
aes(fill = gene, size = abs(mean_ceres)),
shape = 21, color = "black", stroke = 1
) +
# 添加Top 6基因标签
geom_label_repel(
data = filter(plot_data, gene %in% top_genes),
aes(label = gene, fill = gene),
color = "white",
fontface = "bold",
box.padding = 0.5,
segment.color = "grey50",
size = 4,
max.overlaps = 20,
direction = "y" # 优先垂直方向避免重叠
) +
# 添加参考线(标记前6个基因)
#geom_vline(xintercept = 6.5, linetype = "dashed", color = "grey40", alpha = 0.7) +
# 设置坐标轴
scale_x_continuous(
name = "Gene Rank (Average of CERES Score)",
#breaks = seq(1, nrow(plot_data), by = 5), # 每5个基因一个刻度
expand = expansion(mult = c(0.02, 0.05)) # 左右留白
) +
scale_y_continuous(
name = "-log10(RRA Score)",
expand = expansion(mult = c(0, 0.1)) # 上方留白
) +
# 设置点大小范围(根据平均CERES分数)
scale_size_continuous(
name = "|Mean CERES|",
range = c(2, 8), # 点大小范围
breaks = c(0.1, 0.5, 1.0)
) +
# 设置颜色
scale_fill_brewer(palette = "Set1") +
# 添加标题和主题
labs(title = "Essential Kinases in CCA Cell Lines",
subtitle = "CRISPR knockout screen (DepMap) - Rank Aggregation Analysis",
caption = "Top 6 genes identified by mean CERES score") +
theme_minimal(base_size = 14) +
theme(
legend.position = "right",
panel.grid.major = element_line(color = "grey90"),
panel.grid.minor = element_blank(),
panel.border = element_rect(color = "grey80", fill = NA, size = 0.5),
plot.title = element_text(face = "bold", size = 16, hjust = 0.5),
plot.subtitle = element_text(color = "grey40", hjust = 0.5),
axis.title = element_text(face = "bold"),
axis.title.x = element_text(margin = margin(t = 10)),
axis.title.y = element_text(margin = margin(r = 10)),
axis.text.x = element_text(angle = 45, hjust = 1) # 旋转X轴标签
)
# 7. 保存结果
ggsave("CCA_essential_kinases_rank_average_pval_plot.png", width = 12, height = 8, dpi = 300)
write_csv(plot_data, "CCA_kinase_rank_data.csv")
最后生成图片如下

虽然没有百分百一样,但感觉差不多系列
后续想了想为什么不是一模一样,可能原因如下:(1)文章是用了22个CCA细胞系,而我用的是最新的depmap版本是Bile Duct Cancer,36,可能需要剔除一些非CCA的,如GBC细胞系;(2)图片美观程度可能自己后续具体调整,deepseek生成只能给个大概,有空再仔细研究怎么调整。
但觉得用这个方法去寻找靶标,实验验证,还是很值的参考学习的。