复现图片如下

生存曲线分析图
图片来自于《Targeting Aurorakinase B regulates cholesterol metabolism and enhances chemoimmunotherapy in cholangiocarcinoma》
生存分析如果要引用他人的,需要看该数据是否有提供生存数据,如果没有,就可能分析不了,具体看文献的数据介绍,如果有,一般在官网的series_matrix.txt.gz文件中。
先安装必要的包
#文献复现之生存曲线
options(BioC_mirror="https://mirrors.westlake.edu.cn/bioconductor")
# 加载必要的包
needed <- c("GEOquery", "survminer", "survival", "tidyverse")
for (pkg in needed) {
if (!requireNamespace(pkg, quietly = TRUE))
BiocManager::install(pkg, ask = FALSE, update = FALSE)
}
#BiocManager::install("GEOquery")
library(devtools)
install_github("seandavi/GEOquery")
library(tidyverse)
library(survival)
library(survminer)
library(GEOquery) # 用于获取GEO数据
去官网下载数据,直接方便
# 1. 下载GSE107943数据集
#gse <- getGEO(GEO = "GSE107943", GSEMatrix = TRUE, AnnotGPL = TRUE)下载不了
#手动下载
#访问GEO数据集页面:https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE107943
#在"Download family"部分找到"Series Matrix File(s)"
#点击下载"GSE107943_series_matrix.txt.gz"
# 设置文件路径(根据您保存文件的实际位置)
gse_file <- "GSE107943_series_matrix.txt.gz"
# 2. 读取文件内容
file_content <- readLines(gzfile(gse_file))
# 3. 提取样本信息
sample_lines <- file_content[grep("^!Sample_", file_content)]
sample_data <- strsplit(sample_lines, "\t")
sample_df <- as.data.frame(do.call(rbind, sample_data), stringsAsFactors = FALSE)
colnames(sample_df) <- c("attribute", paste0("sample", 1:(ncol(sample_df)-1)))
head(sample_df)
# 4. 提取临床信息
# 创建临床数据框
sample_df$attribute
#直接查看数据格式
#13,复发,14dsfree,15death,16survival
#
clin_data <- data.frame(
sample = as.character(sample_df[1, -1]),
title = as.character(sample_df[2, -1]),
os_time = as.character(sample_df[16, -1]),
os_event = as.character(sample_df[15, -1]),
stringsAsFactors = FALSE
)
# 5. 清理临床数据
clin_data <- clin_data %>%
mutate(
os_time = as.numeric(str_extract(os_time, "[0-9.]+")),
os_event = as.numeric(str_extract(os_event, "[0-9]+"))
)
head(clin_data)
!is.na(clin_data$os_time)
clin_data_clean <- clin_data[!is.na(clin_data$os_time),]
# 6. 提取表达矩阵
gse_file1 <- "GSE107943_RPKM.txt.gz"
rpkm <- data.table::fread(gse_file1)
expr_df<-as.data.frame(rpkm)
colnames(expr_df)
expr_df<-expr_df[,c(4,8:64)]
#把所有重复 symbol 根据平均值合并(推荐做表达矩阵)
library(dplyr)
expr_df <- expr_df %>%
group_by(Genesymbol) %>%
summarise(across(everything(), mean), .groups = "drop") # 均值合并
#运行起来太慢了
#方案 1:只保留每个 symbol 第一次出现(最简单)
expr_df <- expr_df[!duplicated(expr_df$Genesymbol), ] # 去重
save(clin_data_clean,expr_df,file = "expr_df.Rdata")
#如果你希望保留最大表达量而不是均值,把 mean 改成 max 即可。
rownames(expr_df) <- expr_df$Genesymbol
expr_df<-expr_df[,-1]
colnames(expr_df)<-expr_df[1,]
expr_df<-expr_df[-1,]
表达矩阵,临床信息的表格有了,可以进行分析了
# 7. 提取AURKB表达数据(使用已知探针ID)
# 根据GPL570平台,AURKB对应的探针ID为"202954_s_at"
genename <- "AURKB"
aurkb_expr <- t(expr_df[genename,])
aurkb_expr <-as.data.frame(aurkb_expr)
aurkb_expr$sample<-rownames(aurkb_expr)
library(stringr)
clin_data_clean <- clin_data_clean %>%
mutate(
sample = str_remove_all(sample, '"'),
title = str_remove_all(title, '"')
)
head(clin_data_clean)
# 8. 合并临床数据和表达数据
combined_data <- clin_data_clean %>%
inner_join(aurkb_expr, by = "sample") %>%
filter(!is.na(genename))
# 9. 根据AURKB表达中位数分组
#argument is not numeric or logical: returning NA
#解决方案
str(combined_data$AURKB) # 看看类型
table(is.na(combined_data$AURKB)) # 看看有多少 NA
head(combined_data$AURKB) # 看看实际取值
#返回的是 Factor w/ 3 levels "0.3", "1.5", "5.7" 或 chr,就需要先转成数值:
# 因子/字符 → 数值
combined_data$AURKB <- as.numeric(as.character(combined_data$AURKB))
# 再计算
median_expr <- median(combined_data$AURKB)
combined_data <- combined_data %>%
mutate(
group = ifelse(AURKB > median_expr, "AURKBHigh", "AURKBLow"),
group = factor(group, levels = c("AURKBLow", "AURKBHigh"))
)
# 10. 创建生存对象
surv_object <- Surv(time = combined_data$os_time, event = combined_data$os_event)
# 11. 拟合生存曲线
fit <- survfit(surv_object ~ group, data = combined_data)
# 12. 计算log-rank检验p值
logrank <- survdiff(surv_object ~ group, data = combined_data)
p_value <- 1 - pchisq(logrank$chisq, df = 1)
formatted_p <- ifelse(p_value < 0.0001,
format(p_value, scientific = TRUE, digits = 4),
sprintf("%.4f", p_value))
# 13. 绘制生存曲线
surv_plot <- ggsurvplot(
fit,
data = combined_data,
pval = FALSE, # 我们将手动添加p值
conf.int = FALSE,
risk.table = TRUE,
risk.table.height = 0.25,
risk.table.y.text = FALSE,
risk.table.title = "Number at risk",
legend.labs = c(
sprintf("AURKBLow (N = %d)", sum(combined_data$group == "AURKBLow")),
sprintf("AURKBHigh (N = %d)", sum(combined_data$group == "AURKBHigh"))
),
legend.title = "",
legend = c(0.8, 0.85), # 图例位置
xlab = "Time (Months)",
ylab = "Overall Survival Probability",
break.time.by = 20,
xlim = c(0, 100),
palette = c("#2E9FDF", "#E7B800"), # 蓝-黄配色
ggtheme = theme_minimal(base_size = 14) +
theme(
plot.title = element_text(face = "bold", hjust = 0.5, size = 16),
plot.subtitle = element_text(hjust = 0.5, color = "gray40"),
legend.position = "top",
legend.text = element_text(size = 12),
axis.title = element_text(face = "bold", size = 12),
axis.text = element_text(size = 10),
panel.grid.major = element_line(color = "grey90"),
panel.grid.minor = element_blank(),
panel.border = element_rect(color = "grey80", fill = NA, size = 0.5)
)
)
# 14. 添加自定义p值
surv_plot$plot <- surv_plot$plot +
annotate("text", x = 30, y = 0.25,
label = paste0("P = ", formatted_p),
size = 6, fontface = "bold") +
labs(title = "Overall Survival by AURKB Expression",
subtitle = "GSE107943 Cohort")
# 15. 打印并保存图像
print(surv_plot)
ggsave("AURKB_survival_GSE107943.png",
plot = print(surv_plot),
width = 8, height = 8, dpi = 300)
write_csv(combined_data,file = "AURKBcombineddata.csv")
生成图片如下:

一点都不一样,因为不是用中位数分组
修改方向:(1)直接用最后保存的csv文件看下,毕竟样本不多,可以看怎么分组合适;(2)用graphad绘图,比用R语言绘图方便调整,个人觉得文献里应该也是用graphad绘图;(3)从GEO数据库里下载的表达矩阵,可以考虑是否在有重复基因名时取值方法是否需要修正,或者先normalizd整个表达矩阵后再看比较好;
但先模糊看下基因与病人生存是否有趋势,还是可以。