用 R 写了一个函数,用于合并从 GDC 下载的各样本表达量文件,在此记录一下。
image.png
library(dplyr)
library(readr)
#' Merge one quantification column across per-sample GDC TSV files
#'
#' Looks in every immediate sub-directory of `input_dir` for files ending in
#' `.tsv`, reads each one starting at line 7 (the first 6 lines are skipped),
#' labels its 9 columns with the fixed names listed in `gdc_columns` below,
#' and keeps the three gene annotation columns plus `column_name`. The kept
#' value column is renamed to the file's base name (without extension) and
#' all per-file tables are combined with a full join on the annotation
#' columns. The merged table is written to
#' `<output_dir>/<column_name>_merged.tsv`.
#'
#' @param input_dir Directory whose immediate sub-directories hold the
#'   per-sample `.tsv` files.
#' @param column_name Name of the value column to collect; must be one of the
#'   six non-annotation names in `gdc_columns`.
#' @param output_dir Directory the merged table is written to; created
#'   (recursively) when it does not exist.
#' @return The merged data frame: `gene_id`, `gene_name`, `gene_type`, then
#'   one column per input file.
merge_column <- function(input_dir, column_name, output_dir) {
  # Fixed column layout assigned to every input file.
  gdc_columns <- c(
    "gene_id", "gene_name", "gene_type", "unstranded", "stranded_first",
    "stranded_second", "tpm_unstranded", "fpkm_unstranded",
    "fpkm_uq_unstranded"
  )
  key_columns <- gdc_columns[1:3]
  value_columns <- setdiff(gdc_columns, key_columns)

  # The column names are fixed, so the requested column can be validated once
  # up front instead of once per file.
  if (!is.character(column_name) || length(column_name) != 1L ||
    !(column_name %in% value_columns)) {
    stop(
      "column_name 必须是以下列名之一: ",
      paste(value_columns, collapse = ", "),
      call. = FALSE
    )
  }

  if (!dir.exists(output_dir)) {
    dir.create(output_dir, recursive = TRUE)
  }
  cat("output_dir:", output_dir, "\n")

  sub_dirs <- list.dirs(input_dir, full.names = TRUE, recursive = FALSE)
  # When output_dir lives inside input_dir, merged tables written by earlier
  # calls would otherwise be picked up as if they were sample files.
  is_output_dir <- normalizePath(sub_dirs, mustWork = FALSE) ==
    normalizePath(output_dir, mustWork = FALSE)
  sub_dirs <- sub_dirs[!is_output_dir]

  merged_data <- NULL
  print("merging data...")
  # seq_along() yields an empty sequence for zero sub-directories, whereas
  # 1:length(x) would iterate over c(1, 0).
  for (i in seq_along(sub_dirs)) {
    cat(i, ":", sub_dirs[i], "\n")
    files <- list.files(sub_dirs[i], pattern = "\\.tsv$", full.names = TRUE)
    for (file in files) {
      # Start reading at line 7; no header row is parsed from the file.
      exp_data <- readr::read_tsv(
        file,
        skip = 6, col_names = FALSE, show_col_types = FALSE
      )
      if (ncol(exp_data) != length(gdc_columns)) {
        stop(
          "文件列数不符合预期 (", ncol(exp_data), " != ",
          length(gdc_columns), "),文件: ", file,
          call. = FALSE
        )
      }
      exp_data <- as.data.frame(exp_data)
      colnames(exp_data) <- gdc_columns

      # Keep the annotation columns plus the requested value column, and name
      # the value column after the source file.
      sample_data <- exp_data[, c(key_columns, column_name), drop = FALSE]
      colnames(sample_data)[4] <- tools::file_path_sans_ext(basename(file))

      if (is.null(merged_data)) {
        merged_data <- sample_data
      } else {
        merged_data <- dplyr::full_join(
          merged_data, sample_data,
          by = key_columns
        )
      }
    }
  }

  # Fail with a clear message instead of passing NULL to write_tsv().
  if (is.null(merged_data)) {
    stop(
      "未在以下目录的子文件夹中找到任何 .tsv 文件: ", input_dir,
      call. = FALSE
    )
  }

  output_file <- file.path(output_dir, paste0(column_name, "_merged.tsv"))
  readr::write_tsv(merged_data, output_file)
  cat("保存合并表到:", output_file, "\n")
  merged_data
}
# Driver: merge every value column in turn, reading samples from the current
# working directory and writing one merged table per column to output_dir.
input_dir <- getwd()
output_dir <- "/Users/zhengyiyi/Desktop/res"
column_name <- "unstranded"

# Value columns to merge; each gets its own call to merge_column().
quant_columns <- c(
  "unstranded", "stranded_first", "stranded_second",
  "tpm_unstranded", "fpkm_unstranded", "fpkm_uq_unstranded"
)

# Merged tables, stored under the name of the column they were built from.
res <- list()
for (column_i in quant_columns) {
  print(column_i)
  res[[column_i]] <- merge_column(input_dir, column_i, output_dir)
}