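# Note: the method written previously misaligned the X9 (attribute) info column
# because some genes have no gene name.
# Updated version below.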
################################################################
# 2025-09-24 #
# Sus_scrofa.Sscrofa11.1.110.chr.gtf #
# R 4.2.2 #
# FREE-U LAB by wangwenjing #
################################################################
#---------------------------------加载R包------------------------
library(stringr)
library(readr)
library(dplyr)
library(tidyverse)
library(tidyr)
# ========== Read the GTF ==========
# The annotation file is parsed only once. `gtf_in` holds the untouched parse
# (default V1..V9 column names); `gtf` is the working copy with named columns.
# R's copy-on-modify semantics mean the two objects share their column data.
gtf_path <- "Sus_scrofa.Sscrofa11.1.110.chr.gtf"
gtf_in <- read.delim(
  gtf_path,
  header = FALSE, sep = "\t", stringsAsFactors = FALSE,
  comment.char = "#", quote = ""
)
gtf <- gtf_in
# Fail with a clear message if the file does not have the nine GTF columns.
if (ncol(gtf) < 9L) {
  stop(
    "Expected at least 9 tab-separated columns in ", gtf_path,
    ", found ", ncol(gtf),
    call. = FALSE
  )
}
colnames(gtf)[1:9] <- c(
  "seqname", "source", "feature", "start", "end",
  "score", "strand", "frame", "attribute"
)
# Keep only the rows whose feature is "gene". `%in%` never yields NA, so rows
# with a missing feature are dropped, matching what subset() did.
genes <- gtf[gtf$feature %in% "gene", , drop = FALSE]
# Normalise quotes: replace curly double quotes with straight ones so the
# attribute patterns only need to handle '"'.
attr_raw <- gsub("[\u201C\u201D]", '"', trimws(genes$attribute), perl = TRUE)
# Helper: extract the value of one attribute key from each attribute string.
#
# Arguments:
#   pattern  Perl-compatible regular expression with at least one capture
#            group; the first capture group is the value that is returned.
#   x        Character vector of attribute strings to search. Defaults to the
#            script-level `attr_raw`, so existing one-argument calls still work.
#
# Returns a character vector of the same length as `x`: the first capture
# group of the first match in each element, or NA where the pattern does not
# match. Always a character vector, even for zero-length input.
extract1 <- function(pattern, x = attr_raw) {
  hits <- regmatches(x, regexec(pattern, x, perl = TRUE))
  # Each element of `hits` is c(full_match, group1, ...) or character(0) when
  # there is no match. vapply() pins the result type, unlike sapply().
  vapply(
    hits,
    function(z) if (length(z) >= 2L) z[2L] else NA_character_,
    character(1)
  )
}
# Batch extraction: build one row per gene from the parsed attributes plus the
# coordinate columns of `genes`.
out <- local({
  # Every pattern is anchored to the start of the string or to a ';' and
  # captures the quoted value that follows the key.
  pat_id <- '(?:^|;)\\s*(?:gene_id|ID)\\s*"([^"]+)"'
  pat_name <- '(?:^|;)\\s*(?:gene_name|Name)\\s*"([^"]+)"'
  pat_source <- '(?:^|;)\\s*gene_source\\s*"([^"]+)"'
  pat_biotype <- '(?:^|;)\\s*(?:gene_biotype|gene_type)\\s*"([^"]+)"'

  data.frame(
    Geneid = extract1(pat_id),
    gene_name = extract1(pat_name),
    gene_source = extract1(pat_source),
    gene_biotype = extract1(pat_biotype),
    seqname = genes[["seqname"]],
    start = genes[["start"]],
    end = genes[["end"]],
    strand = genes[["strand"]],
    stringsAsFactors = FALSE
  )
})

# ========== Export ==========
# Write the gene table as CSV without the row-name column.
write.csv(
  out,
  file = "Sus_scrofa.Sscrofa11.1.110.gene.csv",
  row.names = FALSE
)