好久没写,写个提取GTF信息的

以前写过的方法,在 X9 信息列上会因为有些基因没有基因名而错位,
所以这次更新一下写法。

################################################################
#                2025-09-24                                    #
#            Sus_scrofa.Sscrofa11.1.110.chr.gtf                #
#                R 4.2.2                                       #
#                FREE-U LAB  by wangwenjing                    #
################################################################

#---------------------------------加载R包------------------------
library(stringr)
library(readr)
library(dplyr)
library(tidyverse)
library(tidyr)

# ========== Read the GTF ==========
# Parse the annotation file a single time (the file is large, so reading it
# twice doubles both run time and peak memory). Lines starting with "#" are
# skipped as header comments, and quote = "" keeps the double quotes inside
# the attribute column as literal characters instead of field delimiters.
gtf_in <- read.delim(
  "Sus_scrofa.Sscrofa11.1.110.chr.gtf",
  header = FALSE, sep = "\t", stringsAsFactors = FALSE,
  comment.char = "#", quote = ""
)

# `gtf_in` keeps the raw table with the default V1..V9 column names;
# `gtf` is the working copy that receives descriptive column names below.
gtf <- gtf_in

colnames(gtf)[1:9] <- c(
  "seqname", "source", "feature", "start", "end",
  "score", "strand", "frame", "attribute"
)

# Keep only the rows whose feature type is "gene". which() drops NA
# comparisons, so rows with a missing feature are excluded.
genes <- gtf[which(gtf$feature == "gene"), , drop = FALSE]

# Normalise quotes: trim surrounding whitespace from each attribute string and
# turn typographic (curly) double quotes into plain ASCII double quotes.
attr_raw <- gsub(
  pattern = "[\u201C\u201D]",
  replacement = '"',
  x = trimws(genes$attribute),
  perl = TRUE
)

# Helper: extract the value of one attribute key from every attribute string.
#
# pattern: Perl-compatible regular expression whose FIRST capture group is the
#          value to return.
# x:       character vector to search; defaults to the script-level `attr_raw`
#          so existing one-argument calls keep working.
#
# Returns a character vector with one element per element of `x`;
# NA_character_ where the pattern does not match.
extract1 <- function(pattern, x = attr_raw) {
  hits <- regmatches(x, regexec(pattern, x, perl = TRUE))
  # Each element of `hits` is c(full_match, group1, ...) on a match and
  # character(0) otherwise. vapply() guarantees a character result even when
  # `x` is empty, whereas sapply() would return list() and silently drop the
  # column when the result is placed in a data frame.
  vapply(
    hits,
    function(z) if (length(z) >= 2) z[2] else NA_character_,
    character(1),
    USE.NAMES = FALSE
  )
}

# Batch extraction: run extract1() once per attribute key, then combine the
# extracted columns with the gene coordinates into a single data frame.
out <- do.call(
  data.frame,
  c(
    # One regex per output column; the first capture group is the value.
    lapply(
      c(
        Geneid       = '(?:^|;)\\s*(?:gene_id|ID)\\s*"([^"]+)"',
        gene_name    = '(?:^|;)\\s*(?:gene_name|Name)\\s*"([^"]+)"',
        gene_source  = '(?:^|;)\\s*gene_source\\s*"([^"]+)"',
        gene_biotype = '(?:^|;)\\s*(?:gene_biotype|gene_type)\\s*"([^"]+)"'
      ),
      extract1
    ),
    # Coordinates are copied straight from the gene rows.
    list(
      seqname = genes$seqname,
      start   = genes$start,
      end     = genes$end,
      strand  = genes$strand
    ),
    list(stringsAsFactors = FALSE)
  )
)

# ========== Export ==========
# Write the gene table to CSV without a row-name column.
write.csv(
  x = out,
  file = "Sus_scrofa.Sscrofa11.1.110.gene.csv",
  row.names = FALSE
)
©著作权归作者所有,转载或内容合作请联系作者
【社区内容提示】社区部分内容疑似由AI辅助生成,浏览时请结合常识与多方信息审慎甄别。
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。

相关阅读更多精彩内容

友情链接更多精彩内容