该教材是如何制作固定每行70碱基的fasta文件,主要设计如何将特别长的碱基序列均匀切割成每行70个碱基的文件。代码如下:
如何构建fasta文件及其index索引文件
fasta文件更改header
rm(list = ls())
suppressMessages(library(Biostrings))
suppressMessages(library(tidyverse))
trim_fa<-function(fa_dir,prefix,out_dir){
trimSeq <- function(Seq, binwidth = 70){
N_line = nchar(Seq) / binwidth
if(N_line > as.integer(N_line)){
N = as.integer(N_line) + 1
} else{
N = as.integer(N_line)
}
out_seq<-vector()
bases = strsplit(Seq, "")[[1]]
for(i in 0:(N-1)){
xseq = paste0(bases[(1 + binwidth*i):(binwidth + i*binwidth)], collapse = "")
if(i != (N-1)){
xseq = paste0(bases[(1 + binwidth*i):(binwidth + i*binwidth)], collapse = "")
} else{
xseq = paste0(bases[(1 + binwidth*i):length(bases)], collapse = "")
}
out_seq<-paste0(out_seq,xseq, "\n")
}
return(out_seq)
}
fa<-readDNAStringSet(fa_dir)
seq_name = names(fa)
sequence = paste(fa)
df <- data.frame(seq_name, sequence)
df$sequence<-trimSeq(Seq = as.character(df$sequence))
df$seq_name<-paste0(">",df$seq_name)
D <- do.call(rbind, lapply(seq(nrow(df)), function(i) t(df[i, ])))
write.table(D,paste0(out_dir,prefix,".fa"), row.names = FALSE, col.names = FALSE, quote = FALSE)
}