用多基因序列构建进化树,要先将每个基因序列进行比对,拼接后构树。比对可以用muscle或mafft,当数据量大时muscle就会因内存不足而报错,这时只能用mafft。怎么进行序列拼接呢?自己写程序吧。
# 读序列文件的函数
#' build sequence database with raw fasta files
#'
#' @param dat_path
#' give the raw file path
#' @param type dna or prot or other types
#' @return
#' @export
#'
#' @examples
db_build <- function(dat_path, type) {
require(tidyverse)
dat <- readLines(dat_path)
indx <- dat %>% str_detect(">") %>% c(1:length(dat))[.]
gen=dat[indx] %>% str_remove_all("^>") %>% str_remove_all("-RA.*$")
seqs=rep(NA, length(indx))
for (i in 1:(length(indx)-1)) {
seqs[i] <- dat[(indx[i]+1):(indx[i+1]-1)] %>% paste0(collapse = "")
}
seqs[length(indx)] <- dat[(indx[length(indx)]+1):length(dat)] %>% paste0(collapse = "")
attr(seqs, "type") <- type
db <- list(gen=gen, seq=seqs)
return(db)
}
读序列
fil_nam_f=list.files("D:/Single_Copy_Orthologue_Sequences/",full.names = T)
fxj=lapply(fil_nam_f, function(x){db_build(x,"prot")})#蛋白序列
#转成一个df,每列一个基因
fxj_df=data.frame(gen=fxj[[1]]$gen,seq1=fxj[[1]]$seq)
for (i in 2:length(fxj)) {
fxj_df[,i+1] =fxj[[i]]$seq
}
#把每列中的序列粘成一条
fxj_df2=fxj_df[,-1]
temp_fun=function(vec){
paste0(vec,collapse = "")
}
#再转成fasta
temp_fun=function(vec){
paste0(vec,collapse = "")
}
fxj_df=data.frame(gen=fxj_df$gen,seq=apply(fxj_df[,-1],1,FUN=temp_fun))
fxj_seq <- c()
for (i in 1:nrow(fxj_df)) {
hit_seq_temp <- c(paste0(">", fxj_df$gen[i]), fxj_df$seq[i])
fxj_seq <- c(fxj_seq, hit_seq_temp)
}
write(fxj_seq, file = "fxj_single_cop_seq.fa")