导入一段DNA序列
library(stringr)
library(Biostrings)
myfiles <- list.files(pattern = "*.fasta")
myfiles
fasta_sequences <- readDNAStringSet(myfiles[3])
dna_seq = fasta_sequences|> as.character()
设置窗口碱基的长度
bin = 50
l = nchar(dna_seq) - bin + 1
l
计算GC含量
GC <- c()#
for(i in 1:l){
windows_seq = str_sub(dna_seq,i,i + bin -1)
frequency <- str_count(windows_seq, 'G')/bin*100 +str_count(windows_seq, 'C')/bin*100
GC <- append(GC,frequency)
}
GC
length(GC)
计算AT含量
AT <- c()#
for(i in 1:l){
windows_seq = str_sub(dna_seq,i,i + bin - 1)
frequency <- str_count(windows_seq, 'A')/bin*100 +str_count(windows_seq, 'T')/bin*100
AT <- append(AT,frequency)
}
AT
可视化
par(mfrow = c(2, 1),mar = c(2, 4.2, 1, 1))
plot(GC,type = "l",xlab = "mRNA/nt",ylab = "GC content,%")
abline(h=80, col="red")
abline(v=838, col="orange")
abline(v=103, col="green")
abline(v=1094, col="green")
plot(AT,type = "l", xlab = "mRNA/nt",ylab = "AT content,%")
abline(h=50, col="red")
abline(v=838, col="green")