R语言计算一段DNA序列的GC含量分布及可视化

导入一段DNA序列

library(stringr)
library(Biostrings)
myfiles <- list.files(pattern = "*.fasta")
myfiles
fasta_sequences <- readDNAStringSet(myfiles[3])
dna_seq = fasta_sequences|> as.character()

设置窗口碱基的长度

bin = 50
l = nchar(dna_seq) - bin + 1
l

计算GC含量

GC <- c()#
for(i in 1:l){
  windows_seq = str_sub(dna_seq,i,i + bin -1)
  frequency <- str_count(windows_seq, 'G')/bin*100 +str_count(windows_seq, 'C')/bin*100
  GC <- append(GC,frequency)
  }
GC

length(GC)

计算AT含量

AT <- c()#
for(i in 1:l){
  windows_seq = str_sub(dna_seq,i,i + bin - 1)
  
  frequency <- str_count(windows_seq, 'A')/bin*100 +str_count(windows_seq, 'T')/bin*100
  AT <- append(AT,frequency)
}
AT

可视化

par(mfrow = c(2, 1),mar = c(2, 4.2, 1, 1))
plot(GC,type = "l",xlab = "mRNA/nt",ylab = "GC content,%")
abline(h=80, col="red")
abline(v=838, col="orange")
abline(v=103, col="green")
abline(v=1094, col="green")

plot(AT,type = "l", xlab = "mRNA/nt",ylab = "AT content,%")
abline(h=50, col="red")
abline(v=838, col="green")
Hoxc13.png
最后编辑于
©著作权归作者所有,转载或内容合作请联系作者
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。