转自:http://blog.csdn.net/u014801157/article/details/24372449
一、用R base的函数来处理序列
# 按长度设置产生随机序列的函数
rndSeq <- function(dict, n) {
paste(sample(dict, n, replace = T), collapse = "")
}
# 产生3条长度均为50bp的序列
set.seed(1000)
(seqs <- mapply(rndSeq, list(c("A", "T", "G", "C")), rep(50, 3)))
## [1] "TCAGGAGGATTCTCCATGAGACTAGGTCAAGAGCGGACGTCGAGCGGGTG"
## [2] "GGAGGGGGCTTACCACTCACGGTTCAAAAATACAGATTTTCCGAGGACAT"
## [3] "GTAGGCAGGTCGAGGACAAAGTTGCCCGTATGTACTACGAAGGGGTCGTC"
# 用regexpr函数查找第一次匹配ATG的位置和序列长度
(pos <- regexpr("ATG", seqs))
## [1] 16 -1 30
## attr(,"match.length")
## [1] 3 -1 3
## attr(,"useBytes")
## [1] TRUE
as.numeric(pos)
## [1] 16 -1 30
# 用gregexpr函数可以查找匹配ATG的全部起始位置,但后续处理不是很方便
pos <- gregexpr("ATG", seqs)
# 获取互补序列很简单
(seqs_comp <- chartr("ATGC", "TACG", seqs))
## [1] "AGTCCTCCTAAGAGGTACTCTGATCCAGTTCTCGCCTGCAGCTCGCCCAC"
## [2] "CCTCCCCCGAATGGTGAGTGCCAAGTTTTTATGTCTAAAAGGCTCCTGTA"
## [3] "CATCCGTCCAGCTCCTGTTTCAACGGGCATACATGATGCTTCCCCAGCAG"
# 获取反向序列不大容易,要经过几个步骤
vseq <- function(x) {
substring(x, 1:nchar(x), 1:nchar(x))
} #把字符串向量化的函数
seqs_rc <- lapply(seqs_comp, vseq)
seqs_rc <- lapply(seqs_rc, rev)
(seqs_rc <- sapply(seqs_rc, paste, collapse = "")) #这一步才得到反向互补序列
## [1] "CACCCGCTCGACGTCCGCTCTTGACCTAGTCTCATGGAGAATCCTCCTGA"
## [2] "ATGTCCTCGGAAAATCTGTATTTTTGAACCGTGAGTGGTAAGCCCCCTCC"
## [3] "GACGACCCCTTCGTAGTACATACGGGCAACTTTGTCCTCGACCTGCCTAC"
二、Biostrings定义的常量
library(Biostrings)
DNA_BASES
## [1] "A" "C" "G" "T"
DNA_ALPHABET
## [1] "A" "C" "G" "T" "M" "R" "W" "S" "Y" "K" "V" "H" "D" "B" "N" "-" "+"
RNA_BASES
## [1] "A" "C" "G" "U"
RNA_ALPHABET
## [1] "A" "C" "G" "U" "M" "R" "W" "S" "Y" "K" "V" "H" "D" "B" "N" "-" "+"
RNA_GENETIC_CODE
## UUU UUC UUA UUG UCU UCC UCA UCG UAU UAC UAA UAG UGU UGC UGA UGG CUU CUC
## "F" "F" "L" "L" "S" "S" "S" "S" "Y" "Y" "*" "*" "C" "C" "*" "W" "L" "L"
## CUA CUG CCU CCC CCA CCG CAU CAC CAA CAG CGU CGC CGA CGG AUU AUC AUA AUG
## "L" "L" "P" "P" "P" "P" "H" "H" "Q" "Q" "R" "R" "R" "R" "I" "I" "I" "M"
## ACU ACC ACA ACG AAU AAC AAA AAG AGU AGC AGA AGG GUU GUC GUA GUG GCU GCC
## "T" "T" "T" "T" "N" "N" "K" "K" "S" "S" "R" "R" "V" "V" "V" "V" "A" "A"
## GCA GCG GAU GAC GAA GAG GGU GGC GGA GGG
## "A" "A" "D" "D" "E" "E" "G" "G" "G" "G"
AA_ALPHABET
## [1] "A" "R" "N" "D" "C" "Q" "E" "G" "H" "I" "L" "K" "M" "F" "P" "S" "T"
## [18] "W" "Y" "V" "U" "B" "Z" "X" "*" "-" "+"
AMINO_ACID_CODE
## A R N D C Q E G H I L K
## "Ala" "Arg" "Asn" "Asp" "Cys" "Gln" "Glu" "Gly" "His" "Ile" "Leu" "Lys"
## M F P S T W Y V U B Z X
## "Met" "Phe" "Pro" "Ser" "Thr" "Trp" "Tyr" "Val" "Sec" "Asx" "Glx" "Xaa"
GENETIC_CODE
## TTT TTC TTA TTG TCT TCC TCA TCG TAT TAC TAA TAG TGT TGC TGA TGG CTT CTC
## "F" "F" "L" "L" "S" "S" "S" "S" "Y" "Y" "*" "*" "C" "C" "*" "W" "L" "L"
## CTA CTG CCT CCC CCA CCG CAT CAC CAA CAG CGT CGC CGA CGG ATT ATC ATA ATG
## "L" "L" "P" "P" "P" "P" "H" "H" "Q" "Q" "R" "R" "R" "R" "I" "I" "I" "M"
## ACT ACC ACA ACG AAT AAC AAA AAG AGT AGC AGA AGG GTT GTC GTA GTG GCT GCC
## "T" "T" "T" "T" "N" "N" "K" "K" "S" "S" "R" "R" "V" "V" "V" "V" "A" "A"
## GCA GCG GAT GAC GAA GAG GGT GGC GGA GGG
## "A" "A" "D" "D" "E" "E" "G" "G" "G" "G"
IUPAC_CODE_MAP
## A C G T M R W S Y K
## "A" "C" "G" "T" "AC" "AG" "AT" "CG" "CT" "GT"
## V H D B N
## "ACG" "ACT" "AGT" "CGT" "ACGT"
data(BLOSUM45)
data(BLOSUM50)
data(BLOSUM62)
data(BLOSUM80)
data(BLOSUM100)
data(PAM30)
data(PAM40)
data(PAM70)
data(PAM120)
data(PAM250)
nucleotideSubstitutionMatrix()
qualitySubstitutionMatrices()
errorSubstitutionMatrices()
三、Biostrings定义的类(序列容器)
1、容纳单一序列的类
getClass("XString")
## Virtual Class "XString" [package "Biostrings"]
##
## Slots:
##
## Name: shared offset length elementMetadata
## Class: SharedRaw integer integer DataTableORNULL
##
## Name: metadata
## Class: list
##
## Extends:
## Class "XRaw", directly
## Class "XVector", by class "XRaw", distance 2
## Class "Vector", by class "XRaw", distance 3
## Class "Annotated", by class "XRaw", distance 4
##
## Known Subclasses: "BString", "DNAString", "RNAString", "AAString"
xxx <- "qwertyuiopasdfghjklzxcvbnm,./;'@#$%^&*()_+~"
(BString(xxx))
## 43-letter "BString" instance
## seq: qwertyuiopasdfghjklzxcvbnm,./;'@#$%^&*()_+~
(DNAString(rndSeq(DNA_BASES, 20)))
## 20-letter "DNAString" instance
## seq: GGGTATGGTTCGTCTTTGCC
# 用法错误
(DNAString(xxx))
## Error: key 113 (char 'q') not in lookup table
(RNAString(rndSeq(RNA_BASES, 20)))
## 20-letter "RNAString" instance
## seq: UCUAGAUUUUCACGACGCUG
# 用法错误
(RNAString(xxx))
## Error: key 113 (char 'q') not in lookup table
(AAString(rndSeq(AA_ALPHABET, 20)))
## 20-letter "AAString" instance
## seq: LYPBXIC+UNHSFMIBPPVS
# 下面语句并不出错
(AAString(xxx))
## 43-letter "AAString" instance
## seq: qwertyuiopasdfghjklzxcvbnm,./;'@#$%^&*()_+~
2、容纳序列集的类
getClass("XStringSet")
## Virtual Class "XStringSet" [package "Biostrings"]
##
## Slots:
##
## Name: pool ranges elementType elementMetadata
## Class: SharedRaw_Pool GroupedIRanges character DataTableORNULL
##
## Name: metadata
## Class: list
##
## Extends:
## Class "XRawList", directly
## Class "XVectorList", by class "XRawList", distance 2
## Class "List", by class "XRawList", distance 3
## Class "Vector", by class "XRawList", distance 4
## Class "Annotated", by class "XRawList", distance 5
##
## Known Subclasses:
## Class "BStringSet", directly
## Class "DNAStringSet", directly
## Class "RNAStringSet", directly
## Class "AAStringSet", directly
## Class "QualityScaledXStringSet", directly
## Class "XStringQuality", by class "BStringSet", distance 2
## Class "PhredQuality", by class "BStringSet", distance 3
## Class "SolexaQuality", by class "BStringSet", distance 3
## Class "IlluminaQuality", by class "BStringSet", distance 3
## Class "QualityScaledBStringSet", by class "BStringSet", distance 2
## Class "QualityScaledDNAStringSet", by class "DNAStringSet", distance 2
## Class "QualityScaledRNAStringSet", by class "RNAStringSet", distance 2
## Class "QualityScaledAAStringSet", by class "AAStringSet", distance 2
set.seed(1000)
DNAs <- mapply(rndSeq, list(DNA_BASES), c(10000, 20000, 50000, 25000))
names(DNAs) <- paste("SEQ", 1:4, sep = "-")
(DNAset <- DNAStringSet(DNAs))
## A DNAStringSet instance of length 4
## width seq names
## [1] 10000 CTAGGAGGACCTCTTACGAGA...TTGCCCTAACCTCTCCCATG SEQ-1
## [2] 20000 TACATGATGGGGCCATTGGTG...TGGCTTGAAATGAGGGAATA SEQ-2
## [3] 50000 CGCGCGGTGACTTAACGTGGA...GATGACTCTACGACTATACC SEQ-3
## [4] 25000 CAATAATTCGGCCTAGGTGCG...GGCAAACCCACTCGCACCGG SEQ-4
(chrs <- readDNAStringSet(file.choose(), "fasta"))
## A DNAStringSet instance of length 7
## width seq names
## [1] 30427671 CCCTAAACCCTAAACCCTA...TAGGGTTTAGGGTTTAGGG Chr1 CHROMOSOME
## [2] 19698289 NNNNNNNNNNNNNNNNNNN...TAGGGTTTAGGGTTTAGGG Chr2 CHROMOSOME
## [3] 23459830 NNNNNNNNNNNNNNNNNNN...AACCCTAAACCCTAAACCC Chr3 CHROMOSOME
## [4] 18585056 NNNNNNNNNNNNNNNNNNN...TTAGGGTTTAGGGTTTAGG Chr4 CHROMOSOME
## [5] 26975502 TATACCATGTACCCTCAAC...GATTTAGGGTTTTTAGATC Chr5 CHROMOSOME
## [6] 154478 ATGGGCGAACGACGGGAAT...TAACTTGGTCCCGGGCATC chloroplast CHROM...
## [7] 366924 GGATCCGTTCGAAACAGGT...AATGGAAACAAACCGGATT mitochondria CHRO...
rm(chrs)
RNAStringSet(DNAset)
## A RNAStringSet instance of length 4
## width seq names
## [1] 10000 CUAGGAGGACCUCUUACGAGA...UUGCCCUAACCUCUCCCAUG SEQ-1
## [2] 20000 UACAUGAUGGGGCCAUUGGUG...UGGCUUGAAAUGAGGGAAUA SEQ-2
## [3] 50000 CGCGCGGUGACUUAACGUGGA...GAUGACUCUACGACUAUACC SEQ-3
## [4] 25000 CAAUAAUUCGGCCUAGGUGCG...GGCAAACCCACUCGCACCGG SEQ-4
# 使用start和end 参数获取子序列
DNAStringSet(DNAset, start = seq(1000, length = 4, by = 1000), end = seq(1500,
length = 4, by = 1000))
## A DNAStringSet instance of length 4
## width seq names
## [1] 501 GTTGCCGAACAGAGCACGGCT...AGGGAATCGTTAGCGATGAC SEQ-1
## [2] 501 AGCGGTGATGTCTGACATTGA...CTATCGCGTGGGACTAGCAC SEQ-2
## [3] 501 CCAGCGTAGTTGGGAGAATTG...ACTGCACCCGCTTCGTTGTA SEQ-3
## [4] 501 AGCACTGAGGAACAGCTGTAG...GGGAGCCGACACTAAAATTC SEQ-4
# 使用 end(或start)和width参数获取子序列
DNAStringSet(DNAset, end = c(234, 3000, 1029), width = 100)
## A DNAStringSet instance of length 4
## width seq names
## [1] 100 TCATGAAGGGGCTGCTCGGGT...ACTATTTCCCCGCTTGCAGG SEQ-1
## [2] 100 GGGATCATGATCGTACGCTAT...CACGCTCCAGGCCTATGAGG SEQ-2
## [3] 100 CCCATGTGGCGTTCTTATAGG...CATCAGTCAATAATCATACG SEQ-3
## [4] 100 CCACTATGGCGCGACGTAGAC...TTGTTTATCTTACTCCTCAT SEQ-4
class(DNAset)
## [1] "DNAStringSet"
## attr(,"package")
## [1] "Biostrings"
class(DNAset[1])
## [1] "DNAStringSet"
## attr(,"package")
## [1] "Biostrings"
DNAStringSet(DNAset, start = seq(1000, length = 2, by = 1000), width = c(5,
10, 15))
## A DNAStringSet instance of length 4
## width seq names
## [1] 5 GTTGC SEQ-1
## [2] 10 AGCGGTGATG SEQ-2
## [3] 15 GCTTAAGTCCCATCA SEQ-3
## [4] 5 TGTCC SEQ-4
DNAStringSet(DNAset[1], start = seq(1000, length = 2, by = 1000), width = c(5,
10, 15))
## Error: 'start', 'end' or 'width' is longer than 'refwidths'
class(DNAset[[1]])
## [1] "DNAString"
## attr(,"package")
## [1] "Biostrings"
DNAStringSet(DNAset[[1]], start = seq(1000, length = 2, by = 1000), width = c(5,
10, 15))
## A DNAStringSet instance of length 3
## width seq
## [1] 5 GTTGC
## [2] 10 ATAAATGGGA
## [3] 15 GTTGCCGAACAGAGC
DNAStringSet(DNAset[[1]], start = seq(1000, length = 2, by = 1e+05), width = c(5,
10, 15))
## Error: solving row 2: 'allow.nonnarrowing' is FALSE and the supplied start
## (101000) is > refwidth + 1
DNAStringSet(DNAset[[1]], start = seq(1000, length = 2, by = 1000), width = 1e+05)
## Error: solving row 1: 'allow.nonnarrowing' is FALSE and the solved end
## (100999) is > refwidth
3、XStringViews类
getClass("XStringViews")
## Class "XStringViews" [package "Biostrings"]
##
## Slots:
##
## Name: subject ranges elementType elementMetadata
## Class: XString IRanges character DataTableORNULL
##
## Name: metadata
## Class: list
##
## Extends:
## Class "Views", directly
## Class "List", by class "Views", distance 2
## Class "Vector", by class "Views", distance 3
## Class "Annotated", by class "Views", distance 4
##
## Known Subclasses: "XStringPartialMatches"
Views(DNAset[[1]], start = seq(1000, length = 2, by = 1000), width = 100)
## Views on a 10000-letter DNAString subject
## subject: CTAGGAGGACCTCTTACGAGATCAGGCTAAGA...ACGTTCACTATTTGCCCTAACCTCTCCCATG
## views:
## start end width
## [1] 1000 1099 100 [GTTGCCGAACAGAGCACGGCTCGAT...ACGACCGTTTTGTATAGGAATGAC]
## [2] 2000 2099 100 [ATAAATGGGATCCGATTCAGGTTCC...CCAAATATGAGCACCGGCGCCACT]
Views(DNAs[1], start = seq(1000, length = 2, by = 1000), width = 100)
## Views on a 10000-letter BString subject
## subject: CTAGGAGGACCTCTTACGAGATCAGGCTAAGA...ACGTTCACTATTTGCCCTAACCTCTCCCATG
## views:
## start end width
## [1] 1000 1099 100 [GTTGCCGAACAGAGCACGGCTCGAT...ACGACCGTTTTGTATAGGAATGAC]
## [2] 2000 2099 100 [ATAAATGGGATCCGATTCAGGTTCC...CCAAATATGAGCACCGGCGCCACT]
Views(DNAs, start = seq(1000, length = 2, by = 1000), width = 100)
## Error: zero or more than one input sequence
Views(DNAset[1], start = seq(1000, length = 2, by = 1000), width = 100)
## Error: unable to find an inherited method for function 'Views' for
## signature '"DNAStringSet"'
successiveViews(subject, width, gapwidth = 0, from = 1)
(successiveViews(DNAset[[1]], width = rep(20, 4), gapwidth = 0, from = 1))
## Views on a 10000-letter DNAString subject
## subject: CTAGGAGGACCTCTTACGAGATCAGGCTAAGA...ACGTTCACTATTTGCCCTAACCTCTCCCATG
## views:
## start end width
## [1] 1 20 20 [CTAGGAGGACCTCTTACGAG]
## [2] 21 40 20 [ATCAGGCTAAGAGTGGATGC]
## [3] 41 60 20 [TGAGTGGGCGGGAGGGGGTC]
## [4] 61 80 20 [CATTATCTATGGCCTAAAAA]
(successiveViews(DNAset[[1]], width = rep(20, 4), gapwidth = 80, from = 1))
## Views on a 10000-letter DNAString subject
## subject: CTAGGAGGACCTCTTACGAGATCAGGCTAAGA...ACGTTCACTATTTGCCCTAACCTCTCCCATG
## views:
## start end width
## [1] 1 20 20 [CTAGGAGGACCTCTTACGAG]
## [2] 101 120 20 [GCAGGTAGGCTGAGGATAAA]
## [3] 201 220 20 [ACGCCCTGGGGGGGACTATT]
## [4] 301 320 20 [ACTAGTTTCGAGACGAGCAA]
(successiveViews(DNAset[[1]], width = rep(20, 4), gapwidth = -19, from = 1))
## Views on a 10000-letter DNAString subject
## subject: CTAGGAGGACCTCTTACGAGATCAGGCTAAGA...ACGTTCACTATTTGCCCTAACCTCTCCCATG
## views:
## start end width
## [1] 1 20 20 [CTAGGAGGACCTCTTACGAG]
## [2] 2 21 20 [TAGGAGGACCTCTTACGAGA]
## [3] 3 22 20 [AGGAGGACCTCTTACGAGAT]
## [4] 4 23 20 [GGAGGACCTCTTACGAGATC]
4、MaskedXString掩膜序列类
getClass("MaskedXString")
## Virtual Class "MaskedXString" [package "Biostrings"]
##
## Slots:
##
## Name: unmasked masks
## Class: XString MaskCollection
##
## Known Subclasses: "MaskedBString", "MaskedDNAString", "MaskedRNAString", "MaskedAAString"
DNAset[[1]]
## 10000-letter "DNAString" instance
## seq: CTAGGAGGACCTCTTACGAGATCAGGCTAAGAGT...GTACGTTCACTATTTGCCCTAACCTCTCCCATG
(mDNA <- maskMotif(DNAset[[2]], "CATTAG"))
## 20000-letter "MaskedDNAString" instance (# for masking)
## seq: TACATGATGGGGCCATTGGTGGACAGCGTTTTTA...CTCATGGTACTCGTGGCTTGAAATGAGGGAATA
## masks:
## maskedwidth maskedratio active desc
## 1 12 6e-04 TRUE CATTAG-blocks
as(mDNA, "Views")
## Views on a 20000-letter DNAString subject
## subject: TACATGATGGGGCCATTGGTGGACAGCGTTTT...CATGGTACTCGTGGCTTGAAATGAGGGAATA
## views:
## start end width
## [1] 1 3129 3129 [TACATGATGGGGCCATTGGTGGAC...GACCTACCACGCATCGTAATCCAT]
## [2] 3136 4979 1844 [GTTCGTCCGTTCAGAAGATATCCG...AAAGCGCGTGCCACTCTGGTGTGC]
## [3] 4986 20000 15015 [AATCGATAGAGTGTAGGGCCCAGG...CTCGTGGCTTGAAATGAGGGAATA]
mask(DNAset[[2]], "CATTAG")
## Views on a 20000-letter DNAString subject
## subject: TACATGATGGGGCCATTGGTGGACAGCGTTTT...CATGGTACTCGTGGCTTGAAATGAGGGAATA
## views:
## start end width
## [1] 1 3129 3129 [TACATGATGGGGCCATTGGTGGAC...GACCTACCACGCATCGTAATCCAT]
## [2] 3136 4979 1844 [GTTCGTCCGTTCAGAAGATATCCG...AAAGCGCGTGCCACTCTGGTGTGC]
## [3] 4986 20000 15015 [AATCGATAGAGTGTAGGGCCCAGG...CTCGTGGCTTGAAATGAGGGAATA]
as(gaps(mDNA), "Views")
## Views on a 20000-letter DNAString subject
## subject: TACATGATGGGGCCATTGGTGGACAGCGTTTT...CATGGTACTCGTGGCTTGAAATGAGGGAATA
## views:
## start end width
## [1] 3130 3135 6 [CATTAG]
## [2] 4980 4985 6 [CATTAG]
unmasked(mDNA)
## 20000-letter "DNAString" instance
## seq: TACATGATGGGGCCATTGGTGGACAGCGTTTTTA...CTCATGGTACTCGTGGCTTGAAATGAGGGAATA
masks(mDNA)
## MaskCollection of length 1 and width 20000
## masks:
## maskedwidth maskedratio active desc
## 1 12 6e-04 TRUE CATTAG-blocks