Documentation
¶
Index ¶
- Constants
- Variables
- func AA2Index(aa uint8) (idx int, err error)
- func AlphabetFromString(alphabet string) int
- func Complement(seq []uint8) (err error)
- func DetectAlphabet(seq string) int
- func EqualOrCompatible(nt1, nt2 uint8) (ok bool, err error)
- func GenAllPossibleCodons(nt1, nt2, nt3 uint8) (codons []string)
- func Index2AA(index int) (aa uint8, err error)
- func Index2Nt(index int) (nt uint8, err error)
- func NewAlign(alphabet int) *align
- func NewPwAligner(seq1, seq2 Sequence, algo int) *pwaligner
- func NewSeqBag(alphabet int) *seqbag
- func NewSequence(name string, sequence []uint8, comment string) *seq
- func Nt2Index(nt uint8) (idx int, err error)
- func Nt2IndexIUPAC(nt uint8) (idx uint8, err error)
- func NtIUPACDifference(nt1, nt2 uint8) (diff float64, err error)
- func PossibleNtIUPAC(nt uint8) (idx []uint8, err error)
- func RandomSequence(alphabet, length int) ([]uint8, error)
- func Reverse(seq []uint8)
- type AlignChannel
- type Alignment
- type CountProfile
- func (p *CountProfile) AppendCount(i, count int) (err error)
- func (p *CountProfile) CheckLength(length int) bool
- func (p *CountProfile) Count(r uint8, site int) (count int, err error)
- func (p *CountProfile) CountAt(i, site int) (count int, err error)
- func (p *CountProfile) CountsAt(i int) (counts []int, err error)
- func (p *CountProfile) NameAt(i int) (name uint8, err error)
- func (p *CountProfile) NameIndex(r uint8) (index int, ok bool)
- func (p *CountProfile) NbCharacters() (nb int)
- func (p *CountProfile) Print()
- func (p *CountProfile) SetHeader(header []uint8)
- type Mutation
- type PairwiseAligner
- type PartitionSet
- func (ps *PartitionSet) AddRange(partName, modelName string, start, end, modulo int) (err error)
- func (ps *PartitionSet) AliLength() int
- func (ps *PartitionSet) CheckSites() (err error)
- func (ps *PartitionSet) ModeleName(code int) string
- func (ps *PartitionSet) NPartitions() int
- func (ps *PartitionSet) Partition(position int) int
- func (ps *PartitionSet) PartitionName(code int) string
- func (ps *PartitionSet) String() string
- type PhasedSequence
- type Phaser
- type SeqBag
- type Sequence
Constants ¶
const ( ALIGN_UP = iota ALIGN_LEFT ALIGN_DIAG ALIGN_STOP ALIGN_ALGO_SW = iota ALIGN_ALGO_ATG )
const ( AMINOACIDS = 0 // Amino acid sequence alphabet NUCLEOTIDS = 1 // Nucleotid sequence alphabet BOTH = 2 // Could be both UNKNOWN = 3 // Unkown alphabet GAP = '-' POINT = '.' OTHER = '*' ALL_AMINO = 'X' ALL_NUCLE = 'N' PSSM_NORM_NONE = 0 // No normalization PSSM_NORM_FREQ = 1 // Normalization by freq in the site PSSM_NORM_DATA = 2 // Normalization by aa/nt frequency in data PSSM_NORM_UNIF = 3 // Normalization by uniform frequency PSSM_NORM_LOGO = 4 // Normalization like LOGO : v(site)=freq*(log2(alphabet)-H(site)-pseudocount FORMAT_FASTA = 0 FORMAT_PHYLIP = 1 FORMAT_NEXUS = 2 FORMAT_CLUSTAL = 3 POSITION_IDENTICAL = 0 // All characters in a position are the same POSITION_CONSERVED = 1 // Same strong group POSITION_SEMI_CONSERVED = 2 // Same weak group POSITION_NOT_CONSERVED = 3 // None of the above values GENETIC_CODE_STANDARD = 0 // Standard genetic code GENETIC_CODE_VETEBRATE_MITO = 1 // Vertebrate mitochondrial genetic code GENETIC_CODE_INVETEBRATE_MITO = 2 // Invertebrate mitochondrial genetic code IGNORE_NONE = 0 IGNORE_NAME = 1 IGNORE_SEQUENCE = 2 // IUPAC Nucleotide Code : For bitwise operations NT_OTHER = 0 // GAP, *, etc;? NT_A = 1 NT_C = 2 NT_G = 4 NT_T = 8 NT_R = NT_A | NT_G NT_Y = NT_C | NT_T NT_S = NT_G | NT_C NT_W = NT_A | NT_T NT_K = NT_G | NT_T NT_M = NT_A | NT_C NT_B = NT_C | NT_G | NT_T NT_D = NT_A | NT_G | NT_T NT_H = NT_A | NT_C | NT_T NT_V = NT_A | NT_C | NT_G NT_N = NT_A | NT_C | NT_G | NT_T )
Variables ¶
var IupacCode = map[uint8][]uint8{
'A': {'A'},
'C': {'C'},
'G': {'G'},
'T': {'T'},
'R': {'A', 'G'},
'Y': {'C', 'T'},
'S': {'G', 'C'},
'W': {'A', 'T'},
'K': {'G', 'T'},
'M': {'A', 'C'},
'B': {'C', 'G', 'T'},
'D': {'A', 'G', 'T'},
'H': {'A', 'C', 'T'},
'V': {'A', 'C', 'G'},
'N': {'A', 'C', 'G', 'T'},
'-': {'-'},
}
Functions ¶
func AlphabetFromString ¶ added in v0.2.3
AlphabetFromString converts the alphabet name to its code. If the alphabet name is not known, it returns align.UNKNOWN.
func DetectAlphabet ¶
func EqualOrCompatible ¶ added in v0.3.4
EqualOrCompatible returns true if the two nucleotides are identical, or if they are compatible in case they are ambiguous.
For example: Y: {C | T} is compatible with S: {G | C} because they have one nucleotide in common. If nt1 or nt2 is not a nucleotide, an error is returned. nt1 and nt2 values are taken from the NT_... constants in const.go.
func GenAllPossibleCodons ¶ added in v0.3.4
GenAllPossibleCodons generates all possible codons given the 3 nucleotides in arguments. Multiple codons may exist if IUPAC code is employed (R=A|G, etc.). The 3 nucleotides in arguments are converted to upper case and U is converted to T. If one character does not correspond to a known nucleotide in IUPAC code, then it returns an empty slice. If one of the nucleotides is a GAP, then it returns an empty slice.
For example GenAllPossibleCodons('A','G','N') should return {"AGA","AGC","AGG","AGT"}.
func NewPwAligner ¶ added in v0.3.0
func NewSequence ¶
func Nt2IndexIUPAC ¶ added in v0.3.4
Nt2IndexIUPAC returns the integer code of the given nucleotide. The given uint8 is converted to upper case first. Ex: 'B': NT_B
func NtIUPACDifference ¶ added in v0.3.4
NtIUPACDifference returns the cost of the difference between the two potentially ambiguous nucleotides.
- If the two nucleotides are identical: returns 0.0
- If the two nucleotides are different:
  - If neither is ambiguous: returns 1.0
  - Otherwise, returns 1-Card(I)/Card(U), I being the intersection of the sets of possible nucleotides of nt1 and nt2, and U being the union of the sets of possible nucleotides of nt1 and nt2.
For example, if we want to compare Y and S : Y = {C | T} and S = {G | C}. Card(I)=1, Card(U)=3, so diff=2/3
Notes:
- For N vs. A, for example, the difference is 1-1/4 = 3/4.
- For gaps: returns diff=1.0.
nt1 and nt2 values are in NT_... of const.go
func PossibleNtIUPAC ¶ added in v0.3.4
PossibleNtIUPAC returns the possible meanings of the given IUPAC nucleotide. Ex: NT_B : {NT_C, NT_G, NT_T}
func RandomSequence ¶ added in v0.1.3
Types ¶
type AlignChannel ¶ added in v0.2.4
AlignChannel is used for iterating over alignments
type Alignment ¶
type Alignment interface { SeqBag AddGaps(rate, lenprop float64) Append(Alignment) error // Appends alignment sequences to this alignment AvgAllelesPerSite() float64 BuildBootstrap(frac float64) Alignment // Bootstrap alignment CharStatsSite(site int) (map[uint8]int, error) Clone() (Alignment, error) CodonAlign(ntseqs SeqBag) (codonAl *align, err error) // Remove identical patterns/sites and return number of occurence // of each pattern (order of patterns/sites may have changed) Compress() []int // concatenates the given alignment with this alignment Concat(Alignment) error // Computes the majority consensus of the given alignemnt // To do so, it takes the majority character at each alignment site // if ignoreGaps is true, then gaps are not taken into account for majority computation (except if only Gaps) // if ignoreNs is true, then Ns are not taken into account for majority computation (except if only Ns) Consensus(ignoreGaps, ignoreNs bool) *align // Compares all sequences to the first one and counts all differences per sequence // // - alldiffs: The set of all differences that have been seen at least once // - diffs : The number of occurences of each difference, for each sequence // Sequences are ordered as the original alignment. Differences are // written as REFNEW, ex: diffs["AC"]=12 . CountDifferences() (alldiffs []string, diffs []map[string]int) // Compares all sequences to the first one and replace identical characters with . DiffWithFirst() Entropy(site int, removegaps bool) (float64, error) // Entropy of the given site // Positions of potential frameshifts // if startinggapsasincomplete is true, then considers gaps as the beginning // as incomplete sequence, then take the right phase Frameshifts(startingGapsAsIncomplete bool) []struct{ Start, End int } // Returns informative positions of the alignment. 
Informative positions // are sites that contain at least two characters that occur at least twice each // X, N and GAPS are not considered in this definition InformativeSites() (sites []int) // Positions of potential stop in frame // if startinggapsasincomplete is true, then considers gaps as the beginning // as incomplete sequence, then take the right phase Stops(startingGapsAsIncomplete bool, geneticode int) (stops []int, err error) Length() int // Length of the alignment // maskreplace defines the replacing character. If maskreplace is "", then, masked characters // are replaced by "N" or "X" depending on the alphabet. Orherwise: // 1) if maskreplace is AMBIG: just like "" // 2) if maskreplace is MAJ: Replacing character is most frequent character of the column // 3) if maskreplace is GAP: Replacing character is a GAP // if nogap is true, then Mask will not replace gaps with the replacement character // if noref is true, then does not replace the character if it is the same as the reference sequences (only if refseq is specified). Mask(refseq string, start, length int, maskreplace string, nogap, noref bool) error // Masks given positions // Masks unique mutations in the given aligment (not the gaps). // If refseq is not "" then masks unique characters if // 1) they are different from the given reference sequence // 2) or if the reference is a GAP // maskreplace defines the replacing character. If maskreplace is "", then, masked characters // are replaced by "N" or "X" depending on the alphabet. Orherwise: // 1) if maskreplace is AMBIG: just like "" // 2) if maskreplace is MAJ: Replacing character is most frequent character of the column // 3) if maskreplace is GAP: Replacing character is a GAP MaskUnique(refseq string, maskreplace string) error // Masks mutations that appear less or equal than the given number of max occurences in their columns (not the gaps). 
// If refseq is not "" then masks these characters if // 1) they are different from the given reference sequence // 2) or if the reference is a GAP // maskreplace defines the replacing character. If maskreplace is "", then, masked characters // are replaced by "N" or "X" depending on the alphabet. Orherwise: // 1) if maskreplace is AMBIG: just like "" // 2) if maskreplace is MAJ: Replacing character is most frequent character of the column // 3) if maskreplace is GAP: Replacing character is a GAP MaskOccurences(refseq string, maxOccurence int, maskreplace string) error MaxCharStats(excludeGaps, excludeNs bool) (out []uint8, occur []int, total []int) Mutate(rate float64) // Adds uniform substitutions in the alignment (~sequencing errors) NbVariableSites() int // Nb of variable sites // Number of Gaps in each sequence that are unique in their alignment site NumGapsUniquePerSequence(countProfile *CountProfile) (numuniques []int, numnew []int, numboth []int, err error) // returns the number of characters in each sequence that are unique in their alignment site (gaps or others) // It does not take into account 'N' and '-' as unique mutations NumMutationsUniquePerSequence(profile *CountProfile) (numuniques []int, numnew []int, nummuts []int, err error) Pssm(log bool, pseudocount float64, normalization int) (pssm map[uint8][]float64, err error) // Normalization: PSSM_NORM_NONE, PSSM_NORM_UNIF, PSSM_NORM_DATA Rarefy(nb int, counts map[string]int) (Alignment, error) // Take a new rarefied sample taking into accounts weights RandSubAlign(length int, consecutive bool) (Alignment, error) // Extract a random subalignment with given length from this alignment Recombine(rate float64, lenprop float64) // converts coordinates on the given sequence to coordinates on the alignment RefCoordinates(name string, refstart, refend int) (alistart, aliend int, err error) // converts sites on the given sequence to coordinates on the alignment RefSites(name string, sites []int) (refsites 
[]int, err error) // Overwrites the character at position "site" of the sequence "seqname" by "newchar" ReplaceChar(seqname string, site int, newchar uint8) error // Removes sequences having >= cutoff gaps, returns number of removed sequences RemoveGapSeqs(cutoff float64, ignoreNs bool) int // Removes sequences having >= cutoff character, returns number of removed sequences RemoveCharacterSeqs(c uint8, cutoff float64, ignoreCase, ignoreGaps, ignoreNs bool) int // Removes sites having >= cutoff gaps, returns the number of consecutive removed sites at start and end of alignment RemoveGapSites(cutoff float64, ends bool) (first, last int, kept, removed []int) // Removes sites having >= cutoff character, returns the number of consecutive removed sites at start and end of alignment RemoveCharacterSites(c []uint8, cutoff float64, ends bool, ignoreCase, ignoreGaps, ignoreNs, reverse bool) (first, last int, kept, removed []int) // Removes sites having >= cutoff of the main character at these sites, returns the number of consecutive removed sites at start and end of alignment RemoveMajorityCharacterSites(cutoff float64, ends, ignoreGaps, ignoreNs bool) (first, last int, kept, removed []int) // Replaces match characters (.) 
by their corresponding characters on the first sequence ReplaceMatchChars() Sample(nb int) (Alignment, error) // generate a sub sample of the sequences ShuffleSites(rate float64, roguerate float64, randroguefirst bool) []string SimulateRogue(prop float64, proplen float64) ([]string, []string) // add "rogue" sequences SiteConservation(position int) (int, error) // If the site is conserved: Split(part *PartitionSet) ([]Alignment, error) //Splits the alignment given the paritions in argument SubAlign(start, length int) (Alignment, error) // Extract a subalignment from this alignment SelectSites(sites []int) (Alignment, error) // Extract givens sites from the alignment InverseCoordinates(start, length int) (invstarts, invlengths []int, err error) InversePositions(sites []int) (invsites []int, err error) Swap(rate float64) // TranslateByReference translates the alignment codon by codon using the given reference sequence as guide // We traverse reference nt 3 by 3 // The reference codon may have gaps between nt , // ex 1: // Ref: AC--GTACGT // Seq: ACTTGTACGT // In that case, the first ref codon is [0,1,4], corresponding to sequence ACTTG in seq // ACTTG % 3 != 0 ==> Frameshift? => Replaced by X in the compared sequence. // ex 2: // Ref: AC---GTACGT // Seq: ACTTTGTACGT // ref codon: [0,1,5] // seq : ACTTTG : Insertion - OK => Replaced by "T-" in ref and "TT" in seq // ex 3: // Ref: ACGTACGT // Seq: A--TACGT // ref codon: [0,1,2] // seq : A--: Deletion: not ok : Frameshift? => Replaced by "T" in ref and "X" in comp // ex 4: // Ref: AC----GTACGT // Seq: ACTT-TGTACGT // ref codon: [0,1,6] // seq : ACTTTG : Insertion - OK => Replaced by "T-" in ref and "TT" in seq // ex 5: // Ref: AC----GTACGT // Seq: ACT--TGTACGT // ref codon: [0,1,6] // seq : ACTTTG : Insertion not OK : Frameshift? 
=> Replaced by "T-" in ref and "XX" in seq TranslateByReference(phase int, geneticcode int, refseq string) (err error) Transpose() (Alignment, error) // Output sequences are made of sites and output sites are sequences TrimSequences(trimsize int, fromStart bool) error }
Alignment represents a set of aligned sequences (multiple Sequence Alignment)
func RandomAlignment ¶ added in v0.1.3
RandomAlignment generates a random alignment with a given alphabet, length and number of sequences. Each character is randomly chosen from a uniform distribution.
type CountProfile ¶ added in v0.3.4
type CountProfile struct {
// contains filtered or unexported fields
}
CountProfile represents a simple view of an alignment and stores the number of occurrences of each character at each position of an alignment.
func NewCountProfile ¶ added in v0.3.4
func NewCountProfile() (p *CountProfile)
NewCountProfile initializes a new Profile with nil attributes
func NewCountProfileFromAlignment ¶ added in v0.3.4
func NewCountProfileFromAlignment(al Alignment) (p *CountProfile)
NewCountProfileFromAlignment initializes a new CountProfile using an input alignment
func (*CountProfile) AppendCount ¶ added in v0.3.4
func (p *CountProfile) AppendCount(i, count int) (err error)
AppendCount appends a new site to the profile for the ith character, and associates count to it
func (*CountProfile) CheckLength ¶ added in v0.3.4
func (p *CountProfile) CheckLength(length int) bool
CheckLength returns true if the number of sites of the profile corresponds to the given length, and false otherwise.
func (*CountProfile) Count ¶ added in v0.3.4
func (p *CountProfile) Count(r uint8, site int) (count int, err error)
Count returns the number of occurrences of the character r at the position site
func (*CountProfile) CountAt ¶ added in v0.3.4
func (p *CountProfile) CountAt(i, site int) (count int, err error)
CountAt returns the number of occurrences of the ith character at the position site
func (*CountProfile) CountsAt ¶ added in v0.3.4
func (p *CountProfile) CountsAt(i int) (counts []int, err error)
CountsAt returns the counts for all sites, for the ith character (arbitrary order of character)
func (*CountProfile) NameAt ¶ added in v0.3.4
func (p *CountProfile) NameAt(i int) (name uint8, err error)
NameAt returns the name of ith character in the header
func (*CountProfile) NameIndex ¶ added in v0.3.4
func (p *CountProfile) NameIndex(r uint8) (index int, ok bool)
NameIndex returns the index of the given character in the header. If the character does not exist, it returns false.
func (*CountProfile) NbCharacters ¶ added in v0.3.4
func (p *CountProfile) NbCharacters() (nb int)
NbCharacters returns the number of different characters in the profile
func (*CountProfile) Print ¶ added in v0.3.4
func (p *CountProfile) Print()
func (*CountProfile) SetHeader ¶ added in v0.3.4
func (p *CountProfile) SetHeader(header []uint8)
SetHeader sets the Header and initializes the count structure
type PairwiseAligner ¶ added in v0.3.0
type PairwiseAligner interface { AlignEnds() (int, int) AlignStarts() (int, int) Seq1Ali() []uint8 Seq2Ali() []uint8 SetGapOpenScore(open float64) SetGapExtendScore(extend float64) SetScore(match, mismatch float64) MaxScore() float64 // Maximum score of the alignment NbMatches() int // Number of matches NbMisMatches() int // Number of mismatches NbGaps() int // Nuber of gaps Length() int // Length of the alignment Alignment() (Alignment, error) AlignmentStr() string }
type PartitionSet ¶ added in v0.3.2
type PartitionSet struct {
// contains filtered or unexported fields
}
func NewPartitionSet ¶ added in v0.3.2
func NewPartitionSet(alignmentLength int) (ps *PartitionSet)
func (*PartitionSet) AddRange ¶ added in v0.3.2
func (ps *PartitionSet) AddRange(partName, modelName string, start, end, modulo int) (err error)
func (*PartitionSet) AliLength ¶ added in v0.3.2
func (ps *PartitionSet) AliLength() int
AliLength returns the length of the alignment.
func (*PartitionSet) CheckSites ¶ added in v0.3.2
func (ps *PartitionSet) CheckSites() (err error)
CheckSites returns an error if not all sites are in a partition.
func (*PartitionSet) ModeleName ¶ added in v0.3.2
func (ps *PartitionSet) ModeleName(code int) string
ModeleName returns the name of the model associated with the given code. If the code does not exist, it returns "".
func (*PartitionSet) NPartitions ¶ added in v0.3.2
func (ps *PartitionSet) NPartitions() int
func (*PartitionSet) Partition ¶ added in v0.3.2
func (ps *PartitionSet) Partition(position int) int
Returns the partition code associated to the given position
If the position is outside the alignment, then returns -1
func (*PartitionSet) PartitionName ¶ added in v0.3.2
func (ps *PartitionSet) PartitionName(code int) string
PartitionName returns the name of the partition associated with the given code. If the code does not exist, it returns "".
func (*PartitionSet) String ¶ added in v0.3.2
func (ps *PartitionSet) String() string
type PhasedSequence ¶ added in v0.3.0
type PhasedSequence struct { Err error Removed bool Position int // phased nt sequence NtSeq Sequence // phased nt sequence // with first nt corresponding // first position of aa codon CodonSeq Sequence // phased aa sequence AaSeq Sequence // Aligned sequences // 1st: best found orf // 2nd: sequence Ali Alignment }
type Phaser ¶ added in v0.3.0
type Phaser interface { Phase(orfs, seqs SeqBag) (chan PhasedSequence, error) SetLenCutoff(cutoff float64) SetMatchCutoff(cutoff float64) SetReverse(reverse bool) SetCutEnd(cutend bool) SetCpus(cpus int) SetTranslate(translate bool, geneticcode int) (err error) SetAlignScores(match, mismatch float64) SetGapOpen(float64) SetGapExtend(float64) }
* If SetTranslate(true):
Aligns all sequences to the given ORF and trims sequences to the start position. If orf is nil, searches for the longest ORF (in 3 or 6 phases depending on reverse arg) in all sequences.
To do so, Phase() will:
- Translate the given ORF in aminoacids;
- For each sequence of the dataset: translate it in the 3 phases (forward) if reverse is false or 6 phases (forward and reverse) if reverse is true, align it with the translated orf, and take the phase giving the best alignment; If no phase gives a good alignment (>lencutoff * orf length, >matchcutoff matches over the align length and starting at first position of the ORF), then the sequence is discarded;
- For each sequence, take the Start corresponding to the Start of the ORF, and remove nucleotides before;
- Return the trimmed nucleotidic sequences (phased), the corresponding amino-acid sequences (phasedaa), the positions of starts in the nucleotidic sequences, and the removed sequence names.
If cutend is true, then also remove the end of sequences that do not align with orf.
It does not modify the input object.
* If SetTranslate(false):
Aligns all sequences to the given ORF and trims sequences to the start position; it does not take into account protein information.
If orf is nil, searches for the longest ORF (in forward only or both strands depending on reverse arg) in all sequences.
To do so:
1. If the alignment is not good enough (a good alignment covers >lencutoff * orf length, has >matchcutoff matches over the align length, and starts at the first position of the ORF), then the sequence is discarded;
2. For each sequence, take the Start corresponding to the Start of the ORF, and remove nucleotides before;
3. Return the trimmed nucleotidic sequences (phased), the positions of starts in the nucleotidic sequences, and the removed sequence names.
If cutend is true, then also remove the end of sequences that do not align with orf. It does not modify the input object.
type SeqBag ¶ added in v0.3.0
type SeqBag interface { AddSequence(name string, sequence string, comment string) error AddSequenceChar(name string, sequence []uint8, comment string) error AppendSeqIdentifier(identifier string, right bool) Alphabet() int AlphabetStr() string AlphabetCharacters() []uint8 AlphabetCharToIndex(c uint8) int // Returns index of the character (nt or aa) in the AlphabetCharacters() array AutoAlphabet() // detects and sets alphabet automatically for all the sequences CharStats() map[uint8]int64 UniqueCharacters() []uint8 CharStatsSeq(idx int) (map[uint8]int, error) // Computes frequency of characters for the given sequence CleanNames(namemap map[string]string) // Clean sequence names (newick special char) Clear() // Removes all sequences CloneSeqBag() (seqs SeqBag, err error) // Clones the seqqbag Deduplicate() (identical [][]string, err error) // Remove duplicate sequences FilterLength(minlength, maxlength int) error // Remove sequences whose length is <minlength or >maxlength GetSequence(name string) (string, bool) // Get a sequence by names GetSequenceById(ith int) (string, bool) GetSequenceChar(name string) ([]uint8, bool) GetSequenceCharById(ith int) ([]uint8, bool) GetSequenceNameById(ith int) (string, bool) GetSequenceByName(name string) (Sequence, bool) GetSequenceIdByName(name string) (i int) // if the name does not exist, i < 0 SetSequenceChar(ithAlign, ithSite int, char uint8) error // IgnoreIdentical sets the behavior when duplicate names are encountered while building the alignment // If ignore is IGNORE_NONE: Does not ignore anything // If ignore is IGNORE_NAME: Ignore sequences having the same name (keep the first one whatever their sequence) // If ignore is IGNORE_SEQUENCE: Ignore sequences having the same name and the same sequence // Otherwise, sets IGNORE_NONE IgnoreIdentical(int) SampleSeqBag(nb int) (SeqBag, error) // generate a sub sample of the sequences Sequence(ith int) (Sequence, bool) SequenceByName(name string) (Sequence, bool) 
Identical(SeqBag) bool Iterate(it func(name string, sequence string) bool) IterateChar(it func(name string, sequence []uint8) bool) IterateAll(it func(name string, sequence []uint8, comment string) bool) Sequences() []Sequence SequencesChan() chan Sequence LongestORF(reverse bool) (orf Sequence, err error) MaxNameLength() int // maximum sequence name length NbSequences() int RarefySeqBag(nb int, counts map[string]int) (SeqBag, error) // Take a new rarefied sample taking into accounts weights Rename(namemap map[string]string) RenameRegexp(regex, replace string, namemap map[string]string) error Replace(old, new string, regex bool) error // Replaces old string with new string in sequences of the alignment ShuffleSequences() // Shuffle sequence order String() string // Raw string representation (just write all sequences) Translate(phase int, geneticcode int) (err error) // Translates nt sequence in aa ReverseComplement() (err error) // Reverse-complements the alignment TrimNames(namemap map[string]string, size int) error TrimNamesAuto(namemap map[string]string, curid *int) error Sort() // Sorts the sequences by name Unalign() SeqBag }
SeqBag represents a set of unaligned sequences
type Sequence ¶
type Sequence interface { Sequence() string SequenceChar() []uint8 SameSequence([]uint8) bool CharAt(int) uint8 Name() string SetName(name string) Comment() string Length() int LongestORF() (start, end int) // Detects the longest ORF in forward strand only Reverse() Complement() error // Returns an error if not nucleotide sequence Translate(phase int, geneticcode int) (Sequence, error) // Translates the sequence using the given code DetectAlphabet() int // Try to detect alphabet (nt or aa) NumGaps() int // Number of Gaps NumGapsOpenning() int // Number of Gaps opennin, it counts streches of gap only once NumGapsFromStart() int // Number of Gaps from Start (until a non gap is encountered) NumGapsFromEnd() int // Number of Gaps from End (until a non gap is encountered) // returns the number of differences between the reference sequence and each sequence of the alignment // If lengths are different, returns an error // It does not take into account 'N' and '-' in sequences as mutations compared to ref /// sequence (ref sequence can have a '-' or a 'N') NumMutationsComparedToReferenceSequence(alphabet int, seq Sequence) (nummutations int, err error) // returns the list ofdifferences between the reference sequence and each sequence of the alignment // Counts only non N sites in each sequences (may be a gap or a N in the reference sequence though) // If a character is ambigous (IUPAC notation), then it is counted as a mutation only if it is incompatible with // the reference character. // if aa is true: the sequences are nucleotides and nucleotides are taken codon by codon of the reference sequence // to list mutations. In case of insertion or a deletion in the target sequence: if %3==0: - or aa insert, // otherwise "/" ~frameshift? // // If lengths are different, returns an error ListMutationsComparedToReferenceSequence(alphabet int, refseq Sequence, aa bool) (mutations []Mutation, err error) Clone() Sequence }