align

package

v0.3.7 Latest Latest Go to latest Published: Nov 6, 2023 License: GPL-2.0 Imports: 15 Imported by: 8

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/evolbioinfo/goalign

Links

Open Source Insights

Documentation ¶

Index ¶

Constants
Variables
func AA2Index(aa uint8) (idx int, err error)
func AlphabetFromString(alphabet string) int
func Complement(seq []uint8) (err error)
func DetectAlphabet(seq string) int
func EqualOrCompatible(nt1, nt2 uint8) (ok bool, err error)
func GenAllPossibleCodons(nt1, nt2, nt3 uint8) (codons []string)
func Index2AA(index int) (aa uint8, err error)
func Index2Nt(index int) (nt uint8, err error)
func NewAlign(alphabet int) *align
func NewPwAligner(seq1, seq2 Sequence, algo int) *pwaligner
func NewSeqBag(alphabet int) *seqbag
func NewSequence(name string, sequence []uint8, comment string) *seq
func Nt2Index(nt uint8) (idx int, err error)
func Nt2IndexIUPAC(nt uint8) (idx uint8, err error)
func NtIUPACDifference(nt1, nt2 uint8) (diff float64, err error)
func PossibleNtIUPAC(nt uint8) (idx []uint8, err error)
func RandomSequence(alphabet, length int) ([]uint8, error)
func Reverse(seq []uint8)
type AlignChannel
type Alignment
- func RandomAlignment(alphabet, length, nbseq int) (al Alignment, err error)
type CountProfile
- func NewCountProfile() (p *CountProfile)
- func NewCountProfileFromAlignment(al Alignment) (p *CountProfile)
- func (p *CountProfile) AppendCount(i, count int) (err error)
- func (p *CountProfile) CheckLength(length int) bool
- func (p *CountProfile) Count(r uint8, site int) (count int, err error)
- func (p *CountProfile) CountAt(i, site int) (count int, err error)
- func (p *CountProfile) CountsAt(i int) (counts []int, err error)
- func (p *CountProfile) NameAt(i int) (name uint8, err error)
- func (p *CountProfile) NameIndex(r uint8) (index int, ok bool)
- func (p *CountProfile) NbCharacters() (nb int)
- func (p *CountProfile) Print()
- func (p *CountProfile) SetHeader(header []uint8)
type Mutation
type PairwiseAligner
type PartitionSet
- func NewPartitionSet(alignmentLength int) (ps *PartitionSet)
- func (ps *PartitionSet) AddRange(partName, modelName string, start, end, modulo int) (err error)
- func (ps *PartitionSet) AliLength() int
- func (ps *PartitionSet) CheckSites() (err error)
- func (ps *PartitionSet) ModeleName(code int) string
- func (ps *PartitionSet) NPartitions() int
- func (ps *PartitionSet) Partition(position int) int
- func (ps *PartitionSet) PartitionName(code int) string
- func (ps *PartitionSet) String() string
type PhasedSequence
type Phaser
- func NewPhaser() Phaser
type SeqBag
type Sequence

Constants ¶

View Source

// Direction codes, numbered 0 to 3.
// NOTE(review): presumably the traceback moves of the pairwise aligner — confirm.
const (
	ALIGN_UP   = 0
	ALIGN_LEFT = 1
	ALIGN_DIAG = 2
	ALIGN_STOP = 3
)

// Algorithm codes.
// NOTE(review): presumably the values accepted as the algo argument of NewPwAligner — confirm.
//
// Their values are 4 and 5, not 0 and 1: they were historically declared with
// iota in the same const group as the direction codes, where iota keeps
// counting. The values are kept as they are for backward compatibility.
const (
	ALIGN_ALGO_SW  = 4
	ALIGN_ALGO_ATG = 5
)

View Source

// Alphabet codes.
const (
	AMINOACIDS = 0 // Amino acid sequence alphabet
	NUCLEOTIDS = 1 // Nucleotide sequence alphabet
	BOTH       = 2 // Could be both
	UNKNOWN    = 3 // Unknown alphabet
)

// Special sequence characters.
const (
	GAP   = '-'
	POINT = '.'
	OTHER = '*'

	ALL_AMINO = 'X'
	ALL_NUCLE = 'N'
)

// PSSM normalization modes.
const (
	PSSM_NORM_NONE = 0 // No normalization
	PSSM_NORM_FREQ = 1 // Normalization by freq in the site
	PSSM_NORM_DATA = 2 // Normalization by aa/nt frequency in data
	PSSM_NORM_UNIF = 3 // Normalization by uniform frequency
	PSSM_NORM_LOGO = 4 // Normalization like LOGO : v(site)=freq*(log2(alphabet)-H(site)-pseudocount
)

// Alignment format codes.
const (
	FORMAT_FASTA   = 0
	FORMAT_PHYLIP  = 1
	FORMAT_NEXUS   = 2
	FORMAT_CLUSTAL = 3
)

// Conservation levels of an alignment position.
const (
	POSITION_IDENTICAL      = 0 // All characters in a position are the same
	POSITION_CONSERVED      = 1 // Same strong group
	POSITION_SEMI_CONSERVED = 2 // Same weak group
	POSITION_NOT_CONSERVED  = 3 // None of the above values
)

// Genetic codes.
const (
	GENETIC_CODE_STANDARD         = 0 // Standard genetic code
	GENETIC_CODE_VETEBRATE_MITO   = 1 // Vertebrate mitochondrial genetic code
	GENETIC_CODE_INVETEBRATE_MITO = 2 // Invertebrate mitochondrial genetic code
)

// Behavior when duplicate sequence names are encountered (see SeqBag.IgnoreIdentical).
const (
	IGNORE_NONE     = 0
	IGNORE_NAME     = 1
	IGNORE_SEQUENCE = 2
)

// IUPAC nucleotide codes, laid out for bitwise operations: each of the four
// bases owns one bit, and every ambiguity code is the OR of the bases it may
// stand for.
const (
	NT_OTHER = 0 // GAP, *, etc.
	NT_A     = 1 << 0
	NT_C     = 1 << 1
	NT_G     = 1 << 2
	NT_T     = 1 << 3
	NT_R     = NT_A | NT_G
	NT_Y     = NT_C | NT_T
	NT_S     = NT_G | NT_C
	NT_W     = NT_A | NT_T
	NT_K     = NT_G | NT_T
	NT_M     = NT_A | NT_C
	NT_B     = NT_C | NT_G | NT_T
	NT_D     = NT_A | NT_G | NT_T
	NT_H     = NT_A | NT_C | NT_T
	NT_V     = NT_A | NT_C | NT_G
	NT_N     = NT_A | NT_C | NT_G | NT_T
)

Variables ¶

View Source

// IupacCode maps each IUPAC nucleotide character (upper case) to the list of
// plain nucleotides it may stand for; the gap character maps to itself.
// Lookups of characters that are not listed here (e.g. lower case letters or
// 'U') return a nil slice.
var IupacCode = map[uint8][]uint8{
	// Unambiguous bases
	'A': {'A'},
	'C': {'C'},
	'G': {'G'},
	'T': {'T'},
	// Two-base ambiguity codes
	'R': {'A', 'G'},
	'Y': {'C', 'T'},
	'S': {'G', 'C'},
	'W': {'A', 'T'},
	'K': {'G', 'T'},
	'M': {'A', 'C'},
	// Three-base ambiguity codes
	'B': {'C', 'G', 'T'},
	'D': {'A', 'G', 'T'},
	'H': {'A', 'C', 'T'},
	'V': {'A', 'C', 'G'},
	// Any base
	'N': {'A', 'C', 'G', 'T'},
	// Gap
	'-': {'-'},
}

Functions ¶

func AA2Index ¶ added in v0.3.1

func AA2Index(aa uint8) (idx int, err error)

func AlphabetFromString ¶ added in v0.2.3

func AlphabetFromString(alphabet string) int

AlphabetFromString converts the alphabet name to its code. If the alphabet name is not known, it returns align.UNKNOWN.

func Complement ¶ added in v0.3.0

func Complement(seq []uint8) (err error)

Complement sequence

func DetectAlphabet ¶

func DetectAlphabet(seq string) int

func EqualOrCompatible ¶ added in v0.3.4

func EqualOrCompatible(nt1, nt2 uint8) (ok bool, err error)

EqualOrCompatible returns true if the two nucleotides are identical, or if they are compatible in case they are ambiguous.

For example: Y: {C | T} is compatible with S: {G | C} because they have one nucleotide in common. If nt1 or nt2 is not a nucleotide, an error is returned. nt1 and nt2 values are taken from the NT_... constants in const.go.

func GenAllPossibleCodons ¶ added in v0.3.4

func GenAllPossibleCodons(nt1, nt2, nt3 uint8) (codons []string)

GenAllPossibleCodons generates all possible codons given the 3 nucleotides in arguments. Multiple codons may exist if IUPAC code is employed (R=A|G, etc.). The 3 nucleotides in arguments are converted to upper case and U is converted to T. If one character does not correspond to a known nucleotide in IUPAC code, then it returns an empty slice. If one of the nucleotides is a GAP, then it returns an empty slice.

For example GenAllPossibleCodons('A','G','N') should return {"AGA","AGC","AGG","AGT"}.

func Index2AA ¶ added in v0.3.1

func Index2AA(index int) (aa uint8, err error)

func Index2Nt ¶ added in v0.3.1

func Index2Nt(index int) (nt uint8, err error)

Returns the nucleotide corresponding to the given index: 0=A, 1=C, 2=G, 3=T.

func NewAlign ¶

func NewAlign(alphabet int) *align

NewAlign initializes a new alignment

func NewPwAligner ¶ added in v0.3.0

func NewPwAligner(seq1, seq2 Sequence, algo int) *pwaligner

func NewSeqBag ¶ added in v0.3.0

func NewSeqBag(alphabet int) *seqbag

func NewSequence ¶

func NewSequence(name string, sequence []uint8, comment string) *seq

func Nt2Index ¶ added in v0.3.1

func Nt2Index(nt uint8) (idx int, err error)

Returns the index of the given nucleotide: A=0, C=1, G=2, T=3.

func Nt2IndexIUPAC ¶ added in v0.3.4

func Nt2IndexIUPAC(nt uint8) (idx uint8, err error)

Returns the int code of the given nucleotide. It takes the upper case of the given uint8. Ex: 'B': NT_B

func NtIUPACDifference ¶ added in v0.3.4

func NtIUPACDifference(nt1, nt2 uint8) (diff float64, err error)

NtIUPACDifference returns the cost of the difference between the two potentially ambiguous nucleotides.

- If the two nucleotides are identical: returns 0.0.
- If the two nucleotides are different:

If neither is ambiguous: returns 1.0.
Otherwise, returns 1-Card(I)/Card(U), I being the intersection of the sets of possible nucleotides of nt1 and nt2, and U being the union of the sets of possible nucleotides of nt1 and nt2.

For example, if we want to compare Y and S : Y = {C | T} and S = {G | C}. Card(I)=1, Card(U)=3, so diff=2/3

Notes: - For N vs. A, for example, the difference is 1-1/4 = 3/4. - For gaps, returns diff=1.0.

nt1 and nt2 values are in NT_... of const.go

func PossibleNtIUPAC ¶ added in v0.3.4

func PossibleNtIUPAC(nt uint8) (idx []uint8, err error)

PossibleNtIUPAC returns the possible meaning of the given iupac nucleotide Ex: NT_B : {NT_C, NT_G, NT_T}

func RandomSequence ¶ added in v0.1.3

func RandomSequence(alphabet, length int) ([]uint8, error)

func Reverse ¶ added in v0.3.0

func Reverse(seq []uint8)

Reverses a sequence

Types ¶

type AlignChannel ¶ added in v0.2.4

// AlignChannel is used for iterating over alignments.
type AlignChannel struct {
	Achan chan Alignment // Channel carrying the alignments
	Err   error          // NOTE(review): presumably set when the producer of Achan fails — confirm against the code filling the channel
}

AlignChannel is used for iterating over alignments

type Alignment ¶

// Alignment represents a set of aligned sequences (multiple sequence alignment).
// It embeds SeqBag, so every SeqBag method is also available on an Alignment.
type Alignment interface {
	SeqBag
	AddGaps(rate, lenprop float64)
	Append(Alignment) error // Appends alignment sequences to this alignment
	AvgAllelesPerSite() float64
	BuildBootstrap(frac float64) Alignment // Bootstrap alignment
	CharStatsSite(site int) (map[uint8]int, error)
	Clone() (Alignment, error)
	CodonAlign(ntseqs SeqBag) (codonAl *align, err error)
	// Removes identical patterns/sites and returns the number of occurrences
	// of each pattern (order of patterns/sites may have changed)
	Compress() []int
	// Concatenates the given alignment with this alignment
	Concat(Alignment) error
	// Computes the majority consensus of the given alignment
	// To do so, it takes the majority character at each alignment site
	// if ignoreGaps is true, then gaps are not taken into account for majority computation (except if only Gaps)
	// if ignoreNs is true, then Ns are not taken into account for majority computation (except if only Ns)
	Consensus(ignoreGaps, ignoreNs bool) *align
	// Compares all sequences to the first one and counts all differences per sequence
	//
	// - alldiffs: The set of all differences that have been seen at least once
	// - diffs   : The number of occurrences of each difference, for each sequence
	//             Sequences are ordered as the original alignment. Differences are
	//             written as REFNEW, ex: diffs["AC"]=12 .
	CountDifferences() (alldiffs []string, diffs []map[string]int)
	// Compares all sequences to the first one and replaces identical characters with .
	DiffWithFirst()
	Entropy(site int, removegaps bool) (float64, error) // Entropy of the given site
	// Positions of potential frameshifts
	// if startingGapsAsIncomplete is true, then considers gaps at the beginning
	// as an incomplete sequence, then takes the right phase
	Frameshifts(startingGapsAsIncomplete bool) []struct{ Start, End int }
	// Returns informative positions of the alignment. Informative positions
	// are sites that contain at least two characters that occur at least twice each
	// X, N and GAPS are not considered in this definition
	InformativeSites() (sites []int)
	// Positions of potential stop in frame
	// if startingGapsAsIncomplete is true, then considers gaps at the beginning
	// as an incomplete sequence, then takes the right phase
	Stops(startingGapsAsIncomplete bool, geneticode int) (stops []int, err error)
	Length() int // Length of the alignment
	// maskreplace defines the replacing character. If maskreplace is "", then, masked characters
	// are replaced by "N" or "X" depending on the alphabet. Otherwise:
	//    1) if maskreplace is AMBIG: just like ""
	//    2) if maskreplace is MAJ: Replacing character is most frequent character of the column
	//    3) if maskreplace is GAP: Replacing character is a GAP
	// if nogap is true, then Mask will not replace gaps with the replacement character
	// if noref is true, then does not replace the character if it is the same as the reference sequences (only if refseq is specified).
	Mask(refseq string, start, length int, maskreplace string, nogap, noref bool) error // Masks given positions
	// Masks unique mutations in the given alignment (not the gaps).
	// If refseq is not "" then masks unique characters if
	//    1) they are different from the given reference sequence
	//    2) or if the reference is a GAP
	// maskreplace defines the replacing character. If maskreplace is "", then, masked characters
	// are replaced by "N" or "X" depending on the alphabet. Otherwise:
	//    1) if maskreplace is AMBIG: just like ""
	//    2)  if maskreplace is MAJ: Replacing character is most frequent character of the column
	//    3)  if maskreplace is GAP: Replacing character is a GAP
	MaskUnique(refseq string, maskreplace string) error
	// Masks mutations that appear less or equal than the given number of max occurrences in their columns (not the gaps).
	// If refseq is not "" then masks these characters if
	//    1) they are different from the given reference sequence
	//    2) or if the reference is a GAP
	// maskreplace defines the replacing character. If maskreplace is "", then, masked characters
	// are replaced by "N" or "X" depending on the alphabet. Otherwise:
	//    1) if maskreplace is AMBIG: just like ""
	//    2)  if maskreplace is MAJ: Replacing character is most frequent character of the column
	//    3)  if maskreplace is GAP: Replacing character is a GAP
	MaskOccurences(refseq string, maxOccurence int, maskreplace string) error
	MaxCharStats(excludeGaps, excludeNs bool) (out []uint8, occur []int, total []int)
	Mutate(rate float64)  // Adds uniform substitutions in the alignment (~sequencing errors)
	NbVariableSites() int // Nb of variable sites
	// Number of Gaps in each sequence that are unique in their alignment site
	NumGapsUniquePerSequence(countProfile *CountProfile) (numuniques []int, numnew []int, numboth []int, err error)
	// returns the number of characters in each sequence that are unique in their alignment site (gaps or others)
	// It does not take into account 'N' and '-' as unique mutations
	NumMutationsUniquePerSequence(profile *CountProfile) (numuniques []int, numnew []int, nummuts []int, err error)
	Pssm(log bool, pseudocount float64, normalization int) (pssm map[uint8][]float64, err error) // Normalization: PSSM_NORM_NONE, PSSM_NORM_UNIF, PSSM_NORM_DATA
	Rarefy(nb int, counts map[string]int) (Alignment, error)                                     // Take a new rarefied sample taking weights into account
	RandSubAlign(length int, consecutive bool) (Alignment, error)                                // Extract a random subalignment with given length from this alignment
	Recombine(rate float64, lenprop float64)
	// converts coordinates on the given sequence to coordinates on the alignment
	RefCoordinates(name string, refstart, refend int) (alistart, aliend int, err error)
	// converts sites on the given sequence to coordinates on the alignment
	RefSites(name string, sites []int) (refsites []int, err error)
	// Overwrites the character at position "site" of the sequence "seqname" by "newchar"
	ReplaceChar(seqname string, site int, newchar uint8) error
	// Removes sequences having >= cutoff gaps, returns number of removed sequences
	RemoveGapSeqs(cutoff float64, ignoreNs bool) int
	// Removes sequences having >= cutoff character, returns number of removed sequences
	RemoveCharacterSeqs(c uint8, cutoff float64, ignoreCase, ignoreGaps, ignoreNs bool) int
	// Removes sites having >= cutoff gaps, returns the number of consecutive removed sites at start and end of alignment
	RemoveGapSites(cutoff float64, ends bool) (first, last int, kept, removed []int)
	// Removes sites having >= cutoff character, returns the number of consecutive removed sites at start and end of alignment
	RemoveCharacterSites(c []uint8, cutoff float64, ends bool, ignoreCase, ignoreGaps, ignoreNs, reverse bool) (first, last int, kept, removed []int)
	// Removes sites having >= cutoff of the main character at these sites, returns the number of consecutive removed sites at start and end of alignment
	RemoveMajorityCharacterSites(cutoff float64, ends, ignoreGaps, ignoreNs bool) (first, last int, kept, removed []int)
	// Replaces match characters (.) by their corresponding characters on the first sequence
	ReplaceMatchChars()
	Sample(nb int) (Alignment, error) // generate a sub sample of the sequences
	ShuffleSites(rate float64, roguerate float64, randroguefirst bool) []string
	SimulateRogue(prop float64, proplen float64) ([]string, []string) // add "rogue" sequences
	SiteConservation(position int) (int, error)                       // Conservation of the site; NOTE(review): presumably one of the POSITION_* constants — confirm
	Split(part *PartitionSet) ([]Alignment, error)                    // Splits the alignment given the partitions in argument
	SubAlign(start, length int) (Alignment, error)                    // Extract a subalignment from this alignment
	SelectSites(sites []int) (Alignment, error)                       // Extract given sites from the alignment
	InverseCoordinates(start, length int) (invstarts, invlengths []int, err error)
	InversePositions(sites []int) (invsites []int, err error)

	Swap(rate float64)
	// TranslateByReference translates the alignment codon by codon using the given reference sequence as guide
	// We traverse reference nt 3 by 3
	// The reference codon may have gaps between nt ,
	// ex 1:
	// Ref: AC--GTACGT
	// Seq: ACTTGTACGT
	// In that case, the first ref codon is [0,1,4], corresponding to sequence ACTTG in seq
	// ACTTG % 3 != 0 ==> Frameshift? => Replaced by X in the compared sequence.
	// ex 2:
	// Ref: AC---GTACGT
	// Seq: ACTTTGTACGT
	// ref codon: [0,1,5]
	// seq      : ACTTTG : Insertion - OK => Replaced by "T-" in ref and "TT" in seq
	// ex 3:
	// Ref: ACGTACGT
	// Seq: A--TACGT
	// ref codon: [0,1,2]
	// seq      : A--: Deletion: not ok : Frameshift? => Replaced by "T" in ref and "X" in comp
	// ex 4:
	// Ref: AC----GTACGT
	// Seq: ACTT-TGTACGT
	// ref codon: [0,1,6]
	// seq      : ACTTTG : Insertion - OK => Replaced by "T-" in ref and "TT" in seq
	// ex 5:
	// Ref: AC----GTACGT
	// Seq: ACT--TGTACGT
	// ref codon: [0,1,6]
	// seq      : ACTTTG : Insertion not OK : Frameshift? => Replaced by "T-" in ref and "XX" in seq
	TranslateByReference(phase int, geneticcode int, refseq string) (err error)
	Transpose() (Alignment, error) // Output sequences are made of sites and output sites are sequences
	TrimSequences(trimsize int, fromStart bool) error
}

Alignment represents a set of aligned sequences (multiple Sequence Alignment)

func RandomAlignment ¶ added in v0.1.3

func RandomAlignment(alphabet, length, nbseq int) (al Alignment, err error)

RandomAlignment generates a random alignment with a given alphabet, length and number of sequences. Each character is randomly chosen from a uniform distribution.

type CountProfile ¶ added in v0.3.4

// CountProfile represents a simple view of an alignment and stores the number
// of occurrences of each character at each position of an alignment.
type CountProfile struct {
	// contains filtered or unexported fields
}

CountProfile represents a simple view of an alignment and stores the number of occurrences of each character at each position of an alignment.

func NewCountProfile ¶ added in v0.3.4

func NewCountProfile() (p *CountProfile)

NewCountProfile initializes a new Profile with nil attributes

func NewCountProfileFromAlignment ¶ added in v0.3.4

func NewCountProfileFromAlignment(al Alignment) (p *CountProfile)

NewCountProfileFromAlignment initializes a new CountProfile using an input alignment

func (*CountProfile) AppendCount ¶ added in v0.3.4

func (p *CountProfile) AppendCount(i, count int) (err error)

AppendCount appends a new site to the profile for the ith character, and associates count to it

func (*CountProfile) CheckLength ¶ added in v0.3.4

func (p *CountProfile) CheckLength(length int) bool

CheckLength returns true if the number of sites of the profile corresponds to the given length, false otherwise.

func (*CountProfile) Count ¶ added in v0.3.4

func (p *CountProfile) Count(r uint8, site int) (count int, err error)

Count returns the number of occurrences of the character r at the position site.

func (*CountProfile) CountAt ¶ added in v0.3.4

func (p *CountProfile) CountAt(i, site int) (count int, err error)

CountAt returns the number of occurrences of the ith character at the position site.

func (*CountProfile) CountsAt ¶ added in v0.3.4

func (p *CountProfile) CountsAt(i int) (counts []int, err error)

CountsAt returns the counts for all sites, for the ith character (arbitrary order of character)

func (*CountProfile) NameAt ¶ added in v0.3.4

func (p *CountProfile) NameAt(i int) (name uint8, err error)

NameAt returns the name of ith character in the header

func (*CountProfile) NameIndex ¶ added in v0.3.4

func (p *CountProfile) NameIndex(r uint8) (index int, ok bool)

NameIndex returns the index of the given character in the header. If the character does not exist, it returns false.

func (*CountProfile) NbCharacters ¶ added in v0.3.4

func (p *CountProfile) NbCharacters() (nb int)

NbCharacters returns the number of different characters in the profile

func (*CountProfile) Print ¶ added in v0.3.4

func (p *CountProfile) Print()

func (*CountProfile) SetHeader ¶ added in v0.3.4

func (p *CountProfile) SetHeader(header []uint8)

SetHeader sets the Header and initializes the count structure

type Mutation ¶ added in v0.3.6

// Mutation is the element type of the list returned by
// Sequence.ListMutationsComparedToReferenceSequence.
// NOTE(review): the field meanings below are inferred from their names — confirm against the implementation.
type Mutation struct {
	Ref uint8   // presumably the character found in the reference sequence
	Pos int     // presumably the position of the mutation
	Alt []uint8 // presumably the character(s) found in the compared sequence
}

type PairwiseAligner ¶ added in v0.3.0

// PairwiseAligner exposes scoring setters and accessors to the result of a
// pairwise alignment.
// NOTE(review): presumably the exported view of the value returned by NewPwAligner — confirm.
type PairwiseAligner interface {
	AlignEnds() (int, int)
	AlignStarts() (int, int)
	Seq1Ali() []uint8
	Seq2Ali() []uint8
	SetGapOpenScore(open float64)
	SetGapExtendScore(extend float64)
	SetScore(match, mismatch float64)
	MaxScore() float64 // Maximum score of the alignment
	NbMatches() int    // Number of matches
	NbMisMatches() int // Number of mismatches
	NbGaps() int       // Number of gaps
	Length() int       // Length of the alignment
	Alignment() (Alignment, error)
	AlignmentStr() string
}

type PartitionSet ¶ added in v0.3.2

// PartitionSet is created for a given alignment length (see NewPartitionSet)
// and associates alignment positions with partition codes, each code having a
// partition name and a model name (see Partition, PartitionName, ModeleName).
type PartitionSet struct {
	// contains filtered or unexported fields
}

func NewPartitionSet ¶ added in v0.3.2

func NewPartitionSet(alignmentLength int) (ps *PartitionSet)

func (*PartitionSet) AddRange ¶ added in v0.3.2

func (ps *PartitionSet) AddRange(partName, modelName string, start, end, modulo int) (err error)

func (*PartitionSet) AliLength ¶ added in v0.3.2

func (ps *PartitionSet) AliLength() int

returns the length of the alignment

func (*PartitionSet) CheckSites ¶ added in v0.3.2

func (ps *PartitionSet) CheckSites() (err error)

If not all sites are in a partition, returns an error

func (*PartitionSet) ModeleName ¶ added in v0.3.2

func (ps *PartitionSet) ModeleName(code int) string

Returns the name of the model associated with the given index. If the code does not exist, then returns "".

func (*PartitionSet) NPartitions ¶ added in v0.3.2

func (ps *PartitionSet) NPartitions() int

func (*PartitionSet) Partition ¶ added in v0.3.2

func (ps *PartitionSet) Partition(position int) int

Returns the partition code associated to the given position

If the position is outside the alignment, then returns -1

func (*PartitionSet) PartitionName ¶ added in v0.3.2

func (ps *PartitionSet) PartitionName(code int) string

Returns the name of the partition associated with the given index. If the code does not exist, then returns "".

func (*PartitionSet) String ¶ added in v0.3.2

func (ps *PartitionSet) String() string

type PhasedSequence ¶ added in v0.3.0

// PhasedSequence is the element type of the channel returned by Phaser.Phase.
type PhasedSequence struct {
	Err      error // NOTE(review): presumably the error met while phasing this sequence, if any — confirm
	Removed  bool  // NOTE(review): presumably true when the sequence was discarded by the phaser — confirm
	Position int   // NOTE(review): presumably the start position in the nucleotide sequence — confirm
	// phased nt sequence
	NtSeq Sequence
	// phased nt sequence
	// with first nt corresponding to the
	// first position of the aa codon
	CodonSeq Sequence
	// phased aa sequence
	AaSeq Sequence
	// Aligned sequences
	// 1st: best found orf
	// 2nd: sequence
	Ali Alignment
}

type Phaser ¶ added in v0.3.0

// Phaser aligns all sequences to a given ORF and trims them to the start
// position of that ORF. Its setters configure the run; Phase performs it and
// returns the results on a channel of PhasedSequence.
type Phaser interface {
	// Phase aligns seqs to the given orf(s). If orf is nil, the longest ORF
	// found in all sequences is searched for. It does not modify the input object.
	Phase(orfs, seqs SeqBag) (chan PhasedSequence, error)
	// Sets the length cutoff (lencutoff * orf length) used to decide whether an alignment is good
	SetLenCutoff(cutoff float64)
	// Sets the cutoff on matches over the alignment length used to decide whether an alignment is good
	SetMatchCutoff(cutoff float64)
	// If reverse is true, the reverse strand is also considered (6 phases instead of 3 when translating)
	SetReverse(reverse bool)
	// If cutend is true, then also removes the end of sequences that do not align with orf
	SetCutEnd(cutend bool)
	SetCpus(cpus int)
	// If translate is true, alignments are made on sequences translated into
	// amino acids; otherwise protein information is not taken into account
	SetTranslate(translate bool, geneticcode int) (err error)
	SetAlignScores(match, mismatch float64)
	SetGapOpen(float64)
	SetGapExtend(float64)
}

* If SetTranslate(true):

Aligns all sequences to the given ORF and trims sequences to the start position. If orf is nil, searches for the longest ORF (in 3 or 6 phases depending on reverse arg) in all sequences.

To do so, Phase() will:

Translate the given ORF in aminoacids;
For each sequence of the dataset: translate it in the 3 phases (forward) if reverse is false or 6 phases (forward and reverse) if reverse is true, align it with the translated orf, and take the phase giving the best alignment; If no phase gives a good alignment (>lencutoff * orf length, >matchcutoff matches over the align length and starting at first position of the ORF), then the sequence is discarded;
For each sequence, take the Start corresponding to the Start of the ORF, and remove nucleotides before;
Return the trimmed nucleotidic sequences (phased), the corresponding amino-acid sequences (phasedaa) the positions of starts in the nucleotidic sequences, and the removed sequence names.

If cutend is true, then also remove the end of sequences that do not align with orf.

It does not modify the input object.

* If SetTranslate(false):

align all sequences to the given ORF and trims sequences to the start position, it does not take into account protein information

If orf is nil, searches for the longest ORF (in forward only or both strands depending on reverse arg) in all sequences.

To do so:

1. If alignment is bad (>lencutoff * orf length, >matchcutoff matches over the align length and starting at first position of the ORF), then the sequence is discarded;
2. For each sequence, take the Start corresponding to the Start of the ORF, and remove nucleotides before;
3. Return the trimmed nucleotidic sequences (phased), the positions of starts in the nucleotidic sequences, and the removed sequence names.

If cutend is true, then also remove the end of sequences that do not align with orf.

It does not modify the input object.

func NewPhaser ¶ added in v0.3.0

func NewPhaser() Phaser

type SeqBag ¶ added in v0.3.0

// SeqBag represents a set of unaligned sequences.
type SeqBag interface {
	AddSequence(name string, sequence string, comment string) error
	AddSequenceChar(name string, sequence []uint8, comment string) error
	AppendSeqIdentifier(identifier string, right bool)
	Alphabet() int
	AlphabetStr() string
	AlphabetCharacters() []uint8
	AlphabetCharToIndex(c uint8) int // Returns index of the character (nt or aa) in the AlphabetCharacters() array
	AutoAlphabet()                   // detects and sets alphabet automatically for all the sequences
	CharStats() map[uint8]int64
	UniqueCharacters() []uint8
	CharStatsSeq(idx int) (map[uint8]int, error)    // Computes frequency of characters for the given sequence
	CleanNames(namemap map[string]string)           // Clean sequence names (newick special char)
	Clear()                                         // Removes all sequences
	CloneSeqBag() (seqs SeqBag, err error)          // Clones the seqbag
	Deduplicate() (identical [][]string, err error) // Remove duplicate sequences
	FilterLength(minlength, maxlength int) error    // Remove sequences whose length is <minlength or >maxlength
	GetSequence(name string) (string, bool)         // Get a sequence by name
	GetSequenceById(ith int) (string, bool)
	GetSequenceChar(name string) ([]uint8, bool)
	GetSequenceCharById(ith int) ([]uint8, bool)
	GetSequenceNameById(ith int) (string, bool)
	GetSequenceByName(name string) (Sequence, bool)
	GetSequenceIdByName(name string) (i int) // if the name does not exist, i < 0
	SetSequenceChar(ithAlign, ithSite int, char uint8) error
	// IgnoreIdentical sets the behavior when duplicate names are encountered while building the alignment
	// If ignore is IGNORE_NONE: Does not ignore anything
	// If ignore is IGNORE_NAME: Ignore sequences having the same name (keep the first one whatever their sequence)
	// If ignore is IGNORE_SEQUENCE: Ignore sequences having the same name and the same sequence
	// Otherwise, sets IGNORE_NONE
	IgnoreIdentical(int)
	SampleSeqBag(nb int) (SeqBag, error) // generate a sub sample of the sequences
	Sequence(ith int) (Sequence, bool)
	SequenceByName(name string) (Sequence, bool)
	Identical(SeqBag) bool
	Iterate(it func(name string, sequence string) bool)
	IterateChar(it func(name string, sequence []uint8) bool)
	IterateAll(it func(name string, sequence []uint8, comment string) bool)
	Sequences() []Sequence
	SequencesChan() chan Sequence
	LongestORF(reverse bool) (orf Sequence, err error)
	MaxNameLength() int // maximum sequence name length
	NbSequences() int
	RarefySeqBag(nb int, counts map[string]int) (SeqBag, error) // Take a new rarefied sample taking weights into account
	Rename(namemap map[string]string)
	RenameRegexp(regex, replace string, namemap map[string]string) error
	Replace(old, new string, regex bool) error        // Replaces old string with new string in sequences of the alignment
	ShuffleSequences()                                // Shuffle sequence order
	String() string                                   // Raw string representation (just write all sequences)
	Translate(phase int, geneticcode int) (err error) // Translates nt sequence in aa
	ReverseComplement() (err error)                   // Reverse-complements the alignment
	TrimNames(namemap map[string]string, size int) error
	TrimNamesAuto(namemap map[string]string, curid *int) error
	Sort() // Sorts the sequences by name
	Unalign() SeqBag
}

SeqBag represents a set of unaligned sequences

type Sequence ¶

// Sequence gives access to a single named sequence (name, comment, characters)
// and to per-sequence operations and statistics.
type Sequence interface {
	Sequence() string
	SequenceChar() []uint8
	SameSequence([]uint8) bool
	CharAt(int) uint8
	Name() string
	SetName(name string)
	Comment() string
	Length() int
	LongestORF() (start, end int) // Detects the longest ORF in forward strand only
	Reverse()
	Complement() error                                      // Returns an error if not nucleotide sequence
	Translate(phase int, geneticcode int) (Sequence, error) // Translates the sequence using the given code
	DetectAlphabet() int                                    // Try to detect alphabet (nt or aa)
	NumGaps() int                                           // Number of Gaps
	NumGapsOpenning() int                                   // Number of gap openings: it counts each stretch of gaps only once
	NumGapsFromStart() int                                  // Number of Gaps from Start (until a non gap is encountered)
	NumGapsFromEnd() int                                    // Number of Gaps from End (until a non gap is encountered)
	// returns the number of differences between the reference sequence and each sequence of the alignment
	// If lengths are different, returns an error
	// It does not take into account 'N' and '-' in sequences as mutations compared to ref
	// sequence (ref sequence can have a '-' or a 'N')
	NumMutationsComparedToReferenceSequence(alphabet int, seq Sequence) (nummutations int, err error)
	// returns the list of differences between the reference sequence and each sequence of the alignment
	// Counts only non N sites in each sequences (may be a gap or a N in the reference sequence though)
	// If a character is ambiguous (IUPAC notation), then it is counted as a mutation only if it is incompatible with
	// the reference character.
	// if aa is true: the sequences are nucleotides and nucleotides are taken codon by codon of the reference sequence
	// to list mutations. In case of insertion or a deletion in the target sequence: if %3==0: - or aa insert,
	// otherwise "/" ~frameshift?
	//
	// If lengths are different, returns an error
	ListMutationsComparedToReferenceSequence(alphabet int, refseq Sequence, aa bool) (mutations []Mutation, err error)

	Clone() Sequence
}

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL