Documentation ¶
Index ¶
- Constants
- Variables
- func BuildIndex(outdir string, infiles []string, opt *IndexBuildingOptions) error
- func CheckIndexBuildingOptions(opt *IndexBuildingOptions) error
- func CheckIndexSearchingOptions(opt *IndexSearchingOptions) error
- func ClearSubstrPairs(subs *[]*SubstrPair, k int)
- func Combinations2(set []uint64) [][2]uint64
- func Execute()
- func IntSlice2StringSlice(vals []int) []string
- func MeanStdev(values []float64) (float64, float64)
- func ParseByteSize(val string) (int64, error)
- func RC(s []byte) []byte
- func RecycleChaining2Result(chains *[]*Chain2Result)
- func RecycleChainingResult(chains *[]*[]int)
- func RecycleSeqComparatorResult(r *SeqComparatorResult)
- func RecycleSubstrPairs(subs *[]*SubstrPair)
- func TrimSubStrPairs(subs *[]*SubstrPair, k int)
- type Chain2Result
- type Chainer
- type Chainer2
- type Chaining2Options
- type ChainingOptions
- type Index
- func (idx *Index) Close() error
- func (idx *Index) RecycleSearchResult(r *SearchResult)
- func (idx *Index) RecycleSearchResults(sr *[]*SearchResult)
- func (idx *Index) RecycleSimilarityDetails(sds *[]*SimilarityDetail)
- func (idx *Index) Search(query *Query) (*[]*SearchResult, error)
- func (idx *Index) SetSeqCompareOptions(sco *SeqComparatorOptions)
- type IndexBuildingOptions
- type IndexInfo
- type IndexSearchingOptions
- type Options
- type Query
- type SearchResult
- type SeqComparator
- type SeqComparatorOptions
- type SeqComparatorResult
- type SimilarityDetail
- type SubstrPair
- type Uint64Slice
Constants ¶
const BITS_BATCH_IDX = 17
BITS_BATCH_IDX is the number of bits to store the genome batch index.
const BITS_FLAGS = BITS_STRAND + BITS_REVERSE
BITS_FLAGS is the number of bits to store two bits
const BITS_GENOME_IDX = 17
BITS_GENOME_IDX is the number of bits to store the genome index.
const BITS_IDX = BITS_BATCH_IDX + BITS_BATCH_IDX
BITS_IDX is the number of bits to strore batch index and genome index.
const BITS_IDX_FLAGS = BITS_IDX + BITS_FLAGS
BITS_IDX_FLAGS is the sum of BITS_IDX and BITS_FLAGS
const BITS_NONE_IDX = 64 - BITS_BATCH_IDX - BITS_GENOME_IDX
BITS_NONE_IDX is the number of bits to store data except for batch index and genome index.
const BITS_NONE_POS = 64 - BITS_POSITION
BITS_NONE_POS is the number of bits except for position
const BITS_POSITION = 28
BITS_POSITION is the number of bits to store the k-mer position/coordinate.
const BITS_REVERSE = 1
BITS_SUFFIX_IDX is the flag to indicate if the k-mer is reversed.
const BITS_STRAND = 1
BITS_STRAND is the flag to indicate if the k-mer is from the reverse complement strand.
const DirGenomes = "genomes"
DirGenomes is the directory of genomes datas
const DirSeeds = "seeds"
DirSeeds is the directory of k-mer-value data files
const ExtSeeds = ".bin"
ExtSeeds is file extention of k-mer-value data files
const ExtTmpDir = ".tmp"
ExtTmpDir is the path extension for temporary files
const FileGenomeChunks = "genomes.chunks.bin"
FileGenomeChunks store lists of batch+genome index of genome chunks
const FileGenomeIndex = "genomes.map.bin"
FileGenomeIndex maps genome id to genome batch id and index in the batch
const FileGenomes = "genomes.bin"
FileGenomes is the name of each genome file
const FileInfo = "info.toml"
FileInfo is the summary file
const FileMasks = "masks.bin"
FileMasks is the file for storing lexichash mask
const FileSeedPositions = "seed_positions.bin"
FileSeedPositions is the name of seed position file
const MASK_GENOME_IDX = (1 << BITS_GENOME_IDX) - 1
MASK_GENOME_IDX is the mask of genome index.
const MASK_NONE_IDX = (1 << BITS_NONE_IDX) - 1
MASK_NONE_IDX is the mask of non-index data
const MASK_REVERSE = 1
MASK_REVERSE is the mask of reversed flag
const MAX_GENOME_SIZE = 1 << BITS_POSITION
MAX_GENOME_SIZE is the maximum genome size, 268435456
const NO_VALID_SEQS = "no_valid_seqs"
NO_VALID_SEQS means there are no valid sequences in a genome file.
const TOO_LARGE_GENOME = "too_large_genome"
TOO_LARGE_GENOME means the genome is too big to index.
const TOO_MANY_SEQS = "too_many_seqs"
TOO_MANY_SEQS means there are too many sequences, as we require: $total_bases + ($num_contigs - 1) * $interval_size <= 268,435,456
Variables ¶
var BufferSize = 65536 // os.Getpagesize()
BufferSize is size of buffer
var COMMIT = func() string { if info, ok := debug.ReadBuildInfo(); ok { for _, setting := range info.Settings { if setting.Key == "vcs.revision" { return setting.Value[:7] } } } return "" }()
COMMIT is the last commit
var DefaultChaining2Options = Chaining2Options{
MaxGap: 50,
MinScore: 50,
MinAlignLen: 50,
MaxDistance: 100,
BandCount: 50,
BandBase: 100,
}
DefaultChaining2Options is the defalt vaule of Chaining2Option.
var DefaultChainingOptions = ChainingOptions{
MaxGap: 5000,
MinScore: 40,
MaxDistance: 10000,
}
DefaultChainingOptions is the defalt vaule of ChainingOption.
var DefaultIndexSearchingOptions = IndexSearchingOptions{ NumCPUs: runtime.NumCPU(), MaxOpenFiles: 512, MinPrefix: 15, MinSinglePrefix: 17, TopN: 500, MaxGap: 5000, MaxDistance: 10000, ExtendLength: 2000, MinQueryAlignedFractionInAGenome: 70, }
var DefaultSeqComparatorOptions = SeqComparatorOptions{ K: 32, MinPrefix: 11, Chaining2Options: Chaining2Options{ MaxGap: 50, MinScore: 50, MaxDistance: 50, BandBase: 100, }, MinAlignedFraction: 0, }
DefaultSeqComparatorOptions contains the default options for SeqComparatorOptions.
var MainVersion uint8 = 3
MainVersion is use for checking compatibility
var MinorVersion uint8 = 1
MinorVersion is less important
var RootCmd = &cobra.Command{ Use: "lexicmap", Short: "efficient sequence alignment against millions of prokaryotic genomes", Long: fmt.Sprintf(` LexicMap: efficient sequence alignment against millions of prokaryotic genomes Version: v%s (%s) Documents: https://bioinf.shenwei.me/LexicMap Source code: https://github.com/shenwei356/LexicMap `, VERSION, COMMIT), }
RootCmd represents the base command when called without any subcommands
var Strands = [2]byte{'+', '-'}
Strands could be used to output strand for a reverse complement flag
var VERSION = "0.5.0"
VERSION is the version
Functions ¶
func BuildIndex ¶ added in v0.2.0
func BuildIndex(outdir string, infiles []string, opt *IndexBuildingOptions) error
BuildIndex builds index from a list of input files
func CheckIndexBuildingOptions ¶ added in v0.2.0
func CheckIndexBuildingOptions(opt *IndexBuildingOptions) error
CheckIndexBuildingOptions checks some important options
func CheckIndexSearchingOptions ¶ added in v0.2.0
func CheckIndexSearchingOptions(opt *IndexSearchingOptions) error
func ClearSubstrPairs ¶ added in v0.2.0
func ClearSubstrPairs(subs *[]*SubstrPair, k int)
ClearSubstrPairs removes nested/embedded and same anchors. k is the largest k-mer size.
func Combinations2 ¶
Note: set should not have duplicates
func Execute ¶
func Execute()
Execute adds all child commands to the root command sets flags appropriately. This is called by main.main(). It only needs to happen once to the rootCmd.
func IntSlice2StringSlice ¶
func ParseByteSize ¶ added in v0.4.0
ParseByteSize parses byte size from string
func RecycleChaining2Result ¶ added in v0.2.0
func RecycleChaining2Result(chains *[]*Chain2Result)
RecycleChaining2Result reycles the chaining paths. Please remember to call this after using the results.
func RecycleChainingResult ¶ added in v0.2.0
func RecycleChainingResult(chains *[]*[]int)
RecycleChainingResult reycles the chaining results. Please remember to call this after using the results.
func RecycleSeqComparatorResult ¶ added in v0.2.0
func RecycleSeqComparatorResult(r *SeqComparatorResult)
RecycleSeqComparatorResult recycles a SeqComparatorResult
func RecycleSubstrPairs ¶ added in v0.2.0
func RecycleSubstrPairs(subs *[]*SubstrPair)
RecycleSubstrPairs recycles a list of SubstrPairs
func TrimSubStrPairs ¶ added in v0.5.0
func TrimSubStrPairs(subs *[]*SubstrPair, k int)
TrimSubStrPairs trims anchors for query/subjects with tandem repeats in either end.
case 1: embeded anchor in query/target
61: 156-186 (+) vs 1163-1193 (+), len:31 62: 157-187 (-) vs 1164-1194 (-), len:31 63: 158-188 (+) vs 1165-1195 (+), len:31 64: 168-195 (-) vs 1168-1195 (-), len:28 65: 175-202 (-) vs 1168-1195 (-), len:28 <--- 66: 182-209 (-) vs 1168-1195 (-), len:28 <--- 67: 189-216 (-) vs 1168-1195 (-), len:28 <--- 68: 196-223 (+) vs 1168-1195 (+), len:28 <--- 69: 203-230 (+) vs 1168-1195 (+), len:28 <--- 70: 210-237 (-) vs 1168-1195 (-), len:28 <--- 71: 217-244 (-) vs 1168-1195 (-), len:28 <--- gap=7, overlap=28 (28/28)
case 2: big overlap + big gap
727: 789-819 (-) vs 789-819 (-), len:31 728: 790-820 (-) vs 790-820 (-), len:31 729: 804-821 (-) vs 821-838 (-), len:18 <--- gap=17, overlap=17 (17/18)
Types ¶
type Chain2Result ¶ added in v0.3.0
type Chain2Result struct { NAnchors int // The number of substrings AlignedFraction float64 MatchedBases int // The number of matched bases. AlignedBasesQ int // The number of aligned bases in Query sequence AlignedBasesT int // The number of aligned bases in Subject sequence PIdent float64 // percentage of identity AlignedLength int // Aligned length, might be longer than AlignedBasesQ or AlignedBasesT Gaps int // The number of gaps QBegin, QEnd int // Query begin/end position (0-based) TBegin, TEnd int // Target begin/end position (0-based) // for output CIGAR []byte // cigar string QSeq []byte // query seq TSeq []byte // target seq Alignment []byte // alignment text // contains filtered or unexported fields }
Chain2Result represents a result of a chain
func (*Chain2Result) Reset ¶ added in v0.3.0
func (r *Chain2Result) Reset()
Reset resets a Chain2Result
type Chainer ¶ added in v0.2.0
type Chainer struct {
// contains filtered or unexported fields
}
Chainer is an object for chaining the lexichash substrings between query and reference sequences.
func NewChainer ¶ added in v0.2.0
func NewChainer(options *ChainingOptions) *Chainer
NewChainer creates a new chainer.
type Chainer2 ¶ added in v0.2.0
type Chainer2 struct {
// contains filtered or unexported fields
}
Chainer2 is an object for chaining the anchors in two similar sequences. Anchors/seeds/substrings in Chainer2 is denser than those in Chainer, and the chaining score function is also much simpler, only considering the lengths of anchors and gaps between them.
func NewChainer2 ¶ added in v0.2.0
func NewChainer2(options *Chaining2Options) *Chainer2
NewChainer creates a new chainer.
func (*Chainer2) Chain ¶ added in v0.2.0
func (ce *Chainer2) Chain(subs *[]*SubstrPair) (*[]*Chain2Result, int, int, int, int, int, int, int)
Chain finds the possible chain paths. Please remember to call RecycleChaining2Result after using the results. Returned results:
- Chain2Results.
- The number of matched bases.
- The number of aligned bases.
- QBegin.
- QEnd.
- TBegin.
- TEnd.
type Chaining2Options ¶ added in v0.2.0
type Chaining2Options struct { MaxGap int MinScore int // minimum score of a chain MinAlignLen int MinIdentity float64 MaxDistance int BandCount int // only check i in range of i − A < j < i BandBase int // only check i where i.Qstart+i.Len + A < j.Qstart }
Chaining2Options contains all options in chaining.
type ChainingOptions ¶ added in v0.2.0
ChainingOptions contains all options in chaining.
type Index ¶ added in v0.2.0
type Index struct { // k-mer-value searchers Searchers []*kv.Searcher InMemorySearchers []*kv.InMemorySearcher // contains filtered or unexported fields }
Index creates a LexicMap index from a path and supports searching with query sequences.
func NewIndexSearcher ¶ added in v0.2.0
func NewIndexSearcher(outDir string, opt *IndexSearchingOptions) (*Index, error)
NewIndexSearcher creates a new searcher
func (*Index) RecycleSearchResult ¶ added in v0.2.0
func (idx *Index) RecycleSearchResult(r *SearchResult)
RecycleSearchResults recycles a search result object
func (*Index) RecycleSearchResults ¶ added in v0.2.0
func (idx *Index) RecycleSearchResults(sr *[]*SearchResult)
RecycleSearchResults recycles search results objects
func (*Index) RecycleSimilarityDetails ¶ added in v0.3.0
func (idx *Index) RecycleSimilarityDetails(sds *[]*SimilarityDetail)
RecycleSimilarityDetails recycles a list of SimilarityDetails
func (*Index) Search ¶ added in v0.2.0
func (idx *Index) Search(query *Query) (*[]*SearchResult, error)
Search queries the index with a sequence. After using the result, do not forget to call RecycleSearchResult().
func (*Index) SetSeqCompareOptions ¶ added in v0.2.0
func (idx *Index) SetSeqCompareOptions(sco *SeqComparatorOptions)
SetSeqCompareOptions sets the sequence comparing options
type IndexBuildingOptions ¶ added in v0.2.0
type IndexBuildingOptions struct { // general NumCPUs int Verbose bool // show log Log2File bool // log file Force bool // force overwrite existed index MaxOpenFiles int // maximum opened files, used in merging indexes MergeThreads int // Maximum Concurrent Merge Jobs MinSeqLen int // minimum sequence length, should be >= k // skipping extremely large genome MaxGenomeSize int // Maximum genome size. Extremely large genomes (non-isolate assemblies) will be skipped BigGenomeFile string // Out file of skipped files with genomes // LexicHash MaskFile string // file of custom masks K int // k-mer size Masks int // number of masks RandSeed int64 // random seed // filling sketching deserts DisableDesertFilling bool // disable desert filling (just for analysis index) DesertMaxLen uint32 // maxi length of sketching deserts DesertExpectedSeedDist int // expected distance between seeds DesertSeedPosRange int // the upstream and down stream region for adding a seeds // generate mask from the top N biggest genomes TopN int // Select the the top N largest genomes for generating masks PrefixExt int // Extension length of prefixes Chunks int // the number of chunks for storing k-mer data Partitions int // the number of partitions for indexing k-mer data GenomeBatchSize int // the maximum number of genomes of a batch ReRefName *regexp.Regexp // for extracting genome id from the file name ReSeqExclude []*regexp.Regexp // for excluding sequences according to name pattern ContigInterval int // the length of N's between contigs SaveSeedPositions bool Debug bool }
IndexBuildingOptions contains all options for building an LexicMap index.
type IndexInfo ¶ added in v0.2.0
type IndexInfo struct { MainVersion uint8 `toml:"main-version" comment:"Index format"` MinorVersion uint8 `toml:"minor-version"` K uint8 `toml:"max-K" comment:"LexicHash"` Masks int `toml:"masks"` RandSeed int64 `toml:"rand-seed"` MaxDesert int `toml:"max-seed-dist" comment:"Seed distance"` SeedDistInDesert int `toml:"seed-dist-in-desert"` Chunks int `toml:"chunks" comment:"Seeds (k-mer-value data) files"` Partitions int `toml:"index-partitions"` InputGenomes int `toml:"input-genomes" comment:"Input genomes"` Genomes int `` /* 239-byte string literal not displayed */ GenomeBatchSize int `toml:"genome-batch-size"` GenomeBatches int `toml:"genome-batches"` ContigInterval int `toml:"contig-interval"` }
IndexInfo contains summary of the index
type IndexSearchingOptions ¶ added in v0.2.0
type IndexSearchingOptions struct { // general NumCPUs int Verbose bool // show log Log2File bool // log file MaxOpenFiles int // maximum opened files, used in merging indexes InMemorySearch bool // load the seed/kv data into memory MinPrefix uint8 // minimum prefix length, e.g., 15 // MaxMismatch int // maximum mismatch, e.g., 3 MinSinglePrefix uint8 // minimum prefix length of the single seed, e.g., 20 // MinMatchedBases uint8 // the total matched bases TopN int // keep the topN scores, e.g, 10 // seeds chaining MaxGap float64 // e.g., 5000 MaxDistance float64 // e.g., 20k // alignment ExtendLength int // the length of extra sequence on the flanking of seeds. // seq similarity MinQueryAlignedFractionInAGenome float64 // minimum query aligned fraction in the target genome // WFA alignment MoreAccurateAlignment bool // Output OutputSeq bool // Debug bool }
IndexSearchingOptions contains all options for searching
type Options ¶
type Options struct { NumCPUs int Verbose bool LogFile string Log2File bool Compress bool CompressionLevel int }
Options contains the global flags
type Query ¶
type Query struct {
// contains filtered or unexported fields
}
Query is an object for each query sequence, it also contains the query result.
type SearchResult ¶ added in v0.2.0
type SearchResult struct { BatchGenomeIndex uint64 // just for finding genome chunks of the same genome GenomeBatch int GenomeIndex int ID []byte GenomeSize int Subs *[]*SubstrPair // matched substring pairs (query,target) Score float64 // score for soring Chains *[]*[]int // more about the alignment detail SimilarityDetails *[]*SimilarityDetail // sequence comparing AlignedFraction float64 // query coverage per genome }
SearchResult stores a search result in a genome for the given query sequence.
func (*SearchResult) Reset ¶ added in v0.2.0
func (r *SearchResult) Reset()
func (*SearchResult) SortBySeqID ¶ added in v0.4.0
func (sr *SearchResult) SortBySeqID()
type SeqComparator ¶ added in v0.2.0
type SeqComparator struct {
// contains filtered or unexported fields
}
SeqComparator is for fast and accurate similarity estimation of two sequences, which are in the same strand (important).
func NewSeqComparator ¶ added in v0.2.0
func NewSeqComparator(options *SeqComparatorOptions, poolChainers *sync.Pool) *SeqComparator
NewSeqComparator creates a new SeqComparator with given options. No options checking now.
func (*SeqComparator) Compare ¶ added in v0.2.0
func (cpr *SeqComparator) Compare(begin, end uint32, s []byte, queryLen int) (*SeqComparatorResult, error)
Compare matchs k-mers for the query sequence (begin: end), chains them up, and computes the similarity. Please remember to call RecycleSeqComparatorResult() to recycle the result.
func (*SeqComparator) Index ¶ added in v0.2.0
func (cpr *SeqComparator) Index(s []byte) error
Index initializes the SeqComparator with the query sequence.
func (*SeqComparator) RecycleIndex ¶ added in v0.2.0
func (cpr *SeqComparator) RecycleIndex()
RecycleIndex recycles the Index (tree data). Please call this if you do not need the comparator anymore.
type SeqComparatorOptions ¶ added in v0.2.0
type SeqComparatorOptions struct { // indexing K uint8 MinPrefix uint8 // chaining Chaining2Options // seq similarity MinAlignedFraction float64 // minimum query aligned fraction in a HSP MinIdentity float64 }
SeqComparatorOptions contains options for comparing two sequences.
type SeqComparatorResult ¶ added in v0.2.0
type SeqComparatorResult struct { AlignedBases int // The number of aligned bases. AlignedFraction float64 // query (original query) coverage per HSP MatchedBases int PIdent float64 QueryLen int // length of the original query, used to compute/update AlignedFraction QBegin int QEnd int TBegin int TEnd int TSeq []byte // target seq Chains *[]*Chain2Result }
SeqComparatorResult contains the details of a seq comparison result.
func (*SeqComparatorResult) Update ¶ added in v0.3.0
func (r *SeqComparatorResult) Update(chains *[]*Chain2Result, queryLen int)
Update updates the data with new chains. However it does not considerate gaps.
func (*SeqComparatorResult) Update2 ¶ added in v0.4.0
func (r *SeqComparatorResult) Update2(chains *[]*Chain2Result, queryLen int)
Update2 only compute the aligned fraction for all chains
type SimilarityDetail ¶ added in v0.2.0
type SimilarityDetail struct { // QBegin int // QEnd int // TBegin int // TEnd int RC bool SimilarityScore float64 Similarity *SeqComparatorResult // Chain *[]int NSeeds int // sequence details SeqLen int SeqID []byte // seqid of the region }
SimilarityDetail is the similarity detail of one reference sequence
type SubstrPair ¶ added in v0.2.0
type SubstrPair struct { QBegin int32 // start position of the substring (0-based) in query TBegin int32 // start position of the substring (0-based) in reference Len uint8 // prefix length TRC bool // is the substring from the reference seq on the negative strand. QRC bool // is the substring from the query seq on the negative strand. }
SubstrPair represents a pair of found substrings/seeds, it's also called an anchor.
func (SubstrPair) String ¶ added in v0.2.0
func (s SubstrPair) String() string
type Uint64Slice ¶
type Uint64Slice []uint64
func (Uint64Slice) Len ¶
func (s Uint64Slice) Len() int
func (Uint64Slice) Less ¶
func (s Uint64Slice) Less(i, j int) bool
func (*Uint64Slice) Pop ¶
func (s *Uint64Slice) Pop() interface{}
func (*Uint64Slice) Push ¶
func (s *Uint64Slice) Push(x interface{})
func (Uint64Slice) Swap ¶
func (s Uint64Slice) Swap(i, j int)
Source Files ¶
- 2blast.go
- autocomplete.go
- genomes.go
- index.go
- kmers.go
- lib-chaining.go
- lib-chaining2.go
- lib-index-build.go
- lib-index-merge.go
- lib-index-search.go
- lib-seq_compare.go
- masks.go
- re-merge.go
- reindex-seeds.go
- root.go
- search.go
- seed-pos.go
- subseq.go
- util-cli.go
- util-io.go
- util-logging.go
- util.go
- utils.go
- version.go