Documentation
¶
Index ¶
- Constants
- Variables
- func CountPattern(seq []byte, pattern Pattern) int
- func ErrorAbort(err error)
- func Execute() error
- func L50(lengths []int64) int64
- func Make2DGArraySlice(m, n int) [][]GArray
- func Make2DSlice(m, n int) [][]int
- func Make2DSliceInt64(m, n int) [][]int64
- func MutInsertion(genome eaopt.Slice, rng *rand.Rand)
- func MutInversion(genome eaopt.Slice, rng *rand.Rand)
- func MutPermute(genome eaopt.Slice, rng *rand.Rand)
- func MutSplice(genome eaopt.Slice, rng *rand.Rand)
- func OutlierCutoff(a []float64) (float64, float64)
- func Percentage(a, b int) string
- func ReadCSVLines(filename string) [][]string
- func RemoveExt(filename string) string
- func Round(input float64) float64
- func SumLog(a []int) float64
- type AGP
- type AGPLine
- type AlleleGroup
- type Alleler
- type Anchorer
- type AnchorerJSON
- type Assesser
- type BedLine
- type Builder
- type CLM
- func (r *CLM) Activate(shuffle bool, rng *rand.Rand)
- func (r *CLM) EvaluateQ() float64
- func (r *CLM) GARun(fwtour *os.File, opt *Optimizer, phase int) Tour
- func (r *CLM) M() [][]int
- func (r *CLM) O() *mat64.SymDense
- func (r *CLM) OptimizeOrdering(fwtour *os.File, opt *Optimizer, phase int)
- func (r *CLM) OptimizeOrientations(fwtour *os.File, phase int) (string, string)
- func (r *CLM) Q() [][]GArray
- type CLMLine
- type Clusters
- type Contact
- type Contig
- type ContigAB
- type ContigInfo
- type ContigPair
- type CtgAlleleGroupPair
- type Edge
- type Extracter
- type GArray
- type Graph
- type Link
- type LinkDensityModel
- type Node
- type OO
- type OOLine
- type Optimizer
- type OrientedPair
- type PAFFile
- type PAFRecord
- type Pair
- type Partitioner
- type Path
- type PathSet
- type Pattern
- type Piler
- type Plotter
- type Pruner
- type RECountsFile
- type RECountsRecord
- type Range
- type SparseMatrix
- type Tag
- type Tig
- type TigF
- type Tour
- func (r Tour) Append(q eaopt.Slice) eaopt.Slice
- func (r Tour) At(i int) interface{}
- func (r Tour) Clone() eaopt.Genome
- func (r Tour) Copy() eaopt.Slice
- func (r Tour) Crossover(_ eaopt.Genome, _ *rand.Rand)
- func (r Tour) Evaluate() (float64, error)
- func (r Tour) EvaluateSumLog() (float64, error)
- func (r Tour) Len() int
- func (r Tour) Mutate(rng *rand.Rand)
- func (r Tour) Replace(q eaopt.Slice)
- func (r Tour) Set(i int, v interface{})
- func (r Tour) Shuffle(rng *rand.Rand)
- func (r Tour) Slice(a, b int) eaopt.Slice
- func (r Tour) Split(k int) (eaopt.Slice, eaopt.Slice)
- func (r Tour) Swap(i, j int)
Constants ¶
const ( // LineWidth specifies how many bases to show per line LineWidth = 60 // LargeSequence will notify the writer to send a notification LargeSequence = 1000000 )
const ( // Version is the current version of ALLHIC Version = "0.9.13" // LB is lower bound for GoldenArray LB = 18 // UB is upper bound for GoldenArray UB = 29 // BB is span for GoldenArray BB = UB - LB + 1 // PHI is natural log of golden ratio PHI = 0.4812118250596684 // math.Log(1.61803398875) // OUTLIERTHRESHOLD is how many deviation from MAD OUTLIERTHRESHOLD = 3.5 // MINSIZE is the minimum size cutoff for tig to be considered MINSIZE = 10000 // GeometricBinSize is the max/min ratio for each bin GeometricBinSize = 1.0442737824274138403219664787399 // MinLinkDist is the minimum link distance we care about MinLinkDist = 1 << 11 // DefaultRE is the default restriction site used DefaultRE = "GATC" // MinLinks is the minimum number of links between contig pair to consider MinLinks = 3 // MaxLinkDist is the maximum link distance we care about MaxLinkDist = 1 << 27 // BigNorm is a big integer multiplier so we don't have to mess with float64 BigNorm = int64(1000000000000) // MinAvgLinkage is the minimum cutoff for merging clusters MinAvgLinkage = 0 // LinkDist specifies to maximum size of the links going over a certain position LinkDist = int64(1000000) // Seed is the random seed Seed = 42 // Npop is the population size used in GA Npop = 100 // Ngen is the number of generations for convergence Ngen = 5000 // MutaProb is the mutation probability in GA MutaProb = 0.2 // MinREs is the minimum number of RE sites in a contig to be clustered (CLUSTER_MIN_RE_SITES) MinREs = 10 // MaxLinkDensity is the density threshold before marking a contig as 'repetitive' (CLUSTER_MAX_LINK_DENSITY) MaxLinkDensity = 2 // NonInformativeRatio is the cutoff for recovering skipped contigs back into the clusters (CLUSTER_NON-INFORMATIVE_RATIO) NonInformativeRatio = 3 // REHeader is the first line in the RE counts file REHeader = "#Contig\tRECounts\tLength\n" // PairsFileHeader is the first line in the pairs.txt file PairsFileHeader = 
"#X\tY\tContig1\tContig2\tRE1\tRE2\tObservedLinks\tExpectedLinksIfAdjacent\tLabel\n" // DistributionHeader is the first line in the distribution.txt file DistributionHeader = "#Bin\tBinStart\tBinSize\tNumLinks\tTotalSize\tLinkDensity\n" // PostProbHeader is the first line in the postprob file PostProbHeader = "#SeqID\tStart\tEnd\tContig\tPostProb\n" )
const ACCEPT = "ACCEPT"
ACCEPT is the tag indicating that an orientation flip is accepted
const LIMIT = 10000000
LIMIT determines the largest distance for two tigs to add to total score
const REJECT = "REJECT"
REJECT is the tag indicating that an orientation flip is rejected
Variables ¶
var Backend = logging.NewLogBackend(os.Stderr, "", 0)
Backend is the default stderr output
var BackendFormatter = logging.NewBackendFormatter(Backend, format)
BackendFormatter contains the fancy debug formatter
var GR = [...]int{5778, 9349, 15127, 24476,
39603, 64079, 103682, 167761,
271443, 439204, 710647, 1149851}
GR is a precomputed list of powers of the golden ratio phi (phi ^ 18 through phi ^ 29)
var LimitLog = math.Log(LIMIT)
LimitLog is the Log of LIMIT
Functions ¶
func CountPattern ¶ added in v0.9.12
CountPattern counts how many times a pattern occurs in seq
func ErrorAbort ¶ added in v0.8.12
func ErrorAbort(err error)
ErrorAbort logs an error message and then exits with a return code of 1
func L50 ¶ added in v0.8.4
L50 returns the sequence length L where half of the genome is covered in contigs of length >= L50
func Make2DGArraySlice ¶
Make2DGArraySlice allocates a 2D matrix with shape (m, n)
func Make2DSlice ¶
Make2DSlice allocates a 2D matrix with shape (m, n)
func Make2DSliceInt64 ¶ added in v0.8.4
Make2DSliceInt64 allocates a 2D int64 matrix with shape (m, n)
func MutInsertion ¶
MutInsertion applies insertion operation on the genome
func MutInversion ¶
MutInversion applies inversion operation on the genome
func MutPermute ¶ added in v0.8.4
MutPermute permutes two genes at random n times
func MutSplice ¶ added in v0.8.4
MutSplice splits a genome in 2 and glues the pieces back together in reverse order
func OutlierCutoff ¶
OutlierCutoff implements Iglewicz and Hoaglin's robust outlier test and returns the cutoff values: the lower bound and the upper bound.
func Percentage ¶ added in v0.8.4
Percentage prints a human readable message of the percentage
func ReadCSVLines ¶ added in v0.8.4
ReadCSVLines parses all the csv lines into 2D array of tokens
Types ¶
type AGP ¶ added in v0.8.4
type AGP struct {
// contains filtered or unexported fields
}
AGP is a collection of AGPLines
type AGPLine ¶ added in v0.8.4
type AGPLine struct {
// contains filtered or unexported fields
}
AGPLine is a line in the AGP file
type AlleleGroup ¶ added in v0.8.12
type AlleleGroup []string
AlleleGroup stores the contig names that are considered allelic
type Alleler ¶ added in v0.9.8
type Alleler struct { PafFile string // ex. "genome.paf" ReFile string // ex. "genome.counts_GATC.txt" Paf PAFFile // The PAF data ReCounts RECountsFile // The RE data }
Alleler is responsible for building the allele table
type Anchorer ¶ added in v0.8.4
Anchorer runs the merging algorithm
func (*Anchorer) ExtractInterContigLinks ¶ added in v0.8.4
func (r *Anchorer) ExtractInterContigLinks()
ExtractInterContigLinks extracts links from the Bamfile
type AnchorerJSON ¶ added in v0.8.4
type AnchorerJSON struct { Starts map[string]int64 `json:"starts"` Sizes map[string]int64 `json:"sizes"` TotalBins int `json:"total_bins"` DistBinStarts []int64 `json:"distbinstarts"` DistBinSizes []int64 `json:"distbinsizes"` Resolution int64 `json:"resolution"` }
AnchorerJSON keeps a succinct subset of all fields in Anchorer
type Assesser ¶ added in v0.8.4
type Assesser struct { Bamfile string Bedfile string Seqid string // contains filtered or unexported fields }
Assesser takes a bamfile and a bedfile as input and outputs a per-contig confidence in the orientation
Summary of algorithm:
Step 1. Take all intra-contig links and build the background distribution.
Step 2. Loop through each contig and compute the likelihood of all links coming out of the contig, assuming + orientation and - orientation separately.
Step 3. Normalize the likelihood to get the posterior probability (with the implicit assumption of equal prior probability for each contig).
type BedLine ¶ added in v0.8.4
type BedLine struct {
// contains filtered or unexported fields
}
BedLine stores the information from each line in the bedfile
type Builder ¶ added in v0.8.4
type Builder struct { Tourfiles []string Fastafile string // Output file OutAGPfile string OutFastafile string }
Builder reconstructs the genome release AGP and FASTA files
type CLM ¶ added in v0.8.4
type CLM struct { REfile string Clmfile string Tigs []*TigF Tour Tour Signs []byte // contains filtered or unexported fields }
CLM has the following format:
tig00046211+ tig00063795+ 1 53173
tig00046211+ tig00063795- 1 116050
tig00046211- tig00063795+ 1 71155
tig00046211- tig00063795- 1 134032
tig00030676+ tig00077819+ 7 136407 87625 87625 106905 102218 169660 169660
tig00030676+ tig00077819- 7 126178 152952 152952 35680 118923 98367 98367
tig00030676- tig00077819+ 7 118651 91877 91877 209149 125906 146462 146462
tig00030676- tig00077819- 7 108422 157204 157204 137924 142611 75169 75169
func (*CLM) Activate ¶ added in v0.8.4
Activate selects active contigs in the current partition. This is the setup phase of the algorithm, and supports two modes:
- "de novo": This is useful at the start of a new run where no tours are available. We select the strong contigs that have significant number of links to other contigs in the partition. We build a histogram of link density (# links per bp) and remove the contigs that appear to be outliers. The orientations are derived from the matrix decomposition of the pairwise strandedness matrix O.
- "hotstart": This is useful when there was a past run, with a given tourfile. In this case, the active contig list and orientations are derived from the last tour in the file.
func (*CLM) EvaluateQ ¶ added in v0.8.4
EvaluateQ sums up the score over all links, where distance is defined as the sizes of interleaving contigs plus the actual link distances. The objective is to maximize Sum(1 / distance) over all links. For performance reasons, we actually use a histogram to approximate all link distances. See goldenArray() for details.
func (*CLM) M ¶ added in v0.8.4
M yields a contact frequency matrix, where each cell contains the number of links between the i-th and j-th contigs
func (*CLM) O ¶ added in v0.8.4
O yields a pairwise orientation matrix, where each cell contains the strandedness times the number of links between i-th and j-th contig
func (*CLM) OptimizeOrdering ¶ added in v0.8.4
OptimizeOrdering changes the ordering of contigs by Genetic Algorithm
func (*CLM) OptimizeOrientations ¶ added in v0.8.4
OptimizeOrientations changes the orientations of contigs by using heuristic flipping algorithms.
type CLMLine ¶ added in v0.8.4
type CLMLine struct {
// contains filtered or unexported fields
}
CLMLine stores the data structure of the CLM file
type Contact ¶
type Contact struct {
// contains filtered or unexported fields
}
Contact stores the number of links between two contigs
type Contig ¶ added in v0.8.4
type Contig struct {
// contains filtered or unexported fields
}
Contig stores the name and length of each contig
type ContigInfo ¶ added in v0.8.4
type ContigInfo struct {
// contains filtered or unexported fields
}
ContigInfo stores results calculated from f
func (ContigInfo) String ¶ added in v0.8.4
func (r ContigInfo) String() string
String outputs the string representation of ContigInfo
type ContigPair ¶ added in v0.8.4
ContigPair stores results calculated from findDistanceBetweenContigs
func (ContigPair) String ¶ added in v0.8.4
func (r ContigPair) String() string
String outputs the string representation of ContigPair
type CtgAlleleGroupPair ¶ added in v0.8.12
type CtgAlleleGroupPair struct {
// contains filtered or unexported fields
}
CtgAlleleGroupPair stores a pair of the contig and the alleleGroup it resides in
type Edge ¶ added in v0.8.4
type Edge struct {
// contains filtered or unexported fields
}
Edge is between two nodes in a graph
type Extracter ¶ added in v0.8.4
type Extracter struct { Bamfile string Fastafile string RE string MinLinks int // Output file OutContigsfile string OutPairsfile string OutClmfile string // contains filtered or unexported fields }
Extracter processes the distribution step
type GArray ¶
GArray contains golden array of size BB
func GoldenArray ¶
Given a list of ints, GoldenArray aggregates similar values so that the result becomes an array of multiples of phi, where phi is the golden ratio.
phi ^ 18 = 5778 phi ^ 29 = 1149851
So the array of counts goes between 843 and 788196. One piece of trivia is that the powers of phi get closer to integers as N grows. See the interesting discussion here: <https://www.johndcook.com/blog/2017/03/22/golden-powers-are-nearly-integers/>
type Link ¶ added in v0.8.4
type Link struct {
// contains filtered or unexported fields
}
Link contains a specific inter-contig link
type LinkDensityModel ¶ added in v0.8.4
type LinkDensityModel struct {
A, B float64
// contains filtered or unexported fields
}
LinkDensityModel is a power-law model Y = A * X ^ B and stores the coefficients A and B. This density then needs to be multiplied by (C - X) to make it a probability distribution, where C is the chromosome length.
func NewLinkDensityModel ¶ added in v0.8.4
func NewLinkDensityModel() *LinkDensityModel
NewLinkDensityModel makes an empty link distribution ready to be filled in
func (*LinkDensityModel) BinSize ¶ added in v0.8.4
func (r *LinkDensityModel) BinSize(i int) int
BinSize returns the size of each bin
type Node ¶ added in v0.8.4
type Node struct {
// contains filtered or unexported fields
}
Node is one of the scaffold ends, Left or Right (5' or 3')
type OO ¶ added in v0.8.4
type OO struct {
// contains filtered or unexported fields
}
OO describes a scaffolding experiment and contains an array of OOLine
func (*OO) ParseAllTours ¶ added in v0.8.4
ParseAllTours reads tour from file
A tour file has the following format:
> name
contig1+ contig2- contig3?
type OOLine ¶ added in v0.8.4
type OOLine struct {
// contains filtered or unexported fields
}
OOLine describes a simple contig entry in a scaffolding experiment
type Optimizer ¶
type Optimizer struct { REfile string Clmfile string RunGA bool Resume bool Seed int64 NPop int NGen int MutProb float64 CrossProb float64 // Output files OutTourFile string // contains filtered or unexported fields }
Optimizer runs the order-and-orientation procedure, given a clmfile
type OrientedPair ¶
type OrientedPair struct {
// contains filtered or unexported fields
}
OrientedPair contains two contigs and their orientations
type PAFFile ¶ added in v0.9.8
type PAFFile struct { PafFile string // File path of the paf Records []PAFRecord // List of PAF records }
PAFFile parses the PAF file into a set of records
func (*PAFFile) ParseRecords ¶ added in v0.9.8
func (r *PAFFile) ParseRecords()
ParseRecords collects all records in memory
type PAFRecord ¶ added in v0.9.8
type PAFRecord struct { Query string // Query sequence name QueryLength int // Query sequence length QueryStart int // Query start (0-based) QueryEnd int // Query end (0-based) RelativeStrand byte // `+' if query and target on the same strand; `-' if opposite Target string // Target sequence name TargetLength int // Target sequence length TargetStart int // Target start on original strand (0-based) TargetEnd int // Target end on original strand (0-based) NumMatches int // Number of matching bases in the mapping AlignmentLength int // Number bases, including gaps, in the mapping MappingQuality uint8 // Mapping quality (0-255 with 255 for missing) Tags map[string]Tag // Tags, e.g. tp, cm etc. }
PAFRecord holds one line in the PAF file. The file spec: https://github.com/lh3/miniasm/blob/master/PAF.md
type Pair ¶
type Pair struct {
// contains filtered or unexported fields
}
Pair contains two contigs in contact
type Partitioner ¶
type Partitioner struct { Contigsfile string PairsFile string K int // Output files OutREfiles []string // Parameters MinREs int MaxLinkDensity int NonInformativeRatio int // contains filtered or unexported fields }
Partitioner converts the bamfile into a matrix of link counts
func (*Partitioner) Cluster ¶ added in v0.8.4
func (r *Partitioner) Cluster()
Cluster performs the hierarchical clustering. This function is a re-implementation of the AHClustering() function in LACHESIS.
type Path ¶ added in v0.8.4
type Path struct {
LNode, RNode *Node // Two nodes at each end
// contains filtered or unexported fields
}
Path is a collection of ordered contigs
type Pattern ¶ added in v0.9.12
type Pattern struct {
// contains filtered or unexported fields
}
Pattern is a string pattern that is either simple or a regex
func MakePattern ¶ added in v0.9.12
MakePattern builds a regex-aware pattern that can be passed around and counted. Multiple patterns are split at commas (,) and N is converted to [ACGT].
type Piler ¶ added in v0.8.4
type Piler struct {
BS, BE []int64
}
Piler has the data structures to support overlap counts. Here we use a data structure described in: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3530906/. We store the starts and ends of links in sorted arrays. The `icount` algorithm then searches for an interval (or, in this case, point) query in these sorted interval ends.
type Plotter ¶ added in v0.8.4
type Plotter struct {
Anchor *Anchorer
}
Plotter extracts a matrix of link counts and plots a heatmap
type Pruner ¶ added in v0.8.4
type Pruner struct { AllelesFile string PairsFile string // contains filtered or unexported fields }
Pruner processes the pruning step
func (*Pruner) Run ¶ added in v0.8.4
func (r *Pruner) Run()
Run calls the pruning steps. The pruning algorithm is a heuristic method that removes the following pairs:
- Allelic, these are directly the pairs of allelic contigs given in the allele table
- Cross-allelic, these are any contigs that connect to the allelic contigs so we only keep the best contig pair
Pruned edges are then annotated as allelic/cross-allelic/ok
type RECountsFile ¶ added in v0.9.8
type RECountsFile struct { Filename string // File path Records []RECountsRecord // List of records }
RECountsFile holds a list of RECountsRecord
func (*RECountsFile) ParseRecords ¶ added in v0.9.8
func (r *RECountsFile) ParseRecords()
ParseRecords reads a list of records from REFile
type RECountsRecord ¶ added in v0.9.8
type RECountsRecord struct { Contig string // Name of the contig RECounts int // Number of restriction sites Length int // Length of the contig, in base pairs }
RECountsRecord contains a line in the RE file
type Range ¶ added in v0.8.4
type Range struct {
// contains filtered or unexported fields
}
Range tracks contig:start-end
type SparseMatrix ¶ added in v0.8.4
SparseMatrix stores a big square matrix that is sparse
type Tag ¶ added in v0.9.8
type Tag = interface{}
Tag represents the additional info in the 12+ columns in the PAF file. The type of the tag is dynamically determined
See also: https://github.com/lh3/minimap2/blob/master/minimap2.1
The following tags are supported:
Tag  Type  Description
___  ____  ___________
tp   A     Type of aln: P/primary, S/secondary and I,i/inversion
cm   i     Number of minimizers on the chain
s1   i     Chaining score
s2   i     Chaining score of the best secondary chain
NM   i     Total number of mismatches and gaps in the alignment
MD   Z     To generate the ref sequence in the alignment
AS   i     DP alignment score
ms   i     DP score of the max scoring segment in the alignment
nn   i     Number of ambiguous bases in the alignment
ts   A     Transcript strand (splice mode only)
cg   Z     CIGAR string (only in PAF)
cs   Z     Difference string
dv   f     Approximate per-base sequence divergence
de   f     Gap-compressed per-base sequence divergence
rl   i     Length of query regions harboring repetitive seeds
type Tour ¶
Tour stores a number of tigs along with 2D matrices for evaluation
func (Tour) Crossover ¶
Crossover a Tour with another Tour by using Partially Mixed Crossover (PMX).
func (Tour) EvaluateSumLog ¶ added in v0.8.12
EvaluateSumLog calculates a score for the current tour