bam

package

v0.0.0-...-d966d87 Latest Latest Go to latest Published: Aug 18, 2020 License: Apache-2.0 Imports: 26 Imported by: 14

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/grailbio/bio

Links

Open Source Insights

Documentation ¶

Overview ¶

Package bam provides types and functions that augment BAM and SAM packages in github.com/biogo/hts.

Index ¶

Constants
Variables
func BaseAtPos(record *sam.Record, pos int) (byte, bool)
func ClearAuxTags(r *sam.Record, tagsToRemove []sam.Tag)
func CoordFromSAMRecord(rec *sam.Record, seq int32) biopb.Coord
func FivePrimeClipDistance(record *sam.Record) int
func GetCoordAtOffset(bamReader *bam.Reader, off bgzf.Offset) (biopb.Coord, error)
func HasNoMappedMate(record *sam.Record) bool
func IsDuplicate(record *sam.Record) bool
func IsLinearDuplicate(record *sam.Record) bool
func IsMateReverse(record *sam.Record) bool
func IsMateUnmapped(record *sam.Record) bool
func IsPaired(record *sam.Record) bool
func IsPrimary(record *sam.Record) bool
func IsProperPair(record *sam.Record) bool
func IsQCFail(record *sam.Record) bool
func IsQCFailed(record *sam.Record) bool
func IsRead1(record *sam.Record) bool
func IsRead2(record *sam.Record) bool
func IsReverse(record *sam.Record) bool
func IsReversedRead(record *sam.Record) bool
func IsSecondary(record *sam.Record) bool
func IsSupplementary(record *sam.Record) bool
func IsUnmapped(record *sam.Record) bool
func LeftClipDistance(record *sam.Record) int
func MateCoordFromSAMRecord(rec *sam.Record, seq int32) biopb.Coord
func NewCoord(ref *sam.Reference, pos int, seq int32) biopb.Coord
func NewShardChannel(shards []Shard) chan Shard
func QualAtPos(record *sam.Record, pos int) (byte, bool)
func RightClipDistance(record *sam.Record) int
func ShardToCoordRange(shard Shard) biopb.CoordRange
func ToBGZFOffset(voffset uint64) bgzf.Offset
func UnclippedEnd(record *sam.Record) int
func UnclippedFivePrimePosition(record *sam.Record) int
func UnclippedStart(record *sam.Record) int
func Unmarshal(b []byte, header *sam.Header) (*sam.Record, error)
func UnmarshalHeader(buf []byte) (*sam.Header, error)
func UnsafeBytesToCigar(src []byte) (cigar sam.Cigar)
func UnsafeBytesToDoublets(src []byte) (d []sam.Doublet)
func UnsafeCigarToBytes(src sam.Cigar) (d []byte)
func UnsafeDoubletsToBytes(src []sam.Doublet) (d []byte)
func ValidateShardList(header *sam.Header, shardList []Shard, padding int)
func WriteGIndex(w io.Writer, r io.Reader, byteInterval, parallelism int) error
type AdjacentBAMShard
- func (s *AdjacentBAMShard) Record() Pair
- func (s *AdjacentBAMShard) Scan() bool
type AdjacentShardedBAMReader
- func NewAdjacentShardedBAMReader(ctx context.Context, r io.Reader, recordsPerShard int, queueSize int) (*AdjacentShardedBAMReader, error)
- func (r *AdjacentShardedBAMReader) GetShard() *AdjacentBAMShard
- func (r *AdjacentShardedBAMReader) Header() *sam.Header
type Bin
type Chunk
type CoordGenerator
- func NewCoordGenerator() CoordGenerator
- func (g *CoordGenerator) Generate(refID, pos int32) biopb.Coord
- func (g *CoordGenerator) GenerateFromRecord(rec *sam.Record) biopb.Coord
type FieldType
- func ParseFieldType(v string) (FieldType, error)
- func (f FieldType) String() string
type GIndex
- func ReadGIndex(r io.Reader) (gindex *GIndex, err error)
- func (idx *GIndex) RecordOffset(refID, pos int32, seq uint32) bgzf.Offset
- func (idx *GIndex) UnmappedOffset() bgzf.Offset
type GIndexEntry
type Index
- func ReadIndex(rawr io.Reader) (*Index, error)
- func (i *Index) AllOffsets() map[int][]bgzf.Offset
type Metadata
type Pair
type Reference
type Shard
- func CoordRangeToShard(header *sam.Header, r biopb.CoordRange, padding, shardIdx int) Shard
- func GetByteBasedShards(bamPath, baiPath string, bytesPerShard int64, minBases, padding int, ...) (shards []Shard, err error)
- func GetPositionBasedShards(header *sam.Header, shardSize int, padding int, includeUnmapped bool) ([]Shard, error)
- func UniversalShard(header *sam.Header) Shard
- func (s *Shard) CoordInShard(padding int, coord biopb.Coord) bool
- func (s *Shard) MateInShard(r *sam.Record) bool
- func (s *Shard) PadEnd(padding int) int
- func (s *Shard) PadStart(padding int) int
- func (s *Shard) PaddedEnd() int
- func (s *Shard) PaddedStart() int
- func (s *Shard) RecordInPaddedShard(r *sam.Record) bool
- func (s *Shard) RecordInShard(r *sam.Record) bool
- func (s *Shard) RecordInStartPadding(r *sam.Record) bool
- func (s *Shard) String() string
type ShardedBAMCompressor
- func (c *ShardedBAMCompressor) AddRecord(r *sam.Record) error
- func (c *ShardedBAMCompressor) CloseShard() error
- func (c *ShardedBAMCompressor) StartShard(shardNum int) error
type ShardedBAMWriter
- func NewShardedBAMWriter(w io.Writer, gzLevel, queueSize int, header *sam.Header) (*ShardedBAMWriter, error)
- func (bw *ShardedBAMWriter) Close() error
- func (bw *ShardedBAMWriter) GetCompressor() *ShardedBAMCompressor
type StrandType
- func GetStrand(flags sam.Flags) StrandType

Constants ¶

View Source

const CigarOpSize = int(unsafe.Sizeof(sam.CigarOp(0)))

CigarOpSize is the size of one sam.CigarOp, in bytes.

Variables ¶

View Source

var FieldNames = []string{
	"coord",
	"flags",
	"mapq",
	"cigar",
	"materefid",
	"matepos",
	"templen",
	"name",
	"seq",
	"qual",
	"aux",
}

FieldNames lists all the bam Field names.

View Source

var MappedRange = biopb.CoordRange{
	Start: biopb.Coord{0, 0, 0},
	Limit: biopb.Coord{biopb.LimitValidRefID, biopb.InfinityPos, 0},
}

MappedRange is a range that covers all mapped records.

View Source

var UniversalRange = biopb.CoordRange{
	Start: biopb.Coord{0, 0, 0},
	Limit: biopb.Coord{biopb.InfinityRefID, biopb.InfinityPos, 0},
}

UniversalRange is a range that covers all possible records.

Functions ¶

func BaseAtPos ¶

func BaseAtPos(record *sam.Record, pos int) (byte, bool)

BaseAtPos returns the base at reference pos (0 based) from record, if the mapped part of the read overlaps pos. If not, return (0, false). If pos is in the mapped portion of the read, but the reference base was skipped, then the returned value will be (0, true).

func ClearAuxTags ¶

func ClearAuxTags(r *sam.Record, tagsToRemove []sam.Tag)

ClearAuxTags removes all instances of the tags in tagsToRemove[] from r. (Current implementation is not designed for very large len(tagsToRemove); at some point map lookups become better.)

func CoordFromSAMRecord ¶

func CoordFromSAMRecord(rec *sam.Record, seq int32) biopb.Coord

CoordFromSAMRecord computes the biopb.Coord for the given record. It is a shorthand for NewCoord(rec.Ref, rec.Pos, seq).

func FivePrimeClipDistance ¶

func FivePrimeClipDistance(record *sam.Record) int

FivePrimeClipDistance returns the total amount of clipping (both hard and soft) on the 5' side of record.

func GetCoordAtOffset ¶

func GetCoordAtOffset(bamReader *bam.Reader, off bgzf.Offset) (biopb.Coord, error)

GetCoordAtOffset starts reading BAM from "off", and finds the first place where the read position increases. It returns the record coordinate. Coord.Seq field is always zero.

func HasNoMappedMate ¶

func HasNoMappedMate(record *sam.Record) bool

HasNoMappedMate returns true if record is unpaired or has an unmapped mate.

func IsDuplicate ¶

func IsDuplicate(record *sam.Record) bool

IsDuplicate returns true if record is a duplicate.

func IsLinearDuplicate ¶

func IsLinearDuplicate(record *sam.Record) bool

IsLinearDuplicate returns true if record is a linear duplicate.

func IsMateReverse ¶

func IsMateReverse(record *sam.Record) bool

IsMateReverse returns true if mate of record maps to reverse strand.

func IsMateUnmapped ¶

func IsMateUnmapped(record *sam.Record) bool

IsMateUnmapped returns true if mate of record is unmapped.

func IsPaired ¶

func IsPaired(record *sam.Record) bool

IsPaired returns true if record is paired.

func IsPrimary ¶

func IsPrimary(record *sam.Record) bool

IsPrimary returns true if record is a primary alignment.

func IsProperPair ¶

func IsProperPair(record *sam.Record) bool

IsProperPair returns true if record is properly aligned.

func IsQCFail ¶

func IsQCFail(record *sam.Record) bool

IsQCFail returns true if record does not pass quality control filters.

func IsQCFailed ¶

func IsQCFailed(record *sam.Record) bool

IsQCFailed returns true if the QC failed flag is set on record.

func IsRead1 ¶

func IsRead1(record *sam.Record) bool

IsRead1 returns true if record is first in pair.

func IsRead2 ¶

func IsRead2(record *sam.Record) bool

IsRead2 returns true if record is second in pair.

func IsReverse ¶

func IsReverse(record *sam.Record) bool

IsReverse returns true if record maps to reverse strand.

func IsReversedRead ¶

func IsReversedRead(record *sam.Record) bool

IsReversedRead returns true if the reverse flag is set on record.

func IsSecondary ¶

func IsSecondary(record *sam.Record) bool

IsSecondary returns true if record is a secondary alignment.

func IsSupplementary ¶

func IsSupplementary(record *sam.Record) bool

IsSupplementary returns true if record is a supplementary alignment.

func IsUnmapped ¶

func IsUnmapped(record *sam.Record) bool

IsUnmapped returns true if record is unmapped.

func LeftClipDistance ¶

func LeftClipDistance(record *sam.Record) int

LeftClipDistance returns the total amount of clipping (both hard and soft) on the left-most side of record.

func MateCoordFromSAMRecord ¶

func MateCoordFromSAMRecord(rec *sam.Record, seq int32) biopb.Coord

func NewCoord ¶

func NewCoord(ref *sam.Reference, pos int, seq int32) biopb.Coord

NewCoord generates biopb.Coord from the given parameters.

func NewShardChannel ¶

func NewShardChannel(shards []Shard) chan Shard

NewShardChannel returns a closed channel containing the shards.

func QualAtPos ¶

func QualAtPos(record *sam.Record, pos int) (byte, bool)

QualAtPos returns the base quality byte at reference pos (0 based) from record, if the mapped part of the read overlaps pos. If not, return (0, false). If pos is in the mapped portion of the read, but the reference base was skipped, then the returned value will be (0, true).

func RightClipDistance ¶

func RightClipDistance(record *sam.Record) int

RightClipDistance returns the total amount of clipping (both hard and soft) on the right-most side of record.

func ShardToCoordRange ¶

func ShardToCoordRange(shard Shard) biopb.CoordRange

ShardToCoordRange converts bam.Shard to CoordRange.

func ToBGZFOffset ¶

func ToBGZFOffset(voffset uint64) bgzf.Offset

ToBGZFOffset takes a uint64 voffset and returns a bgzf.Offset.

func UnclippedEnd ¶

func UnclippedEnd(record *sam.Record) int

UnclippedEnd returns the unclipped right-most position of record, regardless of record's read direction.

func UnclippedFivePrimePosition ¶

func UnclippedFivePrimePosition(record *sam.Record) int

UnclippedFivePrimePosition returns the unclipped 5' position of record.

func UnclippedStart ¶

func UnclippedStart(record *sam.Record) int

UnclippedStart returns the unclipped left-most position of record, regardless of record's read direction.

func Unmarshal ¶

func Unmarshal(b []byte, header *sam.Header) (*sam.Record, error)

Unmarshal a serialized BAM record.

func UnmarshalHeader ¶

func UnmarshalHeader(buf []byte) (*sam.Header, error)

UnmarshalHeader parses a sam.Header encoded in BAM binary format.

func UnsafeBytesToCigar ¶

func UnsafeBytesToCigar(src []byte) (cigar sam.Cigar)

UnsafeBytesToCigar casts src to sam.Cigar. "src" must store an array of uint32s (sam.CigarOps) in host byte order.

func UnsafeBytesToDoublets ¶

func UnsafeBytesToDoublets(src []byte) (d []sam.Doublet)

UnsafeBytesToDoublets casts []byte to []sam.Doublet.

func UnsafeCigarToBytes ¶

func UnsafeCigarToBytes(src sam.Cigar) (d []byte)

UnsafeCigarToBytes casts a cigar string to []byte.

func UnsafeDoubletsToBytes ¶

func UnsafeDoubletsToBytes(src []sam.Doublet) (d []byte)

UnsafeDoubletsToBytes casts []sam.Doublet to []byte.

func ValidateShardList ¶

func ValidateShardList(header *sam.Header, shardList []Shard, padding int)

ValidateShardList validates that shardList has sensible values. Exposed only for testing.

func WriteGIndex ¶

func WriteGIndex(w io.Writer, r io.Reader, byteInterval, parallelism int) error

WriteGIndex reads a .bam file from r, and writes a .gbai file to w. The spacing between voffset file locations will be approximately byteInterval, and parallelism controls the .bam file read parallelism. Currently, WriteGIndex will not create two index entries for a given (RefID, Pos) pair, i.e. Seq will always be zero. That means there will be only one entry for the entire unmapped region.

Types ¶

type AdjacentBAMShard ¶

type AdjacentBAMShard struct {
	// ShardIdx is the index of the shard.
	// Indexing starts at zero.
	ShardIdx int
	// contains filtered or unexported fields
}

AdjacentBAMShard represents an ordered subset of records from an AdjacentShardedBAMReader. The order of AdjacentBAMShard is determined by the shard's ShardIdx. If all of an AdjacentShardedBAMReader's AdjacentBAMShards were read sequentially from shard 0 to n, all the records would be read in the same order they appear in the underlying BAM file.

func (*AdjacentBAMShard) Record ¶

func (s *AdjacentBAMShard) Record() Pair

Record returns the current pair, or an error.

REQUIRES: Scan() has been called and its last call returned true.

func (*AdjacentBAMShard) Scan ¶

func (s *AdjacentBAMShard) Scan() bool

Scan reads the next record. It returns true if a record has been read or if an error is encountered, and false on end of data stream.

type AdjacentShardedBAMReader ¶

type AdjacentShardedBAMReader struct {
	// contains filtered or unexported fields
}

AdjacentShardedBAMReader provides a deterministic way to read a BAM file, provided that all records are grouped into adjacent pairs. Each AdjacentBAMShard has sequentially increasing shard numbers starting at zero. The records in each AdjacentBAMShard are in the same order as they appear in the underlying BAM file.

AdjacentBAMShards are returned one at a time by GetShard() as soon as a shard is ready. Shards are created automatically in a goroutine when NewAdjacentShardedBAMReader() is called. Each AdjacentBAMShard is thread-safe.

When records are adjacent, there is no BAM index so the number of shards is indeterminate. Because of this, it is recommended that the caller limits the number of goroutines used for reading a BAM file. In the example below, the number of goroutines is limited to the number of available CPUs. If there are more shards that can be concurrently processed than CPUs, there will be multiple shards sequentially processed in the same goroutine.

Example Use of AdjacentShardedBAMReader:

ctx := context.Background()
f, _ := os.Open("input.bam")
r, _ := NewAdjacentShardedBAMReader(ctx, f, 100000, 2)

err = traverse.CPU(func() error {
   for {
      shard := r.GetShard()
      if shard == nil { break }
      for shard.Scan() {
            pair := shard.Record()
            if pair.Err != nil { return pair.Err }
            // Do something with the record
            // and use shard.ShardIdx to denote
            // the order of the shards.
      }
   }
   return nil
})

func NewAdjacentShardedBAMReader ¶

func NewAdjacentShardedBAMReader(ctx context.Context, r io.Reader, recordsPerShard int, queueSize int) (*AdjacentShardedBAMReader, error)

NewAdjacentShardedBAMReader returns a new AdjacentShardedBAMReader that allows for the concurrent reading of a BAM file with adjacent paired records.

func (*AdjacentShardedBAMReader) GetShard ¶

func (r *AdjacentShardedBAMReader) GetShard() *AdjacentBAMShard

GetShard returns one AdjacentBAMShard from AdjacentShardedBAMReader. GetShard will wait until a shard is available, or until AdjacentShardedBAMReader has no more shards to return.

func (*AdjacentShardedBAMReader) Header ¶

func (r *AdjacentShardedBAMReader) Header() *sam.Header

Header returns the SAM Header held by the Reader.

type Bin ¶

type Bin struct {
	BinNum uint32
	Chunks []Chunk
}

Bin represents the bin data within a .bai file.

type Chunk ¶

type Chunk struct {
	Begin bgzf.Offset
	End   bgzf.Offset
}

Chunk represents the Chunk data within a .bai file.

type CoordGenerator ¶

type CoordGenerator struct {
	LastRec biopb.Coord
}

CoordGenerator is a helper class for computing the Coord.Seq value from a sam.Record. This object must be created per pam shard. Generate() must be called for every record that is being read or written to the pam file in order.

func NewCoordGenerator ¶

func NewCoordGenerator() CoordGenerator

NewCoordGenerator creates a new CoordGenerator.

func (*CoordGenerator) Generate ¶

func (g *CoordGenerator) Generate(refID, pos int32) biopb.Coord

Generate generates the Coord for the given (refid,pos).

REQUIRES: successive calls to this function must supply a non-decreasing sequence of (ref,pos) values.

func (*CoordGenerator) GenerateFromRecord ¶

func (g *CoordGenerator) GenerateFromRecord(rec *sam.Record) biopb.Coord

GenerateFromRecord generates the Coord for the given record.

REQUIRES: successive calls to this function must supply record in non-decreasing coordinate order.

type FieldType ¶

type FieldType uint8

FieldType defines a sam.Record field. Each field is stored in a separate file.

const (
	// FieldCoord combines <sam.Reference.ID(), sam.Record.Pos>. They need to be
	// read together when seeking to a specific coordinate.
	FieldCoord FieldType = iota
	// Rest of Field* stands for the sam.Record field with the same name.
	FieldFlags
	FieldMapq
	FieldCigar
	FieldMateRefID
	FieldMatePos
	FieldTempLen
	FieldName
	FieldSeq
	FieldQual
	FieldAux

	// FieldInvalid is a sentinel
	FieldInvalid
	NumFields = int(FieldInvalid)
)

func ParseFieldType ¶

func ParseFieldType(v string) (FieldType, error)

ParseFieldType converts a string to FieldType. For example, "cigar" will return FieldCigar.

func (FieldType) String ¶

func (f FieldType) String() string

String returns the name of the type. The name is used as part of the PAM filenames, so it shall not be changed.

type GIndex ¶

type GIndex []GIndexEntry

GIndex is an alternate .bam file index format that uses the .gbai file extension. The .gbai file format contains mappings from genomic position to .bam file voffset. This index format is simpler than the legacy style .bai file, but allows a user to seek into a .bam file much more efficiently for some genomic positions.

The .gbai format exists because the .bai format can point into a .bam file with a minimum genomic spacing of 16 kbp. The problem with this minimum spacing is that if there are many alignments in the .bam file within a 16 kbp region, then seeking to a target genomic position within the 16 kbp region requires the reader to seek to the beginning of the 16 kbp region and then scan through bam records until reaching the target genomic position. This scanning requires unnecessary IO and CPU time for reading and decompressing records that come before the target position.

The .gbai file format contains a set of mappings from (genomic position, and record number at that position) to the voffset in the bam file where the record begins. In typical use, the spacing between the genomic positions in the .gbai file is chosen so that the spacing between voffsets in the .bam file is uniform and relatively small. This allows a user to divide the .bam file into uniformly sized shards. For example, 64 KBytes is a reasonable default spacing between voffsets. This spacing allows a reader to seek directly to within 64 KBytes of any target genomic position.

The on disk .gbai format is a header followed by a sequence of entries. The header consists of the magic byte sequence {0x47, 0x42, 0x41, 0x49, 0x01, 0xf1, 0x78, 0x5c, 0x7b, 0xcb, 0xc1, 0xba, 0x08, 0x23, 0xb1, 0x19}, which is "GBAI" followed by the byte 0x01 and then 11 random bytes.

Each entry consists of 4 values, each in little-endian byte order:

int32 RefID to match the .bam file RefIDs. The unmapped records at the end of the .bam have RefID equal to -1.
int32 Position to match the .bam file Positions
uint32 Sequence number of the record at the particular (RefID, Position) pair. If the record is the first record with this (RefID, Position) pair, then Sequence will be 0. If the record is the second, then Sequence will be 1, and so on.
uint64 VOffset of the record in the .bam file as described in the .bam specification.

The .gbai index entries are sorted in ascending order using the key (RefID, Position, Sequence) and the .gbai index requires that the corresponding .bam file is also sorted by position.

If the bam file contains a bam record for a given RefID, then the gindex contains an entry for the first bam record with the given RefID. This implies that the first entry in the gindex points to the first record in the bam file. If there are no bam records with RefID R, then there will be no entries in the gindex with RefID R.

The series of index entries is then compressed with gzip before writing to the .gbai file.

func ReadGIndex ¶

func ReadGIndex(r io.Reader) (gindex *GIndex, err error)

ReadGIndex expects a .gbai file as r, and returns the parsed GIndex and any errors encountered while reading and unmarshalling r.

func (*GIndex) RecordOffset ¶

func (idx *GIndex) RecordOffset(refID, pos int32, seq uint32) bgzf.Offset

RecordOffset returns a voffset into the bam from which reading forward will eventually read records at the target position. When reading from the returned voffset, if the bam record's (refid, position) is greater than the target (refid, position), then the target position is not present in the bam file.

func (*GIndex) UnmappedOffset ¶

func (idx *GIndex) UnmappedOffset() bgzf.Offset

UnmappedOffset returns a voffset at or before the first read in the .bam's unmapped section.

type GIndexEntry ¶

type GIndexEntry struct {
	RefID   int32
	Pos     int32
	Seq     uint32
	VOffset uint64
}

GIndexEntry is one entry of the .gbai index.

type Index ¶

type Index struct {
	Magic         [4]byte
	Refs          []Reference
	UnmappedCount *uint64
}

Index represents the content of a .bai index file (for use with a .bam file).

func ReadIndex ¶

func ReadIndex(rawr io.Reader) (*Index, error)

ReadIndex parses the content of rawr and returns an Index or nil and an error.

func (*Index) AllOffsets ¶

func (i *Index) AllOffsets() map[int][]bgzf.Offset

AllOffsets returns a map of chunk offsets in the index file; it includes chunk begin locations and interval locations. The key of the map is the Reference ID, and the value is a slice of bgzf.Offsets. The returned map will have an entry for every reference ID, even if the list of offsets is empty.

type Metadata ¶

type Metadata struct {
	UnmappedBegin uint64
	UnmappedEnd   uint64
	MappedCount   uint64
	UnmappedCount uint64
}

Metadata represents the Metadata data within a .bai file.

type Pair ¶

type Pair struct {
	R1  *sam.Record
	R2  *sam.Record
	Err error
}

Pair encapsulates a pair of SAM records for a pair of reads, and whether any error was encountered in retrieving them.

type Reference ¶

type Reference struct {
	Bins      []Bin
	Intervals []bgzf.Offset
	Meta      Metadata
}

Reference represents the reference data within a .bai file.

type Shard ¶

type Shard struct {
	StartRef *sam.Reference
	EndRef   *sam.Reference
	Start    int
	End      int
	StartSeq int
	EndSeq   int

	Padding  int
	ShardIdx int
}

Shard represents a genomic interval. The <StartRef,Start,StartSeq> and <EndRef,End,EndSeq> coordinates form a half-open, 0-based interval. An iterator for such a range will return reads whose start positions fall within that range.

The StartSeq, EndSeq fields are used to distinguish a list of reads that start at the same coordinate. The Nth read that starts at the coordinate is assigned the seq value of N-1 (assuming N is 1-based). For example, passing range [(startref=10,start=100,startseq=15), (limitref=10,limit=100,limitseq=20)] will read 16th to 20th read sequences at coordinate (10,100).

Use of non-zero {Start,End}Seq is supported only in PAM files. For BAM files, *Seq must be zero.

An unmapped sequence has coordinate (nil,0,seq), and it is stored after any mapped sequence. Thus, a shard that contains an unmapped sequence will have <EndRef=nil, End=1, EndSeq=0> (in theory, End can be any value > 0, but in practice we use End=1).

Padding must be >=0. It expands the read range to [PaddedStart, PaddedEnd), where PaddedStart=max(0, Start-Padding) and PaddedEnd=min(EndRef.Len(), End+Padding). The regions [PaddedStart,Start) and [End,PaddedEnd) are not part of the shard, since the padding regions will overlap with another Shard's [Start, End).

The Shards are ordered according to the order of the bam input file. ShardIdx is an index into that ordering. The first Shard has index 0, and the subsequent shards increment the ShardIdx by one each.

func CoordRangeToShard ¶

func CoordRangeToShard(header *sam.Header, r biopb.CoordRange, padding, shardIdx int) Shard

CoordRangeToShard converts a biopb.CoordRange to bam.Shard.

func GetByteBasedShards ¶

func GetByteBasedShards(bamPath, baiPath string, bytesPerShard int64, minBases, padding int, includeUnmapped bool) (shards []Shard, err error)

GetByteBasedShards returns a list of shards much like GetPositionBasedShards, but the shards are based on a target bytesPerShard, and a minimum number of bases per shard (minBases). baiPath can point to a traditional style .bai index, or a new style .gbai index.

func GetPositionBasedShards ¶

func GetPositionBasedShards(header *sam.Header, shardSize int, padding int, includeUnmapped bool) ([]Shard, error)

GetPositionBasedShards returns a list of shards that cover the genome using the specified shard size and padding size. Return a shard for the unmapped && mate-unmapped pairs if includeUnmapped is true.

The Shards split the BAM data from the given provider into contiguous, non-overlapping genomic intervals (Shards). A SAM record is associated with a shard if its alignment start position is within the given padding distance of the shard. This means reads near shard boundaries may be associated with more than one shard.

func UniversalShard ¶

func UniversalShard(header *sam.Header) Shard

UniversalShard creates a Shard that covers the entire genome and unmapped reads.

func (*Shard) CoordInShard ¶

func (s *Shard) CoordInShard(padding int, coord biopb.Coord) bool

CoordInShard returns whether coord is within the shard plus the supplied padding (this uses the padding parameter in place of s.Padding).

func (*Shard) MateInShard ¶

func (s *Shard) MateInShard(r *sam.Record) bool

MateInShard returns true if mate of r is in s.

func (*Shard) PadEnd ¶

func (s *Shard) PadEnd(padding int) int

PadEnd returns min(s.End+padding, length of s.EndRef).

func (*Shard) PadStart ¶

func (s *Shard) PadStart(padding int) int

PadStart returns max(s.Start-padding, 0).

func (*Shard) PaddedEnd ¶

func (s *Shard) PaddedEnd() int

PaddedEnd computes the effective limit of the range to read, including padding.

func (*Shard) PaddedStart ¶

func (s *Shard) PaddedStart() int

PaddedStart computes the effective start of the range to read, including padding.

func (*Shard) RecordInPaddedShard ¶

func (s *Shard) RecordInPaddedShard(r *sam.Record) bool

RecordInPaddedShard returns true if r is in s+padding.

func (*Shard) RecordInShard ¶

func (s *Shard) RecordInShard(r *sam.Record) bool

RecordInShard returns true if r is in s.

func (*Shard) RecordInStartPadding ¶

func (s *Shard) RecordInStartPadding(r *sam.Record) bool

RecordInStartPadding returns true if r is in the start padding of s.

func (*Shard) String ¶

func (s *Shard) String() string

String returns a debug string for s.

type ShardedBAMCompressor ¶

type ShardedBAMCompressor struct {
	// contains filtered or unexported fields
}

ShardedBAMCompressor contains the state of an in-progress compressed shard. A caller should create a ShardedBAMCompressor using ShardedBAMWriter.GetCompressor(). The ShardedBAMCompressor will compress the records and store the compressed bytes until the caller is finished with the shard. When the caller is finished adding records, the caller should call CloseShard(). More than one ShardedBAMCompressor can exist at once, and they can all compress records in parallel with each other.

func (*ShardedBAMCompressor) AddRecord ¶

func (c *ShardedBAMCompressor) AddRecord(r *sam.Record) error

AddRecord adds a sam record to the current in-progress shard.

func (*ShardedBAMCompressor) CloseShard ¶

func (c *ShardedBAMCompressor) CloseShard() error

CloseShard finalizes the in-progress shard, and passes the compressed data to its parent ShardedBAMWriter. It removes the current shard from the compressor and prepares the compressor for the next call to StartShard().

The ShardedBAMWriter will buffer shards up to its queue size, so the caller must be careful about how out of order it is when calling CloseShard(), otherwise, calls to CloseShard() will block.

func (*ShardedBAMCompressor) StartShard ¶

func (c *ShardedBAMCompressor) StartShard(shardNum int) error

StartShard begins a new shard with the specified shard number. If the compressor still has data from the previous shard, it will crash.

type ShardedBAMWriter ¶

type ShardedBAMWriter struct {
	// contains filtered or unexported fields
}

ShardedBAMWriter writes out ShardedBAMBuffers in the order of their shard numbers.

func NewShardedBAMWriter ¶

func NewShardedBAMWriter(w io.Writer, gzLevel, queueSize int, header *sam.Header) (*ShardedBAMWriter, error)

NewShardedBAMWriter creates a new ShardedBAMWriter that writes the output bam to w.

func (*ShardedBAMWriter) Close ¶

func (bw *ShardedBAMWriter) Close() error

Close the bam file. This should be called only after all shards have been added with WriteShard. Returns an error on failure.

func (*ShardedBAMWriter) GetCompressor ¶

func (bw *ShardedBAMWriter) GetCompressor() *ShardedBAMCompressor

GetCompressor returns a child ShardedBAMCompressor.

type StrandType ¶

type StrandType int

const (
	// StrandNone denotes an unmapped read.
	StrandNone StrandType = iota
	// StrandFwd denotes a read mapped to the R1+ strand.
	StrandFwd
	// StrandRev denotes a read mapped to the R1- strand.
	StrandRev
)

func GetStrand ¶

func GetStrand(flags sam.Flags) StrandType

GetStrand returns whether the current read is mapped to the R1+ strand, the R1- strand, or neither, in a manner that ignores all flags associated with the other read end.

Source Files ¶

View all Source files

Directories ¶

Path	Synopsis
process_example

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL

Documentation ¶

Overview ¶

Index ¶

Constants ¶

Variables ¶

Functions ¶

func BaseAtPos ¶

func ClearAuxTags ¶

func CoordFromSAMRecord ¶

func FivePrimeClipDistance ¶

func GetCoordAtOffset ¶

func HasNoMappedMate ¶

func IsDuplicate ¶

func IsLinearDuplicate ¶

func IsMateReverse ¶

func IsMateUnmapped ¶

func IsPaired ¶

func IsPrimary ¶

func IsProperPair ¶

func IsQCFail ¶

func IsQCFailed ¶

func IsRead1 ¶

func IsRead2 ¶

func IsReverse ¶

func IsReversedRead ¶

func IsSecondary ¶

func IsSupplementary ¶

func IsUnmapped ¶

func LeftClipDistance ¶

func MateCoordFromSAMRecord ¶

func NewCoord ¶

func NewShardChannel ¶

func QualAtPos ¶

func RightClipDistance ¶

func ShardToCoordRange ¶

func ToBGZFOffset ¶

func UnclippedEnd ¶

func UnclippedFivePrimePosition ¶

func UnclippedStart ¶

func Unmarshal ¶

func UnmarshalHeader ¶

func UnsafeBytesToCigar ¶

func UnsafeBytesToDoublets ¶

func UnsafeCigarToBytes ¶

func UnsafeDoubletsToBytes ¶

func ValidateShardList ¶

func WriteGIndex ¶

Types ¶

type AdjacentBAMShard ¶

func (*AdjacentBAMShard) Record ¶

func (*AdjacentBAMShard) Scan ¶

type AdjacentShardedBAMReader ¶

func NewAdjacentShardedBAMReader ¶

func (*AdjacentShardedBAMReader) GetShard ¶

func (*AdjacentShardedBAMReader) Header ¶

type Bin ¶

type Chunk ¶

type CoordGenerator ¶

func NewCoordGenerator ¶

func (*CoordGenerator) Generate ¶

func (*CoordGenerator) GenerateFromRecord ¶

type FieldType ¶

func ParseFieldType ¶

func (FieldType) String ¶

type GIndex ¶

func ReadGIndex ¶

func (*GIndex) RecordOffset ¶

func (*GIndex) UnmappedOffset ¶

type GIndexEntry ¶

type Index ¶

func ReadIndex ¶

func (*Index) AllOffsets ¶

type Metadata ¶

type Pair ¶

type Reference ¶

type Shard ¶

func CoordRangeToShard ¶

func GetByteBasedShards ¶

func GetPositionBasedShards ¶

func UniversalShard ¶