sstable

package

v1.1.2 Latest Latest Go to latest Published: Aug 13, 2024 License: BSD-3-Clause Imports: 36 Imported by: 73

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/cockroachdb/pebble

Links

Open Source Insights

Documentation ¶

Overview ¶

Package sstable implements readers and writers of pebble tables.

Tables are either opened for reading or created for writing but not both.

A reader can create iterators, which allow seeking and next/prev iteration. There may be multiple key/value pairs that have the same key and different sequence numbers.

A reader can be used concurrently. Multiple goroutines can call NewIter concurrently, and each iterator can run concurrently with other iterators. However, any particular iterator should not be used concurrently, and iterators should not be used once a reader is closed.

A writer writes key/value pairs in increasing key order, and cannot be used concurrently. A table cannot be read until the writer has finished.

Readers and writers can be created with various options. Passing a nil Options pointer is valid and means to use the default values.

One such option is to define the 'less than' ordering for keys. The default Comparer uses the natural ordering consistent with bytes.Compare. The same ordering should be used for reading and writing a table.

To return the value for a key:

r := table.NewReader(file, options)
defer r.Close()
i := r.NewIter(nil, nil)
defer i.Close()
ikey, value := i.SeekGE(key)
if options.Comparer.Compare(ikey.UserKey, key) != 0 {
  // not found
} else {
  // value is the first record containing key
}

To count the number of entries in a table:

i, n := r.NewIter(nil, nil), 0
for key, value := i.First(); key != nil; key, value = i.Next() {
	n++
}
if err := i.Close(); err != nil {
	return 0, err
}
return n, nil

To write a table with three entries:

w := table.NewWriter(file, options)
if err := w.Set([]byte("apple"), []byte("red")); err != nil {
	w.Close()
	return err
}
if err := w.Set([]byte("banana"), []byte("yellow")); err != nil {
	w.Close()
	return err
}
if err := w.Set([]byte("cherry"), []byte("red")); err != nil {
	w.Close()
	return err
}
return w.Close()

Index ¶

Constants
Variables
func NewSimpleReadable(r ReadableFile) (objstorage.Readable, error)
func RewriteKeySuffixesAndReturnFormat(sst []byte, rOpts ReaderOptions, out objstorage.Writable, o WriterOptions, ...) (*WriterMetadata, TableFormat, error)
type AbbreviatedKey
type BlockHandle
type BlockHandleWithProperties
type BlockIntervalCollector
- func (b *BlockIntervalCollector) Add(key InternalKey, value []byte) error
- func (b *BlockIntervalCollector) AddPrevDataBlockToIndexBlock()
- func (b *BlockIntervalCollector) FinishDataBlock(buf []byte) ([]byte, error)
- func (b *BlockIntervalCollector) FinishIndexBlock(buf []byte) ([]byte, error)
- func (b *BlockIntervalCollector) FinishTable(buf []byte) ([]byte, error)
- func (b *BlockIntervalCollector) Name() string
type BlockIntervalFilter
- func NewBlockIntervalFilter(name string, lower uint64, upper uint64) *BlockIntervalFilter
- func NewTestKeysBlockPropertyFilter(filterMin, filterMax uint64) *BlockIntervalFilter
- func (b *BlockIntervalFilter) Init(name string, lower, upper uint64)
- func (b *BlockIntervalFilter) Intersects(prop []byte) (bool, error)
- func (b *BlockIntervalFilter) Name() string
- func (b *BlockIntervalFilter) SetInterval(lower, upper uint64)
type BlockPropertiesFilterer
- func IntersectsTable(filters []BlockPropertyFilter, limited BoundLimitedBlockPropertyFilter, ...) (*BlockPropertiesFilterer, error)
type BlockPropertyCollector
- func NewBlockIntervalCollector(name string, pointCollector, rangeCollector DataBlockIntervalCollector) BlockPropertyCollector
- func NewTestKeysBlockPropertyCollector() BlockPropertyCollector
type BlockPropertyFilter
type BoundLimitedBlockPropertyFilter
type Buf
- func (b *Buf) Release()
- func (b Buf) Valid() bool
type BufferPool
- func (p *BufferPool) Alloc(n int) Buf
- func (p *BufferPool) Init(initialSize int)
- func (p *BufferPool) Release()
type ChecksumType
- func (t ChecksumType) String() string
type CommonProperties
- func (c *CommonProperties) NumPointDeletions() uint64
- func (c *CommonProperties) String() string
type CommonReader
type Compare
type Comparer
type Comparers
type Compression
- func (c Compression) String() string
type DataBlockIntervalCollector
type Equal
type FilterMetrics
type FilterMetricsTracker
- func (m *FilterMetricsTracker) Load() FilterMetrics
type FilterPolicy
type FilterType
type FilterWriter
type InternalKey
type InternalKeyKind
type Iterator
type Layout
- func (l *Layout) Describe(w io.Writer, verbose bool, r *Reader, ...)
type Merger
type Mergers
type PreviousPointKeyOpt
- func (o PreviousPointKeyOpt) UnsafeKey() base.InternalKey
type Properties
- func (p *Properties) NumPointDeletions() uint64
- func (p *Properties) NumRangeKeys() uint64
- func (p *Properties) String() string
type ReadableFile
type Reader
- func NewMemReader(sst []byte, o ReaderOptions) (*Reader, error)
- func NewReader(f objstorage.Readable, o ReaderOptions, extraOpts ...ReaderOption) (*Reader, error)
- func (r *Reader) Close() error
- func (r *Reader) CommonProperties() *CommonProperties
- func (r *Reader) EstimateDiskUsage(start, end []byte) (uint64, error)
- func (r *Reader) Layout() (*Layout, error)
- func (r *Reader) NewCompactionIter(bytesIterated *uint64, rp ReaderProvider, bufferPool *BufferPool) (Iterator, error)
- func (r *Reader) NewIter(lower, upper []byte) (Iterator, error)
- func (r *Reader) NewIterWithBlockPropertyFilters(lower, upper []byte, filterer *BlockPropertiesFilterer, useFilterBlock bool, ...) (Iterator, error)
- func (r *Reader) NewIterWithBlockPropertyFiltersAndContextEtc(ctx context.Context, lower, upper []byte, filterer *BlockPropertiesFilterer, ...) (Iterator, error)
- func (r *Reader) NewRawRangeDelIter() (keyspan.FragmentIterator, error)
- func (r *Reader) NewRawRangeKeyIter() (keyspan.FragmentIterator, error)
- func (r *Reader) TableFormat() (TableFormat, error)
- func (r *Reader) TryAddBlockPropertyFilterForHideObsoletePoints(snapshotForHideObsoletePoints uint64, fileLargestSeqNum uint64, ...) (hideObsoletePoints bool, filters []BlockPropertyFilter)
- func (r *Reader) ValidateBlockChecksums() error
type ReaderOption
type ReaderOptions
type ReaderProvider
type SeekGEFlags
type SeekLTFlags
type Separator
type Split
type Successor
type SuffixReplaceableBlockCollector
type SuffixReplaceableTableCollector
type TableFormat
- func ParseTableFormat(magic []byte, version uint32) (TableFormat, error)
- func (f TableFormat) AsTuple() (string, uint32)
- func (f TableFormat) String() string
type TablePropertyCollector
type TestKeysMaskingFilter
- func NewTestKeysMaskingFilter() TestKeysMaskingFilter
- func (f TestKeysMaskingFilter) Intersects(prop []byte) (bool, error)
- func (f TestKeysMaskingFilter) SetSuffix(suffix []byte) error
type TrivialReaderProvider
- func (trp TrivialReaderProvider) Close()
- func (trp TrivialReaderProvider) GetReader() (*Reader, error)
type UserKeyPrefixBound
- func (ukb *UserKeyPrefixBound) IsEmpty() bool
type VirtualReader
- func MakeVirtualReader(reader *Reader, meta manifest.VirtualFileMeta, isForeign bool) VirtualReader
- func (v *VirtualReader) CommonProperties() *CommonProperties
- func (v *VirtualReader) EstimateDiskUsage(start, end []byte) (uint64, error)
- func (v *VirtualReader) NewCompactionIter(bytesIterated *uint64, rp ReaderProvider, bufferPool *BufferPool) (Iterator, error)
- func (v *VirtualReader) NewIterWithBlockPropertyFiltersAndContextEtc(ctx context.Context, lower, upper []byte, filterer *BlockPropertiesFilterer, ...) (Iterator, error)
- func (v *VirtualReader) NewRawRangeDelIter() (keyspan.FragmentIterator, error)
- func (v *VirtualReader) NewRawRangeKeyIter() (keyspan.FragmentIterator, error)
- func (v *VirtualReader) ValidateBlockChecksumsOnBacking() error
type Writer
- func NewWriter(writable objstorage.Writable, o WriterOptions, extraOpts ...WriterOption) *Writer
- func (w *Writer) Add(key InternalKey, value []byte) error
- func (w *Writer) AddRangeKey(key InternalKey, value []byte) error
- func (w *Writer) AddWithForceObsolete(key InternalKey, value []byte, forceObsolete bool) error
- func (w *Writer) Close() (err error)
- func (w *Writer) Delete(key []byte) error
- func (w *Writer) DeleteRange(start, end []byte) error
- func (w *Writer) EstimatedSize() uint64
- func (w *Writer) Merge(key, value []byte) error
- func (w *Writer) Metadata() (*WriterMetadata, error)
- func (w *Writer) RangeKeyDelete(start, end []byte) error
- func (w *Writer) RangeKeySet(start, end, suffix, value []byte) error
- func (w *Writer) RangeKeyUnset(start, end, suffix []byte) error
- func (w *Writer) Set(key, value []byte) error
- func (w *Writer) Write(blockWithTrailer []byte) (n int, err error)
type WriterMetadata
- func RewriteKeySuffixes(sst []byte, rOpts ReaderOptions, out objstorage.Writable, o WriterOptions, ...) (*WriterMetadata, error)
- func RewriteKeySuffixesViaWriter(r *Reader, out objstorage.Writable, o WriterOptions, from, to []byte) (*WriterMetadata, error)
- func (m *WriterMetadata) SetLargestPointKey(k InternalKey)
- func (m *WriterMetadata) SetLargestRangeDelKey(k InternalKey)
- func (m *WriterMetadata) SetLargestRangeKey(k InternalKey)
- func (m *WriterMetadata) SetSmallestPointKey(k InternalKey)
- func (m *WriterMetadata) SetSmallestRangeDelKey(k InternalKey)
- func (m *WriterMetadata) SetSmallestRangeKey(k InternalKey)
type WriterOption
type WriterOptions

Constants ¶

View Source

const (
	InternalKeyKindDelete          = base.InternalKeyKindDelete
	InternalKeyKindSet             = base.InternalKeyKindSet
	InternalKeyKindMerge           = base.InternalKeyKindMerge
	InternalKeyKindLogData         = base.InternalKeyKindLogData
	InternalKeyKindSingleDelete    = base.InternalKeyKindSingleDelete
	InternalKeyKindRangeDelete     = base.InternalKeyKindRangeDelete
	InternalKeyKindSetWithDelete   = base.InternalKeyKindSetWithDelete
	InternalKeyKindDeleteSized     = base.InternalKeyKindDeleteSized
	InternalKeyKindMax             = base.InternalKeyKindMax
	InternalKeyKindInvalid         = base.InternalKeyKindInvalid
	InternalKeySeqNumBatch         = base.InternalKeySeqNumBatch
	InternalKeySeqNumMax           = base.InternalKeySeqNumMax
	InternalKeyRangeDeleteSentinel = base.InternalKeyRangeDeleteSentinel
)

These constants are part of the file format, and should not be changed.

View Source

const MaximumBlockSize = 1 << 28

MaximumBlockSize is an extremely generous maximum block size of 256MiB. We explicitly place this limit to reserve a few bits in the restart for internal use.

View Source

const (
	TableFilter = base.TableFilter
)

Exported TableFilter constants.

Variables ¶

View Source

var DefaultComparer = base.DefaultComparer

DefaultComparer exports the base.DefaultComparer variable.

Functions ¶

func NewSimpleReadable ¶

func NewSimpleReadable(r ReadableFile) (objstorage.Readable, error)

NewSimpleReadable wraps a ReadableFile in an objstorage.Readable implementation (which does not support read-ahead).

func RewriteKeySuffixesAndReturnFormat ¶

func RewriteKeySuffixesAndReturnFormat(
	sst []byte,
	rOpts ReaderOptions,
	out objstorage.Writable,
	o WriterOptions,
	from, to []byte,
	concurrency int,
) (*WriterMetadata, TableFormat, error)

RewriteKeySuffixesAndReturnFormat copies the content of the passed SSTable bytes to a new sstable, written to `out`, in which the suffix `from` is replaced with `to` in every key. The input sstable must consist of only Sets or RangeKeySets and every key must have `from` as its suffix as determined by the Split function of the Comparer in the passed WriterOptions. Range deletes must not exist in this sstable, as they will be ignored.

Data blocks are rewritten in parallel by `concurrency` workers and then assembled into a final SST. Filters are copied from the original SST without modification as they are not affected by the suffix, while block and table properties are only minimally recomputed.

TODO(sumeer): document limitations, if any, due to this limited re-computation of properties (is there any loss of fidelity?).

Any block and table property collectors configured in the WriterOptions must implement SuffixReplaceableTableCollector/SuffixReplaceableBlockCollector.

The WriterOptions.TableFormat is ignored, and the output sstable has the same TableFormat as the input, which is returned in case the caller wants to do some error checking. Suffix rewriting is meant to be efficient, and allowing changes in the TableFormat detracts from that efficiency.

Any obsolete bits that key-value pairs may be annotated with are ignored and lost during the rewrite. Additionally, the output sstable has the pebble.obsolete.is_strict property set to false. These limitations could be removed if needed. The current use case for RewriteKeySuffixesAndReturnFormat in CockroachDB is for MVCC-compliant file ingestion, where these files do not contain RANGEDELs and have one key-value pair per userkey -- so they trivially satisfy the strict criteria, and we don't need the obsolete bit as a performance optimization. For disaggregated storage, strict obsolete sstables are needed for L5 and L6, but at the time of writing, we expect such MVCC-compliant file ingestion to only ingest into levels L4 and higher. If this changes, we can do one of two things to get rid of this limitation:

Validate that there are no duplicate userkeys and no RANGEDELs/MERGEs in the sstable to be rewritten. Validating no duplicate userkeys is non-trivial when rewriting blocks in parallel, so we could encode the pre-existing condition in the (existing) SnapshotPinnedKeys property -- we need to update the external sst writer to calculate and encode this property.
Preserve the obsolete bit (with changes to the blockIter).

Types ¶

type AbbreviatedKey ¶

type AbbreviatedKey = base.AbbreviatedKey

AbbreviatedKey exports the base.AbbreviatedKey type.

type BlockHandle ¶

type BlockHandle struct {
	Offset, Length uint64
}

BlockHandle is the file offset and length of a block.

type BlockHandleWithProperties ¶

type BlockHandleWithProperties struct {
	BlockHandle
	Props []byte
}

BlockHandleWithProperties is used for data blocks and first/lower level index blocks, since they can be annotated using BlockPropertyCollectors.

type BlockIntervalCollector ¶

type BlockIntervalCollector struct {
	// contains filtered or unexported fields
}

BlockIntervalCollector is a helper implementation of BlockPropertyCollector for users who want to represent a set of the form [lower,upper) where both lower and upper are uint64, and lower <= upper.

The set is encoded as:
- Two varint integers, (lower,upper-lower), when upper-lower > 0
- Nil, when upper-lower=0

Users must not expect this to preserve differences between empty sets -- they will all get turned into the semantically equivalent [0,0).

A BlockIntervalCollector that collects over point and range keys needs to have both the point and range DataBlockIntervalCollector specified, since point and range keys are fed to the BlockIntervalCollector in an interleaved fashion, independently of one another. This also implies that the DataBlockIntervalCollectors for point and range keys should be references to independent instances, rather than references to the same collector, as point and range keys are tracked independently.

func (*BlockIntervalCollector) Add ¶

func (b *BlockIntervalCollector) Add(key InternalKey, value []byte) error

Add implements the BlockPropertyCollector interface.

func (*BlockIntervalCollector) AddPrevDataBlockToIndexBlock ¶

func (b *BlockIntervalCollector) AddPrevDataBlockToIndexBlock()

AddPrevDataBlockToIndexBlock implements the BlockPropertyCollector interface.

func (*BlockIntervalCollector) FinishDataBlock ¶

func (b *BlockIntervalCollector) FinishDataBlock(buf []byte) ([]byte, error)

FinishDataBlock implements the BlockPropertyCollector interface.

func (*BlockIntervalCollector) FinishIndexBlock ¶

func (b *BlockIntervalCollector) FinishIndexBlock(buf []byte) ([]byte, error)

FinishIndexBlock implements the BlockPropertyCollector interface.

func (*BlockIntervalCollector) FinishTable ¶

func (b *BlockIntervalCollector) FinishTable(buf []byte) ([]byte, error)

FinishTable implements the BlockPropertyCollector interface.

func (*BlockIntervalCollector) Name ¶

func (b *BlockIntervalCollector) Name() string

Name implements the BlockPropertyCollector interface.

type BlockIntervalFilter ¶

type BlockIntervalFilter struct {
	// contains filtered or unexported fields
}

BlockIntervalFilter is an implementation of BlockPropertyFilter when the corresponding collector is a BlockIntervalCollector. That is, the set is of the form [lower, upper).

func NewBlockIntervalFilter ¶

func NewBlockIntervalFilter(name string, lower uint64, upper uint64) *BlockIntervalFilter

NewBlockIntervalFilter constructs a BlockPropertyFilter that filters blocks based on an interval property collected by BlockIntervalCollector and the given [lower, upper) bounds. The given name specifies the BlockIntervalCollector's properties to read.

func NewTestKeysBlockPropertyFilter ¶

func NewTestKeysBlockPropertyFilter(filterMin, filterMax uint64) *BlockIntervalFilter

NewTestKeysBlockPropertyFilter constructs a new block-property filter that excludes blocks containing exclusively suffixed keys where all the suffixes fall outside of the range [filterMin, filterMax).

The filter only filters based on data derived from the key. The iteration results of this block property filter are deterministic for unsuffixed keys and keys with suffixes within the range [filterMin, filterMax). For keys with suffixes outside the range, iteration is nondeterministic.

func (*BlockIntervalFilter) Init ¶

func (b *BlockIntervalFilter) Init(name string, lower, upper uint64)

Init initializes (or re-initializes, clearing previous state) an existing BlockPropertyFilter to filter blocks based on an interval property collected by BlockIntervalCollector and the given [lower, upper) bounds. The given name specifies the BlockIntervalCollector's properties to read.

func (*BlockIntervalFilter) Intersects ¶

func (b *BlockIntervalFilter) Intersects(prop []byte) (bool, error)

Intersects implements the BlockPropertyFilter interface.

func (*BlockIntervalFilter) Name ¶

func (b *BlockIntervalFilter) Name() string

Name implements the BlockPropertyFilter interface.

func (*BlockIntervalFilter) SetInterval ¶

func (b *BlockIntervalFilter) SetInterval(lower, upper uint64)

SetInterval adjusts the [lower, upper) bounds used by the filter. It is not generally safe to alter the filter while it's in use, except as part of the implementation of BlockPropertyFilterMask.SetSuffix used for range-key masking.

type BlockPropertiesFilterer ¶

type BlockPropertiesFilterer struct {
	// contains filtered or unexported fields
}

BlockPropertiesFilterer provides filtering support when reading an sstable in the context of an iterator that has a slice of BlockPropertyFilters. After the call to NewBlockPropertiesFilterer, the caller must call IntersectsUserPropsAndFinishInit to check if the sstable intersects with the filters. If it does intersect, this function also finishes initializing the BlockPropertiesFilterer using the shortIDs for the relevant filters. Subsequent checks for relevance of a block should use the intersects method.

func IntersectsTable ¶

func IntersectsTable(
	filters []BlockPropertyFilter,
	limited BoundLimitedBlockPropertyFilter,
	userProperties map[string]string,
) (*BlockPropertiesFilterer, error)

IntersectsTable evaluates the provided block-property filter against the provided set of table-level properties. If there is no intersection between the filters and the table or an error is encountered, IntersectsTable returns a nil filterer (and possibly an error). If there is an intersection, IntersectsTable returns a non-nil filterer that may be used by an iterator reading the table.

type BlockPropertyCollector ¶

type BlockPropertyCollector interface {
	// Name returns the name of the block property collector.
	Name() string
	// Add is called with each new entry added to a data block in the sstable.
	// The callee can assume that these are in sorted order.
	Add(key InternalKey, value []byte) error
	// FinishDataBlock is called when all the entries have been added to a
	// data block. Subsequent Add calls will be for the next data block. It
	// returns the property value for the finished block.
	FinishDataBlock(buf []byte) ([]byte, error)
	// AddPrevDataBlockToIndexBlock adds the entry corresponding to the
	// previous FinishDataBlock to the current index block.
	AddPrevDataBlockToIndexBlock()
	// FinishIndexBlock is called when an index block, containing all the
	// key-value pairs since the last FinishIndexBlock, will no longer see new
	// entries. It returns the property value for the index block.
	FinishIndexBlock(buf []byte) ([]byte, error)
	// FinishTable is called when the sstable is finished, and returns the
	// property value for the sstable.
	FinishTable(buf []byte) ([]byte, error)
}

BlockPropertyCollector is used when writing a sstable.

All calls to Add are included in the next FinishDataBlock, after which the next data block is expected to start.
The index entry generated for the data block, which contains the return value from FinishDataBlock, is not immediately included in the current index block. It is included when AddPrevDataBlockToIndexBlock is called. An alternative would be to return an opaque handle from FinishDataBlock and pass it to a new AddToIndexBlock method, which requires more plumbing, and passing of an interface{} results in an undesirable heap allocation. AddPrevDataBlockToIndexBlock must be called before keys are added to the new data block.

func NewBlockIntervalCollector ¶

func NewBlockIntervalCollector(
	name string, pointCollector, rangeCollector DataBlockIntervalCollector,
) BlockPropertyCollector

NewBlockIntervalCollector constructs a BlockIntervalCollector with the given name. The BlockIntervalCollector makes use of the given point and range key DataBlockIntervalCollectors when encountering point and range keys, respectively.

The caller may pass a nil DataBlockIntervalCollector for one of the point or range key collectors, in which case keys of those types will be ignored. This allows for flexible construction of BlockIntervalCollectors that operate on just point keys, just range keys, or both point and range keys.

If both point and range keys are to be tracked, two independent collectors should be provided, rather than the same collector passed in twice (see the comment on BlockIntervalCollector for more detail).

func NewTestKeysBlockPropertyCollector ¶

func NewTestKeysBlockPropertyCollector() BlockPropertyCollector

NewTestKeysBlockPropertyCollector constructs a sstable property collector over testkey suffixes.

type BlockPropertyFilter ¶

type BlockPropertyFilter = base.BlockPropertyFilter

BlockPropertyFilter is used in an Iterator to filter sstables and blocks within the sstable. It should not maintain any per-sstable state, and must be thread-safe.

type BoundLimitedBlockPropertyFilter ¶

type BoundLimitedBlockPropertyFilter interface {
	BlockPropertyFilter

	// KeyIsWithinLowerBound tests whether the provided internal key falls
	// within the current lower bound of the filter. A true return value
	// indicates that the filter may be used to filter blocks that exclusively
	// contain keys ≥ `key`, so long as the blocks' keys also satisfy the upper
	// bound.
	KeyIsWithinLowerBound(key []byte) bool
	// KeyIsWithinUpperBound tests whether the provided internal key falls
	// within the current upper bound of the filter. A true return value
	// indicates that the filter may be used to filter blocks that exclusively
	// contain keys ≤ `key`, so long as the blocks' keys also satisfy the lower
	// bound.
	KeyIsWithinUpperBound(key []byte) bool
}

BoundLimitedBlockPropertyFilter implements the block-property filter but imposes an additional constraint on its usage, requiring that only blocks containing exclusively keys between its lower and upper bounds may be filtered. The bounds may change during iteration, so the filter doesn't expose the bounds, instead implementing KeyIsWithin[Lower,Upper]Bound methods for performing bound comparisons.

To be used, a BoundLimitedBlockPropertyFilter must be supplied directly through NewBlockPropertiesFilterer's dedicated parameter. If supplied through the ordinary slice of block property filters, this filter's bounds will be ignored.

The current [lower,upper) bounds of the filter are unknown, because they may be changing. During forward iteration the lower bound is externally guaranteed, meaning Intersects only returns false if the sstable iterator is already known to be positioned at a key ≥ lower. The sstable iterator is then only responsible for ensuring filtered blocks also meet the upper bound, and should only allow a block to be filtered if all its keys are < upper. The sstable iterator may invoke KeyIsWithinUpperBound(key) to perform this check, where key is an inclusive upper bound on the block's keys.

During backward iteration the upper bound is externally guaranteed, and Intersects only returns false if the sstable iterator is already known to be positioned at a key < upper. The sstable iterator is responsible for ensuring filtered blocks also meet the lower bound, enforcing that a block is only filtered if all its keys are ≥ lower. This check is made through passing the block's inclusive lower bound to KeyIsWithinLowerBound.

Implementations may become active or inactive through implementing Intersects to return true whenever the filter is disabled.

Usage of BoundLimitedBlockPropertyFilter is subtle, and Pebble consumers should not implement this interface directly. This interface is an internal detail in the implementation of block-property range-key masking.

type Buf ¶ added in v1.1.0

type Buf struct {
	// contains filtered or unexported fields
}

A Buf holds a reference to a manually-managed, pooled byte buffer.

func (*Buf) Release ¶ added in v1.1.0

func (b *Buf) Release()

Release releases the buffer back to the pool.

func (Buf) Valid ¶ added in v1.1.0

func (b Buf) Valid() bool

Valid returns true if the buf holds a valid buffer.

type BufferPool ¶ added in v1.1.0

type BufferPool struct {
	// contains filtered or unexported fields
}

A BufferPool holds a pool of buffers for holding sstable blocks. An initial size of the pool is provided on Init, but a BufferPool will grow to meet the largest working set size. It'll never shrink. When a buffer is released, the BufferPool recycles the buffer for future allocations.

A BufferPool should only be used for short-lived allocations with well-understood working set sizes to avoid excessive memory consumption.

BufferPool is not thread-safe.

func (*BufferPool) Alloc ¶ added in v1.1.0

func (p *BufferPool) Alloc(n int) Buf

Alloc allocates a new buffer of size n. If the pool already holds a buffer at least as large as n, the pooled buffer is used instead.

Alloc is O(MAX(N,M)) where N is the largest number of concurrently in-use buffers allocated and M is the initialSize passed to Init.

func (*BufferPool) Init ¶ added in v1.1.0

func (p *BufferPool) Init(initialSize int)

Init initializes the pool with an initial working set buffer size of `initialSize`.

func (*BufferPool) Release ¶ added in v1.1.0

func (p *BufferPool) Release()

Release releases all buffers held by the pool and resets the pool to an uninitialized state.

type ChecksumType ¶

type ChecksumType byte

ChecksumType specifies the checksum used for blocks.

const (
	ChecksumTypeNone     ChecksumType = 0
	ChecksumTypeCRC32c   ChecksumType = 1
	ChecksumTypeXXHash   ChecksumType = 2
	ChecksumTypeXXHash64 ChecksumType = 3
)

The available checksum types.

func (ChecksumType) String ¶

func (t ChecksumType) String() string

String implements fmt.Stringer.

type CommonProperties ¶ added in v1.1.0

type CommonProperties struct {
	// The number of entries in this table.
	NumEntries uint64 `prop:"rocksdb.num.entries"`
	// Total raw key size.
	RawKeySize uint64 `prop:"rocksdb.raw.key.size"`
	// Total raw value size.
	RawValueSize uint64 `prop:"rocksdb.raw.value.size"`
	// Total raw key size of point deletion tombstones. This value is comparable
	// to RawKeySize.
	RawPointTombstoneKeySize uint64 `prop:"pebble.raw.point-tombstone.key.size"`
	// Sum of the raw value sizes carried by point deletion tombstones
	// containing size estimates. See the DeleteSized key kind. This value is
	// comparable to Raw{Key,Value}Size.
	RawPointTombstoneValueSize uint64 `prop:"pebble.raw.point-tombstone.value.size"`
	// The number of point deletion entries ("tombstones") in this table that
	// carry a size hint indicating the size of the value the tombstone deletes.
	NumSizedDeletions uint64 `prop:"pebble.num.deletions.sized"`
	// The number of deletion entries in this table, including both point and
	// range deletions.
	NumDeletions uint64 `prop:"rocksdb.deleted.keys"`
	// The number of range deletions in this table.
	NumRangeDeletions uint64 `prop:"rocksdb.num.range-deletions"`
	// The number of RANGEKEYDELs in this table.
	NumRangeKeyDels uint64 `prop:"pebble.num.range-key-dels"`
	// The number of RANGEKEYSETs in this table.
	NumRangeKeySets uint64 `prop:"pebble.num.range-key-sets"`
	// Total size of value blocks and value index block. Only serialized if > 0.
	ValueBlocksSize uint64 `prop:"pebble.value-blocks.size"`
}

CommonProperties holds properties for either a virtual or a physical sstable. This can be used by code which doesn't care to make the distinction between physical and virtual sstables properties.

For virtual sstables, fields are constructed through extrapolation upon virtual reader construction. See MakeVirtualReader for implementation details.

NB: The values of these properties can affect correctness. For example, if NumRangeKeySets == 0, but the sstable actually contains range keys, then the iterators will behave incorrectly.

func (*CommonProperties) NumPointDeletions ¶ added in v1.1.0

func (c *CommonProperties) NumPointDeletions() uint64

NumPointDeletions is the number of point deletions in the sstable. For virtual sstables, this is an estimate.

func (*CommonProperties) String ¶ added in v1.1.0

func (c *CommonProperties) String() string

String is only used for testing purposes.

type CommonReader ¶ added in v1.1.0

type CommonReader interface {
	NewRawRangeKeyIter() (keyspan.FragmentIterator, error)
	NewRawRangeDelIter() (keyspan.FragmentIterator, error)
	NewIterWithBlockPropertyFiltersAndContextEtc(
		ctx context.Context, lower, upper []byte,
		filterer *BlockPropertiesFilterer,
		hideObsoletePoints, useFilterBlock bool,
		stats *base.InternalIteratorStats,
		rp ReaderProvider,
	) (Iterator, error)
	NewCompactionIter(
		bytesIterated *uint64,
		rp ReaderProvider,
		bufferPool *BufferPool,
	) (Iterator, error)
	EstimateDiskUsage(start, end []byte) (uint64, error)
	CommonProperties() *CommonProperties
}

CommonReader abstracts functionality over a Reader or a VirtualReader. This can be used by code which doesn't care to distinguish between a reader and a virtual reader.

type Compare ¶

type Compare = base.Compare

Compare exports the base.Compare type.

type Comparer ¶

type Comparer = base.Comparer

Comparer exports the base.Comparer type.

type Comparers ¶

type Comparers map[string]*Comparer

Comparers is a map from comparer name to comparer. It is used for debugging tools which may be used on multiple databases configured with different comparers. Comparers implements the OpenOption interface and can be passed as a parameter to NewReader.

type Compression ¶

type Compression int

Compression is the per-block compression algorithm to use.

const (
	DefaultCompression Compression = iota
	NoCompression
	SnappyCompression
	ZstdCompression
	NCompression
)

The available compression types.

func (Compression) String ¶

func (c Compression) String() string

type DataBlockIntervalCollector ¶

type DataBlockIntervalCollector interface {
	// Add is called with each new entry added to a data block in the sstable.
	// The callee can assume that these are in sorted order.
	Add(key InternalKey, value []byte) error
	// FinishDataBlock is called when all the entries have been added to a
	// data block. Subsequent Add calls will be for the next data block. It
	// returns the [lower, upper) for the finished block.
	FinishDataBlock() (lower uint64, upper uint64, err error)
}

DataBlockIntervalCollector is the interface used by BlockIntervalCollector that contains the actual logic pertaining to the property. It only maintains state for the current data block, and resets that state in FinishDataBlock. This interface can be used to reduce parsing costs.

type Equal ¶

type Equal = base.Equal

Equal exports the base.Equal type.

type FilterMetrics ¶

type FilterMetrics struct {
	// The number of hits for the filter policy. This is the
	// number of times the filter policy was successfully used to avoid access
	// of a data block.
	Hits int64
	// The number of misses for the filter policy. This is the number of times
	// the filter policy was checked but was unable to filter an access of a data
	// block.
	Misses int64
}

FilterMetrics holds metrics for the filter policy.

type FilterMetricsTracker ¶ added in v1.1.0

type FilterMetricsTracker struct {
	// contains filtered or unexported fields
}

FilterMetricsTracker is used to keep track of filter metrics. It contains the same metrics as FilterMetrics, but they can be updated atomically. An instance of FilterMetricsTracker can be passed to a Reader as a ReaderOption.

func (*FilterMetricsTracker) Load ¶ added in v1.1.0

func (m *FilterMetricsTracker) Load() FilterMetrics

Load returns the current values as FilterMetrics.

type FilterPolicy ¶

type FilterPolicy = base.FilterPolicy

FilterPolicy exports the base.FilterPolicy type.

type FilterType ¶

type FilterType = base.FilterType

FilterType exports the base.FilterType type.

type FilterWriter ¶

type FilterWriter = base.FilterWriter

FilterWriter exports the base.FilterWriter type.

type InternalKey ¶

type InternalKey = base.InternalKey

InternalKey exports the base.InternalKey type.

type InternalKeyKind ¶

type InternalKeyKind = base.InternalKeyKind

InternalKeyKind exports the base.InternalKeyKind type.

type Iterator ¶

type Iterator interface {
	base.InternalIterator

	// NextPrefix implements (base.InternalIterator).NextPrefix.
	NextPrefix(succKey []byte) (*InternalKey, base.LazyValue)

	// MaybeFilteredKeys may be called when an iterator is exhausted to indicate
	// whether or not the last positioning method may have skipped any keys due
	// to block-property filters. This is used by the Pebble levelIter to
	// control when an iterator steps to the next sstable.
	//
	// MaybeFilteredKeys may always return false positives, that is it may
	// return true when no keys were filtered. It should only be called when the
	// iterator is exhausted. It must never return false negatives when the
	// iterator is exhausted.
	MaybeFilteredKeys() bool

	SetCloseHook(fn func(i Iterator) error)
}

Iterator iterates over an entire table of data.

type Layout ¶

type Layout struct {
	Data       []BlockHandleWithProperties
	Index      []BlockHandle
	TopIndex   BlockHandle
	Filter     BlockHandle
	RangeDel   BlockHandle
	RangeKey   BlockHandle
	ValueBlock []BlockHandle
	ValueIndex BlockHandle
	Properties BlockHandle
	MetaIndex  BlockHandle
	Footer     BlockHandle
	Format     TableFormat
}

Layout describes the block organization of an sstable.

func (*Layout) Describe ¶

func (l *Layout) Describe(
	w io.Writer, verbose bool, r *Reader, fmtRecord func(key *base.InternalKey, value []byte),
)

Describe writes a description of the layout to w. If the verbose parameter is true, details of the structure of each block are written as well.

type Merger ¶

type Merger = base.Merger

Merger exports the base.Merger type.

type Mergers ¶

type Mergers map[string]*Merger

Mergers is a map from merger name to merger. It is used for debugging tools which may be used on multiple databases configured with different mergers. Mergers implements the OpenOption interface and can be passed as a parameter to NewReader.

type PreviousPointKeyOpt ¶

type PreviousPointKeyOpt struct {
	// contains filtered or unexported fields
}

PreviousPointKeyOpt is a WriterOption that provides access to the last point key written to the writer while building a sstable.

func (PreviousPointKeyOpt) UnsafeKey ¶

func (o PreviousPointKeyOpt) UnsafeKey() base.InternalKey

UnsafeKey returns the last point key written to the writer to which this option was passed during creation. The returned key points directly into a buffer belonging to the Writer. The value's lifetime ends the next time a point key is added to the Writer. Invariant: UnsafeKey isn't and shouldn't be called after the Writer is closed.

type Properties ¶

type Properties struct {
	// CommonProperties needs to be at the top of the Properties struct so that the
	// offsets of the fields in CommonProperties match the offsets of the embedded
	// fields of CommonProperties in Properties.
	CommonProperties `prop:"pebble.embbeded_common_properties"`

	// The name of the comparer used in this table.
	ComparerName string `prop:"rocksdb.comparator"`
	// The compression algorithm used to compress blocks.
	CompressionName string `prop:"rocksdb.compression"`
	// The compression options used to compress blocks.
	CompressionOptions string `prop:"rocksdb.compression_options"`
	// The total size of all data blocks.
	DataSize uint64 `prop:"rocksdb.data.size"`
	// The external sstable version format. Version 2 is the one RocksDB has been
	// using since 5.13. RocksDB only uses the global sequence number for an
	// sstable if this property has been set.
	ExternalFormatVersion uint32 `prop:"rocksdb.external_sst_file.version"`
	// The name of the filter policy used in this table. Empty if no filter
	// policy is used.
	FilterPolicyName string `prop:"rocksdb.filter.policy"`
	// The size of filter block.
	FilterSize uint64 `prop:"rocksdb.filter.size"`
	// The global sequence number to use for all entries in the table. Present if
	// the table was created externally and ingested whole.
	GlobalSeqNum uint64 `prop:"rocksdb.external_sst_file.global_seqno"`
	// Total number of index partitions if kTwoLevelIndexSearch is used.
	IndexPartitions uint64 `prop:"rocksdb.index.partitions"`
	// The size of index block.
	IndexSize uint64 `prop:"rocksdb.index.size"`
	// The index type. TODO(peter): add a more detailed description.
	IndexType uint32 `prop:"rocksdb.block.based.table.index.type"`
	// For formats >= TableFormatPebblev4, this is set to true if the obsolete
	// bit is strict for all the point keys.
	IsStrictObsolete bool `prop:"pebble.obsolete.is_strict"`
	// The name of the merger used in this table. Empty if no merger is used.
	MergerName string `prop:"rocksdb.merge.operator"`
	// The number of data blocks in this table.
	NumDataBlocks uint64 `prop:"rocksdb.num.data.blocks"`
	// The number of merge operands in the table.
	NumMergeOperands uint64 `prop:"rocksdb.merge.operands"`
	// The number of RANGEKEYUNSETs in this table.
	NumRangeKeyUnsets uint64 `prop:"pebble.num.range-key-unsets"`
	// The number of value blocks in this table. Only serialized if > 0.
	NumValueBlocks uint64 `prop:"pebble.num.value-blocks"`
	// The number of values stored in value blocks. Only serialized if > 0.
	NumValuesInValueBlocks uint64 `prop:"pebble.num.values.in.value-blocks"`
	// The name of the prefix extractor used in this table. Empty if no prefix
	// extractor is used.
	PrefixExtractorName string `prop:"rocksdb.prefix.extractor.name"`
	// If filtering is enabled, was the filter created on the key prefix.
	PrefixFiltering bool `prop:"rocksdb.block.based.table.prefix.filtering"`
	// A comma separated list of names of the property collectors used in this
	// table.
	PropertyCollectorNames string `prop:"rocksdb.property.collectors"`
	// Total raw rangekey key size.
	RawRangeKeyKeySize uint64 `prop:"pebble.raw.range-key.key.size"`
	// Total raw rangekey value size.
	RawRangeKeyValueSize uint64 `prop:"pebble.raw.range-key.value.size"`
	// The total number of keys in this table that were pinned by open snapshots.
	SnapshotPinnedKeys uint64 `prop:"pebble.num.snapshot-pinned-keys"`
	// The cumulative bytes of keys in this table that were pinned by
	// open snapshots. This value is comparable to RawKeySize.
	SnapshotPinnedKeySize uint64 `prop:"pebble.raw.snapshot-pinned-keys.size"`
	// The cumulative bytes of values in this table that were pinned by
	// open snapshots. This value is comparable to RawValueSize.
	SnapshotPinnedValueSize uint64 `prop:"pebble.raw.snapshot-pinned-values.size"`
	// Size of the top-level index if kTwoLevelIndexSearch is used.
	TopLevelIndexSize uint64 `prop:"rocksdb.top-level.index.size"`
	// User collected properties.
	UserProperties map[string]string
	// If filtering is enabled, was the filter created on the whole key.
	WholeKeyFiltering bool `prop:"rocksdb.block.based.table.whole.key.filtering"`

	// Loaded set indicating which fields have been loaded from disk. Indexed by
	// the field's byte offset within the struct
	// (reflect.StructField.Offset). Only set if the properties have been loaded
	// from a file. Only exported for testing purposes.
	Loaded map[uintptr]struct{}
}

Properties holds the sstable property values. The properties are automatically populated during sstable creation and loaded from the properties meta block when an sstable is opened.

func (*Properties) NumPointDeletions ¶

func (p *Properties) NumPointDeletions() uint64

NumPointDeletions returns the number of point deletions in this table.

func (*Properties) NumRangeKeys ¶

func (p *Properties) NumRangeKeys() uint64

NumRangeKeys returns a count of the number of range keys in this table.

func (*Properties) String ¶

func (p *Properties) String() string

type ReadableFile ¶

type ReadableFile interface {
	io.ReaderAt
	io.Closer
	Stat() (os.FileInfo, error)
}

ReadableFile describes the smallest subset of vfs.File that is required for reading SSTs.

type Reader ¶

type Reader struct {
	Compare   Compare
	FormatKey base.FormatKey
	Split     Split

	// Keep types that are not multiples of 8 bytes at the end and with
	// decreasing size.
	Properties Properties
	// contains filtered or unexported fields
}

Reader is a table reader.

func NewMemReader ¶

func NewMemReader(sst []byte, o ReaderOptions) (*Reader, error)

NewMemReader opens a reader over the SST stored in the passed []byte.

func NewReader ¶

func NewReader(f objstorage.Readable, o ReaderOptions, extraOpts ...ReaderOption) (*Reader, error)

NewReader returns a new table reader for the file. Closing the reader will close the file.

func (*Reader) Close ¶

func (r *Reader) Close() error

Close implements DB.Close, as documented in the pebble package.

func (*Reader) CommonProperties ¶ added in v1.1.0

func (r *Reader) CommonProperties() *CommonProperties

CommonProperties implements the CommonReader interface.

func (*Reader) EstimateDiskUsage ¶

func (r *Reader) EstimateDiskUsage(start, end []byte) (uint64, error)

EstimateDiskUsage returns the total size of data blocks overlapping the range `[start, end]`. Even if a data block partially overlaps, or we cannot determine overlap due to abbreviated index keys, the full data block size is included in the estimation.

This function does not account for any metablock space usage. Assumes there is at least partial overlap, i.e., `[start, end]` falls neither completely before nor completely after the file's range.

Only blocks containing point keys are considered. Range deletion and range key blocks are not considered.

TODO(ajkr): account for metablock space usage. Perhaps look at the fraction of data blocks overlapped and add that same fraction of the metadata blocks to the estimate.

func (*Reader) Layout ¶

func (r *Reader) Layout() (*Layout, error)

Layout returns the layout (block organization) for an sstable.

func (*Reader) NewCompactionIter ¶

func (r *Reader) NewCompactionIter(
	bytesIterated *uint64, rp ReaderProvider, bufferPool *BufferPool,
) (Iterator, error)

NewCompactionIter returns an iterator similar to NewIter but it also increments the number of bytes iterated. If an error occurs, NewCompactionIter cleans up after itself and returns a nil iterator.

func (*Reader) NewIter ¶

func (r *Reader) NewIter(lower, upper []byte) (Iterator, error)

NewIter returns an iterator for the contents of the table. If an error occurs, NewIter cleans up after itself and returns a nil iterator. NewIter must only be used when the Reader is guaranteed to outlive any LazyValues returned from the iter.

func (*Reader) NewIterWithBlockPropertyFilters ¶

func (r *Reader) NewIterWithBlockPropertyFilters(
	lower, upper []byte,
	filterer *BlockPropertiesFilterer,
	useFilterBlock bool,
	stats *base.InternalIteratorStats,
	rp ReaderProvider,
) (Iterator, error)

NewIterWithBlockPropertyFilters returns an iterator for the contents of the table. If an error occurs, NewIterWithBlockPropertyFilters cleans up after itself and returns a nil iterator.

func (*Reader) NewIterWithBlockPropertyFiltersAndContextEtc ¶ added in v1.1.0

func (r *Reader) NewIterWithBlockPropertyFiltersAndContextEtc(
	ctx context.Context,
	lower, upper []byte,
	filterer *BlockPropertiesFilterer,
	hideObsoletePoints, useFilterBlock bool,
	stats *base.InternalIteratorStats,
	rp ReaderProvider,
) (Iterator, error)

NewIterWithBlockPropertyFiltersAndContextEtc is similar to NewIterWithBlockPropertyFilters and additionally accepts a context for tracing.

If hideObsoletePoints, the callee assumes that filterer already includes obsoleteKeyBlockPropertyFilter. The caller can satisfy this contract by first calling TryAddBlockPropertyFilterForHideObsoletePoints.

func (*Reader) NewRawRangeDelIter ¶

func (r *Reader) NewRawRangeDelIter() (keyspan.FragmentIterator, error)

NewRawRangeDelIter returns an internal iterator for the contents of the range-del block for the table. Returns nil if the table does not contain any range deletions.

TODO(sumeer): plumb context.Context since this path is relevant in the user-facing iterator. Add WithContext methods since the existing ones are public.

func (*Reader) NewRawRangeKeyIter ¶

func (r *Reader) NewRawRangeKeyIter() (keyspan.FragmentIterator, error)

NewRawRangeKeyIter returns an internal iterator for the contents of the range-key block for the table. Returns nil if the table does not contain any range keys.

TODO(sumeer): plumb context.Context since this path is relevant in the user-facing iterator. Add WithContext methods since the existing ones are public.

func (*Reader) TableFormat ¶

func (r *Reader) TableFormat() (TableFormat, error)

TableFormat returns the format version for the table.

func (*Reader) TryAddBlockPropertyFilterForHideObsoletePoints ¶ added in v1.1.0

func (r *Reader) TryAddBlockPropertyFilterForHideObsoletePoints(
	snapshotForHideObsoletePoints uint64,
	fileLargestSeqNum uint64,
	pointKeyFilters []BlockPropertyFilter,
) (hideObsoletePoints bool, filters []BlockPropertyFilter)

TryAddBlockPropertyFilterForHideObsoletePoints is expected to be called before the call to NewIterWithBlockPropertyFiltersAndContextEtc, to get the value of hideObsoletePoints and potentially add a block property filter.

func (*Reader) ValidateBlockChecksums ¶

func (r *Reader) ValidateBlockChecksums() error

ValidateBlockChecksums validates the checksums for each block in the SSTable.

type ReaderOption ¶

type ReaderOption interface {
	// contains filtered or unexported methods
}

ReaderOption provides an interface to do work on Reader while it is being opened.

type ReaderOptions ¶

type ReaderOptions struct {
	// Cache is used to cache uncompressed blocks from sstables.
	//
	// The default cache size is a zero-size cache.
	Cache *cache.Cache

	// LoadBlockSema, if set, is used to limit the number of blocks that can be
	// loaded (i.e. read from the filesystem) in parallel. Each load acquires one
	// unit from the semaphore for the duration of the read.
	LoadBlockSema *fifo.Semaphore

	// User properties specified in this map will not be added to sst.Properties.UserProperties.
	DeniedUserProperties map[string]struct{}

	// Comparer defines a total ordering over the space of []byte keys: a 'less
	// than' relationship. The same comparison algorithm must be used for reads
	// and writes over the lifetime of the DB.
	//
	// The default value uses the same ordering as bytes.Compare.
	Comparer *Comparer

	// Merge defines the Merge function in use for this keyspace.
	Merge base.Merge

	// Filters is a map from filter policy name to filter policy. It is used for
	// debugging tools which may be used on multiple databases configured with
	// different filter policies. It is not necessary to populate this filters
	// map during normal usage of a DB.
	Filters map[string]FilterPolicy

	// MergerName is the name of the merger, i.e. the associative merge
	// operation used for merging values written with {Batch,DB}.Merge. The
	// MergerName is checked for consistency with the value stored in the
	// sstable when it was written.
	MergerName string

	// LoggerAndTracer is an optional logger and tracer.
	LoggerAndTracer base.LoggerAndTracer
}

ReaderOptions holds the parameters needed for reading an sstable.

type ReaderProvider ¶

type ReaderProvider interface {
	GetReader() (r *Reader, err error)
	Close()
}

ReaderProvider supports the implementation of blockProviderWhenClosed. GetReader and Close can be called multiple times in pairs.

type SeekGEFlags ¶

type SeekGEFlags = base.SeekGEFlags

SeekGEFlags exports base.SeekGEFlags.

type SeekLTFlags ¶

type SeekLTFlags = base.SeekLTFlags

SeekLTFlags exports base.SeekLTFlags.

type Separator ¶

type Separator = base.Separator

Separator exports the base.Separator type.

type Split ¶

type Split = base.Split

Split exports the base.Split type.

type Successor ¶

type Successor = base.Successor

Successor exports the base.Successor type.

type SuffixReplaceableBlockCollector ¶

type SuffixReplaceableBlockCollector interface {
	// UpdateKeySuffixes is called when a block is updated to change the suffix of
	// all keys in the block, and is passed the old value for that prop, if any,
	// for that block as well as the old and new suffix.
	UpdateKeySuffixes(oldProp []byte, oldSuffix, newSuffix []byte) error
}

SuffixReplaceableBlockCollector is an extension to the BlockPropertyCollector interface that allows a block property collector to indicate that it supports being *updated* during suffix replacement, i.e. when an existing SST in which all keys have the same key suffix is updated to have a new suffix.

A collector which supports being updated in such cases must be able to derive its updated value from its old value and the change being made to the suffix, without needing to be passed each updated K/V.

For example, a collector that only inspects values can simply copy its previously computed property as-is, since key-suffix replacement does not change values, while a collector that depends only on key suffixes, like one which collected mvcc-timestamp bounds from timestamp-suffixed keys, can just set its new bounds from the new suffix, as it is common to all keys, without needing to recompute it from every key.

An implementation of DataBlockIntervalCollector can also implement this interface, in which case the BlockPropertyCollector returned by passing it to NewBlockIntervalCollector will also implement this interface automatically.

type SuffixReplaceableTableCollector ¶

type SuffixReplaceableTableCollector interface {
	// UpdateKeySuffixes is called when a table is updated to change the suffix of
	// all keys in the table, and is passed the old value for that prop, if any,
	// for that table as well as the old and new suffix.
	UpdateKeySuffixes(oldProps map[string]string, oldSuffix, newSuffix []byte) error
}

SuffixReplaceableTableCollector is an extension to the TablePropertyCollector interface that allows a table property collector to indicate that it supports being *updated* during suffix replacement, i.e. when an existing SST in which all keys have the same key suffix is updated to have a new suffix.

A collector which supports being updated in such cases must be able to derive its updated value from its old value and the change being made to the suffix, without needing to be passed each updated K/V.

For example, a collector that only inspects values can simply copy its previously computed property as-is, since key-suffix replacement does not change values, while a collector that depends only on key suffixes, like one which collected mvcc-timestamp bounds from timestamp-suffixed keys, can just set its new bounds from the new suffix, as it is common to all keys, without needing to recompute it from every key.

type TableFormat ¶

type TableFormat uint32

TableFormat specifies the format version for sstables. The legacy LevelDB format is format version 1.

const (
	TableFormatUnspecified TableFormat = iota
	TableFormatLevelDB
	TableFormatRocksDBv2
	TableFormatPebblev1 // Block properties.
	TableFormatPebblev2 // Range keys.
	TableFormatPebblev3 // Value blocks.
	TableFormatPebblev4 // DELSIZED tombstones.
	NumTableFormats

	TableFormatMax = NumTableFormats - 1
)

The available table formats, representing the tuple (magic number, version number). Note that these values are not (and should not be) serialized to disk. The ordering should follow the order the versions were introduced to Pebble (i.e. the history is linear).

func ParseTableFormat ¶

func ParseTableFormat(magic []byte, version uint32) (TableFormat, error)

ParseTableFormat parses the given magic bytes and version into its corresponding internal TableFormat.

func (TableFormat) AsTuple ¶

func (f TableFormat) AsTuple() (string, uint32)

AsTuple returns the TableFormat's (Magic String, Version) tuple.

func (TableFormat) String ¶

func (f TableFormat) String() string

String returns the TableFormat (Magic String,Version) tuple.

type TablePropertyCollector ¶

type TablePropertyCollector interface {
	// Add is called with each new entry added to the sstable. While the sstable
	// is itself sorted by key, do not assume that the entries are added in any
	// order. In particular, the ordering of point entries and range tombstones
	// is unspecified.
	Add(key InternalKey, value []byte) error

	// Finish is called when all entries have been added to the sstable. The
	// collected properties (if any) should be added to the specified map. Note
	// that in case of an error during sstable construction, Finish may not be
	// called.
	Finish(userProps map[string]string) error

	// The name of the property collector.
	Name() string
}

TablePropertyCollector provides a hook for collecting user-defined properties based on the keys and values stored in an sstable. A new TablePropertyCollector is created for an sstable when the sstable is being written.

type TestKeysMaskingFilter ¶

type TestKeysMaskingFilter struct {
	*BlockIntervalFilter
}

TestKeysMaskingFilter implements BlockPropertyFilterMask and may be used to mask point keys with the testkeys-style suffixes (eg, @4) that are masked by range keys with testkeys-style suffixes.

func NewTestKeysMaskingFilter ¶

func NewTestKeysMaskingFilter() TestKeysMaskingFilter

NewTestKeysMaskingFilter constructs a TestKeysMaskingFilter that implements pebble.BlockPropertyFilterMask for efficient range-key masking using the testkeys block property filter. The masking filter wraps a block interval filter, and modifies the configured interval when Pebble requests it.

func (TestKeysMaskingFilter) Intersects ¶

func (f TestKeysMaskingFilter) Intersects(prop []byte) (bool, error)

Intersects implements the BlockPropertyFilter interface.

func (TestKeysMaskingFilter) SetSuffix ¶

func (f TestKeysMaskingFilter) SetSuffix(suffix []byte) error

SetSuffix implements pebble.BlockPropertyFilterMask.

type TrivialReaderProvider ¶

type TrivialReaderProvider struct {
	*Reader
}

TrivialReaderProvider implements ReaderProvider for a Reader that will outlive the top-level iterator in the iterator tree.

func (TrivialReaderProvider) Close ¶

func (trp TrivialReaderProvider) Close()

Close implements ReaderProvider.

func (TrivialReaderProvider) GetReader ¶

func (trp TrivialReaderProvider) GetReader() (*Reader, error)

GetReader implements ReaderProvider.

type UserKeyPrefixBound ¶

type UserKeyPrefixBound struct {
	// Lower is a lower bound user key prefix.
	Lower []byte
	// Upper is an upper bound user key prefix.
	Upper []byte
}

UserKeyPrefixBound represents a [Lower,Upper) bound of user key prefixes. If both are nil, there is no bound specified. Else, Compare(Lower,Upper) must be < 0.

func (*UserKeyPrefixBound) IsEmpty ¶

func (ukb *UserKeyPrefixBound) IsEmpty() bool

IsEmpty returns true iff the bound is empty.

type VirtualReader ¶ added in v1.1.0

type VirtualReader struct {
	Properties CommonProperties
	// contains filtered or unexported fields
}

VirtualReader wraps Reader. Its purpose is to restrict functionality of the Reader which should be inaccessible to virtual sstables, and enforce bounds invariants associated with virtual sstables. All reads on virtual sstables should go through a VirtualReader.

INVARIANT: Any iterators created through a virtual reader will guarantee that they don't expose keys outside the virtual sstable bounds.

func MakeVirtualReader ¶ added in v1.1.0

func MakeVirtualReader(
	reader *Reader, meta manifest.VirtualFileMeta, isForeign bool,
) VirtualReader

MakeVirtualReader is used to construct a reader which can read from virtual sstables.

func (*VirtualReader) CommonProperties ¶ added in v1.1.0

func (v *VirtualReader) CommonProperties() *CommonProperties

CommonProperties implements the CommonReader interface.

func (*VirtualReader) EstimateDiskUsage ¶ added in v1.1.0

func (v *VirtualReader) EstimateDiskUsage(start, end []byte) (uint64, error)

EstimateDiskUsage just calls VirtualReader.reader.EstimateDiskUsage after enforcing the virtual sstable bounds.

func (*VirtualReader) NewCompactionIter ¶ added in v1.1.0

func (v *VirtualReader) NewCompactionIter(
	bytesIterated *uint64, rp ReaderProvider, bufferPool *BufferPool,
) (Iterator, error)

NewCompactionIter is the compaction iterator function for virtual readers.

func (*VirtualReader) NewIterWithBlockPropertyFiltersAndContextEtc ¶ added in v1.1.0

func (v *VirtualReader) NewIterWithBlockPropertyFiltersAndContextEtc(
	ctx context.Context,
	lower, upper []byte,
	filterer *BlockPropertiesFilterer,
	hideObsoletePoints, useFilterBlock bool,
	stats *base.InternalIteratorStats,
	rp ReaderProvider,
) (Iterator, error)

NewIterWithBlockPropertyFiltersAndContextEtc wraps Reader.NewIterWithBlockPropertyFiltersAndContextEtc. We assume that the passed in [lower, upper) bounds will have at least some overlap with the virtual sstable bounds. The case of no overlap is not currently supported by the iterator.

func (*VirtualReader) NewRawRangeDelIter ¶ added in v1.1.0

func (v *VirtualReader) NewRawRangeDelIter() (keyspan.FragmentIterator, error)

NewRawRangeDelIter wraps Reader.NewRawRangeDelIter.

func (*VirtualReader) NewRawRangeKeyIter ¶ added in v1.1.0

func (v *VirtualReader) NewRawRangeKeyIter() (keyspan.FragmentIterator, error)

NewRawRangeKeyIter wraps Reader.NewRawRangeKeyIter.

func (*VirtualReader) ValidateBlockChecksumsOnBacking ¶ added in v1.1.0

func (v *VirtualReader) ValidateBlockChecksumsOnBacking() error

ValidateBlockChecksumsOnBacking will call ValidateBlockChecksums on the underlying (backing) reader. Note that block checksum validation is NOT restricted to virtual sstable bounds.

type Writer ¶

type Writer struct {
	// contains filtered or unexported fields
}

Writer is a table writer.

func NewWriter ¶

func NewWriter(writable objstorage.Writable, o WriterOptions, extraOpts ...WriterOption) *Writer

NewWriter returns a new table writer for the file. Closing the writer will close the file.

func (*Writer) Add ¶

func (w *Writer) Add(key InternalKey, value []byte) error

Add adds a key/value pair to the table being written. For a given Writer, the keys passed to Add must be in increasing order. The exception to this rule is range deletion tombstones. Range deletion tombstones need to be added ordered by their start key, but they can be added out of order from point entries. Additionally, range deletion tombstones must be fragmented (i.e. by keyspan.Fragmenter).

func (*Writer) AddRangeKey ¶

func (w *Writer) AddRangeKey(key InternalKey, value []byte) error

AddRangeKey adds a range key set, unset, or delete key/value pair to the table being written.

Range keys must be supplied in strictly ascending order of start key (i.e. user key ascending, sequence number descending, and key type descending). Ranges added must also be supplied in fragmented span order - i.e. other than spans that are perfectly aligned (same start and end keys), spans may not overlap. Range keys may be added out of order relative to point keys and range deletions.

func (*Writer) AddWithForceObsolete ¶ added in v1.1.0

func (w *Writer) AddWithForceObsolete(key InternalKey, value []byte, forceObsolete bool) error

AddWithForceObsolete must be used when writing a strict-obsolete sstable.

forceObsolete indicates whether the caller has determined that this key is obsolete even though it may be the latest point key for this userkey. This should be set to true for keys obsoleted by RANGEDELs, and is required for strict-obsolete sstables.

Note that there are two properties, S1 and S2 (see comment in format.go) that strict-obsolete ssts must satisfy. S2, due to RANGEDELs, is solely the responsibility of the caller. S1 is solely the responsibility of the callee.

func (*Writer) Close ¶

func (w *Writer) Close() (err error)

Close finishes writing the table and closes the underlying file that the table was written to.

func (*Writer) Delete ¶

func (w *Writer) Delete(key []byte) error

Delete deletes the value for the given key. The sequence number is set to 0. Intended for use to externally construct an sstable before ingestion into a DB.

TODO(peter): untested

func (*Writer) DeleteRange ¶

func (w *Writer) DeleteRange(start, end []byte) error

DeleteRange deletes all of the keys (and values) in the range [start,end) (inclusive on start, exclusive on end). The sequence number is set to 0. Intended for use to externally construct an sstable before ingestion into a DB.

TODO(peter): untested

func (*Writer) EstimatedSize ¶

func (w *Writer) EstimatedSize() uint64

EstimatedSize returns the estimated size of the sstable being written if a call to Finish() was made without adding additional keys.

func (*Writer) Merge ¶

func (w *Writer) Merge(key, value []byte) error

Merge adds an action to the DB that merges the value at key with the new value. The details of the merge are dependent upon the configured merge operator. The sequence number is set to 0. Intended for use to externally construct an sstable before ingestion into a DB.

TODO(peter): untested

func (*Writer) Metadata ¶

func (w *Writer) Metadata() (*WriterMetadata, error)

Metadata returns the metadata for the finished sstable. Only valid to call after the sstable has been finished.

func (*Writer) RangeKeyDelete ¶

func (w *Writer) RangeKeyDelete(start, end []byte) error

RangeKeyDelete deletes a range between start (inclusive) and end (exclusive).

Keys must be added to the table in increasing order of start key. Spans are not required to be fragmented.

func (*Writer) RangeKeySet ¶

func (w *Writer) RangeKeySet(start, end, suffix, value []byte) error

RangeKeySet sets a range between start (inclusive) and end (exclusive) with the given suffix to the given value. The resulting range key is given the sequence number zero, with the expectation that the resulting sstable will be ingested.

Keys must be added to the table in increasing order of start key. Spans are not required to be fragmented. The same suffix may not be set or unset twice over the same keyspan, because it would result in inconsistent state. Both the Set and Unset would share the zero sequence number, and a key cannot be both simultaneously set and unset.

func (*Writer) RangeKeyUnset ¶

func (w *Writer) RangeKeyUnset(start, end, suffix []byte) error

RangeKeyUnset un-sets a range between start (inclusive) and end (exclusive) with the given suffix. The resulting range key is given the sequence number zero, with the expectation that the resulting sstable will be ingested.

Keys must be added to the table in increasing order of start key. Spans are not required to be fragmented. The same suffix may not be set or unset twice over the same keyspan, because it would result in inconsistent state. Both the Set and Unset would share the zero sequence number, and a key cannot be both simultaneously set and unset.

func (*Writer) Set ¶

func (w *Writer) Set(key, value []byte) error

Set sets the value for the given key. The sequence number is set to 0. Intended for use to externally construct an sstable before ingestion into a DB. For a given Writer, the keys passed to Set must be in strictly increasing order.

TODO(peter): untested

func (*Writer) Write ¶

func (w *Writer) Write(blockWithTrailer []byte) (n int, err error)

Write implements io.Writer. This is analogous to writeCompressedBlock for blocks that already incorporate the trailer, and don't need the callee to return a BlockHandle.

type WriterMetadata ¶

// WriterMetadata holds info about a finished sstable.
//
// The Smallest*/Largest* fields come in pairs for point keys, range
// deletions (RangeDel) and range keys. The SetSmallest*/SetLargest* methods
// on *WriterMetadata are documented as setting the "absolute" bound, i.e. any
// existing key is overridden.
//
// NOTE(review): the Has* flags presumably record whether a key of the
// corresponding kind has been recorded -- confirm against the Set*
// implementations.
type WriterMetadata struct {
	Size          uint64
	SmallestPoint InternalKey
	// LargestPoint, LargestRangeKey, LargestRangeDel should not be accessed
	// before Writer.Close is called, because they may only be set on
	// Writer.Close.
	LargestPoint     InternalKey
	SmallestRangeDel InternalKey
	LargestRangeDel  InternalKey
	SmallestRangeKey InternalKey
	LargestRangeKey  InternalKey
	HasPointKeys     bool
	HasRangeDelKeys  bool
	HasRangeKeys     bool
	SmallestSeqNum   uint64
	LargestSeqNum    uint64
	Properties       Properties
}

WriterMetadata holds info about a finished sstable.

func RewriteKeySuffixes ¶

func RewriteKeySuffixes(
	sst []byte,
	rOpts ReaderOptions,
	out objstorage.Writable,
	o WriterOptions,
	from, to []byte,
	concurrency int,
) (*WriterMetadata, error)

RewriteKeySuffixes is deprecated.

TODO(sumeer): remove after switching CockroachDB to RewriteKeySuffixesAndReturnFormat.

func RewriteKeySuffixesViaWriter ¶

func RewriteKeySuffixesViaWriter(
	r *Reader, out objstorage.Writable, o WriterOptions, from, to []byte,
) (*WriterMetadata, error)

RewriteKeySuffixesViaWriter is similar to RewriteKeySuffixes but uses just a single loop over the Reader that writes each key to the Writer with the new suffix. This is significantly slower than the parallelized rewriter, and does more work to rederive filters, props, etc.

Any obsolete bits that key-value pairs may be annotated with are ignored and lost during the rewrite. Some of the obsolete bits may be recreated -- specifically when there are multiple keys with the same user key. Additionally, the output sstable has the pebble.obsolete.is_strict property set to false. See the longer comment at RewriteKeySuffixesAndReturnFormat.

func (*WriterMetadata) SetLargestPointKey ¶

func (m *WriterMetadata) SetLargestPointKey(k InternalKey)

SetLargestPointKey sets the largest point key to the given key. NB: this method sets the "absolute" largest point key. Any existing key is overridden.

func (*WriterMetadata) SetLargestRangeDelKey ¶

func (m *WriterMetadata) SetLargestRangeDelKey(k InternalKey)

SetLargestRangeDelKey sets the largest rangedel key to the given key. NB: this method sets the "absolute" largest rangedel key. Any existing key is overridden.

func (*WriterMetadata) SetLargestRangeKey ¶

func (m *WriterMetadata) SetLargestRangeKey(k InternalKey)

SetLargestRangeKey sets the largest range key to the given key. NB: this method sets the "absolute" largest range key. Any existing key is overridden.

func (*WriterMetadata) SetSmallestPointKey ¶

func (m *WriterMetadata) SetSmallestPointKey(k InternalKey)

SetSmallestPointKey sets the smallest point key to the given key. NB: this method sets the "absolute" smallest point key. Any existing key is overridden.

func (*WriterMetadata) SetSmallestRangeDelKey ¶

func (m *WriterMetadata) SetSmallestRangeDelKey(k InternalKey)

SetSmallestRangeDelKey sets the smallest rangedel key to the given key. NB: this method sets the "absolute" smallest rangedel key. Any existing key is overridden.

func (*WriterMetadata) SetSmallestRangeKey ¶

func (m *WriterMetadata) SetSmallestRangeKey(k InternalKey)

SetSmallestRangeKey sets the smallest range key to the given key. NB: this method sets the "absolute" smallest range key. Any existing key is overridden.

type WriterOption ¶

// WriterOption provides an interface to do work on Writer while it is being
// opened. Its method set is not exported, so it is not shown here.
type WriterOption interface {
	// contains filtered or unexported methods
}

WriterOption provides an interface to do work on Writer while it is being opened.

type WriterOptions ¶

// WriterOptions holds the parameters used to control building an sstable.
type WriterOptions struct {
	// BlockRestartInterval is the number of keys between restart points
	// for delta encoding of keys.
	//
	// The default value is 16.
	BlockRestartInterval int

	// BlockSize is the target uncompressed size in bytes of each table block.
	//
	// The default value is 4096.
	BlockSize int

	// BlockSizeThreshold finishes a block if the block size is larger than the
	// specified percentage of the target block size and adding the next entry
	// would cause the block to be larger than the target block size.
	//
	// The default value is 90.
	BlockSizeThreshold int

	// Cache is used to cache uncompressed blocks from sstables.
	//
	// The default is a nil cache.
	Cache *cache.Cache

	// Comparer defines a total ordering over the space of []byte keys: a 'less
	// than' relationship. The same comparison algorithm must be used for reads
	// and writes over the lifetime of the DB.
	//
	// The default value uses the same ordering as bytes.Compare.
	Comparer *Comparer

	// Compression defines the per-block compression to use.
	//
	// The default value (DefaultCompression) uses snappy compression.
	Compression Compression

	// FilterPolicy defines a filter algorithm (such as a Bloom filter) that can
	// reduce disk reads for Get calls.
	//
	// One such implementation is bloom.FilterPolicy(10) from the pebble/bloom
	// package.
	//
	// The default value means to use no filter.
	FilterPolicy FilterPolicy

	// FilterType defines whether an existing filter policy is applied at a
	// block-level or table-level. Block-level filters use less memory to create,
	// but are slower to access as a check for the key in the index must first be
	// performed to locate the filter block. A table-level filter will require
	// memory proportional to the number of keys in an sstable to create, but
	// avoids the index lookup when determining if a key is present. Table-level
	// filters should be preferred except under constrained memory situations.
	FilterType FilterType

	// IndexBlockSize is the target uncompressed size in bytes of each index
	// block. When the index block size is larger than this target, two-level
	// indexes are automatically enabled. Setting this option to a large value
	// (such as math.MaxInt32) disables the automatic creation of two-level
	// indexes.
	//
	// The default value is the value of BlockSize.
	IndexBlockSize int

	// MergerName is the name of the associative merge operation used for
	// merging values written with {Batch,DB}.Merge. The MergerName is checked
	// for consistency with the value stored in the sstable when it was written.
	MergerName string

	// TableFormat specifies the format version for writing sstables. The default
	// is TableFormatRocksDBv2, which creates RocksDB compatible sstables. Use
	// TableFormatLevelDB to create LevelDB compatible sstables, which can be
	// used by a wider range of tools and libraries.
	TableFormat TableFormat

	// IsStrictObsolete is only relevant for >= TableFormatPebblev4. See comment
	// in format.go. Must be false if format < TableFormatPebblev4.
	//
	// TODO(bilal): set this when writing shared ssts.
	IsStrictObsolete bool

	// WritingToLowestLevel is only relevant for >= TableFormatPebblev4. It is
	// used to set the obsolete bit on DEL/DELSIZED/SINGLEDEL if they are the
	// youngest for a userkey.
	WritingToLowestLevel bool

	// TablePropertyCollectors is a list of TablePropertyCollector creation
	// functions. A new TablePropertyCollector is created for each sstable built
	// and lives for the lifetime of the table.
	TablePropertyCollectors []func() TablePropertyCollector

	// BlockPropertyCollectors is a list of BlockPropertyCollector creation
	// functions. A new BlockPropertyCollector is created for each sstable
	// built and lives for the lifetime of writing that table.
	BlockPropertyCollectors []func() BlockPropertyCollector

	// Checksum specifies which checksum to use.
	Checksum ChecksumType

	// Parallelism is used to indicate that the sstable Writer is allowed to
	// compress data blocks and write data blocks to disk in parallel with the
	// Writer client goroutine.
	Parallelism bool

	// ShortAttributeExtractor mirrors
	// Options.Experimental.ShortAttributeExtractor.
	ShortAttributeExtractor base.ShortAttributeExtractor

	// RequiredInPlaceValueBound mirrors
	// Options.Experimental.RequiredInPlaceValueBound.
	RequiredInPlaceValueBound UserKeyPrefixBound
}

WriterOptions holds the parameters used to control building an sstable.

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL