Documentation ¶
Overview ¶
Package estore provides an ordered key/value store.
Example ¶
package main

import (
	"crypto/rand"
	"fmt"
	"log"

	"github.com/edgelesssys/estore"
	"github.com/edgelesssys/estore/vfs"
)

func main() {
	encryptionKey := make([]byte, 16)
	_, err := rand.Read(encryptionKey)
	if err != nil {
		log.Fatal(err)
	}
	db, err := estore.Open("", &estore.Options{EncryptionKey: encryptionKey, FS: vfs.NewMem()})
	if err != nil {
		log.Fatal(err)
	}
	key := []byte("hello")
	if err := db.Set(key, []byte("world"), estore.Sync); err != nil {
		log.Fatal(err)
	}
	value, closer, err := db.Get(key)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("%s %s\n", key, value)
	if err := closer.Close(); err != nil {
		log.Fatal(err)
	}
	if err := db.Close(); err != nil {
		log.Fatal(err)
	}
}
Output: hello world
Index ¶
- Constants
- Variables
- func CanDeterministicallySingleDelete(it *Iterator) (bool, error)
- func DebugCheckLevels(db *DB) error
- func GetVersion(dir string, fs vfs.FS) (string, error)
- func IsCorruptionError(err error) bool
- func NewCache(size int64) *cache.Cache
- func TableCacheSize(maxOpenFiles int) int
- type AbbreviatedKey
- type ArchiveCleaner
- type AttributeAndLen
- type BackingType
- type Batch
- func (b *Batch) AddInternalKey(key *base.InternalKey, value []byte, _ *WriteOptions) error
- func (b *Batch) Apply(batch *Batch, _ *WriteOptions) error
- func (b *Batch) Close() error
- func (b *Batch) Commit(o *WriteOptions) error
- func (b *Batch) CommitStats() BatchCommitStats
- func (b *Batch) Count() uint32
- func (b *Batch) Delete(key []byte, _ *WriteOptions) error
- func (b *Batch) DeleteDeferred(keyLen int) *DeferredBatchOp
- func (b *Batch) DeleteRange(start, end []byte, _ *WriteOptions) error
- func (b *Batch) DeleteRangeDeferred(startLen, endLen int) *DeferredBatchOp
- func (b *Batch) DeleteSized(key []byte, deletedValueSize uint32, _ *WriteOptions) error
- func (b *Batch) DeleteSizedDeferred(keyLen int, deletedValueSize uint32) *DeferredBatchOp
- func (b *Batch) Empty() bool
- func (b *Batch) Get(key []byte) ([]byte, io.Closer, error)
- func (b *Batch) Indexed() bool
- func (b *Batch) Len() int
- func (b *Batch) LogData(data []byte, _ *WriteOptions) error
- func (b *Batch) Merge(key, value []byte, _ *WriteOptions) error
- func (b *Batch) MergeDeferred(keyLen, valueLen int) *DeferredBatchOp
- func (b *Batch) NewIter(o *IterOptions) (*Iterator, error)
- func (b *Batch) NewIterWithContext(ctx context.Context, o *IterOptions) *Iterator
- func (b *Batch) RangeKeyDelete(start, end []byte, _ *WriteOptions) error
- func (b *Batch) RangeKeyDeleteDeferred(startLen, endLen int) *DeferredBatchOp
- func (b *Batch) RangeKeySet(start, end, suffix, value []byte, _ *WriteOptions) error
- func (b *Batch) RangeKeyUnset(start, end, suffix []byte, _ *WriteOptions) error
- func (b *Batch) Reader() BatchReader
- func (b *Batch) Repr() []byte
- func (b *Batch) Reset()
- func (b *Batch) SeqNum() uint64
- func (b *Batch) Set(key, value []byte, _ *WriteOptions) error
- func (b *Batch) SetDeferred(keyLen, valueLen int) *DeferredBatchOp
- func (b *Batch) SetRepr(data []byte) error
- func (b *Batch) SingleDelete(key []byte, _ *WriteOptions) error
- func (b *Batch) SingleDeleteDeferred(keyLen int) *DeferredBatchOp
- func (b *Batch) SyncWait() error
- type BatchCommitStats
- type BatchReader
- type BlockPropertyCollector
- type BlockPropertyFilter
- type BlockPropertyFilterMask
- type CPUWorkHandle
- type CPUWorkPermissionGranter
- type Cache
- type CacheMetrics
- type CheckLevelsStats
- type CheckpointOption
- type CheckpointSpan
- type Cleaner
- type CloneOptions
- type CompactionInfo
- type Compare
- type Comparer
- type Compression
- type DB
- func (d *DB) Apply(batch *Batch, opts *WriteOptions) error
- func (d *DB) ApplyNoSyncWait(batch *Batch, opts *WriteOptions) error
- func (d *DB) AsyncFlush() (<-chan struct{}, error)
- func (d *DB) CheckLevels(stats *CheckLevelsStats) error
- func (d *DB) Checkpoint(destDir string, opts ...CheckpointOption) (ckErr error)
- func (d *DB) Close() error
- func (d *DB) Compact(start, end []byte, parallelize bool) error
- func (d *DB) Delete(key []byte, opts *WriteOptions) error
- func (d *DB) DeleteRange(start, end []byte, opts *WriteOptions) error
- func (d *DB) DeleteSized(key []byte, valueSize uint32, opts *WriteOptions) error
- func (d *DB) Download(ctx context.Context, spans []DownloadSpan) error
- func (d *DB) EstimateDiskUsage(start, end []byte) (uint64, error)
- func (d *DB) EstimateDiskUsageByBackingType(start, end []byte) (totalSize, remoteSize, externalSize uint64, _ error)
- func (d *DB) Flush() error
- func (d *DB) FormatMajorVersion() FormatMajorVersion
- func (d *DB) Get(key []byte) ([]byte, io.Closer, error)
- func (d *DB) Ingest(paths []string) error
- func (d *DB) IngestAndExcise(paths []string, shared []SharedSSTMeta, exciseSpan KeyRange) (IngestOperationStats, error)
- func (d *DB) IngestExternalFiles(external []ExternalFile) (IngestOperationStats, error)
- func (d *DB) IngestWithStats(paths []string) (IngestOperationStats, error)
- func (d *DB) LogData(data []byte, opts *WriteOptions) error
- func (d *DB) Merge(key, value []byte, opts *WriteOptions) error
- func (d *DB) Metrics() *Metrics
- func (d *DB) NewBatch() *Batch
- func (d *DB) NewBatchWithSize(size int) *Batch
- func (d *DB) NewEventuallyFileOnlySnapshot(keyRanges []KeyRange) *EventuallyFileOnlySnapshot
- func (d *DB) NewIndexedBatch() *Batch
- func (d *DB) NewIndexedBatchWithSize(size int) *Batch
- func (d *DB) NewIter(o *IterOptions) (*Iterator, error)
- func (d *DB) NewIterWithContext(ctx context.Context, o *IterOptions) (*Iterator, error)
- func (d *DB) NewSnapshot() *Snapshot
- func (d *DB) NewTransaction(writable bool) *Transaction
- func (d *DB) ObjProvider() objstorage.Provider
- func (d *DB) RangeKeyDelete(start, end []byte, opts *WriteOptions) error
- func (d *DB) RangeKeySet(start, end, suffix, value []byte, opts *WriteOptions) error
- func (d *DB) RangeKeyUnset(start, end, suffix []byte, opts *WriteOptions) error
- func (d *DB) RatchetFormatMajorVersion(fmv FormatMajorVersion) error
- func (d *DB) SSTables(opts ...SSTablesOption) ([][]SSTableInfo, error)
- func (d *DB) ScanInternal(ctx context.Context, lower, upper []byte, ...) error
- func (d *DB) ScanStatistics(ctx context.Context, lower, upper []byte, opts ScanStatisticsOptions) (LSMKeyStatistics, error)
- func (d *DB) Set(key, value []byte, opts *WriteOptions) error
- func (d *DB) SetCreatorID(creatorID uint64) error
- func (d *DB) SingleDelete(key []byte, opts *WriteOptions) error
- func (d *DB) TestOnlyWaitForCleaning()
- type DBDesc
- type DeferredBatchOp
- type DeletableValueMerger
- type DeleteCleaner
- type DiskSlowInfo
- type DownloadSpan
- type Equal
- type EventListener
- type EventuallyFileOnlySnapshot
- func (es *EventuallyFileOnlySnapshot) Close() error
- func (es *EventuallyFileOnlySnapshot) Get(key []byte) (value []byte, closer io.Closer, err error)
- func (es *EventuallyFileOnlySnapshot) NewIter(o *IterOptions) (*Iterator, error)
- func (es *EventuallyFileOnlySnapshot) NewIterWithContext(ctx context.Context, o *IterOptions) (*Iterator, error)
- func (es *EventuallyFileOnlySnapshot) ScanInternal(ctx context.Context, lower, upper []byte, ...) error
- func (es *EventuallyFileOnlySnapshot) WaitForFileOnlySnapshot(ctx context.Context, dur time.Duration) error
- type ExternalFile
- type ExternalIterForwardOnly
- type ExternalIterOption
- type FileNum
- type FilterMetrics
- type FilterPolicy
- type FilterType
- type FilterWriter
- type FlushInfo
- type FormatMajorVersion
- type IngestOperationStats
- type InternalIteratorStats
- type InternalKey
- type InternalKeyKind
- type IterKeyType
- type IterOptions
- type IterValidityState
- type Iterator
- func (i *Iterator) Clone(opts CloneOptions) (*Iterator, error)
- func (i *Iterator) CloneWithContext(ctx context.Context, opts CloneOptions) (*Iterator, error)
- func (i *Iterator) Close() error
- func (i *Iterator) Error() error
- func (i *Iterator) First() bool
- func (i *Iterator) HasPointAndRange() (hasPoint, hasRange bool)
- func (i *Iterator) Key() []byte
- func (i *Iterator) Last() bool
- func (i *Iterator) LazyValue() LazyValue
- func (i *Iterator) Metrics() IteratorMetrics
- func (i *Iterator) Next() bool
- func (i *Iterator) NextPrefix() bool
- func (i *Iterator) NextWithLimit(limit []byte) IterValidityState
- func (i *Iterator) Prev() bool
- func (i *Iterator) PrevWithLimit(limit []byte) IterValidityState
- func (i *Iterator) RangeBounds() (start, end []byte)
- func (i *Iterator) RangeKeyChanged() bool
- func (i *Iterator) RangeKeys() []RangeKeyData
- func (i *Iterator) ResetStats()
- func (i *Iterator) SeekGE(key []byte) bool
- func (i *Iterator) SeekGEWithLimit(key []byte, limit []byte) IterValidityState
- func (i *Iterator) SeekLT(key []byte) bool
- func (i *Iterator) SeekLTWithLimit(key []byte, limit []byte) IterValidityState
- func (i *Iterator) SeekPrefixGE(key []byte) bool
- func (i *Iterator) SetBounds(lower, upper []byte)
- func (i *Iterator) SetOptions(o *IterOptions)
- func (i *Iterator) Stats() IteratorStats
- func (i *Iterator) Valid() bool
- func (i *Iterator) Value() []byte
- func (i *Iterator) ValueAndErr() ([]byte, error)
- type IteratorLevel
- type IteratorLevelKind
- type IteratorMetrics
- type IteratorStats
- type IteratorStatsKind
- type KeyRange
- func (k *KeyRange) Contains(cmp base.Compare, key InternalKey) bool
- func (k *KeyRange) Overlaps(cmp base.Compare, m *fileMetadata) bool
- func (k *KeyRange) OverlapsInternalKeyRange(cmp base.Compare, smallest, largest InternalKey) bool
- func (k *KeyRange) OverlapsKeyRange(cmp Compare, span KeyRange) bool
- func (k *KeyRange) Valid() bool
- type KeyStatistics
- type LSMKeyStatistics
- type LazyFetcher
- type LazyValue
- type LevelInfo
- type LevelMetrics
- type LevelOptions
- type Lock
- type Logger
- type LoggerAndTracer
- type ManifestCreateInfo
- type ManifestDeleteInfo
- type Merge
- type Merger
- type Metrics
- func (m *Metrics) DiskSpaceUsage() uint64
- func (m *Metrics) NumVirtual() uint64
- func (m *Metrics) ReadAmp() int
- func (m *Metrics) SafeFormat(w redact.SafePrinter, _ rune)
- func (m *Metrics) String() string
- func (m *Metrics) StringForTests() string
- func (m *Metrics) Total() LevelMetrics
- func (m *Metrics) VirtualSize() uint64
- type MultiLevelHeuristic
- type NoMultiLevel
- type Options
- func (o *Options) AddEventListener(l EventListener)
- func (o *Options) Check(s string) error
- func (o *Options) Clone() *Options
- func (o *Options) EnsureDefaults() *Options
- func (o *Options) Level(level int) LevelOptions
- func (o *Options) MakeReaderOptions() sstable.ReaderOptions
- func (o *Options) MakeWriterOptions(level int, format sstable.TableFormat) sstable.WriterOptions
- func (o *Options) Parse(s string, hooks *ParseHooks) error
- func (o *Options) String() string
- func (o *Options) Validate() error
- func (o *Options) WithFSDefaults() *Options
- type ParseHooks
- type RangeKeyData
- type RangeKeyIteratorStats
- type RangeKeyMasking
- type ReadaheadConfig
- type Reader
- type SSTableInfo
- type SSTablesOption
- type ScanStatisticsOptions
- type SecondaryCacheMetrics
- type Separator
- type SharedSSTMeta
- type ShortAttribute
- type ShortAttributeExtractor
- type Snapshot
- func (s *Snapshot) Close() error
- func (s *Snapshot) Get(key []byte) ([]byte, io.Closer, error)
- func (s *Snapshot) NewIter(o *IterOptions) (*Iterator, error)
- func (s *Snapshot) NewIterWithContext(ctx context.Context, o *IterOptions) (*Iterator, error)
- func (s *Snapshot) ScanInternal(ctx context.Context, lower, upper []byte, ...) error
- type Split
- type Successor
- type TableCache
- type TableCreateInfo
- type TableDeleteInfo
- type TableInfo
- type TableIngestInfo
- type TablePropertyCollector
- type TableStatsInfo
- type TableValidatedInfo
- type ThroughputMetric
- type Transaction
- type UserKeyPrefixBound
- type ValueMerger
- type WALCreateInfo
- type WALDeleteInfo
- type WriteAmpHeuristic
- type WriteOptions
- type WriteStallBeginInfo
- type Writer
Examples ¶
- Package
Constants ¶
const (
	InternalKeyKindDelete          = base.InternalKeyKindDelete
	InternalKeyKindSet             = base.InternalKeyKindSet
	InternalKeyKindMerge           = base.InternalKeyKindMerge
	InternalKeyKindLogData         = base.InternalKeyKindLogData
	InternalKeyKindSingleDelete    = base.InternalKeyKindSingleDelete
	InternalKeyKindRangeDelete     = base.InternalKeyKindRangeDelete
	InternalKeyKindMax             = base.InternalKeyKindMax
	InternalKeyKindSetWithDelete   = base.InternalKeyKindSetWithDelete
	InternalKeyKindRangeKeySet     = base.InternalKeyKindRangeKeySet
	InternalKeyKindRangeKeyUnset   = base.InternalKeyKindRangeKeyUnset
	InternalKeyKindRangeKeyDelete  = base.InternalKeyKindRangeKeyDelete
	InternalKeyKindIngestSST       = base.InternalKeyKindIngestSST
	InternalKeyKindDeleteSized     = base.InternalKeyKindDeleteSized
	InternalKeyKindInvalid         = base.InternalKeyKindInvalid
	InternalKeySeqNumBatch         = base.InternalKeySeqNumBatch
	InternalKeySeqNumMax           = base.InternalKeySeqNumMax
	InternalKeyRangeDeleteSentinel = base.InternalKeyRangeDeleteSentinel
)
These constants are part of the file format, and should not be changed.
const (
	DefaultCompression = sstable.DefaultCompression
	NoCompression      = sstable.NoCompression
	SnappyCompression  = sstable.SnappyCompression
)
Exported Compression constants.
const (
TableFilter = base.TableFilter
)
Exported TableFilter constants.
Variables ¶
var (
	// ErrNotFound is returned when a get operation does not find the requested
	// key.
	ErrNotFound = base.ErrNotFound
	// ErrClosed is panicked when an operation is performed on a closed snapshot or
	// DB. Use errors.Is(err, ErrClosed) to check for this error.
	ErrClosed = errors.New("pebble: closed")
	// ErrReadOnly is returned when a write operation is performed on a read-only
	// database.
	ErrReadOnly = errors.New("pebble: read-only")
)
var (
	// FsyncLatencyBuckets are prometheus histogram buckets suitable for a
	// histogram that records latencies for fsyncs.
	FsyncLatencyBuckets = append(
		prometheus.LinearBuckets(0.0, float64(time.Microsecond*100), 50),
		prometheus.ExponentialBucketsRange(float64(time.Millisecond*5), float64(10*time.Second), 50)...,
	)

	// SecondaryCacheIOBuckets is exported from package pebble to enable
	// exporting metrics with these buckets in CRDB.
	SecondaryCacheIOBuckets = sharedcache.IOBuckets
	// SecondaryCacheChannelWriteBuckets is exported from package pebble to
	// enable exporting metrics with these buckets in CRDB.
	SecondaryCacheChannelWriteBuckets = sharedcache.ChannelWriteBuckets
)
var DefaultComparer = base.DefaultComparer
DefaultComparer exports the base.DefaultComparer variable.
var DefaultLogger = base.DefaultLogger
DefaultLogger logs via the Go stdlib log package.
var DefaultMerger = base.DefaultMerger
DefaultMerger exports the base.DefaultMerger variable.
var ErrBatchTooLarge = base.MarkCorruptionError(errors.Newf("pebble: batch too large: >= %s", humanize.Bytes.Uint64(maxBatchSize)))
ErrBatchTooLarge indicates that a batch is too large to be committed; its size meets or exceeds maxBatchSize.
var ErrCancelledCompaction = errors.New("pebble: compaction cancelled by a concurrent operation, will retry compaction")
ErrCancelledCompaction is returned if a compaction is cancelled by a concurrent excise or ingest-split operation.
var ErrCorruption = base.ErrCorruption
ErrCorruption is a marker to indicate that data in a file (WAL, MANIFEST, sstable) isn't in the expected format.
var ErrDBAlreadyExists = errors.New("pebble: database already exists")
ErrDBAlreadyExists is generated when ErrorIfExists is set and the database already exists.
Note that errors can be wrapped with more details; use errors.Is().
var ErrDBDoesNotExist = errors.New("pebble: database does not exist")
ErrDBDoesNotExist is generated when ErrorIfNotExists is set and the database does not exist.
Note that errors can be wrapped with more details; use errors.Is().
var ErrDBNotPristine = errors.New("pebble: database already exists and is not pristine")
ErrDBNotPristine is generated when ErrorIfNotPristine is set and the database already exists and is not pristine.
Note that errors can be wrapped with more details; use errors.Is().
var ErrInvalidBatch = base.MarkCorruptionError(errors.New("pebble: invalid batch"))
ErrInvalidBatch indicates that a batch is invalid or otherwise corrupted.
ErrInvalidSkipSharedIteration is returned by ScanInternal if it was called with a shared file visitor function, and a file in a shareable level (i.e. level >= sharedLevelsStart) was found to not be in shared storage according to objstorage.Provider, or not shareable for another reason such as for containing keys newer than the snapshot sequence number.
var ErrNotIndexed = errors.New("pebble: batch not indexed")
ErrNotIndexed means that a read operation on a batch failed because the batch is not indexed and thus doesn't support reads.
var ErrSnapshotExcised = errors.New("pebble: snapshot excised before conversion to file-only snapshot")
ErrSnapshotExcised is returned from WaitForFileOnlySnapshot if an excise overlapping with one of the EventuallyFileOnlySnapshot's KeyRanges gets applied before the transition of that EFOS to a file-only snapshot.
var NoSync = &WriteOptions{Sync: false}
NoSync specifies the default write options for writes which do not synchronize to disk.
var Sync = &WriteOptions{Sync: true}
Sync specifies the default write options for writes which synchronize to disk.
Functions ¶
func CanDeterministicallySingleDelete ¶
CanDeterministicallySingleDelete takes a valid iterator and examines internal state to determine if a SingleDelete deleting Iterator.Key() would deterministically delete the key. CanDeterministicallySingleDelete requires the iterator to be oriented in the forward direction (eg, the last positioning operation must've been a First, a Seek[Prefix]GE, or a Next[Prefix][WithLimit]).
This function does not change the external position of the iterator, and all positioning methods should behave the same as if it was never called. This function will only return a meaningful result the first time it's invoked at an iterator position. This function invalidates the iterator Value's memory, and the caller must not rely on the memory safety of the previous Iterator position.
If CanDeterministicallySingleDelete returns true AND the key at the iterator position is not modified between the creation of the Iterator and the commit of a batch containing a SingleDelete over the key, then the caller can be assured that SingleDelete is equivalent to Delete on the local engine, but it may not be true on another engine that received the same writes and with logically equivalent state since this engine may have collapsed multiple SETs into one.
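A hedged sketch of the check-then-delete pattern this enables (assuming an open db and an existing key []byte; error handling abbreviated):

it, err := db.NewIter(nil)
if err != nil {
	log.Fatal(err)
}
defer it.Close()
if it.SeekGE(key) && bytes.Equal(it.Key(), key) {
	// Only meaningful the first time it's invoked at this position.
	ok, err := estore.CanDeterministicallySingleDelete(it)
	if err != nil {
		log.Fatal(err)
	}
	b := db.NewBatch()
	if ok {
		// The cheaper SingleDelete tombstone is safe here.
		_ = b.SingleDelete(key, nil)
	} else {
		_ = b.Delete(key, nil)
	}
	if err := b.Commit(estore.Sync); err != nil {
		log.Fatal(err)
	}
}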
func DebugCheckLevels ¶
DebugCheckLevels calls CheckLevels on the provided database. It may be set in the DebugCheck field of Options to check level invariants whenever a new version is installed.
func GetVersion ¶
GetVersion returns the engine version string from the latest options file present in dir. Used to check what Pebble or RocksDB version was last used to write to the database stored in this directory. An empty string is returned if no valid OPTIONS file with a version key was found.
func IsCorruptionError ¶
IsCorruptionError returns true if the given error indicates database corruption.
func NewCache ¶
NewCache creates a new cache of the specified size. Memory for the cache is allocated on demand, not during initialization. The cache is created with a reference count of 1. Each DB it is associated with adds a reference, so the creator of the cache should usually release their reference after the DB is created.
c := pebble.NewCache(...)
defer c.Unref()
d, err := pebble.Open("", &pebble.Options{Cache: c})
func TableCacheSize ¶
TableCacheSize can be used to determine the table cache size for a single DB, given the maximum number of open files available to a table cache that serves only that DB.
Types ¶
type AbbreviatedKey ¶
type AbbreviatedKey = base.AbbreviatedKey
AbbreviatedKey exports the base.AbbreviatedKey type.
type ArchiveCleaner ¶
type ArchiveCleaner = base.ArchiveCleaner
ArchiveCleaner exports the base.ArchiveCleaner type.
type AttributeAndLen ¶
type AttributeAndLen = base.AttributeAndLen
AttributeAndLen exports the base.AttributeAndLen type.
type BackingType ¶
type BackingType int
BackingType denotes the type of storage backing a given sstable.
const (
	// BackingTypeLocal denotes an sstable stored on local disk according to the
	// objprovider. This file is completely owned by us.
	BackingTypeLocal BackingType = iota
	// BackingTypeShared denotes an sstable stored on shared storage, created
	// by this Pebble instance and possibly shared by other Pebble instances.
	// These types of files have lifecycle managed by Pebble.
	BackingTypeShared
	// BackingTypeSharedForeign denotes an sstable stored on shared storage,
	// created by a Pebble instance other than this one. These types of files have
	// lifecycle managed by Pebble.
	BackingTypeSharedForeign
	// BackingTypeExternal denotes an sstable stored on external storage,
	// not owned by any Pebble instance and with no refcounting/cleanup methods
	// or lifecycle management. An example of an external file is a file restored
	// from a backup.
	BackingTypeExternal
)
type Batch ¶
type Batch struct {
// contains filtered or unexported fields
}
A Batch is a sequence of Sets, Merges, Deletes, DeleteRanges, RangeKeySets, RangeKeyUnsets, and/or RangeKeyDeletes that are applied atomically. Batch implements the Reader interface, but only an indexed batch supports reading (without error) via Get or NewIter. A non-indexed batch will return ErrNotIndexed when read from. A batch is not safe for concurrent use, and consumers should use a batch per goroutine or provide their own synchronization.
Indexing ¶
Batches can be optionally indexed (see DB.NewIndexedBatch). An indexed batch allows iteration via an Iterator (see Batch.NewIter). The iterator provides a merged view of the operations in the batch and the underlying database. This is implemented by treating the batch as an additional layer in the LSM where every entry in the batch is considered newer than any entry in the underlying database (batch entries have the InternalKeySeqNumBatch bit set). By treating the batch as an additional layer in the LSM, iteration supports all batch operations (i.e. Set, Merge, Delete, DeleteRange, RangeKeySet, RangeKeyUnset, RangeKeyDelete) with minimal effort.
The same key can be operated on multiple times in a batch, though only the latest operation will be visible. For example, Set("a", "b") followed by Delete("a") will cause the key "a" to not be visible in the batch. Set("a", "b") followed by Set("a", "c") will cause a read of "a" to return the value "c".
The batch index is implemented via a skiplist (internal/batchskl). While the skiplist implementation is very fast, inserting into an indexed batch is significantly slower than inserting into a non-indexed batch. Only use an indexed batch if you require reading from it.
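For illustration, a small sketch of this read-your-writes behavior (assuming an open db; error handling abbreviated):

b := db.NewIndexedBatch()
_ = b.Set([]byte("a"), []byte("b"), nil)
_ = b.Set([]byte("a"), []byte("c"), nil)
value, closer, err := b.Get([]byte("a"))
if err != nil {
	log.Fatal(err)
}
fmt.Printf("%s\n", value) // prints "c": the latest operation wins
_ = closer.Close()
_ = b.Delete([]byte("a"), nil)
_, _, err = b.Get([]byte("a"))
fmt.Println(errors.Is(err, estore.ErrNotFound)) // true: "a" is no longer visible
_ = b.Close()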
Atomic commit ¶
The operations in a batch are persisted by calling Batch.Commit which is equivalent to calling DB.Apply(batch). A batch is committed atomically by writing the internal batch representation to the WAL, adding all of the batch operations to the memtable associated with the WAL, and then incrementing the visible sequence number so that subsequent reads can see the effects of the batch operations. If WriteOptions.Sync is true, a call to Batch.Commit will guarantee that the batch is persisted to disk before returning. See commitPipeline for more on the implementation details.
Large batches ¶
The size of a batch is limited only by available memory (be aware that indexed batches require considerable additional memory for the skiplist structure). A given WAL file has a single memtable associated with it (this restriction could be removed, but doing so is onerous and complex). And a memtable has a fixed size due to the underlying fixed size arena. Note that this differs from RocksDB where a memtable can grow arbitrarily large using a list of arena chunks. In RocksDB this is accomplished by storing pointers in the arena memory, but that isn't possible in Go.
During Batch.Commit, a batch which is larger than a threshold (> MemTableSize/2) is wrapped in a flushableBatch and inserted into the queue of memtables. A flushableBatch forces the WAL to be rotated, but that happens anyway when the memtable becomes full, so this does not cause significant WAL churn. Because the flushableBatch is readable as another layer in the LSM, Batch.Commit returns as soon as the flushableBatch has been added to the queue of memtables.
Internally, a flushableBatch provides Iterator support by sorting the batch contents (the batch is sorted once, when it is added to the memtable queue). Sorting the batch contents and insertion of the contents into a memtable have the same big-O time, but the constant factor dominates here. Sorting is significantly faster and uses significantly less memory.
Internal representation ¶
The internal batch representation is a contiguous byte buffer with a fixed 12-byte header, followed by a series of records.
+-------------+------------+--- ... ---+
| SeqNum (8B) | Count (4B) | Entries   |
+-------------+------------+--- ... ---+
Each record has a 1-byte kind tag prefix, followed by 1 or 2 length prefixed strings (varstring):
+-----------+-----------------+-------------------+
| Kind (1B) | Key (varstring) | Value (varstring) |
+-----------+-----------------+-------------------+
A varstring is a varint32 followed by N bytes of data. The Kind tags are exactly those specified by InternalKeyKind. The following table shows the format for records of each kind:
InternalKeyKindDelete         varstring
InternalKeyKindLogData        varstring
InternalKeyKindIngestSST      varstring
InternalKeyKindSet            varstring varstring
InternalKeyKindMerge          varstring varstring
InternalKeyKindRangeDelete    varstring varstring
InternalKeyKindRangeKeySet    varstring varstring
InternalKeyKindRangeKeyUnset  varstring varstring
InternalKeyKindRangeKeyDelete varstring varstring
The intuitive understanding here is that the arguments to Delete, Set, Merge, DeleteRange and RangeKeyDelete are encoded into the batch. The RangeKeySet and RangeKeyUnset operations are slightly more complicated, encoding their end key, suffix and value [in the case of RangeKeySet] within the Value varstring. For more information on the value encoding for RangeKeySet and RangeKeyUnset, see the internal/rangekey package.
The internal batch representation is the on disk format for a batch in the WAL, and thus stable. New record kinds may be added, but the existing ones will not be modified.
func (*Batch) AddInternalKey ¶
func (b *Batch) AddInternalKey(key *base.InternalKey, value []byte, _ *WriteOptions) error
AddInternalKey allows the caller to add an internal key of point key kinds to a batch. Passing in an internal key of kind RangeKey* or RangeDelete will result in a panic. Note that the seqnum in the internal key is effectively ignored, even though the Kind is preserved. This is because the batch format does not allow for a per-key seqnum to be specified, only a batch-wide one.
Note that non-indexed keys (IngestKeyKind{LogData,IngestSST}) are not supported with this method as they require specialized logic.
func (*Batch) Apply ¶
func (b *Batch) Apply(batch *Batch, _ *WriteOptions) error
Apply the operations contained in the batch to the receiver batch.
It is safe to modify the contents of the arguments after Apply returns.
func (*Batch) Commit ¶
func (b *Batch) Commit(o *WriteOptions) error
Commit applies the batch to its parent writer.
func (*Batch) CommitStats ¶
func (b *Batch) CommitStats() BatchCommitStats
CommitStats returns stats related to committing the batch. It should be called after Batch.Commit or DB.Apply. If DB.ApplyNoSyncWait is used, it should be called after Batch.SyncWait.
func (*Batch) Count ¶
Count returns the count of memtable-modifying operations in this batch. All operations except LogData increment this count. For IngestSST operations, count is only used to indicate the number of SSTs ingested in the record; the batch isn't applied to the memtable.
func (*Batch) Delete ¶
func (b *Batch) Delete(key []byte, _ *WriteOptions) error
Delete adds an action to the batch that deletes the entry for key.
It is safe to modify the contents of the arguments after Delete returns.
func (*Batch) DeleteDeferred ¶
func (b *Batch) DeleteDeferred(keyLen int) *DeferredBatchOp
DeleteDeferred is similar to Delete in that it adds a delete operation to the batch, except it only takes in key/value lengths instead of complete slices, letting the caller encode into those objects and then call Finish() on the returned object.
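All of the Deferred variants share this calling pattern. A hedged sketch using SetDeferred (k and v are assumed to be existing byte slices; error handling abbreviated):

b := db.NewBatch()
// Reserve space for the operation in the batch, then encode the key and
// value directly into the returned DeferredBatchOp's slices.
op := b.SetDeferred(len(k), len(v))
copy(op.Key, k)
copy(op.Value, v)
// Finish must be called once the slices have been populated.
if err := op.Finish(); err != nil {
	log.Fatal(err)
}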
func (*Batch) DeleteRange ¶
func (b *Batch) DeleteRange(start, end []byte, _ *WriteOptions) error
DeleteRange deletes all of the point keys (and values) in the range [start,end) (inclusive on start, exclusive on end). DeleteRange does NOT delete overlapping range keys (eg, keys set via RangeKeySet).
It is safe to modify the contents of the arguments after DeleteRange returns.
func (*Batch) DeleteRangeDeferred ¶
func (b *Batch) DeleteRangeDeferred(startLen, endLen int) *DeferredBatchOp
DeleteRangeDeferred is similar to DeleteRange in that it adds a delete range operation to the batch, except it only takes in key lengths instead of complete slices, letting the caller encode into those objects and then call Finish() on the returned object. Note that DeferredBatchOp.Key should be populated with the start key, and DeferredBatchOp.Value should be populated with the end key.
func (*Batch) DeleteSized ¶
func (b *Batch) DeleteSized(key []byte, deletedValueSize uint32, _ *WriteOptions) error
DeleteSized behaves identically to Delete, but takes an additional argument indicating the size of the value being deleted. DeleteSized should be preferred when the caller has the expectation that there exists a single internal KV pair for the key (eg, the key has not been overwritten recently), and the caller knows the size of its value.
DeleteSized will record the value size within the tombstone and use it to inform compaction-picking heuristics which strive to reduce space amplification in the LSM. This "calling your shot" mechanic allows the storage engine to more accurately estimate and reduce space amplification.
It is safe to modify the contents of the arguments after DeleteSized returns.
func (*Batch) DeleteSizedDeferred ¶
func (b *Batch) DeleteSizedDeferred(keyLen int, deletedValueSize uint32) *DeferredBatchOp
DeleteSizedDeferred is similar to DeleteSized in that it adds a sized delete operation to the batch, except it only takes in key length instead of a complete key slice, letting the caller encode into the DeferredBatchOp.Key slice and then call Finish() on the returned object.
func (*Batch) Get ¶
Get gets the value for the given key. It returns ErrNotFound if the Batch does not contain the key.
The caller should not modify the contents of the returned slice, but it is safe to modify the contents of the argument after Get returns. The returned slice will remain valid until the returned Closer is closed. On success, the caller MUST call closer.Close() or a memory leak will occur.
func (*Batch) Indexed ¶
Indexed returns true if the batch is indexed (i.e. supports read operations).
func (*Batch) LogData ¶
func (b *Batch) LogData(data []byte, _ *WriteOptions) error
LogData adds the specified data to the batch. The data will be written to the WAL, but not added to memtables or sstables. Log data is never indexed, which makes it useful for testing WAL performance.
It is safe to modify the contents of the argument after LogData returns.
func (*Batch) Merge ¶
func (b *Batch) Merge(key, value []byte, _ *WriteOptions) error
Merge adds an action to the batch that merges the value at key with the new value. The details of the merge are dependent upon the configured merge operator.
It is safe to modify the contents of the arguments after Merge returns.
func (*Batch) MergeDeferred ¶
func (b *Batch) MergeDeferred(keyLen, valueLen int) *DeferredBatchOp
MergeDeferred is similar to Merge in that it adds a merge operation to the batch, except it only takes in key/value lengths instead of complete slices, letting the caller encode into those objects and then call Finish() on the returned object.
func (*Batch) NewIter ¶
func (b *Batch) NewIter(o *IterOptions) (*Iterator, error)
NewIter returns an iterator that is unpositioned (Iterator.Valid() will return false). The iterator can be positioned via a call to SeekGE, SeekPrefixGE, SeekLT, First or Last. Only indexed batches support iterators.
The returned Iterator observes all of the Batch's existing mutations, but no later mutations. Its view can be refreshed via RefreshBatchSnapshot or SetOptions().
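A short sketch of iterating an indexed batch's merged view (assuming an open db; error handling abbreviated):

b := db.NewIndexedBatch()
_ = b.Set([]byte("batch-only"), []byte("v"), nil)
it, err := b.NewIter(nil)
if err != nil {
	log.Fatal(err)
}
// Observes the batch's mutations merged with the underlying DB.
for it.First(); it.Valid(); it.Next() {
	fmt.Printf("%s\n", it.Key())
}
if err := it.Close(); err != nil {
	log.Fatal(err)
}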
func (*Batch) NewIterWithContext ¶
func (b *Batch) NewIterWithContext(ctx context.Context, o *IterOptions) *Iterator
NewIterWithContext is like NewIter, and additionally accepts a context for tracing.
func (*Batch) RangeKeyDelete ¶
func (b *Batch) RangeKeyDelete(start, end []byte, _ *WriteOptions) error
RangeKeyDelete deletes all of the range keys in the range [start,end) (inclusive on start, exclusive on end). It does not delete point keys (for that use DeleteRange). RangeKeyDelete removes all range keys within the bounds, including those with or without suffixes.
It is safe to modify the contents of the arguments after RangeKeyDelete returns.
func (*Batch) RangeKeyDeleteDeferred ¶
func (b *Batch) RangeKeyDeleteDeferred(startLen, endLen int) *DeferredBatchOp
RangeKeyDeleteDeferred is similar to RangeKeyDelete in that it adds an operation to delete range keys to the batch, except it only takes in key lengths instead of complete slices, letting the caller encode into those objects and then call Finish() on the returned object. Note that DeferredBatchOp.Key should be populated with the start key, and DeferredBatchOp.Value should be populated with the end key.
func (*Batch) RangeKeySet ¶
func (b *Batch) RangeKeySet(start, end, suffix, value []byte, _ *WriteOptions) error
RangeKeySet sets a range key mapping the key range [start, end) at the MVCC timestamp suffix to value. The suffix is optional. If any portion of the key range [start, end) is already set by a range key with the same suffix value, RangeKeySet overrides it.
It is safe to modify the contents of the arguments after RangeKeySet returns.
func (*Batch) RangeKeyUnset ¶
func (b *Batch) RangeKeyUnset(start, end, suffix []byte, _ *WriteOptions) error
RangeKeyUnset removes a range key mapping the key range [start, end) at the MVCC timestamp suffix. The suffix may be omitted to remove an unsuffixed range key. RangeKeyUnset only removes portions of range keys that fall within the [start, end) key span, and only range keys with suffixes that exactly match the unset suffix.
It is safe to modify the contents of the arguments after RangeKeyUnset returns.
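A small sketch combining RangeKeySet and RangeKeyUnset (a nil suffix writes and removes an unsuffixed range key; error handling abbreviated):

b := db.NewBatch()
// Set an unsuffixed range key over [a, z).
_ = b.RangeKeySet([]byte("a"), []byte("z"), nil, []byte("v"), nil)
// Remove only the portion covering [m, z); [a, m) keeps its range key.
_ = b.RangeKeyUnset([]byte("m"), []byte("z"), nil, nil)
if err := b.Commit(estore.Sync); err != nil {
	log.Fatal(err)
}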
func (*Batch) Reader ¶
func (b *Batch) Reader() BatchReader
Reader returns a BatchReader for the current batch contents. If the batch is mutated, the new entries will not be visible to the reader.
func (*Batch) Repr ¶
Repr returns the underlying batch representation. It is not safe to modify the contents. Reset() will not change the contents of the returned value, though any other mutation operation may do so.
func (*Batch) Reset ¶
func (b *Batch) Reset()
Reset resets the batch for reuse. The underlying byte slice (that is returned by Repr()) may not be modified. It is only necessary to call this method if a batch is explicitly being reused. Close automatically takes care of releasing resources when appropriate for batches that are internally being reused.
func (*Batch) SeqNum ¶
SeqNum returns the batch sequence number which is applied to the first record in the batch. The sequence number is incremented for each subsequent record. It returns zero if the batch is empty.
func (*Batch) Set ¶
func (b *Batch) Set(key, value []byte, _ *WriteOptions) error
Set adds an action to the batch that sets the key to map to the value.
It is safe to modify the contents of the arguments after Set returns.
func (*Batch) SetDeferred ¶
func (b *Batch) SetDeferred(keyLen, valueLen int) *DeferredBatchOp
SetDeferred is similar to Set in that it adds a set operation to the batch, except it only takes in key/value lengths instead of complete slices, letting the caller encode into those objects and then call Finish() on the returned object.
func (*Batch) SetRepr ¶
SetRepr sets the underlying batch representation. The batch takes ownership of the supplied slice. It is not safe to modify it afterwards until the Batch is no longer in use.
func (*Batch) SingleDelete ¶
func (b *Batch) SingleDelete(key []byte, _ *WriteOptions) error
SingleDelete adds an action to the batch that single deletes the entry for key. See Writer.SingleDelete for more details on the semantics of SingleDelete.
It is safe to modify the contents of the arguments after SingleDelete returns.
func (*Batch) SingleDeleteDeferred ¶
func (b *Batch) SingleDeleteDeferred(keyLen int) *DeferredBatchOp
SingleDeleteDeferred is similar to SingleDelete in that it adds a single delete operation to the batch, except it only takes in a key length instead of a complete slice, letting the caller encode into that object and then call Finish() on the returned object.
type BatchCommitStats ¶
type BatchCommitStats struct {
	// TotalDuration is the time spent in DB.{Apply,ApplyNoSyncWait} or
	// Batch.Commit, plus the time waiting in Batch.SyncWait. If there is a gap
	// between calling ApplyNoSyncWait and calling SyncWait, that gap could
	// include some duration in which real work was being done for the commit
	// and will not be included here. This missing time is considered acceptable
	// since the goal of these stats is to understand user-facing latency.
	//
	// TotalDuration includes time spent in various queues both inside Pebble
	// and outside Pebble (I/O queues, goroutine scheduler queue, mutex wait
	// etc.). For some of these queues (which we consider important) the wait
	// times are included below -- these expose low-level implementation detail
	// and are meant for expert diagnosis and subject to change. There may be
	// unaccounted time after subtracting those values from TotalDuration.
	TotalDuration time.Duration

	// SemaphoreWaitDuration is the wait time for semaphores in
	// commitPipeline.Commit.
	SemaphoreWaitDuration time.Duration

	// WALQueueWaitDuration is the wait time for allocating memory blocks in the
	// LogWriter (due to the LogWriter not writing fast enough). At the moment
	// this duration is always zero because a single WAL will allow
	// allocating memory blocks up to the entire memtable size. In the future,
	// we may pipeline WALs and bound the WAL queued blocks separately, so this
	// field is preserved for that possibility.
	WALQueueWaitDuration time.Duration

	// MemTableWriteStallDuration is the wait caused by a write stall due to too
	// many memtables (due to not flushing fast enough).
	MemTableWriteStallDuration time.Duration

	// L0ReadAmpWriteStallDuration is the wait caused by a write stall due to
	// high read amplification in L0 (due to not compacting fast enough out of
	// L0).
	L0ReadAmpWriteStallDuration time.Duration

	// WALRotationDuration is the wait time for WAL rotation, which includes
	// syncing and closing the old WAL and creating (or reusing) a new one.
	WALRotationDuration time.Duration

	// CommitWaitDuration is the wait for publishing the seqnum plus the
	// duration for the WAL sync (if requested). The former should be tiny and
	// one can assume that this is all due to the WAL sync.
	CommitWaitDuration time.Duration
}
BatchCommitStats exposes stats related to committing a batch.
NB: there is no Pebble internal tracing (using LoggerAndTracer) of slow batch commits. The caller can use these stats to do their own tracing as needed.
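For example, the stats can be sampled right after a commit (a sketch, assuming an open db; error handling abbreviated):

b := db.NewBatch()
_ = b.Set([]byte("k"), []byte("v"), nil)
if err := b.Commit(estore.Sync); err != nil {
	log.Fatal(err)
}
stats := b.CommitStats()
log.Printf("commit took %s (WAL rotation: %s)",
	stats.TotalDuration, stats.WALRotationDuration)
_ = b.Close()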
type BatchReader ¶
type BatchReader []byte
BatchReader iterates over the entries contained in a batch.
func ReadBatch ¶
func ReadBatch(repr []byte) (r BatchReader, count uint32)
ReadBatch constructs a BatchReader from a batch representation. The header is not validated. ReadBatch returns a new batch reader and the count of entries contained within the batch.
func (*BatchReader) Next ¶
func (r *BatchReader) Next() (kind InternalKeyKind, ukey []byte, value []byte, ok bool, err error)
Next returns the next entry in this batch, if there is one. If the reader has reached the end of the batch, Next returns ok=false and a nil error. If the batch is corrupt and the next entry is illegible, Next returns ok=false and a non-nil error.
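A sketch of draining a batch's contents with a BatchReader, following the contract above (b is assumed to be a *Batch with some operations in it):

r := b.Reader()
for {
	kind, ukey, value, ok, err := r.Next()
	if err != nil {
		log.Fatal(err) // the next entry is illegible
	}
	if !ok {
		break // end of batch
	}
	fmt.Printf("%v %q %q\n", kind, ukey, value)
}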
type BlockPropertyCollector ¶
type BlockPropertyCollector = sstable.BlockPropertyCollector
BlockPropertyCollector exports the sstable.BlockPropertyCollector type.
type BlockPropertyFilter ¶
type BlockPropertyFilter = base.BlockPropertyFilter
BlockPropertyFilter exports the base.BlockPropertyFilter type.
type BlockPropertyFilterMask ¶
type BlockPropertyFilterMask interface {
	BlockPropertyFilter

	// SetSuffix configures the mask with the suffix of a range key. The filter
	// should return false from Intersects whenever it's provided with a
	// property encoding a block's minimum suffix that's greater (according to
	// Compare) than the provided suffix.
	SetSuffix(suffix []byte) error
}
BlockPropertyFilterMask extends the BlockPropertyFilter interface for use with range-key masking. Unlike an ordinary block property filter, a BlockPropertyFilterMask's filtering criteria is allowed to change when Pebble invokes its SetSuffix method.
When a Pebble iterator steps into a range key's bounds and the range key has a suffix greater than or equal to RangeKeyMasking.Suffix, the range key acts as a mask. The masking range key hides all point keys that fall within the range key's bounds and have suffixes > the range key's suffix. Without a filter mask configured, Pebble performs this hiding by stepping through point keys and comparing suffixes. If large numbers of point keys are masked, this requires Pebble to load, iterate through and discard a large number of sstable blocks containing masked point keys.
If a block-property collector and a filter mask are configured, Pebble may skip loading some point-key blocks altogether. If a block's keys are known to all fall within the bounds of the masking range key and the block was annotated by a block-property collector with the maximal suffix, Pebble can ask the filter mask to compare the property to the current masking range key's suffix. If the mask reports no intersection, the block may be skipped.
If unsuffixed and suffixed keys are written to the database, care must be taken to avoid unintentionally masking un-suffixed keys located in the same block as suffixed keys. One solution is to interpret unsuffixed keys as containing the maximal suffix value, ensuring that blocks containing unsuffixed keys are always loaded.
type CPUWorkHandle ¶
type CPUWorkHandle interface {
	// Permitted indicates whether Pebble can use additional CPU resources.
	Permitted() bool
}
CPUWorkHandle represents a handle used by the CPUWorkPermissionGranter API.
type CPUWorkPermissionGranter ¶
type CPUWorkPermissionGranter interface {
	// GetPermission returns a handle regardless of whether permission is granted
	// or not. In the latter case, the handle is only useful for recording
	// the CPU time actually spent on this calling goroutine.
	GetPermission(time.Duration) CPUWorkHandle
	// CPUWorkDone must be called regardless of whether CPUWorkHandle.Permitted
	// returns true or false.
	CPUWorkDone(CPUWorkHandle)
}
CPUWorkPermissionGranter is used to request permission to opportunistically use additional CPUs to speed up internal background work.
type CacheMetrics ¶
CacheMetrics holds metrics for the block and table cache.
type CheckLevelsStats ¶
CheckLevelsStats provides basic stats on points and tombstones encountered.
type CheckpointOption ¶
type CheckpointOption func(*checkpointOptions)
CheckpointOption sets optional parameters used by `DB.Checkpoint`.
func WithFlushedWAL ¶
func WithFlushedWAL() CheckpointOption
WithFlushedWAL enables flushing and syncing the WAL prior to constructing a checkpoint. This guarantees that any writes committed before calling DB.Checkpoint will be part of that checkpoint.
Note that this setting can only be useful in cases when some writes are performed with Sync = false. Otherwise, the guarantee will already be met.
Passing this option is functionally equivalent to calling DB.LogData(nil, Sync) right before DB.Checkpoint.
func WithRestrictToSpans ¶
func WithRestrictToSpans(spans []CheckpointSpan) CheckpointOption
WithRestrictToSpans specifies spans of interest for the checkpoint. Any SSTs that don't overlap with any of these spans are excluded from the checkpoint.
Note that the checkpoint can still surface keys outside of these spans (from the WAL and from SSTs that partially overlap with these spans). Moreover, these surfaced keys aren't necessarily "valid" in that they could have been modified but the SST containing the modification is excluded.
type CheckpointSpan ¶
CheckpointSpan is a key range [Start, End) (inclusive on Start, exclusive on End) of interest for a checkpoint.
type CloneOptions ¶
type CloneOptions struct {
	// IterOptions, if non-nil, define the iterator options to configure a
	// cloned iterator. If nil, the clone adopts the same IterOptions as the
	// iterator being cloned.
	IterOptions *IterOptions

	// RefreshBatchView may be set to true when cloning an Iterator over an
	// indexed batch. When false, the clone adopts the same (possibly stale)
	// view of the indexed batch as the cloned Iterator. When true, the clone is
	// constructed with a refreshed view of the batch, observing all of the
	// batch's mutations at the time of the Clone. If the cloned iterator was
	// not constructed to read over an indexed batch, RefreshBatchView has no
	// effect.
	RefreshBatchView bool
}
CloneOptions configures an iterator constructed through Iterator.Clone.
type CompactionInfo ¶
type CompactionInfo struct {
	// JobID is the ID of the compaction job.
	JobID int
	// Reason is the reason for the compaction.
	Reason string
	// Input contains the input tables for the compaction organized by level.
	Input []LevelInfo
	// Output contains the output tables generated by the compaction. The output
	// tables are empty for the compaction begin event.
	Output LevelInfo
	// Duration is the time spent compacting, including reading and writing
	// sstables.
	Duration time.Duration
	// TotalDuration is the total wall-time duration of the compaction,
	// including applying the compaction to the database. TotalDuration is
	// always ≥ Duration.
	TotalDuration time.Duration
	Done          bool
	Err           error

	SingleLevelOverlappingRatio float64
	MultiLevelOverlappingRatio  float64

	// Annotations specifies additional info to appear in a compaction's event log line
	Annotations compactionAnnotations
}
CompactionInfo contains the info for a compaction event.
func (CompactionInfo) SafeFormat ¶
func (i CompactionInfo) SafeFormat(w redact.SafePrinter, _ rune)
SafeFormat implements redact.SafeFormatter.
func (CompactionInfo) String ¶
func (i CompactionInfo) String() string
type Compression ¶
type Compression = sstable.Compression
Compression exports the sstable.Compression type.
type DB ¶
type DB struct {
// contains filtered or unexported fields
}
DB provides a concurrent, persistent ordered key/value store.
A DB's basic operations (Get, Set, Delete) should be self-explanatory. Get will return ErrNotFound if the requested key is not in the store; callers are free to ignore this error. Deletes are blind and succeed even if the key is absent (see Delete).
A DB also allows for iterating over the key/value pairs in key order. If d is a DB, the code below prints all key/value pairs whose keys are 'greater than or equal to' k:
iter, _ := d.NewIter(readOptions)
for iter.SeekGE(k); iter.Valid(); iter.Next() {
	fmt.Printf("key=%q value=%q\n", iter.Key(), iter.Value())
}
return iter.Close()
The Options struct holds the optional parameters for the DB, including a Comparer to define a 'less than' relationship over keys. It is always valid to pass a nil *Options, which means to use the default parameter values. Any zero field of a non-nil *Options also means to use the default value for that parameter. Thus, the code below uses a custom Comparer, but the default values for every other parameter:
db, err := pebble.Open("", &pebble.Options{
	Comparer: myComparer,
})
func (*DB) Apply ¶
func (d *DB) Apply(batch *Batch, opts *WriteOptions) error
Apply the operations contained in the batch to the DB. If the batch is large the contents of the batch may be retained by the database. If that occurs the batch contents will be cleared preventing the caller from attempting to reuse them.
It is safe to modify the contents of the arguments after Apply returns.
func (*DB) ApplyNoSyncWait ¶
func (d *DB) ApplyNoSyncWait(batch *Batch, opts *WriteOptions) error
ApplyNoSyncWait must only be used when opts.Sync is true and the caller does not want to wait for the WAL fsync to happen. The method will return once the mutation is applied to the memtable and is visible (note that a mutation is visible before the WAL sync even in the wait case, so we have not weakened the durability semantics). The caller must call Batch.SyncWait to wait for the WAL fsync. The caller must not Close the batch without first calling Batch.SyncWait.
RECOMMENDATION: Prefer using Apply unless you really understand why you need ApplyNoSyncWait. EXPERIMENTAL: API/feature subject to change. Do not yet use outside CockroachDB.
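A hedged sketch of the split commit described above (error handling abbreviated):

b := db.NewBatch()
_ = b.Set([]byte("k"), []byte("v"), nil)
// Returns once the mutation is applied to the memtable and visible.
if err := db.ApplyNoSyncWait(b, estore.Sync); err != nil {
	log.Fatal(err)
}
// ... overlap other work with the WAL fsync ...
// SyncWait must be called before closing the batch.
if err := b.SyncWait(); err != nil {
	log.Fatal(err)
}
_ = b.Close()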
func (*DB) AsyncFlush ¶
AsyncFlush asynchronously flushes the memtable to stable storage.
If no error is returned, the caller can receive from the returned channel in order to wait for the flush to complete.
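For example, to trigger a flush and wait for its completion later:

ch, err := db.AsyncFlush()
if err != nil {
	log.Fatal(err)
}
// ... do other work ...
<-ch // the flush has completed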
func (*DB) CheckLevels ¶
func (d *DB) CheckLevels(stats *CheckLevelsStats) error
CheckLevels checks:
- Every entry in the DB is consistent with the level invariant. See the comment at the top of the file.
- Point keys in sstables are ordered.
- Range delete tombstones in sstables are ordered and fragmented.
- Successful processing of all MERGE records.
func (*DB) Checkpoint ¶
func (d *DB) Checkpoint(destDir string, opts ...CheckpointOption) (ckErr error)
Checkpoint constructs a snapshot of the DB instance in the specified directory. The WAL, MANIFEST, OPTIONS, and sstables will be copied into the snapshot. Hard links will be used when possible. Beware of the significant space overhead for a checkpoint if hard links are disabled. Also beware that even if hard links are used, the space overhead for the checkpoint will increase over time as the DB performs compactions.
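A minimal sketch (the destination path is hypothetical and must not already exist):

// WithFlushedWAL guarantees that writes committed before this call are
// part of the checkpoint, even those written with Sync = false.
if err := db.Checkpoint("/backups/ckpt-1", estore.WithFlushedWAL()); err != nil {
	log.Fatal(err)
}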
func (*DB) Close ¶
Close closes the DB.
It is not safe to close a DB until all outstanding iterators are closed or to call Close concurrently with any other DB method. It is not valid to call any of a DB's methods after the DB has been closed.
func (*DB) Delete ¶
func (d *DB) Delete(key []byte, opts *WriteOptions) error
Delete deletes the value for the given key. Deletes are blind and will succeed even if the given key does not exist.
It is safe to modify the contents of the arguments after Delete returns.
func (*DB) DeleteRange ¶
func (d *DB) DeleteRange(start, end []byte, opts *WriteOptions) error
DeleteRange deletes all of the keys (and values) in the range [start,end) (inclusive on start, exclusive on end).
It is safe to modify the contents of the arguments after DeleteRange returns.
func (*DB) DeleteSized ¶
func (d *DB) DeleteSized(key []byte, valueSize uint32, opts *WriteOptions) error
DeleteSized behaves identically to Delete, but takes an additional argument indicating the size of the value being deleted. DeleteSized should be preferred when the caller has the expectation that there exists a single internal KV pair for the key (eg, the key has not been overwritten recently), and the caller knows the size of its value.
DeleteSized will record the value size within the tombstone and use it to inform compaction-picking heuristics which strive to reduce space amplification in the LSM. This "calling your shot" mechanic allows the storage engine to more accurately estimate and reduce space amplification.
It is safe to modify the contents of the arguments after DeleteSized returns.
func (*DB) Download ¶
func (d *DB) Download(ctx context.Context, spans []DownloadSpan) error
Download ensures that the LSM does not use any external sstables for the given key ranges. It does so by performing appropriate compactions so that all external data becomes available locally.
Note that calling this method does not imply that all other compactions stop; it simply informs Pebble of a list of spans for which external data should be downloaded with high priority.
The method returns once no external sstables overlap the given spans, the context is canceled, or an error is hit.
TODO(radu): consider passing a priority/impact knob to express how important the download is (versus live traffic performance, LSM health).
func (*DB) EstimateDiskUsage ¶
EstimateDiskUsage returns the estimated filesystem space used in bytes for storing the range `[start, end]`. The estimation is computed as follows:
- For sstables fully contained in the range the whole file size is included.
- For sstables partially contained in the range the overlapping data block sizes are included. Even if a data block partially overlaps, or we cannot determine overlap due to abbreviated index keys, the full data block size is included in the estimation. Note that unlike fully contained sstables, none of the meta-block space is counted for partially overlapped files.
- For virtual sstables, we use the overlap between start, end and the virtual sstable bounds to determine disk usage.
- There may also exist WAL entries for unflushed keys in this range. This estimation currently excludes space used for the range in the WAL.
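A usage sketch over a hypothetical key range (error handling abbreviated):

usage, err := db.EstimateDiskUsage([]byte("a"), []byte("z"))
if err != nil {
	log.Fatal(err)
}
log.Printf("~%d bytes used by [a, z]", usage)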
func (*DB) EstimateDiskUsageByBackingType ¶
func (d *DB) EstimateDiskUsageByBackingType(start, end []byte) (totalSize, remoteSize, externalSize uint64, _ error)
EstimateDiskUsageByBackingType is like EstimateDiskUsage but additionally returns the subsets of that size in remote and external files.
func (*DB) FormatMajorVersion ¶
func (d *DB) FormatMajorVersion() FormatMajorVersion
FormatMajorVersion returns the database's active format major version. The format major version may be higher than the one provided in Options when the database was opened if the existing database was written with a higher format version.
func (*DB) Get ¶
Get gets the value for the given key. It returns ErrNotFound if the DB does not contain the key.
The caller should not modify the contents of the returned slice, but it is safe to modify the contents of the argument after Get returns. The returned slice will remain valid until the returned Closer is closed. On success, the caller MUST call closer.Close() or a memory leak will occur.
func (*DB) Ingest ¶
Ingest ingests a set of sstables into the DB. Ingestion of the files is atomic and semantically equivalent to creating a single batch containing all of the mutations in the sstables. Ingestion may require the memtable to be flushed. The ingested sstable files are moved into the DB and must reside on the same filesystem as the DB. Sstables can be created for ingestion using sstable.Writer. On success, Ingest removes the input paths.
Two types of sstables are accepted for ingestion: sstables present in the instance's vfs.FS, which can be referenced locally, and sstables present in remote.Storage, referred to as shared or foreign sstables. Shared sstables can be linked through objstorageprovider.Provider and do not need to already be present on the local vfs.FS. Foreign sstables must all fit in an excise span, and are destined for a level specified in SharedSSTMeta.
All sstables *must* be Sync()'d by the caller after all bytes are written and before its file handle is closed; failure to do so could violate durability or lead to corrupted on-disk state. This method cannot, in a platform-and-FS-agnostic way, ensure that all sstables in the input are properly synced to disk. Opening new file handles and Sync()-ing them does not always guarantee durability; see the discussion here on that: https://github.com/cockroachdb/pebble/pull/835#issuecomment-663075379
Ingestion loads each sstable into the lowest level of the LSM which it doesn't overlap (see ingestTargetLevel). If an sstable overlaps a memtable, ingestion forces the memtable to flush, and then waits for the flush to occur. In some cases, such as with no foreign sstables and no excise span, ingestion that gets blocked on a memtable can join the flushable queue and finish even before the memtable has been flushed.
The steps for ingestion are:
- Allocate file numbers for every sstable being ingested.
- Load the metadata for all sstables being ingested.
- Sort the sstables by smallest key, verifying non overlap (for local sstables).
- Hard link (or copy) the local sstables into the DB directory.
- Allocate a sequence number to use for all of the entries in the local sstables. This is the step where overlap with memtables is determined. If there is overlap, we remember the most recent memtable that overlaps.
- Update the sequence number in the ingested local sstables. (Remote sstables get fixed sequence numbers that were determined at load time.)
- Wait for the most recent memtable that overlaps to flush (if any).
- Add the ingested sstables to the version (DB.ingestApply). If an excise span was specified, figure out which sstables in the current version overlap the excise span, and create new virtual sstables out of those sstables that exclude the excised span (DB.excise).
- Publish the ingestion sequence number.
Note that if the mutable memtable overlaps with ingestion, a flush of the memtable is forced, equivalent to DB.Flush. Additionally, subsequent mutations that get sequence numbers larger than the ingestion sequence number get queued up behind the ingestion waiting for it to complete. This can produce a noticeable hiccup in performance. See https://github.com/cockroachdb/pebble/issues/25 for an idea for how to fix this hiccup.
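A hedged sketch of a basic local ingestion, assuming an Ingest(paths []string) error signature and that the sstable at the (hypothetical) path below was built with sstable.Writer and synced as required above:

// Ingest moves the file into the DB and removes the input path on success.
if err := db.Ingest([]string{"/tmp/external-000001.sst"}); err != nil {
	log.Fatal(err)
}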
func (*DB) IngestAndExcise ¶
func (d *DB) IngestAndExcise(
	paths []string, shared []SharedSSTMeta, exciseSpan KeyRange,
) (IngestOperationStats, error)
IngestAndExcise does the same as IngestWithStats, and additionally accepts a list of shared files to ingest that can be read from a remote.Storage through a Provider. All the shared files must live within exciseSpan, and any existing keys in exciseSpan are deleted by turning existing sstables into virtual sstables (if not virtual already) and shrinking their spans to exclude exciseSpan. See the comment at Ingest for a more complete picture of the ingestion process.
Panics if this DB instance was not instantiated with a remote.Storage and shared sstables are present.
func (*DB) IngestExternalFiles ¶
func (d *DB) IngestExternalFiles(external []ExternalFile) (IngestOperationStats, error)
IngestExternalFiles does the same as IngestWithStats, and additionally accepts external files (with locator info that can be resolved using d.opts.SharedStorage). These files must also be non-overlapping with each other, and must be resolvable through d.objProvider.
func (*DB) IngestWithStats ¶
func (d *DB) IngestWithStats(paths []string) (IngestOperationStats, error)
IngestWithStats does the same as Ingest, and additionally returns IngestOperationStats.
func (*DB) LogData ¶
func (d *DB) LogData(data []byte, opts *WriteOptions) error
LogData adds the specified data to the WAL; it is not added to memtables or sstables. Log data is never indexed, which makes it useful for testing WAL performance.
It is safe to modify the contents of the argument after LogData returns.
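For example:

// Record an application-level marker in the WAL. It is never indexed and
// will not be visible to reads.
if err := db.LogData([]byte("backup-cutoff"), estore.Sync); err != nil {
	log.Fatal(err)
}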
func (*DB) Merge ¶
func (d *DB) Merge(key, value []byte, opts *WriteOptions) error
Merge adds an action to the DB that merges the value at key with the new value. The details of the merge are dependent upon the configured merge operator.
It is safe to modify the contents of the arguments after Merge returns.
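A short sketch, assuming the DB was opened with a merge operator that concatenates operands (the key and values are placeholders):

// With a concatenating merge operator, reading "k" after these calls
// would yield a value combining "a" and "b".
if err := db.Merge([]byte("k"), []byte("a"), estore.Sync); err != nil {
	log.Fatal(err)
}
if err := db.Merge([]byte("k"), []byte("b"), estore.Sync); err != nil {
	log.Fatal(err)
}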
func (*DB) NewBatch ¶
NewBatch returns a new empty write-only batch. Any reads on the batch will return an error. If the batch is committed it will be applied to the DB.
func (*DB) NewBatchWithSize ¶
NewBatchWithSize is mostly identical to NewBatch, but it will allocate the specified memory space for the internal slice in advance.
func (*DB) NewEventuallyFileOnlySnapshot ¶
func (d *DB) NewEventuallyFileOnlySnapshot(keyRanges []KeyRange) *EventuallyFileOnlySnapshot
NewEventuallyFileOnlySnapshot returns a point-in-time view of the current DB state, similar to NewSnapshot, but with consistency constrained to the provided set of key ranges. See the comment at EventuallyFileOnlySnapshot for its semantics.
func (*DB) NewIndexedBatch ¶
NewIndexedBatch returns a new empty read-write batch. Any reads on the batch will read from both the batch and the DB. If the batch is committed it will be applied to the DB. An indexed batch is slower than a non-indexed batch for insert operations. If you do not need to perform reads on the batch, use NewBatch instead.
func (*DB) NewIndexedBatchWithSize ¶
NewIndexedBatchWithSize is mostly identical to NewIndexedBatch, but it will allocate the specified memory space for the internal slice in advance.
func (*DB) NewIter ¶
func (d *DB) NewIter(o *IterOptions) (*Iterator, error)
NewIter returns an iterator that is unpositioned (Iterator.Valid() will return false). The iterator can be positioned via a call to SeekGE, SeekLT, First or Last. The iterator provides a point-in-time view of the current DB state. This view is maintained by preventing file deletions and preventing memtables referenced by the iterator from being deleted. Using an iterator to maintain a long-lived point-in-time view of the DB state can lead to an apparent memory and disk usage leak. Use snapshots (see NewSnapshot) for point-in-time snapshots, which avoid these problems.
func (*DB) NewIterWithContext ¶
NewIterWithContext is like NewIter, and additionally accepts a context for tracing.
func (*DB) NewSnapshot ¶
NewSnapshot returns a point-in-time view of the current DB state. Iterators created with this handle will all observe a stable snapshot of the current DB state. The caller must call Snapshot.Close() when the snapshot is no longer needed. Snapshots are not persisted across DB restarts (close -> open). Unlike the implicit snapshot maintained by an iterator, a snapshot will not prevent memtables from being released or sstables from being deleted. Instead, a snapshot prevents deletion of sequence numbers referenced by the snapshot.
func (*DB) NewTransaction ¶
func (d *DB) NewTransaction(writable bool) *Transaction
NewTransaction starts a new transaction.
Read transactions can be run concurrently, but only one write transaction can be run at a time. If additional write transactions are started, the calls to this function will block until the current write transaction is closed.
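A minimal sketch; only NewTransaction itself is shown here, and the methods available on the returned handle are documented on the Transaction type rather than assumed:

// writable=true: blocks if another write transaction is currently open.
tx := db.NewTransaction(true)
// ... read/write through tx, then commit or discard it via the methods
// documented on Transaction ...
_ = tx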
func (*DB) ObjProvider ¶
func (d *DB) ObjProvider() objstorage.Provider
ObjProvider returns the objstorage.Provider for this database. Meant to be used for internal purposes only.
func (*DB) RangeKeyDelete ¶
func (d *DB) RangeKeyDelete(start, end []byte, opts *WriteOptions) error
RangeKeyDelete deletes all of the range keys in the range [start,end) (inclusive on start, exclusive on end). It does not delete point keys (for that use DeleteRange). RangeKeyDelete removes all range keys within the bounds, including those with or without suffixes.
It is safe to modify the contents of the arguments after RangeKeyDelete returns.
func (*DB) RangeKeySet ¶
func (d *DB) RangeKeySet(start, end, suffix, value []byte, opts *WriteOptions) error
RangeKeySet sets a range key mapping the key range [start, end) at the MVCC timestamp suffix to value. The suffix is optional. If any portion of the key range [start, end) is already set by a range key with the same suffix value, RangeKeySet overrides it.
It is safe to modify the contents of the arguments after RangeKeySet returns.
func (*DB) RangeKeyUnset ¶
func (d *DB) RangeKeyUnset(start, end, suffix []byte, opts *WriteOptions) error
RangeKeyUnset removes a range key mapping the key range [start, end) at the MVCC timestamp suffix. The suffix may be omitted to remove an unsuffixed range key. RangeKeyUnset only removes portions of range keys that fall within the [start, end) key span, and only range keys with suffixes that exactly match the unset suffix.
It is safe to modify the contents of the arguments after RangeKeyUnset returns.
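Taken together, a short sketch of the three range-key operations using the signatures above:

// Write a range key over ["a","z") at suffix "@5", unset part of it, and
// finally delete whatever remains in the span.
if err := db.RangeKeySet([]byte("a"), []byte("z"), []byte("@5"), []byte("v1"), estore.Sync); err != nil {
	log.Fatal(err)
}
if err := db.RangeKeyUnset([]byte("a"), []byte("m"), []byte("@5"), estore.Sync); err != nil {
	log.Fatal(err)
}
if err := db.RangeKeyDelete([]byte("a"), []byte("z"), estore.Sync); err != nil {
	log.Fatal(err)
}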
func (*DB) RatchetFormatMajorVersion ¶
func (d *DB) RatchetFormatMajorVersion(fmv FormatMajorVersion) error
RatchetFormatMajorVersion ratchets the opened database's format major version to the provided version. It errors if the provided format major version is below the database's current version. Once a database's format major version is upgraded, previous Pebble versions that do not know of the format version will be unable to open the database.
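For example, to opt in to the newest format this binary supports (after which older binaries can no longer open the store):

if err := db.RatchetFormatMajorVersion(estore.FormatNewest); err != nil {
	log.Fatal(err)
}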
func (*DB) SSTables ¶
func (d *DB) SSTables(opts ...SSTablesOption) ([][]SSTableInfo, error)
SSTables retrieves the current sstables. The returned slice is indexed by level and each level is indexed by the position of the sstable within the level. Note that this information may be out of date due to concurrent flushes and compactions.
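For example:

tables, err := db.SSTables()
if err != nil {
	log.Fatal(err)
}
for level, infos := range tables {
	fmt.Printf("L%d: %d sstables\n", level, len(infos))
}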
func (*DB) ScanInternal ¶
func (d *DB) ScanInternal(
	ctx context.Context,
	lower, upper []byte,
	visitPointKey func(key *InternalKey, value LazyValue, iterInfo IteratorLevel) error,
	visitRangeDel func(start, end []byte, seqNum uint64) error,
	visitRangeKey func(start, end []byte, keys []rangekey.Key) error,
	visitSharedFile func(sst *SharedSSTMeta) error,
) error
ScanInternal scans all internal keys within the specified bounds, truncating any rangedels and rangekeys to those bounds if they span past them. For use when an external user needs to be aware of all internal keys that make up a key range.
Keys deleted by range deletions must not be returned or exposed by this method, while the range deletion deleting that key must be exposed using visitRangeDel. Keys that would be masked by range key masking (if an appropriate prefix were set) should be exposed, alongside the range key that would have masked it. This method also collapses all point keys into one InternalKey; so only one internal key at most per user key is returned to visitPointKey.
If visitSharedFile is not nil, ScanInternal iterates in skip-shared iteration mode. In this iteration mode, sstables in levels L5 and L6 are skipped, and their metadata truncated to [lower, upper) and passed into visitSharedFile. ErrInvalidSkipSharedIteration is returned if visitSharedFile is not nil and an sstable in L5 or L6 is found that is not in shared storage according to provider.IsShared, or an sstable in those levels contains a newer key than the snapshot sequence number (only applicable for snapshot.ScanInternal). This could happen, for example, if Pebble started writing sstables before a creator ID was set (creator IDs are necessary to enable shared storage), resulting in some lower-level SSTs being on non-shared storage. Skip-shared iteration is invalid in those cases.
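A hedged sketch of a plain (non-skip-shared) scan; ctx is an assumed context.Context, the bounds are placeholders, and passing nil for visitors that aren't needed is assumed to be accepted (pass no-op functions otherwise):

err := db.ScanInternal(ctx, []byte("a"), []byte("z"),
	func(key *estore.InternalKey, value estore.LazyValue, info estore.IteratorLevel) error {
		fmt.Printf("point key %s\n", key.UserKey)
		return nil
	},
	nil, // visitRangeDel
	nil, // visitRangeKey
	nil, // visitSharedFile == nil: skip-shared iteration disabled
)
if err != nil {
	log.Fatal(err)
}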
func (*DB) ScanStatistics ¶
func (d *DB) ScanStatistics(
	ctx context.Context, lower, upper []byte, opts ScanStatisticsOptions,
) (LSMKeyStatistics, error)
ScanStatistics returns the count of different key kinds within the LSM for a key span [lower, upper) as well as the number of snapshot keys.
func (*DB) Set ¶
func (d *DB) Set(key, value []byte, opts *WriteOptions) error
Set sets the value for the given key. It overwrites any previous value for that key; a DB is not a multi-map.
It is safe to modify the contents of the arguments after Set returns.
func (*DB) SetCreatorID ¶
SetCreatorID sets the CreatorID which is needed in order to use shared objects. Remote object usage is disabled until this method is called the first time. Once set, the Creator ID is persisted and cannot change.
Does nothing if SharedStorage was not set in the options when the DB was opened or if the DB is in read-only mode.
func (*DB) SingleDelete ¶
func (d *DB) SingleDelete(key []byte, opts *WriteOptions) error
SingleDelete adds an action to the batch that single deletes the entry for key. See Writer.SingleDelete for more details on the semantics of SingleDelete.
It is safe to modify the contents of the arguments after SingleDelete returns.
func (*DB) TestOnlyWaitForCleaning ¶
func (d *DB) TestOnlyWaitForCleaning()
TestOnlyWaitForCleaning MUST only be used in tests.
type DBDesc ¶
type DBDesc struct {
	// Exists is true if an existing database was found.
	Exists bool
	// FormatMajorVersion indicates the database's current format
	// version.
	FormatMajorVersion FormatMajorVersion
	// ManifestFilename is the filename of the current active manifest,
	// if the database exists.
	ManifestFilename string
}
DBDesc briefly describes high-level state about a database.
type DeferredBatchOp ¶
type DeferredBatchOp struct {
// Key and Value point to parts of the binary batch representation where
// keys and values should be encoded/copied into. len(Key) and len(Value)
// bytes must be copied into these slices respectively before calling
// Finish(). Changing where these slices point to is not allowed.
Key, Value []byte
// contains filtered or unexported fields
}
DeferredBatchOp represents a batch operation (eg. set, merge, delete) that is being inserted into the batch. Indexing is not performed on the specified key until Finish is called, hence the name deferred. This struct lets the caller copy or encode keys/values directly into the batch representation instead of copying into an intermediary buffer then having pebble.Batch copy off of it.
func (DeferredBatchOp) Finish ¶
func (d DeferredBatchOp) Finish() error
Finish completes the addition of this batch operation, and adds it to the index if necessary. Must be called once (and exactly once) after the keys/values have been filled into Key and Value. Not calling Finish or not copying/encoding keys will result in an incomplete index, and calling Finish twice may result in a panic.
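A sketch using Batch.SetDeferred (assuming NewBatch returns a *Batch as described above; the key and value are placeholders):

b := db.NewBatch()
k, v := []byte("key"), []byte("value")
op := b.SetDeferred(len(k), len(v))
copy(op.Key, k)   // encode the key directly into the batch representation
copy(op.Value, v) // likewise for the value
if err := op.Finish(); err != nil {
	log.Fatal(err)
}
// Commit the batch as usual afterwards, e.g. b.Commit(estore.Sync).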
type DeletableValueMerger ¶
type DeletableValueMerger = base.DeletableValueMerger
DeletableValueMerger exports the base.DeletableValueMerger type.
type DeleteCleaner ¶
type DeleteCleaner = base.DeleteCleaner
DeleteCleaner exports the base.DeleteCleaner type.
type DiskSlowInfo ¶
type DiskSlowInfo = vfs.DiskSlowInfo
DiskSlowInfo contains the info for a disk slowness event when writing to a file.
type DownloadSpan ¶
DownloadSpan is a key range passed to the Download method.
type EventListener ¶
type EventListener struct {
	// BackgroundError is invoked whenever an error occurs during a background
	// operation such as flush or compaction.
	BackgroundError func(error)
	// CompactionBegin is invoked after the inputs to a compaction have been
	// determined, but before the compaction has produced any output.
	CompactionBegin func(CompactionInfo)
	// CompactionEnd is invoked after a compaction has completed and the result
	// has been installed.
	CompactionEnd func(CompactionInfo)
	// DiskSlow is invoked after a disk write operation on a file created with a
	// disk health checking vfs.FS (see vfs.DefaultWithDiskHealthChecks) is
	// observed to exceed the specified disk slowness threshold duration. DiskSlow
	// is called on a goroutine that is monitoring slowness/stuckness. The callee
	// MUST return without doing any IO, or blocking on anything (like a mutex)
	// that is waiting on IO. This is imperative in order to reliably monitor for
	// slowness, since if this goroutine gets stuck, the monitoring will stop
	// working.
	DiskSlow func(DiskSlowInfo)
	// FlushBegin is invoked after the inputs to a flush have been determined,
	// but before the flush has produced any output.
	FlushBegin func(FlushInfo)
	// FlushEnd is invoked after a flush has completed and the result has been
	// installed.
	FlushEnd func(FlushInfo)
	// FormatUpgrade is invoked after the database's FormatMajorVersion
	// is upgraded.
	FormatUpgrade func(FormatMajorVersion)
	// ManifestCreated is invoked after a manifest has been created.
	ManifestCreated func(ManifestCreateInfo)
	// ManifestDeleted is invoked after a manifest has been deleted.
	ManifestDeleted func(ManifestDeleteInfo)
	// TableCreated is invoked when a table has been created.
	TableCreated func(TableCreateInfo)
	// TableDeleted is invoked after a table has been deleted.
	TableDeleted func(TableDeleteInfo)
	// TableIngested is invoked after an externally created table has been
	// ingested via a call to DB.Ingest().
	TableIngested func(TableIngestInfo)
	// TableStatsLoaded is invoked at most once, when the table stats
	// collector has loaded statistics for all tables that existed at Open.
	TableStatsLoaded func(TableStatsInfo)
	// TableValidated is invoked after validation runs on an sstable.
	TableValidated func(TableValidatedInfo)
	// WALCreated is invoked after a WAL has been created.
	WALCreated func(WALCreateInfo)
	// WALDeleted is invoked after a WAL has been deleted.
	WALDeleted func(WALDeleteInfo)
	// WriteStallBegin is invoked when writes are intentionally delayed.
	WriteStallBegin func(WriteStallBeginInfo)
	// WriteStallEnd is invoked when delayed writes are released.
	WriteStallEnd func()
}
EventListener contains a set of functions that will be invoked when various significant DB events occur. Note that the functions should not run for an excessive amount of time as they are invoked synchronously by the DB and may block continued DB work. For a similar reason it is advisable to not perform any synchronous calls back into the DB.
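A hedged sketch of composing listeners with the helpers below; logger and recordDiskSlowMetric are hypothetical placeholders, and wiring the result into the DB is assumed to happen through Options:

base := estore.MakeLoggingEventListener(logger) // logger: an estore.Logger of your choosing
custom := estore.EventListener{
	DiskSlow: func(info estore.DiskSlowInfo) {
		// Must return quickly without doing IO; see the field comment above.
		recordDiskSlowMetric(info) // hypothetical, non-blocking hook
	},
}
listener := estore.TeeEventListener(base, custom)
_ = listener // pass to the DB via Options when opening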
func MakeLoggingEventListener ¶
func MakeLoggingEventListener(logger Logger) EventListener
MakeLoggingEventListener creates an EventListener that logs all events to the specified logger.
func TeeEventListener ¶
func TeeEventListener(a, b EventListener) EventListener
TeeEventListener wraps two EventListeners, forwarding all events to both.
func (*EventListener) EnsureDefaults ¶
func (l *EventListener) EnsureDefaults(logger Logger)
EnsureDefaults ensures that background error events are logged to the specified logger if a handler for those events hasn't been otherwise specified. Ensure all handlers are non-nil so that we don't have to check for nil-ness before invoking.
type EventuallyFileOnlySnapshot ¶
type EventuallyFileOnlySnapshot struct {
// contains filtered or unexported fields
}
EventuallyFileOnlySnapshot (aka EFOS) provides a read-only point-in-time view of the database state, similar to Snapshot. An EventuallyFileOnlySnapshot induces less write amplification than Snapshot, at the cost of increased space amplification. While a Snapshot may increase write amplification across all flushes and compactions for the duration of its lifetime, an EventuallyFileOnlySnapshot only incurs that cost for flushes/compactions if memtables at the time of EFOS instantiation contained keys that the EFOS is interested in (i.e. its protectedRanges). In that case, the EFOS prevents elision of keys visible to it, similar to a Snapshot, until those memtables are flushed, and once that happens, the "EventuallyFileOnlySnapshot" transitions to a file-only snapshot state in which it pins zombie sstables like an open Iterator would, without pinning any memtables. Callers that can tolerate the increased space amplification of pinning zombie sstables until the snapshot is closed may prefer EventuallyFileOnlySnapshots for their reduced write amplification. Callers that desire the benefits of the file-only state, which requires no pinning of memtables, should call `WaitForFileOnlySnapshot()` (and possibly re-mint an EFOS if it returns ErrSnapshotExcised) before relying on the EFOS to keep producing iterators with zero write-amp and zero pinning of memtables in memory.
EventuallyFileOnlySnapshots interact with the IngestAndExcise operation in subtle ways. No new iterators can be created once EventuallyFileOnlySnapshot.excised is set to true.
func (*EventuallyFileOnlySnapshot) Close ¶
func (es *EventuallyFileOnlySnapshot) Close() error
Close closes the file-only snapshot and releases all referenced resources. Not idempotent.
func (*EventuallyFileOnlySnapshot) NewIter ¶
func (es *EventuallyFileOnlySnapshot) NewIter(o *IterOptions) (*Iterator, error)
NewIter returns an iterator that is unpositioned (Iterator.Valid() will return false). The iterator can be positioned via a call to SeekGE, SeekLT, First or Last.
func (*EventuallyFileOnlySnapshot) NewIterWithContext ¶
func (es *EventuallyFileOnlySnapshot) NewIterWithContext(
	ctx context.Context, o *IterOptions,
) (*Iterator, error)
NewIterWithContext is like NewIter, and additionally accepts a context for tracing.
func (*EventuallyFileOnlySnapshot) ScanInternal ¶
func (es *EventuallyFileOnlySnapshot) ScanInternal(
	ctx context.Context,
	lower, upper []byte,
	visitPointKey func(key *InternalKey, value LazyValue, iterInfo IteratorLevel) error,
	visitRangeDel func(start, end []byte, seqNum uint64) error,
	visitRangeKey func(start, end []byte, keys []rangekey.Key) error,
	visitSharedFile func(sst *SharedSSTMeta) error,
) error
ScanInternal scans all internal keys within the specified bounds, truncating any rangedels and rangekeys to those bounds. For use when an external user needs to be aware of all internal keys that make up a key range.
See comment on db.ScanInternal for the behaviour that can be expected of point keys deleted by range dels and keys masked by range keys.
func (*EventuallyFileOnlySnapshot) WaitForFileOnlySnapshot ¶
func (es *EventuallyFileOnlySnapshot) WaitForFileOnlySnapshot(
	ctx context.Context, dur time.Duration,
) error
WaitForFileOnlySnapshot blocks the calling goroutine until this snapshot has been converted into a file-only snapshot (i.e. all memtables containing keys < seqNum are flushed). A duration can be passed in, and if nonzero, a delayed flush will be scheduled at that duration if necessary.
Idempotent; can be called multiple times with no side effects.
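A short sketch, assuming keys in ["a","m") are the only ones the caller needs a consistent view of, and that ctx is an assumed context.Context:

efos := db.NewEventuallyFileOnlySnapshot([]estore.KeyRange{{Start: []byte("a"), End: []byte("m")}})
defer efos.Close()
// Optionally wait for the transition so iterators pin no memtables;
// dur == 0 schedules no delayed flush.
if err := efos.WaitForFileOnlySnapshot(ctx, 0); err != nil {
	log.Fatal(err)
}
iter, err := efos.NewIter(nil)
if err != nil {
	log.Fatal(err)
}
defer iter.Close()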
type ExternalFile ¶
type ExternalFile struct {
	// Locator is the shared.Locator that can be used with objProvider to
	// resolve a reference to this external sstable.
	Locator remote.Locator
	// ObjName is the unique name of this sstable on Locator.
	ObjName string
	// Size of the referenced proportion of the virtualized sstable. An estimate
	// is acceptable in lieu of the backing file size.
	Size uint64
	// SmallestUserKey and LargestUserKey are the [smallest,largest) user key
	// bounds of the sstable. Both these bounds are loose i.e. it's possible for
	// the sstable to not span the entirety of this range. However, multiple
	// ExternalFiles in one ingestion must all have non-overlapping
	// [smallest, largest) spans. Note that this Largest bound is exclusive.
	SmallestUserKey, LargestUserKey []byte
	// HasPointKey and HasRangeKey denote whether this file contains point keys
	// or range keys. If both fields are false, an error is returned during
	// ingestion.
	HasPointKey, HasRangeKey bool
}
ExternalFile describes an external sstable that can be referenced through objprovider and ingested as a remote file that will not be refcounted or cleaned up. For use with online restore. Note that the underlying sstable could contain keys outside the [Smallest,Largest) bounds; however Pebble is expected to only read the keys within those bounds.
type ExternalIterForwardOnly ¶
type ExternalIterForwardOnly struct{}
ExternalIterForwardOnly is an ExternalIterOption that specifies this iterator will only be used for forward positioning operations (First, SeekGE, Next). This could enable optimizations that take advantage of this invariant. Behaviour when a reverse positioning operation is done on an iterator opened with this option is unpredictable, though in most cases it should return an error.
type ExternalIterOption ¶
type ExternalIterOption interface {
// contains filtered or unexported methods
}
ExternalIterOption provides an interface to specify open-time options to NewExternalIter.
func ExternalIterReaderOptions ¶
func ExternalIterReaderOptions(opts ...sstable.ReaderOption) ExternalIterOption
ExternalIterReaderOptions returns an ExternalIterOption that specifies sstable.ReaderOptions to be applied on sstable readers in NewExternalIter.
type FilterMetrics ¶
type FilterMetrics = sstable.FilterMetrics
FilterMetrics holds metrics for the filter policy
type FilterPolicy ¶
type FilterPolicy = base.FilterPolicy
FilterPolicy exports the base.FilterPolicy type.
type FilterWriter ¶
type FilterWriter = base.FilterWriter
FilterWriter exports the base.FilterWriter type.
type FlushInfo ¶
type FlushInfo struct {
	// JobID is the ID of the flush job.
	JobID int
	// Reason is the reason for the flush.
	Reason string
	// Input contains the count of input memtables that were flushed.
	Input int
	// InputBytes contains the total in-memory size of the memtable(s) that were
	// flushed. This size includes skiplist indexing data structures.
	InputBytes uint64
	// Output contains the output table generated by the flush. The output info
	// is empty for the flush begin event.
	Output []TableInfo
	// Duration is the time spent flushing. This duration includes writing and
	// syncing all of the flushed keys to sstables.
	Duration time.Duration
	// TotalDuration is the total wall-time duration of the flush, including
	// applying the flush to the database. TotalDuration is always ≥ Duration.
	TotalDuration time.Duration
	// Ingest is set to true if the flush is handling tables that were added to
	// the flushable queue via an ingestion operation.
	Ingest bool
	// IngestLevels are the output levels for each ingested table in the flush.
	// This field is only populated when Ingest is true.
	IngestLevels []int
	Done bool
	Err  error
}
FlushInfo contains the info for a flush event.
func (FlushInfo) SafeFormat ¶
func (i FlushInfo) SafeFormat(w redact.SafePrinter, _ rune)
SafeFormat implements redact.SafeFormatter.
type FormatMajorVersion ¶
type FormatMajorVersion uint64
FormatMajorVersion is a constant controlling the format of persisted data. Backwards incompatible changes to durable formats are gated behind new format major versions.
At any point, a database's format major version may be bumped. However, once a database's format major version is increased, previous versions of Pebble will refuse to open the database.
The zero value format is the FormatDefault constant. The exact FormatVersion that the default corresponds to may change with time.
const (
	// FormatDefault leaves the format version unspecified. The
	// FormatDefault constant may be ratcheted upwards over time.
	FormatDefault FormatMajorVersion = iota
	// FormatMostCompatible maintains the most backwards compatibility,
	// maintaining bi-directional compatibility with RocksDB 6.2.1 in
	// the particular configuration described in the Pebble README.
	FormatMostCompatible
	// FormatVersioned is a new format major version that replaces the
	// old `CURRENT` file with a new 'marker' file scheme. Previous
	// Pebble versions will be unable to open the database unless
	// they're aware of format versions.
	FormatVersioned
	// FormatSetWithDelete is a format major version that introduces a new key
	// kind, base.InternalKeyKindSetWithDelete. Previous Pebble versions will be
	// unable to open this database.
	FormatSetWithDelete
	// FormatBlockPropertyCollector is a format major version that introduces
	// BlockPropertyCollectors.
	FormatBlockPropertyCollector
	// FormatSplitUserKeysMarked is a format major version that guarantees that
	// all files that share user keys with neighbors are marked for compaction
	// in the manifest. Ratcheting to FormatSplitUserKeysMarked will block
	// (without holding mutexes) until the scan of the LSM is complete and the
	// manifest has been rotated.
	FormatSplitUserKeysMarked
	// FormatSplitUserKeysMarkedCompacted is a format major version that
	// guarantees that all files explicitly marked for compaction in the manifest
	// have been compacted. Combined with the FormatSplitUserKeysMarked format
	// major version, this version guarantees that there are no user keys split
	// across multiple files within a level L1+. Ratcheting to this format version
	// will block (without holding mutexes) until all necessary compactions for
	// files marked for compaction are complete.
	FormatSplitUserKeysMarkedCompacted
	// FormatRangeKeys is a format major version that introduces range keys.
	FormatRangeKeys
	// FormatMinTableFormatPebblev1 is a format major version that guarantees that
	// tables created by or ingested into the DB at or above this format major
	// version will have a table format version of at least Pebblev1 (Block
	// Properties).
	FormatMinTableFormatPebblev1
	// FormatPrePebblev1Marked is a format major version that guarantees that all
	// sstables with a table format version pre-Pebblev1 (i.e. those that are
	// guaranteed to not contain block properties) are marked for compaction in
	// the manifest. Ratcheting to FormatPrePebblev1Marked will block (without
	// holding mutexes) until the scan of the LSM is complete and the manifest has
	// been rotated.
	FormatPrePebblev1Marked
	// FormatSSTableValueBlocks is a format major version that adds support for
	// storing values in value blocks in the sstable. Value block support is not
	// necessarily enabled when writing sstables, when running with this format
	// major version.
	//
	// WARNING: In development, so no production code should upgrade to this
	// format, since a DB with this format major version will not actually
	// interoperate correctly with another DB with the same format major
	// version. This format major version is introduced so that tests can start
	// being executed up to this version. Note that these tests succeed despite
	// the incomplete support since they do not enable value blocks and use
	// TableFormatPebblev2.
	FormatSSTableValueBlocks
	// FormatFlushableIngest is a format major version that enables lazy
	// addition of ingested sstables into the LSM structure. When an ingest
	// overlaps with a memtable, a record of the ingest is written to the WAL
	// without waiting for a flush. Subsequent reads treat the ingested files as
	// a level above the overlapping memtable. Once the memtable is flushed, the
	// ingested files are moved into the lowest possible levels.
	//
	// This feature is behind a format major version because it required
	// breaking changes to the WAL format.
	FormatFlushableIngest
	// FormatPrePebblev1MarkedCompacted is a format major version that guarantees
	// that all sstables explicitly marked for compaction in the manifest (see
	// FormatPrePebblev1Marked) have been compacted. Ratcheting to this format
	// version will block (without holding mutexes) until all necessary
	// compactions for files marked for compaction are complete.
	FormatPrePebblev1MarkedCompacted
	// FormatDeleteSizedAndObsolete is a format major version that adds support
	// for deletion tombstones that encode the size of the value they're
	// expected to delete. This format major version is required before the
	// associated key kind may be committed through batch applications or
	// ingests. It also adds support for keys that are marked obsolete (see
	// sstable/format.go for details).
	FormatDeleteSizedAndObsolete
	// FormatVirtualSSTables is a format major version that adds support for
	// virtual sstables that can reference a sub-range of keys in an underlying
	// physical sstable. This information is persisted through new,
	// backward-incompatible fields in the Manifest, and therefore requires
	// a format major version.
	FormatVirtualSSTables
	// FormatNewest always contains the most recent format major version.
	FormatNewest FormatMajorVersion = internalFormatNewest
)
func (FormatMajorVersion) MaxTableFormat ¶
func (v FormatMajorVersion) MaxTableFormat() sstable.TableFormat
MaxTableFormat returns the maximum sstable.TableFormat that can be used at this FormatMajorVersion.
func (FormatMajorVersion) MinTableFormat ¶
func (v FormatMajorVersion) MinTableFormat() sstable.TableFormat
MinTableFormat returns the minimum sstable.TableFormat that can be used at this FormatMajorVersion.
func (FormatMajorVersion) SafeValue ¶
func (v FormatMajorVersion) SafeValue()
SafeValue implements redact.SafeValue.
func (FormatMajorVersion) String ¶
func (v FormatMajorVersion) String() string
String implements fmt.Stringer.
type IngestOperationStats ¶
type IngestOperationStats struct {
	// Bytes is the total bytes in the ingested sstables.
	Bytes uint64
	// ApproxIngestedIntoL0Bytes is the approximate number of bytes ingested
	// into L0. This value is approximate when flushable ingests are active and
	// an ingest overlaps an entry in the flushable queue. Currently, this
	// approximation is very rough, only including tables that overlapped the
	// memtable. This estimate may be improved with #2112.
	ApproxIngestedIntoL0Bytes uint64
	// MemtableOverlappingFiles is the count of ingested sstables
	// that overlapped keys in the memtables.
	MemtableOverlappingFiles int
}
IngestOperationStats provides some information about where in the LSM the bytes were ingested.
type InternalIteratorStats ¶
type InternalIteratorStats = base.InternalIteratorStats
InternalIteratorStats contains miscellaneous stats produced by internal iterators.
type InternalKey ¶
type InternalKey = base.InternalKey
InternalKey exports the base.InternalKey type.
type InternalKeyKind ¶
type InternalKeyKind = base.InternalKeyKind
InternalKeyKind exports the base.InternalKeyKind type.
type IterKeyType ¶
type IterKeyType int8
IterKeyType configures which types of keys an iterator should surface.
const (
	// IterKeyTypePointsOnly configures an iterator to iterate over point keys
	// only.
	IterKeyTypePointsOnly IterKeyType = iota
	// IterKeyTypeRangesOnly configures an iterator to iterate over range keys
	// only.
	IterKeyTypeRangesOnly
	// IterKeyTypePointsAndRanges configures an iterator to iterate over both
	// point keys and range keys simultaneously.
	IterKeyTypePointsAndRanges
)
type IterOptions ¶
type IterOptions struct {
	// LowerBound specifies the smallest key (inclusive) that the iterator will
	// return during iteration. If the iterator is seeked or iterated past this
	// boundary the iterator will return Valid()==false. Setting LowerBound
	// effectively truncates the key space visible to the iterator.
	LowerBound []byte
	// UpperBound specifies the largest key (exclusive) that the iterator will
	// return during iteration. If the iterator is seeked or iterated past this
	// boundary the iterator will return Valid()==false. Setting UpperBound
	// effectively truncates the key space visible to the iterator.
	UpperBound []byte
	// TableFilter can be used to filter the tables that are scanned during
	// iteration based on the user properties. Return true to scan the table and
	// false to skip scanning. This function must be thread-safe since the same
	// function can be used by multiple iterators, if the iterator is cloned.
	TableFilter func(userProps map[string]string) bool
	// SkipPoint may be used to skip over point keys that don't match an
	// arbitrary predicate during iteration. If set, the Iterator invokes
	// SkipPoint for keys encountered. If SkipPoint returns true, the iterator
	// will skip the key without yielding it to the iterator operation in
	// progress.
	//
	// SkipPoint must be a pure function and always return the same result when
	// provided the same arguments. The iterator may call SkipPoint multiple
	// times for the same user key.
	SkipPoint func(userKey []byte) bool
	// PointKeyFilters can be used to avoid scanning tables and blocks in tables
	// when iterating over point keys. This slice represents an intersection
	// across all filters, i.e., all filters must indicate that the block is
	// relevant.
	//
	// Performance note: When len(PointKeyFilters) > 0, the caller should ensure
	// that cap(PointKeyFilters) is at least len(PointKeyFilters)+1. This helps
	// avoid allocations in Pebble internal code that mutates the slice.
	PointKeyFilters []BlockPropertyFilter
	// RangeKeyFilters can be used to avoid scanning tables and blocks in tables
	// when iterating over range keys. The same requirements that apply to
	// PointKeyFilters apply here too.
	RangeKeyFilters []BlockPropertyFilter
	// KeyTypes configures which types of keys to iterate over: point keys,
	// range keys, or both.
	KeyTypes IterKeyType
	// RangeKeyMasking can be used to enable automatic masking of point keys by
	// range keys. Range key masking is only supported during combined range key
	// and point key iteration mode (IterKeyTypePointsAndRanges).
	RangeKeyMasking RangeKeyMasking
	// OnlyReadGuaranteedDurable is an advanced option that is only supported by
	// the Reader implemented by DB. When set to true, only the guaranteed to be
	// durable state is visible in the iterator.
	// - This definition is made under the assumption that the FS implementation
	//   is providing a durability guarantee when data is synced.
	// - The visible state represents a consistent point in the history of the
	//   DB.
	// - The implementation is free to choose a conservative definition of what
	//   is guaranteed durable. For simplicity, the current implementation
	//   ignores memtables. A more sophisticated implementation could track the
	//   highest seqnum that is synced to the WAL and published and use that as
	//   the visible seqnum for an iterator. Note that the latter approach is
	//   not strictly better than the former since we can have DBs that are (a)
	//   synced more rarely than memtable flushes, (b) have no WAL. (a) is
	//   likely to be true in a future CockroachDB context where the DB
	//   containing the state machine may be rarely synced.
	// NB: this current implementation relies on the fact that memtables are
	// flushed in seqnum order, and any ingested sstables that happen to have a
	// lower seqnum than a non-flushed memtable don't have any overlapping keys.
	// This is the fundamental level invariant used in other code too, like when
	// merging iterators.
	//
	// Semantically, using this option provides the caller a "snapshot" as of
	// the time the most recent memtable was flushed. An alternate interface
	// would be to add a NewSnapshot variant. Creating a snapshot is heavier
	// weight than creating an iterator, so we have opted to support this
	// iterator option.
	OnlyReadGuaranteedDurable bool
	// UseL6Filters allows the caller to opt into reading filter blocks for L6
	// sstables. Helpful if a lot of SeekPrefixGEs are expected in quick
	// succession, that are also likely to not yield a single key. Filter blocks
	// in L6 can be relatively large, often larger than data blocks, so the
	// benefit of loading them in the cache is minimized if the probability of
	// the key existing is not low or if we just expect a one-time Seek (where
	// loading the data block directly is better).
	UseL6Filters bool
	// contains filtered or unexported fields
}
IterOptions hold the optional per-query parameters for NewIter.
Like Options, a nil *IterOptions is valid and means to use the default values.
func (*IterOptions) GetLowerBound ¶
func (o *IterOptions) GetLowerBound() []byte
GetLowerBound returns the LowerBound or nil if the receiver is nil.
func (*IterOptions) GetUpperBound ¶
func (o *IterOptions) GetUpperBound() []byte
GetUpperBound returns the UpperBound or nil if the receiver is nil.
func (*IterOptions) SpanIterOptions ¶
func (o *IterOptions) SpanIterOptions() keyspan.SpanIterOptions
SpanIterOptions creates a SpanIterOptions from this IterOptions.
type IterValidityState ¶
type IterValidityState int8
IterValidityState captures the state of the Iterator.
const (
	// IterExhausted represents an Iterator that is exhausted.
	IterExhausted IterValidityState = iota
	// IterValid represents an Iterator that is valid.
	IterValid
	// IterAtLimit represents an Iterator that has a non-exhausted
	// internalIterator, but has reached a limit without any key for the
	// caller.
	IterAtLimit
)
type Iterator ¶
type Iterator struct {
// contains filtered or unexported fields
}
Iterator iterates over a DB's key/value pairs in key order.
An iterator must be closed after use, but it is not necessary to read an iterator until exhaustion.
An iterator is not goroutine-safe, but it is safe to use multiple iterators concurrently, with each in a dedicated goroutine.
It is also safe to use an iterator concurrently with modifying its underlying DB, if that DB permits modification. However, the resultant key/value pairs are not guaranteed to be a consistent snapshot of that DB at a particular point in time.
If an iterator encounters an error during any operation, it is stored by the Iterator and surfaced through the Error method. All absolute positioning methods (eg, SeekGE, SeekLT, First, Last) reset any accumulated error before positioning. All relative positioning methods (eg, Next, Prev) return without advancing if the iterator has an accumulated error.
Example ¶
package main import ( "fmt" "log" "github.com/edgelesssys/estore" "github.com/edgelesssys/estore/vfs" ) func main() { db, err := estore.Open("", &estore.Options{FS: vfs.NewMem()}) if err != nil { log.Fatal(err) } keys := []string{"hello", "world", "hello world"} for _, key := range keys { if err := db.Set([]byte(key), nil, estore.Sync); err != nil { log.Fatal(err) } } iter, _ := db.NewIter(nil) for iter.First(); iter.Valid(); iter.Next() { fmt.Printf("%s\n", iter.Key()) } if err := iter.Close(); err != nil { log.Fatal(err) } if err := db.Close(); err != nil { log.Fatal(err) } }
Output: hello hello world world
Example (PrefixIteration) ¶
package main import ( "fmt" "log" "github.com/edgelesssys/estore" "github.com/edgelesssys/estore/vfs" ) func main() { db, err := estore.Open("", &estore.Options{FS: vfs.NewMem()}) if err != nil { log.Fatal(err) } keyUpperBound := func(b []byte) []byte { end := make([]byte, len(b)) copy(end, b) for i := len(end) - 1; i >= 0; i-- { end[i] = end[i] + 1 if end[i] != 0 { return end[:i+1] } } return nil // no upper-bound } prefixIterOptions := func(prefix []byte) *estore.IterOptions { return &estore.IterOptions{ LowerBound: prefix, UpperBound: keyUpperBound(prefix), } } keys := []string{"hello", "world", "hello world"} for _, key := range keys { if err := db.Set([]byte(key), nil, estore.Sync); err != nil { log.Fatal(err) } } iter, _ := db.NewIter(prefixIterOptions([]byte("hello"))) for iter.First(); iter.Valid(); iter.Next() { fmt.Printf("%s\n", iter.Key()) } if err := iter.Close(); err != nil { log.Fatal(err) } if err := db.Close(); err != nil { log.Fatal(err) } }
Output: hello hello world
func NewExternalIter ¶
func NewExternalIter(
	o *Options,
	iterOpts *IterOptions,
	files [][]sstable.ReadableFile,
	extraOpts ...ExternalIterOption,
) (it *Iterator, err error)
NewExternalIter takes an input 2d array of sstable files which may overlap across subarrays but not within a subarray (at least as far as points are concerned; range keys are allowed to overlap arbitrarily even within a subarray), and returns an Iterator over the merged contents of the sstables. Input sstables may contain point keys, range keys, range deletions, etc. The input files slice must be sorted in reverse chronological ordering. A key in a file at a lower index subarray will shadow a key with an identical user key contained within a file at a higher index subarray. Each subarray must be sorted in internal key order, where lower index files contain keys that sort left of files with higher indexes.
Input sstables must only contain keys with the zero sequence number.
Iterators constructed through NewExternalIter do not support all iterator options, including block-property and table filters. NewExternalIter errors if an incompatible option is set.
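A hedged sketch; opts is an assumed *estore.Options, and f1 and f2 are assumed to be already opened sstable.ReadableFile handles whose keys all carry the zero sequence number, with f1 in the newer subarray:

// Keys in f1 shadow identical user keys in f2.
it, err := estore.NewExternalIter(opts, nil, [][]sstable.ReadableFile{{f1}, {f2}})
if err != nil {
	log.Fatal(err)
}
defer it.Close()
for valid := it.First(); valid; valid = it.Next() {
	fmt.Printf("%s\n", it.Key())
}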
func NewExternalIterWithContext ¶
func NewExternalIterWithContext(
	ctx context.Context,
	o *Options,
	iterOpts *IterOptions,
	files [][]sstable.ReadableFile,
	extraOpts ...ExternalIterOption,
) (it *Iterator, err error)
NewExternalIterWithContext is like NewExternalIter, and additionally accepts a context for tracing.
func (*Iterator) Clone ¶
func (i *Iterator) Clone(opts CloneOptions) (*Iterator, error)
Clone creates a new Iterator over the same underlying data, i.e., over the same {batch, memtables, sstables}. The resulting iterator is not positioned. It starts with the same IterOptions, unless opts.IterOptions is set.
When called on an Iterator over an indexed batch, the clone's visibility of the indexed batch is determined by CloneOptions.RefreshBatchView. If false, the clone inherits the iterator's current (possibly stale) view of the batch, and callers may call SetOptions to subsequently refresh the clone's view to include all batch mutations. If true, the clone is constructed with a complete view of the indexed batch's mutations at the time of the Clone.
Callers can use Clone if they need multiple iterators that need to see exactly the same underlying state of the DB. This should not be used to extend the lifetime of the data backing the original Iterator since that will cause an increase in memory and disk usage (use NewSnapshot for that purpose).
func (*Iterator) CloneWithContext ¶
CloneWithContext is like Clone, and additionally accepts a context for tracing.
func (*Iterator) Close ¶
Close closes the iterator and returns any accumulated error. Exhausting all the key/value pairs in a table is not considered to be an error. It is not valid to call any method, including Close, after the iterator has been closed.
func (*Iterator) First ¶
First moves the iterator to the first key/value pair. Returns true if the iterator is pointing at a valid entry and false otherwise.
func (*Iterator) HasPointAndRange ¶
HasPointAndRange indicates whether there exists a point key, a range key or both at the current iterator position.
func (*Iterator) Key ¶
Key returns the key of the current key/value pair, or nil if done. The caller should not modify the contents of the returned slice, and its contents may change on the next call to Next.
If positioned at an iterator position that only holds a range key, Key() always returns the start bound of the range key. Otherwise, it returns the point key's key.
func (*Iterator) Last ¶
Last moves the iterator to the last key/value pair. Returns true if the iterator is pointing at a valid entry and false otherwise.
func (*Iterator) LazyValue ¶
LazyValue returns the LazyValue. Only for advanced use cases. REQUIRES: i.Error()==nil and HasPointAndRange() returns true for hasPoint.
func (*Iterator) Metrics ¶
func (i *Iterator) Metrics() IteratorMetrics
Metrics returns per-iterator metrics.
func (*Iterator) Next ¶
Next moves the iterator to the next key/value pair. Returns true if the iterator is pointing at a valid entry and false otherwise.
func (*Iterator) NextPrefix ¶
NextPrefix moves the iterator to the next key/value pair with a key containing a different prefix than the current key. Prefixes are determined by Comparer.Split. Exhausts the iterator if invoked while in prefix-iteration mode.
It is not permitted to invoke NextPrefix while at an IterAtLimit position. When called in this condition, NextPrefix has non-deterministic behavior.
It is not permitted to invoke NextPrefix when the Iterator has an upper-bound that is a versioned MVCC key (see the comment for Comparer.Split). It returns an error in this case.
func (*Iterator) NextWithLimit ¶
func (i *Iterator) NextWithLimit(limit []byte) IterValidityState
NextWithLimit moves the iterator to the next key/value pair.
If limit is provided, it serves as a best-effort exclusive limit. If the next key is greater than or equal to limit, the Iterator may pause and return IterAtLimit. Because limits are best-effort, NextWithLimit may return a key beyond limit.
If the Iterator is configured to iterate over range keys, NextWithLimit guarantees it will surface any range keys with bounds overlapping the keyspace up to limit.
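A sketch of a limit-aware forward scan using the IterValidityState values defined above and SeekGEWithLimit (documented below); the keys are placeholders:

limit := []byte("m")
for state := iter.SeekGEWithLimit([]byte("a"), limit); state != estore.IterExhausted; state = iter.NextWithLimit(limit) {
	if state == estore.IterAtLimit {
		// The iterator paused at the limit without yielding a key.
		break
	}
	fmt.Printf("%s\n", iter.Key())
}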
func (*Iterator) Prev ¶
Prev moves the iterator to the previous key/value pair. Returns true if the iterator is pointing at a valid entry and false otherwise.
func (*Iterator) PrevWithLimit ¶
func (i *Iterator) PrevWithLimit(limit []byte) IterValidityState
PrevWithLimit moves the iterator to the previous key/value pair.
If limit is provided, it serves as a best-effort inclusive limit. If the previous key is less than limit, the Iterator may pause and return IterAtLimit. Because limits are best-effort, PrevWithLimit may return a key beyond limit.
If the Iterator is configured to iterate over range keys, PrevWithLimit guarantees it will surface any range keys with bounds overlapping the keyspace up to limit.
func (*Iterator) RangeBounds ¶
RangeBounds returns the start (inclusive) and end (exclusive) bounds of the range key covering the current iterator position. RangeBounds returns nil bounds if there is no range key covering the current iterator position, or the iterator is not configured to surface range keys.
If valid, the returned start bound is less than or equal to Key() and the returned end bound is greater than Key().
func (*Iterator) RangeKeyChanged ¶
RangeKeyChanged indicates whether the most recent iterator positioning operation resulted in the iterator stepping into or out of a new range key. If true, previously returned range key bounds and data have been invalidated. If false, previously obtained range key bounds, suffix and value slices are still valid and may continue to be read.
Invalid iterator positions are considered to not hold range keys, meaning that if an iterator steps from an IterExhausted or IterAtLimit position onto a position with a range key, RangeKeyChanged will yield true.
func (*Iterator) RangeKeys ¶
func (i *Iterator) RangeKeys() []RangeKeyData
RangeKeys returns the range key values and their suffixes covering the current iterator position. The range bounds may be retrieved separately through Iterator.RangeBounds().
func (*Iterator) SeekGE ¶
SeekGE moves the iterator to the first key/value pair whose key is greater than or equal to the given key. Returns true if the iterator is pointing at a valid entry and false otherwise.
Example ¶
package main import ( "fmt" "log" "github.com/edgelesssys/estore" "github.com/edgelesssys/estore/vfs" ) func main() { db, err := estore.Open("", &estore.Options{FS: vfs.NewMem()}) if err != nil { log.Fatal(err) } keys := []string{"hello", "world", "hello world"} for _, key := range keys { if err := db.Set([]byte(key), nil, estore.Sync); err != nil { log.Fatal(err) } } iter, _ := db.NewIter(nil) if iter.SeekGE([]byte("a")); iter.Valid() { fmt.Printf("%s\n", iter.Key()) } if iter.SeekGE([]byte("hello w")); iter.Valid() { fmt.Printf("%s\n", iter.Key()) } if iter.SeekGE([]byte("w")); iter.Valid() { fmt.Printf("%s\n", iter.Key()) } if err := iter.Close(); err != nil { log.Fatal(err) } if err := db.Close(); err != nil { log.Fatal(err) } }
Output: hello hello world world
func (*Iterator) SeekGEWithLimit ¶
func (i *Iterator) SeekGEWithLimit(key []byte, limit []byte) IterValidityState
SeekGEWithLimit moves the iterator to the first key/value pair whose key is greater than or equal to the given key.
If limit is provided, it serves as a best-effort exclusive limit. If the first key greater than or equal to the given search key is also greater than or equal to limit, the Iterator may pause and return IterAtLimit. Because limits are best-effort, SeekGEWithLimit may return a key beyond limit.
If the Iterator is configured to iterate over range keys, SeekGEWithLimit guarantees it will surface any range keys with bounds overlapping the keyspace [key, limit).
func (*Iterator) SeekLT ¶
SeekLT moves the iterator to the last key/value pair whose key is less than the given key. Returns true if the iterator is pointing at a valid entry and false otherwise.
func (*Iterator) SeekLTWithLimit ¶
func (i *Iterator) SeekLTWithLimit(key []byte, limit []byte) IterValidityState
SeekLTWithLimit moves the iterator to the last key/value pair whose key is less than the given key.
If limit is provided, it serves as a best-effort inclusive limit. If the last key less than the given search key is also less than limit, the Iterator may pause and return IterAtLimit. Because limits are best-effort, SeekLTWithLimit may return a key beyond limit.
If the Iterator is configured to iterate over range keys, SeekLTWithLimit guarantees it will surface any range keys with bounds overlapping the keyspace up to limit.
func (*Iterator) SeekPrefixGE ¶
SeekPrefixGE moves the iterator to the first key/value pair whose key is greater than or equal to the given key and which has the same "prefix" as the given key. The prefix for a key is determined by the user-defined Comparer.Split function. The iterator will not observe keys not matching the "prefix" of the search key. Calling SeekPrefixGE puts the iterator in prefix iteration mode. The iterator remains in prefix iteration until a subsequent call to another absolute positioning method (SeekGE, SeekLT, First, Last). Reverse iteration (Prev) is not supported when an iterator is in prefix iteration mode. Returns true if the iterator is pointing at a valid entry and false otherwise.
The semantics of SeekPrefixGE are slightly unusual and designed for iteration to be able to take advantage of bloom filters that have been created on the "prefix". If you're not using bloom filters, there is no reason to use SeekPrefixGE.
An example Split function may separate a timestamp suffix from the prefix of the key.
Split(<key>@<timestamp>) -> <key>
Consider the keys "a@1", "a@2", "aa@3", "aa@4". The prefixes for these keys are "a", and "aa". Note that despite "a" and "aa" sharing a prefix by the usual definition, those prefixes differ by the definition of the Split function. To see how this works, consider the following set of calls on this data set:
SeekPrefixGE("a@0") -> "a@1" Next() -> "a@2" Next() -> EOF
If you're just looking to iterate over keys with a shared prefix, as defined by the configured comparer, set iterator bounds instead:
iter := db.NewIter(&pebble.IterOptions{ LowerBound: []byte("prefix"), UpperBound: []byte("prefiy"), }) for iter.First(); iter.Valid(); iter.Next() { // Only keys beginning with "prefix" will be visited. }
See ExampleIterator_SeekPrefixGE for a working example.
When iterating with range keys enabled, all range keys encountered are truncated to the seek key's prefix's bounds. The truncation of the upper bound requires that the database's Comparer is configured with a ImmediateSuccessor method. For example, a SeekPrefixGE("a@9") call with the prefix "a" will truncate range key bounds to [a,ImmediateSuccessor(a)].
func (*Iterator) SetBounds ¶
SetBounds sets the lower and upper bounds for the iterator. Once SetBounds returns, the caller is free to mutate the provided slices.
The iterator will always be invalidated and must be repositioned with a call to SeekGE, SeekPrefixGE, SeekLT, First, or Last.
func (*Iterator) SetOptions ¶
func (i *Iterator) SetOptions(o *IterOptions)
SetOptions sets new iterator options for the iterator. Note that the lower and upper bounds applied here will supersede any bounds set by previous calls to SetBounds.
Note that the slices provided in this SetOptions must not be changed by the caller until the iterator is closed, or a subsequent SetBounds or SetOptions has returned. This is because comparisons between the existing and new bounds are sometimes used to optimize seeking. See the extended commentary on SetBounds.
If the iterator was created over an indexed mutable batch, the iterator's view of the mutable batch is refreshed.
The iterator will always be invalidated and must be repositioned with a call to SeekGE, SeekPrefixGE, SeekLT, First, or Last.
If only lower and upper bounds need to be modified, prefer SetBounds.
func (*Iterator) Valid ¶
Valid returns true if the iterator is positioned at a valid key/value pair and false otherwise.
func (*Iterator) Value ¶
Value returns the value of the current key/value pair, or nil if done. The caller should not modify the contents of the returned slice, and its contents may change on the next call to Next.
Only valid if HasPointAndRange() returns true for hasPoint. Deprecated: use ValueAndErr instead.
func (*Iterator) ValueAndErr ¶
ValueAndErr returns the value, and any error encountered in extracting the value. REQUIRES: i.Error()==nil and HasPointAndRange() returns true for hasPoint.
The caller should not modify the contents of the returned slice, and its contents may change on the next call to Next.
type IteratorLevel ¶
type IteratorLevel struct {
	Kind IteratorLevelKind
	// FlushableIndex indicates the position within the flushable queue of this
	// level. Only valid if kind == IteratorLevelFlushable.
	FlushableIndex int
	// The level within the LSM. Only valid if Kind == IteratorLevelLSM.
	Level int
	// Sublevel is only valid if Kind == IteratorLevelLSM and Level == 0.
	Sublevel int
}
IteratorLevel is used with scanInternalIterator to surface additional iterator-specific info where possible. Note: this struct is only provided for point keys.
type IteratorLevelKind ¶
type IteratorLevelKind int8
IteratorLevelKind is used to denote whether the current ScanInternal iterator is unknown, belongs to a flushable, or belongs to an LSM level.
const (
	// IteratorLevelUnknown indicates an unknown LSM level.
	IteratorLevelUnknown IteratorLevelKind = iota
	// IteratorLevelLSM indicates an LSM level.
	IteratorLevelLSM
	// IteratorLevelFlushable indicates a flushable (i.e. memtable).
	IteratorLevelFlushable
)
type IteratorMetrics ¶
type IteratorMetrics struct {
	// The read amplification experienced by this iterator. This is the sum of
	// the memtables, the L0 sublevels and the non-empty Ln levels. Higher read
	// amplification generally results in slower reads, though allowing higher
	// read amplification can also result in faster writes.
	ReadAmp int
}
IteratorMetrics holds per-iterator metrics. These do not change over the lifetime of the iterator.
type IteratorStats ¶
type IteratorStats struct { // ForwardSeekCount includes SeekGE, SeekPrefixGE, First. ForwardSeekCount [NumStatsKind]int // ReverseSeek includes SeekLT, Last. ReverseSeekCount [NumStatsKind]int // ForwardStepCount includes Next. ForwardStepCount [NumStatsKind]int // ReverseStepCount includes Prev. ReverseStepCount [NumStatsKind]int InternalStats InternalIteratorStats RangeKeyStats RangeKeyIteratorStats }
IteratorStats contains iteration stats.
func (*IteratorStats) Merge ¶
func (stats *IteratorStats) Merge(o IteratorStats)
Merge adds all of the argument's statistics to the receiver. It may be used to accumulate stats across multiple iterators.
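A sketch of accumulating stats across iterators (assumes an Iterator.Stats() accessor as in Pebble; iterA and iterB are hypothetical open iterators):

var total estore.IteratorStats
for _, it := range []*estore.Iterator{iterA, iterB} {
    // Stats returns a value, so merging does not alias the iterator's state.
    total.Merge(it.Stats())
}
fmt.Println(total.String())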
func (*IteratorStats) SafeFormat ¶
func (stats *IteratorStats) SafeFormat(s redact.SafePrinter, verb rune)
SafeFormat implements the redact.SafeFormatter interface.
func (*IteratorStats) String ¶
func (stats *IteratorStats) String() string
type IteratorStatsKind ¶
type IteratorStatsKind int8
IteratorStatsKind describes the two kinds of iterator stats.
const ( // InterfaceCall represents calls to Iterator. InterfaceCall IteratorStatsKind = iota // InternalIterCall represents calls by Iterator to its internalIterator. InternalIterCall // NumStatsKind is the number of kinds, and is used for array sizing. NumStatsKind )
type KeyRange ¶
type KeyRange struct {
Start, End []byte
}
KeyRange encodes a key range in user key space. A KeyRange's Start is inclusive while its End is exclusive.
func (*KeyRange) Contains ¶
func (k *KeyRange) Contains(cmp base.Compare, key InternalKey) bool
Contains returns whether the specified key exists in the KeyRange.
func (*KeyRange) Overlaps ¶
Overlaps checks if the specified file has an overlap with the KeyRange. Note that we aren't checking for full containment of m within k, rather just that there's some intersection between m and k's bounds.
func (*KeyRange) OverlapsInternalKeyRange ¶
func (k *KeyRange) OverlapsInternalKeyRange(cmp base.Compare, smallest, largest InternalKey) bool
OverlapsInternalKeyRange checks if the specified internal key range has an overlap with the KeyRange. Note that we aren't checking for full containment of smallest-largest within k, rather just that there's some intersection between the two ranges.
func (*KeyRange) OverlapsKeyRange ¶
OverlapsKeyRange checks if this span overlaps with the provided KeyRange. Note that we aren't checking for full containment of either span in the other, just that there's a key x that is in both key ranges.
type KeyStatistics ¶
type KeyStatistics struct { // SnapshotPinnedKeys represents obsolete keys that cannot be elided during // a compaction, because they are required by an open snapshot. SnapshotPinnedKeys int // SnapshotPinnedKeysBytes is the total number of bytes of all snapshot // pinned keys. SnapshotPinnedKeysBytes uint64 // KindsCount is the count for each kind of key. It includes point keys, // range deletes and range keys. KindsCount [InternalKeyKindMax + 1]int // LatestKindsCount is the count for each kind of key when it is the latest // kind for a user key. It is only populated for point keys. LatestKindsCount [InternalKeyKindMax + 1]int }
KeyStatistics keeps track of the number of keys that have been pinned by a snapshot as well as counts of the different key kinds in the LSM.
One way of using the accumulated stats, when we only have sets and dels: say the counts are represented as del_count, set_count, del_latest_count, set_latest_count, and snapshot_pinned_count. Then:
del_latest_count + set_latest_count is the count of unique user keys (unique).
set_latest_count is the count of live unique user keys (live_unique).
Garbage is del_count + set_count - live_unique.
If everything were in the LSM, del_count+set_count-snapshot_pinned_count would also be the set of unique user keys (note that snapshot_pinned_count is counting something different -- see comment below). But snapshot_pinned_count only counts keys in the LSM so the excess here must be keys in memtables.
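A worked example with made-up counts (shown as Go for concreteness):

delCount, setCount := 40, 100            // all DEL and SET internal keys
delLatestCount, setLatestCount := 20, 70 // counts where the kind is latest for a user key
unique := delLatestCount + setLatestCount   // 90 unique user keys
liveUnique := setLatestCount                // 70 live unique user keys
garbage := delCount + setCount - liveUnique // 140 - 70 = 70 garbage keys
fmt.Println(unique, liveUnique, garbage)    // 90 70 70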
type LSMKeyStatistics ¶
type LSMKeyStatistics struct { Accumulated KeyStatistics // Levels contains statistics only for point keys. Range deletions and range keys will // appear in Accumulated but not Levels. Levels [numLevels]KeyStatistics // BytesRead represents the logical, pre-compression size of keys and values read BytesRead uint64 }
LSMKeyStatistics is used by DB.ScanStatistics.
type LazyFetcher ¶
type LazyFetcher = base.LazyFetcher
LazyFetcher exports the base.LazyFetcher type. This export is needed since LazyValue.Clone requires a pointer to a LazyFetcher struct to avoid allocations. No code outside Pebble needs to peer into a LazyFetcher.
type LevelInfo ¶
LevelInfo contains info pertaining to a particular level.
func (LevelInfo) SafeFormat ¶
func (i LevelInfo) SafeFormat(w redact.SafePrinter, _ rune)
SafeFormat implements redact.SafeFormatter.
type LevelMetrics ¶
type LevelMetrics struct { // The number of sublevels within the level. The sublevel count corresponds // to the read amplification for the level. An empty level will have a // sublevel count of 0, implying no read amplification. Only L0 will have // a sublevel count other than 0 or 1. Sublevels int32 // The total number of files in the level. NumFiles int64 // The total number of virtual sstables in the level. NumVirtualFiles uint64 // The total size in bytes of the files in the level. Size int64 // The total size of the virtual sstables in the level. VirtualSize uint64 // The level's compaction score. This is the compensatedScoreRatio in the // candidateLevelInfo. Score float64 // The number of incoming bytes from other levels read during // compactions. This excludes bytes moved and bytes ingested. For L0 this is // the bytes written to the WAL. BytesIn uint64 // The number of bytes ingested. The sibling metric for tables is // TablesIngested. BytesIngested uint64 // The number of bytes moved into the level by a "move" compaction. The // sibling metric for tables is TablesMoved. BytesMoved uint64 // The number of bytes read for compactions at the level. This includes bytes // read from other levels (BytesIn), as well as bytes read for the level. BytesRead uint64 // The number of bytes written during compactions. The sibling // metric for tables is TablesCompacted. This metric may be summed // with BytesFlushed to compute the total bytes written for the level. BytesCompacted uint64 // The number of bytes written during flushes. The sibling // metrics for tables is TablesFlushed. This metric is always // zero for all levels other than L0. BytesFlushed uint64 // The number of sstables compacted to this level. TablesCompacted uint64 // The number of sstables flushed to this level. TablesFlushed uint64 // The number of sstables ingested into the level. TablesIngested uint64 // The number of sstables moved to this level by a "move" compaction. TablesMoved uint64 MultiLevel struct { // BytesInTop are the total bytes in a multilevel compaction coming from the top level. BytesInTop uint64 // BytesIn, exclusively for multiLevel compactions. BytesIn uint64 // BytesRead, exclusively for multilevel compactions. BytesRead uint64 } // Additional contains misc additional metrics that are not always printed. Additional struct { // The sum of Properties.ValueBlocksSize for all the sstables in this // level. Printed by LevelMetrics.format iff there is at least one level // with a non-zero value. ValueBlocksSize uint64 // Cumulative metrics about bytes written to data blocks and value blocks, // via compactions (except move compactions) or flushes. Not printed by // LevelMetrics.format, but are available to sophisticated clients. BytesWrittenDataBlocks uint64 BytesWrittenValueBlocks uint64 } }
LevelMetrics holds per-level metrics such as the number of files and total size of the files, and compaction related metrics.
func (*LevelMetrics) Add ¶
func (m *LevelMetrics) Add(u *LevelMetrics)
Add updates the counter metrics for the level.
func (*LevelMetrics) WriteAmp ¶
func (m *LevelMetrics) WriteAmp() float64
WriteAmp computes the write amplification for compactions at this level. Computed as (BytesFlushed + BytesCompacted) / BytesIn.
type LevelOptions ¶
type LevelOptions struct { // BlockRestartInterval is the number of keys between restart points // for delta encoding of keys. // // The default value is 16. BlockRestartInterval int // BlockSize is the target uncompressed size in bytes of each table block. // // The default value is 4096. BlockSize int // BlockSizeThreshold finishes a block if the block size is larger than the // specified percentage of the target block size and adding the next entry // would cause the block to be larger than the target block size. // // The default value is 90 BlockSizeThreshold int // Compression defines the per-block compression to use. // // The default value (DefaultCompression) uses snappy compression. Compression Compression // FilterPolicy defines a filter algorithm (such as a Bloom filter) that can // reduce disk reads for Get calls. // // One such implementation is bloom.FilterPolicy(10) from the pebble/bloom // package. // // The default value means to use no filter. FilterPolicy FilterPolicy // FilterType defines whether an existing filter policy is applied at a // block-level or table-level. Block-level filters use less memory to create, // but are slower to access as a check for the key in the index must first be // performed to locate the filter block. A table-level filter will require // memory proportional to the number of keys in an sstable to create, but // avoids the index lookup when determining if a key is present. Table-level // filters should be preferred except under constrained memory situations. FilterType FilterType // IndexBlockSize is the target uncompressed size in bytes of each index // block. When the index block size is larger than this target, two-level // indexes are automatically enabled. Setting this option to a large value // (such as math.MaxInt32) disables the automatic creation of two-level // indexes. // // The default value is the value of BlockSize. IndexBlockSize int // The target file size for the level. TargetFileSize int64 }
LevelOptions holds the optional per-level parameters.
func (*LevelOptions) EnsureDefaults ¶
func (o *LevelOptions) EnsureDefaults() *LevelOptions
EnsureDefaults ensures that the default values for all of the options have been initialized. It is valid to call EnsureDefaults on a nil receiver. A non-nil result will always be returned.
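A sketch of supplying per-level options at Open (field names per the struct above; the bloom package path is an assumption, mirroring pebble/bloom, and encryptionKey is as in the package example):

opts := &estore.Options{
    EncryptionKey: encryptionKey,
    Levels: []estore.LevelOptions{{
        BlockSize:    32 << 10,               // 32 KiB target block size
        FilterPolicy: bloom.FilterPolicy(10), // ~10 bits per key
    }},
}
// Per the Options documentation, the options for the last specified level
// apply to all subsequent levels.
db, err := estore.Open("/data/db", opts)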
type Lock ¶
type Lock struct {
// contains filtered or unexported fields
}
Lock represents a file lock on a directory. It may be passed to Open through Options.Lock to elide lock acquisition during Open.
func LockDirectory ¶
LockDirectory acquires the database directory lock in the named directory, preventing another process from opening the database. LockDirectory returns a handle to the held lock that may be passed to Open through Options.Lock to subsequently open the database, skipping lock acquisition during Open.
LockDirectory may be used to expand the critical section protected by the database lock to include setup before the call to Open.
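A sketch of that expanded critical section (assuming LockDirectory takes (dir, fs) like GetVersion, and that vfs.Default exists as in Pebble; error handling abbreviated):

lock, err := estore.LockDirectory("/data/db", vfs.Default)
if err != nil {
    log.Fatal(err)
}
// ... setup work that must happen before any other process can open the DB ...
db, err := estore.Open("/data/db", &estore.Options{
    EncryptionKey: encryptionKey,
    Lock:          lock, // Open skips lock acquisition
})
if err != nil {
    log.Fatal(err)
}
defer db.Close() // closing does not release the lock; that remains the caller's job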
type LoggerAndTracer ¶
type LoggerAndTracer = base.LoggerAndTracer
LoggerAndTracer defines an interface for logging and tracing.
type ManifestCreateInfo ¶
type ManifestCreateInfo struct { // JobID is the ID of the job that caused the manifest to be created. JobID int Path string // The file number of the new Manifest. FileNum FileNum Err error }
ManifestCreateInfo contains info about a manifest creation event.
func (ManifestCreateInfo) SafeFormat ¶
func (i ManifestCreateInfo) SafeFormat(w redact.SafePrinter, _ rune)
SafeFormat implements redact.SafeFormatter.
func (ManifestCreateInfo) String ¶
func (i ManifestCreateInfo) String() string
type ManifestDeleteInfo ¶
type ManifestDeleteInfo struct { // JobID is the ID of the job that caused the Manifest to be deleted. JobID int Path string FileNum FileNum Err error }
ManifestDeleteInfo contains the info for a Manifest deletion event.
func (ManifestDeleteInfo) SafeFormat ¶
func (i ManifestDeleteInfo) SafeFormat(w redact.SafePrinter, _ rune)
SafeFormat implements redact.SafeFormatter.
func (ManifestDeleteInfo) String ¶
func (i ManifestDeleteInfo) String() string
type Metrics ¶
type Metrics struct { BlockCache CacheMetrics Compact struct { // The total number of compactions, and per-compaction type counts. Count int64 DefaultCount int64 DeleteOnlyCount int64 ElisionOnlyCount int64 MoveCount int64 ReadCount int64 RewriteCount int64 MultiLevelCount int64 CounterLevelCount int64 // An estimate of the number of bytes that need to be compacted for the LSM // to reach a stable state. EstimatedDebt uint64 // Number of bytes present in sstables being written by in-progress // compactions. This value will be zero if there are no in-progress // compactions. InProgressBytes int64 // Number of compactions that are in-progress. NumInProgress int64 // MarkedFiles is a count of files that are marked for // compaction. Such files are compacted in a rewrite compaction // when no other compactions are picked. MarkedFiles int // Duration records the cumulative duration of all compactions since the // database was opened. Duration time.Duration } Ingest struct { // The total number of ingestions. Count uint64 } Flush struct { // The total number of flushes. Count int64 WriteThroughput ThroughputMetric // Number of flushes that are in-progress. In the current implementation // this will always be zero or one. NumInProgress int64 // AsIngestCount is a monotonically increasing counter of flush operations // handling ingested tables. AsIngestCount uint64 // AsIngestTableCount is a monotonically increasing counter of tables ingested as // flushables. AsIngestTableCount uint64 // AsIngestBytes is a monotonically increasing counter of the bytes flushed // for flushables that originated as ingestion operations. AsIngestBytes uint64 } Filter FilterMetrics Levels [numLevels]LevelMetrics MemTable struct { // The number of bytes allocated by memtables and large (flushable) // batches. Size uint64 // The count of memtables. Count int64 // The number of bytes present in zombie memtables which are no longer // referenced by the current DB state. An unbounded number of memtables // may be zombie if they're still in use by an iterator. One additional // memtable may be zombie if it's no longer in use and waiting to be // recycled. ZombieSize uint64 // The count of zombie memtables. ZombieCount int64 } Keys struct { // The approximate count of internal range key set keys in the database. RangeKeySetsCount uint64 // The approximate count of internal tombstones (DEL, SINGLEDEL and // RANGEDEL key kinds) within the database. TombstoneCount uint64 // A cumulative total number of missized DELSIZED keys encountered by // compactions since the database was opened. MissizedTombstonesCount uint64 } Snapshots struct { // The number of currently open snapshots. Count int // The sequence number of the earliest, currently open snapshot. EarliestSeqNum uint64 // A running tally of keys written to sstables during flushes or // compactions that would've been elided if it weren't for open // snapshots. PinnedKeys uint64 // A running cumulative sum of the size of keys and values written to // sstables during flushes or compactions that would've been elided if // it weren't for open snapshots. PinnedSize uint64 } Table struct { // The number of bytes present in obsolete tables which are no longer // referenced by the current DB state or any open iterators. ObsoleteSize uint64 // The count of obsolete tables. ObsoleteCount int64 // The number of bytes present in zombie tables which are no longer // referenced by the current DB state but are still in use by an iterator. ZombieSize uint64 // The count of zombie tables.
ZombieCount int64 // The count of the backing sstables. BackingTableCount uint64 // The sum of the sizes of all of the backing sstables. BackingTableSize uint64 } TableCache CacheMetrics // Count of the number of open sstable iterators. TableIters int64 // Uptime is the total time since this DB was opened. Uptime time.Duration WAL struct { // Number of live WAL files. Files int64 // Number of obsolete WAL files. ObsoleteFiles int64 // Physical size of the obsolete WAL files. ObsoletePhysicalSize uint64 // Size of the live data in the WAL files. Note that with WAL file // recycling this is less than the actual on-disk size of the WAL files. Size uint64 // Physical size of the WAL files on-disk. With WAL file recycling, // this is greater than the live data in WAL files. PhysicalSize uint64 // Number of logical bytes written to the WAL. BytesIn uint64 // Number of bytes written to the WAL. BytesWritten uint64 } LogWriter struct { FsyncLatency prometheus.Histogram record.LogWriterMetrics } SecondaryCacheMetrics SecondaryCacheMetrics // contains filtered or unexported fields }
Metrics holds metrics for various subsystems of the DB such as the Cache, Compactions, WAL, and per-Level metrics.
TODO(peter): The testing of these metrics is relatively weak. There should be testing that performs various operations on a DB and verifies that the metrics reflect those operations.
func (*Metrics) DiskSpaceUsage ¶
DiskSpaceUsage returns the total disk space used by the database in bytes, including live and obsolete files.
func (*Metrics) NumVirtual ¶
NumVirtual is the number of virtual sstables in the latest version summed over every level in the LSM.
func (*Metrics) ReadAmp ¶
ReadAmp returns the current read amplification of the database. It's computed as the number of sublevels in L0 + the number of non-empty levels below L0.
func (*Metrics) SafeFormat ¶
func (m *Metrics) SafeFormat(w redact.SafePrinter, _ rune)
SafeFormat implements redact.SafeFormatter.
func (*Metrics) String ¶
String pretty-prints the metrics as below:
      |                             |       |       |   ingested   |     moved    |    written   |       |    amp
level | tables  size val-bl vtables | score |   in  | tables  size | tables  size | tables  size |  read |   r   w
------+-----------------------------+-------+-------+--------------+--------------+--------------+-------+---------
    0 |   101  102B     0B       0 | 103.0 |  104B |   112   104B |   113   106B |   221   217B |  107B |   1  2.1
    1 |   201  202B     0B       0 | 203.0 |  204B |   212   204B |   213   206B |   421   417B |  207B |   2  2.0
    2 |   301  302B     0B       0 | 303.0 |  304B |   312   304B |   313   306B |   621   617B |  307B |   3  2.0
    3 |   401  402B     0B       0 | 403.0 |  404B |   412   404B |   413   406B |   821   817B |  407B |   4  2.0
    4 |   501  502B     0B       0 | 503.0 |  504B |   512   504B |   513   506B |  1.0K  1017B |  507B |   5  2.0
    5 |   601  602B     0B       0 | 603.0 |  604B |   612   604B |   613   606B |  1.2K  1.2KB |  607B |   6  2.0
    6 |   701  702B     0B       0 |     - |  704B |   712   704B |   713   706B |  1.4K  1.4KB |  707B |   7  2.0
total |  2.8K  2.7KB    0B       0 |     - | 2.8KB |  2.9K  2.8KB |  2.9K  2.8KB |  5.7K  8.4KB | 2.8KB |  28  3.0
-------------------------------------------------------------------------------------------------------------------
WAL: 22 files (24B)  in: 25B  written: 26B (4% overhead)
Flushes: 8
Compactions: 5  estimated debt: 6B  in progress: 2 (7B)
             default: 27  delete: 28  elision: 29  move: 30  read: 31  rewrite: 32  multi-level: 33
MemTables: 12 (11B)  zombie: 14 (13B)
Zombie tables: 16 (15B)
Backing tables: 0 (0B)
Block cache: 2 entries (1B)  hit rate: 42.9%
Table cache: 18 entries (17B)  hit rate: 48.7%
Secondary cache: 40 entries (40B)  hit rate: 49.9%
Snapshots: 4  earliest seq num: 1024
Table iters: 21
Filter utility: 47.4%
Ingestions: 27  as flushable: 36 (34B in 35 tables)
func (*Metrics) StringForTests ¶
StringForTests is identical to m.String() on 64-bit platforms. It is used to provide a platform-independent result for tests.
func (*Metrics) Total ¶
func (m *Metrics) Total() LevelMetrics
Total returns the sum of the per-level metrics and WAL metrics.
func (*Metrics) VirtualSize ¶
VirtualSize is the sum of the sizes of the virtual sstables in the latest version. BackingTableSize - VirtualSize gives an estimate for the space amplification caused by not compacting virtual sstables.
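A sketch of inspecting a few of these metrics at runtime (assumes a db.Metrics() accessor as in Pebble):

m := db.Metrics()
fmt.Println("read amplification:", m.ReadAmp())
fmt.Println("disk space used (bytes):", m.DiskSpaceUsage())
fmt.Println("virtual sstables:", m.NumVirtual(), "totaling", m.VirtualSize(), "bytes")
fmt.Println(m) // full human-readable dump via String()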
type MultiLevelHeuristic ¶
type MultiLevelHeuristic interface {
// contains filtered or unexported methods
}
MultiLevelHeuristic evaluates whether to add files from the next level into the compaction.
type NoMultiLevel ¶
type NoMultiLevel struct{}
NoMultiLevel will never add an additional level to the compaction.
type Options ¶
type Options struct { // EncryptionKey is the master key for encryption at rest. Must be 16, 24, or 32 bytes. EncryptionKey []byte // Sync sstables periodically in order to smooth out writes to disk. This // option does not provide any persistency guarantee, but is used to avoid // latency spikes if the OS automatically decides to write out a large chunk // of dirty filesystem buffers. This option only controls SSTable syncs; WAL // syncs are controlled by WALBytesPerSync. // // The default value is 512KB. BytesPerSync int // Cache is used to cache uncompressed blocks from sstables. // // The default cache size is 8 MB. Cache *cache.Cache // Cleaner cleans obsolete files. // // The default cleaner uses the DeleteCleaner. Cleaner Cleaner // Local contains options that pertain to files stored on the local filesystem. Local struct { // ReadaheadConfigFn is a function used to retrieve the current readahead // mode. This function is consulted when a table enters the table cache. ReadaheadConfigFn func() ReadaheadConfig } // Comparer defines a total ordering over the space of []byte keys: a 'less // than' relationship. The same comparison algorithm must be used for reads // and writes over the lifetime of the DB. // // The default value uses the same ordering as bytes.Compare. Comparer *Comparer // DebugCheck is invoked, if non-nil, whenever a new version is being // installed. Typically, this is set to pebble.DebugCheckLevels in tests // or tools only, to check invariants over all the data in the database. DebugCheck func(*DB) error // Disable the write-ahead log (WAL). Disabling the write-ahead log prohibits // crash recovery, but can improve performance if crash recovery is not // needed (e.g. when only temporary state is being stored in the database). // // TODO(peter): untested DisableWAL bool // ErrorIfExists causes an error on Open if the database already exists. // The error can be checked with errors.Is(err, ErrDBAlreadyExists). // // The default value is false. ErrorIfExists bool // ErrorIfNotExists causes an error on Open if the database does not already // exist. The error can be checked with errors.Is(err, ErrDBDoesNotExist). // // The default value is false which will cause a database to be created if it // does not already exist. ErrorIfNotExists bool // ErrorIfNotPristine causes an error on Open if the database already exists // and any operations have been performed on the database. The error can be // checked with errors.Is(err, ErrDBNotPristine). // // Note that a database that contained keys that were all subsequently deleted // may or may not trigger the error. Currently, we check if there are any live // SSTs or log records to replay. ErrorIfNotPristine bool // EventListener provides hooks for listening to significant DB events such as // flushes, compactions, and table deletion. EventListener *EventListener // Experimental contains experimental options which are off by default. // These options are temporary and will eventually either be deleted, moved // out of the experimental group, or made the non-adjustable default. These // options may change at any time, so do not rely on them. Experimental struct { // The threshold of L0 read-amplification at which compaction concurrency // is enabled (if CompactionDebtConcurrency was not already exceeded). // Every multiple of this value enables another concurrent // compaction up to MaxConcurrentCompactions.
L0CompactionConcurrency int // CompactionDebtConcurrency controls the threshold of compaction debt // at which additional compaction concurrency slots are added. For every // multiple of this value in compaction debt bytes, an additional // concurrent compaction is added. This works "on top" of // L0CompactionConcurrency, so the higher of the count of compaction // concurrency slots as determined by the two options is chosen. CompactionDebtConcurrency uint64 // IngestSplit, if it returns true, allows for ingest-time splitting of // existing sstables into two virtual sstables to allow ingestion sstables to // slot into a lower level than they otherwise would have. IngestSplit func() bool // ReadCompactionRate controls the frequency of read triggered // compactions by adjusting `AllowedSeeks` in manifest.FileMetadata: // // AllowedSeeks = FileSize / ReadCompactionRate // // From LevelDB: // ``` // We arrange to automatically compact this file after // a certain number of seeks. Let's assume: // (1) One seek costs 10ms // (2) Writing or reading 1MB costs 10ms (100MB/s) // (3) A compaction of 1MB does 25MB of IO: // 1MB read from this level // 10-12MB read from next level (boundaries may be misaligned) // 10-12MB written to next level // This implies that 25 seeks cost the same as the compaction // of 1MB of data. I.e., one seek costs approximately the // same as the compaction of 40KB of data. We are a little // conservative and allow approximately one seek for every 16KB // of data before triggering a compaction. // ``` ReadCompactionRate int64 // ReadSamplingMultiplier is a multiplier for the readSamplingPeriod in // iterator.maybeSampleRead() to control the frequency of read sampling // to trigger a read triggered compaction. A value of -1 prevents sampling // and disables read triggered compactions. The default is 1 << 4, which // gets multiplied with a constant of 1 << 16 to yield 1 << 20 (1MB). ReadSamplingMultiplier int64 // TableCacheShards is the number of shards per table cache. // Reducing the value can reduce the number of idle goroutines per DB // instance which can be useful in scenarios with a lot of DB instances // and a large number of CPUs, but doing so can lead to higher contention // in the table cache and reduced performance. // // The default value is the number of logical CPUs, which can be // limited by runtime.GOMAXPROCS. TableCacheShards int // KeyValidationFunc is a function to validate a user key in an SSTable. // // Currently, this function is used to validate the smallest and largest // keys in an SSTable undergoing compaction. In this case, returning an // error from the validation function will result in a panic at runtime, // given that there is rarely any way of recovering from malformed keys // present in compacted files. By default, validation is not performed. // // Additional use-cases may be added in the future. // // NOTE: callers should take care to not mutate the key being validated. KeyValidationFunc func(userKey []byte) error // ValidateOnIngest schedules validation of sstables after they have // been ingested. // // By default, this value is false. ValidateOnIngest bool // LevelMultiplier configures the size multiplier used to determine the // desired size of each level of the LSM. Defaults to 10. LevelMultiplier int // MultiLevelCompactionHeuristic determines whether to add an additional // level to a conventional two level compaction. If nil, a multilevel // compaction will never get triggered.
MultiLevelCompactionHeuristic MultiLevelHeuristic // MaxWriterConcurrency is used to indicate the maximum number of // compression workers the compression queue is allowed to use. If // MaxWriterConcurrency > 0, then the Writer will use parallelism, to // compress and write blocks to disk. Otherwise, the writer will // compress and write blocks to disk synchronously. MaxWriterConcurrency int // ForceWriterParallelism is used to force parallelism in the sstable // Writer for the metamorphic tests. Even with the MaxWriterConcurrency // option set, we only enable parallelism in the sstable Writer if there // is enough CPU available, and this option bypasses that. ForceWriterParallelism bool // CPUWorkPermissionGranter should be set if Pebble should be given the // ability to optionally schedule additional CPU. See the documentation // for CPUWorkPermissionGranter for more details. CPUWorkPermissionGranter CPUWorkPermissionGranter // EnableValueBlocks is used to decide whether to enable writing // TableFormatPebblev3 sstables. This setting is only respected by a // specific subset of format major versions: FormatSSTableValueBlocks, // FormatFlushableIngest and FormatPrePebblev1MarkedCompacted. In lower // format major versions, value blocks are never enabled. In higher // format major versions, value blocks are always enabled. EnableValueBlocks func() bool // ShortAttributeExtractor is used iff EnableValueBlocks() returns true // (else ignored). If non-nil, a ShortAttribute can be extracted from the // value and stored with the key, when the value is stored elsewhere. ShortAttributeExtractor ShortAttributeExtractor // RequiredInPlaceValueBound specifies an optional span of user key // prefixes that are not-MVCC, but have a suffix. For these the values // must be stored with the key, since the concept of "older versions" is // not defined. It is also useful for statically known exclusions to value // separation. In CockroachDB, this will be used for the lock table key // space that has non-empty suffixes, but those locks don't represent // actual MVCC versions (the suffix ordering is arbitrary). We will also // need to add support for dynamically configured exclusions (we want the // default to be to allow Pebble to decide whether to separate the value // or not, hence this is structured as exclusions), for example, for users // of CockroachDB to dynamically exclude certain tables. // // Any change in exclusion behavior takes effect only on future written // sstables, and does not start rewriting existing sstables. // // Even ignoring changes in this setting, exclusions are interpreted as a // guidance by Pebble, and not necessarily honored. Specifically, user // keys with multiple Pebble-versions *may* have the older versions stored // in value blocks. RequiredInPlaceValueBound UserKeyPrefixBound // DisableIngestAsFlushable disables lazy ingestion of sstables through // a WAL write and memtable rotation. Only effectual if the format // major version is at least `FormatFlushableIngest`. DisableIngestAsFlushable func() bool // RemoteStorage enables use of remote storage (e.g. S3) for storing // sstables. Setting this option enables use of CreateOnShared option and // allows ingestion of external files. RemoteStorage remote.StorageFactory // If CreateOnShared is non-zero, new sstables are created on remote storage // (using CreateOnSharedLocator and with the appropriate // CreateOnSharedStrategy).
These sstables can be shared between different // Pebble instances; the lifecycle of such objects is managed by the // remote.Storage constructed by options.RemoteStorage. // // Can only be used when RemoteStorage is set (and recognizes // CreateOnSharedLocator). CreateOnShared remote.CreateOnSharedStrategy CreateOnSharedLocator remote.Locator // SecondaryCacheSizeBytes is the size of the on-disk block cache for objects // on shared storage in bytes. If it is 0, no cache is used. SecondaryCacheSizeBytes int64 // IneffectualSingleDeleteCallback is called in compactions/flushes if any // single delete is being elided without deleting a point set/merge. IneffectualSingleDeleteCallback func(userKey []byte) // SingleDeleteInvariantViolationCallback is called in compactions/flushes if any // single delete has consumed a Set/Merge, and there is another immediately older // Set/SetWithDelete/Merge. The user of Pebble has violated the invariant under // which SingleDelete can be used correctly. // // Consider the sequence SingleDelete#3, Set#2, Set#1. There are three // ways some of these keys can first meet in a compaction. // // - All 3 keys in the same compaction: this callback will detect the // violation. // // - SingleDelete#3, Set#2 meet in a compaction first: Both keys will // disappear. The violation will not be detected, and the DB will have // Set#1 which is likely incorrect (from the user's perspective). // // - Set#2, Set#1 meet in a compaction first: The output will be Set#2, // which will later be consumed by SingleDelete#3. The violation will // not be detected and the DB will be correct. SingleDeleteInvariantViolationCallback func(userKey []byte) } // Filters is a map from filter policy name to filter policy. It is used for // debugging tools which may be used on multiple databases configured with // different filter policies. It is not necessary to populate this filters // map during normal usage of a DB. Filters map[string]FilterPolicy // FlushDelayDeleteRange configures how long the database should wait before // forcing a flush of a memtable that contains a range deletion. Disk space // cannot be reclaimed until the range deletion is flushed. No automatic // flush occurs if zero. FlushDelayDeleteRange time.Duration // FlushDelayRangeKey configures how long the database should wait before // forcing a flush of a memtable that contains a range key. Range keys in // the memtable prevent lazy combined iteration, so it's desirable to flush // range keys promptly. No automatic flush occurs if zero. FlushDelayRangeKey time.Duration // FlushSplitBytes denotes the target number of bytes per sublevel in // each flush split interval (i.e. range between two flush split keys) // in L0 sstables. When set to zero, only a single sstable is generated // by each flush. When set to a non-zero value, flushes are split at // points to meet L0's TargetFileSize, any grandparent-related overlap // options, and at boundary keys of L0 flush split intervals (which are // targeted to contain around FlushSplitBytes bytes in each sublevel // between pairs of boundary keys). Splitting sstables during flush // allows increased compaction flexibility and concurrency when those // tables are compacted to lower levels. FlushSplitBytes int64 // FormatMajorVersion sets the format of on-disk files. It is // recommended to set the format major version to an explicit // version, as the default may change over time.
// // At Open, if the existing database is formatted using a later // format major version that is known to this version of Pebble, // Pebble will continue to use the later format major version. If // the existing database's version is unknown, the caller may use // FormatMostCompatible and will be able to open the database // regardless of its actual version. // // If the existing database is formatted using a format major // version earlier than the one specified, Open will automatically // ratchet the database to the specified format major version. FormatMajorVersion FormatMajorVersion // FS provides the interface for persistent file storage. // // The default value uses the underlying operating system's file system. FS vfs.FS // Lock, if set, must be a database lock acquired through LockDirectory for // the same directory passed to Open. If provided, Open will skip locking // the directory. Closing the database will not release the lock, and it's // the responsibility of the caller to release the lock after closing the // database. // // Open will enforce that the Lock passed locks the same directory passed to // Open. Concurrent calls to Open using the same Lock are detected and // prohibited. Lock *Lock // The count of L0 files necessary to trigger an L0 compaction. L0CompactionFileThreshold int // The amount of L0 read-amplification necessary to trigger an L0 compaction. L0CompactionThreshold int // Hard limit on L0 read-amplification, computed as the number of L0 // sublevels. Writes are stopped when this threshold is reached. L0StopWritesThreshold int // The maximum number of bytes for LBase. The base level is the level which // L0 is compacted into. The base level is determined dynamically based on // the existing data in the LSM. The maximum number of bytes for other levels // is computed dynamically based on the base level's maximum size. When the // maximum number of bytes for a level is exceeded, compaction is requested. LBaseMaxBytes int64 // Per-level options. Options for at least one level must be specified. The // options for the last level are used for all subsequent levels. Levels []LevelOptions // Logger used to write log messages. // // The default logger uses the Go standard library log package. Logger Logger // LoggerAndTracer is used for writing log messages and traces. LoggerAndTracer LoggerAndTracer // MaxManifestFileSize is the maximum size the MANIFEST file is allowed to // become. When the MANIFEST exceeds this size it is rolled over and a new // MANIFEST is created. MaxManifestFileSize int64 // MaxOpenFiles is a soft limit on the number of open files that can be // used by the DB. // // The default value is 1000. MaxOpenFiles int // The size of a MemTable in steady state. The actual MemTable size starts at // min(256KB, MemTableSize) and doubles for each subsequent MemTable up to // MemTableSize. This reduces the memory pressure caused by MemTables for // short lived (test) DB instances. Note that more than one MemTable can be // in existence since flushing a MemTable involves creating a new one and // writing the contents of the old one in the // background. MemTableStopWritesThreshold places a hard limit on the size of // the queued MemTables. // // The default value is 4MB. MemTableSize uint64 // Hard limit on the number of queued MemTables. Writes are stopped when // the sum of the queued memtable sizes exceeds: // MemTableStopWritesThreshold * MemTableSize.
// // This value should be at least 2 or writes will stop whenever a MemTable is // being flushed. // // The default value is 2. MemTableStopWritesThreshold int // Merger defines the associative merge operation to use for merging values // written with {Batch,DB}.Merge. // // The default merger concatenates values. Merger *Merger // MaxConcurrentCompactions specifies the maximum number of concurrent // compactions. The default is 1. Concurrent compactions are performed // - when L0 read-amplification passes the L0CompactionConcurrency threshold // - for automatic background compactions // - when a manual compaction for a level is split and parallelized // MaxConcurrentCompactions must be greater than 0. MaxConcurrentCompactions func() int // DisableAutomaticCompactions dictates whether automatic compactions are // scheduled or not. The default is false (enabled). This option is only used // externally when running a manual compaction, and internally for tests. DisableAutomaticCompactions bool // NoSyncOnClose decides whether the Pebble instance will enforce a // close-time synchronization (e.g., fdatasync() or sync_file_range()) // on files it writes to. Setting this to true removes the guarantee for a // sync on close. Some implementations can still issue a non-blocking sync. NoSyncOnClose bool // NumPrevManifest is the number of non-current or older manifests which // we want to keep around for debugging purposes. By default, we're going // to keep one older manifest. NumPrevManifest int // ReadOnly indicates that the DB should be opened in read-only mode. Writes // to the DB will return an error, background compactions are disabled, and // the flush that normally occurs after replaying the WAL at startup is // disabled. ReadOnly bool // TableCache is an initialized TableCache which should be set as an // option if the DB needs to be initialized with a pre-existing table cache. // If TableCache is nil, then a table cache which is unique to the DB instance // is created. TableCache can be shared between db instances by setting it here. // The TableCache set here must use the same underlying cache as Options.Cache // and pebble will panic otherwise. TableCache *TableCache // TablePropertyCollectors is a list of TablePropertyCollector creation // functions. A new TablePropertyCollector is created for each sstable built // and lives for the lifetime of the table. TablePropertyCollectors []func() TablePropertyCollector // BlockPropertyCollectors is a list of BlockPropertyCollector creation // functions. A new BlockPropertyCollector is created for each sstable // built and lives for the lifetime of writing that table. BlockPropertyCollectors []func() BlockPropertyCollector // WALBytesPerSync sets the number of bytes to write to a WAL before calling // Sync on it in the background. Just like with BytesPerSync above, this // helps smooth out disk write latencies, and avoids cases where the OS // writes a lot of buffered data to disk at once. However, this is less // necessary with WALs, as many write operations already pass in // Sync = true. // // The default value is 0, i.e. no background syncing. This matches the // default behaviour in RocksDB. WALBytesPerSync int // WALDir specifies the directory to store write-ahead logs (WALs) in. If // empty (the default), WALs will be stored in the same directory as sstables // (i.e. the directory passed to pebble.Open). WALDir string // WALMinSyncInterval is the minimum duration between syncs of the WAL. 
If // WAL syncs are requested faster than this interval, they will be // artificially delayed. Introducing a small artificial delay (500us) between // WAL syncs can allow more operations to arrive and reduce IO operations // while having a minimal impact on throughput. This option is supplied as a // closure in order to allow the value to be changed dynamically. The default // value is 0. // // TODO(peter): rather than a closure, should there be another mechanism for // changing options dynamically? WALMinSyncInterval func() time.Duration // TargetByteDeletionRate is the rate (in bytes per second) at which sstable file // deletions are limited to (under normal circumstances). // // Deletion pacing is used to slow down deletions when compactions finish up // or readers close and newly-obsolete files need cleaning up. Deleting lots // of files at once can cause disk latency to go up on some SSDs, which this // functionality guards against. // // This value is only a best-effort target; the effective rate can be // higher if deletions are falling behind or disk space is running low. // // Setting this to 0 disables deletion pacing, which is also the default. TargetByteDeletionRate int // contains filtered or unexported fields }
Options holds the optional parameters for configuring pebble. These options apply to the DB at large; per-query options are defined by the IterOptions and WriteOptions types.
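To make the shape of the struct concrete, a sketch of a typical configuration (all values illustrative, not recommendations):

opts := &estore.Options{
    EncryptionKey:         encryptionKey, // required: 16, 24, or 32 bytes
    MemTableSize:          64 << 20,      // 64 MiB steady-state memtables
    L0CompactionThreshold: 4,
    L0StopWritesThreshold: 12,
    MaxOpenFiles:          1000,
}
db, err := estore.Open("/data/db", opts)
if err != nil {
    log.Fatal(err)
}
defer db.Close()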
func (*Options) AddEventListener ¶
func (o *Options) AddEventListener(l EventListener)
AddEventListener adds the provided event listener to the Options, in addition to any existing event listener.
func (*Options) Check ¶
Check verifies the options are compatible with the previous options serialized by Options.String(). For example, the Comparer and Merger must be the same, or data will not be able to be properly read from the DB.
func (*Options) EnsureDefaults ¶
EnsureDefaults ensures that the default values for all options are set if a valid value was not already specified. Returns the new options.
func (*Options) Level ¶
func (o *Options) Level(level int) LevelOptions
Level returns the LevelOptions for the specified level.
func (*Options) MakeReaderOptions ¶
func (o *Options) MakeReaderOptions() sstable.ReaderOptions
MakeReaderOptions constructs sstable.ReaderOptions from the corresponding options in the receiver.
func (*Options) MakeWriterOptions ¶
func (o *Options) MakeWriterOptions(level int, format sstable.TableFormat) sstable.WriterOptions
MakeWriterOptions constructs sstable.WriterOptions for the specified level from the corresponding options in the receiver.
func (*Options) Parse ¶
func (o *Options) Parse(s string, hooks *ParseHooks) error
Parse parses the options from the specified string. Note that certain options cannot be parsed into populated fields. For example, comparer and merger.
func (*Options) Validate ¶
Validate verifies that the options are mutually consistent. For example, L0StopWritesThreshold must be >= L0CompactionThreshold, otherwise a write stall would persist indefinitely.
func (*Options) WithFSDefaults ¶
WithFSDefaults configures the Options to wrap the configured filesystem with the default virtual file system middleware, like disk-health checking.
type ParseHooks ¶
type ParseHooks struct { NewCache func(size int64) *Cache NewCleaner func(name string) (Cleaner, error) NewComparer func(name string) (*Comparer, error) NewFilterPolicy func(name string) (FilterPolicy, error) NewMerger func(name string) (*Merger, error) SkipUnknown func(name, value string) bool }
ParseHooks contains callbacks to create options fields which can have user-defined implementations.
type RangeKeyData ¶
RangeKeyData describes a range key's data, set through RangeKeySet. The key boundaries of the range key are provided by Iterator.RangeBounds.
type RangeKeyIteratorStats ¶
type RangeKeyIteratorStats struct { // Count records the number of range keys encountered during // iteration. Range keys may be counted multiple times if the iterator // leaves a range key's bounds and then returns. Count int // ContainedPoints records the number of point keys encountered within the // bounds of a range key. Note that this includes point keys with suffixes // that sort both above and below the covering range key's suffix. ContainedPoints int // SkippedPoints records the count of the subset of ContainedPoints point // keys that were skipped during iteration due to range-key masking. It does // not include point keys that were never loaded because a // RangeKeyMasking.Filter excluded the entire containing block. SkippedPoints int }
RangeKeyIteratorStats contains miscellaneous stats about range keys encountered by the iterator.
func (*RangeKeyIteratorStats) Merge ¶
func (s *RangeKeyIteratorStats) Merge(o RangeKeyIteratorStats)
Merge adds all of the argument's statistics to the receiver. It may be used to accumulate stats across multiple iterators.
func (*RangeKeyIteratorStats) SafeFormat ¶
func (s *RangeKeyIteratorStats) SafeFormat(p redact.SafePrinter, verb rune)
SafeFormat implements the redact.SafeFormatter interface.
func (*RangeKeyIteratorStats) String ¶
func (s *RangeKeyIteratorStats) String() string
type RangeKeyMasking ¶
type RangeKeyMasking struct { // Suffix configures which range keys may mask point keys. Only range keys // that are defined at suffixes greater than or equal to Suffix will mask // point keys. Suffix []byte // Filter is an optional field that may be used to improve performance of // range-key masking through a block-property filter defined over key // suffixes. If non-nil, Filter is called by Pebble to construct a // block-property filter mask at iterator creation. The filter is used to // skip whole point-key blocks containing point keys with suffixes greater // than a covering range-key's suffix. // // To use this functionality, the caller must create and configure (through // Options.BlockPropertyCollectors) a block-property collector that records // the maximum suffix contained within a block. The caller then must write // and provide a BlockPropertyFilterMask implementation on that same // property. See the BlockPropertyFilterMask type for more information. Filter func() BlockPropertyFilterMask }
RangeKeyMasking configures automatic hiding of point keys by range keys. A non-nil Suffix enables range-key masking. When enabled, range keys with suffixes ≥ Suffix behave as masks. All point keys that are contained within a masking range key's bounds and have suffixes greater than the range key's suffix are automatically skipped.
Specifically, when configured with a RangeKeyMasking.Suffix _s_, and there exists a range key with suffix _r_ covering a point key with suffix _p_, and
_s_ ≤ _r_ < _p_
then the point key is elided.
Range-key masking may only be used when iterating over both point keys and range keys with IterKeyTypePointsAndRanges.
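A sketch of enabling masking at iterator creation (the IterOptions field names here are assumed to mirror Pebble's):

iter, err := db.NewIter(&estore.IterOptions{
    KeyTypes: estore.IterKeyTypePointsAndRanges, // masking requires both key types
    RangeKeyMasking: estore.RangeKeyMasking{
        // Only range keys with suffixes >= "@5" act as masks.
        Suffix: []byte("@5"),
    },
})
if err != nil {
    log.Fatal(err)
}
defer iter.Close()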
type ReadaheadConfig ¶
type ReadaheadConfig = objstorageprovider.ReadaheadConfig
ReadaheadConfig controls the use of read-ahead.
type Reader ¶
type Reader interface { // Get gets the value for the given key. It returns ErrNotFound if the DB // does not contain the key. // // The caller should not modify the contents of the returned slice, but it is // safe to modify the contents of the argument after Get returns. The // returned slice will remain valid until the returned Closer is closed. On // success, the caller MUST call closer.Close() or a memory leak will occur. Get(key []byte) (value []byte, closer io.Closer, err error) // NewIter returns an iterator that is unpositioned (Iterator.Valid() will // return false). The iterator can be positioned via a call to SeekGE, // SeekLT, First or Last. NewIter(o *IterOptions) (*Iterator, error) // Close closes the Reader. It may or may not close any underlying io.Reader // or io.Writer, depending on how the DB was created. // // It is not safe to close a DB until all outstanding iterators are closed. // It is valid to call Close multiple times. Other methods should not be // called after the DB has been closed. Close() error }
Reader is a readable key/value store.
It is safe to call Get and NewIter from concurrent goroutines.
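Because types such as *Batch and *Snapshot expose exactly these methods (see their entries in this document), read paths can be written once against the interface. A sketch:

// lookup copies the value out so the closer can be released before returning.
func lookup(r estore.Reader, key []byte) (string, error) {
    value, closer, err := r.Get(key)
    if err != nil {
        return "", err // includes ErrNotFound
    }
    defer closer.Close()
    return string(value), nil // string() copies before Close invalidates value
}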
type SSTableInfo ¶
type SSTableInfo struct { manifest.TableInfo // Virtual indicates whether the sstable is virtual. Virtual bool // BackingSSTNum is the file number associated with backing sstable which // backs the sstable associated with this SSTableInfo. If Virtual is false, // then BackingSSTNum == FileNum. BackingSSTNum base.FileNum // BackingType is the type of storage backing this sstable. BackingType BackingType // Locator is the remote.Locator backing this sstable, if the backing type is // not BackingTypeLocal. Locator remote.Locator // Properties is the sstable properties of this table. If Virtual is true, // then the Properties are associated with the backing sst. Properties *sstable.Properties }
SSTableInfo exports manifest.TableInfo with sstable.Properties alongside other file-backing info.
type SSTablesOption ¶
type SSTablesOption func(*sstablesOptions)
SSTablesOption sets an optional parameter used by `DB.SSTables`.
func WithApproximateSpanBytes ¶
func WithApproximateSpanBytes() SSTablesOption
WithApproximateSpanBytes enables capturing the approximate number of bytes that overlap the provided key span for each sstable. NOTE: this option can only be used together with WithKeyRangeFilter and WithProperties.
func WithKeyRangeFilter ¶
func WithKeyRangeFilter(start, end []byte) SSTablesOption
WithKeyRangeFilter ensures that returned sstables overlap the range [start, end) (end-exclusive). If start and end are both nil, the filter has no effect.
func WithProperties ¶
func WithProperties() SSTablesOption
WithProperties enables returning sstable properties in each TableInfo.
NOTE: if most of the sstable properties need to be read from disk, this option may make the `SSTables` method quite slow.
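A sketch of combining these options (assuming DB.SSTables returns per-level slices of SSTableInfo, as in Pebble):

levels, err := db.SSTables(
    estore.WithKeyRangeFilter([]byte("a"), []byte("m")), // end-exclusive
    estore.WithProperties(), // required by WithApproximateSpanBytes
    estore.WithApproximateSpanBytes(),
)
if err != nil {
    log.Fatal(err)
}
for level, tables := range levels {
    fmt.Printf("L%d: %d overlapping sstables\n", level, len(tables))
}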
type ScanStatisticsOptions ¶
type ScanStatisticsOptions struct { // LimitBytesPerSecond indicates the number of bytes that are able to be read // per second using ScanInternal. // A value of 0 indicates that there is no limit set. LimitBytesPerSecond int64 }
ScanStatisticsOptions is used by DB.ScanStatistics.
type SecondaryCacheMetrics ¶
type SecondaryCacheMetrics = sharedcache.Metrics
SecondaryCacheMetrics holds metrics for the persistent secondary cache that caches commonly accessed blocks from blob storage on a local file system.
type SharedSSTMeta ¶
type SharedSSTMeta struct { // Backing is the remote object backing this sstable; it can be attached to an // objstorage.Provider. Backing objstorage.RemoteObjectBackingHandle // contains filtered or unexported fields }
SharedSSTMeta represents an sstable on shared storage that can be ingested by another pebble instance. This struct must contain all fields that are required for a Pebble instance to ingest a foreign sstable on shared storage, including constructing any relevant objstorage.Provider / remoteobjcat.Catalog data structures, as well as creating virtual FileMetadatas.
Note that the Pebble instance creating and returning a SharedSSTMeta might not be the one that created the underlying sstable on shared storage to begin with; it's possible for a Pebble instance to reshare an sstable that was shared to it.
type ShortAttribute ¶
type ShortAttribute = base.ShortAttribute
ShortAttribute exports the base.ShortAttribute type.
type ShortAttributeExtractor ¶
type ShortAttributeExtractor = base.ShortAttributeExtractor
ShortAttributeExtractor exports the base.ShortAttributeExtractor type.
type Snapshot ¶
type Snapshot struct {
// contains filtered or unexported fields
}
Snapshot provides a read-only point-in-time view of the DB state.
func (*Snapshot) Close ¶
Close closes the snapshot, releasing its resources. Close must be called. Failure to do so will result in a tiny memory leak and a large leak of resources on disk due to the entries the snapshot is preventing from being deleted.
d.mu must NOT be held by the caller.
func (*Snapshot) Get ¶
Get gets the value for the given key. It returns ErrNotFound if the Snapshot does not contain the key.
The caller should not modify the contents of the returned slice, but it is safe to modify the contents of the argument after Get returns. The returned slice will remain valid until the returned Closer is closed. On success, the caller MUST call closer.Close() or a memory leak will occur.
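A sketch of point-in-time reads (assumes a db.NewSnapshot() constructor as in Pebble, and that "k" was written before the snapshot was taken):

snap := db.NewSnapshot()
// Writes after this point are invisible to snap.
if err := db.Set([]byte("k"), []byte("v2"), nil); err != nil {
    log.Fatal(err)
}
value, closer, err := snap.Get([]byte("k")) // sees the pre-snapshot value
if err != nil {
    log.Fatal(err)
}
fmt.Printf("%s\n", value)
if err := closer.Close(); err != nil {
    log.Fatal(err)
}
if err := snap.Close(); err != nil { // required; see above
    log.Fatal(err)
}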
func (*Snapshot) NewIter ¶
func (s *Snapshot) NewIter(o *IterOptions) (*Iterator, error)
NewIter returns an iterator that is unpositioned (Iterator.Valid() will return false). The iterator can be positioned via a call to SeekGE, SeekLT, First or Last.
func (*Snapshot) NewIterWithContext ¶
NewIterWithContext is like NewIter, and additionally accepts a context for tracing.
func (*Snapshot) ScanInternal ¶
func (s *Snapshot) ScanInternal( ctx context.Context, lower, upper []byte, visitPointKey func(key *InternalKey, value LazyValue, iterInfo IteratorLevel) error, visitRangeDel func(start, end []byte, seqNum uint64) error, visitRangeKey func(start, end []byte, keys []rangekey.Key) error, visitSharedFile func(sst *SharedSSTMeta) error, ) error
ScanInternal scans all internal keys within the specified bounds, truncating any rangedels and rangekeys to those bounds. For use when an external user needs to be aware of all internal keys that make up a key range.
See comment on db.ScanInternal for the behaviour that can be expected of point keys deleted by range dels and keys masked by range keys.
type TableCache ¶
type TableCache struct {
// contains filtered or unexported fields
}
TableCache is a shareable cache for open sstables.
func NewTableCache ¶
func NewTableCache(cache *Cache, numShards int, size int) *TableCache
NewTableCache will create a reference to the table cache. It is the caller's responsibility to call tableCache.Unref when they no longer hold a reference to the table cache.
func (*TableCache) Ref ¶
func (c *TableCache) Ref()
Ref adds a reference to the table cache. Once tableCache.init returns, the table cache only remains valid if there is at least one reference to it.
func (*TableCache) Unref ¶
func (c *TableCache) Unref() error
Unref removes a reference to the table cache.
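A sketch of sharing one table cache across DB instances (constructor per NewTableCache above; both Options must reference the same block cache, per the TableCache option's documentation; key1 and key2 are hypothetical encryption keys):

c := estore.NewCache(128 << 20) // shared block cache, 128 MiB
defer c.Unref()                 // assumes the ref-counted cache API from Pebble
tc := estore.NewTableCache(c, 8 /* numShards */, 1000 /* size */)
opts1 := &estore.Options{Cache: c, TableCache: tc, EncryptionKey: key1}
opts2 := &estore.Options{Cache: c, TableCache: tc, EncryptionKey: key2}
// ... open and use both DBs with opts1 and opts2 ...
if err := tc.Unref(); err != nil { // drop our reference when done
    log.Fatal(err)
}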
type TableCreateInfo ¶
type TableCreateInfo struct { JobID int // Reason is the reason for the table creation: "compacting", "flushing", or // "ingesting". Reason string Path string FileNum FileNum }
TableCreateInfo contains the info for a table creation event.
func (TableCreateInfo) SafeFormat ¶
func (i TableCreateInfo) SafeFormat(w redact.SafePrinter, _ rune)
SafeFormat implements redact.SafeFormatter.
func (TableCreateInfo) String ¶
func (i TableCreateInfo) String() string
type TableDeleteInfo ¶
TableDeleteInfo contains the info for a table deletion event.
func (TableDeleteInfo) SafeFormat ¶
func (i TableDeleteInfo) SafeFormat(w redact.SafePrinter, _ rune)
SafeFormat implements redact.SafeFormatter.
func (TableDeleteInfo) String ¶
func (i TableDeleteInfo) String() string
type TableIngestInfo ¶
type TableIngestInfo struct { // JobID is the ID of the job that caused the table to be ingested. JobID int Tables []struct { TableInfo Level int } // GlobalSeqNum is the sequence number that was assigned to all entries in // the ingested table. GlobalSeqNum uint64 Err error // contains filtered or unexported fields }
TableIngestInfo contains the info for a table ingestion event.
func (TableIngestInfo) SafeFormat ¶
func (i TableIngestInfo) SafeFormat(w redact.SafePrinter, _ rune)
SafeFormat implements redact.SafeFormatter.
func (TableIngestInfo) String ¶
func (i TableIngestInfo) String() string
type TablePropertyCollector ¶
type TablePropertyCollector = sstable.TablePropertyCollector
TablePropertyCollector exports the sstable.TablePropertyCollector type.
type TableStatsInfo ¶
type TableStatsInfo struct { // JobID is the ID of the job that finished loading the initial tables' // stats. JobID int }
TableStatsInfo contains the info for a table stats loaded event.
func (TableStatsInfo) SafeFormat ¶
func (i TableStatsInfo) SafeFormat(w redact.SafePrinter, _ rune)
SafeFormat implements redact.SafeFormatter.
func (TableStatsInfo) String ¶
func (i TableStatsInfo) String() string
type TableValidatedInfo ¶
type TableValidatedInfo struct {
	JobID int
	Meta  *fileMetadata
}
TableValidatedInfo contains information on the result of a validation run on an sstable.
func (TableValidatedInfo) SafeFormat ¶
func (i TableValidatedInfo) SafeFormat(w redact.SafePrinter, _ rune)
SafeFormat implements redact.SafeFormatter.
func (TableValidatedInfo) String ¶
func (i TableValidatedInfo) String() string
type ThroughputMetric ¶
type ThroughputMetric = base.ThroughputMetric
ThroughputMetric is a cumulative throughput metric. See the detailed comment in base.
type Transaction ¶
type Transaction struct { *Batch // contains filtered or unexported fields }
Transaction is a database transaction.
Transactions must be closed by calling Close or Commit when they are no longer needed. You must not perform non-transactional write operations while a write transaction is active.
Example ¶
package main

import (
	"crypto/rand"
	"fmt"
	"log"

	"github.com/edgelesssys/estore"
	"github.com/edgelesssys/estore/vfs"
)

func main() {
	encryptionKey := make([]byte, 16)
	_, err := rand.Read(encryptionKey)
	if err != nil {
		log.Fatal(err)
	}

	db, err := estore.Open("", &estore.Options{EncryptionKey: encryptionKey, FS: vfs.NewMem()})
	if err != nil {
		panic(err)
	}

	// Write key-value pairs in a write transaction.
	tx := db.NewTransaction(true)
	defer tx.Close()
	if err := tx.Set([]byte("key1"), []byte("value1"), nil); err != nil {
		panic(err)
	}
	if err := tx.Set([]byte("key2"), []byte("value2"), nil); err != nil {
		panic(err)
	}
	if err := tx.Commit(); err != nil {
		panic(err)
	}

	// Read the values back.
	tx = db.NewTransaction(false)
	defer tx.Close()
	val, closer, err := tx.Get([]byte("key1"))
	if err != nil {
		panic(err)
	}
	defer closer.Close()
	fmt.Println(string(val))
	val, closer, err = tx.Get([]byte("key2"))
	if err != nil {
		panic(err)
	}
	defer closer.Close()
	fmt.Println(string(val))
}
Output: value1 value2
func (*Transaction) Close ¶
func (t *Transaction) Close()
Close closes the transaction without committing it.
It is valid but not required to call Close after Commit.
func (*Transaction) Commit ¶
func (t *Transaction) Commit() error
Commit commits and closes the transaction.
func (*Transaction) Get ¶
func (t *Transaction) Get(key []byte) ([]byte, io.Closer, error)
Get gets the value for the given key. It returns ErrNotFound if the key is not found.
The caller should not modify the contents of the returned slice, but it is safe to modify the contents of the argument after Get returns. The returned slice will remain valid until the returned Closer is closed. On success, the caller MUST call closer.Close() or a memory leak will occur.
func (*Transaction) NewIter ¶
func (t *Transaction) NewIter(o *IterOptions) *Iterator
NewIter returns an iterator that is unpositioned (Iterator.Valid() will return false). The iterator can be positioned via a call to SeekGE, SeekLT, First or Last.
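A sketch of a bounded scan inside a read-only transaction (the key bounds are illustrative, and the IterOptions bounds are assumed to be half-open as elsewhere in the API):

tx := db.NewTransaction(false) // read-only
defer tx.Close()

iter := tx.NewIter(&estore.IterOptions{
	LowerBound: []byte("key1"),
	UpperBound: []byte("key3"), // scans [key1, key3)
})
for valid := iter.First(); valid; valid = iter.Next() {
	fmt.Printf("%s = %s\n", iter.Key(), iter.Value())
}
if err := iter.Close(); err != nil {
	log.Fatal(err)
}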
func (*Transaction) NewIterWithContext ¶
func (t *Transaction) NewIterWithContext(ctx context.Context, o *IterOptions) *Iterator
NewIterWithContext is like NewIter, and additionally accepts a context for tracing.
type UserKeyPrefixBound ¶
type UserKeyPrefixBound = sstable.UserKeyPrefixBound
UserKeyPrefixBound exports the sstable.UserKeyPrefixBound type.
type ValueMerger ¶
type ValueMerger = base.ValueMerger
ValueMerger exports the base.ValueMerger type.
type WALCreateInfo ¶
type WALCreateInfo struct {
	// JobID is the ID of the job that caused the WAL to be created.
	JobID int
	Path  string
	// The file number of the new WAL.
	FileNum FileNum
	// The file number of a previous WAL which was recycled to create this
	// one. Zero if recycling did not take place.
	RecycledFileNum FileNum
	Err             error
}
WALCreateInfo contains info about a WAL creation event.
func (WALCreateInfo) SafeFormat ¶
func (i WALCreateInfo) SafeFormat(w redact.SafePrinter, _ rune)
SafeFormat implements redact.SafeFormatter.
func (WALCreateInfo) String ¶
func (i WALCreateInfo) String() string
type WALDeleteInfo ¶
type WALDeleteInfo struct {
	// JobID is the ID of the job that caused the WAL to be deleted.
	JobID   int
	Path    string
	FileNum FileNum
	Err     error
}
WALDeleteInfo contains the info for a WAL deletion event.
func (WALDeleteInfo) SafeFormat ¶
func (i WALDeleteInfo) SafeFormat(w redact.SafePrinter, _ rune)
SafeFormat implements redact.SafeFormatter.
func (WALDeleteInfo) String ¶
func (i WALDeleteInfo) String() string
type WriteAmpHeuristic ¶
type WriteAmpHeuristic struct {
	// AddPropensity is a constant that affects the propensity to conduct
	// multilevel compactions. If positive, a multilevel compaction may get
	// picked even if the single-level compaction has lower write amp, and
	// vice versa.
	AddPropensity float64
	// AllowL0, if true, allows L0 to be involved in a multilevel compaction.
	AllowL0 bool
}
WriteAmpHeuristic defines a multilevel compaction heuristic that adds an additional level to the picked compaction if doing so reduces the predicted write amp of the compaction plus the AddPropensity constant.
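In code, the rule can be paraphrased as follows; pickMultiLevel and its write-amp inputs are purely hypothetical illustrations, not part of the API:

// pickMultiLevel paraphrases the heuristic: prefer the multilevel compaction
// when its predicted write amp beats the single-level compaction's predicted
// write amp plus AddPropensity. Hypothetical helper for illustration only.
func pickMultiLevel(h estore.WriteAmpHeuristic, singleWA, multiWA float64, involvesL0 bool) bool {
	if involvesL0 && !h.AllowL0 {
		return false // L0 may participate only when AllowL0 is set
	}
	return multiWA < singleWA+h.AddPropensity
}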
type WriteOptions ¶
type WriteOptions struct {
	// Sync is whether to sync writes through the OS buffer cache and down onto
	// the actual disk, if applicable. Setting Sync is required for durability of
	// individual write operations but can result in slower writes.
	//
	// If false, and the process or machine crashes, then a recent write may be
	// lost. This is due to the recently written data being buffered inside the
	// process running Pebble. This differs from the semantics of a write system
	// call, in which the data is buffered in the OS buffer cache and would thus
	// survive a process crash.
	//
	// The default value is true.
	Sync bool
}
WriteOptions holds the optional per-query parameters for Set and Delete operations.
Like Options, a nil *WriteOptions is valid and means to use the default values.
func (*WriteOptions) GetSync ¶
func (o *WriteOptions) GetSync() bool
GetSync returns the Sync value or true if the receiver is nil.
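For example (estore.Sync appears in the package example; the explicit literal below shows an unsynced write):

// A nil *WriteOptions is valid and defaults to synced writes:
var o *estore.WriteOptions
fmt.Println(o.GetSync()) // prints: true

// Trading durability for speed with an explicitly unsynced write:
unsynced := &estore.WriteOptions{Sync: false}
if err := db.Set([]byte("k"), []byte("v"), unsynced); err != nil {
	log.Fatal(err)
}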
type WriteStallBeginInfo ¶
type WriteStallBeginInfo struct {
Reason string
}
WriteStallBeginInfo contains the info for a write stall begin event.
func (WriteStallBeginInfo) SafeFormat ¶
func (i WriteStallBeginInfo) SafeFormat(w redact.SafePrinter, _ rune)
SafeFormat implements redact.SafeFormatter.
func (WriteStallBeginInfo) String ¶
func (i WriteStallBeginInfo) String() string
type Writer ¶
type Writer interface {
	// Apply the operations contained in the batch to the DB.
	//
	// It is safe to modify the contents of the arguments after Apply returns.
	Apply(batch *Batch, o *WriteOptions) error

	// Delete deletes the value for the given key. Deletes are blind and will
	// succeed even if the given key does not exist.
	//
	// It is safe to modify the contents of the arguments after Delete returns.
	Delete(key []byte, o *WriteOptions) error

	// DeleteSized behaves identically to Delete, but takes an additional
	// argument indicating the size of the value being deleted. DeleteSized
	// should be preferred when the caller has the expectation that there exists
	// a single internal KV pair for the key (eg, the key has not been
	// overwritten recently), and the caller knows the size of its value.
	//
	// DeleteSized will record the value size within the tombstone and use it to
	// inform compaction-picking heuristics which strive to reduce space
	// amplification in the LSM. This "calling your shot" mechanic allows the
	// storage engine to more accurately estimate and reduce space
	// amplification.
	//
	// It is safe to modify the contents of the arguments after DeleteSized
	// returns.
	DeleteSized(key []byte, valueSize uint32, _ *WriteOptions) error

	// SingleDelete is similar to Delete in that it deletes the value for the
	// given key. Like Delete, it is a blind operation that will succeed even
	// if the given key does not exist.
	//
	// WARNING: Undefined (non-deterministic) behavior will result if a key is
	// overwritten and then deleted using SingleDelete. The record may appear
	// deleted immediately, but be resurrected at a later time after compactions
	// have been performed. Or the record may be deleted permanently. A Delete
	// operation lays down a "tombstone" which shadows all previous versions of
	// a key. The SingleDelete operation is akin to "anti-matter" and will only
	// delete the most recently written version for a key. These different
	// semantics allow the DB to avoid propagating a SingleDelete operation
	// during a compaction as soon as the corresponding Set operation is
	// encountered. These semantics require extreme care to handle properly.
	// Only use if you have a workload where the performance gain is critical
	// and you can guarantee that a record is written once and then deleted once.
	//
	// SingleDelete is internally transformed into a Delete if the most recent
	// record for a key is either a Merge or Delete record.
	//
	// It is safe to modify the contents of the arguments after SingleDelete
	// returns.
	SingleDelete(key []byte, o *WriteOptions) error

	// DeleteRange deletes all of the point keys (and values) in the range
	// [start,end) (inclusive on start, exclusive on end). DeleteRange does NOT
	// delete overlapping range keys (eg, keys set via RangeKeySet).
	//
	// It is safe to modify the contents of the arguments after DeleteRange
	// returns.
	DeleteRange(start, end []byte, o *WriteOptions) error

	// LogData adds the specified data to the batch. The data will be written
	// to the WAL, but not added to memtables or sstables. Log data is never
	// indexed, which makes it useful for testing WAL performance.
	//
	// It is safe to modify the contents of the argument after LogData returns.
	LogData(data []byte, opts *WriteOptions) error

	// Merge merges the value for the given key. The details of the merge are
	// dependent upon the configured merge operation.
	//
	// It is safe to modify the contents of the arguments after Merge returns.
	Merge(key, value []byte, o *WriteOptions) error

	// Set sets the value for the given key. It overwrites any previous value
	// for that key; a DB is not a multi-map.
	//
	// It is safe to modify the contents of the arguments after Set returns.
	Set(key, value []byte, o *WriteOptions) error

	// RangeKeySet sets a range key mapping the key range [start, end) at the
	// MVCC timestamp suffix to value. The suffix is optional. If any portion of
	// the key range [start, end) is already set by a range key with the same
	// suffix value, RangeKeySet overrides it.
	//
	// It is safe to modify the contents of the arguments after RangeKeySet
	// returns.
	RangeKeySet(start, end, suffix, value []byte, opts *WriteOptions) error

	// RangeKeyUnset removes a range key mapping the key range [start, end) at
	// the MVCC timestamp suffix. The suffix may be omitted to remove an
	// unsuffixed range key. RangeKeyUnset only removes portions of range keys
	// that fall within the [start, end) key span, and only range keys with
	// suffixes that exactly match the unset suffix.
	//
	// It is safe to modify the contents of the arguments after RangeKeyUnset
	// returns.
	RangeKeyUnset(start, end, suffix []byte, opts *WriteOptions) error

	// RangeKeyDelete deletes all of the range keys in the range [start,end)
	// (inclusive on start, exclusive on end). It does not delete point keys
	// (for that, use DeleteRange). RangeKeyDelete removes all range keys within
	// the bounds, including those with or without suffixes.
	//
	// It is safe to modify the contents of the arguments after RangeKeyDelete
	// returns.
	RangeKeyDelete(start, end []byte, opts *WriteOptions) error
}
Writer is a writable key/value store.
Goroutine safety is dependent on the specific implementation.
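Helpers can therefore be written against the interface; the sketch below assumes, consistent with the method sets documented above, that both *estore.DB and *estore.Batch satisfy Writer (seedDefaults and its keys are hypothetical):

// seedDefaults writes a marker key and clears stale temporary entries
// through any Writer implementation.
func seedDefaults(w estore.Writer, opts *estore.WriteOptions) error {
	if err := w.Set([]byte("config/mode"), []byte("default"), opts); err != nil {
		return err
	}
	// DeleteRange clears point keys in the half-open range [tmp/, tmp0).
	return w.DeleteRange([]byte("tmp/"), []byte("tmp0"), opts)
}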
Source Files ¶
- batch.go
- cache.go
- checkpoint.go
- cleaner.go
- commit.go
- compaction.go
- compaction_iter.go
- compaction_picker.go
- comparer.go
- db.go
- error_iter.go
- event.go
- external_iterator.go
- filenames.go
- flushable.go
- format_major_version.go
- get_iter.go
- ingest.go
- internal.go
- iterator.go
- level_checker.go
- level_iter.go
- log_recycler.go
- logger.go
- mem_table.go
- merger.go
- merging_iter.go
- merging_iter_heap.go
- metrics.go
- open.go
- options.go
- pacer.go
- range_keys.go
- read_compaction_queue.go
- read_state.go
- scan_internal.go
- snapshot.go
- table_cache.go
- table_stats.go
- transaction.go
- version_set.go
Directories ¶
Path | Synopsis
---|---
bloom | Package bloom implements Bloom filters.
cmd |
internal |
internal/cache | Package cache implements the CLOCK-Pro caching algorithm.
internal/crc | Package crc implements the checksum algorithm used throughout pebble.
internal/datatest | Package datatest provides common datadriven test commands for use outside of the root Pebble package.
internal/dsl | Package dsl provides facilities for parsing lisp-like domain-specific languages (DSL).
internal/keyspan | Package keyspan provides facilities for sorting, fragmenting and iterating over spans of user keys.
internal/metamorphic | Package metamorphic holds the entrypoint for Pebble's internal metamorphic tests.
internal/metamorphic/metaflags | Package metaflags defines command-line flags for the metamorphic tests and provides functionality to construct the respective metamorphic.RunOptions/RunOnceOptions.
internal/metamorphic/metarunner | metarunner is a utility which runs metamorphic.RunOnce or Compare.
internal/mkbench | mkbench is a utility for processing the raw nightly benchmark data into JSON data that can be visualized by docs/js/app.js.
internal/rangekey | Package rangekey provides facilities for encoding, decoding and merging range keys.
internal/rate | Package rate provides a rate limiter.
internal/testkeys | Package testkeys provides facilities for generating and comparing human-readable test keys for use in tests and benchmarks.
metamorphic | Package metamorphic provides a testing framework for running randomized tests over multiple Pebble databases with varying configurations.
rangekey | Package rangekey provides functionality for working with range keys.
record | Package record reads and writes sequences of records.
replay | Package replay implements collection and replaying of compaction benchmarking workloads.
sstable | Package sstable implements readers and writers of pebble tables.
vfstest | Package vfstest provides facilities for interacting with or faking filesystems during tests and benchmarks.