Documentation ¶
Overview ¶
Package pebble provides an ordered key/value store.
Example ¶
package main

import (
	"fmt"
	"log"

	"github.com/cockroachdb/pebble"
	"github.com/cockroachdb/pebble/vfs"
)

func main() {
	db, err := pebble.Open("", &pebble.Options{FS: vfs.NewMem()})
	if err != nil {
		log.Fatal(err)
	}
	key := []byte("hello")
	if err := db.Set(key, []byte("world"), pebble.Sync); err != nil {
		log.Fatal(err)
	}
	value, closer, err := db.Get(key)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("%s %s\n", key, value)
	if err := closer.Close(); err != nil {
		log.Fatal(err)
	}
	if err := db.Close(); err != nil {
		log.Fatal(err)
	}
}
Output: hello world
Index ¶
- Constants
- Variables
- func DebugCheckLevels(db *DB) error
- func GetVersion(dir string, fs vfs.FS) (string, error)
- func NewCache(size int64) *cache.Cache
- func TableCacheSize(maxOpenFiles int) int
- type AbbreviatedKey
- type ArchiveCleaner
- type Batch
- func (b *Batch) Apply(batch *Batch, _ *WriteOptions) error
- func (b *Batch) Close() error
- func (b *Batch) Commit(o *WriteOptions) error
- func (b *Batch) Count() uint32
- func (b *Batch) Delete(key []byte, _ *WriteOptions) error
- func (b *Batch) DeleteDeferred(keyLen int) *DeferredBatchOp
- func (b *Batch) DeleteRange(start, end []byte, _ *WriteOptions) error
- func (b *Batch) DeleteRangeDeferred(startLen, endLen int) *DeferredBatchOp
- func (b *Batch) Empty() bool
- func (b *Batch) Experimental() ExperimentalWriter
- func (b *Batch) Get(key []byte) ([]byte, io.Closer, error)
- func (b *Batch) Indexed() bool
- func (b *Batch) LogData(data []byte, _ *WriteOptions) error
- func (b *Batch) Merge(key, value []byte, _ *WriteOptions) error
- func (b *Batch) MergeDeferred(keyLen, valueLen int) *DeferredBatchOp
- func (b *Batch) NewIter(o *IterOptions) *Iterator
- func (b *Batch) RangeKeyDeleteDeferred(startLen, endLen int) *DeferredBatchOp
- func (b *Batch) Reader() BatchReader
- func (b *Batch) Repr() []byte
- func (b *Batch) Reset()
- func (b *Batch) SeqNum() uint64
- func (b *Batch) Set(key, value []byte, _ *WriteOptions) error
- func (b *Batch) SetDeferred(keyLen, valueLen int) *DeferredBatchOp
- func (b *Batch) SetRepr(data []byte) error
- func (b *Batch) SingleDelete(key []byte, _ *WriteOptions) error
- func (b *Batch) SingleDeleteDeferred(keyLen int) *DeferredBatchOp
- type BatchReader
- type BlockPropertyCollector
- type BlockPropertyFilter
- type CPUWorkPermissionGranter
- type Cache
- type CacheMetrics
- type CheckLevelsStats
- type CheckpointOption
- type Cleaner
- type CompactionInfo
- type Compare
- type Comparer
- type Compression
- type DB
- func (d *DB) Apply(batch *Batch, opts *WriteOptions) error
- func (d *DB) AsyncFlush() (<-chan struct{}, error)
- func (d *DB) CheckLevels(stats *CheckLevelsStats) error
- func (d *DB) Checkpoint(destDir string, opts ...CheckpointOption) (ckErr error)
- func (d *DB) Close() error
- func (d *DB) Compact(start, end []byte, parallelize bool) error
- func (d *DB) Delete(key []byte, opts *WriteOptions) error
- func (d *DB) DeleteRange(start, end []byte, opts *WriteOptions) error
- func (d *DB) EstimateDiskUsage(start, end []byte) (uint64, error)
- func (d *DB) Experimental() ExperimentalWriter
- func (d *DB) Flush() error
- func (d *DB) FormatMajorVersion() FormatMajorVersion
- func (d *DB) Get(key []byte) ([]byte, io.Closer, error)
- func (d *DB) Ingest(paths []string) error
- func (d *DB) IngestWithStats(paths []string) (IngestOperationStats, error)
- func (d *DB) InternalIntervalMetrics() *InternalIntervalMetrics
- func (d *DB) LogData(data []byte, opts *WriteOptions) error
- func (d *DB) Merge(key, value []byte, opts *WriteOptions) error
- func (d *DB) Metrics() *Metrics
- func (d *DB) NewBatch() *Batch
- func (d *DB) NewIndexedBatch() *Batch
- func (d *DB) NewIter(o *IterOptions) *Iterator
- func (d *DB) NewSnapshot() *Snapshot
- func (d *DB) RatchetFormatMajorVersion(fmv FormatMajorVersion) error
- func (d *DB) SSTables(opts ...SSTablesOption) ([][]SSTableInfo, error)
- func (d *DB) Set(key, value []byte, opts *WriteOptions) error
- func (d *DB) SingleDelete(key []byte, opts *WriteOptions) error
- type DBDesc
- type DeferredBatchOp
- type DeletableValueMerger
- type DeleteCleaner
- type DiskSlowInfo
- type Equal
- type EventListener
- type ExperimentalWriter
- type FileNum
- type FilterMetrics
- type FilterPolicy
- type FilterType
- type FilterWriter
- type FlushInfo
- type FormatMajorVersion
- type IngestOperationStats
- type InternalIntervalMetrics
- type InternalIteratorStats
- type InternalKey
- type InternalKeyKind
- type IterKeyType
- type IterOptions
- type IterValidityState
- type Iterator
- func (i *Iterator) Clone() (*Iterator, error)
- func (i *Iterator) Close() error
- func (i *Iterator) Error() error
- func (i *Iterator) First() bool
- func (i *Iterator) HasPointAndRange() (hasPoint, hasRange bool)
- func (i *Iterator) Key() []byte
- func (i *Iterator) Last() bool
- func (i *Iterator) Metrics() IteratorMetrics
- func (i *Iterator) Next() bool
- func (i *Iterator) NextWithLimit(limit []byte) IterValidityState
- func (i *Iterator) Prev() bool
- func (i *Iterator) PrevWithLimit(limit []byte) IterValidityState
- func (i *Iterator) RangeBounds() (start, end []byte)
- func (i *Iterator) RangeKeys() []RangeKeyData
- func (i *Iterator) ResetStats()
- func (i *Iterator) SeekGE(key []byte) bool
- func (i *Iterator) SeekGEWithLimit(key []byte, limit []byte) IterValidityState
- func (i *Iterator) SeekLT(key []byte) bool
- func (i *Iterator) SeekLTWithLimit(key []byte, limit []byte) IterValidityState
- func (i *Iterator) SeekPrefixGE(key []byte) bool
- func (i *Iterator) SetBounds(lower, upper []byte)
- func (i *Iterator) SetOptions(o *IterOptions)
- func (i *Iterator) Stats() IteratorStats
- func (i *Iterator) Valid() bool
- func (i *Iterator) Value() []byte
- type IteratorMetrics
- type IteratorStats
- type IteratorStatsKind
- type LevelInfo
- type LevelMetrics
- type LevelOptions
- type Logger
- type ManifestCreateInfo
- type ManifestDeleteInfo
- type Merge
- type Merger
- type Metrics
- type Options
- func (o *Options) Check(s string) error
- func (o *Options) Clone() *Options
- func (o *Options) EnsureDefaults() *Options
- func (o *Options) Level(level int) LevelOptions
- func (o *Options) MakeReaderOptions() sstable.ReaderOptions
- func (o *Options) MakeWriterOptions(level int, format sstable.TableFormat) sstable.WriterOptions
- func (o *Options) Parse(s string, hooks *ParseHooks) error
- func (o *Options) String() string
- func (o *Options) Validate() error
- type ParseHooks
- type RangeKeyData
- type RangeKeyMasking
- type Reader
- type SSTableInfo
- type SSTablesOption
- type Separator
- type Snapshot
- type Split
- type Successor
- type TableCache
- type TableCreateInfo
- type TableDeleteInfo
- type TableInfo
- type TableIngestInfo
- type TablePropertyCollector
- type TableStatsInfo
- type TableValidatedInfo
- type ThroughputMetric
- type ValueMerger
- type WALCreateInfo
- type WALDeleteInfo
- type WriteOptions
- type WriteStallBeginInfo
- type Writer
Examples ¶
Constants ¶
const (
	InternalKeyKindDelete          = base.InternalKeyKindDelete
	InternalKeyKindSet             = base.InternalKeyKindSet
	InternalKeyKindMerge           = base.InternalKeyKindMerge
	InternalKeyKindLogData         = base.InternalKeyKindLogData
	InternalKeyKindSingleDelete    = base.InternalKeyKindSingleDelete
	InternalKeyKindRangeDelete     = base.InternalKeyKindRangeDelete
	InternalKeyKindMax             = base.InternalKeyKindMax
	InternalKeyKindSetWithDelete   = base.InternalKeyKindSetWithDelete
	InternalKeyKindRangeKeySet     = base.InternalKeyKindRangeKeySet
	InternalKeyKindRangeKeyUnset   = base.InternalKeyKindRangeKeyUnset
	InternalKeyKindRangeKeyDelete  = base.InternalKeyKindRangeKeyDelete
	InternalKeyKindInvalid         = base.InternalKeyKindInvalid
	InternalKeySeqNumBatch         = base.InternalKeySeqNumBatch
	InternalKeySeqNumMax           = base.InternalKeySeqNumMax
	InternalKeyRangeDeleteSentinel = base.InternalKeyRangeDeleteSentinel
)
These constants are part of the file format, and should not be changed.
const (
	DefaultCompression = sstable.DefaultCompression
	NoCompression      = sstable.NoCompression
	SnappyCompression  = sstable.SnappyCompression
	ZstdCompression    = sstable.ZstdCompression
)
Exported Compression constants.
const (
TableFilter = base.TableFilter
)
Exported TableFilter constants.
Variables ¶
var (
	// ErrNotFound is returned when a get operation does not find the requested
	// key.
	ErrNotFound = base.ErrNotFound
	// ErrClosed is panicked when an operation is performed on a closed snapshot or
	// DB. Use errors.Is(err, ErrClosed) to check for this error.
	ErrClosed = errors.New("pebble: closed")
	// ErrReadOnly is returned when a write operation is performed on a read-only
	// database.
	ErrReadOnly = errors.New("pebble: read-only")
)
var DefaultComparer = base.DefaultComparer
DefaultComparer exports the base.DefaultComparer variable.
var DefaultLogger defaultLogger
DefaultLogger logs to the Go stdlib logs.
var DefaultMerger = base.DefaultMerger
DefaultMerger exports the base.DefaultMerger variable.
var ErrBatchTooLarge = errors.Newf("pebble: batch too large: >= %s", humanize.Uint64(maxBatchSize))
ErrBatchTooLarge indicates that a batch exceeds the maximum allowed batch size and cannot be committed.
var ErrInvalidBatch = errors.New("pebble: invalid batch")
ErrInvalidBatch indicates that a batch is invalid or otherwise corrupted.
var ErrNotIndexed = errors.New("pebble: batch not indexed")
ErrNotIndexed means that a read operation on a batch failed because the batch is not indexed and thus doesn't support reads.
var NoSync = &WriteOptions{Sync: false}
NoSync specifies the default write options for writes which do not synchronize to disk.
var Sync = &WriteOptions{Sync: true}
Sync specifies the default write options for writes which synchronize to disk.
Functions ¶
func DebugCheckLevels ¶
DebugCheckLevels calls CheckLevels on the provided database. It may be set in the DebugCheck field of Options to check level invariants whenever a new version is installed.
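For example, a minimal sketch of wiring DebugCheckLevels into the DebugCheck field described above (the in-memory FS is used purely for illustration):

opts := &pebble.Options{
	FS:         vfs.NewMem(),
	DebugCheck: pebble.DebugCheckLevels, // verify level invariants on each version install
}
db, err := pebble.Open("", opts)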
func GetVersion ¶
GetVersion returns the engine version string from the latest options file present in dir. Used to check what Pebble or RocksDB version was last used to write to the database stored in this directory. An empty string is returned if no valid OPTIONS file with a version key was found.
func NewCache ¶
NewCache creates a new cache of the specified size. Memory for the cache is allocated on demand, not during initialization. The cache is created with a reference count of 1. Each DB it is associated with adds a reference, so the creator of the cache should usually release their reference after the DB is created.
c := pebble.NewCache(...)
defer c.Unref()
d, err := pebble.Open(dirname, &pebble.Options{Cache: c})
func TableCacheSize ¶
TableCacheSize returns the table cache size to use for a single DB, given the maximum number of open files available to a table cache that is used by that DB alone.
Types ¶
type AbbreviatedKey ¶
type AbbreviatedKey = base.AbbreviatedKey
AbbreviatedKey exports the base.AbbreviatedKey type.
type ArchiveCleaner ¶
type ArchiveCleaner = base.ArchiveCleaner
ArchiveCleaner exports the base.ArchiveCleaner type.
type Batch ¶
type Batch struct {
// contains filtered or unexported fields
}
A Batch is a sequence of Sets, Merges, Deletes, DeleteRanges, RangeKeySets, RangeKeyUnsets, and/or RangeKeyDeletes that are applied atomically. Batch implements the Reader interface, but only an indexed batch supports reading (without error) via Get or NewIter. A non-indexed batch will return ErrNotIndexed when read from. A batch is not safe for concurrent use, and consumers should use a batch per goroutine or provide their own synchronization.
Indexing ¶
Batches can be optionally indexed (see DB.NewIndexedBatch). An indexed batch allows iteration via an Iterator (see Batch.NewIter). The iterator provides a merged view of the operations in the batch and the underlying database. This is implemented by treating the batch as an additional layer in the LSM where every entry in the batch is considered newer than any entry in the underlying database (batch entries have the InternalKeySeqNumBatch bit set). By treating the batch as an additional layer in the LSM, iteration supports all batch operations (i.e. Set, Merge, Delete, DeleteRange, RangeKeySet, RangeKeyUnset, RangeKeyDelete) with minimal effort.
The same key can be operated on multiple times in a batch, though only the latest operation will be visible. For example, Set("a", "b"), Delete("a") will cause the key "a" to not be visible in the batch. Set("a", "b"), Set("a", "c") will cause a read of "a" to return the value "c".
The batch index is implemented via a skiplist (internal/batchskl). While the skiplist implementation is very fast, inserting into an indexed batch is significantly slower than inserting into a non-indexed batch. Only use an indexed batch if you require reading from it.
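A minimal sketch of indexed-batch usage, assuming db is an open *pebble.DB as in the Overview example:

b := db.NewIndexedBatch()
defer b.Close()
if err := b.Set([]byte("a"), []byte("1"), nil); err != nil {
	log.Fatal(err)
}
// Reads observe the batch's own mutations merged with the underlying DB.
value, closer, err := b.Get([]byte("a"))
if err != nil {
	log.Fatal(err)
}
fmt.Printf("%s\n", value)
closer.Close()
// An iterator over the indexed batch also sees the merged view.
iter := b.NewIter(nil)
for iter.First(); iter.Valid(); iter.Next() {
	fmt.Printf("%s\n", iter.Key())
}
iter.Close()
if err := b.Commit(pebble.Sync); err != nil {
	log.Fatal(err)
}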
Atomic commit ¶
The operations in a batch are persisted by calling Batch.Commit which is equivalent to calling DB.Apply(batch). A batch is committed atomically by writing the internal batch representation to the WAL, adding all of the batch operations to the memtable associated with the WAL, and then incrementing the visible sequence number so that subsequent reads can see the effects of the batch operations. If WriteOptions.Sync is true, a call to Batch.Commit will guarantee that the batch is persisted to disk before returning. See commitPipeline for more on the implementation details.
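For example, a simple write-only batch committed atomically (db is assumed to be an open *pebble.DB):

b := db.NewBatch()
if err := b.Set([]byte("k1"), []byte("v1"), nil); err != nil {
	log.Fatal(err)
}
if err := b.Delete([]byte("k2"), nil); err != nil {
	log.Fatal(err)
}
// Commit writes the batch to the WAL and memtable atomically; with
// pebble.Sync it returns only after the WAL write is durable.
if err := b.Commit(pebble.Sync); err != nil {
	log.Fatal(err)
}
b.Close()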
Large batches ¶
The size of a batch is limited only by available memory (be aware that indexed batches require considerable additional memory for the skiplist structure). A given WAL file has a single memtable associated with it (this restriction could be removed, but doing so is onerous and complex). A memtable has a fixed size due to the underlying fixed-size arena. Note that this differs from RocksDB where a memtable can grow arbitrarily large using a list of arena chunks. In RocksDB this is accomplished by storing pointers in the arena memory, but that isn't possible in Go.
During Batch.Commit, a batch which is larger than a threshold (> MemTableSize/2) is wrapped in a flushableBatch and inserted into the queue of memtables. A flushableBatch forces the WAL to be rotated, but that happens anyway when the memtable becomes full, so this does not cause significant WAL churn. Because the flushableBatch is readable as another layer in the LSM, Batch.Commit returns as soon as the flushableBatch has been added to the queue of memtables.
Internally, a flushableBatch provides Iterator support by sorting the batch contents (the batch is sorted once, when it is added to the memtable queue). Sorting the batch contents and insertion of the contents into a memtable have the same big-O time, but the constant factor dominates here. Sorting is significantly faster and uses significantly less memory.
Internal representation ¶
The internal batch representation is a contiguous byte buffer with a fixed 12-byte header, followed by a series of records.
+-------------+------------+--- ... ---+
| SeqNum (8B) | Count (4B) |  Entries  |
+-------------+------------+--- ... ---+
Each record has a 1-byte kind tag prefix, followed by 1 or 2 length prefixed strings (varstring):
+-----------+-----------------+-------------------+
| Kind (1B) | Key (varstring) | Value (varstring) |
+-----------+-----------------+-------------------+
A varstring is a varint32 followed by N bytes of data. The Kind tags are exactly those specified by InternalKeyKind. The following table shows the format for records of each kind:
InternalKeyKindDelete         varstring
InternalKeyKindLogData        varstring
InternalKeyKindSet            varstring varstring
InternalKeyKindMerge          varstring varstring
InternalKeyKindRangeDelete    varstring varstring
InternalKeyKindRangeKeySet    varstring varstring
InternalKeyKindRangeKeyUnset  varstring varstring
InternalKeyKindRangeKeyDelete varstring varstring
The intuition here is that the arguments to Delete, Set, Merge, DeleteRange and RangeKeyDelete are encoded into the batch. The RangeKeySet and RangeKeyUnset operations are slightly more complicated, encoding their end key, suffix and value [in the case of RangeKeySet] within the Value varstring. For more information on the value encoding for RangeKeySet and RangeKeyUnset, see the internal/rangekey package.
The internal batch representation is the on disk format for a batch in the WAL, and thus stable. New record kinds may be added, but the existing ones will not be modified.
func (*Batch) Apply ¶
func (b *Batch) Apply(batch *Batch, _ *WriteOptions) error
Apply the operations contained in the batch to the receiver batch.
It is safe to modify the contents of the arguments after Apply returns.
func (*Batch) Commit ¶
func (b *Batch) Commit(o *WriteOptions) error
Commit applies the batch to its parent writer.
func (*Batch) Count ¶
Count returns the count of memtable-modifying operations in this batch. All operations except LogData increment this count.
func (*Batch) Delete ¶
func (b *Batch) Delete(key []byte, _ *WriteOptions) error
Delete adds an action to the batch that deletes the entry for key.
It is safe to modify the contents of the arguments after Delete returns.
func (*Batch) DeleteDeferred ¶
func (b *Batch) DeleteDeferred(keyLen int) *DeferredBatchOp
DeleteDeferred is similar to Delete in that it adds a delete operation to the batch, except it only takes in a key length instead of a complete slice, letting the caller encode into the returned object and then call Finish() on it.
func (*Batch) DeleteRange ¶
func (b *Batch) DeleteRange(start, end []byte, _ *WriteOptions) error
DeleteRange deletes all of the point keys (and values) in the range [start,end) (inclusive on start, exclusive on end). DeleteRange does NOT delete overlapping range keys (e.g., keys set via RangeKeySet).
It is safe to modify the contents of the arguments after DeleteRange returns.
func (*Batch) DeleteRangeDeferred ¶
func (b *Batch) DeleteRangeDeferred(startLen, endLen int) *DeferredBatchOp
DeleteRangeDeferred is similar to DeleteRange in that it adds a delete range operation to the batch, except it only takes in key lengths instead of complete slices, letting the caller encode into those objects and then call Finish() on the returned object. Note that DeferredBatchOp.Key should be populated with the start key, and DeferredBatchOp.Value should be populated with the end key.
func (*Batch) Experimental ¶
func (b *Batch) Experimental() ExperimentalWriter
Experimental returns the experimental write API, backed by the same batch.
func (*Batch) Get ¶
Get gets the value for the given key. It returns ErrNotFound if the Batch does not contain the key.
The caller should not modify the contents of the returned slice, but it is safe to modify the contents of the argument after Get returns. The returned slice will remain valid until the returned Closer is closed. On success, the caller MUST call closer.Close() or a memory leak will occur.
func (*Batch) Indexed ¶
Indexed returns true if the batch is indexed (i.e. supports read operations).
func (*Batch) LogData ¶
func (b *Batch) LogData(data []byte, _ *WriteOptions) error
LogData adds the specified data to the batch. The data will be written to the WAL, but not added to memtables or sstables. Log data is never indexed, which makes it useful for testing WAL performance.
It is safe to modify the contents of the argument after LogData returns.
func (*Batch) Merge ¶
func (b *Batch) Merge(key, value []byte, _ *WriteOptions) error
Merge adds an action to the batch that merges the value at key with the new value. The details of the merge are dependent upon the configured merge operator.
It is safe to modify the contents of the arguments after Merge returns.
func (*Batch) MergeDeferred ¶
func (b *Batch) MergeDeferred(keyLen, valueLen int) *DeferredBatchOp
MergeDeferred is similar to Merge in that it adds a merge operation to the batch, except it only takes in key/value lengths instead of complete slices, letting the caller encode into those objects and then call Finish() on the returned object.
func (*Batch) NewIter ¶
func (b *Batch) NewIter(o *IterOptions) *Iterator
NewIter returns an iterator that is unpositioned (Iterator.Valid() will return false). The iterator can be positioned via a call to SeekGE, SeekPrefixGE, SeekLT, First or Last. Only indexed batches support iterators.
The returned Iterator observes all of the Batch's existing mutations, but no later mutations. Its view can be refreshed via RefreshBatchSnapshot or SetOptions().
func (*Batch) RangeKeyDeleteDeferred ¶
func (b *Batch) RangeKeyDeleteDeferred(startLen, endLen int) *DeferredBatchOp
RangeKeyDeleteDeferred is similar to RangeKeyDelete in that it adds an operation to delete range keys to the batch, except it only takes in key lengths instead of complete slices, letting the caller encode into those objects and then call Finish() on the returned object. Note that DeferredBatchOp.Key should be populated with the start key, and DeferredBatchOp.Value should be populated with the end key.
func (*Batch) Reader ¶
func (b *Batch) Reader() BatchReader
Reader returns a BatchReader for the current batch contents. If the batch is mutated, the new entries will not be visible to the reader.
func (*Batch) Repr ¶
Repr returns the underlying batch representation. It is not safe to modify the contents. Reset() will not change the contents of the returned value, though any other mutation operation may do so.
func (*Batch) Reset ¶
func (b *Batch) Reset()
Reset resets the batch for reuse. The underlying byte slice (that is returned by Repr()) is not modified. It is only necessary to call this method if a batch is explicitly being reused. Close automatically takes care of releasing resources when appropriate for batches that are internally being reused.
func (*Batch) SeqNum ¶
SeqNum returns the batch sequence number which is applied to the first record in the batch. The sequence number is incremented for each subsequent record. It returns zero if the batch is empty.
func (*Batch) Set ¶
func (b *Batch) Set(key, value []byte, _ *WriteOptions) error
Set adds an action to the batch that sets the key to map to the value.
It is safe to modify the contents of the arguments after Set returns.
func (*Batch) SetDeferred ¶
func (b *Batch) SetDeferred(keyLen, valueLen int) *DeferredBatchOp
SetDeferred is similar to Set in that it adds a set operation to the batch, except it only takes in key/value lengths instead of complete slices, letting the caller encode into those objects and then call Finish() on the returned object.
func (*Batch) SetRepr ¶
SetRepr sets the underlying batch representation. The batch takes ownership of the supplied slice. It is not safe to modify it afterwards until the Batch is no longer in use.
func (*Batch) SingleDelete ¶
func (b *Batch) SingleDelete(key []byte, _ *WriteOptions) error
SingleDelete adds an action to the batch that single deletes the entry for key. See Writer.SingleDelete for more details on the semantics of SingleDelete.
It is safe to modify the contents of the arguments after SingleDelete returns.
func (*Batch) SingleDeleteDeferred ¶
func (b *Batch) SingleDeleteDeferred(keyLen int) *DeferredBatchOp
SingleDeleteDeferred is similar to SingleDelete in that it adds a single delete operation to the batch, except it only takes in a key length instead of a complete slice, letting the caller encode into the returned object and then call Finish() on it.
type BatchReader ¶
type BatchReader []byte
BatchReader iterates over the entries contained in a batch.
func ReadBatch ¶
func ReadBatch(repr []byte) (r BatchReader, count uint32)
ReadBatch constructs a BatchReader from a batch representation. The header is not validated. ReadBatch returns a new batch reader and the count of entries contained within the batch.
func (*BatchReader) Next ¶
func (r *BatchReader) Next() (kind InternalKeyKind, ukey []byte, value []byte, ok bool)
Next returns the next entry in this batch. The final return value is false if the batch is corrupt. The end of batch is reached when len(r)==0.
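For example, a sketch of decoding a batch's entries, assuming b is a *pebble.Batch with some operations already applied:

r, count := pebble.ReadBatch(b.Repr())
fmt.Println("entries:", count)
for {
	kind, ukey, value, ok := r.Next()
	if !ok {
		break // end of batch (len(r) == 0) or a corrupt entry
	}
	fmt.Printf("%v %q %q\n", kind, ukey, value)
}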
type BlockPropertyCollector ¶
type BlockPropertyCollector = sstable.BlockPropertyCollector
BlockPropertyCollector exports the sstable.BlockPropertyCollector type.
type BlockPropertyFilter ¶
type BlockPropertyFilter = base.BlockPropertyFilter
BlockPropertyFilter exports the base.BlockPropertyFilter type.
type CPUWorkPermissionGranter ¶
CPUWorkPermissionGranter is used to request permission to opportunistically use additional CPUs to speed up internal background work. Each granted "proc" can be used to spin up a CPU-bound goroutine, i.e., if scheduled, each such goroutine can consume one P in the goroutine scheduler. The calls to ReturnProcs can be a bit delayed, since Pebble interacts with this interface in a coarse manner. So one should assume that the total number of granted procs is a non-tight upper bound on the CPU that will get consumed.
type CacheMetrics ¶
CacheMetrics holds metrics for the block and table cache.
type CheckLevelsStats ¶
CheckLevelsStats provides basic stats on points and tombstones encountered.
type CheckpointOption ¶
type CheckpointOption func(*checkpointOptions)
CheckpointOption sets optional parameters used by `DB.Checkpoint`.
func WithFlushedWAL ¶
func WithFlushedWAL() CheckpointOption
WithFlushedWAL enables flushing and syncing the WAL prior to constructing a checkpoint. This guarantees that any writes committed before calling DB.Checkpoint will be part of that checkpoint.
Note that this setting can only be useful in cases when some writes are performed with Sync = false. Otherwise, the guarantee will already be met.
Passing this option is functionally equivalent to calling DB.LogData(nil, Sync) right before DB.Checkpoint.
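For example (the destination path is an arbitrary placeholder), creating a checkpoint that contains all writes committed so far, including those written with Sync: false:

if err := db.Checkpoint("/path/to/checkpoint", pebble.WithFlushedWAL()); err != nil {
	log.Fatal(err)
}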
type CompactionInfo ¶
type CompactionInfo struct {
	// JobID is the ID of the compaction job.
	JobID int
	// Reason is the reason for the compaction.
	Reason string
	// Input contains the input tables for the compaction organized by level.
	Input []LevelInfo
	// Output contains the output tables generated by the compaction. The output
	// tables are empty for the compaction begin event.
	Output LevelInfo
	// Duration is the time spent compacting, including reading and writing
	// sstables.
	Duration time.Duration
	// TotalDuration is the total wall-time duration of the compaction,
	// including applying the compaction to the database. TotalDuration is
	// always ≥ Duration.
	TotalDuration time.Duration
	Done          bool
	Err           error
}
CompactionInfo contains the info for a compaction event.
func (CompactionInfo) SafeFormat ¶
func (i CompactionInfo) SafeFormat(w redact.SafePrinter, _ rune)
SafeFormat implements redact.SafeFormatter.
func (CompactionInfo) String ¶
func (i CompactionInfo) String() string
type Compression ¶
type Compression = sstable.Compression
Compression exports the sstable.Compression type.
type DB ¶
type DB struct {
// contains filtered or unexported fields
}
DB provides a concurrent, persistent ordered key/value store.
A DB's basic operations (Get, Set, Delete) should be self-explanatory. Get and Delete will return ErrNotFound if the requested key is not in the store. Callers are free to ignore this error.
A DB also allows for iterating over the key/value pairs in key order. If d is a DB, the code below prints all key/value pairs whose keys are 'greater than or equal to' k:
iter := d.NewIter(readOptions)
for iter.SeekGE(k); iter.Valid(); iter.Next() {
	fmt.Printf("key=%q value=%q\n", iter.Key(), iter.Value())
}
return iter.Close()
The Options struct holds the optional parameters for the DB, including a Comparer to define a 'less than' relationship over keys. It is always valid to pass a nil *Options, which means to use the default parameter values. Any zero field of a non-nil *Options also means to use the default value for that parameter. Thus, the code below uses a custom Comparer, but the default values for every other parameter:
db, err := pebble.Open(dirname, &pebble.Options{
	Comparer: myComparer,
})
func (*DB) Apply ¶
func (d *DB) Apply(batch *Batch, opts *WriteOptions) error
Apply the operations contained in the batch to the DB. If the batch is large the contents of the batch may be retained by the database. If that occurs the batch contents will be cleared preventing the caller from attempting to reuse them.
It is safe to modify the contents of the arguments after Apply returns.
func (*DB) AsyncFlush ¶
AsyncFlush asynchronously flushes the memtable to stable storage.
If no error is returned, the caller can receive from the returned channel in order to wait for the flush to complete.
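For example, a short sketch of triggering a flush and waiting for it to finish:

ch, err := db.AsyncFlush()
if err != nil {
	log.Fatal(err)
}
// Receiving from the channel blocks until the flush completes.
<-ch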
func (*DB) CheckLevels ¶
func (d *DB) CheckLevels(stats *CheckLevelsStats) error
CheckLevels checks:
- Every entry in the DB is consistent with the level invariant. See the comment at the top of the file.
- Point keys in sstables are ordered.
- Range delete tombstones in sstables are ordered and fragmented.
- Successful processing of all MERGE records.
func (*DB) Checkpoint ¶
func (d *DB) Checkpoint(destDir string, opts ...CheckpointOption) (ckErr error)
Checkpoint constructs a snapshot of the DB instance in the specified directory. The WAL, MANIFEST, OPTIONS, and sstables will be copied into the snapshot. Hard links will be used when possible. Beware of the significant space overhead for a checkpoint if hard links are disabled. Also beware that even if hard links are used, the space overhead for the checkpoint will increase over time as the DB performs compactions.
func (*DB) Close ¶
Close closes the DB.
It is not safe to close a DB until all outstanding iterators are closed or to call Close concurrently with any other DB method. It is not valid to call any of a DB's methods after the DB has been closed.
func (*DB) Delete ¶
func (d *DB) Delete(key []byte, opts *WriteOptions) error
Delete deletes the value for the given key. Deletes are blind: they will succeed even if the given key does not exist.
It is safe to modify the contents of the arguments after Delete returns.
func (*DB) DeleteRange ¶
func (d *DB) DeleteRange(start, end []byte, opts *WriteOptions) error
DeleteRange deletes all of the keys (and values) in the range [start,end) (inclusive on start, exclusive on end).
It is safe to modify the contents of the arguments after DeleteRange returns.
func (*DB) EstimateDiskUsage ¶
EstimateDiskUsage returns the estimated filesystem space used in bytes for storing the range `[start, end]`. The estimation is computed as follows:
- For sstables fully contained in the range the whole file size is included.
- For sstables partially contained in the range the overlapping data block sizes are included. Even if a data block partially overlaps, or we cannot determine overlap due to abbreviated index keys, the full data block size is included in the estimation. Note that unlike fully contained sstables, none of the meta-block space is counted for partially overlapped files.
- There may also exist WAL entries for unflushed keys in this range. This estimation currently excludes space used for the range in the WAL.
func (*DB) Experimental ¶
func (d *DB) Experimental() ExperimentalWriter
Experimental returns the experimental write API.
func (*DB) FormatMajorVersion ¶
func (d *DB) FormatMajorVersion() FormatMajorVersion
FormatMajorVersion returns the database's active format major version. The format major version may be higher than the one provided in Options when the database was opened if the existing database was written with a higher format version.
func (*DB) Get ¶
Get gets the value for the given key. It returns ErrNotFound if the DB does not contain the key.
The caller should not modify the contents of the returned slice, but it is safe to modify the contents of the argument after Get returns. The returned slice will remain valid until the returned Closer is closed. On success, the caller MUST call closer.Close() or a memory leak will occur.
func (*DB) Ingest ¶
Ingest ingests a set of sstables into the DB. Ingestion of the files is atomic and semantically equivalent to creating a single batch containing all of the mutations in the sstables. Ingestion may require the memtable to be flushed. The ingested sstable files are moved into the DB and must reside on the same filesystem as the DB. Sstables can be created for ingestion using sstable.Writer. On success, Ingest removes the input paths.
All sstables *must* be Sync()'d by the caller after all bytes are written and before its file handle is closed; failure to do so could violate durability or lead to corrupted on-disk state. This method cannot, in a platform-and-FS-agnostic way, ensure that all sstables in the input are properly synced to disk. Opening new file handles and Sync()-ing them does not always guarantee durability; see the discussion here on that: https://github.com/cockroachdb/pebble/pull/835#issuecomment-663075379
Ingestion loads each sstable into the lowest level of the LSM which it doesn't overlap (see ingestTargetLevel). If an sstable overlaps a memtable, ingestion forces the memtable to flush, and then waits for the flush to occur.
The steps for ingestion are:
- Allocate file numbers for every sstable being ingested.
- Load the metadata for all sstables being ingested.
- Sort the sstables by smallest key, verifying that they do not overlap.
- Hard link (or copy) the sstables into the DB directory.
- Allocate a sequence number to use for all of the entries in the sstables. This is the step where overlap with memtables is determined. If there is overlap, we remember the most recent memtable that overlaps.
- Update the sequence number in the ingested sstables.
- Wait for the most recent memtable that overlaps to flush (if any).
- Add the ingested sstables to the version (DB.ingestApply).
- Publish the ingestion sequence number.
Note that if the mutable memtable overlaps with ingestion, a flush of the memtable is forced equivalent to DB.Flush. Additionally, subsequent mutations that get sequence numbers larger than the ingestion sequence number get queued up behind the ingestion waiting for it to complete. This can produce a noticeable hiccup in performance. See https://github.com/cockroachdb/pebble/issues/25 for an idea for how to fix this hiccup.
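The following is a hedged sketch of creating an external sstable and ingesting it. The path "/tmp/ext.sst", the use of vfs.Default, and the exact sstable.NewWriter signature are assumptions for illustration and may differ across Pebble versions:

// import "github.com/cockroachdb/pebble/sstable"
f, err := vfs.Default.Create("/tmp/ext.sst")
if err != nil {
	log.Fatal(err)
}
w := sstable.NewWriter(f, sstable.WriterOptions{})
if err := w.Set([]byte("ingested-key"), []byte("value")); err != nil {
	log.Fatal(err)
}
// Close finishes the sstable; per the durability note above, the caller is
// responsible for ensuring the file is synced before ingestion.
if err := w.Close(); err != nil {
	log.Fatal(err)
}
if err := db.Ingest([]string{"/tmp/ext.sst"}); err != nil {
	log.Fatal(err)
}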
func (*DB) IngestWithStats ¶
func (d *DB) IngestWithStats(paths []string) (IngestOperationStats, error)
IngestWithStats does the same as Ingest, and additionally returns IngestOperationStats.
func (*DB) InternalIntervalMetrics ¶
func (d *DB) InternalIntervalMetrics() *InternalIntervalMetrics
InternalIntervalMetrics returns the InternalIntervalMetrics and resets for the next interval (which is until the next call to this method).
func (*DB) LogData ¶
func (d *DB) LogData(data []byte, opts *WriteOptions) error
LogData adds the specified data to the DB. The data will be written to the WAL, but not added to memtables or sstables. Log data is never indexed, which makes it useful for testing WAL performance.
It is safe to modify the contents of the argument after LogData returns.
func (*DB) Merge ¶
func (d *DB) Merge(key, value []byte, opts *WriteOptions) error
Merge adds an action to the DB that merges the value at key with the new value. The details of the merge are dependent upon the configured merge operator.
It is safe to modify the contents of the arguments after Merge returns.
func (*DB) NewBatch ¶
NewBatch returns a new empty write-only batch. Any reads on the batch will return an error. If the batch is committed it will be applied to the DB.
func (*DB) NewIndexedBatch ¶
NewIndexedBatch returns a new empty read-write batch. Any reads on the batch will read from both the batch and the DB. If the batch is committed it will be applied to the DB. An indexed batch is slower than a non-indexed batch for insert operations. If you do not need to perform reads on the batch, use NewBatch instead.
func (*DB) NewIter ¶
func (d *DB) NewIter(o *IterOptions) *Iterator
NewIter returns an iterator that is unpositioned (Iterator.Valid() will return false). The iterator can be positioned via a call to SeekGE, SeekLT, First or Last. The iterator provides a point-in-time view of the current DB state. This view is maintained by preventing file deletions and preventing memtables referenced by the iterator from being deleted. Using an iterator to maintain a long-lived point-in-time view of the DB state can lead to an apparent memory and disk usage leak. Use snapshots (see NewSnapshot) for point-in-time snapshots which avoid these problems.
func (*DB) NewSnapshot ¶
NewSnapshot returns a point-in-time view of the current DB state. Iterators created with this handle will all observe a stable snapshot of the current DB state. The caller must call Snapshot.Close() when the snapshot is no longer needed. Snapshots are not persisted across DB restarts (close -> open). Unlike the implicit snapshot maintained by an iterator, a snapshot will not prevent memtables from being released or sstables from being deleted. Instead, a snapshot prevents deletion of sequence numbers referenced by the snapshot.
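For example, a minimal sketch of reading through a snapshot (the key is a placeholder):

snap := db.NewSnapshot()
defer snap.Close()
// Reads through the snapshot observe the DB state as of NewSnapshot.
value, closer, err := snap.Get([]byte("hello"))
if err != nil {
	log.Fatal(err)
}
fmt.Printf("%s\n", value)
closer.Close()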
func (*DB) RatchetFormatMajorVersion ¶
func (d *DB) RatchetFormatMajorVersion(fmv FormatMajorVersion) error
RatchetFormatMajorVersion ratchets the opened database's format major version to the provided version. It errors if the provided format major version is below the database's current version. Once a database's format major version is upgraded, previous Pebble versions that do not know of the format version will be unable to open the database.
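For example, a sketch of upgrading the on-disk format only when it is behind the desired version (note that the upgrade is one-way and older Pebble versions will no longer be able to open the store):

if db.FormatMajorVersion() < pebble.FormatNewest {
	if err := db.RatchetFormatMajorVersion(pebble.FormatNewest); err != nil {
		log.Fatal(err)
	}
}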
func (*DB) SSTables ¶
func (d *DB) SSTables(opts ...SSTablesOption) ([][]SSTableInfo, error)
SSTables retrieves the current sstables. The returned slice is indexed by level and each level is indexed by the position of the sstable within the level. Note that this information may be out of date due to concurrent flushes and compactions.
func (*DB) Set ¶
func (d *DB) Set(key, value []byte, opts *WriteOptions) error
Set sets the value for the given key. It overwrites any previous value for that key; a DB is not a multi-map.
It is safe to modify the contents of the arguments after Set returns.
func (*DB) SingleDelete ¶
func (d *DB) SingleDelete(key []byte, opts *WriteOptions) error
SingleDelete adds an action to the DB that single deletes the entry for key. See Writer.SingleDelete for more details on the semantics of SingleDelete.
It is safe to modify the contents of the arguments after SingleDelete returns.
type DBDesc ¶
type DBDesc struct {
	// Exists is true if an existing database was found.
	Exists bool
	// FormatMajorVersion indicates the database's current format
	// version.
	FormatMajorVersion FormatMajorVersion
	// ManifestFilename is the filename of the current active manifest,
	// if the database exists.
	ManifestFilename string
}
DBDesc briefly describes high-level state about a database.
type DeferredBatchOp ¶
type DeferredBatchOp struct {
// Key and Value point to parts of the binary batch representation where
// keys and values should be encoded/copied into. len(Key) and len(Value)
// bytes must be copied into these slices respectively before calling
// Finish(). Changing where these slices point to is not allowed.
Key, Value []byte
// contains filtered or unexported fields
}
DeferredBatchOp represents a batch operation (eg. set, merge, delete) that is being inserted into the batch. Indexing is not performed on the specified key until Finish is called, hence the name deferred. This struct lets the caller copy or encode keys/values directly into the batch representation instead of copying into an intermediary buffer then having pebble.Batch copy off of it.
func (DeferredBatchOp) Finish ¶
func (d DeferredBatchOp) Finish() error
Finish completes the addition of this batch operation, and adds it to the index if necessary. Must be called once (and exactly once) after keys/values have been filled into Key and Value. Not calling Finish or not copying/encoding keys will result in an incomplete index, and calling Finish twice may result in a panic.
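For example, a sketch of encoding a key/value directly into the batch's buffer via the deferred API, assuming b is a *pebble.Batch:

key, value := []byte("k"), []byte("v")
op := b.SetDeferred(len(key), len(value))
// Copy (or encode) the key and value into the preallocated slices.
copy(op.Key, key)
copy(op.Value, value)
if err := op.Finish(); err != nil {
	log.Fatal(err)
}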
type DeletableValueMerger ¶
type DeletableValueMerger = base.DeletableValueMerger
DeletableValueMerger exports the base.DeletableValueMerger type.
type DeleteCleaner ¶
type DeleteCleaner = base.DeleteCleaner
DeleteCleaner exports the base.DeleteCleaner type.
type DiskSlowInfo ¶
type DiskSlowInfo struct {
	// Path of file being written to.
	Path string
	// Duration that has elapsed since this disk operation started.
	Duration time.Duration
}
DiskSlowInfo contains the info for a disk slowness event when writing to a file.
func (DiskSlowInfo) SafeFormat ¶
func (i DiskSlowInfo) SafeFormat(w redact.SafePrinter, _ rune)
SafeFormat implements redact.SafeFormatter.
func (DiskSlowInfo) String ¶
func (i DiskSlowInfo) String() string
type EventListener ¶
type EventListener struct {
	// BackgroundError is invoked whenever an error occurs during a background
	// operation such as flush or compaction.
	BackgroundError func(error)
	// CompactionBegin is invoked after the inputs to a compaction have been
	// determined, but before the compaction has produced any output.
	CompactionBegin func(CompactionInfo)
	// CompactionEnd is invoked after a compaction has completed and the result
	// has been installed.
	CompactionEnd func(CompactionInfo)
	// DiskSlow is invoked after a disk write operation on a file created
	// with a disk health checking vfs.FS (see vfs.DefaultWithDiskHealthChecks)
	// is observed to exceed the specified disk slowness threshold duration.
	DiskSlow func(DiskSlowInfo)
	// FlushBegin is invoked after the inputs to a flush have been determined,
	// but before the flush has produced any output.
	FlushBegin func(FlushInfo)
	// FlushEnd is invoked after a flush has completed and the result has been
	// installed.
	FlushEnd func(FlushInfo)
	// FormatUpgrade is invoked after the database's FormatMajorVersion
	// is upgraded.
	FormatUpgrade func(FormatMajorVersion)
	// ManifestCreated is invoked after a manifest has been created.
	ManifestCreated func(ManifestCreateInfo)
	// ManifestDeleted is invoked after a manifest has been deleted.
	ManifestDeleted func(ManifestDeleteInfo)
	// TableCreated is invoked when a table has been created.
	TableCreated func(TableCreateInfo)
	// TableDeleted is invoked after a table has been deleted.
	TableDeleted func(TableDeleteInfo)
	// TableIngested is invoked after an externally created table has been
	// ingested via a call to DB.Ingest().
	TableIngested func(TableIngestInfo)
	// TableStatsLoaded is invoked at most once, when the table stats
	// collector has loaded statistics for all tables that existed at Open.
	TableStatsLoaded func(TableStatsInfo)
	// TableValidated is invoked after validation runs on an sstable.
	TableValidated func(TableValidatedInfo)
	// WALCreated is invoked after a WAL has been created.
	WALCreated func(WALCreateInfo)
	// WALDeleted is invoked after a WAL has been deleted.
	WALDeleted func(WALDeleteInfo)
	// WriteStallBegin is invoked when writes are intentionally delayed.
	WriteStallBegin func(WriteStallBeginInfo)
	// WriteStallEnd is invoked when delayed writes are released.
	WriteStallEnd func()
}
EventListener contains a set of functions that will be invoked when various significant DB events occur. Note that the functions should not run for an excessive amount of time as they are invoked synchronously by the DB and may block continued DB work. For a similar reason it is advisable to not perform any synchronous calls back into the DB.
func MakeLoggingEventListener ¶
func MakeLoggingEventListener(logger Logger) EventListener
MakeLoggingEventListener creates an EventListener that logs all events to the specified logger.
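For example, a sketch of installing a logging listener at Open time, assuming Options.EventListener accepts an EventListener value as in this version:

opts := &pebble.Options{
	FS:            vfs.NewMem(),
	EventListener: pebble.MakeLoggingEventListener(pebble.DefaultLogger),
}
db, err := pebble.Open("", opts)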
func TeeEventListener ¶
func TeeEventListener(a, b EventListener) EventListener
TeeEventListener wraps two EventListeners, forwarding all events to both.
func (*EventListener) EnsureDefaults ¶
func (l *EventListener) EnsureDefaults(logger Logger)
EnsureDefaults ensures that background error events are logged to the specified logger if a handler for those events hasn't been otherwise specified. Ensure all handlers are non-nil so that we don't have to check for nil-ness before invoking.
type ExperimentalWriter ¶
type ExperimentalWriter interface {
	Writer

	// RangeKeySet sets a range key mapping the key range [start, end) at the MVCC
	// timestamp suffix to value. The suffix is optional. If any portion of the key
	// range [start, end) is already set by a range key with the same suffix value,
	// RangeKeySet overrides it.
	//
	// It is safe to modify the contents of the arguments after RangeKeySet returns.
	//
	// WARNING: This is an experimental feature with limited functionality.
	RangeKeySet(start, end, suffix, value []byte, opts *WriteOptions) error

	// RangeKeyUnset removes a range key mapping the key range [start, end) at the
	// MVCC timestamp suffix. The suffix may be omitted to remove an unsuffixed
	// range key. RangeKeyUnset only removes portions of range keys that fall within
	// the [start, end) key span, and only range keys with suffixes that exactly
	// match the unset suffix.
	//
	// It is safe to modify the contents of the arguments after RangeKeyUnset
	// returns.
	//
	// WARNING: This is an experimental feature with limited functionality.
	RangeKeyUnset(start, end, suffix []byte, opts *WriteOptions) error

	// RangeKeyDelete deletes all of the range keys in the range [start,end)
	// (inclusive on start, exclusive on end). It does not delete point keys (for
	// that use DeleteRange). RangeKeyDelete removes all range keys within the
	// bounds, including those with or without suffixes.
	//
	// It is safe to modify the contents of the arguments after RangeKeyDelete
	// returns.
	//
	// WARNING: This is an experimental feature with limited functionality.
	RangeKeyDelete(start, end []byte, opts *WriteOptions) error
}
ExperimentalWriter provides access to experimental features of a Batch.
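A hedged sketch of the experimental range-key API (the keys and suffix are placeholders, and the feature requires a format major version that supports range keys, e.g. FormatRangeKeys; signatures may change while the feature is experimental):

err := db.Experimental().RangeKeySet(
	[]byte("a"), []byte("z"), // [start, end)
	[]byte("@5"),             // optional suffix
	[]byte("range-value"),
	pebble.Sync,
)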
type FilterMetrics ¶
type FilterMetrics = sstable.FilterMetrics
FilterMetrics holds metrics for the filter policy.
type FilterPolicy ¶
type FilterPolicy = base.FilterPolicy
FilterPolicy exports the base.FilterPolicy type.
type FilterWriter ¶
type FilterWriter = base.FilterWriter
FilterWriter exports the base.FilterWriter type.
type FlushInfo ¶
type FlushInfo struct {
	// JobID is the ID of the flush job.
	JobID int
	// Reason is the reason for the flush.
	Reason string
	// Input contains the count of input memtables that were flushed.
	Input int
	// Output contains the output table generated by the flush. The output info
	// is empty for the flush begin event.
	Output []TableInfo
	// Duration is the time spent flushing. This duration includes writing and
	// syncing all of the flushed keys to sstables.
	Duration time.Duration
	// TotalDuration is the total wall-time duration of the flush, including
	// applying the flush to the database. TotalDuration is always ≥ Duration.
	TotalDuration time.Duration
	Done          bool
	Err           error
}
FlushInfo contains the info for a flush event.
func (FlushInfo) SafeFormat ¶
func (i FlushInfo) SafeFormat(w redact.SafePrinter, _ rune)
SafeFormat implements redact.SafeFormatter.
type FormatMajorVersion ¶
type FormatMajorVersion uint64
FormatMajorVersion is a constant controlling the format of persisted data. Backwards incompatible changes to durable formats are gated behind new format major versions.
At any point, a database's format major version may be bumped. However, once a database's format major version is increased, previous versions of Pebble will refuse to open the database.
The zero value format is the FormatDefault constant. The exact FormatVersion that the default corresponds to may change with time.
const (
	// FormatDefault leaves the format version unspecified. The
	// FormatDefault constant may be ratcheted upwards over time.
	FormatDefault FormatMajorVersion = iota
	// FormatMostCompatible maintains the most backwards compatibility,
	// maintaining bi-directional compatibility with RocksDB 6.2.1 in
	// the particular configuration described in the Pebble README.
	FormatMostCompatible
	// FormatVersioned is a new format major version that replaces the
	// old `CURRENT` file with a new 'marker' file scheme. Previous
	// Pebble versions will be unable to open the database unless
	// they're aware of format versions.
	FormatVersioned
	// FormatSetWithDelete is a format major version that introduces a new key
	// kind, base.InternalKeyKindSetWithDelete. Previous Pebble versions will be
	// unable to open this database.
	FormatSetWithDelete
	// FormatBlockPropertyCollector is a format major version that introduces
	// BlockPropertyCollectors.
	FormatBlockPropertyCollector
	// FormatSplitUserKeysMarked is a format major version that guarantees that
	// all files that share user keys with neighbors are marked for compaction
	// in the manifest. Ratcheting to FormatSplitUserKeysMarked will block
	// (without holding mutexes) until the scan of the LSM is complete and the
	// manifest has been rotated.
	FormatSplitUserKeysMarked
	// FormatMarkedCompacted is a format major version that guarantees that all
	// files explicitly marked for compaction in the manifest have been
	// compacted. Combined with the FormatSplitUserKeysMarked format major
	// version, this version guarantees that there are no user keys split across
	// multiple files within a level L1+. Ratcheting to this format version will
	// block (without holding mutexes) until all necessary compactions for files
	// marked for compaction are complete.
	FormatMarkedCompacted
	// FormatRangeKeys is a format major version that introduces range keys.
	FormatRangeKeys
	// FormatNewest always contains the most recent format major version.
	// NB: When adding new versions, the MaxTableFormat method should also be
	// updated to return the maximum allowable version for the new
	// FormatMajorVersion.
	FormatNewest FormatMajorVersion = FormatRangeKeys
)
func (FormatMajorVersion) MaxTableFormat ¶
func (v FormatMajorVersion) MaxTableFormat() sstable.TableFormat
MaxTableFormat returns the maximum sstable.TableFormat that can be used at this FormatMajorVersion.
func (FormatMajorVersion) String ¶
func (v FormatMajorVersion) String() string
String implements fmt.Stringer.
type IngestOperationStats ¶
type IngestOperationStats struct {
	// Bytes is the total bytes in the ingested sstables.
	Bytes uint64
	// ApproxIngestedIntoL0Bytes is the approximate number of bytes ingested
	// into L0.
	// Currently, this value is completely accurate, but we are allowing this to
	// be approximate once https://github.com/cockroachdb/pebble/issues/25 is
	// implemented.
	ApproxIngestedIntoL0Bytes uint64
}
IngestOperationStats provides some information about where in the LSM the bytes were ingested.
type InternalIntervalMetrics ¶
type InternalIntervalMetrics struct {
	// LogWriter metrics.
	LogWriter struct {
		// WriteThroughput is the WAL throughput.
		WriteThroughput ThroughputMetric
		// PendingBufferUtilization is the utilization of the WAL writer's
		// finite-sized pending blocks buffer. It provides an additional signal
		// regarding how close to "full" the WAL writer is. The value is in the
		// interval [0,1].
		PendingBufferUtilization float64
		// SyncQueueUtilization is the utilization of the WAL writer's
		// finite-sized queue of work that is waiting to sync. The value is in the
		// interval [0,1].
		SyncQueueUtilization float64
		// SyncLatencyMicros is a distribution of the fsync latency observed by
		// the WAL writer. It can be nil if there were no fsyncs.
		SyncLatencyMicros *hdrhistogram.Histogram
	}
	// Flush loop metrics.
	Flush struct {
		// WriteThroughput is the flushing throughput.
		WriteThroughput ThroughputMetric
	}
}
InternalIntervalMetrics exposes metrics about internal subsystems, that can be useful for deep observability purposes, and for higher-level admission control systems that are trying to estimate the capacity of the DB. These are experimental and subject to change, since they expose internal implementation details, so do not rely on these without discussion with the Pebble team. These represent the metrics over the interval of time from the last call to retrieve these metrics. These are not cumulative, unlike Metrics. The main challenge in making these cumulative is the hdrhistogram.Histogram, which does not have the ability to subtract a histogram from a preceding metric retrieval.
type InternalIteratorStats ¶
type InternalIteratorStats = base.InternalIteratorStats
InternalIteratorStats contains miscellaneous stats produced by internal iterators.
type InternalKey ¶
type InternalKey = base.InternalKey
InternalKey exports the base.InternalKey type.
type InternalKeyKind ¶
type InternalKeyKind = base.InternalKeyKind
InternalKeyKind exports the base.InternalKeyKind type.
type IterKeyType ¶
type IterKeyType int8
IterKeyType configures which types of keys an iterator should surface.
const (
	// IterKeyTypePointsOnly configures an iterator to iterate over point keys
	// only.
	IterKeyTypePointsOnly IterKeyType = iota
	// IterKeyTypeRangesOnly configures an iterator to iterate over range keys
	// only.
	IterKeyTypeRangesOnly
	// IterKeyTypePointsAndRanges configures an iterator to iterate over both
	// point keys and range keys simultaneously.
	IterKeyTypePointsAndRanges
)
type IterOptions ¶
type IterOptions struct {
	// LowerBound specifies the smallest key (inclusive) that the iterator will
	// return during iteration. If the iterator is seeked or iterated past this
	// boundary the iterator will return Valid()==false. Setting LowerBound
	// effectively truncates the key space visible to the iterator.
	LowerBound []byte
	// UpperBound specifies the largest key (exclusive) that the iterator will
	// return during iteration. If the iterator is seeked or iterated past this
	// boundary the iterator will return Valid()==false. Setting UpperBound
	// effectively truncates the key space visible to the iterator.
	UpperBound []byte
	// TableFilter can be used to filter the tables that are scanned during
	// iteration based on the user properties. Return true to scan the table and
	// false to skip scanning. This function must be thread-safe since the same
	// function can be used by multiple iterators, if the iterator is cloned.
	TableFilter func(userProps map[string]string) bool
	// PointKeyFilters can be used to avoid scanning tables and blocks in tables
	// when iterating over point keys. It requires that this slice is sorted in
	// increasing order of the BlockPropertyFilter.ShortID. This slice represents
	// an intersection across all filters, i.e., all filters must indicate that the
	// block is relevant.
	PointKeyFilters []BlockPropertyFilter
	// RangeKeyFilters can be used to avoid scanning tables and blocks in tables
	// when iterating over range keys. The same requirements that apply to
	// PointKeyFilters apply here too.
	RangeKeyFilters []BlockPropertyFilter
	// KeyTypes configures which types of keys to iterate over: point keys,
	// range keys, or both.
	KeyTypes IterKeyType
	// RangeKeyMasking can be used to enable automatic masking of point keys by
	// range keys. Range key masking is only supported during combined range key
	// and point key iteration mode (IterKeyTypePointsAndRanges).
	RangeKeyMasking RangeKeyMasking
	// OnlyReadGuaranteedDurable is an advanced option that is only supported by
	// the Reader implemented by DB. When set to true, only the guaranteed to be
	// durable state is visible in the iterator.
	// - This definition is made under the assumption that the FS implementation
	//   is providing a durability guarantee when data is synced.
	// - The visible state represents a consistent point in the history of the
	//   DB.
	// - The implementation is free to choose a conservative definition of what
	//   is guaranteed durable. For simplicity, the current implementation
	//   ignores memtables. A more sophisticated implementation could track the
	//   highest seqnum that is synced to the WAL and published and use that as
	//   the visible seqnum for an iterator. Note that the latter approach is
	//   not strictly better than the former since we can have DBs that are (a)
	//   synced more rarely than memtable flushes, (b) have no WAL. (a) is
	//   likely to be true in a future CockroachDB context where the DB
	//   containing the state machine may be rarely synced.
	// NB: this current implementation relies on the fact that memtables are
	// flushed in seqnum order, and any ingested sstables that happen to have a
	// lower seqnum than a non-flushed memtable don't have any overlapping keys.
	// This is the fundamental level invariant used in other code too, like when
	// merging iterators.
	//
	// Semantically, using this option provides the caller a "snapshot" as of
	// the time the most recent memtable was flushed. An alternate interface
	// would be to add a NewSnapshot variant. Creating a snapshot is heavier
	// weight than creating an iterator, so we have opted to support this
	// iterator option.
	OnlyReadGuaranteedDurable bool
	// UseL6Filters allows the caller to opt into reading filter blocks for L6
	// sstables. Helpful if a lot of SeekPrefixGEs are expected in quick
	// succession, that are also likely to not yield a single key. Filter blocks in
	// L6 can be relatively large, often larger than data blocks, so the benefit of
	// loading them in the cache is minimized if the probability of the key
	// existing is not low or if we just expect a one-time Seek (where loading the
	// data block directly is better).
	UseL6Filters bool
	// contains filtered or unexported fields
}
IterOptions hold the optional per-query parameters for NewIter.
Like Options, a nil *IterOptions is valid and means to use the default values.
func (*IterOptions) GetLowerBound ¶
func (o *IterOptions) GetLowerBound() []byte
GetLowerBound returns the LowerBound or nil if the receiver is nil.
func (*IterOptions) GetUpperBound ¶
func (o *IterOptions) GetUpperBound() []byte
GetUpperBound returns the UpperBound or nil if the receiver is nil.
type IterValidityState ¶
type IterValidityState int8
IterValidityState captures the state of the Iterator.
const ( // IterExhausted represents an Iterator that is exhausted. IterExhausted IterValidityState = iota // IterValid represents an Iterator that is valid. IterValid // IterAtLimit represents an Iterator that has a non-exhausted // internalIterator, but has reached a limit without any key for the // caller. IterAtLimit )
type Iterator ¶
type Iterator struct {
// contains filtered or unexported fields
}
Iterator iterates over a DB's key/value pairs in key order.
An iterator must be closed after use, but it is not necessary to read an iterator until exhaustion.
An iterator is not goroutine-safe, but it is safe to use multiple iterators concurrently, with each in a dedicated goroutine.
It is also safe to use an iterator concurrently with modifying its underlying DB, if that DB permits modification. However, the resultant key/value pairs are not guaranteed to be a consistent snapshot of that DB at a particular point in time.
If an iterator encounters an error during any operation, it is stored by the Iterator and surfaced through the Error method. All absolute positioning methods (eg, SeekLT, SeekGE, First, Last, etc) reset any accumulated error before positioning. All relative positioning methods (eg, Next, Prev) return without advancing if the iterator has an accumulated error.
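For example, a read loop typically checks the accumulated error once after iteration completes. The following sketch assumes db is an already-open *pebble.DB and uses illustrative bounds:

iter := db.NewIter(&pebble.IterOptions{
	LowerBound: []byte("a"), // illustrative bounds
	UpperBound: []byte("z"),
})
for iter.First(); iter.Valid(); iter.Next() {
	// iter.Key() and iter.Value() are only valid until the next positioning call.
	fmt.Printf("%s = %s\n", iter.Key(), iter.Value())
}
// Error reports any error accumulated while positioning the iterator.
if err := iter.Error(); err != nil {
	log.Fatal(err)
}
if err := iter.Close(); err != nil {
	log.Fatal(err)
}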
Example ¶
package main import ( "fmt" "log" "github.com/cockroachdb/pebble" "github.com/cockroachdb/pebble/vfs" ) func main() { db, err := pebble.Open("", &pebble.Options{FS: vfs.NewMem()}) if err != nil { log.Fatal(err) } keys := []string{"hello", "world", "hello world"} for _, key := range keys { if err := db.Set([]byte(key), nil, pebble.Sync); err != nil { log.Fatal(err) } } iter := db.NewIter(nil) for iter.First(); iter.Valid(); iter.Next() { fmt.Printf("%s\n", iter.Key()) } if err := iter.Close(); err != nil { log.Fatal(err) } if err := db.Close(); err != nil { log.Fatal(err) } }
Output: hello hello world world
Example (PrefixIteration) ¶
package main import ( "fmt" "log" "github.com/cockroachdb/pebble" "github.com/cockroachdb/pebble/vfs" ) func main() { db, err := pebble.Open("", &pebble.Options{FS: vfs.NewMem()}) if err != nil { log.Fatal(err) } keyUpperBound := func(b []byte) []byte { end := make([]byte, len(b)) copy(end, b) for i := len(end) - 1; i >= 0; i-- { end[i] = end[i] + 1 if end[i] != 0 { return end[:i+1] } } return nil // no upper-bound } prefixIterOptions := func(prefix []byte) *pebble.IterOptions { return &pebble.IterOptions{ LowerBound: prefix, UpperBound: keyUpperBound(prefix), } } keys := []string{"hello", "world", "hello world"} for _, key := range keys { if err := db.Set([]byte(key), nil, pebble.Sync); err != nil { log.Fatal(err) } } iter := db.NewIter(prefixIterOptions([]byte("hello"))) for iter.First(); iter.Valid(); iter.Next() { fmt.Printf("%s\n", iter.Key()) } if err := iter.Close(); err != nil { log.Fatal(err) } if err := db.Close(); err != nil { log.Fatal(err) } }
Output: hello hello world
func NewExternalIter ¶
func NewExternalIter( o *Options, iterOpts *IterOptions, files []sstable.ReadableFile, extraReaderOpts ...sstable.ReaderOption, ) (it *Iterator, err error)
NewExternalIter takes an input set of sstable files which may overlap arbitrarily and returns an Iterator over the merged contents of the sstables. Input sstables may contain point keys, range keys, range deletions, etc. The input files slice must be sorted in reverse chronological ordering. A key in a file at a lower index will shadow a key with an identical user key contained within a file at a higher index.
Input sstables must only contain keys with the zero sequence number.
Iterators constructed through NewExternalIter do not support all iterator options, including block-property and table filters. NewExternalIter errors if an incompatible option is set.
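As a rough sketch, two pre-built sstables could be merged like this. The file names are hypothetical, the files must contain only zero-seqnum keys, and a vfs.File satisfies sstable.ReadableFile (the sstable package here is github.com/cockroachdb/pebble/sstable):

// "new.sst" is listed first, so its keys shadow identical user keys in "old.sst".
newFile, err := vfs.Default.Open("new.sst")
if err != nil {
	log.Fatal(err)
}
oldFile, err := vfs.Default.Open("old.sst")
if err != nil {
	log.Fatal(err)
}
opts := (&pebble.Options{}).EnsureDefaults()
iter, err := pebble.NewExternalIter(
	opts,
	&pebble.IterOptions{},
	[]sstable.ReadableFile{newFile, oldFile}, // reverse chronological order
)
if err != nil {
	log.Fatal(err)
}
for iter.First(); iter.Valid(); iter.Next() {
	fmt.Printf("%s\n", iter.Key())
}
if err := iter.Close(); err != nil {
	log.Fatal(err)
}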
func (*Iterator) Clone ¶
Clone creates a new Iterator over the same underlying data, i.e., over the same {batch, memtables, sstables}. It starts with the same IterOptions but is not positioned.
When called on an Iterator over an indexed batch, the clone inherits the iterator's current (possibly stale) view of the batch. Callers may call SetOptions to refresh the clone's view to include all batch mutations.
Callers can use Clone if they need multiple iterators that need to see exactly the same underlying state of the DB. This should not be used to extend the lifetime of the data backing the original Iterator since that will cause an increase in memory and disk usage (use NewSnapshot for that purpose).
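For instance, a clone lets a second goroutine scan exactly the same view without taking a snapshot. This sketch assumes iter is an existing open Iterator and that Clone returns (*Iterator, error):

clone, err := iter.Clone()
if err != nil {
	log.Fatal(err)
}
// The clone shares the underlying data but is positioned independently and
// must be closed separately.
done := make(chan struct{})
go func() {
	defer close(done)
	defer clone.Close()
	for clone.First(); clone.Valid(); clone.Next() {
		// Forward scan over the same view in a separate goroutine.
	}
}()
for iter.Last(); iter.Valid(); iter.Prev() {
	// Reverse scan on the original iterator.
}
<-done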
func (*Iterator) Close ¶
Close closes the iterator and returns any accumulated error. Exhausting all the key/value pairs in a table is not considered to be an error. It is not valid to call any method, including Close, after the iterator has been closed.
func (*Iterator) First ¶
First moves the iterator to the first key/value pair. Returns true if the iterator is pointing at a valid entry and false otherwise.
func (*Iterator) HasPointAndRange ¶
HasPointAndRange indicates whether there exists a point key, a range key or both at the current iterator position.
func (*Iterator) Key ¶
Key returns the key of the current key/value pair, or nil if done. The caller should not modify the contents of the returned slice, and its contents may change on the next call to Next.
func (*Iterator) Last ¶
Last moves the iterator to the last key/value pair. Returns true if the iterator is pointing at a valid entry and false otherwise.
func (*Iterator) Metrics ¶
func (i *Iterator) Metrics() IteratorMetrics
Metrics returns per-iterator metrics.
func (*Iterator) Next ¶
Next moves the iterator to the next key/value pair. Returns true if the iterator is pointing at a valid entry and false otherwise.
func (*Iterator) NextWithLimit ¶
func (i *Iterator) NextWithLimit(limit []byte) IterValidityState
NextWithLimit moves the iterator to the next key/value pair.
If limit is provided, it serves as a best-effort exclusive limit. If the next key is greater than or equal to limit, the Iterator may pause and return IterAtLimit. Because limits are best-effort, NextWithLimit may return a key beyond limit.
If the Iterator is configured to iterate over range keys, NextWithLimit guarantees it will surface any range keys with bounds overlapping the keyspace up to limit.
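A sketch of paginated forward iteration driven by the returned IterValidityState, assuming iter is an open Iterator; the start key and page boundary are illustrative:

limit := []byte("m") // hypothetical page boundary
validity := iter.SeekGEWithLimit([]byte("a"), limit)
for validity == pebble.IterValid {
	// Only keys below the (best-effort) limit are surfaced here.
	fmt.Printf("%s\n", iter.Key())
	validity = iter.NextWithLimit(limit)
}
switch validity {
case pebble.IterAtLimit:
	// Paused at the limit; iteration can resume later with a larger limit.
case pebble.IterExhausted:
	// No further keys within the iterator's bounds.
}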
func (*Iterator) Prev ¶
Prev moves the iterator to the previous key/value pair. Returns true if the iterator is pointing at a valid entry and false otherwise.
func (*Iterator) PrevWithLimit ¶
func (i *Iterator) PrevWithLimit(limit []byte) IterValidityState
PrevWithLimit moves the iterator to the previous key/value pair.
If limit is provided, it serves as a best-effort inclusive limit. If the previous key is less than limit, the Iterator may pause and return IterAtLimit. Because limits are best-effort, PrevWithLimit may return a key beyond limit.
If the Iterator is configured to iterate over range keys, PrevWithLimit guarantees it will surface any range keys with bounds overlapping the keyspace up to limit.
func (*Iterator) RangeBounds ¶
RangeBounds returns the start (inclusive) and end (exclusive) bounds of the range key covering the current iterator position. RangeBounds returns nil bounds if there is no range key covering the current iterator position, or the iterator is not configured to surface range keys.
func (*Iterator) RangeKeys ¶
func (i *Iterator) RangeKeys() []RangeKeyData
RangeKeys returns the range key values and their suffixes covering the current iterator position. The range bounds may be retrieved separately through Iterator.RangeBounds().
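Putting HasPointAndRange, RangeBounds, and RangeKeys together, a combined iteration loop might look like the sketch below, assuming iter was opened with KeyTypes set to IterKeyTypePointsAndRanges and that RangeKeyData carries Suffix and Value fields:

for iter.First(); iter.Valid(); iter.Next() {
	hasPoint, hasRange := iter.HasPointAndRange()
	if hasPoint {
		fmt.Printf("point: %s = %s\n", iter.Key(), iter.Value())
	}
	if hasRange {
		start, end := iter.RangeBounds()
		for _, rk := range iter.RangeKeys() {
			fmt.Printf("range: [%s, %s) suffix=%s value=%s\n",
				start, end, rk.Suffix, rk.Value)
		}
	}
}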
func (*Iterator) SeekGE ¶
SeekGE moves the iterator to the first key/value pair whose key is greater than or equal to the given key. Returns true if the iterator is pointing at a valid entry and false otherwise.
Example ¶
package main import ( "fmt" "log" "github.com/cockroachdb/pebble" "github.com/cockroachdb/pebble/vfs" ) func main() { db, err := pebble.Open("", &pebble.Options{FS: vfs.NewMem()}) if err != nil { log.Fatal(err) } keys := []string{"hello", "world", "hello world"} for _, key := range keys { if err := db.Set([]byte(key), nil, pebble.Sync); err != nil { log.Fatal(err) } } iter := db.NewIter(nil) if iter.SeekGE([]byte("a")); iter.Valid() { fmt.Printf("%s\n", iter.Key()) } if iter.SeekGE([]byte("hello w")); iter.Valid() { fmt.Printf("%s\n", iter.Key()) } if iter.SeekGE([]byte("w")); iter.Valid() { fmt.Printf("%s\n", iter.Key()) } if err := iter.Close(); err != nil { log.Fatal(err) } if err := db.Close(); err != nil { log.Fatal(err) } }
Output: hello hello world world
func (*Iterator) SeekGEWithLimit ¶
func (i *Iterator) SeekGEWithLimit(key []byte, limit []byte) IterValidityState
SeekGEWithLimit moves the iterator to the first key/value pair whose key is greater than or equal to the given key.
If limit is provided, it serves as a best-effort exclusive limit. If the first key greater than or equal to the given search key is also greater than or equal to limit, the Iterator may pause and return IterAtLimit. Because limits are best-effort, SeekGEWithLimit may return a key beyond limit.
If the Iterator is configured to iterate over range keys, SeekGEWithLimit guarantees it will surface any range keys with bounds overlapping the keyspace [key, limit).
func (*Iterator) SeekLT ¶
SeekLT moves the iterator to the last key/value pair whose key is less than the given key. Returns true if the iterator is pointing at a valid entry and false otherwise.
func (*Iterator) SeekLTWithLimit ¶
func (i *Iterator) SeekLTWithLimit(key []byte, limit []byte) IterValidityState
SeekLTWithLimit moves the iterator to the last key/value pair whose key is less than the given key.
If limit is provided, it serves as a best-effort inclusive limit. If the last key less than the given search key is also less than limit, the Iterator may pause and return IterAtLimit. Because limits are best-effort, SeekLTWithLimit may return a key beyond limit.
If the Iterator is configured to iterate over range keys, SeekLTWithLimit guarantees it will surface any range keys with bounds overlapping the keyspace up to limit.
func (*Iterator) SeekPrefixGE ¶
SeekPrefixGE moves the iterator to the first key/value pair whose key is greater than or equal to the given key and which has the same "prefix" as the given key. The prefix for a key is determined by the user-defined Comparer.Split function. The iterator will not observe keys not matching the "prefix" of the search key. Calling SeekPrefixGE puts the iterator in prefix iteration mode. The iterator remains in prefix iteration until a subsequent call to another absolute positioning method (SeekGE, SeekLT, First, Last). Reverse iteration (Prev) is not supported when an iterator is in prefix iteration mode. Returns true if the iterator is pointing at a valid entry and false otherwise.
The semantics of SeekPrefixGE are slightly unusual and designed for iteration to be able to take advantage of bloom filters that have been created on the "prefix". If you're not using bloom filters, there is no reason to use SeekPrefixGE.
An example Split function may separate a timestamp suffix from the prefix of the key.
Split(<key>@<timestamp>) -> <key>
Consider the keys "a@1", "a@2", "aa@3", "aa@4". The prefixes for these keys are "a", and "aa". Note that despite "a" and "aa" sharing a prefix by the usual definition, those prefixes differ by the definition of the Split function. To see how this works, consider the following set of calls on this data set:
SeekPrefixGE("a@0") -> "a@1" Next() -> "a@2" Next() -> EOF
If you're just looking to iterate over keys with a shared prefix, as defined by the configured comparer, set iterator bounds instead:
iter := db.NewIter(&pebble.IterOptions{
	LowerBound: []byte("prefix"),
	UpperBound: []byte("prefiy"),
})
for iter.First(); iter.Valid(); iter.Next() {
	// Only keys beginning with "prefix" will be visited.
}
See ExampleIterator_SeekPrefixGE for a working example.
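The sketch below shows one way to wire this up: a copy of the default Comparer with a Split function that treats everything before '@' as the prefix, plus a prefix bloom filter (bloom is github.com/cockroachdb/pebble/bloom). The key encoding, directory name, and filter parameter are illustrative only:

comparer := *pebble.DefaultComparer
comparer.Split = func(key []byte) int {
	// Treat "<key>@<timestamp>" as having the prefix "<key>".
	if i := bytes.IndexByte(key, '@'); i >= 0 {
		return i
	}
	return len(key)
}
db, err := pebble.Open("demo", &pebble.Options{
	Comparer: &comparer,
	Levels: []pebble.LevelOptions{
		{FilterPolicy: bloom.FilterPolicy(10)}, // bloom filters built over Split prefixes
	},
})
if err != nil {
	log.Fatal(err)
}
iter := db.NewIter(nil)
// Visits only keys whose Split-defined prefix equals "a".
for valid := iter.SeekPrefixGE([]byte("a@0")); valid; valid = iter.Next() {
	fmt.Printf("%s\n", iter.Key())
}
if err := iter.Close(); err != nil {
	log.Fatal(err)
}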
func (*Iterator) SetBounds ¶
SetBounds sets the lower and upper bounds for the iterator. Once SetBounds returns, the caller is free to mutate the provided slices.
The iterator will always be invalidated and must be repositioned with a call to SeekGE, SeekPrefixGE, SeekLT, First, or Last.
func (*Iterator) SetOptions ¶
func (i *Iterator) SetOptions(o *IterOptions)
SetOptions sets new iterator options for the iterator. Note that the lower and upper bounds applied here will supersede any bounds set by previous calls to SetBounds.
Note that the slices provided in this SetOptions must not be changed by the caller until the iterator is closed, or a subsequent SetBounds or SetOptions has returned. This is because comparisons between the existing and new bounds are sometimes used to optimize seeking. See the extended commentary on SetBounds.
If the iterator was created over an indexed mutable batch, the iterator's view of the mutable batch is refreshed.
The iterator will always be invalidated and must be repositioned with a call to SeekGE, SeekPrefixGE, SeekLT, First, or Last.
If only lower and upper bounds need to be modified, prefer SetBounds.
func (*Iterator) Valid ¶
Valid returns true if the iterator is positioned at a valid key/value pair and false otherwise.
type IteratorMetrics ¶
type IteratorMetrics struct { // The read amplification experienced by this iterator. This is the sum of // the memtables, the L0 sublevels and the non-empty Ln levels. Higher read // amplification generally results in slower reads, though allowing higher // read amplification can also result in faster writes. ReadAmp int }
IteratorMetrics holds per-iterator metrics. These do not change over the lifetime of the iterator.
type IteratorStats ¶
type IteratorStats struct { // ForwardSeekCount includes SeekGE, SeekPrefixGE, First. ForwardSeekCount [NumStatsKind]int // ReverseSeek includes SeekLT, Last. ReverseSeekCount [NumStatsKind]int // ForwardStepCount includes Next. ForwardStepCount [NumStatsKind]int // ReverseStepCount includes Prev. ReverseStepCount [NumStatsKind]int InternalStats InternalIteratorStats }
IteratorStats contains iteration stats.
func (*IteratorStats) SafeFormat ¶
func (stats *IteratorStats) SafeFormat(s redact.SafePrinter, verb rune)
SafeFormat implements the redact.SafeFormatter interface.
func (*IteratorStats) String ¶
func (stats *IteratorStats) String() string
type IteratorStatsKind ¶
type IteratorStatsKind int8
IteratorStatsKind describes the two kinds of iterator stats.
const ( // InterfaceCall represents calls to Iterator. InterfaceCall IteratorStatsKind = iota // InternalIterCall represents calls by Iterator to its internalIterator. InternalIterCall // NumStatsKind is the number of kinds, and is used for array sizing. NumStatsKind )
type LevelInfo ¶
LevelInfo contains info pertaining to a particular level.
func (LevelInfo) SafeFormat ¶
func (i LevelInfo) SafeFormat(w redact.SafePrinter, _ rune)
SafeFormat implements redact.SafeFormatter.
type LevelMetrics ¶
type LevelMetrics struct { // The number of sublevels within the level. The sublevel count corresponds // to the read amplification for the level. An empty level will have a // sublevel count of 0, implying no read amplification. Only L0 will have // a sublevel count other than 0 or 1. Sublevels int32 // The total number of files in the level. NumFiles int64 // The total size in bytes of the files in the level. Size int64 // The level's compaction score. Score float64 // The number of incoming bytes from other levels read during // compactions. This excludes bytes moved and bytes ingested. For L0 this is // the bytes written to the WAL. BytesIn uint64 // The number of bytes ingested. The sibling metric for tables is // TablesIngested. BytesIngested uint64 // The number of bytes moved into the level by a "move" compaction. The // sibling metric for tables is TablesMoved. BytesMoved uint64 // The number of bytes read for compactions at the level. This includes bytes // read from other levels (BytesIn), as well as bytes read for the level. BytesRead uint64 // The number of bytes written during compactions. The sibling // metric for tables is TablesCompacted. This metric may be summed // with BytesFlushed to compute the total bytes written for the level. BytesCompacted uint64 // The number of bytes written during flushes. The sibling // metrics for tables is TablesFlushed. This metric is always // zero for all levels other than L0. BytesFlushed uint64 // The number of sstables compacted to this level. TablesCompacted uint64 // The number of sstables flushed to this level. TablesFlushed uint64 // The number of sstables ingested into the level. TablesIngested uint64 // The number of sstables moved to this level by a "move" compaction. TablesMoved uint64 }
LevelMetrics holds per-level metrics such as the number of files and total size of the files, and compaction related metrics.
func (*LevelMetrics) Add ¶
func (m *LevelMetrics) Add(u *LevelMetrics)
Add updates the counter metrics for the level.
func (*LevelMetrics) WriteAmp ¶
func (m *LevelMetrics) WriteAmp() float64
WriteAmp computes the write amplification for compactions at this level. Computed as (BytesFlushed + BytesCompacted) / BytesIn.
type LevelOptions ¶
type LevelOptions struct { // BlockRestartInterval is the number of keys between restart points // for delta encoding of keys. // // The default value is 16. BlockRestartInterval int // BlockSize is the target uncompressed size in bytes of each table block. // // The default value is 4096. BlockSize int // BlockSizeThreshold finishes a block if the block size is larger than the // specified percentage of the target block size and adding the next entry // would cause the block to be larger than the target block size. // // The default value is 90 BlockSizeThreshold int // Compression defines the per-block compression to use. // // The default value (DefaultCompression) uses snappy compression. Compression Compression // FilterPolicy defines a filter algorithm (such as a Bloom filter) that can // reduce disk reads for Get calls. // // One such implementation is bloom.FilterPolicy(10) from the pebble/bloom // package. // // The default value means to use no filter. FilterPolicy FilterPolicy // FilterType defines whether an existing filter policy is applied at a // block-level or table-level. Block-level filters use less memory to create, // but are slower to access as a check for the key in the index must first be // performed to locate the filter block. A table-level filter will require // memory proportional to the number of keys in an sstable to create, but // avoids the index lookup when determining if a key is present. Table-level // filters should be preferred except under constrained memory situations. FilterType FilterType // IndexBlockSize is the target uncompressed size in bytes of each index // block. When the index block size is larger than this target, two-level // indexes are automatically enabled. Setting this option to a large value // (such as math.MaxInt32) disables the automatic creation of two-level // indexes. // // The default value is the value of BlockSize. IndexBlockSize int // The target file size for the level. TargetFileSize int64 }
LevelOptions holds the optional per-level parameters.
func (*LevelOptions) EnsureDefaults ¶
func (o *LevelOptions) EnsureDefaults() *LevelOptions
EnsureDefaults ensures that the default values for all of the options have been initialized. It is valid to call EnsureDefaults on a nil receiver. A non-nil result will always be returned.
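As an illustration, per-level defaults might be configured like this before opening a DB; the sizes and filter settings are example values, not recommendations:

opts := &pebble.Options{
	Levels: []pebble.LevelOptions{{
		BlockSize:      4096,
		TargetFileSize: 2 << 20, // 2 MB target for the first configured level
		FilterPolicy:   bloom.FilterPolicy(10),
		FilterType:     pebble.TableFilter,
	}},
}
// EnsureDefaults fills in every option left at its zero value; the options
// for the last configured level apply to all subsequent levels.
opts.EnsureDefaults()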
type Logger ¶
type Logger interface { Infof(format string, args ...interface{}) Fatalf(format string, args ...interface{}) }
Logger defines an interface for writing log messages.
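Any type providing these two methods can be supplied via Options.Logger. A minimal sketch using the standard library's log package, with an illustrative prefix:

type prefixLogger struct {
	prefix string
}

func (l prefixLogger) Infof(format string, args ...interface{}) {
	log.Printf(l.prefix+format, args...)
}

func (l prefixLogger) Fatalf(format string, args ...interface{}) {
	log.Fatalf(l.prefix+format, args...)
}

// opts := &pebble.Options{Logger: prefixLogger{prefix: "[pebble] "}}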
type ManifestCreateInfo ¶
type ManifestCreateInfo struct { // JobID is the ID of the job that caused the manifest to be created. JobID int Path string // The file number of the new Manifest. FileNum FileNum Err error }
ManifestCreateInfo contains info about a manifest creation event.
func (ManifestCreateInfo) SafeFormat ¶
func (i ManifestCreateInfo) SafeFormat(w redact.SafePrinter, _ rune)
SafeFormat implements redact.SafeFormatter.
func (ManifestCreateInfo) String ¶
func (i ManifestCreateInfo) String() string
type ManifestDeleteInfo ¶
type ManifestDeleteInfo struct { // JobID is the ID of the job that caused the Manifest to be deleted. JobID int Path string FileNum FileNum Err error }
ManifestDeleteInfo contains the info for a Manifest deletion event.
func (ManifestDeleteInfo) SafeFormat ¶
func (i ManifestDeleteInfo) SafeFormat(w redact.SafePrinter, _ rune)
SafeFormat implements redact.SafeFormatter.
func (ManifestDeleteInfo) String ¶
func (i ManifestDeleteInfo) String() string
type Metrics ¶
type Metrics struct { BlockCache CacheMetrics Compact struct { // The total number of compactions, and per-compaction type counts. Count int64 DefaultCount int64 DeleteOnlyCount int64 ElisionOnlyCount int64 MoveCount int64 ReadCount int64 RewriteCount int64 MultiLevelCount int64 // An estimate of the number of bytes that need to be compacted for the LSM // to reach a stable state. EstimatedDebt uint64 // Number of bytes present in sstables being written by in-progress // compactions. This value will be zero if there are no in-progress // compactions. InProgressBytes int64 // Number of compactions that are in-progress. NumInProgress int64 // MarkedFiles is a count of files that are marked for // compaction. Such files are compacted in a rewrite compaction // when no other compactions are picked. MarkedFiles int } Flush struct { // The total number of flushes. Count int64 } Filter FilterMetrics Levels [numLevels]LevelMetrics MemTable struct { // The number of bytes allocated by memtables and large (flushable) // batches. Size uint64 // The count of memtables. Count int64 // The number of bytes present in zombie memtables which are no longer // referenced by the current DB state but are still in use by an iterator. ZombieSize uint64 // The count of zombie memtables. ZombieCount int64 } Snapshots struct { // The number of currently open snapshots. Count int // The sequence number of the earliest, currently open snapshot. EarliestSeqNum uint64 } Table struct { // The number of bytes present in obsolete tables which are no longer // referenced by the current DB state or any open iterators. ObsoleteSize uint64 // The count of obsolete tables. ObsoleteCount int64 // The number of bytes present in zombie tables which are no longer // referenced by the current DB state but are still in use by an iterator. ZombieSize uint64 // The count of zombie tables. ZombieCount int64 } TableCache CacheMetrics // Count of the number of open sstable iterators. TableIters int64 WAL struct { // Number of live WAL files. Files int64 // Number of obsolete WAL files. ObsoleteFiles int64 // Physical size of the obsolete WAL files. ObsoletePhysicalSize uint64 // Size of the live data in the WAL files. Note that with WAL file // recycling this is less than the actual on-disk size of the WAL files. Size uint64 // Physical size of the WAL files on-disk. With WAL file recycling, // this is greater than the live data in WAL files. PhysicalSize uint64 // Number of logical bytes written to the WAL. BytesIn uint64 // Number of bytes written to the WAL. BytesWritten uint64 } // contains filtered or unexported fields }
Metrics holds metrics for various subsystems of the DB such as the Cache, Compactions, WAL, and per-Level metrics.
TODO(peter): The testing of these metrics is relatively weak. There should be testing that performs various operations on a DB and verifies that the metrics reflect those operations.
func (*Metrics) DiskSpaceUsage ¶
DiskSpaceUsage returns the total disk space used by the database in bytes, including live and obsolete files.
func (*Metrics) ReadAmp ¶
ReadAmp returns the current read amplification of the database. It's computed as the number of sublevels in L0 + the number of non-empty levels below L0.
func (*Metrics) SafeFormat ¶
func (m *Metrics) SafeFormat(w redact.SafePrinter, _ rune)
SafeFormat implements redact.SafeFormatter.
func (*Metrics) String ¶
String pretty-prints the metrics, showing a line for the WAL, a line per-level, and a total:
__level_____count____size___score______in__ingest(sz_cnt)____move(sz_cnt)___write(sz_cnt)____read___w-amp WAL 1 27 B - 48 B - - - - 108 B - - 2.2 0 2 1.6 K 0.50 81 B 825 B 1 0 B 0 2.4 K 3 0 B 30.6 1 0 0 B 0.00 0 B 0 B 0 0 B 0 0 B 0 0 B 0.0 2 0 0 B 0.00 0 B 0 B 0 0 B 0 0 B 0 0 B 0.0 3 0 0 B 0.00 0 B 0 B 0 0 B 0 0 B 0 0 B 0.0 4 0 0 B 0.00 0 B 0 B 0 0 B 0 0 B 0 0 B 0.0 5 0 0 B 0.00 0 B 0 B 0 0 B 0 0 B 0 0 B 0.0 6 1 825 B 0.00 1.6 K 0 B 0 0 B 0 825 B 1 1.6 K 0.5 total 3 2.4 K - 933 B 825 B 1 0 B 0 4.1 K 4 1.6 K 4.5 flush 3 compact 1 1.6 K 0 B 1 (size == estimated-debt, score = in-progress-bytes, in = num-in-progress) ctype 0 0 0 0 0 (default, delete, elision, move, read) memtbl 1 4.0 M zmemtbl 0 0 B ztbl 0 0 B bcache 4 752 B 7.7% (score == hit-rate) tcache 0 0 B 0.0% (score == hit-rate)
snapshots 0 0 (score == earliest seq num)
titers 0 filter - - 0.0% (score == utility)
The WAL "in" metric is the size of the batches written to the WAL. The WAL "write" metric is the size of the physical data written to the WAL which includes record fragment overhead. Write amplification is computed as bytes-written / bytes-in, except for the total row where bytes-in is replaced with WAL-bytes-written + bytes-ingested.
func (*Metrics) Total ¶
func (m *Metrics) Total() LevelMetrics
Total returns the sum of the per-level metrics and WAL metrics.
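A short sketch of reading a few of these metrics at runtime, assuming db is an open *pebble.DB and that the DB exposes a Metrics() *Metrics accessor:

m := db.Metrics()
fmt.Printf("read-amp: %d\n", m.ReadAmp())
fmt.Printf("disk usage: %d bytes\n", m.DiskSpaceUsage())
fmt.Printf("compaction debt: %d bytes\n", m.Compact.EstimatedDebt)
fmt.Printf("memtables: %d (%d bytes)\n", m.MemTable.Count, m.MemTable.Size)
// String produces the multi-line summary shown above.
fmt.Println(m)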
type Options ¶
type Options struct { // Sync sstables periodically in order to smooth out writes to disk. This // option does not provide any persistency guarantee, but is used to avoid // latency spikes if the OS automatically decides to write out a large chunk // of dirty filesystem buffers. This option only controls SSTable syncs; WAL // syncs are controlled by WALBytesPerSync. // // The default value is 512KB. BytesPerSync int // Cache is used to cache uncompressed blocks from sstables. // // The default cache size is 8 MB. Cache *cache.Cache // Cleaner cleans obsolete files. // // The default cleaner uses the DeleteCleaner. Cleaner Cleaner // Comparer defines a total ordering over the space of []byte keys: a 'less // than' relationship. The same comparison algorithm must be used for reads // and writes over the lifetime of the DB. // // The default value uses the same ordering as bytes.Compare. Comparer *Comparer // DebugCheck is invoked, if non-nil, whenever a new version is being // installed. Typically, this is set to pebble.DebugCheckLevels in tests // or tools only, to check invariants over all the data in the database. DebugCheck func(*DB) error // Disable the write-ahead log (WAL). Disabling the write-ahead log prohibits // crash recovery, but can improve performance if crash recovery is not // needed (e.g. when only temporary state is being stored in the database). // // TODO(peter): untested DisableWAL bool // ErrorIfExists is whether it is an error if the database already exists. // // The default value is false. ErrorIfExists bool // ErrorIfNotExists is whether it is an error if the database does not // already exist. // // The default value is false which will cause a database to be created if it // does not already exist. ErrorIfNotExists bool // EventListener provides hooks to listening to significant DB events such as // flushes, compactions, and table deletion. EventListener EventListener // Experimental contains experimental options which are off by default. // These options are temporary and will eventually either be deleted, moved // out of the experimental group, or made the non-adjustable default. These // options may change at any time, so do not rely on them. Experimental struct { // The threshold of L0 read-amplification at which compaction concurrency // is enabled (if CompactionDebtConcurrency was not already exceeded). // Every multiple of this value enables another concurrent // compaction up to MaxConcurrentCompactions. L0CompactionConcurrency int // CompactionDebtConcurrency controls the threshold of compaction debt // at which additional compaction concurrency slots are added. For every // multiple of this value in compaction debt bytes, an additional // concurrent compaction is added. This works "on top" of // L0CompactionConcurrency, so the higher of the count of compaction // concurrency slots as determined by the two options is chosen. CompactionDebtConcurrency int // DeleteRangeFlushDelay configures how long the database should wait // before forcing a flush of a memtable that contains a range // deletion. Disk space cannot be reclaimed until the range deletion // is flushed. No automatic flush occurs if zero. DeleteRangeFlushDelay time.Duration // MinDeletionRate is the minimum number of bytes per second that would // be deleted. Deletion pacing is used to slow down deletions when // compactions finish up or readers close, and newly-obsolete files need // cleaning up. 
Deleting lots of files at once can cause disk latency to // go up on some SSDs, which this functionality guards against. This is a // minimum as the maximum is theoretically unlimited; pacing is disabled // when there are too many obsolete files relative to live bytes, or // there isn't enough disk space available. Setting this to 0 disables // deletion pacing, which is also the default. MinDeletionRate int // ReadCompactionRate controls the frequency of read triggered // compactions by adjusting `AllowedSeeks` in manifest.FileMetadata: // // AllowedSeeks = FileSize / ReadCompactionRate // // From LevelDB: // “` // We arrange to automatically compact this file after // a certain number of seeks. Let's assume: // (1) One seek costs 10ms // (2) Writing or reading 1MB costs 10ms (100MB/s) // (3) A compaction of 1MB does 25MB of IO: // 1MB read from this level // 10-12MB read from next level (boundaries may be misaligned) // 10-12MB written to next level // This implies that 25 seeks cost the same as the compaction // of 1MB of data. I.e., one seek costs approximately the // same as the compaction of 40KB of data. We are a little // conservative and allow approximately one seek for every 16KB // of data before triggering a compaction. // “` ReadCompactionRate int64 // ReadSamplingMultiplier is a multiplier for the readSamplingPeriod in // iterator.maybeSampleRead() to control the frequency of read sampling // to trigger a read triggered compaction. A value of -1 prevents sampling // and disables read triggered compactions. The default is 1 << 4. which // gets multiplied with a constant of 1 << 16 to yield 1 << 20 (1MB). ReadSamplingMultiplier int64 // TableCacheShards is the number of shards per table cache. // Reducing the value can reduce the number of idle goroutines per DB // instance which can be useful in scenarios with a lot of DB instances // and a large number of CPUs, but doing so can lead to higher contention // in the table cache and reduced performance. // // The default value is the number of logical CPUs, which can be // limited by runtime.GOMAXPROCS. TableCacheShards int // KeyValidationFunc is a function to validate a user key in an SSTable. // // Currently, this function is used to validate the smallest and largest // keys in an SSTable undergoing compaction. In this case, returning an // error from the validation function will result in a panic at runtime, // given that there is rarely any way of recovering from malformed keys // present in compacted files. By default, validation is not performed. // // Additional use-cases may be added in the future. // // NOTE: callers should take care to not mutate the key being validated. KeyValidationFunc func(userKey []byte) error // ValidateOnIngest schedules validation of sstables after they have // been ingested. // // By default, this value is false. ValidateOnIngest bool // MultiLevelCompaction allows the compaction of SSTs from more than two // levels iff a conventional two level compaction will quickly trigger a // compaction in the output level. MultiLevelCompaction bool // MaxWriterConcurrency is used to indicate the maximum number of // compression workers the compression queue is allowed to use. If // MaxWriterConcurrency > 0, then the Writer will use parallelism, to // compress and write blocks to disk. Otherwise, the writer will // compress and write blocks to disk synchronously. MaxWriterConcurrency int // ForceWriterParallelism is used to force parallelism in the sstable // Writer for the metamorphic tests. 
Even with the MaxWriterConcurrency // option set, we only enable parallelism in the sstable Writer if there // is enough CPU available, and this option bypasses that. ForceWriterParallelism bool // CPUWorkPermissionGranter should be set if Pebble should be given the // ability to optionally schedule additional CPU. See the documentation // for CPUWorkPermissionGranter for more details. CPUWorkPermissionGranter CPUWorkPermissionGranter } // Filters is a map from filter policy name to filter policy. It is used for // debugging tools which may be used on multiple databases configured with // different filter policies. It is not necessary to populate this filters // map during normal usage of a DB. Filters map[string]FilterPolicy // FlushSplitBytes denotes the target number of bytes per sublevel in // each flush split interval (i.e. range between two flush split keys) // in L0 sstables. When set to zero, only a single sstable is generated // by each flush. When set to a non-zero value, flushes are split at // points to meet L0's TargetFileSize, any grandparent-related overlap // options, and at boundary keys of L0 flush split intervals (which are // targeted to contain around FlushSplitBytes bytes in each sublevel // between pairs of boundary keys). Splitting sstables during flush // allows increased compaction flexibility and concurrency when those // tables are compacted to lower levels. FlushSplitBytes int64 // FormatMajorVersion sets the format of on-disk files. It is // recommended to set the format major version to an explicit // version, as the default may change over time. // // At Open if the existing database is formatted using a later // format major version that is known to this version of Pebble, // Pebble will continue to use the later format major version. If // the existing database's version is unknown, the caller may use // FormatMostCompatible and will be able to open the database // regardless of its actual version. // // If the existing database is formatted using a format major // version earlier than the one specified, Open will automatically // ratchet the database to the specified format major version. FormatMajorVersion FormatMajorVersion // FS provides the interface for persistent file storage. // // The default value uses the underlying operating system's file system. FS vfs.FS // The count of L0 files necessary to trigger an L0 compaction. L0CompactionFileThreshold int // The amount of L0 read-amplification necessary to trigger an L0 compaction. L0CompactionThreshold int // Hard limit on L0 read-amplification, computed as the number of L0 // sublevels. Writes are stopped when this threshold is reached. L0StopWritesThreshold int // The maximum number of bytes for LBase. The base level is the level which // L0 is compacted into. The base level is determined dynamically based on // the existing data in the LSM. The maximum number of bytes for other levels // is computed dynamically based on the base level's maximum size. When the // maximum number of bytes for a level is exceeded, compaction is requested. LBaseMaxBytes int64 // Per-level options. Options for at least one level must be specified. The // options for the last level are used for all subsequent levels. Levels []LevelOptions // Logger used to write log messages. // // The default logger uses the Go standard library log package. Logger Logger // MaxManifestFileSize is the maximum size the MANIFEST file is allowed to // become. 
When the MANIFEST exceeds this size it is rolled over and a new // MANIFEST is created. MaxManifestFileSize int64 // MaxOpenFiles is a soft limit on the number of open files that can be // used by the DB. // // The default value is 1000. MaxOpenFiles int // The size of a MemTable in steady state. The actual MemTable size starts at // min(256KB, MemTableSize) and doubles for each subsequent MemTable up to // MemTableSize. This reduces the memory pressure caused by MemTables for // short lived (test) DB instances. Note that more than one MemTable can be // in existence since flushing a MemTable involves creating a new one and // writing the contents of the old one in the // background. MemTableStopWritesThreshold places a hard limit on the size of // the queued MemTables. MemTableSize int // Hard limit on the size of queued of MemTables. Writes are stopped when the // sum of the queued memtable sizes exceeds // MemTableStopWritesThreshold*MemTableSize. This value should be at least 2 // or writes will stop whenever a MemTable is being flushed. MemTableStopWritesThreshold int // Merger defines the associative merge operation to use for merging values // written with {Batch,DB}.Merge. // // The default merger concatenates values. Merger *Merger // MaxConcurrentCompactions specifies the maximum number of concurrent // compactions. The default is 1. Concurrent compactions are performed // - when L0 read-amplification passes the L0CompactionConcurrency threshold // - for automatic background compactions // - when a manual compaction for a level is split and parallelized MaxConcurrentCompactions int // DisableAutomaticCompactions dictates whether automatic compactions are // scheduled or not. The default is false (enabled). This option is only used // externally when running a manual compaction, and internally for tests. DisableAutomaticCompactions bool // NoSyncOnClose decides whether the Pebble instance will enforce a // close-time synchronization (e.g., fdatasync() or sync_file_range()) // on files it writes to. Setting this to true removes the guarantee for a // sync on close. Some implementations can still issue a non-blocking sync. NoSyncOnClose bool // NumPrevManifest is the number of non-current or older manifests which // we want to keep around for debugging purposes. By default, we're going // to keep one older manifest. NumPrevManifest int // ReadOnly indicates that the DB should be opened in read-only mode. Writes // to the DB will return an error, background compactions are disabled, and // the flush that normally occurs after replaying the WAL at startup is // disabled. ReadOnly bool // TableCache is an initialized TableCache which should be set as an // option if the DB needs to be initialized with a pre-existing table cache. // If TableCache is nil, then a table cache which is unique to the DB instance // is created. TableCache can be shared between db instances by setting it here. // The TableCache set here must use the same underlying cache as Options.Cache // and pebble will panic otherwise. TableCache *TableCache // TablePropertyCollectors is a list of TablePropertyCollector creation // functions. A new TablePropertyCollector is created for each sstable built // and lives for the lifetime of the table. TablePropertyCollectors []func() TablePropertyCollector // BlockPropertyCollectors is a list of BlockPropertyCollector creation // functions. A new BlockPropertyCollector is created for each sstable // built and lives for the lifetime of writing that table. 
BlockPropertyCollectors []func() BlockPropertyCollector // WALBytesPerSync sets the number of bytes to write to a WAL before calling // Sync on it in the background. Just like with BytesPerSync above, this // helps smooth out disk write latencies, and avoids cases where the OS // writes a lot of buffered data to disk at once. However, this is less // necessary with WALs, as many write operations already pass in // Sync = true. // // The default value is 0, i.e. no background syncing. This matches the // default behaviour in RocksDB. WALBytesPerSync int // WALDir specifies the directory to store write-ahead logs (WALs) in. If // empty (the default), WALs will be stored in the same directory as sstables // (i.e. the directory passed to pebble.Open). WALDir string // WALMinSyncInterval is the minimum duration between syncs of the WAL. If // WAL syncs are requested faster than this interval, they will be // artificially delayed. Introducing a small artificial delay (500us) between // WAL syncs can allow more operations to arrive and reduce IO operations // while having a minimal impact on throughput. This option is supplied as a // closure in order to allow the value to be changed dynamically. The default // value is 0. // // TODO(peter): rather than a closure, should there be another mechanism for // changing options dynamically? WALMinSyncInterval func() time.Duration // contains filtered or unexported fields }
Options holds the optional parameters for configuring pebble. These options apply to the DB at large; per-query options are defined by the IterOptions and WriteOptions types.
func (*Options) Check ¶
Check verifies the options are compatible with the previous options serialized by Options.String(). For example, the Comparer and Merger must be the same, or data will not be able to be properly read from the DB.
func (*Options) EnsureDefaults ¶
EnsureDefaults ensures that the default values for all options are set if a valid value was not already specified. Returns the new options.
func (*Options) Level ¶
func (o *Options) Level(level int) LevelOptions
Level returns the LevelOptions for the specified level.
func (*Options) MakeReaderOptions ¶
func (o *Options) MakeReaderOptions() sstable.ReaderOptions
MakeReaderOptions constructs sstable.ReaderOptions from the corresponding options in the receiver.
func (*Options) MakeWriterOptions ¶
func (o *Options) MakeWriterOptions(level int, format sstable.TableFormat) sstable.WriterOptions
MakeWriterOptions constructs sstable.WriterOptions for the specified level from the corresponding options in the receiver.
type ParseHooks ¶
type ParseHooks struct { NewCache func(size int64) *Cache NewCleaner func(name string) (Cleaner, error) NewComparer func(name string) (*Comparer, error) NewFilterPolicy func(name string) (FilterPolicy, error) NewMerger func(name string) (*Merger, error) SkipUnknown func(name, value string) bool }
ParseHooks contains callbacks to create options fields which can have user-defined implementations.
type RangeKeyData ¶
RangeKeyData describes a range key's data, set through RangeKeySet. The key boundaries of the range key are provided by Iterator.RangeBounds.
type RangeKeyMasking ¶
type RangeKeyMasking struct { // Suffix configures which range keys may mask point keys. Only range keys // that are defined at suffixes greater than or equal to Suffix will mask point // keys. Suffix []byte }
RangeKeyMasking configures automatic hiding of point keys by range keys. A non-nil Suffix enables range-key masking. When enabled, range keys with suffixes ≥ Suffix behave as masks. All point keys that are contained within a masking range key's bounds and have suffixes greater than the range key's suffix are automatically skipped.
Specifically, when configured with a RangeKeyMasking.Suffix _s_, and there exists a range key with suffix _r_ covering a point key with suffix _p_, and
_s_ ≤ _r_ < _p_
then the point key is elided.
Range-key masking may only be used when iterating over both point keys and range keys with IterKeyTypePointsAndRanges.
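A sketch of enabling masking on an iterator; the "@5" suffix encoding is illustrative and depends entirely on the configured Comparer:

iter := db.NewIter(&pebble.IterOptions{
	KeyTypes: pebble.IterKeyTypePointsAndRanges,
	RangeKeyMasking: pebble.RangeKeyMasking{
		// Range keys with suffixes at or above "@5" act as masks; point keys
		// they cover whose suffixes exceed the range key's suffix are skipped.
		Suffix: []byte("@5"),
	},
})
defer iter.Close()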
type Reader ¶
type Reader interface { // Get gets the value for the given key. It returns ErrNotFound if the DB // does not contain the key. // // The caller should not modify the contents of the returned slice, but it is // safe to modify the contents of the argument after Get returns. The // returned slice will remain valid until the returned Closer is closed. On // success, the caller MUST call closer.Close() or a memory leak will occur. Get(key []byte) (value []byte, closer io.Closer, err error) // NewIter returns an iterator that is unpositioned (Iterator.Valid() will // return false). The iterator can be positioned via a call to SeekGE, // SeekLT, First or Last. NewIter(o *IterOptions) *Iterator // Close closes the Reader. It may or may not close any underlying io.Reader // or io.Writer, depending on how the DB was created. // // It is not safe to close a DB until all outstanding iterators are closed. // It is valid to call Close multiple times. Other methods should not be // called after the DB has been closed. Close() error }
Reader is a readable key/value store.
It is safe to call Get and NewIter from concurrent goroutines.
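Because *DB, *Snapshot, and indexed batches all provide Get, NewIter, and Close, read-only helpers can be written once against Reader. countKeys below is a hypothetical helper, not part of the package:

// countKeys counts keys in [lower, upper) through any pebble.Reader.
func countKeys(r pebble.Reader, lower, upper []byte) (int, error) {
	iter := r.NewIter(&pebble.IterOptions{LowerBound: lower, UpperBound: upper})
	n := 0
	for iter.First(); iter.Valid(); iter.Next() {
		n++
	}
	if err := iter.Error(); err != nil {
		iter.Close()
		return 0, err
	}
	return n, iter.Close()
}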
type SSTableInfo ¶
type SSTableInfo struct { manifest.TableInfo // Properties is the sstable properties of this table. Properties *sstable.Properties }
SSTableInfo exports manifest.TableInfo along with sstable.Properties.
type SSTablesOption ¶
type SSTablesOption func(*sstablesOptions)
SSTablesOption sets an optional parameter used by `DB.SSTables`.
func WithProperties ¶
func WithProperties() SSTablesOption
WithProperties enables returning sstable properties in each TableInfo.
NOTE: if most of the sstable properties need to be read from disk, this option may make the `SSTables` method quite slow.
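A sketch of listing live sstables with their properties, assuming db is an open *pebble.DB and that DB.SSTables returns one []SSTableInfo slice per level:

levels, err := db.SSTables(pebble.WithProperties())
if err != nil {
	log.Fatal(err)
}
for level, tables := range levels {
	for _, t := range tables {
		fmt.Printf("L%d: file %s, %d bytes, %d entries\n",
			level, t.FileNum, t.Size, t.Properties.NumEntries)
	}
}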
type Snapshot ¶
type Snapshot struct {
// contains filtered or unexported fields
}
Snapshot provides a read-only point-in-time view of the DB state.
func (*Snapshot) Close ¶
Close closes the snapshot, releasing its resources. Close must be called. Failure to do so will result in a tiny memory leak and a large leak of resources on disk due to the entries the snapshot is preventing from being deleted.
func (*Snapshot) Get ¶
Get gets the value for the given key. It returns ErrNotFound if the Snapshot does not contain the key.
The caller should not modify the contents of the returned slice, but it is safe to modify the contents of the argument after Get returns. The returned slice will remain valid until the returned Closer is closed. On success, the caller MUST call closer.Close() or a memory leak will occur.
func (*Snapshot) NewIter ¶
func (s *Snapshot) NewIter(o *IterOptions) *Iterator
NewIter returns an iterator that is unpositioned (Iterator.Valid() will return false). The iterator can be positioned via a call to SeekGE, SeekLT, First or Last.
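A minimal sketch of snapshot usage, assuming db is an open *pebble.DB and that the DB exposes a NewSnapshot method (referenced above in the Clone documentation):

snap := db.NewSnapshot()
defer snap.Close()

// Writes made after the snapshot was taken are not visible through it.
if err := db.Set([]byte("k"), []byte("new"), pebble.Sync); err != nil {
	log.Fatal(err)
}

value, closer, err := snap.Get([]byte("k"))
switch {
case err == pebble.ErrNotFound:
	fmt.Println("k did not exist when the snapshot was taken")
case err != nil:
	log.Fatal(err)
default:
	fmt.Printf("k = %s (as of the snapshot)\n", value)
	if err := closer.Close(); err != nil {
		log.Fatal(err)
	}
}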
type TableCache ¶
type TableCache struct {
// contains filtered or unexported fields
}
TableCache is a shareable cache for open sstables.
func NewTableCache ¶
func NewTableCache(cache *Cache, numShards int, size int) *TableCache
NewTableCache will create a reference to the table cache. It is the caller's responsibility to call tableCache.Unref if it will no longer hold a reference to the table cache.
func (*TableCache) Ref ¶
func (c *TableCache) Ref()
Ref adds a reference to the table cache. Once tableCache.init returns, the table cache only remains valid if there is at least one reference to it.
func (*TableCache) Unref ¶
func (c *TableCache) Unref() error
Unref removes a reference to the table cache.
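A sketch of sharing one block cache and one table cache across two DB instances; the sizes, shard count, and directory names are illustrative:

c := pebble.NewCache(128 << 20) // shared block cache
defer c.Unref()

// The table cache must be built on the same cache passed as Options.Cache.
tc := pebble.NewTableCache(c, 8 /* numShards */, 1000 /* size */)
defer tc.Unref()

newOptions := func() *pebble.Options {
	return &pebble.Options{Cache: c, TableCache: tc, FS: vfs.NewMem()}
}
db1, err := pebble.Open("one", newOptions())
if err != nil {
	log.Fatal(err)
}
defer db1.Close()
db2, err := pebble.Open("two", newOptions())
if err != nil {
	log.Fatal(err)
}
defer db2.Close()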
type TableCreateInfo ¶
type TableCreateInfo struct { JobID int // Reason is the reason for the table creation: "compacting", "flushing", or // "ingesting". Reason string Path string FileNum FileNum }
TableCreateInfo contains the info for a table creation event.
func (TableCreateInfo) SafeFormat ¶
func (i TableCreateInfo) SafeFormat(w redact.SafePrinter, _ rune)
SafeFormat implements redact.SafeFormatter.
func (TableCreateInfo) String ¶
func (i TableCreateInfo) String() string
type TableDeleteInfo ¶
TableDeleteInfo contains the info for a table deletion event.
func (TableDeleteInfo) SafeFormat ¶
func (i TableDeleteInfo) SafeFormat(w redact.SafePrinter, _ rune)
SafeFormat implements redact.SafeFormatter.
func (TableDeleteInfo) String ¶
func (i TableDeleteInfo) String() string
type TableIngestInfo ¶
type TableIngestInfo struct { // JobID is the ID of the job that caused the table to be ingested. JobID int Tables []struct { TableInfo Level int } // GlobalSeqNum is the sequence number that was assigned to all entries in // the ingested table. GlobalSeqNum uint64 Err error }
TableIngestInfo contains the info for a table ingestion event.
func (TableIngestInfo) SafeFormat ¶
func (i TableIngestInfo) SafeFormat(w redact.SafePrinter, _ rune)
SafeFormat implements redact.SafeFormatter.
func (TableIngestInfo) String ¶
func (i TableIngestInfo) String() string
type TablePropertyCollector ¶
type TablePropertyCollector = sstable.TablePropertyCollector
TablePropertyCollector exports the sstable.TablePropertyCollector type.
type TableStatsInfo ¶
type TableStatsInfo struct { // JobID is the ID of the job that finished loading the initial tables' // stats. JobID int }
TableStatsInfo contains the info for a table stats loaded event.
func (TableStatsInfo) SafeFormat ¶
func (i TableStatsInfo) SafeFormat(w redact.SafePrinter, _ rune)
SafeFormat implements redact.SafeFormatter.
func (TableStatsInfo) String ¶
func (i TableStatsInfo) String() string
type TableValidatedInfo ¶
type TableValidatedInfo struct { JobID int Meta *fileMetadata }
TableValidatedInfo contains information on the result of a validation run on an sstable.
func (TableValidatedInfo) SafeFormat ¶
func (i TableValidatedInfo) SafeFormat(w redact.SafePrinter, _ rune)
SafeFormat implements redact.SafeFormatter.
func (TableValidatedInfo) String ¶
func (i TableValidatedInfo) String() string
type ThroughputMetric ¶
type ThroughputMetric = base.ThroughputMetric
ThroughputMetric is a cumulative throughput metric. See the detailed comment in base.
type ValueMerger ¶
type ValueMerger = base.ValueMerger
ValueMerger exports the base.ValueMerger type.
type WALCreateInfo ¶
type WALCreateInfo struct { // JobID is the ID of the job that caused the WAL to be created. JobID int Path string // The file number of the new WAL. FileNum FileNum // The file number of a previous WAL which was recycled to create this // one. Zero if recycling did not take place. RecycledFileNum FileNum Err error }
WALCreateInfo contains info about a WAL creation event.
func (WALCreateInfo) SafeFormat ¶
func (i WALCreateInfo) SafeFormat(w redact.SafePrinter, _ rune)
SafeFormat implements redact.SafeFormatter.
func (WALCreateInfo) String ¶
func (i WALCreateInfo) String() string
type WALDeleteInfo ¶
type WALDeleteInfo struct { // JobID is the ID of the job that caused the WAL to be deleted. JobID int Path string FileNum FileNum Err error }
WALDeleteInfo contains the info for a WAL deletion event.
func (WALDeleteInfo) SafeFormat ¶
func (i WALDeleteInfo) SafeFormat(w redact.SafePrinter, _ rune)
SafeFormat implements redact.SafeFormatter.
func (WALDeleteInfo) String ¶
func (i WALDeleteInfo) String() string
type WriteOptions ¶
type WriteOptions struct { // Sync is whether to sync writes through the OS buffer cache and down onto // the actual disk, if applicable. Setting Sync is required for durability of // individual write operations but can result in slower writes. // // If false, and the process or machine crashes, then a recent write may be // lost. This is due to the recently written data being buffered inside the // process running Pebble. This differs from the semantics of a write system // call in which the data is buffered in the OS buffer cache and would thus // survive a process crash. // // The default value is true. Sync bool }
WriteOptions hold the optional per-query parameters for Set and Delete operations.
Like Options, a nil *WriteOptions is valid and means to use the default values.
func (*WriteOptions) GetSync ¶
func (o *WriteOptions) GetSync() bool
GetSync returns the Sync value or true if the receiver is nil.
type WriteStallBeginInfo ¶
type WriteStallBeginInfo struct {
Reason string
}
WriteStallBeginInfo contains the info for a write stall begin event.
func (WriteStallBeginInfo) SafeFormat ¶
func (i WriteStallBeginInfo) SafeFormat(w redact.SafePrinter, _ rune)
SafeFormat implements redact.SafeFormatter.
func (WriteStallBeginInfo) String ¶
func (i WriteStallBeginInfo) String() string
type Writer ¶
type Writer interface { // Apply the operations contained in the batch to the DB. // // It is safe to modify the contents of the arguments after Apply returns. Apply(batch *Batch, o *WriteOptions) error // Delete deletes the value for the given key. Deletes are blind and will // succeed even if the given key does not exist. // // It is safe to modify the contents of the arguments after Delete returns. Delete(key []byte, o *WriteOptions) error // SingleDelete is similar to Delete in that it deletes the value for the given key. Like Delete, // it is a blind operation that will succeed even if the given key does not exist. // // WARNING: Undefined (non-deterministic) behavior will result if a key is overwritten and // then deleted using SingleDelete. The record may appear deleted immediately, but be // resurrected at a later time after compactions have been performed. Or the record may // be deleted permanently. A Delete operation lays down a "tombstone" which shadows all // previous versions of a key. The SingleDelete operation is akin to "anti-matter" and will // only delete the most recently written version for a key. These different semantics allow // the DB to avoid propagating a SingleDelete operation during a compaction as soon as the // corresponding Set operation is encountered. These semantics require extreme care to handle // properly. Only use if you have a workload where the performance gain is critical and you // can guarantee that a record is written once and then deleted once. // // SingleDelete is internally transformed into a Delete if the most recent record for a key is either // a Merge or Delete record. // // It is safe to modify the contents of the arguments after SingleDelete returns. SingleDelete(key []byte, o *WriteOptions) error // DeleteRange deletes all of the point keys (and values) in the range // [start,end) (inclusive on start, exclusive on end). DeleteRange does NOT // delete overlapping range keys (eg, keys set via RangeKeySet). // // It is safe to modify the contents of the arguments after DeleteRange // returns. DeleteRange(start, end []byte, o *WriteOptions) error // LogData adds the specified data to the batch. The data will be written to the // WAL, but not added to memtables or sstables. Log data is never indexed, // which makes it useful for testing WAL performance. // // It is safe to modify the contents of the argument after LogData returns. LogData(data []byte, opts *WriteOptions) error // Merge merges the value for the given key. The details of the merge are // dependent upon the configured merge operation. // // It is safe to modify the contents of the arguments after Merge returns. Merge(key, value []byte, o *WriteOptions) error // Set sets the value for the given key. It overwrites any previous value // for that key; a DB is not a multi-map. // // It is safe to modify the contents of the arguments after Set returns. Set(key, value []byte, o *WriteOptions) error // Experimental returns the experimental write API. Experimental() ExperimentalWriter }
Writer is a writable key/value store.
Goroutine safety is dependent on the specific implementation.
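Since both *DB and *Batch implement Writer, write-side helpers can target the interface; putUser below is a hypothetical helper, and the key layout is illustrative:

// putUser writes one record through any pebble.Writer, so the same code can
// write directly to a *pebble.DB or stage the write into a *pebble.Batch.
func putUser(w pebble.Writer, id, name string) error {
	return w.Set([]byte("user/"+id), []byte(name), pebble.Sync)
}

// Direct write:
//	err := putUser(db, "42", "ada")
//
// Staged in a batch and applied atomically (assuming DB exposes NewBatch):
//	b := db.NewBatch()
//	_ = putUser(b, "42", "ada")
//	err := db.Apply(b, pebble.Sync)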
Source Files ¶
- batch.go
- cache.go
- checkpoint.go
- cleaner.go
- commit.go
- compaction.go
- compaction_iter.go
- compaction_picker.go
- comparer.go
- db.go
- error_iter.go
- event.go
- external_iterator.go
- filenames.go
- flush_external.go
- flushable.go
- format_major_version.go
- get_iter.go
- ingest.go
- internal.go
- iterator.go
- level_checker.go
- level_iter.go
- log_recycler.go
- logger.go
- mem_table.go
- merger.go
- merging_iter.go
- merging_iter_heap.go
- metrics.go
- open.go
- options.go
- pacer.go
- range_keys.go
- read_compaction_queue.go
- read_state.go
- snapshot.go
- syncing_fs.go
- table_cache.go
- table_stats.go
- version_set.go
Directories ¶
Path | Synopsis
---|---
bloom | Package bloom implements Bloom filters.
cmd |
internal |
cache | Package cache implements the CLOCK-Pro caching algorithm.
crc | Package crc implements the checksum algorithm used throughout pebble.
keyspan | Package keyspan provides facilities for sorting, fragmenting and iterating over spans of user keys.
mkbench | mkbench is a utility for processing the raw nightly benchmark data in JSON data that can be visualized by docs/js/app.js.
rangekey | Package rangekey provides facilities for encoding, decoding and merging range keys.
rate | Package rate provides a rate limiter.
replay | Package replay implements facilities for replaying writes to a database.
testkeys | Package testkeys provides facilities for generating and comparing human-readable test keys for use in tests and benchmarks.
record | Package record reads and writes sequences of records.
sstable | Package sstable implements readers and writers of pebble tables.