hll

package
v0.0.0-...-3e17171 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Apr 22, 2024 License: Apache-2.0 Imports: 7 Imported by: 0

Documentation

Overview

Package hll is dedicated to streaming algorithms that enable estimation of the cardinality of a stream of items.

HllSketch and Union are the public facing classes of this high performance implementation of Phillipe Flajolet's HyperLogLog algorithm but with significantly improved error behavior and important features that can be essential for large production systems that must handle massive data.

Example
// Creating a first HLL sketch
sketch, _ := NewHllSketch(10, TgtHllTypeHll4)

// Add 100 distinct values
for i := 0; i < 100; i++ {
	sketch.UpdateInt64(int64(i))
}
est, _ := sketch.GetEstimate()
fmt.Printf("Cardinality estimation of first sketch(0-100): %d\n", int64(est))

// Add another 100000 distinct values (repeating the 100 previously added)
for i := 0; i < 100000; i++ {
	sketch.UpdateInt64(int64(i))
}
est, _ = sketch.GetEstimate()
fmt.Printf("Cardinality estimation of first sketch(0-100000): %d\n", int64(est))

// Get the upper bound (2nd std deviation) of the estimate
ub, _ := sketch.GetUpperBound(2)
fmt.Printf("Upper bound (2nd std deviation) of first sketch(0-100000): %d\n", int64(ub))

// Get the lower bound (2nd std deviation) of the estimate
lb, _ := sketch.GetLowerBound(2)
fmt.Printf("Lower bound (2nd std deviation) of first sketch(0-100000): %d\n", int64(lb))
fmt.Printf("\n")

// Creating a second HLL sketch
anotherSketch, _ := NewHllSketch(10, TgtHllTypeHll4)
// Add another 100000 distinct values (starting at 50000
for i := 50000; i < 150000; i++ {
	anotherSketch.UpdateInt64(int64(i))
}
est, _ = anotherSketch.GetEstimate()
fmt.Printf("Cardinality estimation of second sketch(50000-150000): %d\n", int64(est))
fmt.Printf("\n")

// Creating a union sketch and merge the two sketches
unionsketchBldr, _ := NewUnion(10)
unionsketchBldr.UpdateSketch(sketch)
unionsketchBldr.UpdateSketch(anotherSketch)
unionSketch, _ := unionsketchBldr.GetResult(TgtHllTypeHll4)

unionEst, _ := unionSketch.GetEstimate()
fmt.Printf("Cardinality estimation of first and second union: %d\n", int64(unionEst))

// Get the upper bound (2nd std deviation) of the estimate
ub, _ = unionSketch.GetUpperBound(2)
fmt.Printf("Upper bound (2nd std deviation) of first and second union: %d\n", int64(ub))

// Get the lower bound (2nd std deviation) of the estimate
lb, _ = unionSketch.GetLowerBound(2)
fmt.Printf("Lower bound (2nd std deviation) of first and second union: %d\n", int64(lb))
fmt.Printf("\n")

// Serialize and deserialize the union sketch
serializedSketch, _ := unionSketch.ToUpdatableSlice()
reloadedSketch, _ := NewHllSketchFromSlice(serializedSketch, true)
reloadedEst, _ := reloadedSketch.GetEstimate()
fmt.Printf("Cardinality estimation of reloaded unioned sketch: %d\n", int64(reloadedEst))
Output:

Cardinality estimation of first sketch(0-100): 100
Cardinality estimation of first sketch(0-100000): 104403
Upper bound (2nd std deviation) of first sketch(0-100000): 109997
Lower bound (2nd std deviation) of first sketch(0-100000): 99134

Cardinality estimation of second sketch(50000-150000): 96390

Cardinality estimation of first and second union: 151359
Upper bound (2nd std deviation) of first and second union: 161518
Lower bound (2nd std deviation) of first and second union: 141853

Cardinality estimation of reloaded unioned sketch: 151359

Index

Examples

Constants

View Source
const (
	TgtHllTypeHll4    = TgtHllType(0)
	TgtHllTypeHll6    = TgtHllType(1)
	TgtHllTypeHll8    = TgtHllType(2)
	TgtHllTypeDefault = TgtHllTypeHll4
)

Specifies the target type of HLL sketch to be created. It is a target in that the actual allocation of the HLL array is deferred until sufficient number of items have been received by the warm-up phases.

These three target types are isomorphic representations of the same underlying HLL algorithm. Thus, given the same value of <i>lgConfigK</i> and the same input, all three HLL target types will produce identical estimates and have identical error distributions.

The memory (and also the serialization) of the sketch during this early warmup phase starts out very small (8 bytes, when empty) and then grows in increments of 4 bytes as required until the full HLL array is allocated. This transition point occurs at about 10% of K for sketches where lgConfigK is > 8.

  • Hll 8 This uses an 8-bit byte per HLL bucket. It is generally the fastest in terms of update time, but has the largest storage footprint of about K bytes.

  • Hll 6 This uses a 6-bit field per HLL bucket. It is the generally the next fastest in terms of update time with a storage footprint of about 3/4 * K bytes.

  • Hll 4 This uses a 4-bit field per HLL bucket and for large counts may require the use of a small internal auxiliary array for storing statistical exceptions, which are rare. For the values of lgConfigK > 13 (K = 8192), this additional array adds about 3% to the overall storage. It is generally the slowest in terms of update time, but has the smallest storage footprint of about K/2 * 1.03 bytes.

Variables

This section is empty.

Functions

This section is empty.

Types

type HllSketch

type HllSketch interface {
	// Copy returns a clone of this sketch.
	Copy() (HllSketch, error)

	// CopyAs returns a clone of this sketch with the specified TgtHllType.
	//
	//   - tgtHllType, the TgtHllType enum
	CopyAs(tgtHllType TgtHllType) (HllSketch, error)

	// GetCompositeEstimate is less accurate than GetEstimate method and is automatically used
	// when the sketch has gone through union operations where the more accurate HIP estimator
	// cannot be used
	// This is made public only for error characterization  software that exists in separate package and is not
	// intended for normal use.
	GetCompositeEstimate() (float64, error)

	// GetEstimate returns the cardinality estimate
	GetEstimate() (float64, error)

	// UpdateUInt64 present the given unsigned 64-bit integer as a potential unique item.
	UpdateUInt64(datum uint64) error

	// UpdateInt64 present the given signed 64-bit integer as a potential unique item.
	UpdateInt64(datum int64) error

	// UpdateSlice present the given byte slice as a potential unique item.
	UpdateSlice(datum []byte) error

	// UpdateString present the given string as a potential unique item.
	UpdateString(datum string) error

	// Reset resets the sketch to empty, but does not change the configured values of lgConfigK and tgtHllType.
	Reset() error

	// GetLowerBound gets the approximate lower error bound given the specifified numbers of standard deviations.
	//
	//   - numStdDev, this must be an integer between 1 and 3, inclusive.
	GetLowerBound(numStdDev int) (float64, error)

	// GetUpperBound gets the approximate upper error bound given the specified number of standard deviations.
	//
	//   - numStdDev, this must be an integer between 1 and 3, inclusive.
	GetUpperBound(numStdDev int) (float64, error)

	// IsEmpty returns true if the sketch is empty.
	IsEmpty() bool

	// GetLgConfigK returns the lgConfigK of the sketch.
	GetLgConfigK() int

	// GetTgtHllType returns the TgtHllType of the sketch.
	GetTgtHllType() TgtHllType

	// GetCurMode returns the current mode of the sketch: LIST, SET, HLL.
	GetCurMode() curMode

	// GetUpdatableSerializationBytes gets the size in bytes of the current sketch when serialized using
	// ToUpdatableSlice.
	GetUpdatableSerializationBytes() int

	// ToCompactSlice serializes the sketch to a slice, compacting data structures
	// where feasible to eliminate unused storage in the serialized image.
	ToCompactSlice() ([]byte, error)

	// ToUpdatableSlice serializes the sketch as a byte slice in an updatable form.
	// The updatable form is larger than the compact form.
	ToUpdatableSlice() ([]byte, error)

	GetSerializationVersion() int
	// contains filtered or unexported methods
}

func NewHllSketch

func NewHllSketch(lgConfigK int, tgtHllType TgtHllType) (HllSketch, error)

NewHllSketch constructs a new sketch with the type of HLL sketch to configure

  • lgConfigK, the Log2 of K for the target HLL sketch. This value must be

between 4 and 21 inclusively.

  • tgtHllType. the desired HLL type.

func NewHllSketchFromSlice

func NewHllSketchFromSlice(bytes []byte, checkRebuild bool) (HllSketch, error)

NewHllSketchFromSlice deserialize a given byte slice, which must be a valid HllSketch image and may have data.

  • bytes, the given byte slice, this slice is not modified and is not retained by the sketch

func NewHllSketchWithDefault

func NewHllSketchWithDefault() (HllSketch, error)

NewHllSketchWithDefault constructs a new on-heap sketch with the default lgK and tgtHllType.

type TgtHllType

type TgtHllType int

type Union

type Union interface {
	// Updatable
	UpdateUInt64(datum uint64) error
	UpdateInt64(datum int64) error
	UpdateSlice(datum []byte) error
	UpdateString(datum string) error
	Reset() error

	// Estimable
	GetCompositeEstimate() (float64, error)
	GetEstimate() (float64, error)
	//GetHipEstimate() (float64, error)
	GetLowerBound(numStdDev int) (float64, error)
	GetUpperBound(numStdDev int) (float64, error)
	IsEmpty() bool

	GetLgConfigK() int
	GetTgtHllType() TgtHllType
	GetCurMode() curMode

	GetUpdatableSerializationBytes() int
	ToCompactSlice() ([]byte, error)
	ToUpdatableSlice() ([]byte, error)

	UpdateSketch(sketch HllSketch) error
	GetResult(tgtHllType TgtHllType) (HllSketch, error)
	// contains filtered or unexported methods
}

func NewUnion

func NewUnion(lgMaxK int) (Union, error)

func NewUnionFromSlice

func NewUnionFromSlice(byteArray []byte) (Union, error)

func NewUnionWithDefault

func NewUnionWithDefault() (Union, error)

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL