Documentation ¶
Index ¶
- Variables
- func GetStatValue(typ parquet.Type, val []byte) interface{}
- type AppVersion
- type BooleanStatistics
- func (s *BooleanStatistics) Descr() *schema.Column
- func (s *BooleanStatistics) DistinctCount() int64
- func (s *BooleanStatistics) Encode() (enc EncodedStatistics, err error)
- func (s *BooleanStatistics) EncodeMax() []byte
- func (s *BooleanStatistics) EncodeMin() []byte
- func (s *BooleanStatistics) Equals(other TypedStatistics) bool
- func (s *BooleanStatistics) HasDistinctCount() bool
- func (s *BooleanStatistics) HasMinMax() bool
- func (s *BooleanStatistics) HasNullCount() bool
- func (s *BooleanStatistics) Max() bool
- func (s *BooleanStatistics) Merge(other TypedStatistics)
- func (s *BooleanStatistics) Min() bool
- func (s *BooleanStatistics) MinMaxEqual(rhs *BooleanStatistics) bool
- func (s *BooleanStatistics) NullCount() int64
- func (s *BooleanStatistics) NumValues() int64
- func (s *BooleanStatistics) Reset()
- func (s *BooleanStatistics) SetMinMax(argMin, argMax bool)
- func (s *BooleanStatistics) Type() parquet.Type
- func (s *BooleanStatistics) Update(values []bool, numNull int64)
- func (s *BooleanStatistics) UpdateSpaced(values []bool, validBits []byte, validBitsOffset, numNull int64)
- type ByteArrayStatistics
- func (s *ByteArrayStatistics) Descr() *schema.Column
- func (s *ByteArrayStatistics) DistinctCount() int64
- func (s *ByteArrayStatistics) Encode() (enc EncodedStatistics, err error)
- func (s *ByteArrayStatistics) EncodeMax() []byte
- func (s *ByteArrayStatistics) EncodeMin() []byte
- func (s *ByteArrayStatistics) Equals(other TypedStatistics) bool
- func (s *ByteArrayStatistics) HasDistinctCount() bool
- func (s *ByteArrayStatistics) HasMinMax() bool
- func (s *ByteArrayStatistics) HasNullCount() bool
- func (s *ByteArrayStatistics) Max() parquet.ByteArray
- func (s *ByteArrayStatistics) Merge(other TypedStatistics)
- func (s *ByteArrayStatistics) Min() parquet.ByteArray
- func (s *ByteArrayStatistics) MinMaxEqual(rhs *ByteArrayStatistics) bool
- func (s *ByteArrayStatistics) NullCount() int64
- func (s *ByteArrayStatistics) NumValues() int64
- func (s *ByteArrayStatistics) Reset()
- func (s *ByteArrayStatistics) SetMinMax(argMin, argMax parquet.ByteArray)
- func (s *ByteArrayStatistics) Type() parquet.Type
- func (s *ByteArrayStatistics) Update(values []parquet.ByteArray, numNull int64)
- func (s *ByteArrayStatistics) UpdateSpaced(values []parquet.ByteArray, validBits []byte, validBitsOffset, numNull int64)
- type ChunkMetaInfo
- type ColumnChunkMetaData
- func (c *ColumnChunkMetaData) BloomFilterOffset() int64
- func (c *ColumnChunkMetaData) Compression() compress.Compression
- func (c *ColumnChunkMetaData) CryptoMetadata() *format.ColumnCryptoMetaData
- func (c *ColumnChunkMetaData) DataPageOffset() int64
- func (c *ColumnChunkMetaData) DictionaryPageOffset() int64
- func (c *ColumnChunkMetaData) EncodingStats() []PageEncodingStats
- func (c *ColumnChunkMetaData) Encodings() []parquet.Encoding
- func (c *ColumnChunkMetaData) Equals(other *ColumnChunkMetaData) bool
- func (c *ColumnChunkMetaData) FileOffset() int64
- func (c *ColumnChunkMetaData) FilePath() string
- func (c *ColumnChunkMetaData) HasDictionaryPage() bool
- func (c *ColumnChunkMetaData) HasIndexPage() bool
- func (c *ColumnChunkMetaData) IndexPageOffset() int64
- func (c *ColumnChunkMetaData) NumValues() int64
- func (c *ColumnChunkMetaData) PathInSchema() parquet.ColumnPath
- func (c *ColumnChunkMetaData) Statistics() (TypedStatistics, error)
- func (c *ColumnChunkMetaData) StatsSet() (bool, error)
- func (c *ColumnChunkMetaData) TotalCompressedSize() int64
- func (c *ColumnChunkMetaData) TotalUncompressedSize() int64
- func (c *ColumnChunkMetaData) Type() parquet.Type
- type ColumnChunkMetaDataBuilder
- func (c *ColumnChunkMetaDataBuilder) Contents() *format.ColumnChunk
- func (c *ColumnChunkMetaDataBuilder) Descr() *schema.Column
- func (c *ColumnChunkMetaDataBuilder) Finish(info ChunkMetaInfo, hasDict, dictFallback bool, encStats EncodingStats, ...) error
- func (c *ColumnChunkMetaDataBuilder) SetFilePath(val string)
- func (c *ColumnChunkMetaDataBuilder) SetStats(val EncodedStatistics)
- func (c *ColumnChunkMetaDataBuilder) TotalCompressedSize() int64
- func (c *ColumnChunkMetaDataBuilder) WriteTo(w io.Writer) (int64, error)
- type EncodedStatistics
- func (e *EncodedStatistics) ApplyStatSizeLimits(length int)
- func (e *EncodedStatistics) IsSet() bool
- func (e *EncodedStatistics) SetDistinctCount(val int64) *EncodedStatistics
- func (e *EncodedStatistics) SetMax(val []byte) *EncodedStatistics
- func (e *EncodedStatistics) SetMin(val []byte) *EncodedStatistics
- func (e *EncodedStatistics) SetNullCount(val int64) *EncodedStatistics
- func (e *EncodedStatistics) ToThrift() (stats *format.Statistics)
- type EncodingStats
- type FileCryptoMetadata
- type FileMetaData
- func (f *FileMetaData) AppendRowGroups(other *FileMetaData) error
- func (f *FileMetaData) EncryptionAlgorithm() parquet.Algorithm
- func (f *FileMetaData) Equals(other *FileMetaData) bool
- func (f *FileMetaData) KeyValueMetadata() KeyValueMetadata
- func (f *FileMetaData) NumSchemaElements() int
- func (f *FileMetaData) RowGroup(i int) *RowGroupMetaData
- func (f *FileMetaData) Serialize(ctx context.Context) ([]byte, error)
- func (f *FileMetaData) SerializeString(ctx context.Context) (string, error)
- func (f *FileMetaData) SetFilePath(path string)
- func (f *FileMetaData) Size() int
- func (f *FileMetaData) Subset(rowGroups []int) (*FileMetaData, error)
- func (f *FileMetaData) VerifySignature(signature []byte) bool
- func (f *FileMetaData) Version() parquet.Version
- func (f *FileMetaData) WriteTo(w io.Writer, encryptor encryption.Encryptor) (int64, error)
- func (f *FileMetaData) WriterVersion() *AppVersion
- type FileMetaDataBuilder
- type FixedLenByteArrayStatistics
- func (s *FixedLenByteArrayStatistics) Descr() *schema.Column
- func (s *FixedLenByteArrayStatistics) DistinctCount() int64
- func (s *FixedLenByteArrayStatistics) Encode() (enc EncodedStatistics, err error)
- func (s *FixedLenByteArrayStatistics) EncodeMax() []byte
- func (s *FixedLenByteArrayStatistics) EncodeMin() []byte
- func (s *FixedLenByteArrayStatistics) Equals(other TypedStatistics) bool
- func (s *FixedLenByteArrayStatistics) HasDistinctCount() bool
- func (s *FixedLenByteArrayStatistics) HasMinMax() bool
- func (s *FixedLenByteArrayStatistics) HasNullCount() bool
- func (s *FixedLenByteArrayStatistics) Max() parquet.FixedLenByteArray
- func (s *FixedLenByteArrayStatistics) Merge(other TypedStatistics)
- func (s *FixedLenByteArrayStatistics) Min() parquet.FixedLenByteArray
- func (s *FixedLenByteArrayStatistics) MinMaxEqual(rhs *FixedLenByteArrayStatistics) bool
- func (s *FixedLenByteArrayStatistics) NullCount() int64
- func (s *FixedLenByteArrayStatistics) NumValues() int64
- func (s *FixedLenByteArrayStatistics) Reset()
- func (s *FixedLenByteArrayStatistics) SetMinMax(argMin, argMax parquet.FixedLenByteArray)
- func (s *FixedLenByteArrayStatistics) Type() parquet.Type
- func (s *FixedLenByteArrayStatistics) Update(values []parquet.FixedLenByteArray, numNull int64)
- func (s *FixedLenByteArrayStatistics) UpdateSpaced(values []parquet.FixedLenByteArray, validBits []byte, ...)
- type Float32Statistics
- func (s *Float32Statistics) Descr() *schema.Column
- func (s *Float32Statistics) DistinctCount() int64
- func (s *Float32Statistics) Encode() (enc EncodedStatistics, err error)
- func (s *Float32Statistics) EncodeMax() []byte
- func (s *Float32Statistics) EncodeMin() []byte
- func (s *Float32Statistics) Equals(other TypedStatistics) bool
- func (s *Float32Statistics) HasDistinctCount() bool
- func (s *Float32Statistics) HasMinMax() bool
- func (s *Float32Statistics) HasNullCount() bool
- func (s *Float32Statistics) Max() float32
- func (s *Float32Statistics) Merge(other TypedStatistics)
- func (s *Float32Statistics) Min() float32
- func (s *Float32Statistics) MinMaxEqual(rhs *Float32Statistics) bool
- func (s *Float32Statistics) NullCount() int64
- func (s *Float32Statistics) NumValues() int64
- func (s *Float32Statistics) Reset()
- func (s *Float32Statistics) SetMinMax(argMin, argMax float32)
- func (s *Float32Statistics) Type() parquet.Type
- func (s *Float32Statistics) Update(values []float32, numNull int64)
- func (s *Float32Statistics) UpdateSpaced(values []float32, validBits []byte, validBitsOffset, numNull int64)
- type Float64Statistics
- func (s *Float64Statistics) Descr() *schema.Column
- func (s *Float64Statistics) DistinctCount() int64
- func (s *Float64Statistics) Encode() (enc EncodedStatistics, err error)
- func (s *Float64Statistics) EncodeMax() []byte
- func (s *Float64Statistics) EncodeMin() []byte
- func (s *Float64Statistics) Equals(other TypedStatistics) bool
- func (s *Float64Statistics) HasDistinctCount() bool
- func (s *Float64Statistics) HasMinMax() bool
- func (s *Float64Statistics) HasNullCount() bool
- func (s *Float64Statistics) Max() float64
- func (s *Float64Statistics) Merge(other TypedStatistics)
- func (s *Float64Statistics) Min() float64
- func (s *Float64Statistics) MinMaxEqual(rhs *Float64Statistics) bool
- func (s *Float64Statistics) NullCount() int64
- func (s *Float64Statistics) NumValues() int64
- func (s *Float64Statistics) Reset()
- func (s *Float64Statistics) SetMinMax(argMin, argMax float64)
- func (s *Float64Statistics) Type() parquet.Type
- func (s *Float64Statistics) Update(values []float64, numNull int64)
- func (s *Float64Statistics) UpdateSpaced(values []float64, validBits []byte, validBitsOffset, numNull int64)
- type Int32Statistics
- func (s *Int32Statistics) Descr() *schema.Column
- func (s *Int32Statistics) DistinctCount() int64
- func (s *Int32Statistics) Encode() (enc EncodedStatistics, err error)
- func (s *Int32Statistics) EncodeMax() []byte
- func (s *Int32Statistics) EncodeMin() []byte
- func (s *Int32Statistics) Equals(other TypedStatistics) bool
- func (s *Int32Statistics) HasDistinctCount() bool
- func (s *Int32Statistics) HasMinMax() bool
- func (s *Int32Statistics) HasNullCount() bool
- func (s *Int32Statistics) Max() int32
- func (s *Int32Statistics) Merge(other TypedStatistics)
- func (s *Int32Statistics) Min() int32
- func (s *Int32Statistics) MinMaxEqual(rhs *Int32Statistics) bool
- func (s *Int32Statistics) NullCount() int64
- func (s *Int32Statistics) NumValues() int64
- func (s *Int32Statistics) Reset()
- func (s *Int32Statistics) SetMinMax(argMin, argMax int32)
- func (s *Int32Statistics) Type() parquet.Type
- func (s *Int32Statistics) Update(values []int32, numNull int64)
- func (s *Int32Statistics) UpdateSpaced(values []int32, validBits []byte, validBitsOffset, numNull int64)
- type Int64Statistics
- func (s *Int64Statistics) Descr() *schema.Column
- func (s *Int64Statistics) DistinctCount() int64
- func (s *Int64Statistics) Encode() (enc EncodedStatistics, err error)
- func (s *Int64Statistics) EncodeMax() []byte
- func (s *Int64Statistics) EncodeMin() []byte
- func (s *Int64Statistics) Equals(other TypedStatistics) bool
- func (s *Int64Statistics) HasDistinctCount() bool
- func (s *Int64Statistics) HasMinMax() bool
- func (s *Int64Statistics) HasNullCount() bool
- func (s *Int64Statistics) Max() int64
- func (s *Int64Statistics) Merge(other TypedStatistics)
- func (s *Int64Statistics) Min() int64
- func (s *Int64Statistics) MinMaxEqual(rhs *Int64Statistics) bool
- func (s *Int64Statistics) NullCount() int64
- func (s *Int64Statistics) NumValues() int64
- func (s *Int64Statistics) Reset()
- func (s *Int64Statistics) SetMinMax(argMin, argMax int64)
- func (s *Int64Statistics) Type() parquet.Type
- func (s *Int64Statistics) Update(values []int64, numNull int64)
- func (s *Int64Statistics) UpdateSpaced(values []int64, validBits []byte, validBitsOffset, numNull int64)
- type Int96Statistics
- func (s *Int96Statistics) Descr() *schema.Column
- func (s *Int96Statistics) DistinctCount() int64
- func (s *Int96Statistics) Encode() (enc EncodedStatistics, err error)
- func (s *Int96Statistics) EncodeMax() []byte
- func (s *Int96Statistics) EncodeMin() []byte
- func (s *Int96Statistics) Equals(other TypedStatistics) bool
- func (s *Int96Statistics) HasDistinctCount() bool
- func (s *Int96Statistics) HasMinMax() bool
- func (s *Int96Statistics) HasNullCount() bool
- func (s *Int96Statistics) Max() parquet.Int96
- func (s *Int96Statistics) Merge(other TypedStatistics)
- func (s *Int96Statistics) Min() parquet.Int96
- func (s *Int96Statistics) MinMaxEqual(rhs *Int96Statistics) bool
- func (s *Int96Statistics) NullCount() int64
- func (s *Int96Statistics) NumValues() int64
- func (s *Int96Statistics) Reset()
- func (s *Int96Statistics) SetMinMax(argMin, argMax parquet.Int96)
- func (s *Int96Statistics) Type() parquet.Type
- func (s *Int96Statistics) Update(values []parquet.Int96, numNull int64)
- func (s *Int96Statistics) UpdateSpaced(values []parquet.Int96, validBits []byte, validBitsOffset, numNull int64)
- type KeyValueMetadata
- func (k *KeyValueMetadata) Append(key, value string) error
- func (k KeyValueMetadata) Equals(other KeyValueMetadata) bool
- func (k KeyValueMetadata) FindValue(key string) *string
- func (k KeyValueMetadata) Keys() (ret []string)
- func (k KeyValueMetadata) Len() int
- func (k KeyValueMetadata) Values() (ret []string)
- type PageEncodingStats
- type RowGroupMetaData
- func (r *RowGroupMetaData) ColumnChunk(i int) (*ColumnChunkMetaData, error)
- func (r *RowGroupMetaData) Equals(other *RowGroupMetaData) bool
- func (r *RowGroupMetaData) FileOffset() int64
- func (r *RowGroupMetaData) NumColumns() int
- func (r *RowGroupMetaData) NumRows() int64
- func (r *RowGroupMetaData) Ordinal() int16
- func (r *RowGroupMetaData) TotalByteSize() int64
- func (r *RowGroupMetaData) TotalCompressedSize() int64
- type RowGroupMetaDataBuilder
- func (r *RowGroupMetaDataBuilder) CurrentColumn() int
- func (r *RowGroupMetaDataBuilder) Finish(totalBytesWritten int64, ordinal int16) error
- func (r *RowGroupMetaDataBuilder) NextColumnChunk() *ColumnChunkMetaDataBuilder
- func (r *RowGroupMetaDataBuilder) NumColumns() int
- func (r *RowGroupMetaDataBuilder) NumRows() int64
- func (r *RowGroupMetaDataBuilder) SetNumRows(nrows int)
- type StatProvider
- type TypedStatistics
Constants ¶
This section is empty.
Variables ¶
var DefaultCompressionType = compress.Codecs.Uncompressed
DefaultCompressionType is used unless a different compression is specified in the properties
var (
	// Parquet816FixedVersion is the version used for fixing PARQUET-816
	// that changed the padding calculations for dictionary headers on row groups.
	Parquet816FixedVersion = NewAppVersionExplicit("parquet-mr", 1, 2, 9)
)
Functions ¶
func GetStatValue ¶
Types ¶
type AppVersion ¶
type AppVersion struct { App string Build string Version struct { Major int Minor int Patch int Unknown string PreRelease string BuildInfo string } }
AppVersion represents a specific application version either read from or written to a parquet file.
func NewAppVersion ¶
func NewAppVersion(createdby string) *AppVersion
NewAppVersion parses a "created by" string such as "parquet-go 1.0.0".
It also supports handling pre-releases and build info such as
parquet-cpp version 1.5.0ab-xyz5.5.0+cd (build abcd)
func NewAppVersionExplicit ¶
func NewAppVersionExplicit(app string, major, minor, patch int) *AppVersion
NewAppVersionExplicit is a convenience function to construct a specific application version from the given app string and version
func (AppVersion) Equal ¶
func (v AppVersion) Equal(other *AppVersion) bool
Equal only compares the Application and major/minor/patch versions.
Pre-release and build info are not considered.
func (AppVersion) HasCorrectStatistics ¶
func (v AppVersion) HasCorrectStatistics(coltype parquet.Type, logicalType schema.LogicalType, stats EncodedStatistics, sort schema.SortOrder) bool
HasCorrectStatistics checks whether or not the statistics are valid to be used based on the primitive type and the writer version, since previous versions had issues with properly computing stats.
Reference: parquet-cpp/src/parquet/metadata.cc
PARQUET-686 has more discussion on statistics
func (AppVersion) LessThan ¶
func (v AppVersion) LessThan(other *AppVersion) bool
LessThan compares the app versions and returns true if this version is "less than" the passed version.
If the apps don't match, this always returns false. Otherwise it compares the major versions first, then the minor versions, and finally the patch versions.
Pre-release and build info are not considered.
type BooleanStatistics ¶
type BooleanStatistics struct {
// contains filtered or unexported fields
}
BooleanStatistics is the typed interface for managing stats for a column of Boolean type.
func NewBooleanStatistics ¶
func NewBooleanStatistics(descr *schema.Column, mem memory.Allocator) *BooleanStatistics
NewBooleanStatistics constructs an appropriate stat object type using the given column descriptor and allocator.
Panics if the physical type of descr is not parquet.Type.Boolean
func NewBooleanStatisticsFromEncoded ¶
func NewBooleanStatisticsFromEncoded(descr *schema.Column, mem memory.Allocator, nvalues int64, encoded StatProvider) *BooleanStatistics
NewBooleanStatisticsFromEncoded will construct a properly typed statistics object, initializing it with the provided information.
func (*BooleanStatistics) DistinctCount ¶
func (s *BooleanStatistics) DistinctCount() int64
func (*BooleanStatistics) Encode ¶
func (s *BooleanStatistics) Encode() (enc EncodedStatistics, err error)
Encode returns a populated EncodedStatistics object
func (*BooleanStatistics) EncodeMax ¶
func (s *BooleanStatistics) EncodeMax() []byte
EncodeMax returns the current encoded max value with plain encoding.
ByteArray stats do not include the length in the encoding.
func (*BooleanStatistics) EncodeMin ¶
func (s *BooleanStatistics) EncodeMin() []byte
EncodeMin returns the encoded min value with plain encoding.
ByteArray stats do not include the length in the encoding.
func (*BooleanStatistics) Equals ¶
func (s *BooleanStatistics) Equals(other TypedStatistics) bool
Equals returns true only if both objects are the same type, have the same min and max values, null count, distinct count and number of values.
func (*BooleanStatistics) HasDistinctCount ¶
func (s *BooleanStatistics) HasDistinctCount() bool
func (*BooleanStatistics) HasNullCount ¶
func (s *BooleanStatistics) HasNullCount() bool
func (*BooleanStatistics) Max ¶
func (s *BooleanStatistics) Max() bool
func (*BooleanStatistics) Merge ¶
func (s *BooleanStatistics) Merge(other TypedStatistics)
Merge merges the stats from other into this stat object, updating the null count, distinct count, number of values and the min/max if appropriate.
func (*BooleanStatistics) Min ¶
func (s *BooleanStatistics) Min() bool
func (*BooleanStatistics) MinMaxEqual ¶
func (s *BooleanStatistics) MinMaxEqual(rhs *BooleanStatistics) bool
MinMaxEqual returns true if both stat objects have the same Min and Max values
func (*BooleanStatistics) SetMinMax ¶
func (s *BooleanStatistics) SetMinMax(argMin, argMax bool)
SetMinMax updates the min and max values only if they are not currently set or if argMin is less than the current min / argMax is greater than the current max
func (*BooleanStatistics) Update ¶
func (s *BooleanStatistics) Update(values []bool, numNull int64)
Update is used to add more values to the current stat object, finding the min and max values etc.
func (*BooleanStatistics) UpdateSpaced ¶
func (s *BooleanStatistics) UpdateSpaced(values []bool, validBits []byte, validBitsOffset, numNull int64)
UpdateSpaced is just like Update, but for spaced values using validBits to determine and skip null values.
type ByteArrayStatistics ¶
type ByteArrayStatistics struct {
// contains filtered or unexported fields
}
ByteArrayStatistics is the typed interface for managing stats for a column of ByteArray type.
func NewByteArrayStatistics ¶
func NewByteArrayStatistics(descr *schema.Column, mem memory.Allocator) *ByteArrayStatistics
NewByteArrayStatistics constructs an appropriate stat object type using the given column descriptor and allocator.
Panics if the physical type of descr is not parquet.Type.ByteArray
func NewByteArrayStatisticsFromEncoded ¶
func NewByteArrayStatisticsFromEncoded(descr *schema.Column, mem memory.Allocator, nvalues int64, encoded StatProvider) *ByteArrayStatistics
NewByteArrayStatisticsFromEncoded will construct a properly typed statistics object, initializing it with the provided information.
func (*ByteArrayStatistics) DistinctCount ¶
func (s *ByteArrayStatistics) DistinctCount() int64
func (*ByteArrayStatistics) Encode ¶
func (s *ByteArrayStatistics) Encode() (enc EncodedStatistics, err error)
Encode returns a populated EncodedStatistics object
func (*ByteArrayStatistics) EncodeMax ¶
func (s *ByteArrayStatistics) EncodeMax() []byte
EncodeMax returns the current encoded max value with plain encoding.
ByteArray stats do not include the length in the encoding.
func (*ByteArrayStatistics) EncodeMin ¶
func (s *ByteArrayStatistics) EncodeMin() []byte
EncodeMin returns the encoded min value with plain encoding.
ByteArray stats do not include the length in the encoding.
func (*ByteArrayStatistics) Equals ¶
func (s *ByteArrayStatistics) Equals(other TypedStatistics) bool
Equals returns true only if both objects are the same type, have the same min and max values, null count, distinct count and number of values.
func (*ByteArrayStatistics) HasDistinctCount ¶
func (s *ByteArrayStatistics) HasDistinctCount() bool
func (*ByteArrayStatistics) HasNullCount ¶
func (s *ByteArrayStatistics) HasNullCount() bool
func (*ByteArrayStatistics) Max ¶
func (s *ByteArrayStatistics) Max() parquet.ByteArray
func (*ByteArrayStatistics) Merge ¶
func (s *ByteArrayStatistics) Merge(other TypedStatistics)
Merge merges the stats from other into this stat object, updating the null count, distinct count, number of values and the min/max if appropriate.
func (*ByteArrayStatistics) Min ¶
func (s *ByteArrayStatistics) Min() parquet.ByteArray
func (*ByteArrayStatistics) MinMaxEqual ¶
func (s *ByteArrayStatistics) MinMaxEqual(rhs *ByteArrayStatistics) bool
MinMaxEqual returns true if both stat objects have the same Min and Max values
func (*ByteArrayStatistics) SetMinMax ¶
func (s *ByteArrayStatistics) SetMinMax(argMin, argMax parquet.ByteArray)
SetMinMax updates the min and max values only if they are not currently set or if argMin is less than the current min / argMax is greater than the current max
func (*ByteArrayStatistics) Update ¶
func (s *ByteArrayStatistics) Update(values []parquet.ByteArray, numNull int64)
Update is used to add more values to the current stat object, finding the min and max values etc.
func (*ByteArrayStatistics) UpdateSpaced ¶
func (s *ByteArrayStatistics) UpdateSpaced(values []parquet.ByteArray, validBits []byte, validBitsOffset, numNull int64)
UpdateSpaced is just like Update, but for spaced values using validBits to determine and skip null values.
type ChunkMetaInfo ¶
type ChunkMetaInfo struct { NumValues int64 DictPageOffset int64 IndexPageOffset int64 DataPageOffset int64 CompressedSize int64 UncompressedSize int64 }
ChunkMetaInfo is a helper struct for passing the offset and size information for finishing the building of column chunk metadata
type ColumnChunkMetaData ¶
type ColumnChunkMetaData struct {
// contains filtered or unexported fields
}
ColumnChunkMetaData is a proxy around format.ColumnChunkMetaData containing all of the information and metadata for a given column chunk and its associated Column
func NewColumnChunkMetaData ¶
func NewColumnChunkMetaData(column *format.ColumnChunk, descr *schema.Column, writerVersion *AppVersion, rowGroupOrdinal, columnOrdinal int16, fileDecryptor encryption.FileDecryptor) (*ColumnChunkMetaData, error)
NewColumnChunkMetaData creates an instance of the metadata from a column chunk and descriptor.
This is primarily used internally or between the subpackages. ColumnChunkMetaDataBuilder should be used by consumers instead of using this directly.
func (*ColumnChunkMetaData) BloomFilterOffset ¶
func (c *ColumnChunkMetaData) BloomFilterOffset() int64
BloomFilterOffset is the byte offset from the beginning of the file to the bloom filter data.
func (*ColumnChunkMetaData) Compression ¶
func (c *ColumnChunkMetaData) Compression() compress.Compression
Compression provides the type of compression used for this particular chunk.
func (*ColumnChunkMetaData) CryptoMetadata ¶
func (c *ColumnChunkMetaData) CryptoMetadata() *format.ColumnCryptoMetaData
CryptoMetadata returns the cryptographic metadata for how this column was encrypted and how to decrypt it.
func (*ColumnChunkMetaData) DataPageOffset ¶
func (c *ColumnChunkMetaData) DataPageOffset() int64
DataPageOffset returns the location in the file where the data pages begin for this column
func (*ColumnChunkMetaData) DictionaryPageOffset ¶
func (c *ColumnChunkMetaData) DictionaryPageOffset() int64
DictionaryPageOffset returns the location in the file where the dictionary page starts
func (*ColumnChunkMetaData) EncodingStats ¶
func (c *ColumnChunkMetaData) EncodingStats() []PageEncodingStats
EncodingStats connects the order of encodings based on the list of pages and types
func (*ColumnChunkMetaData) Encodings ¶
func (c *ColumnChunkMetaData) Encodings() []parquet.Encoding
Encodings returns the list of different encodings used in this chunk
func (*ColumnChunkMetaData) Equals ¶
func (c *ColumnChunkMetaData) Equals(other *ColumnChunkMetaData) bool
func (*ColumnChunkMetaData) FileOffset ¶
func (c *ColumnChunkMetaData) FileOffset() int64
FileOffset is the location in the file where the column data begins
func (*ColumnChunkMetaData) FilePath ¶
func (c *ColumnChunkMetaData) FilePath() string
FilePath gives the name of the parquet file if provided in the metadata
func (*ColumnChunkMetaData) HasDictionaryPage ¶
func (c *ColumnChunkMetaData) HasDictionaryPage() bool
HasDictionaryPage returns true if there is a dictionary page offset set in this metadata.
func (*ColumnChunkMetaData) HasIndexPage ¶
func (c *ColumnChunkMetaData) HasIndexPage() bool
HasIndexPage returns true if the offset for the index page is set in the metadata
func (*ColumnChunkMetaData) IndexPageOffset ¶
func (c *ColumnChunkMetaData) IndexPageOffset() int64
IndexPageOffset is the location in the file where the index page starts.
func (*ColumnChunkMetaData) NumValues ¶
func (c *ColumnChunkMetaData) NumValues() int64
NumValues is the number of values stored in just this chunk including nulls.
func (*ColumnChunkMetaData) PathInSchema ¶
func (c *ColumnChunkMetaData) PathInSchema() parquet.ColumnPath
PathInSchema is the full path to this column from the root of the schema including any nested columns
func (*ColumnChunkMetaData) Statistics ¶
func (c *ColumnChunkMetaData) Statistics() (TypedStatistics, error)
Statistics can return nil if there are no stats in this metadata
func (*ColumnChunkMetaData) StatsSet ¶
func (c *ColumnChunkMetaData) StatsSet() (bool, error)
StatsSet returns true only if there are statistics set in the metadata and the column descriptor has a sort order that is not SortUnknown
It also checks the writer version to ensure that it was not written by a version of parquet which is known to have incorrect stat computations.
func (*ColumnChunkMetaData) TotalCompressedSize ¶
func (c *ColumnChunkMetaData) TotalCompressedSize() int64
TotalCompressedSize will be equal to TotalUncompressedSize if the data is not compressed. Otherwise this will be the size of the actual data in the file.
func (*ColumnChunkMetaData) TotalUncompressedSize ¶
func (c *ColumnChunkMetaData) TotalUncompressedSize() int64
TotalUncompressedSize is the total size of the raw data after uncompressing the chunk
func (*ColumnChunkMetaData) Type ¶
func (c *ColumnChunkMetaData) Type() parquet.Type
Type is the physical storage type used in the parquet file for this column chunk.
type ColumnChunkMetaDataBuilder ¶
type ColumnChunkMetaDataBuilder struct {
// contains filtered or unexported fields
}
ColumnChunkMetaDataBuilder is used during writing to construct metadata for a given column chunk while writing, providing a proxy around constructing the actual thrift object.
func NewColumnChunkMetaDataBuilder ¶
func NewColumnChunkMetaDataBuilder(props *parquet.WriterProperties, column *schema.Column) *ColumnChunkMetaDataBuilder
func NewColumnChunkMetaDataBuilderWithContents ¶
func NewColumnChunkMetaDataBuilderWithContents(props *parquet.WriterProperties, column *schema.Column, chunk *format.ColumnChunk) *ColumnChunkMetaDataBuilder
NewColumnChunkMetaDataBuilderWithContents will construct a builder and start it with the provided column chunk information rather than with an empty column chunk.
func (*ColumnChunkMetaDataBuilder) Contents ¶
func (c *ColumnChunkMetaDataBuilder) Contents() *format.ColumnChunk
Contents returns the underlying thrift ColumnChunk object so that it can be used for constructing or duplicating column metadata
func (*ColumnChunkMetaDataBuilder) Descr ¶
func (c *ColumnChunkMetaDataBuilder) Descr() *schema.Column
Descr returns the associated column descriptor for this column chunk
func (*ColumnChunkMetaDataBuilder) Finish ¶
func (c *ColumnChunkMetaDataBuilder) Finish(info ChunkMetaInfo, hasDict, dictFallback bool, encStats EncodingStats, metaEncryptor encryption.Encryptor) error
Finish finalizes the metadata with the given offsets, flushes any compression that needs to be done, and performs any encryption if an encryptor is provided.
func (*ColumnChunkMetaDataBuilder) SetFilePath ¶
func (c *ColumnChunkMetaDataBuilder) SetFilePath(val string)
func (*ColumnChunkMetaDataBuilder) SetStats ¶
func (c *ColumnChunkMetaDataBuilder) SetStats(val EncodedStatistics)
func (*ColumnChunkMetaDataBuilder) TotalCompressedSize ¶
func (c *ColumnChunkMetaDataBuilder) TotalCompressedSize() int64
func (*ColumnChunkMetaDataBuilder) WriteTo ¶
func (c *ColumnChunkMetaDataBuilder) WriteTo(w io.Writer) (int64, error)
WriteTo will always return 0 as the int64 since the thrift writer library does not return the number of bytes written; the (int64, error) signature is used only to match the standard WriteTo interfaces.
type EncodedStatistics ¶
type EncodedStatistics struct { HasMax bool Max []byte HasMin bool Min []byte Signed bool HasNullCount bool NullCount int64 HasDistinctCount bool DistinctCount int64 }
EncodedStatistics are raw statistics with encoded values that will be written to the parquet file, or were read from the parquet file.
func (*EncodedStatistics) ApplyStatSizeLimits ¶
func (e *EncodedStatistics) ApplyStatSizeLimits(length int)
ApplyStatSizeLimits sets the maximum size of the min/max values.
Following parquet-mr, stats larger than the max size are not written at all rather than being truncated. The rationale is that some engines may use the minimum value in the page as the true minimum for aggregations, and there is no way to mark that a value has been truncated and is therefore only a lower bound and not a value actually in the page.
func (*EncodedStatistics) IsSet ¶
func (e *EncodedStatistics) IsSet() bool
IsSet returns true iff one of the Has* values is true.
func (*EncodedStatistics) SetDistinctCount ¶
func (e *EncodedStatistics) SetDistinctCount(val int64) *EncodedStatistics
SetDistinctCount sets the DistinctCount to val and sets HasDistinctCount to true
func (*EncodedStatistics) SetMax ¶
func (e *EncodedStatistics) SetMax(val []byte) *EncodedStatistics
SetMax sets the encoded Max value to val and sets HasMax to true
func (*EncodedStatistics) SetMin ¶
func (e *EncodedStatistics) SetMin(val []byte) *EncodedStatistics
SetMin sets the encoded Min value to val, and sets HasMin to true
func (*EncodedStatistics) SetNullCount ¶
func (e *EncodedStatistics) SetNullCount(val int64) *EncodedStatistics
SetNullCount sets the NullCount to val and sets HasNullCount to true
func (*EncodedStatistics) ToThrift ¶
func (e *EncodedStatistics) ToThrift() (stats *format.Statistics)
type EncodingStats ¶
type EncodingStats struct { DictEncodingStats map[parquet.Encoding]int32 DataEncodingStats map[parquet.Encoding]int32 }
EncodingStats is a helper struct for passing the encoding stat information for finishing up metadata for a column chunk.
type FileCryptoMetadata ¶
type FileCryptoMetadata struct {
// contains filtered or unexported fields
}
FileCryptoMetadata is a proxy for the thrift fileCryptoMetadata object
func NewFileCryptoMetaData ¶
func NewFileCryptoMetaData(metadata []byte) (ret FileCryptoMetadata, err error)
NewFileCryptoMetaData takes in the raw serialized bytes to deserialize, storing the number of bytes that were actually deserialized.
func (FileCryptoMetadata) EncryptionAlgorithm ¶
func (fc FileCryptoMetadata) EncryptionAlgorithm() parquet.Algorithm
EncryptionAlgorithm constructs the object from the thrift instance of the encryption algorithm
func (FileCryptoMetadata) KeyMetadata ¶
func (fc FileCryptoMetadata) KeyMetadata() []byte
func (FileCryptoMetadata) Len ¶
func (fc FileCryptoMetadata) Len() int
Len is the number of bytes that were deserialized to create this object
type FileMetaData ¶
type FileMetaData struct { *format.FileMetaData Schema *schema.Schema FileDecryptor encryption.FileDecryptor // contains filtered or unexported fields }
FileMetaData is a proxy around the underlying thrift FileMetaData object to make it easier to use and interact with.
func NewFileMetaData ¶
func NewFileMetaData(data []byte, fileDecryptor encryption.FileDecryptor) (*FileMetaData, error)
NewFileMetaData takes in the raw bytes of the serialized metadata to deserialize and will attempt to decrypt the footer if a decryptor is provided.
func (*FileMetaData) AppendRowGroups ¶
func (f *FileMetaData) AppendRowGroups(other *FileMetaData) error
AppendRowGroups will add all of the rowgroup metadata from other to the current file metadata
func (*FileMetaData) EncryptionAlgorithm ¶
func (f *FileMetaData) EncryptionAlgorithm() parquet.Algorithm
EncryptionAlgorithm constructs the algorithm object from the thrift information or returns an empty instance if it was not set.
func (*FileMetaData) Equals ¶
func (f *FileMetaData) Equals(other *FileMetaData) bool
func (*FileMetaData) KeyValueMetadata ¶
func (f *FileMetaData) KeyValueMetadata() KeyValueMetadata
func (*FileMetaData) NumSchemaElements ¶
func (f *FileMetaData) NumSchemaElements() int
NumSchemaElements is the length of the flattened schema list in the thrift
func (*FileMetaData) RowGroup ¶
func (f *FileMetaData) RowGroup(i int) *RowGroupMetaData
RowGroup provides the metadata for the (0-based) index of the row group
func (*FileMetaData) Serialize ¶
func (f *FileMetaData) Serialize(ctx context.Context) ([]byte, error)
func (*FileMetaData) SerializeString ¶
func (f *FileMetaData) SerializeString(ctx context.Context) (string, error)
func (*FileMetaData) SetFilePath ¶
func (f *FileMetaData) SetFilePath(path string)
SetFilePath will set the file path into all of the columns in each row group.
func (*FileMetaData) Size ¶
func (f *FileMetaData) Size() int
Size is the length of the raw serialized metadata bytes in the footer
func (*FileMetaData) Subset ¶
func (f *FileMetaData) Subset(rowGroups []int) (*FileMetaData, error)
Subset will construct a new FileMetaData object containing only the requested row groups by index
func (*FileMetaData) VerifySignature ¶
func (f *FileMetaData) VerifySignature(signature []byte) bool
VerifySignature constructs a cryptographic signature using the FileDecryptor of the footer and then verifies its integrity.
Panics if f.FileDecryptor is nil
func (*FileMetaData) Version ¶
func (f *FileMetaData) Version() parquet.Version
Version returns the "version" of the file
WARNING: The value returned by this method is unreliable as 1) the parquet file metadata stores the version as a single integer and 2) some producers are known to always write a hardcoded value. Therefore you cannot use this value to know which features are used in the file.
func (*FileMetaData) WriteTo ¶
func (f *FileMetaData) WriteTo(w io.Writer, encryptor encryption.Encryptor) (int64, error)
WriteTo will serialize and write out this file metadata, encrypting it if appropriate.
If it is an encrypted file with a plaintext footer, then we will write the signature with the unencrypted footer.
func (*FileMetaData) WriterVersion ¶
func (f *FileMetaData) WriterVersion() *AppVersion
WriterVersion returns the constructed application version from the created by string
type FileMetaDataBuilder ¶
type FileMetaDataBuilder struct {
// contains filtered or unexported fields
}
FileMetaDataBuilder is a proxy for more easily constructing file metadata particularly used when writing a file out.
func NewFileMetadataBuilder ¶
func NewFileMetadataBuilder(schema *schema.Schema, props *parquet.WriterProperties, kvmeta KeyValueMetadata) *FileMetaDataBuilder
NewFileMetadataBuilder will use the default writer properties if nil is passed for the writer properties and nil is allowable for the key value metadata.
func (*FileMetaDataBuilder) AppendRowGroup ¶
func (f *FileMetaDataBuilder) AppendRowGroup() *RowGroupMetaDataBuilder
AppendRowGroup adds a rowgroup to the list and returns a builder for that row group
func (*FileMetaDataBuilder) Finish ¶
func (f *FileMetaDataBuilder) Finish() (*FileMetaData, error)
Finish will finalize the metadata of the number of rows, row groups, version etc. This will clear out this filemetadatabuilder so it can be re-used
func (*FileMetaDataBuilder) GetFileCryptoMetaData ¶
func (f *FileMetaDataBuilder) GetFileCryptoMetaData() *FileCryptoMetadata
GetFileCryptoMetaData returns the cryptographic information for encrypting/decrypting the file.
type FixedLenByteArrayStatistics ¶
type FixedLenByteArrayStatistics struct {
// contains filtered or unexported fields
}
FixedLenByteArrayStatistics is the typed interface for managing stats for a column of FixedLenByteArray type.
func NewFixedLenByteArrayStatistics ¶
func NewFixedLenByteArrayStatistics(descr *schema.Column, mem memory.Allocator) *FixedLenByteArrayStatistics
NewFixedLenByteArrayStatistics constructs an appropriate stat object type using the given column descriptor and allocator.
Panics if the physical type of descr is not parquet.Type.FixedLenByteArray
func NewFixedLenByteArrayStatisticsFromEncoded ¶
func NewFixedLenByteArrayStatisticsFromEncoded(descr *schema.Column, mem memory.Allocator, nvalues int64, encoded StatProvider) *FixedLenByteArrayStatistics
NewFixedLenByteArrayStatisticsFromEncoded will construct a properly typed statistics object initializing it with the provided information.
func (*FixedLenByteArrayStatistics) DistinctCount ¶
func (s *FixedLenByteArrayStatistics) DistinctCount() int64
func (*FixedLenByteArrayStatistics) Encode ¶
func (s *FixedLenByteArrayStatistics) Encode() (enc EncodedStatistics, err error)
Encode returns a populated EncodedStatistics object
func (*FixedLenByteArrayStatistics) EncodeMax ¶
func (s *FixedLenByteArrayStatistics) EncodeMax() []byte
EncodeMax returns the current encoded max value with plain encoding
ByteArray stats do not include the length in the encoding
func (*FixedLenByteArrayStatistics) EncodeMin ¶
func (s *FixedLenByteArrayStatistics) EncodeMin() []byte
EncodeMin returns the encoded min value with plain encoding.
ByteArray stats do not include the length in the encoding.
func (*FixedLenByteArrayStatistics) Equals ¶
func (s *FixedLenByteArrayStatistics) Equals(other TypedStatistics) bool
Equals returns true only if both objects are the same type, have the same min and max values, null count, distinct count and number of values.
func (*FixedLenByteArrayStatistics) HasDistinctCount ¶
func (s *FixedLenByteArrayStatistics) HasDistinctCount() bool
func (*FixedLenByteArrayStatistics) HasMinMax ¶
func (s *FixedLenByteArrayStatistics) HasMinMax() bool
func (*FixedLenByteArrayStatistics) HasNullCount ¶
func (s *FixedLenByteArrayStatistics) HasNullCount() bool
func (*FixedLenByteArrayStatistics) Max ¶
func (s *FixedLenByteArrayStatistics) Max() parquet.FixedLenByteArray
func (*FixedLenByteArrayStatistics) Merge ¶
func (s *FixedLenByteArrayStatistics) Merge(other TypedStatistics)
Merge merges the stats from other into this stat object, updating the null count, distinct count, number of values and the min/max if appropriate.
func (*FixedLenByteArrayStatistics) Min ¶
func (s *FixedLenByteArrayStatistics) Min() parquet.FixedLenByteArray
func (*FixedLenByteArrayStatistics) MinMaxEqual ¶
func (s *FixedLenByteArrayStatistics) MinMaxEqual(rhs *FixedLenByteArrayStatistics) bool
MinMaxEqual returns true if both stat objects have the same Min and Max values
func (*FixedLenByteArrayStatistics) NullCount ¶
func (s *FixedLenByteArrayStatistics) NullCount() int64
func (*FixedLenByteArrayStatistics) NumValues ¶
func (s *FixedLenByteArrayStatistics) NumValues() int64
func (*FixedLenByteArrayStatistics) SetMinMax ¶
func (s *FixedLenByteArrayStatistics) SetMinMax(argMin, argMax parquet.FixedLenByteArray)
SetMinMax updates the min and max values only if they are not currently set or if argMin is less than the current min / argMax is greater than the current max
func (*FixedLenByteArrayStatistics) Update ¶
func (s *FixedLenByteArrayStatistics) Update(values []parquet.FixedLenByteArray, numNull int64)
Update is used to add more values to the current stat object, finding the min and max values etc.
func (*FixedLenByteArrayStatistics) UpdateSpaced ¶
func (s *FixedLenByteArrayStatistics) UpdateSpaced(values []parquet.FixedLenByteArray, validBits []byte, validBitsOffset, numNull int64)
UpdateSpaced is just like Update, but for spaced values using validBits to determine and skip null values.
type Float32Statistics ¶
type Float32Statistics struct {
// contains filtered or unexported fields
}
Float32Statistics is the typed interface for managing stats for a column of Float32 type.
func NewFloat32Statistics ¶
func NewFloat32Statistics(descr *schema.Column, mem memory.Allocator) *Float32Statistics
NewFloat32Statistics constructs an appropriate stat object type using the given column descriptor and allocator.
Panics if the physical type of descr is not parquet.Type.Float
func NewFloat32StatisticsFromEncoded ¶
func NewFloat32StatisticsFromEncoded(descr *schema.Column, mem memory.Allocator, nvalues int64, encoded StatProvider) *Float32Statistics
NewFloat32StatisticsFromEncoded will construct a properly typed statistics object initializing it with the provided information.
func (*Float32Statistics) DistinctCount ¶
func (s *Float32Statistics) DistinctCount() int64
func (*Float32Statistics) Encode ¶
func (s *Float32Statistics) Encode() (enc EncodedStatistics, err error)
Encode returns a populated EncodedStatistics object
func (*Float32Statistics) EncodeMax ¶
func (s *Float32Statistics) EncodeMax() []byte
EncodeMax returns the current encoded max value with plain encoding
ByteArray stats do not include the length in the encoding
func (*Float32Statistics) EncodeMin ¶
func (s *Float32Statistics) EncodeMin() []byte
EncodeMin returns the encoded min value with plain encoding.
ByteArray stats do not include the length in the encoding.
func (*Float32Statistics) Equals ¶
func (s *Float32Statistics) Equals(other TypedStatistics) bool
Equals returns true only if both objects are the same type, have the same min and max values, null count, distinct count and number of values.
func (*Float32Statistics) HasDistinctCount ¶
func (s *Float32Statistics) HasDistinctCount() bool
func (*Float32Statistics) HasNullCount ¶
func (s *Float32Statistics) HasNullCount() bool
func (*Float32Statistics) Max ¶
func (s *Float32Statistics) Max() float32
func (*Float32Statistics) Merge ¶
func (s *Float32Statistics) Merge(other TypedStatistics)
Merge merges the stats from other into this stat object, updating the null count, distinct count, number of values and the min/max if appropriate.
func (*Float32Statistics) Min ¶
func (s *Float32Statistics) Min() float32
func (*Float32Statistics) MinMaxEqual ¶
func (s *Float32Statistics) MinMaxEqual(rhs *Float32Statistics) bool
MinMaxEqual returns true if both stat objects have the same Min and Max values
func (*Float32Statistics) SetMinMax ¶
func (s *Float32Statistics) SetMinMax(argMin, argMax float32)
SetMinMax updates the min and max values only if they are not currently set or if argMin is less than the current min / argMax is greater than the current max
func (*Float32Statistics) Update ¶
func (s *Float32Statistics) Update(values []float32, numNull int64)
Update is used to add more values to the current stat object, finding the min and max values etc.
func (*Float32Statistics) UpdateSpaced ¶
func (s *Float32Statistics) UpdateSpaced(values []float32, validBits []byte, validBitsOffset, numNull int64)
UpdateSpaced is just like Update, but for spaced values using validBits to determine and skip null values.
type Float64Statistics ¶
type Float64Statistics struct {
// contains filtered or unexported fields
}
Float64Statistics is the typed interface for managing stats for a column of Float64 type.
func NewFloat64Statistics ¶
func NewFloat64Statistics(descr *schema.Column, mem memory.Allocator) *Float64Statistics
NewFloat64Statistics constructs an appropriate stat object type using the given column descriptor and allocator.
Panics if the physical type of descr is not parquet.Type.Double
func NewFloat64StatisticsFromEncoded ¶
func NewFloat64StatisticsFromEncoded(descr *schema.Column, mem memory.Allocator, nvalues int64, encoded StatProvider) *Float64Statistics
NewFloat64StatisticsFromEncoded will construct a properly typed statistics object initializing it with the provided information.
func (*Float64Statistics) DistinctCount ¶
func (s *Float64Statistics) DistinctCount() int64
func (*Float64Statistics) Encode ¶
func (s *Float64Statistics) Encode() (enc EncodedStatistics, err error)
Encode returns a populated EncodedStatistics object
func (*Float64Statistics) EncodeMax ¶
func (s *Float64Statistics) EncodeMax() []byte
EncodeMax returns the current encoded max value with plain encoding
ByteArray stats do not include the length in the encoding
func (*Float64Statistics) EncodeMin ¶
func (s *Float64Statistics) EncodeMin() []byte
EncodeMin returns the encoded min value with plain encoding.
ByteArray stats do not include the length in the encoding.
func (*Float64Statistics) Equals ¶
func (s *Float64Statistics) Equals(other TypedStatistics) bool
Equals returns true only if both objects are the same type, have the same min and max values, null count, distinct count and number of values.
func (*Float64Statistics) HasDistinctCount ¶
func (s *Float64Statistics) HasDistinctCount() bool
func (*Float64Statistics) HasNullCount ¶
func (s *Float64Statistics) HasNullCount() bool
func (*Float64Statistics) Max ¶
func (s *Float64Statistics) Max() float64
func (*Float64Statistics) Merge ¶
func (s *Float64Statistics) Merge(other TypedStatistics)
Merge merges the stats from other into this stat object, updating the null count, distinct count, number of values and the min/max if appropriate.
func (*Float64Statistics) Min ¶
func (s *Float64Statistics) Min() float64
func (*Float64Statistics) MinMaxEqual ¶
func (s *Float64Statistics) MinMaxEqual(rhs *Float64Statistics) bool
MinMaxEqual returns true if both stat objects have the same Min and Max values
func (*Float64Statistics) SetMinMax ¶
func (s *Float64Statistics) SetMinMax(argMin, argMax float64)
SetMinMax updates the min and max values only if they are not currently set or if argMin is less than the current min / argMax is greater than the current max
func (*Float64Statistics) Update ¶
func (s *Float64Statistics) Update(values []float64, numNull int64)
Update is used to add more values to the current stat object, finding the min and max values etc.
func (*Float64Statistics) UpdateSpaced ¶
func (s *Float64Statistics) UpdateSpaced(values []float64, validBits []byte, validBitsOffset, numNull int64)
UpdateSpaced is just like Update, but for spaced values using validBits to determine and skip null values.
type Int32Statistics ¶
type Int32Statistics struct {
// contains filtered or unexported fields
}
Int32Statistics is the typed interface for managing stats for a column of Int32 type.
func NewInt32Statistics ¶
func NewInt32Statistics(descr *schema.Column, mem memory.Allocator) *Int32Statistics
NewInt32Statistics constructs an appropriate stat object type using the given column descriptor and allocator.
Panics if the physical type of descr is not parquet.Type.Int32
func NewInt32StatisticsFromEncoded ¶
func NewInt32StatisticsFromEncoded(descr *schema.Column, mem memory.Allocator, nvalues int64, encoded StatProvider) *Int32Statistics
NewInt32StatisticsFromEncoded will construct a properly typed statistics object initializing it with the provided information.
func (*Int32Statistics) DistinctCount ¶
func (s *Int32Statistics) DistinctCount() int64
func (*Int32Statistics) Encode ¶
func (s *Int32Statistics) Encode() (enc EncodedStatistics, err error)
Encode returns a populated EncodedStatistics object
func (*Int32Statistics) EncodeMax ¶
func (s *Int32Statistics) EncodeMax() []byte
EncodeMax returns the current encoded max value with plain encoding
ByteArray stats do not include the length in the encoding
func (*Int32Statistics) EncodeMin ¶
func (s *Int32Statistics) EncodeMin() []byte
EncodeMin returns the encoded min value with plain encoding.
ByteArray stats do not include the length in the encoding.
func (*Int32Statistics) Equals ¶
func (s *Int32Statistics) Equals(other TypedStatistics) bool
Equals returns true only if both objects are the same type, have the same min and max values, null count, distinct count and number of values.
func (*Int32Statistics) HasDistinctCount ¶
func (s *Int32Statistics) HasDistinctCount() bool
func (*Int32Statistics) HasNullCount ¶
func (s *Int32Statistics) HasNullCount() bool
func (*Int32Statistics) Max ¶
func (s *Int32Statistics) Max() int32
func (*Int32Statistics) Merge ¶
func (s *Int32Statistics) Merge(other TypedStatistics)
Merge merges the stats from other into this stat object, updating the null count, distinct count, number of values and the min/max if appropriate.
func (*Int32Statistics) Min ¶
func (s *Int32Statistics) Min() int32
func (*Int32Statistics) MinMaxEqual ¶
func (s *Int32Statistics) MinMaxEqual(rhs *Int32Statistics) bool
MinMaxEqual returns true if both stat objects have the same Min and Max values
func (*Int32Statistics) SetMinMax ¶
func (s *Int32Statistics) SetMinMax(argMin, argMax int32)
SetMinMax updates the min and max values only if they are not currently set or if argMin is less than the current min / argMax is greater than the current max
func (*Int32Statistics) Update ¶
func (s *Int32Statistics) Update(values []int32, numNull int64)
Update is used to add more values to the current stat object, finding the min and max values etc.
func (*Int32Statistics) UpdateSpaced ¶
func (s *Int32Statistics) UpdateSpaced(values []int32, validBits []byte, validBitsOffset, numNull int64)
UpdateSpaced is just like Update, but for spaced values using validBits to determine and skip null values.
type Int64Statistics ¶
type Int64Statistics struct {
// contains filtered or unexported fields
}
Int64Statistics is the typed interface for managing stats for a column of Int64 type.
func NewInt64Statistics ¶
func NewInt64Statistics(descr *schema.Column, mem memory.Allocator) *Int64Statistics
NewInt64Statistics constructs an appropriate stat object type using the given column descriptor and allocator.
Panics if the physical type of descr is not parquet.Type.Int64
func NewInt64StatisticsFromEncoded ¶
func NewInt64StatisticsFromEncoded(descr *schema.Column, mem memory.Allocator, nvalues int64, encoded StatProvider) *Int64Statistics
NewInt64StatisticsFromEncoded will construct a properly typed statistics object initializing it with the provided information.
func (*Int64Statistics) DistinctCount ¶
func (s *Int64Statistics) DistinctCount() int64
func (*Int64Statistics) Encode ¶
func (s *Int64Statistics) Encode() (enc EncodedStatistics, err error)
Encode returns a populated EncodedStatistics object
func (*Int64Statistics) EncodeMax ¶
func (s *Int64Statistics) EncodeMax() []byte
EncodeMax returns the current encoded max value with plain encoding
ByteArray stats do not include the length in the encoding
func (*Int64Statistics) EncodeMin ¶
func (s *Int64Statistics) EncodeMin() []byte
EncodeMin returns the encoded min value with plain encoding.
ByteArray stats do not include the length in the encoding.
func (*Int64Statistics) Equals ¶
func (s *Int64Statistics) Equals(other TypedStatistics) bool
Equals returns true only if both objects are the same type, have the same min and max values, null count, distinct count and number of values.
func (*Int64Statistics) HasDistinctCount ¶
func (s *Int64Statistics) HasDistinctCount() bool
func (*Int64Statistics) HasNullCount ¶
func (s *Int64Statistics) HasNullCount() bool
func (*Int64Statistics) Max ¶
func (s *Int64Statistics) Max() int64
func (*Int64Statistics) Merge ¶
func (s *Int64Statistics) Merge(other TypedStatistics)
Merge merges the stats from other into this stat object, updating the null count, distinct count, number of values and the min/max if appropriate.
func (*Int64Statistics) Min ¶
func (s *Int64Statistics) Min() int64
func (*Int64Statistics) MinMaxEqual ¶
func (s *Int64Statistics) MinMaxEqual(rhs *Int64Statistics) bool
MinMaxEqual returns true if both stat objects have the same Min and Max values
func (*Int64Statistics) SetMinMax ¶
func (s *Int64Statistics) SetMinMax(argMin, argMax int64)
SetMinMax updates the min and max values only if they are not currently set or if argMin is less than the current min / argMax is greater than the current max
func (*Int64Statistics) Update ¶
func (s *Int64Statistics) Update(values []int64, numNull int64)
Update is used to add more values to the current stat object, finding the min and max values etc.
func (*Int64Statistics) UpdateSpaced ¶
func (s *Int64Statistics) UpdateSpaced(values []int64, validBits []byte, validBitsOffset, numNull int64)
UpdateSpaced is just like Update, but for spaced values using validBits to determine and skip null values.
type Int96Statistics ¶
type Int96Statistics struct {
// contains filtered or unexported fields
}
Int96Statistics is the typed interface for managing stats for a column of Int96 type.
func NewInt96Statistics ¶
func NewInt96Statistics(descr *schema.Column, mem memory.Allocator) *Int96Statistics
NewInt96Statistics constructs an appropriate stat object type using the given column descriptor and allocator.
Panics if the physical type of descr is not parquet.Type.Int96
func NewInt96StatisticsFromEncoded ¶
func NewInt96StatisticsFromEncoded(descr *schema.Column, mem memory.Allocator, nvalues int64, encoded StatProvider) *Int96Statistics
NewInt96StatisticsFromEncoded will construct a properly typed statistics object initializing it with the provided information.
func (*Int96Statistics) DistinctCount ¶
func (s *Int96Statistics) DistinctCount() int64
func (*Int96Statistics) Encode ¶
func (s *Int96Statistics) Encode() (enc EncodedStatistics, err error)
Encode returns a populated EncodedStatistics object
func (*Int96Statistics) EncodeMax ¶
func (s *Int96Statistics) EncodeMax() []byte
EncodeMax returns the current encoded max value with plain encoding
ByteArray stats do not include the length in the encoding
func (*Int96Statistics) EncodeMin ¶
func (s *Int96Statistics) EncodeMin() []byte
EncodeMin returns the encoded min value with plain encoding.
ByteArray stats do not include the length in the encoding.
func (*Int96Statistics) Equals ¶
func (s *Int96Statistics) Equals(other TypedStatistics) bool
Equals returns true only if both objects are the same type, have the same min and max values, null count, distinct count and number of values.
func (*Int96Statistics) HasDistinctCount ¶
func (s *Int96Statistics) HasDistinctCount() bool
func (*Int96Statistics) HasNullCount ¶
func (s *Int96Statistics) HasNullCount() bool
func (*Int96Statistics) Max ¶
func (s *Int96Statistics) Max() parquet.Int96
func (*Int96Statistics) Merge ¶
func (s *Int96Statistics) Merge(other TypedStatistics)
Merge merges the stats from other into this stat object, updating the null count, distinct count, number of values and the min/max if appropriate.
func (*Int96Statistics) Min ¶
func (s *Int96Statistics) Min() parquet.Int96
func (*Int96Statistics) MinMaxEqual ¶
func (s *Int96Statistics) MinMaxEqual(rhs *Int96Statistics) bool
MinMaxEqual returns true if both stat objects have the same Min and Max values
func (*Int96Statistics) SetMinMax ¶
func (s *Int96Statistics) SetMinMax(argMin, argMax parquet.Int96)
SetMinMax updates the min and max values only if they are not currently set or if argMin is less than the current min / argMax is greater than the current max
func (*Int96Statistics) Update ¶
func (s *Int96Statistics) Update(values []parquet.Int96, numNull int64)
Update is used to add more values to the current stat object, finding the min and max values etc.
func (*Int96Statistics) UpdateSpaced ¶
func (s *Int96Statistics) UpdateSpaced(values []parquet.Int96, validBits []byte, validBitsOffset, numNull int64)
UpdateSpaced is just like Update, but for spaced values using validBits to determine and skip null values.
type KeyValueMetadata ¶
KeyValueMetadata is an alias for a slice of thrift keyvalue pairs.
It is presumed that the metadata should all be utf8 valid.
func NewKeyValueMetadata ¶
func NewKeyValueMetadata() KeyValueMetadata
NewKeyValueMetadata is equivalent to make(KeyValueMetadata, 0)
func (*KeyValueMetadata) Append ¶
func (k *KeyValueMetadata) Append(key, value string) error
Append adds the passed in key and value to the metadata, if either contains any invalid utf8 runes, then it is not added and an error is returned.
func (KeyValueMetadata) Equals ¶
func (k KeyValueMetadata) Equals(other KeyValueMetadata) bool
Equals compares all of the metadata keys and values to check they are equal
func (KeyValueMetadata) FindValue ¶
func (k KeyValueMetadata) FindValue(key string) *string
func (KeyValueMetadata) Keys ¶
func (k KeyValueMetadata) Keys() (ret []string)
func (KeyValueMetadata) Len ¶
func (k KeyValueMetadata) Len() int
func (KeyValueMetadata) Values ¶
func (k KeyValueMetadata) Values() (ret []string)
type PageEncodingStats ¶
PageEncodingStats is used for counting the number of pages of specific types with the given internal encoding.
type RowGroupMetaData ¶
RowGroupMetaData is a proxy around the thrift RowGroup meta data object
func NewRowGroupMetaData ¶
func NewRowGroupMetaData(rg *format.RowGroup, sc *schema.Schema, version *AppVersion, decryptor encryption.FileDecryptor) *RowGroupMetaData
NewRowGroupMetaData constructs an object from the underlying thrift objects and schema, decrypting if provided and necessary. This is primarily used internally and consumers should use the RowGroupMetaDataBuilder rather than this directly.
func (*RowGroupMetaData) ColumnChunk ¶
func (r *RowGroupMetaData) ColumnChunk(i int) (*ColumnChunkMetaData, error)
ColumnChunk returns the metadata for the requested (0-based) chunk index
func (*RowGroupMetaData) Equals ¶
func (r *RowGroupMetaData) Equals(other *RowGroupMetaData) bool
func (*RowGroupMetaData) FileOffset ¶
func (r *RowGroupMetaData) FileOffset() int64
FileOffset is the location in the file where the data for this rowgroup begins
func (*RowGroupMetaData) NumColumns ¶
func (r *RowGroupMetaData) NumColumns() int
NumColumns returns the number of column metadata objects in this row group
func (*RowGroupMetaData) NumRows ¶
func (r *RowGroupMetaData) NumRows() int64
NumRows is just the number of rows in this row group. All columns have the same number of rows for a row group regardless of repetition and definition levels.
func (*RowGroupMetaData) Ordinal ¶
func (r *RowGroupMetaData) Ordinal() int16
Ordinal is the row group number in order for the given file.
func (*RowGroupMetaData) TotalByteSize ¶
func (r *RowGroupMetaData) TotalByteSize() int64
TotalByteSize is the total size of this rowgroup on disk
func (*RowGroupMetaData) TotalCompressedSize ¶
func (r *RowGroupMetaData) TotalCompressedSize() int64
type RowGroupMetaDataBuilder ¶
type RowGroupMetaDataBuilder struct {
// contains filtered or unexported fields
}
RowGroupMetaDataBuilder is a convenience object for constructing row group metadata information. Primarily used in conjunction with writing new files.
func NewRowGroupMetaDataBuilder ¶
func NewRowGroupMetaDataBuilder(props *parquet.WriterProperties, schema *schema.Schema, rg *format.RowGroup) *RowGroupMetaDataBuilder
NewRowGroupMetaDataBuilder returns a builder using the given properties and underlying thrift object.
This is primarily used internally, consumers should use the file metadatabuilder and call AppendRowGroup on it to get instances of RowGroupMetaDataBuilder
func (*RowGroupMetaDataBuilder) CurrentColumn ¶
func (r *RowGroupMetaDataBuilder) CurrentColumn() int
CurrentColumn returns the current column chunk (0-based) index that is being built.
Returns -1 until the first time NextColumnChunk is called.
func (*RowGroupMetaDataBuilder) Finish ¶
func (r *RowGroupMetaDataBuilder) Finish(totalBytesWritten int64, ordinal int16) error
Finish should be called when complete and updates the metadata with the final file offset, and total compressed sizes. totalBytesWritten gets written as the TotalByteSize for the row group and Ordinal should be the index of the row group being written. e.g. first row group should be 0, second is 1, and so on...
func (*RowGroupMetaDataBuilder) NextColumnChunk ¶
func (r *RowGroupMetaDataBuilder) NextColumnChunk() *ColumnChunkMetaDataBuilder
NextColumnChunk appends a new column chunk, updates the column index, and returns a builder for that column chunk's metadata
func (*RowGroupMetaDataBuilder) NumColumns ¶
func (r *RowGroupMetaDataBuilder) NumColumns() int
NumColumns returns the current number of columns in this metadata
func (*RowGroupMetaDataBuilder) NumRows ¶
func (r *RowGroupMetaDataBuilder) NumRows() int64
func (*RowGroupMetaDataBuilder) SetNumRows ¶
func (r *RowGroupMetaDataBuilder) SetNumRows(nrows int)
type StatProvider ¶
type TypedStatistics ¶
type TypedStatistics interface { // Type is the underlying physical type for this stat block Type() parquet.Type // Returns true if there is a min and max value set for this stat object HasMinMax() bool // Returns true if a nullcount has been set HasNullCount() bool // returns true only if a distinct count has been set // current implementation of the writer does not automatically populate // the distinct count right now. HasDistinctCount() bool NullCount() int64 DistinctCount() int64 NumValues() int64 // return the column descriptor that this stat object was initialized with Descr() *schema.Column // Encode the current min value and return the bytes. ByteArray does not // include the len in the encoded bytes, otherwise this is identical to // plain encoding EncodeMin() []byte // Encode the current max value and return the bytes. ByteArray does not // include the len in the encoded bytes, otherwise this is identical to // plain encoding EncodeMax() []byte // Populate an EncodedStatistics object from the current stats Encode() (EncodedStatistics, error) // Resets all values to 0 to enable reusing this stat object for multiple // columns, by calling Encode to get the finished values and then calling // reset Reset() // Merge the min/max/nullcounts and distinct count from the passed stat object // into this one. Merge(TypedStatistics) }
TypedStatistics is the base interface for dealing with stats as they are being populated
func NewStatistics ¶
func NewStatistics(descr *schema.Column, mem memory.Allocator) TypedStatistics
NewStatistics uses the type in the column descriptor to construct the appropriate typed stats object. If mem is nil, then memory.DefaultAllocator will be used.
func NewStatisticsFromEncoded ¶
func NewStatisticsFromEncoded(descr *schema.Column, mem memory.Allocator, nvalues int64, encoded StatProvider) TypedStatistics
NewStatisticsFromEncoded uses the provided information to initialize a typed stat object by checking the type of the provided column descriptor.
If mem is nil, then memory.DefaultAllocator is used.