Documentation ¶
Overview ¶
Package parquet is a library for working with parquet files. For an overview of Parquet's qualities as a storage format, see this blog post: https://blog.twitter.com/engineering/en_us/a/2013/dremel-made-simple-with-parquet
Or see the Parquet documentation: https://parquet.apache.org/documentation/latest/
Index ¶
- Constants
- Variables
- func CopyPages(dst PageWriter, src PageReader) (numValues int64, err error)
- func CopyRows(dst RowWriter, src RowReader) (int64, error)
- func CopyValues(dst ValueWriter, src ValueReader) (int64, error)
- func Equal(v1, v2 Value) bool
- func LookupCompressionCodec(codec format.CompressionCodec) compress.Codec
- func LookupEncoding(enc format.Encoding) encoding.Encoding
- func Print(w io.Writer, name string, node Node) error
- func PrintIndent(w io.Writer, name string, node Node, pattern, newline string) error
- type BloomFilter
- type BloomFilterColumn
- type Buffer
- func (buf *Buffer) Column(i int) ColumnChunk
- func (buf *Buffer) Len() int
- func (buf *Buffer) Less(i, j int) bool
- func (buf *Buffer) NumColumns() int
- func (buf *Buffer) NumRows() int64
- func (buf *Buffer) Reset()
- func (buf *Buffer) Rows() Rows
- func (buf *Buffer) Schema() *Schema
- func (buf *Buffer) Size() int64
- func (buf *Buffer) SortingColumns() []SortingColumn
- func (buf *Buffer) Swap(i, j int)
- func (buf *Buffer) Write(row interface{}) error
- func (buf *Buffer) WriteRow(row Row) error
- func (buf *Buffer) WriteRowGroup(rowGroup RowGroup) (int64, error)
- type BufferedPage
- type Column
- func (c *Column) ChildByIndex(index int) Node
- func (c *Column) ChildByName(name string) Node
- func (c *Column) ChildNames() []string
- func (c *Column) Children() []*Column
- func (c *Column) Column(name string) *Column
- func (c *Column) Columns() []*Column
- func (c *Column) Compression() []compress.Codec
- func (c *Column) Depth() int
- func (c *Column) Encoding() []encoding.Encoding
- func (c *Column) GoType() reflect.Type
- func (c *Column) Index() int
- func (c *Column) MaxDefinitionLevel() int
- func (c *Column) MaxRepetitionLevel() int
- func (c *Column) Name() string
- func (c *Column) NumChildren() int
- func (c *Column) Optional() bool
- func (c *Column) Pages() Pages
- func (c *Column) Path() []string
- func (c *Column) Repeated() bool
- func (c *Column) Required() bool
- func (c *Column) String() string
- func (c *Column) Type() Type
- func (c *Column) ValueByIndex(base reflect.Value, index int) reflect.Value
- func (c *Column) ValueByName(base reflect.Value, name string) reflect.Value
- type ColumnBuffer
- type ColumnChunk
- type ColumnIndex
- type ColumnIndexer
- type ColumnReader
- type CompressedPage
- type Conversion
- type ConvertError
- type DataPageHeader
- type DataPageHeaderV1
- func (v1 DataPageHeaderV1) DefinitionLevelEncoding() format.Encoding
- func (v1 DataPageHeaderV1) Encoding() format.Encoding
- func (v1 DataPageHeaderV1) IsCompressed(codec format.CompressionCodec) bool
- func (v1 DataPageHeaderV1) MaxValue() []byte
- func (v1 DataPageHeaderV1) MinValue() []byte
- func (v1 DataPageHeaderV1) NullCount() int64
- func (v1 DataPageHeaderV1) NumValues() int64
- func (v1 DataPageHeaderV1) PageType() format.PageType
- func (v1 DataPageHeaderV1) RepetitionLevelEncoding() format.Encoding
- func (v1 DataPageHeaderV1) String() string
- type DataPageHeaderV2
- func (v2 DataPageHeaderV2) DefinitionLevelEncoding() format.Encoding
- func (v2 DataPageHeaderV2) DefinitionLevelsByteLength() int64
- func (v2 DataPageHeaderV2) Encoding() format.Encoding
- func (v2 DataPageHeaderV2) IsCompressed(codec format.CompressionCodec) bool
- func (v2 DataPageHeaderV2) MaxValue() []byte
- func (v2 DataPageHeaderV2) MinValue() []byte
- func (v2 DataPageHeaderV2) NullCount() int64
- func (v2 DataPageHeaderV2) NumNulls() int64
- func (v2 DataPageHeaderV2) NumRows() int64
- func (v2 DataPageHeaderV2) NumValues() int64
- func (v2 DataPageHeaderV2) PageType() format.PageType
- func (v2 DataPageHeaderV2) RepetitionLevelEncoding() format.Encoding
- func (v2 DataPageHeaderV2) RepetitionLevelsByteLength() int64
- func (v2 DataPageHeaderV2) String() string
- type Dictionary
- type DictionaryPageHeader
- type File
- func (f *File) ColumnIndexes() []format.ColumnIndex
- func (f *File) Lookup(key string) (value string, ok bool)
- func (f *File) NumRowGroups() int
- func (f *File) OffsetIndexes() []format.OffsetIndex
- func (f *File) ReadAt(b []byte, off int64) (int, error)
- func (f *File) ReadPageIndex() ([]format.ColumnIndex, []format.OffsetIndex, error)
- func (f *File) Root() *Column
- func (f *File) RowGroup(i int) RowGroup
- func (f *File) Size() int64
- type FileConfig
- type FileOption
- type Group
- func (g Group) ChildByName(name string) Node
- func (g Group) ChildNames() []string
- func (g Group) Compression() []compress.Codec
- func (g Group) Encoding() []encoding.Encoding
- func (g Group) GoType() reflect.Type
- func (g Group) NumChildren() int
- func (g Group) Optional() bool
- func (g Group) Repeated() bool
- func (g Group) Required() bool
- func (g Group) String() string
- func (g Group) Type() Type
- func (g Group) ValueByName(base reflect.Value, name string) reflect.Value
- type IndexedNode
- type Kind
- type Node
- func BSON() Node
- func Compressed(node Node, codecs ...compress.Codec) Node
- func Date() Node
- func Decimal(scale, precision int, typ Type) Node
- func Encoded(node Node, encodings ...encoding.Encoding) Node
- func Enum() Node
- func Int(bitWidth int) Node
- func JSON() Node
- func Leaf(typ Type) Node
- func List(of Node) Node
- func Map(key, value Node) Node
- func Optional(node Node) Node
- func Repeated(node Node) Node
- func Required(node Node) Node
- func String() Node
- func Time(unit TimeUnit) Node
- func Timestamp(unit TimeUnit) Node
- func UUID() Node
- func Uint(bitWidth int) Node
- type OffsetIndex
- type Page
- type PageBufferPool
- type PageHeader
- type PageReader
- type PageWriter
- type Pages
- type Reader
- type ReaderConfig
- type ReaderOption
- type RequiredReader
- type RequiredWriter
- type Row
- type RowGroup
- type RowGroupConfig
- type RowGroupOption
- type RowGroupReader
- type RowGroupWriter
- type RowReadSeeker
- type RowReader
- type RowReaderAt
- type RowReaderFrom
- type RowReaderWithSchema
- type RowSeeker
- type RowWriter
- type RowWriterAt
- type RowWriterTo
- type RowWriterWithSchema
- type Rows
- type Schema
- func (s *Schema) ChildByName(name string) Node
- func (s *Schema) ChildNames() []string
- func (s *Schema) Compression() []compress.Codec
- func (s *Schema) ConfigureReader(config *ReaderConfig)
- func (s *Schema) ConfigureRowGroup(config *RowGroupConfig)
- func (s *Schema) ConfigureWriter(config *WriterConfig)
- func (s *Schema) Deconstruct(row Row, value interface{}) Row
- func (s *Schema) Encoding() []encoding.Encoding
- func (s *Schema) GoType() reflect.Type
- func (s *Schema) Name() string
- func (s *Schema) NumChildren() int
- func (s *Schema) Optional() bool
- func (s *Schema) Reconstruct(value interface{}, row Row) error
- func (s *Schema) Repeated() bool
- func (s *Schema) Required() bool
- func (s *Schema) String() string
- func (s *Schema) Type() Type
- func (s *Schema) ValueByName(base reflect.Value, name string) reflect.Value
- type SortConfig
- type SortFunc
- type SortOption
- type SortingColumn
- type TimeUnit
- type Type
- type Value
- func (v Value) AppendBytes(b []byte) []byte
- func (v Value) Boolean() bool
- func (v Value) ByteArray() []byte
- func (v Value) Bytes() []byte
- func (v Value) Clone() Value
- func (v Value) Column() int
- func (v Value) DefinitionLevel() int
- func (v Value) Double() float64
- func (v Value) Float() float32
- func (v Value) Format(w fmt.State, r rune)
- func (v Value) GoString() string
- func (v Value) Int32() int32
- func (v Value) Int64() int64
- func (v Value) Int96() deprecated.Int96
- func (v Value) IsNull() bool
- func (v Value) Kind() Kind
- func (v Value) Level(repetitionLevel, definitionLevel, columnIndex int) Value
- func (v Value) RepetitionLevel() int
- func (v Value) String() string
- type ValueReader
- type ValueReaderFrom
- type ValueWriter
- type ValueWriterTo
- type WrappedNode
- type Writer
- func (w *Writer) Close() error
- func (w *Writer) Flush() error
- func (w *Writer) ReadRowsFrom(rows RowReader) (written int64, err error)
- func (w *Writer) Reset(output io.Writer)
- func (w *Writer) Schema() *Schema
- func (w *Writer) Write(row interface{}) error
- func (w *Writer) WriteRow(row Row) error
- func (w *Writer) WriteRowGroup(rowGroup RowGroup) (int64, error)
- type WriterConfig
- type WriterOption
- func BloomFilters(filters ...BloomFilterColumn) WriterOption
- func ColumnIndexSizeLimit(sizeLimit int) WriterOption
- func ColumnPageBuffers(buffers PageBufferPool) WriterOption
- func CreatedBy(createdBy string) WriterOption
- func DataPageStatistics(enabled bool) WriterOption
- func DataPageVersion(version int) WriterOption
- func KeyValueMetadata(key, value string) WriterOption
- func PageBufferSize(size int) WriterOption
Constants ¶
const (
	DefaultCreatedBy            = "github.com/segmentio/parquet-go"
	DefaultColumnIndexSizeLimit = 16
	DefaultColumnBufferSize     = 1 * 1024 * 1024
	DefaultPageBufferSize       = 1 * 1024 * 1024
	DefaultDataPageVersion      = 2
	DefaultDataPageStatistics   = false
	DefaultSkipPageIndex        = false
	DefaultSkipBloomFilters     = false
)
const (
	// MaxColumnDepth is the maximum column depth supported by this package.
	MaxColumnDepth = math.MaxInt8

	// MaxColumnIndex is the maximum column index supported by this package.
	MaxColumnIndex = math.MaxInt16

	// MaxRepetitionLevel is the maximum repetition level supported by this package.
	MaxRepetitionLevel = math.MaxInt8

	// MaxDefinitionLevel is the maximum definition level supported by this package.
	MaxDefinitionLevel = math.MaxInt8
)
Variables ¶
var (
	// Uncompressed is a parquet compression codec representing uncompressed
	// pages.
	Uncompressed uncompressed.Codec

	// Snappy is the SNAPPY parquet compression codec.
	Snappy snappy.Codec

	// Gzip is the GZIP parquet compression codec.
	Gzip = gzip.Codec{
		Level: gzip.DefaultCompression,
	}

	// Brotli is the BROTLI parquet compression codec.
	Brotli = brotli.Codec{
		Quality: brotli.DefaultQuality,
		LGWin:   brotli.DefaultLGWin,
	}

	// Zstd is the ZSTD parquet compression codec.
	Zstd = zstd.Codec{
		Level:       zstd.DefaultLevel,
		Concurrency: zstd.DefaultConcurrency,
	}

	// Lz4Raw is the LZ4_RAW parquet compression codec.
	Lz4Raw = lz4.Codec{
		BlockSize:   lz4.DefaultBlockSize,
		Level:       lz4.DefaultLevel,
		Concurrency: lz4.DefaultConcurrency,
	}
)
var (
	// Plain is the default parquet encoding.
	Plain plain.Encoding

	// RLE is the hybrid bit-pack/run-length parquet encoding.
	RLE rle.Encoding

	// PlainDictionary is the plain dictionary parquet encoding.
	//
	// This encoding should not be used anymore in parquet 2.0 and later,
	// it is implemented for backwards compatibility to support reading
	// files that were encoded with older parquet libraries.
	PlainDictionary plain.DictionaryEncoding

	// RLEDictionary is the RLE dictionary parquet encoding.
	RLEDictionary rle.DictionaryEncoding

	// DeltaBinaryPacked is the delta binary packed parquet encoding.
	DeltaBinaryPacked delta.BinaryPackedEncoding

	// DeltaLengthByteArray is the delta length byte array parquet encoding.
	DeltaLengthByteArray delta.LengthByteArrayEncoding

	// DeltaByteArray is the delta byte array parquet encoding.
	DeltaByteArray delta.ByteArrayEncoding

	// ByteStreamSplit is an encoding for floating-point data.
	ByteStreamSplit bytestreamsplit.Encoding
)
var (
	// ErrCorrupted is an error returned by the Err method of ColumnPages
	// instances when they encountered a mismatch between the CRC checksum
	// recorded in a page header and the one computed while reading the page
	// data.
	ErrCorrupted = errors.New("corrupted parquet page")

	// ErrMissingRootColumn is an error returned when opening an invalid
	// parquet file which does not have a root column.
	ErrMissingRootColumn = errors.New("parquet file is missing a root column")

	// ErrRowGroupSchemaMissing is an error returned when attempting to write
	// a row group but the source has no schema.
	ErrRowGroupSchemaMissing = errors.New("cannot write rows to a row group which has no schema")

	// ErrRowGroupSchemaMismatch is an error returned when attempting to write
	// a row group but the source and destination schemas differ.
	ErrRowGroupSchemaMismatch = errors.New("cannot write row groups with mismatching schemas")

	// ErrRowGroupSortingColumnsMismatch is an error returned when attempting
	// to write a row group but the sorting columns differ in the source and
	// destination.
	ErrRowGroupSortingColumnsMismatch = errors.New("cannot write row groups with mismatching sorting columns")

	// ErrSeekOutOfRange is an error returned when seeking to a row index
	// which is less than the first row of a page.
	ErrSeekOutOfRange = errors.New("seek to row index out of page range")
)
Functions ¶
func CopyPages ¶
func CopyPages(dst PageWriter, src PageReader) (numValues int64, err error)
CopyPages copies pages from src to dst, returning the number of values that were copied.
The function returns any error it encounters reading or writing pages, except for io.EOF from the reader which indicates that there were no more pages to read.
func CopyRows ¶
CopyRows copies rows from src to dst.
The underlying types of src and dst are tested to determine if they expose information about the schema of rows that are read and expected to be written. If the schema information is available but does not match, the function will attempt to automatically convert the rows from the source schema to the destination schema.
As an optimization, the src argument may implement RowWriterTo to bypass the default row copy logic and provide its own. The dst argument may also implement RowReaderFrom for the same purpose.
The function returns the number of rows written, or any error encountered other than io.EOF.
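For example, a whole file can be rewritten by pairing a Reader with a Writer, both of which satisfy the RowReader and RowWriter interfaces (a minimal sketch; input and output stand for an io.ReaderAt over a parquet file and a destination io.Writer):

	reader := parquet.NewReader(input)
	writer := parquet.NewWriter(output)

	// CopyRows performs the row-by-row copy, delegating to the
	// RowWriterTo/RowReaderFrom fast paths when they are available.
	n, err := parquet.CopyRows(writer, reader)
	if err != nil {
		...
	}

	// Closing the writer flushes the last row group and writes the footer.
	if err := writer.Close(); err != nil {
		...
	}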
func CopyValues ¶
func CopyValues(dst ValueWriter, src ValueReader) (int64, error)
CopyValues copies values from src to dst, returning the number of values that were written.
As an optimization, the reader and writer may choose to implement ValueReaderFrom and ValueWriterTo to provide their own copy logic.
The function returns any error it encounters reading or writing values, except for io.EOF from the reader which indicates that there were no more values to read.
func Equal ¶
Equal returns true if v1 and v2 are equal.
Values are considered equal if they are of the same physical type and hold the same Go values. For BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY, the content of the underlying byte arrays are tested for equality.
Note that the repetition levels, definition levels, and column indexes are not compared by this function.
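A small sketch of this behavior, assuming the package's ValueOf constructor to build values from Go values:

	a := parquet.ValueOf(int32(42))
	b := parquet.ValueOf(int32(42)).Level(0, 1, 3)

	// true: the levels and column index set on b are not compared
	parquet.Equal(a, b)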
func LookupCompressionCodec ¶
func LookupCompressionCodec(codec format.CompressionCodec) compress.Codec
LookupCompressionCodec returns the compression codec associated with the given code.
The function never returns nil. If the codec is not supported, an "unsupported" codec is returned.
func LookupEncoding ¶
LookupEncoding returns the parquet encoding associated with the given code.
The function never returns nil. If the encoding is not supported, encoding.NotSupported is returned.
Types ¶
type BloomFilter ¶
type BloomFilter interface {
	// Implement the io.ReaderAt interface as a mechanism to allow reading the
	// raw bits of the filter.
	io.ReaderAt

	// Returns the size of the bloom filter (in bytes).
	Size() int64

	// Tests whether the given value is present in the filter.
	//
	// A non-nil error may be returned if reading the filter failed. This may
	// happen if the filter was lazily loaded from a storage medium during the
	// call to Check for example. Applications that can guarantee that the
	// filter was in memory at the time Check was called can safely ignore the
	// error, which would always be nil in this case.
	Check(value Value) (bool, error)
}
BloomFilter is an interface allowing applications to test whether a key exists in a bloom filter.
type BloomFilterColumn ¶
type BloomFilterColumn interface {
	// Returns the path of the column that the filter applies to.
	Path() []string

	// Returns the hashing algorithm used when inserting values into a bloom
	// filter.
	Hash() bloom.Hash

	// NewFilter constructs a new bloom filter configured to hold the given
	// number of values and bits of filter per value.
	NewFilter(numValues int64, bitsPerValue uint) bloom.MutableFilter
}
The BloomFilterColumn interface is a declarative representation of bloom filters used when configuring filters on a parquet writer.
func SplitBlockFilter ¶
func SplitBlockFilter(path ...string) BloomFilterColumn
SplitBlockFilter constructs a split block bloom filter object for the column at the given path.
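For example, a writer might be configured to maintain a bloom filter on a hypothetical "user_id" column (a sketch, assuming the NewWriter constructor):

	writer := parquet.NewWriter(output,
		parquet.BloomFilters(
			// "user_id" stands for the path of a column in the schema
			parquet.SplitBlockFilter("user_id"),
		),
	)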
type Buffer ¶
type Buffer struct {
// contains filtered or unexported fields
}
Buffer represents an in-memory group of parquet rows.
The main purpose of the Buffer type is to provide a way to sort rows before writing them to a parquet file. Buffer implements sort.Interface as a way to support reordering the rows that have been written to it.
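A minimal sketch of this pattern, assuming an Ascending helper to construct the SortingColumn, a hypothetical "timestamp" column, and application-provided rows and writer values:

	buffer := parquet.NewBuffer(
		parquet.SortingColumns(parquet.Ascending("timestamp")),
	)
	for _, row := range rows {
		if err := buffer.Write(row); err != nil {
			...
		}
	}

	// Buffer implements sort.Interface; sorting orders the buffered rows
	// by the configured sorting columns.
	sort.Sort(buffer)

	// The sorted buffer implements RowGroup and can be written out whole.
	if _, err := writer.WriteRowGroup(buffer); err != nil {
		...
	}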
func NewBuffer ¶
func NewBuffer(options ...RowGroupOption) *Buffer
NewBuffer constructs a new buffer, using the given list of buffer options to configure the buffer returned by the function.
The function panics if the buffer configuration is invalid. Programs that cannot guarantee the validity of the options passed to NewBuffer should construct the buffer configuration independently prior to calling this function:
config, err := parquet.NewRowGroupConfig(options...)
if err != nil {
	// handle the configuration error
	...
} else {
	// this call to create a buffer is guaranteed not to panic
	buffer := parquet.NewBuffer(config)
	...
}
func (*Buffer) Column ¶
func (buf *Buffer) Column(i int) ColumnChunk
Column returns the buffer column at index i.
The method panics if i is negative or beyond the last column index in buf.
func (*Buffer) NumColumns ¶
NumColumns returns the number of columns in the buffer.
The count will be zero until a schema is configured on buf.
func (*Buffer) Reset ¶
func (buf *Buffer) Reset()
Reset clears the content of the buffer, allowing it to be reused.
func (*Buffer) Rows ¶
Rows returns a reader exposing the current content of the buffer.
The buffer and the returned reader share memory. Mutating the buffer while reading rows may result in non-deterministic behavior.
func (*Buffer) Schema ¶
Schema returns the schema of the buffer.
The schema is either configured by passing a Schema in the option list when constructing the buffer, or lazily discovered when the first row is written.
func (*Buffer) SortingColumns ¶
func (buf *Buffer) SortingColumns() []SortingColumn
SortingColumns returns the list of columns by which the buffer will be sorted.
The sorting order is configured by passing a SortingColumns option when constructing the buffer.
type BufferedPage ¶
type BufferedPage interface {
	Page

	// Returns a copy of the page which does not share any of the buffers, but
	// contains the same values, repetition and definition levels.
	Clone() BufferedPage

	// Returns a new page which is a slice of the receiver between row indexes
	// i and j.
	Slice(i, j int64) BufferedPage

	// Expose the lists of repetition and definition levels of the page.
	//
	// The returned slices may be empty when the page has no repetition or
	// definition levels.
	RepetitionLevels() []int8
	DefinitionLevels() []int8

	// Writes the page to the given encoder.
	WriteTo(encoding.Encoder) error
}
BufferedPage is an extension of the Page interface implemented by pages that are buffered in memory.
type Column ¶
type Column struct {
// contains filtered or unexported fields
}
Column represents a column in a parquet file.
Methods of Column values are safe to call concurrently from multiple goroutines.
Column instances satisfy the Node interface.
func (*Column) ChildByIndex ¶
ChildByIndex returns a Node value representing the child column at the given index.
This method contributes to satisfying the IndexedNode interface.
func (*Column) ChildByName ¶
ChildByName returns a Node value representing the child column matching the name passed as argument.
This method contributes to satisfying the Node interface.
func (*Column) ChildNames ¶
ChildNames returns the names of child columns.
This method contributes to satisfying the Node interface.
func (*Column) Children ¶
Children returns the children of c.
The method returns a reference to an internal field of c that the application must treat as a read-only value.
func (*Column) Columns ¶
Columns returns the list of child columns.
The method returns the same slice across multiple calls; the program must treat it as a read-only value.
func (*Column) Compression ¶
Compression returns the compression codecs used by this column.
func (*Column) Index ¶
Index returns the position of the column in a row. Only leaf columns have a column index; the method returns -1 when called on non-leaf columns.
func (*Column) MaxDefinitionLevel ¶
MaxDefinitionLevel returns the maximum value of definition levels on this column.
func (*Column) MaxRepetitionLevel ¶
MaxRepetitionLevel returns the maximum value of repetition levels on this column.
func (*Column) NumChildren ¶
NumChildren returns the number of child columns.
This method contributes to satisfying the Node interface.
func (*Column) Type ¶
Type returns the type of the column.
The returned value is unspecified if c is not a leaf column.
func (*Column) ValueByIndex ¶
ValueByIndex returns the sub-value in base for the child column at the given index.
type ColumnBuffer ¶
type ColumnBuffer interface {
	// Exposes a read-only view of the column buffer.
	ColumnChunk

	// Allows reading rows back from the column by calling ReadRowAt.
	RowReaderAt

	// Provides the main mechanism for writing values to the column via the
	// WriteRow method. WriteRow must be called with a row containing only
	// the value for this column; unless this is a repeated column, the row
	// must contain a single value.
	RowWriter

	// The column implements ValueWriter as a mechanism to optimize the copy
	// of values into the buffer in contexts where the row information is
	// provided by the values because the repetition and definition levels
	// are set.
	ValueWriter

	// For indexed columns, returns the underlying dictionary holding the
	// column values. If the column is not indexed, nil is returned.
	Dictionary() Dictionary

	// Returns a copy of the column. The returned copy shares no memory with
	// the original, mutations of either column will not modify the other.
	Clone() ColumnBuffer

	// Returns the column as a BufferedPage.
	Page() BufferedPage

	// Clears all rows written to the column.
	Reset()

	// Returns the current capacity of the column (rows).
	Cap() int

	// Returns the number of rows currently written to the column.
	Len() int

	// Compares rows at index i and j and reports whether i < j.
	Less(i, j int) bool

	// Swaps rows at index i and j.
	Swap(i, j int)

	// Returns the size of the column buffer in bytes.
	Size() int64
}
ColumnBuffer is an interface representing columns of a row group.
ColumnBuffer implements sort.Interface as a way to support reordering the rows that have been written to it.
type ColumnChunk ¶
type ColumnChunk interface {
	// Returns the column type.
	Type() Type

	// Returns the index of this column in its parent row group.
	Column() int

	// Returns a reader exposing the pages of the column.
	Pages() Pages

	// Returns the components of the page index for this column chunk,
	// containing details about the content and location of pages within the
	// chunk.
	//
	// Note that the returned value may be the same across calls to these
	// methods, programs must treat those as read-only.
	//
	// If the column chunk does not have a page index, the methods return nil.
	ColumnIndex() ColumnIndex
	OffsetIndex() OffsetIndex
	BloomFilter() BloomFilter

	// Returns the number of values in the column chunk.
	//
	// This quantity may differ from the number of rows in the parent row
	// group because repeated columns may hold zero or more values per row.
	NumValues() int64
}
The ColumnChunk interface represents individual columns of a row group.
type ColumnIndex ¶
type ColumnIndex interface {
	// NumPages returns the number of pages in the column index.
	NumPages() int

	// Returns the number of null values in the page at the given index.
	NullCount(int) int64

	// Tells whether the page at the given index contains null values only.
	NullPage(int) bool

	// Returns the min/max bounds for the page at the given index in the
	// column.
	MinValue(int) Value
	MaxValue(int) Value

	// IsAscending returns true if the column index min/max values are sorted
	// in ascending order (based on the ordering rules of the column's logical
	// type).
	IsAscending() bool

	// IsDescending returns true if the column index min/max values are sorted
	// in descending order (based on the ordering rules of the column's logical
	// type).
	IsDescending() bool
}
type ColumnIndexer ¶
type ColumnIndexer interface {
	// Resets the column indexer state.
	Reset()

	// Add a page to the column indexer.
	IndexPage(numValues, numNulls int64, min, max Value)

	// Generates a format.ColumnIndex value from the current state of the
	// column indexer.
	//
	// The returned value may reference internal buffers, in which case the
	// values remain valid until the next call to IndexPage or Reset on the
	// column indexer.
	ColumnIndex() format.ColumnIndex
}
The ColumnIndexer interface is implemented by types that support generating parquet column indexes.
The package does not export any types that implement this interface, programs must call NewColumnIndexer on a Type instance to construct column indexers.
type ColumnReader ¶
type ColumnReader interface {
	ValueReader

	// Returns the type of values read.
	Type() Type

	// Returns the column number of values read.
	Column() int

	// Resets the reader state to read values from the given decoder.
	//
	// Column readers created from parquet types are initialized to an empty
	// state and will return io.EOF on every read until a decoder is installed
	// via a call to Reset.
	Reset(decoder encoding.Decoder)
}
ColumnReader is an interface implemented by types which support reading columns of values. The interface extends ValueReader to work on top of parquet encodings.
Implementations of ColumnReader may also provide extensions that the application can detect using type assertions. For example, readers for columns of INT32 values may implement the parquet.Int32Reader interface as a mechanism to provide a type safe and more efficient access to the column values.
type CompressedPage ¶
type CompressedPage interface {
	Page

	// Returns a representation of the page header.
	PageHeader() PageHeader

	// Returns a reader exposing the content of the compressed page.
	PageData() io.Reader

	// Returns the size of the page data.
	PageSize() int64

	// CRC returns the IEEE CRC32 checksum of the page.
	CRC() uint32
}
CompressedPage is an extension of the Page interface implemented by pages that have been compressed to their on-file representation.
type Conversion ¶
type Conversion interface {
	// Applies the conversion logic on the src row, returning the result
	// appended to dst.
	Convert(dst, src Row) (Row, error)

	// Converts the given column index in the target schema to the original
	// column index in the source schema of the conversion.
	Column(int) int

	// Returns the target schema of the conversion.
	Schema() *Schema
}
Conversion is an interface implemented by types that provide conversion of parquet rows from one schema to another.
Conversion instances must be safe to use concurrently from multiple goroutines.
func Convert ¶
func Convert(to, from Node) (conv Conversion, err error)
Convert constructs a conversion function from one parquet schema to another.
The function supports converting between schemas where the source or target have extra columns; if there are more columns in the source, they will be stripped out of the rows. Extra columns in the target schema will be set to null or zero values.
The returned function is intended to be used to append the converted source row to the destination buffer.
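A sketch of a typical use, where OldRecord and NewRecord are hypothetical Go types describing the source and target schemas:

	from := parquet.SchemaOf(OldRecord{})
	to := parquet.SchemaOf(NewRecord{})

	conv, err := parquet.Convert(to, from)
	if err != nil {
		...
	}

	// Appends the converted values of srcRow to dstRow; reusing the slice
	// across calls amortizes allocations.
	dstRow, err = conv.Convert(dstRow[:0], srcRow)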
type ConvertError ¶
ConvertError is an error type returned by calls to Convert when the conversion of parquet schemas is impossible or the input row for the conversion is malformed.
func (*ConvertError) Error ¶
func (e *ConvertError) Error() string
Error satisfies the error interface.
type DataPageHeader ¶
type DataPageHeader interface {
	PageHeader

	// Returns whether the page is compressed, according to the codec given as
	// argument and details stored in the page header.
	IsCompressed(format.CompressionCodec) bool

	// Returns the encoding of the repetition level section.
	RepetitionLevelEncoding() format.Encoding

	// Returns the encoding of the definition level section.
	DefinitionLevelEncoding() format.Encoding

	// Returns the number of null values in the page.
	NullCount() int64

	// Returns the minimum value in the page based on the ordering rules of
	// the column's logical type.
	//
	// As an optimization, the method may return the same slice across
	// multiple calls. Programs must treat the returned value as immutable to
	// prevent unpredictable behaviors.
	//
	// If the page contains only null values, an empty slice is returned.
	MinValue() []byte

	// Returns the maximum value in the page based on the ordering rules of
	// the column's logical type.
	//
	// As an optimization, the method may return the same slice across
	// multiple calls. Programs must treat the returned value as immutable to
	// prevent unpredictable behaviors.
	//
	// If the page contains only null values, an empty slice is returned.
	MaxValue() []byte
}
DataPageHeader is a specialization of the PageHeader interface implemented by data pages.
type DataPageHeaderV1 ¶
type DataPageHeaderV1 struct {
// contains filtered or unexported fields
}
DataPageHeaderV1 is an implementation of the DataPageHeader interface representing data pages version 1.
func (DataPageHeaderV1) DefinitionLevelEncoding ¶
func (v1 DataPageHeaderV1) DefinitionLevelEncoding() format.Encoding
func (DataPageHeaderV1) Encoding ¶
func (v1 DataPageHeaderV1) Encoding() format.Encoding
func (DataPageHeaderV1) IsCompressed ¶
func (v1 DataPageHeaderV1) IsCompressed(codec format.CompressionCodec) bool
func (DataPageHeaderV1) MaxValue ¶
func (v1 DataPageHeaderV1) MaxValue() []byte
func (DataPageHeaderV1) MinValue ¶
func (v1 DataPageHeaderV1) MinValue() []byte
func (DataPageHeaderV1) NullCount ¶
func (v1 DataPageHeaderV1) NullCount() int64
func (DataPageHeaderV1) NumValues ¶
func (v1 DataPageHeaderV1) NumValues() int64
func (DataPageHeaderV1) PageType ¶
func (v1 DataPageHeaderV1) PageType() format.PageType
func (DataPageHeaderV1) RepetitionLevelEncoding ¶
func (v1 DataPageHeaderV1) RepetitionLevelEncoding() format.Encoding
func (DataPageHeaderV1) String ¶
func (v1 DataPageHeaderV1) String() string
type DataPageHeaderV2 ¶
type DataPageHeaderV2 struct {
// contains filtered or unexported fields
}
DataPageHeaderV2 is an implementation of the DataPageHeader interface representing data pages version 2.
func (DataPageHeaderV2) DefinitionLevelEncoding ¶
func (v2 DataPageHeaderV2) DefinitionLevelEncoding() format.Encoding
func (DataPageHeaderV2) DefinitionLevelsByteLength ¶
func (v2 DataPageHeaderV2) DefinitionLevelsByteLength() int64
func (DataPageHeaderV2) Encoding ¶
func (v2 DataPageHeaderV2) Encoding() format.Encoding
func (DataPageHeaderV2) IsCompressed ¶
func (v2 DataPageHeaderV2) IsCompressed(codec format.CompressionCodec) bool
func (DataPageHeaderV2) MaxValue ¶
func (v2 DataPageHeaderV2) MaxValue() []byte
func (DataPageHeaderV2) MinValue ¶
func (v2 DataPageHeaderV2) MinValue() []byte
func (DataPageHeaderV2) NullCount ¶
func (v2 DataPageHeaderV2) NullCount() int64
func (DataPageHeaderV2) NumNulls ¶
func (v2 DataPageHeaderV2) NumNulls() int64
func (DataPageHeaderV2) NumRows ¶
func (v2 DataPageHeaderV2) NumRows() int64
func (DataPageHeaderV2) NumValues ¶
func (v2 DataPageHeaderV2) NumValues() int64
func (DataPageHeaderV2) PageType ¶
func (v2 DataPageHeaderV2) PageType() format.PageType
func (DataPageHeaderV2) RepetitionLevelEncoding ¶
func (v2 DataPageHeaderV2) RepetitionLevelEncoding() format.Encoding
func (DataPageHeaderV2) RepetitionLevelsByteLength ¶
func (v2 DataPageHeaderV2) RepetitionLevelsByteLength() int64
func (DataPageHeaderV2) String ¶
func (v2 DataPageHeaderV2) String() string
type Dictionary ¶
type Dictionary interface {
	// Returns the type that the dictionary was created from.
	Type() Type

	// Returns the number of values indexed in the dictionary.
	Len() int

	// Returns the dictionary value at the given index.
	Index(index int32) Value

	// Inserts values from the second slice to the dictionary and writes the
	// indexes at which each value was inserted to the first slice.
	//
	// The method panics if the length of the indexes slice is smaller than
	// the length of the values slice.
	Insert(indexes []int32, values []Value)

	// Given an array of dictionary indexes, looks up the values into the
	// array of values passed as second argument.
	//
	// The method panics if len(indexes) > len(values), or one of the indexes
	// is negative or greater than the highest index in the dictionary.
	Lookup(indexes []int32, values []Value)

	// Returns the min and max values found in the given indexes.
	Bounds(indexes []int32) (min, max Value)

	// Resets the dictionary to its initial state, removing all values.
	Reset()

	// Returns a BufferedPage representing the content of the dictionary.
	//
	// The returned page shares the underlying memory of the buffer, it
	// remains valid to use until the dictionary's Reset method is called.
	Page() BufferedPage
}
The Dictionary interface represents type-specific implementations of parquet dictionaries.
Programs can instantiate dictionaries by calling the NewDictionary method of a Type object.
type DictionaryPageHeader ¶
type DictionaryPageHeader struct {
// contains filtered or unexported fields
}
DictionaryPageHeader is an implementation of the PageHeader interface representing dictionary pages.
func (DictionaryPageHeader) Encoding ¶
func (dict DictionaryPageHeader) Encoding() format.Encoding
func (DictionaryPageHeader) IsSorted ¶
func (dict DictionaryPageHeader) IsSorted() bool
func (DictionaryPageHeader) NumValues ¶
func (dict DictionaryPageHeader) NumValues() int64
func (DictionaryPageHeader) PageType ¶
func (dict DictionaryPageHeader) PageType() format.PageType
func (DictionaryPageHeader) String ¶
func (dict DictionaryPageHeader) String() string
type File ¶
type File struct {
// contains filtered or unexported fields
}
File represents a parquet file. The layout of a Parquet file can be found here: https://github.com/apache/parquet-format#file-format
func OpenFile ¶
OpenFile opens a parquet file and reads the content between offset 0 and the given size in r.
Only the parquet magic bytes and footer are read; column chunks and other parts of the file are left untouched. This means that successfully opening a file does not validate that the pages have valid checksums.
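A sketch of opening a file from the local file system (the file name is illustrative), using the file's Stat method to obtain its size:

	f, err := os.Open("file.parquet")
	if err != nil {
		...
	}
	defer f.Close()

	s, err := f.Stat()
	if err != nil {
		...
	}

	p, err := parquet.OpenFile(f, s.Size())
	if err != nil {
		...
	}

	for i := 0; i < p.NumRowGroups(); i++ {
		rowGroup := p.RowGroup(i)
		...
	}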
func (*File) ColumnIndexes ¶
func (f *File) ColumnIndexes() []format.ColumnIndex
ColumnIndexes returns the page index of the parquet file f.
If the file did not contain a column index, the method returns an empty slice and nil error.
func (*File) Lookup ¶
Lookup returns the value associated with the given key in the file key/value metadata.
The ok boolean will be true if the key was found, false otherwise.
func (*File) NumRowGroups ¶
NumRowGroups returns the number of row groups in f.
func (*File) OffsetIndexes ¶
func (f *File) OffsetIndexes() []format.OffsetIndex
OffsetIndexes returns the page index of the parquet file f.
If the file did not contain an offset index, the method returns an empty slice and nil error.
func (*File) ReadAt ¶
ReadAt reads bytes into b from f at the given offset.
The method satisfies the io.ReaderAt interface.
func (*File) ReadPageIndex ¶
func (f *File) ReadPageIndex() ([]format.ColumnIndex, []format.OffsetIndex, error)
ReadPageIndex reads the page index section of the parquet file f.
If the file did not contain a page index, the method returns two empty slices and a nil error.
Only leaf columns have indexes; the returned indexes are arranged using the following layout:
	+----------------+
	| col 0: chunk 0 |
	+----------------+
	| col 1: chunk 0 |
	+----------------+
	| ...            |
	+----------------+
	| col 0: chunk 1 |
	+----------------+
	| col 1: chunk 1 |
	+----------------+
	| ...            |
	+----------------+
This method is useful in combination with the SkipPageIndex option to delay reading the page index section until after the file has been opened. Note that in this case the page index is not cached within the file; programs are expected to make use of it independently of the parquet package.
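A sketch of this deferred-read pattern:

	// Skip the page index to make opening the file cheaper.
	p, err := parquet.OpenFile(f, size, parquet.SkipPageIndex(true))
	if err != nil {
		...
	}

	// Later, load the page index on demand; the application is responsible
	// for caching the returned slices if it needs them again.
	columnIndexes, offsetIndexes, err := p.ReadPageIndex()
	if err != nil {
		...
	}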
type FileConfig ¶
The FileConfig type carries configuration options for parquet files.
FileConfig implements the FileOption interface so it can be used directly as argument to the OpenFile function when needed, for example:
f, err := parquet.OpenFile(reader, size, &parquet.FileConfig{
	SkipPageIndex:    true,
	SkipBloomFilters: true,
})
func DefaultFileConfig ¶
func DefaultFileConfig() *FileConfig
DefaultFileConfig returns a new FileConfig value initialized with the default file configuration.
func NewFileConfig ¶
func NewFileConfig(options ...FileOption) (*FileConfig, error)
NewFileConfig constructs a new file configuration applying the options passed as arguments.
The function returns a non-nil error if some of the options carry invalid configuration values.
func (*FileConfig) Apply ¶
func (c *FileConfig) Apply(options ...FileOption)
Apply applies the given list of options to c.
func (*FileConfig) ConfigureFile ¶
func (c *FileConfig) ConfigureFile(config *FileConfig)
ConfigureFile applies configuration options from c to config.
func (*FileConfig) Validate ¶
func (c *FileConfig) Validate() error
Validate returns a non-nil error if the configuration of c is invalid.
type FileOption ¶
type FileOption interface {
ConfigureFile(*FileConfig)
}
FileOption is an interface implemented by types that carry configuration options for parquet files.
func SkipPageIndex ¶
func SkipPageIndex(skip bool) FileOption
SkipPageIndex is a file configuration option which when set to true, prevents automatically reading the page index when opening a parquet file. This is useful as an optimization when programs know that they will not need to consume the page index.
Defaults to false.
type Group ¶
func (Group) ChildByName ¶
func (Group) ChildNames ¶
func (Group) Compression ¶
func (Group) NumChildren ¶
type IndexedNode ¶
type IndexedNode interface {
	Node

	// ChildByIndex returns the child node at the given index.
	ChildByIndex(index int) Node

	// ValueByIndex returns the sub-value of base at the given index.
	ValueByIndex(base reflect.Value, index int) reflect.Value
}
IndexedNode is an extension of the Node interface implemented by types which support indexing child nodes by their position.
type Kind ¶
type Kind int8
Kind is an enumeration type representing the physical types supported by the parquet type system.
const (
	Boolean           Kind = Kind(format.Boolean)
	Int32             Kind = Kind(format.Int32)
	Int64             Kind = Kind(format.Int64)
	Int96             Kind = Kind(format.Int96)
	Float             Kind = Kind(format.Float)
	Double            Kind = Kind(format.Double)
	ByteArray         Kind = Kind(format.ByteArray)
	FixedLenByteArray Kind = Kind(format.FixedLenByteArray)
)
type Node ¶
type Node interface {
	// Returns a human-readable representation of the parquet node.
	String() string

	// For leaf nodes, returns the type of values of the parquet column.
	//
	// Calling this method on non-leaf nodes will panic.
	Type() Type

	// Returns whether the parquet column is optional.
	Optional() bool

	// Returns whether the parquet column is repeated.
	Repeated() bool

	// Returns whether the parquet column is required.
	Required() bool

	// Returns the number of child nodes.
	//
	// The method returns zero on leaf nodes.
	NumChildren() int

	// Returns the sorted list of child node names.
	//
	// The method returns an empty slice on leaf nodes.
	//
	// As an optimization, the returned slice may be the same across calls to
	// this method. Applications should treat the return value as immutable.
	ChildNames() []string

	// Returns the child node associated with the given name, or nil if the
	// name did not exist.
	//
	// The method panics if it is called on a leaf node.
	ChildByName(name string) Node

	// ValueByName returns the sub-value with the given name in base.
	ValueByName(base reflect.Value, name string) reflect.Value

	// Returns the list of encodings used by the node and its children.
	//
	// The method may return an empty slice to indicate that only the plain
	// encoding is used.
	//
	// As an optimization, the returned slice may be the same across calls to
	// this method. Applications should treat the return value as immutable.
	Encoding() []encoding.Encoding

	// Returns the list of compression codecs used by the node and its
	// children.
	//
	// The method may return an empty slice to indicate that no compression
	// was configured on the node.
	//
	// As an optimization, the returned slice may be the same across calls to
	// this method. Applications should treat the return value as immutable.
	Compression() []compress.Codec

	// Returns the Go type that best represents the parquet node.
	//
	// For leaf nodes, this will be one of bool, int32, int64,
	// deprecated.Int96, float32, float64, string, []byte, or [N]byte.
	//
	// For groups, the method returns a struct type.
	//
	// If the method is called on a repeated node, the method returns a slice
	// of the underlying type.
	//
	// For optional nodes, the method returns a pointer of the underlying
	// type.
	//
	// For nodes that were constructed from Go values (e.g. using SchemaOf),
	// the method returns the original Go type.
	GoType() reflect.Type
}
Node values represent nodes of a parquet schema.
Nodes carry the type of values, as well as properties like whether the values are optional or repeated. Nodes with one or more children represent parquet groups and therefore do not have a logical type.
Nodes are immutable values and therefore safe to use concurrently from multiple goroutines.
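For example, a schema can be assembled from nodes with the Group type and the leaf constructors documented below (a sketch; the field names are illustrative):

	schema := parquet.NewSchema("event", parquet.Group{
		"id":   parquet.String(),
		"size": parquet.Int(64),
		"tags": parquet.Repeated(parquet.String()),
		"note": parquet.Optional(parquet.String()),
	})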
func BSON ¶
func BSON() Node
BSON constructs a leaf node of BSON logical type.
https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#bson
func Compressed ¶
Compressed wraps the node passed as argument to add the given list of compression codecs.
The function panics if it is called on a non-leaf node.
func Date ¶
func Date() Node
Date constructs a leaf node of DATE logical type.
https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#date
func Decimal ¶
Decimal constructs a leaf node of decimal logical type with the given scale, precision, and underlying type.
https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#decimal
func Encoded ¶
Encoded wraps the node passed as argument to add the given list of encodings.
The function panics if it is called on a non-leaf node, or if one of the encodings is not able to encode the node type.
func Enum ¶
func Enum() Node
Enum constructs a leaf node with a logical type representing enumerations.
https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#enum
func Int ¶
Int constructs a leaf node of signed integer logical type of the given bit width.
The bit width must be one of 8, 16, 32, 64, or the function will panic.
func JSON ¶
func JSON() Node
JSON constructs a leaf node of JSON logical type.
https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#json
func List ¶
List constructs a node of LIST logical type.
https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists
func Map ¶
Map constructs a node of MAP logical type.
https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#maps
func String ¶
func String() Node
String constructs a leaf node of UTF8 logical type.
https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#string
func Time ¶
Time constructs a leaf node of TIME logical type.
https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#time
func Timestamp ¶
Timestamp constructs a leaf node of TIMESTAMP logical type.
https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#timestamp
func UUID ¶
func UUID() Node
UUID constructs a leaf node of UUID logical type.
https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#uuid
type OffsetIndex ¶
type OffsetIndex interface {
	// NumPages returns the number of pages in the offset index.
	NumPages() int

	// Offset returns the offset starting from the beginning of the file for
	// the page at the given index.
	Offset(int) int64

	// CompressedPageSize returns the size of the page at the given index
	// (in bytes).
	CompressedPageSize(int) int64

	// FirstRowIndex returns the first row in the page at the given index.
	//
	// The returned row index is based on the row group that the page belongs
	// to, the first row has index zero.
	FirstRowIndex(int) int64
}
type Page ¶
type Page interface {
	// Returns the column index that this page belongs to.
	Column() int

	// If the page contains indexed values, calling this method returns the
	// dictionary in which the values are looked up. Otherwise, the method
	// returns nil.
	Dictionary() Dictionary

	// Returns the number of rows, values, and nulls in the page. The number
	// of rows may be less than the number of values in the page if the page
	// is part of a repeated column.
	NumRows() int64
	NumValues() int64
	NumNulls() int64

	// Returns the min and max values contained in the page.
	Bounds() (min, max Value)

	// Returns the size of the page in bytes (uncompressed).
	Size() int64

	// Returns a reader exposing the values contained in the page.
	//
	// Depending on the underlying implementation, the returned reader may
	// support reading an array of typed Go values by implementing interfaces
	// like parquet.Int32Reader. Applications should use type assertions on
	// the returned reader to determine whether those optimizations are
	// available.
	Values() ValueReader

	// Buffer returns the page as a BufferedPage, which may be the page itself
	// if it was already buffered.
	//
	// Compressed pages will be consumed to create the returned buffered page,
	// their content will not be readable anymore after the call.
	Buffer() BufferedPage
}
Page values represent sequences of parquet values. From the Parquet documentation: "Column chunks are a chunk of the data for a particular column. They live in a particular row group and are guaranteed to be contiguous in the file. Column chunks are divided up into pages. A page is conceptually an indivisible unit (in terms of compression and encoding). There can be multiple page types which are interleaved in a column chunk."
type PageBufferPool ¶
type PageBufferPool interface {
	GetPageBuffer() io.ReadWriter
	PutPageBuffer(io.ReadWriter)
}
func NewFileBufferPool ¶
func NewFileBufferPool(tempdir, pattern string) PageBufferPool
func NewPageBufferPool ¶
func NewPageBufferPool() PageBufferPool
type PageHeader ¶
type PageHeader interface {
	// Returns the number of values in the page (including nulls).
	NumValues() int64

	// Returns the page encoding.
	Encoding() format.Encoding

	// Returns the parquet format page type.
	PageType() format.PageType
}
PageHeader is an interface implemented by parquet page headers.
type PageReader ¶
PageReader is an interface implemented by types that support producing a sequence of pages.
type PageWriter ¶
PageWriter is an interface implemented by types that support writing pages to an underlying storage medium.
type Pages ¶
type Pages interface {
	PageReader
	RowSeeker
}
Pages is an interface implemented by page readers returned by calling the Pages method of ColumnChunk instances.
type Reader ¶
type Reader struct {
// contains filtered or unexported fields
}
A Reader reads Go values from parquet files.
This example showcases a typical use of parquet readers:
reader := parquet.NewReader(file)
rows := []RowType{}
for {
	row := RowType{}
	err := reader.Read(&row)
	if err != nil {
		if err == io.EOF {
			break
		}
		...
	}
	rows = append(rows, row)
}
func NewReader ¶
func NewReader(input io.ReaderAt, options ...ReaderOption) *Reader
NewReader constructs a parquet reader reading rows from the given io.ReaderAt.
In order to read parquet rows, the io.ReaderAt must be converted to a parquet.File. If r is already a parquet.File it is used directly; otherwise, the io.ReaderAt value is expected to either have a `Size() int64` method or implement io.Seeker in order to determine its size.
The function panics if the reader configuration is invalid. Programs that cannot guarantee the validity of the options passed to NewReader should construct the reader configuration independently prior to calling this function:
config, err := parquet.NewReaderConfig(options...)
if err != nil {
	// handle the configuration error
	...
} else {
	// this call to create a reader is guaranteed not to panic
	reader := parquet.NewReader(input, config)
	...
}
func (*Reader) Read ¶
Read reads the next row from r. The type of the row must match the schema of the underlying parquet file or an error will be returned.
The method returns io.EOF when no more rows can be read from r.
func (*Reader) ReadRow ¶
ReadRow reads the next row from r and appends it to the given Row buffer.
The returned values are laid out in the order expected by the parquet.(*Schema).Reconstruct method.
The method returns io.EOF when no more rows can be read from r.
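A sketch combining ReadRow with Reconstruct, assuming the reader exposes its schema through a Schema method and that RowType matches the file schema:

	row, err := reader.ReadRow(nil)
	if err != nil {
		...
	}

	value := RowType{}
	if err := reader.Schema().Reconstruct(&value, row); err != nil {
		...
	}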
func (*Reader) Reset ¶
func (r *Reader) Reset()
Reset repositions the reader at the beginning of the underlying parquet file.
type ReaderConfig ¶
type ReaderConfig struct {
Schema *Schema
}
The ReaderConfig type carries configuration options for parquet readers.
ReaderConfig implements the ReaderOption interface so it can be used directly as argument to the NewReader function when needed, for example:
reader := parquet.NewReader(output, schema, &parquet.ReaderConfig{
	// ...
})
func DefaultReaderConfig ¶
func DefaultReaderConfig() *ReaderConfig
DefaultReaderConfig returns a new ReaderConfig value initialized with the default reader configuration.
func NewReaderConfig ¶
func NewReaderConfig(options ...ReaderOption) (*ReaderConfig, error)
NewReaderConfig constructs a new reader configuration applying the options passed as arguments.
The function returns a non-nil error if some of the options carry invalid configuration values.
func (*ReaderConfig) Apply ¶
func (c *ReaderConfig) Apply(options ...ReaderOption)
Apply applies the given list of options to c.
func (*ReaderConfig) ConfigureReader ¶
func (c *ReaderConfig) ConfigureReader(config *ReaderConfig)
ConfigureReader applies configuration options from c to config.
func (*ReaderConfig) Validate ¶
func (c *ReaderConfig) Validate() error
Validate returns a non-nil error if the configuration of c is invalid.
type ReaderOption ¶
type ReaderOption interface {
ConfigureReader(*ReaderConfig)
}
ReaderOption is an interface implemented by types that carry configuration options for parquet readers.
type RequiredReader ¶
type RequiredReader[T plain.Type] interface {
	// Read values into the data slice, returning the number of values read,
	// or an error if less than len(data) values could be read, or io.EOF if
	// the end of the sequence was reached.
	//
	// For columns of type BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY, T is byte and
	// the data is PLAIN encoded.
	//
	// If the column is of type FIXED_LEN_BYTE_ARRAY, the data slice length
	// must be a multiple of the column size.
	ReadRequired(data []T) (int, error)
}
RequiredReader is a parameterized interface implemented by ValueReader instances which exposes the content of a column as an array of Go values of the type parameter T.
type RequiredWriter ¶
type RequiredWriter[T plain.Type] interface {
	// Write values from the data slice, returning the number of values
	// written, or an error if less than len(data) values were written.
	//
	// For columns of type BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY, T is byte and
	// the data is PLAIN encoded.
	//
	// If the column is of type FIXED_LEN_BYTE_ARRAY, the data slice length
	// must be a multiple of the column size.
	WriteRequired(data []T) (int, error)
}
RequiredWriter is a parameterized interface implemented by ValueWriter instances which allows writing arrays of Go values of the type parameter T.
type Row ¶
type Row []Value
Row represents a parquet row as a slice of values.
Each value should embed a column index, repetition level, and definition level, allowing the program to determine how to reconstruct the original object from the row. Repeated values share the same column index; their relative ordering is represented by their relative position in the row.
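A sketch of the round trip between Go values and rows through the Schema type:

	type Record struct {
		Name string
		Tags []string
	}

	schema := parquet.SchemaOf(Record{})

	// Each value in the returned row carries its column index and levels;
	// the two Tags values share the column index of the Tags column.
	row := schema.Deconstruct(nil, Record{Name: "a", Tags: []string{"x", "y"}})

	decoded := Record{}
	err := schema.Reconstruct(&decoded, row)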
type RowGroup ¶
type RowGroup interface {
	// Returns the number of rows in the group.
	NumRows() int64

	// Returns the number of leaf columns in the group.
	NumColumns() int

	// Returns the leaf column at the given index in the group.
	//
	// If the underlying implementation is not read-only, the returned
	// parquet.ColumnChunk may implement other interfaces: for example,
	// parquet.ColumnBuffer if the chunk is backed by an in-memory buffer,
	// or typed writer interfaces like parquet.Int32Writer depending on the
	// underlying type of values that can be written to the chunk.
	Column(int) ColumnChunk

	// Returns the schema of rows in the group.
	Schema() *Schema

	// Returns the list of sorting columns describing how rows are sorted in
	// the group.
	//
	// The method will return an empty slice if the rows are not sorted.
	SortingColumns() []SortingColumn

	// Returns a reader exposing the rows of the row group.
	//
	// As an optimization, the returned parquet.Rows object may implement
	// parquet.RowWriterTo, and test the RowWriter it receives for an
	// implementation of the parquet.RowGroupWriter interface.
	//
	// This optimization mechanism is leveraged by the parquet.CopyRows
	// function to skip the generic row-by-row copy algorithm and delegate the
	// copy logic to the parquet.Rows object.
	Rows() Rows
}
RowGroup is an interface representing a parquet row group. From the Parquet docs, a RowGroup is "a logical horizontal partitioning of the data into rows. There is no physical structure that is guaranteed for a row group. A row group consists of a column chunk for each column in the dataset."
https://github.com/apache/parquet-format#glossary
func ConvertRowGroup ¶
func ConvertRowGroup(rowGroup RowGroup, conv Conversion) RowGroup
ConvertRowGroup constructs a wrapper of the given row group which applies the given schema conversion to its rows.
func MergeRowGroups ¶
func MergeRowGroups(rowGroups []RowGroup, options ...RowGroupOption) (RowGroup, error)
MergeRowGroups constructs a row group which is a merged view of rowGroups. If rowGroups are sorted and the passed options include sorting, the merged row group will also be sorted.
The function validates the input to ensure that the merge operation is possible, checking that the schemas match or can be converted to an optionally configured target schema passed as argument in the option list.
The sorting columns of each row group are also consulted to determine whether the output can be represented. If sorting columns are configured on the merge, they must be a prefix of the sorting columns of all row groups being merged.
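A sketch merging two buffers into a single sorted view, assuming an Ascending helper and that bufferA and bufferB were both built with the same hypothetical "timestamp" sorting column:

	merged, err := parquet.MergeRowGroups(
		[]parquet.RowGroup{bufferA, bufferB},
		parquet.SortingColumns(parquet.Ascending("timestamp")),
	)
	if err != nil {
		...
	}

	// The merged view streams rows in sorted order; CopyRows can forward
	// them to any RowWriter, such as a parquet.Writer.
	if _, err := parquet.CopyRows(writer, merged.Rows()); err != nil {
		...
	}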
type RowGroupConfig ¶
type RowGroupConfig struct {
	ColumnBufferSize int
	SortingColumns   []SortingColumn
	Schema           *Schema
}
The RowGroupConfig type carries configuration options for parquet row groups.
RowGroupConfig implements the RowGroupOption interface so it can be used directly as argument to the NewBuffer function when needed, for example:
buffer := parquet.NewBuffer(&parquet.RowGroupConfig{
	ColumnBufferSize: 8 * 1024 * 1024,
})
func DefaultRowGroupConfig ¶
func DefaultRowGroupConfig() *RowGroupConfig
DefaultRowGroupConfig returns a new RowGroupConfig value initialized with the default row group configuration.
func NewRowGroupConfig ¶
func NewRowGroupConfig(options ...RowGroupOption) (*RowGroupConfig, error)
NewRowGroupConfig constructs a new row group configuration applying the options passed as arguments.
The function returns a non-nil error if some of the options carry invalid configuration values.
func (*RowGroupConfig) Apply ¶
func (c *RowGroupConfig) Apply(options ...RowGroupOption)
func (*RowGroupConfig) ConfigureRowGroup ¶
func (c *RowGroupConfig) ConfigureRowGroup(config *RowGroupConfig)
func (*RowGroupConfig) Validate ¶
func (c *RowGroupConfig) Validate() error
Validate returns a non-nil error if the configuration of c is invalid.
type RowGroupOption ¶
type RowGroupOption interface {
ConfigureRowGroup(*RowGroupConfig)
}
RowGroupOption is an interface implemented by types that carry configuration options for parquet row groups.
func ColumnBufferSize ¶
func ColumnBufferSize(size int) RowGroupOption
ColumnBufferSize creates a configuration option which defines the size of row group column buffers.
Defaults to 1 MiB.
func SortingColumns ¶
func SortingColumns(sortingColumns ...SortingColumn) RowGroupOption
SortingColumns creates a configuration option which defines the sorting order of columns in a row group.
The order of the sorting columns passed as arguments defines the ordering hierarchy: when elements are equal in the first column, the second column is used to order rows, and so on. See the sketch below.
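For example, a row group buffer sorting rows by last name, then first name, could be configured as follows (the column names are hypothetical):

	buffer := parquet.NewBuffer(
		parquet.SortingColumns(
			parquet.Ascending("last_name"),
			parquet.Ascending("first_name"),
		),
	)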
type RowGroupReader ¶
RowGroupReader is an interface implemented by types that expose sequences of row groups to the application.
type RowGroupWriter ¶
RowGroupWriter is an interface implemented by types that allow the program to write row groups.
type RowReadSeeker ¶
RowReadSeeker is an interface implemented by row readers which support seeking to arbitrary row positions.
type RowReaderAt ¶
RowReaderAt reads parquet rows at specific indexes.
type RowReaderFrom ¶
RowReaderFrom reads parquet rows from a reader.
type RowReaderWithSchema ¶
RowReaderWithSchema is an extension of the RowReader interface which advertises the schema of rows returned by ReadRow calls.
func ConvertRowReader ¶
func ConvertRowReader(rows RowReader, conv Conversion) RowReaderWithSchema
ConvertRowReader constructs a wrapper of the given row reader which applies the given schema conversion to the rows.
type RowSeeker ¶
RowSeeker is an interface implemented by readers of parquet rows which can be positioned at a specific row index.
type RowWriterAt ¶
RowWriterAt writes parquet rows at specific indexes.
type RowWriterTo ¶
RowWriterTo writes parquet rows to a writer.
type RowWriterWithSchema ¶
RowWriterWithSchema is an extension of the RowWriter interface which advertises the schema of rows expected to be passed to WriteRow calls.
type Rows ¶
type Rows interface {
	RowReaderWithSchema
	RowSeeker
}
Rows is an interface implemented by row readers returned by calling the Rows method of RowGroup instances.
func NewRowGroupRowReader ¶
func NewRowGroupRowReader(rowGroup RowGroup) Rows
NewRowGroupRowReader constructs a Rows reader exposing the rows of the given row group.
type Schema ¶
type Schema struct {
// contains filtered or unexported fields
}
Schema represents a parquet schema created from a Go value.
Schema implements the Node interface to represent the root node of a parquet schema.
func NewSchema ¶
func NewSchema(name string, root Node) *Schema
NewSchema constructs a new Schema object with the given name and root node.
The function panics if the root node contains more leaf columns than supported by the package (see parquet.MaxColumnIndex).
func SchemaOf ¶
func SchemaOf(model interface{}) *Schema
SchemaOf constructs a parquet schema from a Go value.
The function can construct parquet schemas from struct or pointer-to-struct values only. A panic is raised if a Go value of a different type is passed to this function.
When creating a parquet Schema from a Go value, the struct fields may contain a "parquet" tag to describe properties of the parquet node. The "parquet" tag follows the conventional format of Go struct tags: a comma-separated list of values describe the options, with the first one defining the name of the parquet column.
The following options are also supported in the "parquet" struct tag:
optional | make the parquet column optional
snappy   | sets the parquet column compression codec to snappy
gzip     | sets the parquet column compression codec to gzip
brotli   | sets the parquet column compression codec to brotli
lz4      | sets the parquet column compression codec to lz4
zstd     | sets the parquet column compression codec to zstd
plain    | enables the plain encoding (no-op default)
dict     | enables dictionary encoding on the parquet column
delta    | enables delta encoding on the parquet column
list     | for slice types, use the parquet LIST logical type
enum     | for string types, use the parquet ENUM logical type
uuid     | for string and [16]byte types, use the parquet UUID logical type
decimal  | for int32 and int64 types, use the parquet DECIMAL logical type
The decimal tag must be followed by two integer parameters, the first integer representing the scale and the second the precision; for example:
type Item struct {
	Cost int64 `parquet:"cost,decimal(0:3)"`
}
Invalid combinations of struct tags and Go types, or repeated options, will cause the function to panic.
The schema name is the Go type name of the value.
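As an illustration, the hypothetical Event type below combines several of the struct tag options listed above:

	type Event struct {
		Name      string `parquet:"name,dict"`
		Payload   []byte `parquet:"payload,snappy"`
		Timestamp int64  `parquet:"timestamp,delta"`
		TraceID   string `parquet:"trace_id,optional"`
	}

	schema := parquet.SchemaOf(Event{})
	// the schema is named "Event", after the Go type of the value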
func (*Schema) ChildByName ¶
ChildByName returns the child node with the given name in s.
func (*Schema) ChildNames ¶
ChildNames returns the list of child node names of s.
func (*Schema) Compression ¶
Compression returns the list of compression codecs in the child nodes of s.
func (*Schema) ConfigureReader ¶
func (s *Schema) ConfigureReader(config *ReaderConfig)
ConfigureReader satisfies the ReaderOption interface, allowing Schema instances to be passed to NewReader to pre-declare the schema of rows read from the reader.
func (*Schema) ConfigureRowGroup ¶
func (s *Schema) ConfigureRowGroup(config *RowGroupConfig)
ConfigureRowGroup satisfies the RowGroupOption interface, allowing Schema instances to be passed to row group constructors to pre-declare the schema of the output parquet file.
func (*Schema) ConfigureWriter ¶
func (s *Schema) ConfigureWriter(config *WriterConfig)
ConfigureWriter satisfies the WriterOption interface, allowing Schema instances to be passed to NewWriter to pre-declare the schema of the output parquet file.
func (*Schema) Deconstruct ¶
Deconstruct deconstructs a Go value and appends it to a row.
The method panics if the structure of the Go value does not match the parquet schema.
func (*Schema) NumChildren ¶
NumChildren returns the number of child nodes of s.
func (*Schema) Optional ¶
Optional returns false since the root node of a parquet schema is always required.
func (*Schema) Reconstruct ¶
Reconstruct reconstructs a Go value from a row.
The Go value passed as first argument must be a non-nil pointer for the row to be decoded into.
The method panics if the structure of the Go value and parquet row do not match.
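Deconstruct and Reconstruct are typically used together to round-trip between Go values and parquet rows. A minimal sketch, reusing the hypothetical Event type from above and assuming the method shapes implied by the descriptions (Deconstruct appends to and returns a Row, Reconstruct decodes a Row into a pointer and returns an error):

	schema := parquet.SchemaOf(Event{})

	var row parquet.Row
	row = schema.Deconstruct(row, Event{Name: "test"}) // Go value -> parquet row

	decoded := Event{}
	if err := schema.Reconstruct(&decoded, row); err != nil { // parquet row -> Go value
		...
	}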
func (*Schema) Repeated ¶
Repeated returns false since the root node of a parquet schema is always required.
func (*Schema) Required ¶
Required returns true since the root node of a parquet schema is always required.
type SortConfig ¶
type SortConfig struct {
	MaxRepetitionLevel int
	MaxDefinitionLevel int
	Descending         bool
	NullsFirst         bool
}
The SortConfig type carries configuration options used to generate sorting functions.
SortConfig implements the SortOption interface so it can be used directly as argument to the SortFuncOf function, for example:
sortFunc := parquet.SortFuncOf(columnType, &parquet.SortConfig{
	Descending: true,
	NullsFirst: true,
})
func (*SortConfig) Apply ¶
func (c *SortConfig) Apply(options ...SortOption)
Apply applies options to c.
func (*SortConfig) ConfigureSort ¶
func (c *SortConfig) ConfigureSort(config *SortConfig)
ConfigureSort satisfies the SortOption interface.
type SortFunc ¶
type SortFunc func(a, b []Value) int
SortFunc is a function type which compares two sets of column values.
Slices with exactly one value must be passed to the function when comparing values of non-repeated columns. For repeated columns, there may be zero or more values in each slice, and the parameters may have different lengths.
SortFunc is a low-level API, usually useful for constructing customized implementations of the RowGroup interface.
func SortFuncOf ¶
func SortFuncOf(t Type, options ...SortOption) SortFunc
SortFuncOf constructs a sorting function for values of the given type.
The list of options contains the configuration used to construct the sorting function.
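For example, a sorting function comparing INT64 values in descending order with nulls placed first; each slice holds exactly one value since the column is assumed non-repeated, and the function is assumed to follow the negative/zero/positive convention of Type.Compare:

	sortFunc := parquet.SortFuncOf(parquet.Int64Type,
		parquet.SortDescending(true),
		parquet.SortNullsFirst(true),
	)

	cmp := sortFunc(
		[]parquet.Value{parquet.ValueOf(int64(1))},
		[]parquet.Value{parquet.ValueOf(int64(2))},
	)
	// under descending order, 1 sorts after 2, so cmp is positive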
type SortOption ¶
type SortOption interface {
ConfigureSort(*SortConfig)
}
SortOption is an interface implemented by types that carry configuration options for sorting functions.
func SortDescending ¶
func SortDescending(descending bool) SortOption
SortDescending constructs a configuration option which inverts the order of a sorting function.
Defaults to false, which means values are sorted in ascending order.
func SortMaxDefinitionLevel ¶
func SortMaxDefinitionLevel(level int) SortOption
SortMaxDefinitionLevel constructs a configuration option which sets the maximum definition level known to a sorting function.
Defaults to zero, which represents a non-nullable column.
func SortMaxRepetitionLevel ¶
func SortMaxRepetitionLevel(level int) SortOption
SortMaxRepetitionLevel constructs a configuration option which sets the maximum repetition level known to a sorting function.
Defaults to zero, which represents a non-repeated column.
func SortNullsFirst ¶
func SortNullsFirst(nullsFirst bool) SortOption
SortNullsFirst constructs a configuration option which places the null values first or last.
Defaults to false, which means null values are placed last.
type SortingColumn ¶
type SortingColumn interface {
	// Returns the path of the column in the row group schema, omitting the
	// name of the root node.
	Path() []string

	// Returns true if the column will sort values in descending order.
	Descending() bool

	// Returns true if the column will put null values at the beginning.
	NullsFirst() bool
}
SortingColumn represents a column by which a row group is sorted.
func Ascending ¶
func Ascending(path ...string) SortingColumn
Ascending constructs a SortingColumn value which dictates sorting the column at the given path in ascending order.
func Descending ¶
func Descending(path ...string) SortingColumn
Descending constructs a SortingColumn value which dictates sorting the column at the given path in descending order.
func NullsFirst ¶
func NullsFirst(sortingColumn SortingColumn) SortingColumn
NullsFirst wraps the SortingColumn passed as argument so that it instructs the row group to place null values first in the column.
type TimeUnit ¶
type TimeUnit interface {
	// Returns the precision of the time unit as a time.Duration value.
	Duration() time.Duration

	// Converts the TimeUnit value to its representation in the parquet thrift
	// format.
	TimeUnit() format.TimeUnit
}
TimeUnit represents units of time in the parquet type system.
type Type ¶
type Type interface {
	// Returns a human-readable representation of the parquet type.
	String() string

	// Returns the Kind value representing the underlying physical type.
	//
	// The method panics if it is called on a group type.
	Kind() Kind

	// For integer and floating point physical types, the method returns the
	// size of values in bits.
	//
	// For fixed-length byte arrays, the method returns the size of elements
	// in bytes.
	//
	// For other types, the value is zero.
	Length() int

	// Compares two values and returns a negative integer if a < b, positive
	// if a > b, or zero if a == b.
	//
	// The values' Kind must match the type, otherwise the result is
	// undefined.
	//
	// The method panics if it is called on a group type.
	Compare(a, b Value) int

	// ColumnOrder returns the type's column order. For group types, this
	// method returns nil.
	//
	// The order describes the comparison logic implemented by the Less
	// method.
	//
	// As an optimization, the method may return the same pointer across
	// multiple calls. Applications must treat the returned value as
	// immutable, mutating the value will result in undefined behavior.
	ColumnOrder() *format.ColumnOrder

	// Returns the physical type as a *format.Type value. For group types,
	// this method returns nil.
	//
	// As an optimization, the method may return the same pointer across
	// multiple calls. Applications must treat the returned value as
	// immutable, mutating the value will result in undefined behavior.
	PhysicalType() *format.Type

	// Returns the logical type as a *format.LogicalType value. When the
	// logical type is unknown, the method returns nil.
	//
	// As an optimization, the method may return the same pointer across
	// multiple calls. Applications must treat the returned value as
	// immutable, mutating the value will result in undefined behavior.
	LogicalType() *format.LogicalType

	// Returns the logical type's equivalent converted type. When there is no
	// equivalent converted type, the method returns nil.
	//
	// As an optimization, the method may return the same pointer across
	// multiple calls. Applications must treat the returned value as
	// immutable, mutating the value will result in undefined behavior.
	ConvertedType() *deprecated.ConvertedType

	// Creates a column indexer for values of this type.
	//
	// The size limit is a hint to the column indexer that it is allowed to
	// truncate the page boundaries to the given size. Only BYTE_ARRAY and
	// FIXED_LEN_BYTE_ARRAY types currently take this value into account.
	//
	// A value of zero or less means no limits.
	//
	// The method panics if it is called on a group type.
	NewColumnIndexer(sizeLimit int) ColumnIndexer

	// Creates a dictionary holding values of this type.
	//
	// The method panics if it is called on a group type.
	NewDictionary(columnIndex, bufferSize int) Dictionary

	// Creates a row group buffer column for values of this type.
	//
	// Column buffers are created using the index of the column they are
	// accumulating values in memory for (relative to the parent schema), and
	// the size of their memory buffer.
	//
	// The buffer size is given in bytes, because we want to control memory
	// consumption of the application, which is simpler to achieve with buffer
	// size expressed in bytes rather than number of elements.
	//
	// Note that the buffer size is not a hard limit, it defines the initial
	// capacity of the column buffer, but it may grow as needed. Programs can
	// use the Size method of the column buffer (or the parent row group, when
	// relevant) to determine how many bytes are being used, and perform a
	// flush of the buffers to a storage layer.
	//
	// The method panics if it is called on a group type.
	NewColumnBuffer(columnIndex, bufferSize int) ColumnBuffer

	// Creates a reader for columns of this type.
	//
	// Column readers are created using the index of the column they are
	// reading values from (relative to the parent schema). The column index
	// will be set on values read from the reader.
	//
	// The buffer size is given in bytes, because we want to control memory
	// consumption of the application, which is simpler to achieve with buffer
	// size expressed in bytes rather than number of elements.
	//
	// The returned reader may implement extensions that can be tested via
	// type assertions. For example, on an INT32 type, the reader could
	// implement the parquet.Int32Reader interface to allow programs to more
	// efficiently read columns of INT32 values.
	NewColumnReader(columnIndex, bufferSize int) ColumnReader

	// Reads a dictionary with values of this type from the decoder passed as
	// argument.
	//
	// The number of values is a hint to optimize the allocation of memory
	// buffers for the dictionary. Callers that don't know how many values
	// will be decoded should pass zero for numValues.
	ReadDictionary(columnIndex, numValues int, decoder encoding.Decoder) (Dictionary, error)
}
The Type interface represents logical types of the parquet type system.
Types are immutable and therefore safe to access from multiple goroutines.
var (
	BooleanType   Type = primitiveType[bool]{/* contains filtered or unexported fields */}
	Int32Type     Type = primitiveType[int32]{/* contains filtered or unexported fields */}
	Int64Type     Type = primitiveType[int64]{/* contains filtered or unexported fields */}
	Int96Type     Type = primitiveType[deprecated.Int96]{/* contains filtered or unexported fields */}
	FloatType     Type = primitiveType[float32]{/* contains filtered or unexported fields */}
	DoubleType    Type = primitiveType[float64]{/* contains filtered or unexported fields */}
	ByteArrayType Type = byteArrayType{}
)
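These values give access to the comparison logic described by the interface; for example, comparing two INT32 values:

	a := parquet.ValueOf(int32(-1))
	b := parquet.ValueOf(int32(2))

	cmp := parquet.Int32Type.Compare(a, b)
	// cmp is negative since -1 < 2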
func FixedLenByteArrayType ¶
func FixedLenByteArrayType(length int) Type
FixedLenByteArrayType constructs a type for fixed-length values of the given size (in bytes).
type Value ¶
type Value struct {
// contains filtered or unexported fields
}
The Value type is similar to the reflect.Value abstraction of Go values, but for parquet values. Value instances wrap underlying Go values mapped to one of the parquet physical types.
Value instances are small, immutable objects, and usually passed by value between function calls.
The zero-value of Value represents the null parquet value.
func ValueOf ¶
func ValueOf(v interface{}) Value
ValueOf constructs a parquet value from a Go value v.
The physical type of the value is assumed from the Go type of v using the following conversion table:
Go type | Parquet physical type
------- | ---------------------
nil     | NULL
bool    | BOOLEAN
int8    | INT32
int16   | INT32
int32   | INT32
int64   | INT64
int     | INT64
uint8   | INT32
uint16  | INT32
uint32  | INT32
uint64  | INT64
uintptr | INT64
float32 | FLOAT
float64 | DOUBLE
string  | BYTE_ARRAY
[]byte  | BYTE_ARRAY
[*]byte | FIXED_LEN_BYTE_ARRAY
When converting a []byte or [*]byte value, the underlying byte array is not copied; instead, the returned parquet value holds a reference to it.
The repetition and definition levels of the returned value are both zero.
The function panics if the Go value cannot be represented in parquet.
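A short sketch of the conversion table in action:

	v := parquet.ValueOf(42)      // int maps to INT64
	s := parquet.ValueOf("hello") // string maps to BYTE_ARRAY
	b := s.ByteArray()            // read-only view of the underlying bytes (see ByteArray below)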
func (Value) AppendBytes ¶
AppendBytes appends the binary representation of v to b.
If v is the null value, b is returned unchanged.
func (Value) ByteArray ¶
ByteArray returns v as a []byte, assuming the underlying type is either BYTE_ARRAY or FIXED_LEN_BYTE_ARRAY.
The application must treat the returned byte slice as a read-only value; mutating the content will result in undefined behavior.
func (Value) Bytes ¶
Bytes returns the binary representation of v.
If v is the null value, a nil byte slice is returned.
func (Value) Column ¶
Column returns the column index within the row that v was created from.
Returns -1 if the value does not carry a column index.
func (Value) DefinitionLevel ¶
DefinitionLevel returns the definition level of v.
func (Value) Format ¶
Format outputs a human-readable representation of v to w, using r as the formatting verb to describe how the value should be printed.
The following formatting options are supported:
%c  prints the column index
%+c prints the column index, prefixed with "C:"
%d  prints the definition level
%+d prints the definition level, prefixed with "D:"
%r  prints the repetition level
%+r prints the repetition level, prefixed with "R:"
%q  prints the quoted representation of v
%+q prints the quoted representation of v, prefixed with "V:"
%s  prints the string representation of v
%+s prints the string representation of v, prefixed with "V:"
%v  same as %s
%+v prints a verbose representation of v
%#v prints a Go value representation of v
Format satisfies the fmt.Formatter interface.
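For example, printing a freshly constructed value with a few of the verbs above:

	v := parquet.ValueOf("hello")
	fmt.Printf("%q\n", v)  // "hello"
	fmt.Printf("%+q\n", v) // V:"hello"
	fmt.Printf("%+d\n", v) // D:0, both levels of a value built by ValueOf are zero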
func (Value) Int96 ¶
func (v Value) Int96() deprecated.Int96
Int96 returns v as an Int96, assuming the underlying type is INT96.
func (Value) Level ¶
Level returns v with the repetition level, definition level, and column index set to the values passed as arguments.
The method panics if either argument is negative.
func (Value) RepetitionLevel ¶
RepetitionLevel returns the repetition level of v.
type ValueReader ¶
type ValueReader interface {
	// Read values into the buffer passed as argument and return the number of
	// values read. When all values have been read, the error will be io.EOF.
	ReadValues([]Value) (int, error)
}
ValueReader is an interface implemented by types that support reading batches of values.
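A typical loop draining a ValueReader processes each batch before inspecting the error, following the usual Go io convention that the final values may be returned together with io.EOF (r stands for any parquet.ValueReader):

	values := make([]parquet.Value, 64)
	for {
		n, err := r.ReadValues(values)
		for _, v := range values[:n] {
			... // process v
		}
		if err != nil {
			if err == io.EOF {
				break
			}
			... // handle the error
		}
	}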
type ValueReaderFrom ¶
type ValueReaderFrom interface {
ReadValuesFrom(ValueReader) (int64, error)
}
ValueReaderFrom is an interface implemented by value writers to read values from a reader.
type ValueWriter ¶
type ValueWriter interface {
	// Writes values from the buffer passed as argument and returns the number
	// of values written.
	WriteValues([]Value) (int, error)
}
ValueWriter is an interface implemented by types that support writing batches of values.
type ValueWriterTo ¶
type ValueWriterTo interface {
WriteValuesTo(ValueWriter) (int64, error)
}
ValueWriterTo is an interface implemented by value readers to write values to a writer.
type WrappedNode ¶
type WrappedNode interface {
	Node

	// Unwrap returns the underlying base node.
	//
	// Note that Unwrap is not intended to recursively unwrap multiple layers
	// of wrappers, it returns the immediate next layer.
	Unwrap() Node
}
WrappedNode is an extension of the Node interface implemented by types which wrap another underlying node.
type Writer ¶
type Writer struct {
// contains filtered or unexported fields
}
A Writer uses a parquet schema and a sequence of Go values to produce a parquet file to an io.Writer.
This example showcases a typical use of parquet writers:
writer := parquet.NewWriter(output)

for _, row := range rows {
	if err := writer.Write(row); err != nil {
		...
	}
}

if err := writer.Close(); err != nil {
	...
}
The Writer type optimizes for minimal memory usage: each page is written as soon as it has been filled, so only a single page per column needs to be held in memory. As a result, there is no opportunity to sort rows within an entire row group. Programs that need to produce parquet files with sorted row groups should use the Buffer type to buffer and sort the rows prior to writing them to a Writer, as sketched below.
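A sketch of that pattern, buffering and sorting rows before handing them to the writer (the "id" column and the rows variable are hypothetical; Buffer implements sort.Interface through its Len, Less, and Swap methods):

	buffer := parquet.NewBuffer(
		parquet.SortingColumns(parquet.Ascending("id")),
	)

	for _, row := range rows {
		if err := buffer.Write(row); err != nil {
			...
		}
	}

	sort.Sort(buffer) // order the buffered rows by the configured sorting columns

	writer := parquet.NewWriter(output)
	if _, err := writer.WriteRowGroup(buffer); err != nil {
		...
	}
	if err := writer.Close(); err != nil {
		...
	}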
func NewWriter ¶
func NewWriter(output io.Writer, options ...WriterOption) *Writer
NewWriter constructs a parquet writer writing a file to the given io.Writer.
The function panics if the writer configuration is invalid. Programs that cannot guarantee the validity of the options passed to NewWriter should construct the writer configuration independently prior to calling this function:
config, err := parquet.NewWriterConfig(options...)
if err != nil {
	// handle the configuration error
	...
} else {
	// this call to create a writer is guaranteed not to panic
	writer := parquet.NewWriter(output, config)
	...
}
func (*Writer) Close ¶
Close must be called after all values have been written to the writer in order to flush all buffers and write the parquet footer.
func (*Writer) Flush ¶
Flush flushes all buffers into a row group to the underlying io.Writer.
Flush is called automatically on Close, it is only useful to call explicitly if the application needs to limit the size of row groups or wants to produce multiple row groups per file.
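For example, a program limiting the number of rows per row group could flush periodically (the one-million-row threshold is arbitrary):

	writer := parquet.NewWriter(output)

	for i, row := range rows {
		if err := writer.Write(row); err != nil {
			...
		}
		if (i+1)%1_000_000 == 0 {
			if err := writer.Flush(); err != nil {
				...
			}
		}
	}

	if err := writer.Close(); err != nil {
		...
	}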
func (*Writer) ReadRowsFrom ¶
ReadRowsFrom reads rows from the reader passed as argument and writes them to w.
This is similar to calling WriteRow repeatedly, but will be more efficient if optimizations are supported by the reader.
func (*Writer) Reset ¶
Reset clears the state of the writer without flushing any of the buffers, and sets the output to the io.Writer passed as argument, allowing the writer to be reused to produce another parquet file.
Reset may be called at any time, including after a writer was closed.
func (*Writer) Schema ¶
Schema returns the schema of rows written by w.
The returned value will be nil if no schema has yet been configured on w.
func (*Writer) Write ¶
Write is called to write another row to the parquet file.
The method uses the parquet schema configured on w to traverse the Go value and decompose it into a set of columns and values. If no schema was passed to NewWriter, it is deduced from the Go type of the row, which then has to be a struct or pointer to struct.
func (*Writer) WriteRow ¶
WriteRow is called to write another row to the parquet file.
The Writer must have been given a schema when NewWriter was called, otherwise the structure of the parquet file cannot be determined from the row only.
The row is expected to contain values for each column of the writer's schema, in the order produced by the parquet.(*Schema).Deconstruct method.
func (*Writer) WriteRowGroup ¶
WriteRowGroup writes a row group to the parquet file.
Buffered rows will be flushed prior to writing rows from the group, unless the row group is empty, in which case nothing is written to the file.
The content of the row group is flushed to the writer; after the method returns successfully, the row group will be empty and ready to be reused.
type WriterConfig ¶
type WriterConfig struct {
	CreatedBy            string
	ColumnPageBuffers    PageBufferPool
	ColumnIndexSizeLimit int
	PageBufferPool       PageBufferPool
	PageBufferSize       int
	DataPageVersion      int
	DataPageStatistics   bool
	KeyValueMetadata     map[string]string
	Schema               *Schema
	BloomFilters         []BloomFilterColumn
}
The WriterConfig type carries configuration options for parquet writers.
WriterConfig implements the WriterOption interface so it can be used directly as argument to the NewWriter function when needed, for example:
writer := parquet.NewWriter(output, schema, &parquet.WriterConfig{
	CreatedBy: "my test program",
})
func DefaultWriterConfig ¶
func DefaultWriterConfig() *WriterConfig
DefaultWriterConfig returns a new WriterConfig value initialized with the default writer configuration.
func NewWriterConfig ¶
func NewWriterConfig(options ...WriterOption) (*WriterConfig, error)
NewWriterConfig constructs a new writer configuration applying the options passed as arguments.
The function returns a non-nil error if any of the options carry invalid configuration values.
func (*WriterConfig) Apply ¶
func (c *WriterConfig) Apply(options ...WriterOption)
Apply applies the given list of options to c.
func (*WriterConfig) ConfigureWriter ¶
func (c *WriterConfig) ConfigureWriter(config *WriterConfig)
ConfigureWriter applies configuration options from c to config.
func (*WriterConfig) Validate ¶
func (c *WriterConfig) Validate() error
Validate returns a non-nil error if the configuration of c is invalid.
type WriterOption ¶
type WriterOption interface {
ConfigureWriter(*WriterConfig)
}
WriterOption is an interface implemented by types that carry configuration options for parquet writers.
func BloomFilters ¶
func BloomFilters(filters ...BloomFilterColumn) WriterOption
BloomFilters creates a configuration option which defines the bloom filters that parquet writers should generate.
The compute and memory footprint of generating bloom filters for all columns of a parquet schema can be significant, so by default no filters are created and applications need to explicitly declare the columns that they want to create filters for.
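A sketch of declaring a filter on a single column, assuming the package's SplitBlockFilter function as the BloomFilterColumn constructor and a hypothetical "user_id" column:

	writer := parquet.NewWriter(output,
		parquet.BloomFilters(
			// declare filters only for the columns worth the compute and memory cost
			parquet.SplitBlockFilter("user_id"),
		),
	)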
func ColumnIndexSizeLimit ¶
func ColumnIndexSizeLimit(sizeLimit int) WriterOption
ColumnIndexSizeLimit creates a configuration option to customize the size limit of page boundaries recorded in column indexes.
Defaults to 16.
func ColumnPageBuffers ¶
func ColumnPageBuffers(buffers PageBufferPool) WriterOption
ColumnPageBuffers creates a configuration option to customize the buffer pool used when constructing row groups. This can be used to provide on-disk buffers as swap space to ensure that the parquet file creation will not be bottlenecked on the amount of memory available.
Defaults to using in-memory buffers.
func CreatedBy ¶
func CreatedBy(createdBy string) WriterOption
CreatedBy creates a configuration option which sets the name of the application that created a parquet file.
By default, this information is omitted.
func DataPageStatistics ¶
func DataPageStatistics(enabled bool) WriterOption
DataPageStatistics creates a configuration option which defines whether data page statistics are emitted. This option is useful when generating parquet files intended to be backward compatible with older readers which may not have the ability to load page statistics from the column index.
Defaults to false.
func DataPageVersion ¶
func DataPageVersion(version int) WriterOption
DataPageVersion creates a configuration option which configures the version of data pages used when creating a parquet file.
Defaults to version 2.
func KeyValueMetadata ¶
func KeyValueMetadata(key, value string) WriterOption
KeyValueMetadata creates a configuration option which adds a key/value pair to the metadata of parquet files.
This option is additive, it may be used multiple times to add more than one key/value pair.
Keys are assumed to be unique; if the same key is repeated multiple times, the last value is retained. While the parquet format does not require unique keys, this design decision was made to optimize for the most common use case where applications leverage this extension mechanism to associate single values to keys. This may create incompatibilities with other parquet libraries, or may cause some key/value pairs to be lost when opening parquet files written with repeated keys. We can revisit this decision if it ever becomes a blocker.
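Since the option is additive, multiple pairs are declared by repeating it (the keys and values below are examples):

	writer := parquet.NewWriter(output,
		parquet.KeyValueMetadata("source", "events-service"),
		parquet.KeyValueMetadata("schema_version", "1.2.3"),
	)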
func PageBufferSize ¶
func PageBufferSize(size int) WriterOption
PageBufferSize configures the size of column page buffers on parquet writers.
Note that the page buffer size refers to the in-memory buffers where pages are generated, not the size of pages after encoding and compression. This design choice was made to help control the amount of memory needed to read and write pages rather than controlling the space used by the encoded representation on disk.
Defaults to 1 MiB.
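Writer options compose freely; a sketch combining several of the options documented above:

	writer := parquet.NewWriter(output,
		parquet.PageBufferSize(256*1024),     // smaller in-memory page buffers
		parquet.DataPageVersion(1),           // emit v1 data pages for older readers
		parquet.DataPageStatistics(true),     // include per-page statistics
		parquet.CreatedBy("my test program"),
	)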
Source Files ¶
- bloom.go
- buffer.go
- class_go18.go
- column.go
- column_buffer.go
- column_buffer_go18.go
- column_chunk.go
- column_index.go
- column_index_go18.go
- column_path.go
- column_reader.go
- column_reader_go18.go
- compress.go
- concat.go
- config.go
- convert.go
- dictionary.go
- dictionary_go18.go
- encoding.go
- errors.go
- file.go
- limits.go
- merge.go
- node.go
- offset_index.go
- offset_index_go18.go
- page.go
- page_go18.go
- page_header.go
- parquet.go
- print.go
- reader.go
- row.go
- row_group.go
- schema.go
- sort.go
- type.go
- type_go18.go
- value.go
- value_go18.go
- writer.go
Directories ¶
Path | Synopsis
---|---
bloom | Package bloom implements parquet bloom filters.
bloom/xxhash | Package xxhash is an extension of github.com/cespare/xxhash which adds routines optimized to hash arrays of fixed size elements.
compress | Package compress provides the generic APIs implemented by parquet compression codecs.
compress/brotli | Package brotli implements the BROTLI parquet compression codec.
compress/gzip | Package gzip implements the GZIP parquet compression codec.
compress/lz4 | Package lz4 implements the LZ4_RAW parquet compression codec.
compress/snappy | Package snappy implements the SNAPPY parquet compression codec.
compress/uncompressed | Package uncompressed provides implementations of the compression codec interfaces as pass-through without applying any compression or decompression.
compress/zstd | Package zstd implements the ZSTD parquet compression codec.
encoding | Package encoding provides the generic APIs implemented by parquet encodings in its sub-packages.
encoding/plain | Package plain implements the PLAIN parquet encoding.
encoding/rle | Package rle implements the hybrid RLE/Bit-Packed encoding employed in repetition and definition levels, dictionary indexed data pages, and boolean values in the PLAIN encoding.
internal |