Documentation ¶
Overview ¶
Package parquet is a library for working with parquet files. For an overview of Parquet's qualities as a storage format, see this blog post: https://blog.twitter.com/engineering/en_us/a/2013/dremel-made-simple-with-parquet
Or see the Parquet documentation: https://parquet.apache.org/docs/
Example ¶
// parquet-go uses the same struct-tag definition style as JSON and XML type Contact struct { Name string `parquet:"name"` // "zstd" specifies the compression for this column PhoneNumber string `parquet:"phoneNumber,optional,zstd"` } type AddressBook struct { Owner string `parquet:"owner,zstd"` OwnerPhoneNumbers []string `parquet:"ownerPhoneNumbers,gzip"` Contacts []Contact `parquet:"contacts"` } f, _ := ioutil.TempFile("", "parquet-example-") writer := parquet.NewWriter(f) rows := []AddressBook{ {Owner: "UserA", Contacts: []Contact{ {Name: "Alice", PhoneNumber: "+15505551234"}, {Name: "Bob"}, }}, // Add more rows here. } for _, row := range rows { if err := writer.Write(row); err != nil { log.Fatal(err) } } _ = writer.Close() _ = f.Close() // Now, we can read from the file. rf, _ := os.Open(f.Name()) pf := parquet.NewReader(rf) addrs := make([]AddressBook, 0) for { var addr AddressBook err := pf.Read(&addr) if err == io.EOF { break } if err != nil { log.Fatal(err) } addrs = append(addrs, addr) } fmt.Println(addrs[0].Owner)
Output: UserA
Index ¶
- Constants
- Variables
- func CompareDescending(cmp func(Value, Value) int) func(Value, Value) int
- func CompareNullsFirst(cmp func(Value, Value) int) func(Value, Value) int
- func CompareNullsLast(cmp func(Value, Value) int) func(Value, Value) int
- func CopyPages(dst PageWriter, src PageReader) (numValues int64, err error)
- func CopyRows(dst RowWriter, src RowReader) (int64, error)
- func CopyValues(dst ValueWriter, src ValueReader) (int64, error)
- func DeepEqual(v1, v2 Value) bool
- func Equal(v1, v2 Value) bool
- func Find(index ColumnIndex, value Value, cmp func(Value, Value) int) int
- func LookupCompressionCodec(codec format.CompressionCodec) compress.Codec
- func LookupEncoding(enc format.Encoding) encoding.Encoding
- func PrintColumnChunk(w io.Writer, columnChunk ColumnChunk) error
- func PrintPage(w io.Writer, page Page) error
- func PrintRowGroup(w io.Writer, rowGroup RowGroup) error
- func PrintSchema(w io.Writer, name string, node Node) error
- func PrintSchemaIndent(w io.Writer, name string, node Node, pattern, newline string) error
- func Read[T any](r io.ReaderAt, size int64, options ...ReaderOption) (rows []T, err error)
- func ReadFile[T any](path string, options ...ReaderOption) (rows []T, err error)
- func Release(page Page)
- func Retain(page Page)
- func Search(index ColumnIndex, value Value, typ Type) int
- func Write[T any](w io.Writer, rows []T, options ...WriterOption) error
- func WriteFile[T any](path string, rows []T, options ...WriterOption) error
- type BloomFilter
- type BloomFilterColumn
- type BooleanReader
- type BooleanWriter
- type Buffer
- func (buf *Buffer) ColumnBuffers() []ColumnBuffer
- func (buf *Buffer) ColumnChunks() []ColumnChunk
- func (buf *Buffer) Len() int
- func (buf *Buffer) Less(i, j int) bool
- func (buf *Buffer) NumRows() int64
- func (buf *Buffer) Reset()
- func (buf *Buffer) Rows() Rows
- func (buf *Buffer) Schema() *Schema
- func (buf *Buffer) Size() int64
- func (buf *Buffer) SortingColumns() []SortingColumn
- func (buf *Buffer) Swap(i, j int)
- func (buf *Buffer) Write(row interface{}) error
- func (buf *Buffer) WriteRowGroup(rowGroup RowGroup) (int64, error)
- func (buf *Buffer) WriteRows(rows []Row) (int, error)
- type BufferPool
- type ByteArrayReader
- type ByteArrayWriter
- type Column
- func (c *Column) Column(name string) *Column
- func (c *Column) Columns() []*Column
- func (c *Column) Compression() compress.Codec
- func (c *Column) DecodeDataPageV1(header DataPageHeaderV1, page []byte, dict Dictionary) (Page, error)
- func (c *Column) DecodeDataPageV2(header DataPageHeaderV2, page []byte, dict Dictionary) (Page, error)
- func (c *Column) DecodeDictionary(header DictionaryPageHeader, page []byte) (Dictionary, error)
- func (c *Column) Depth() int
- func (c *Column) Encoding() encoding.Encoding
- func (c *Column) Fields() []Field
- func (c *Column) GoType() reflect.Type
- func (c *Column) Index() int
- func (c *Column) Leaf() bool
- func (c *Column) MaxDefinitionLevel() int
- func (c *Column) MaxRepetitionLevel() int
- func (c *Column) Name() string
- func (c *Column) Optional() bool
- func (c *Column) Pages() Pages
- func (c *Column) Path() []string
- func (c *Column) Repeated() bool
- func (c *Column) Required() bool
- func (c *Column) String() string
- func (c *Column) Type() Type
- func (c *Column) Value(base reflect.Value) reflect.Value
- type ColumnBuffer
- type ColumnChunk
- type ColumnIndex
- type ColumnIndexer
- type Conversion
- type ConvertError
- type DataPageHeader
- type DataPageHeaderV1
- func (v1 DataPageHeaderV1) DefinitionLevelEncoding() format.Encoding
- func (v1 DataPageHeaderV1) Encoding() format.Encoding
- func (v1 DataPageHeaderV1) MaxValue() []byte
- func (v1 DataPageHeaderV1) MinValue() []byte
- func (v1 DataPageHeaderV1) NullCount() int64
- func (v1 DataPageHeaderV1) NumValues() int64
- func (v1 DataPageHeaderV1) PageType() format.PageType
- func (v1 DataPageHeaderV1) RepetitionLevelEncoding() format.Encoding
- func (v1 DataPageHeaderV1) String() string
- type DataPageHeaderV2
- func (v2 DataPageHeaderV2) DefinitionLevelEncoding() format.Encoding
- func (v2 DataPageHeaderV2) DefinitionLevelsByteLength() int64
- func (v2 DataPageHeaderV2) Encoding() format.Encoding
- func (v2 DataPageHeaderV2) IsCompressed() bool
- func (v2 DataPageHeaderV2) MaxValue() []byte
- func (v2 DataPageHeaderV2) MinValue() []byte
- func (v2 DataPageHeaderV2) NullCount() int64
- func (v2 DataPageHeaderV2) NumNulls() int64
- func (v2 DataPageHeaderV2) NumRows() int64
- func (v2 DataPageHeaderV2) NumValues() int64
- func (v2 DataPageHeaderV2) PageType() format.PageType
- func (v2 DataPageHeaderV2) RepetitionLevelEncoding() format.Encoding
- func (v2 DataPageHeaderV2) RepetitionLevelsByteLength() int64
- func (v2 DataPageHeaderV2) String() string
- type Dictionary
- type DictionaryPageHeader
- type DoubleReader
- type DoubleWriter
- type Field
- type File
- func (f *File) ColumnIndexes() []format.ColumnIndex
- func (f *File) Lookup(key string) (value string, ok bool)
- func (f *File) Metadata() *format.FileMetaData
- func (f *File) NumRows() int64
- func (f *File) OffsetIndexes() []format.OffsetIndex
- func (f *File) ReadAt(b []byte, off int64) (int, error)
- func (f *File) ReadPageIndex() ([]format.ColumnIndex, []format.OffsetIndex, error)
- func (f *File) Root() *Column
- func (f *File) RowGroups() []RowGroup
- func (f *File) Schema() *Schema
- func (f *File) Size() int64
- type FileConfig
- type FileOption
- type FixedLenByteArrayReader
- type FixedLenByteArrayWriter
- type FloatReader
- type FloatWriter
- type GenericBuffer
- func (buf *GenericBuffer[T]) ColumnBuffers() []ColumnBuffer
- func (buf *GenericBuffer[T]) ColumnChunks() []ColumnChunk
- func (buf *GenericBuffer[T]) Len() int
- func (buf *GenericBuffer[T]) Less(i, j int) bool
- func (buf *GenericBuffer[T]) NumRows() int64
- func (buf *GenericBuffer[T]) Reset()
- func (buf *GenericBuffer[T]) Rows() Rows
- func (buf *GenericBuffer[T]) Schema() *Schema
- func (buf *GenericBuffer[T]) Size() int64
- func (buf *GenericBuffer[T]) SortingColumns() []SortingColumn
- func (buf *GenericBuffer[T]) Swap(i, j int)
- func (buf *GenericBuffer[T]) Write(rows []T) (int, error)
- func (buf *GenericBuffer[T]) WriteRowGroup(rowGroup RowGroup) (int64, error)
- func (buf *GenericBuffer[T]) WriteRows(rows []Row) (int, error)
- type GenericReader
- func (r *GenericReader[T]) Close() error
- func (r *GenericReader[T]) NumRows() int64
- func (r *GenericReader[T]) Read(rows []T) (int, error)
- func (r *GenericReader[T]) ReadRows(rows []Row) (int, error)
- func (r *GenericReader[T]) Reset()
- func (r *GenericReader[T]) Schema() *Schema
- func (r *GenericReader[T]) SeekToRow(rowIndex int64) error
- type GenericWriter
- func (w *GenericWriter[T]) Close() error
- func (w *GenericWriter[T]) Flush() error
- func (w *GenericWriter[T]) ReadRowsFrom(rows RowReader) (int64, error)
- func (w *GenericWriter[T]) Reset(output io.Writer)
- func (w *GenericWriter[T]) Schema() *Schema
- func (w *GenericWriter[T]) SetKeyValueMetadata(key, value string)
- func (w *GenericWriter[T]) Write(rows []T) (int, error)
- func (w *GenericWriter[T]) WriteRowGroup(rowGroup RowGroup) (int64, error)
- func (w *GenericWriter[T]) WriteRows(rows []Row) (int, error)
- type Group
- func (g Group) Compression() compress.Codec
- func (g Group) Encoding() encoding.Encoding
- func (g Group) Fields() []Field
- func (g Group) GoType() reflect.Type
- func (g Group) Leaf() bool
- func (g Group) Optional() bool
- func (g Group) Repeated() bool
- func (g Group) Required() bool
- func (g Group) String() string
- func (g Group) Type() Type
- type Int32Reader
- type Int32Writer
- type Int64Reader
- type Int64Writer
- type Int96Reader
- type Int96Writer
- type Kind
- type LeafColumn
- type Node
- func BSON() Node
- func Compressed(node Node, codec compress.Codec) Node
- func Date() Node
- func Decimal(scale, precision int, typ Type) Node
- func Encoded(node Node, encoding encoding.Encoding) Node
- func Enum() Node
- func Int(bitWidth int) Node
- func JSON() Node
- func Leaf(typ Type) Node
- func List(of Node) Node
- func Map(key, value Node) Node
- func Optional(node Node) Node
- func Repeated(node Node) Node
- func Required(node Node) Node
- func String() Node
- func Time(unit TimeUnit) Node
- func Timestamp(unit TimeUnit) Node
- func UUID() Node
- func Uint(bitWidth int) Node
- type OffsetIndex
- type Page
- type PageHeader
- type PageReader
- type PageWriter
- type Pages
- type ReadMode
- type Reader (deprecated)
- type ReaderConfig
- type ReaderOption
- type Row
- type RowBuffer
- func (buf *RowBuffer[T]) ColumnChunks() []ColumnChunk
- func (buf *RowBuffer[T]) Len() int
- func (buf *RowBuffer[T]) Less(i, j int) bool
- func (buf *RowBuffer[T]) NumRows() int64
- func (buf *RowBuffer[T]) Reset()
- func (buf *RowBuffer[T]) Rows() Rows
- func (buf *RowBuffer[T]) Schema() *Schema
- func (buf *RowBuffer[T]) SortingColumns() []SortingColumn
- func (buf *RowBuffer[T]) Swap(i, j int)
- func (buf *RowBuffer[T]) Write(rows []T) (int, error)
- func (buf *RowBuffer[T]) WriteRows(rows []Row) (int, error)
- type RowBuilder
- type RowGroup
- type RowGroupConfig
- type RowGroupOption
- type RowGroupReader
- type RowGroupWriter
- type RowReadSeeker
- type RowReader
- func DedupeRowReader(reader RowReader, compare func(Row, Row) int) RowReader
- func FilterRowReader(reader RowReader, predicate func(Row) bool) RowReader
- func MergeRowReaders(readers []RowReader, compare func(Row, Row) int) RowReader
- func ScanRowReader(reader RowReader, predicate func(Row, int64) bool) RowReader
- func TransformRowReader(reader RowReader, transform func(dst, src Row) (Row, error)) RowReader
- type RowReaderFrom
- type RowReaderFunc
- type RowReaderWithSchema
- type RowSeeker
- type RowWriter
- type RowWriterFunc
- type RowWriterTo
- type RowWriterWithSchema
- type Rows
- type Schema
- func (s *Schema) Columns() [][]string
- func (s *Schema) Comparator(sortingColumns ...SortingColumn) func(Row, Row) int
- func (s *Schema) Compression() compress.Codec
- func (s *Schema) ConfigureReader(config *ReaderConfig)
- func (s *Schema) ConfigureRowGroup(config *RowGroupConfig)
- func (s *Schema) ConfigureWriter(config *WriterConfig)
- func (s *Schema) Deconstruct(row Row, value interface{}) Row
- func (s *Schema) Encoding() encoding.Encoding
- func (s *Schema) Fields() []Field
- func (s *Schema) GoType() reflect.Type
- func (s *Schema) Leaf() bool
- func (s *Schema) Lookup(path ...string) (LeafColumn, bool)
- func (s *Schema) Name() string
- func (s *Schema) Optional() bool
- func (s *Schema) Reconstruct(value interface{}, row Row) error
- func (s *Schema) Repeated() bool
- func (s *Schema) Required() bool
- func (s *Schema) String() string
- func (s *Schema) Type() Type
- type SortingColumn
- type SortingConfig
- type SortingOption
- type SortingWriter
- func (w *SortingWriter[T]) Close() error
- func (w *SortingWriter[T]) Flush() error
- func (w *SortingWriter[T]) Reset(output io.Writer)
- func (w *SortingWriter[T]) Schema() *Schema
- func (w *SortingWriter[T]) SetKeyValueMetadata(key, value string)
- func (w *SortingWriter[T]) Write(rows []T) (int, error)
- func (w *SortingWriter[T]) WriteRows(rows []Row) (int, error)
- type TimeUnit
- type Type
- type Value
- func BooleanValue(value bool) Value
- func ByteArrayValue(value []byte) Value
- func DoubleValue(value float64) Value
- func FixedLenByteArrayValue(value []byte) Value
- func FloatValue(value float32) Value
- func Int32Value(value int32) Value
- func Int64Value(value int64) Value
- func Int96Value(value deprecated.Int96) Value
- func NullValue() Value
- func ValueOf(v interface{}) Value
- func ZeroValue(kind Kind) Value
- func (v Value) AppendBytes(b []byte) []byte
- func (v Value) Boolean() bool
- func (v Value) Byte() byte
- func (v Value) ByteArray() []byte
- func (v Value) Bytes() []byte
- func (v Value) Clone() Value
- func (v Value) Column() int
- func (v Value) DefinitionLevel() int
- func (v Value) Double() float64
- func (v Value) Float() float32
- func (v Value) Format(w fmt.State, r rune)
- func (v Value) GoString() string
- func (v Value) Int32() int32
- func (v Value) Int64() int64
- func (v Value) Int96() deprecated.Int96
- func (v Value) IsNull() bool
- func (v Value) Kind() Kind
- func (v Value) Level(repetitionLevel, definitionLevel, columnIndex int) Value
- func (v Value) RepetitionLevel() int
- func (v Value) String() string
- func (v Value) Uint32() uint32
- func (v Value) Uint64() uint64
- type ValueReader
- type ValueReaderAt
- type ValueReaderFrom
- type ValueReaderFunc
- type ValueWriter
- type ValueWriterFunc
- type ValueWriterTo
- type Writer (deprecated)
- func (w *Writer) Close() error
- func (w *Writer) Flush() error
- func (w *Writer) ReadRowsFrom(rows RowReader) (written int64, err error)
- func (w *Writer) Reset(output io.Writer)
- func (w *Writer) Schema() *Schema
- func (w *Writer) SetKeyValueMetadata(key, value string)
- func (w *Writer) Write(row interface{}) error
- func (w *Writer) WriteRowGroup(rowGroup RowGroup) (int64, error)
- func (w *Writer) WriteRows(rows []Row) (int, error)
- type WriterConfig
- type WriterOption
- func BloomFilters(filters ...BloomFilterColumn) WriterOption
- func ColumnIndexSizeLimit(sizeLimit int) WriterOption
- func ColumnPageBuffers(buffers BufferPool) WriterOption
- func Compression(codec compress.Codec) WriterOption
- func CreatedBy(application, version, build string) WriterOption
- func DataPageStatistics(enabled bool) WriterOption
- func DataPageVersion(version int) WriterOption
- func KeyValueMetadata(key, value string) WriterOption
- func MaxRowsPerRowGroup(numRows int64) WriterOption
- func PageBufferSize(size int) WriterOption
- func SortingWriterConfig(options ...SortingOption) WriterOption
- func WriteBufferSize(size int) WriterOption
Examples ¶
Constants ¶
const ( DefaultColumnIndexSizeLimit = 16 DefaultColumnBufferCapacity = 16 * 1024 DefaultPageBufferSize = 256 * 1024 DefaultWriteBufferSize = 32 * 1024 DefaultDataPageVersion = 2 DefaultDataPageStatistics = false DefaultSkipPageIndex = false DefaultSkipBloomFilters = false DefaultMaxRowsPerRowGroup = math.MaxInt64 DefaultReadMode = ReadModeSync )
const ( // MaxColumnDepth is the maximum column depth supported by this package. MaxColumnDepth = math.MaxInt8 // MaxColumnIndex is the maximum column index supported by this package. MaxColumnIndex = math.MaxInt16 // MaxRepetitionLevel is the maximum repetition level supported by this // package. MaxRepetitionLevel = math.MaxUint8 // MaxDefinitionLevel is the maximum definition level supported by this // package. MaxDefinitionLevel = math.MaxUint8 // MaxRowGroups is the maximum number of row groups which can be contained // in a single parquet file. // // This limit is enforced by the use of 16 bits signed integers in the file // metadata footer of parquet files. It is part of the parquet specification // and therefore cannot be changed. MaxRowGroups = math.MaxInt16 )
Variables ¶
var ( // Uncompressed is a parquet compression codec representing uncompressed // pages. Uncompressed uncompressed.Codec // Snappy is the SNAPPY parquet compression codec. Snappy snappy.Codec // Gzip is the GZIP parquet compression codec. Gzip = gzip.Codec{ Level: gzip.DefaultCompression, } // Brotli is the BROTLI parquet compression codec. Brotli = brotli.Codec{ Quality: brotli.DefaultQuality, LGWin: brotli.DefaultLGWin, } // Zstd is the ZSTD parquet compression codec. Zstd = zstd.Codec{ Level: zstd.DefaultLevel, } // Lz4Raw is the LZ4_RAW parquet compression codec. Lz4Raw = lz4.Codec{ Level: lz4.DefaultLevel, } )
var ( // Plain is the default parquet encoding. Plain plain.Encoding // RLE is the hybrid bit-pack/run-length parquet encoding. RLE rle.Encoding // BitPacked is the deprecated bit-packed encoding for repetition and // definition levels. BitPacked bitpacked.Encoding // PlainDictionary is the plain dictionary parquet encoding. // // This encoding should not be used anymore in parquet 2.0 and later, // it is implemented for backwards compatibility to support reading // files that were encoded with older parquet libraries. PlainDictionary plain.DictionaryEncoding // RLEDictionary is the RLE dictionary parquet encoding. RLEDictionary rle.DictionaryEncoding // DeltaBinaryPacked is the delta binary packed parquet encoding. DeltaBinaryPacked delta.BinaryPackedEncoding // DeltaLengthByteArray is the delta length byte array parquet encoding. DeltaLengthByteArray delta.LengthByteArrayEncoding // DeltaByteArray is the delta byte array parquet encoding. DeltaByteArray delta.ByteArrayEncoding // ByteStreamSplit is an encoding for floating-point data. ByteStreamSplit bytestreamsplit.Encoding )
var ( // ErrCorrupted is an error returned by the Err method of ColumnPages // instances when they encountered a mismatch between the CRC checksum // recorded in a page header and the one computed while reading the page // data. ErrCorrupted = errors.New("corrupted parquet page") // ErrMissingRootColumn is an error returned when opening an invalid parquet // file which does not have a root column. ErrMissingRootColumn = errors.New("parquet file is missing a root column") // ErrRowGroupSchemaMissing is an error returned when attempting to write a // row group but the source has no schema. ErrRowGroupSchemaMissing = errors.New("cannot write rows to a row group which has no schema") // ErrRowGroupSchemaMismatch is an error returned when attempting to write a // row group but the source and destination schemas differ. ErrRowGroupSchemaMismatch = errors.New("cannot write row groups with mismatching schemas") // ErrRowGroupSortingColumnsMismatch is an error returned when attempting to // write a row group but the sorting columns differ in the source and // destination. ErrRowGroupSortingColumnsMismatch = errors.New("cannot write row groups with mismatching sorting columns") // ErrSeekOutOfRange is an error returned when seeking to a row index which // is less than the first row of a page. ErrSeekOutOfRange = errors.New("seek to row index out of page range") // ErrUnexpectedDictionaryPage is an error returned when a page reader // encounters a dictionary page after the first page, or in a column // which does not use a dictionary encoding. ErrUnexpectedDictionaryPage = errors.New("unexpected dictionary page") // ErrMissingPageHeader is an error returned when a page reader encounters // a malformed page header which is missing page-type-specific information. ErrMissingPageHeader = errors.New("missing page header") // ErrUnexpectedRepetitionLevels is an error returned when attempting to // decode repetition levels into a page which is not part of a repeated // column. 
ErrUnexpectedRepetitionLevels = errors.New("unexpected repetition levels") // ErrUnexpectedDefinitionLevels is an error returned when attempting to // decode definition levels into a page which is part of a required column. ErrUnexpectedDefinitionLevels = errors.New("unexpected definition levels") // ErrTooManyRowGroups is returned when attempting to generate a parquet // file with more than MaxRowGroups row groups. ErrTooManyRowGroups = errors.New("the limit of 32767 row groups has been reached") // ErrInvalidConversion is used to indicate that a conversion between two values // cannot be done because there are no rules to translate between their // physical types. ErrInvalidConversion = errors.New("invalid conversion between parquet values") )
Functions ¶
func CompareDescending ¶
CompareDescending constructs a comparison function which reverses the order of values.
func CompareNullsFirst ¶
CompareNullsFirst constructs a comparison function which assumes that null values are smaller than all other values.
func CompareNullsLast ¶
CompareNullsLast constructs a comparison function which assumes that null values are greater than all other values.
func CopyPages ¶
func CopyPages(dst PageWriter, src PageReader) (numValues int64, err error)
CopyPages copies pages from src to dst, returning the number of values that were copied.
The function returns any error it encounters reading or writing pages, except for io.EOF from the reader which indicates that there were no more pages to read.
func CopyRows ¶
CopyRows copies rows from src to dst.
The underlying types of src and dst are tested to determine if they expose information about the schema of rows that are read and expected to be written. If schema information is available for both but the schemas do not match, the function will attempt to automatically convert the rows from the source schema to the destination schema.
As an optimization, the src argument may implement RowWriterTo to bypass the default row copy logic and provide its own. The dst argument may also implement RowReaderFrom for the same purpose.
The function returns the number of rows written, or any error encountered other than io.EOF.
func CopyValues ¶
func CopyValues(dst ValueWriter, src ValueReader) (int64, error)
CopyValues copies values from src to dst, returning the number of values that were written.
As an optimization, the reader and writer may choose to implement ValueReaderFrom and ValueWriterTo to provide their own copy logic.
The function returns any error it encounters reading or writing values, except for io.EOF from the reader which indicates that there were no more values to read.
func DeepEqual ¶
DeepEqual returns true if v1 and v2 are equal, including their repetition levels, definition levels, and column indexes.
See Equal for details about how value equality is determined.
func Equal ¶
Equal returns true if v1 and v2 are equal.
Values are considered equal if they are of the same physical type and hold the same Go values. For BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY, the contents of the underlying byte arrays are tested for equality.
Note that the repetition levels, definition levels, and column indexes are not compared by this function, use DeepEqual instead.
func Find ¶
Find uses the ColumnIndex passed as argument to find the page in a column chunk (determined by the given ColumnIndex) that the given value is expected to be found in.
The function returns the index of the first page that might contain the value. If the function determines that the value does not exist in the index, NumPages is returned.
If you want to search the entire parquet file, you must iterate over the RowGroups and search each one individually, if there are multiple in the file. If you call writer.Flush before closing the file, then you will have multiple RowGroups to iterate over, otherwise Flush is called once on Close.
The comparison function passed as last argument is used to determine the relative order of values. This should generally be the Compare method of the column type, but can sometimes be customized to modify how null values are interpreted, for example:
pageIndex := parquet.Find(columnIndex, value, parquet.CompareNullsFirst(typ.Compare), )
func LookupCompressionCodec ¶
func LookupCompressionCodec(codec format.CompressionCodec) compress.Codec
LookupCompressionCodec returns the compression codec associated with the given code.
The function never returns nil. If the compression codec is not supported, an "unsupported" codec is returned.
func LookupEncoding ¶
LookupEncoding returns the parquet encoding associated with the given code.
The function never returns nil. If the encoding is not supported, encoding.NotSupported is returned.
func PrintColumnChunk ¶
func PrintColumnChunk(w io.Writer, columnChunk ColumnChunk) error
func PrintSchemaIndent ¶
func Read ¶
Read reads and returns rows from the parquet file in the given reader.
The type T defines the type of rows read from r. T must be compatible with the file's schema or an error will be returned. The row type might represent a subset of the full schema, in which case only a subset of the columns will be loaded from r.
This function is provided for convenience to facilitate reading of parquet files from arbitrary locations in cases where the data set fits in memory.
Example (Any) ¶
type Row struct{ FirstName, LastName string } buf := new(bytes.Buffer) err := parquet.Write(buf, []Row{ {FirstName: "Luke", LastName: "Skywalker"}, {FirstName: "Han", LastName: "Solo"}, {FirstName: "R2", LastName: "D2"}, }) if err != nil { log.Fatal(err) } file := bytes.NewReader(buf.Bytes()) rows, err := parquet.Read[any](file, file.Size()) if err != nil { log.Fatal(err) } for _, row := range rows { fmt.Printf("%q\n", row) }
Output: map["FirstName":"Luke" "LastName":"Skywalker"] map["FirstName":"Han" "LastName":"Solo"] map["FirstName":"R2" "LastName":"D2"]
func ReadFile ¶
func ReadFile[T any](path string, options ...ReaderOption) (rows []T, err error)
ReadFile reads rows of the parquet file at the given path.
The type T defines the type of rows read from the file. T must be compatible with the file's schema or an error will be returned. The row type might represent a subset of the full schema, in which case only a subset of the columns will be loaded from the file.
This function is provided for convenience to facilitate reading of parquet files from the file system in cases where the data set fits in memory.
Example ¶
type Row struct { ID int64 `parquet:"id"` Name string `parquet:"name,zstd"` } ExampleWriteFile() rows, err := parquet.ReadFile[Row]("/tmp/file.parquet") if err != nil { log.Fatal(err) } for _, row := range rows { fmt.Printf("%d: %q\n", row.ID, row.Name) }
Output: 0: "Bob" 1: "Alice" 2: "Franky"
func Release ¶
func Release(page Page)
Release is a helper function to decrement the reference counter of pages backed by memory which can be granularly managed by the application.
Usage of this function is optional and, together with Retain, is intended to allow finer-grained memory management in the application, at the expense of potentially causing panics if the page is used after its reference count has reached zero. Most programs should be able to rely on automated memory management provided by the Go garbage collector instead.
The function should be called to return a page to the internal buffer pool, when a goroutine "releases ownership" it acquired either by being the single owner (e.g. capturing the return value from a ReadPage call) or having gotten shared ownership by calling Retain.
Calling this function on pages that do not embed a reference counter does nothing.
func Retain ¶
func Retain(page Page)
Retain is a helper function to increment the reference counter of pages backed by memory which can be granularly managed by the application.
Usage of this function is optional and, together with Release, is intended to allow finer-grained memory management in the application. Most programs should be able to rely on automated memory management provided by the Go garbage collector instead.
The function should be called when a page lifetime is about to be shared between multiple goroutines or layers of an application, and the program wants to express "sharing ownership" of the page.
Calling this function on pages that do not embed a reference counter does nothing.
func Search ¶
func Search(index ColumnIndex, value Value, typ Type) int
Search is like Find, but uses the default ordering of the given type. Search and Find are scoped to a given ColumnChunk and find the pages within a ColumnChunk which might contain the result. See Find for more details.
Example ¶
type Row struct{ FirstName, LastName string } buf := new(bytes.Buffer) // The column being searched should be sorted to avoid a full scan of the // column. See the section of the readme on sorting for how to sort on // insertion into the parquet file using parquet.SortingColumns rows := []Row{ {FirstName: "C", LastName: "3PO"}, {FirstName: "Han", LastName: "Solo"}, {FirstName: "Leia", LastName: "Organa"}, {FirstName: "Luke", LastName: "Skywalker"}, {FirstName: "R2", LastName: "D2"}, } // The tiny page buffer size ensures we get multiple pages out of the example above. w := parquet.NewGenericWriter[Row](buf, parquet.PageBufferSize(12), parquet.WriteBufferSize(0)) // Need to write 1 row at a time here as writing many at once disregards PageBufferSize option. for _, row := range rows { _, err := w.Write([]Row{row}) if err != nil { log.Fatal(err) } } err := w.Close() if err != nil { log.Fatal(err) } reader := bytes.NewReader(buf.Bytes()) file, err := parquet.OpenFile(reader, reader.Size()) if err != nil { log.Fatal(err) } // Search is scoped to a single RowGroup/ColumnChunk rowGroup := file.RowGroups()[0] firstNameColChunk := rowGroup.ColumnChunks()[0] found := parquet.Search(firstNameColChunk.ColumnIndex(), parquet.ValueOf("Luke"), parquet.ByteArrayType) offsetIndex := firstNameColChunk.OffsetIndex() fmt.Printf("numPages: %d\n", offsetIndex.NumPages()) fmt.Printf("result found in page: %d\n", found) if found < offsetIndex.NumPages() { r := parquet.NewGenericReader[Row](file) defer r.Close() // Seek to the first row in the page the result was found r.SeekToRow(offsetIndex.FirstRowIndex(found)) result := make([]Row, 2) _, _ = r.Read(result) // Leia is in index 0 for the page. for _, row := range result { if row.FirstName == "Luke" { fmt.Printf("%q\n", row) } } }
Output: numPages: 3 result found in page: 1 {"Luke" "Skywalker"}
func Write ¶
func Write[T any](w io.Writer, rows []T, options ...WriterOption) error
Write writes the given list of rows to a parquet file written to w.
This function is provided for convenience to facilitate the creation of parquet files.
Example (Any) ¶
schema := parquet.SchemaOf(struct { FirstName string LastName string }{}) buf := new(bytes.Buffer) err := parquet.Write[any]( buf, []any{ map[string]string{"FirstName": "Luke", "LastName": "Skywalker"}, map[string]string{"FirstName": "Han", "LastName": "Solo"}, map[string]string{"FirstName": "R2", "LastName": "D2"}, }, schema, ) if err != nil { log.Fatal(err) } file := bytes.NewReader(buf.Bytes()) rows, err := parquet.Read[any](file, file.Size()) if err != nil { log.Fatal(err) } for _, row := range rows { fmt.Printf("%q\n", row) }
Output: map["FirstName":"Luke" "LastName":"Skywalker"] map["FirstName":"Han" "LastName":"Solo"] map["FirstName":"R2" "LastName":"D2"]
func WriteFile ¶
func WriteFile[T any](path string, rows []T, options ...WriterOption) error
WriteFile writes the given list of rows to a parquet file at the given path.
This function is provided for convenience to facilitate writing parquet files to the file system.
Example ¶
type Row struct { ID int64 `parquet:"id"` Name string `parquet:"name,zstd"` } if err := parquet.WriteFile("/tmp/file.parquet", []Row{ {ID: 0, Name: "Bob"}, {ID: 1, Name: "Alice"}, {ID: 2, Name: "Franky"}, }); err != nil { log.Fatal(err) }
Output:
Types ¶
type BloomFilter ¶
type BloomFilter interface { // Implement the io.ReaderAt interface as a mechanism to allow reading the // raw bits of the filter. io.ReaderAt // Returns the size of the bloom filter (in bytes). Size() int64 // Tests whether the given value is present in the filter. // // A non-nil error may be returned if reading the filter failed. This may // happen if the filter was lazily loaded from a storage medium during the // call to Check for example. Applications that can guarantee that the // filter was in memory at the time Check was called can safely ignore the // error, which would always be nil in this case. Check(value Value) (bool, error) }
BloomFilter is an interface allowing applications to test whether a key exists in a bloom filter.
type BloomFilterColumn ¶
type BloomFilterColumn interface { // Returns the path of the column that the filter applies to. Path() []string // Returns the hashing algorithm used when inserting values into a bloom // filter. Hash() bloom.Hash // Returns an encoding which can be used to write columns of values to the // filter. Encoding() encoding.Encoding // Returns the size of the filter needed to encode values in the filter, // assuming each value will be encoded with the given number of bits. Size(numValues int64) int }
The BloomFilterColumn interface is a declarative representation of bloom filters used when configuring filters on a parquet writer.
func SplitBlockFilter ¶
func SplitBlockFilter(bitsPerValue uint, path ...string) BloomFilterColumn
SplitBlockFilter constructs a split block bloom filter object for the column at the given path, with the given bitsPerValue.
If you are unsure what number of bitsPerValue to use, 10 is a reasonable tradeoff between size and error rate for common datasets.
For more information on the tradeoff between size and error rate, consult this website: https://hur.st/bloomfilter/?n=4000&p=0.1&m=&k=1
type BooleanReader ¶
type BooleanReader interface { // Read boolean values into the buffer passed as argument. // // The method returns io.EOF when all values have been read. ReadBooleans(values []bool) (int, error) }
BooleanReader is an interface implemented by ValueReader instances which expose the content of a column of boolean values.
type BooleanWriter ¶
type BooleanWriter interface { // Write boolean values. // // The method returns the number of values written, and any error that // occurred while writing the values. WriteBooleans(values []bool) (int, error) }
BooleanWriter is an interface implemented by ValueWriter instances which support writing columns of boolean values.
type Buffer ¶
type Buffer struct {
// contains filtered or unexported fields
}
Buffer represents an in-memory group of parquet rows.
The main purpose of the Buffer type is to provide a way to sort rows before writing them to a parquet file. Buffer implements sort.Interface as a way to support reordering the rows that have been written to it.
func NewBuffer ¶
func NewBuffer(options ...RowGroupOption) *Buffer
NewBuffer constructs a new buffer, using the given list of buffer options to configure the buffer returned by the function.
The function panics if the buffer configuration is invalid. Programs that cannot guarantee the validity of the options passed to NewBuffer should construct the buffer configuration independently prior to calling this function:
config, err := parquet.NewRowGroupConfig(options...) if err != nil { // handle the configuration error ... } else { // this call to create a buffer is guaranteed not to panic buffer := parquet.NewBuffer(config) ... }
func (*Buffer) ColumnBuffers ¶
func (buf *Buffer) ColumnBuffers() []ColumnBuffer
ColumnBuffers returns the buffer columns.
This method is similar to ColumnChunks, but returns a list of ColumnBuffer values instead of ColumnChunk values (the latter being read-only); indexing the slices returned by ColumnBuffers and ColumnChunks at the same position yields the same underlying objects, but with different types, which removes the need for making a type assertion if the program needed to write directly to the column buffers. The presence of the ColumnChunks method is still required to satisfy the RowGroup interface.
func (*Buffer) ColumnChunks ¶
func (buf *Buffer) ColumnChunks() []ColumnChunk
ColumnChunks returns the buffer columns.
func (*Buffer) Reset ¶
func (buf *Buffer) Reset()
Reset clears the content of the buffer, allowing it to be reused.
func (*Buffer) Rows ¶
Rows returns a reader exposing the current content of the buffer.
The buffer and the returned reader share memory. Mutating the buffer concurrently to reading rows may result in non-deterministic behavior.
func (*Buffer) Schema ¶
Schema returns the schema of the buffer.
The schema is either configured by passing a Schema in the option list when constructing the buffer, or lazily discovered when the first row is written.
func (*Buffer) SortingColumns ¶
func (buf *Buffer) SortingColumns() []SortingColumn
SortingColumns returns the list of columns by which the buffer will be sorted.
The sorting order is configured by passing a SortingColumns option when constructing the buffer.
func (*Buffer) WriteRowGroup ¶
WriteRowGroup satisfies the RowGroupWriter interface.
type BufferPool ¶
type BufferPool interface { // GetBuffer is called when a parquet writer needs to acquire a new // page buffer from the pool. GetBuffer() io.ReadWriteSeeker // PutBuffer is called when a parquet writer releases a page buffer to // the pool. // // The parquet.Writer type guarantees that the buffers it calls this method // with were previously acquired by a call to GetBuffer on the same // pool, and that it will not use them anymore after the call. PutBuffer(io.ReadWriteSeeker) }
BufferPool is an interface abstracting the underlying implementation of page buffer pools.
The parquet-go package provides two implementations of this interface, one backed by in-memory buffers (on the Go heap), and the other using temporary files on disk.
Applications which need finer grain control over the allocation and retention of page buffers may choose to provide their own implementation and install it via the parquet.ColumnPageBuffers writer option.
BufferPool implementations must be safe to use concurrently from multiple goroutines.
func NewBufferPool ¶
func NewBufferPool() BufferPool
NewBufferPool creates a new in-memory page buffer pool.
The implementation is backed by sync.Pool and allocates memory buffers on the Go heap.
func NewFileBufferPool ¶
func NewFileBufferPool(tempdir, pattern string) BufferPool
NewFileBufferPool creates a new on-disk page buffer pool.
type ByteArrayReader ¶
type ByteArrayReader interface { // Read values into the byte buffer passed as argument, returning the number // of values written to the buffer (not the number of bytes). Values are // written using the PLAIN encoding, each byte array prefixed with its // length encoded as a 4 bytes little endian unsigned integer. // // The method returns io.EOF when all values have been read. // // If the buffer was not empty, but too small to hold at least one value, // io.ErrShortBuffer is returned. ReadByteArrays(values []byte) (int, error) }
ByteArrayReader is an interface implemented by ValueReader instances which expose the content of a column of variable length byte array values.
type ByteArrayWriter ¶
type ByteArrayWriter interface { // Write variable length byte array values. // // The values passed as input must be laid out using the PLAIN encoding, // with each byte array prefixed with the four bytes little endian unsigned // integer length. // // The method returns the number of values written to the underlying column // (not the number of bytes), or any error that occurred while attempting to // write the values. WriteByteArrays(values []byte) (int, error) }
ByteArrayWriter is an interface implemented by ValueWriter instances which support writing columns of variable length byte array values.
type Column ¶
type Column struct {
// contains filtered or unexported fields
}
Column represents a column in a parquet file.
Methods of Column values are safe to call concurrently from multiple goroutines.
Column instances satisfy the Node interface.
func (*Column) Columns ¶
Columns returns the list of child columns.
The method returns the same slice across multiple calls, the program must treat it as a read-only value.
func (*Column) Compression ¶
Compression returns the compression codecs used by this column.
func (*Column) DecodeDataPageV1 ¶
func (c *Column) DecodeDataPageV1(header DataPageHeaderV1, page []byte, dict Dictionary) (Page, error)
DecodeDataPageV1 decodes a data page from the header, compressed data, and optional dictionary passed as arguments.
func (*Column) DecodeDataPageV2 ¶
func (c *Column) DecodeDataPageV2(header DataPageHeaderV2, page []byte, dict Dictionary) (Page, error)
DecodeDataPageV2 decodes a data page from the header, compressed data, and optional dictionary passed as arguments.
func (*Column) DecodeDictionary ¶
func (c *Column) DecodeDictionary(header DictionaryPageHeader, page []byte) (Dictionary, error)
DecodeDictionary decodes a dictionary page from the header and compressed data passed as arguments.
func (*Column) Index ¶
Index returns the position of the column in a row. Only leaf columns have a column index, the method returns -1 when called on non-leaf columns.
func (*Column) MaxDefinitionLevel ¶
MaxDefinitionLevel returns the maximum value of definition levels on this column.
func (*Column) MaxRepetitionLevel ¶
MaxRepetitionLevel returns the maximum value of repetition levels on this column.
type ColumnBuffer ¶
type ColumnBuffer interface { // Exposes a read-only view of the column buffer. ColumnChunk // The column implements ValueReaderAt as a mechanism to read values at // specific locations within the buffer. ValueReaderAt // The column implements ValueWriter as a mechanism to optimize the copy // of values into the buffer in contexts where the row information is // provided by the values because the repetition and definition levels // are set. ValueWriter // For indexed columns, returns the underlying dictionary holding the column // values. If the column is not indexed, nil is returned. Dictionary() Dictionary // Returns a copy of the column. The returned copy shares no memory with // the original, mutations of either column will not modify the other. Clone() ColumnBuffer // Returns the column as a Page. Page() Page // Clears all rows written to the column. Reset() // Returns the current capacity of the column (rows). Cap() int // Returns the number of rows currently written to the column. Len() int // Compares rows at index i and j and reports whether i < j. Less(i, j int) bool // Swaps rows at index i and j. Swap(i, j int) // Returns the size of the column buffer in bytes. Size() int64 // contains filtered or unexported methods }
ColumnBuffer is an interface representing columns of a row group.
ColumnBuffer implements sort.Interface as a way to support reordering the rows that have been written to it.
The current implementation has a limitation which prevents applications from providing custom versions of this interface because it contains unexported methods. The only way to create ColumnBuffer values is to call the NewColumnBuffer method of Type instances. This limitation may be lifted in future releases.
type ColumnChunk ¶
type ColumnChunk interface { // Returns the column type. Type() Type // Returns the index of this column in its parent row group. Column() int // Returns a reader exposing the pages of the column. Pages() Pages // Returns the components of the page index for this column chunk, // containing details about the content and location of pages within the // chunk. // // Note that the returned value may be the same across calls to these // methods, programs must treat those as read-only. // // If the column chunk does not have a page index, the methods return nil. ColumnIndex() ColumnIndex OffsetIndex() OffsetIndex BloomFilter() BloomFilter // Returns the number of values in the column chunk. // // This quantity may differ from the number of rows in the parent row group // because repeated columns may hold zero or more values per row. NumValues() int64 }
The ColumnChunk interface represents individual columns of a row group.
type ColumnIndex ¶
type ColumnIndex interface { // NumPages returns the number of pages in the column index. NumPages() int // Returns the number of null values in the page at the given index. NullCount(int) int64 // Tells whether the page at the given index contains null values only. NullPage(int) bool // MinValue and MaxValue return the min/max bounds for the page at the given index in the // column. MinValue(int) Value MaxValue(int) Value // IsAscending returns true if the column index min/max values are sorted // in ascending order (based on the ordering rules of the column's logical // type). IsAscending() bool // IsDescending returns true if the column index min/max values are sorted // in descending order (based on the ordering rules of the column's logical // type). IsDescending() bool }
func NewColumnIndex ¶
func NewColumnIndex(kind Kind, index *format.ColumnIndex) ColumnIndex
NewColumnIndex constructs a ColumnIndex instance from the given parquet format column index. The kind argument specifies the type of the values (such as the page min/max bounds) held in the index.
type ColumnIndexer ¶
type ColumnIndexer interface { // Resets the column indexer state. Reset() // Add a page to the column indexer. IndexPage(numValues, numNulls int64, min, max Value) // Generates a format.ColumnIndex value from the current state of the // column indexer. // // The returned value may reference internal buffers, in which case the // values remain valid until the next call to IndexPage or Reset on the // column indexer. ColumnIndex() format.ColumnIndex }
The ColumnIndexer interface is implemented by types that support generating parquet column indexes.
The package does not export any types that implement this interface, programs must call NewColumnIndexer on a Type instance to construct column indexers.
type Conversion ¶
type Conversion interface { // Applies the conversion logic on the src row, returning the result // appended to dst. Convert(rows []Row) (int, error) // Converts the given column index in the target schema to the original // column index in the source schema of the conversion. Column(int) int // Returns the target schema of the conversion. Schema() *Schema }
Conversion is an interface implemented by types that provide conversion of parquet rows from one schema to another.
Conversion instances must be safe to use concurrently from multiple goroutines.
func Convert ¶
func Convert(to, from Node) (conv Conversion, err error)
Convert constructs a conversion function from one parquet schema to another.
The function supports converting between schemas where the source or target have extra columns; if there are more columns in the source, they will be stripped out of the rows. Extra columns in the target schema will be set to null or zero values.
The returned function is intended to be used to append the converted source row to the destination buffer.
type ConvertError ¶
ConvertError is an error type returned by calls to Convert when the conversion of parquet schemas is impossible or the input row for the conversion is malformed.
func (*ConvertError) Error ¶
func (e *ConvertError) Error() string
Error satisfies the error interface.
type DataPageHeader ¶
type DataPageHeader interface { PageHeader // Returns the encoding of the repetition level section. RepetitionLevelEncoding() format.Encoding // Returns the encoding of the definition level section. DefinitionLevelEncoding() format.Encoding // Returns the number of null values in the page. NullCount() int64 // Returns the minimum value in the page based on the ordering rules of the // column's logical type. // // As an optimization, the method may return the same slice across multiple // calls. Programs must treat the returned value as immutable to prevent // unpredictable behaviors. // // If the page contains only null values, an empty slice is returned. MinValue() []byte // Returns the maximum value in the page based on the ordering rules of the // column's logical type. // // As an optimization, the method may return the same slice across multiple // calls. Programs must treat the returned value as immutable to prevent // unpredictable behaviors. // // If the page contains only null values, an empty slice is returned. MaxValue() []byte }
DataPageHeader is a specialization of the PageHeader interface implemented by data pages.
type DataPageHeaderV1 ¶
type DataPageHeaderV1 struct {
// contains filtered or unexported fields
}
DataPageHeaderV1 is an implementation of the DataPageHeader interface representing data pages version 1.
func (DataPageHeaderV1) DefinitionLevelEncoding ¶
func (v1 DataPageHeaderV1) DefinitionLevelEncoding() format.Encoding
func (DataPageHeaderV1) Encoding ¶
func (v1 DataPageHeaderV1) Encoding() format.Encoding
func (DataPageHeaderV1) MaxValue ¶
func (v1 DataPageHeaderV1) MaxValue() []byte
func (DataPageHeaderV1) MinValue ¶
func (v1 DataPageHeaderV1) MinValue() []byte
func (DataPageHeaderV1) NullCount ¶
func (v1 DataPageHeaderV1) NullCount() int64
func (DataPageHeaderV1) NumValues ¶
func (v1 DataPageHeaderV1) NumValues() int64
func (DataPageHeaderV1) PageType ¶
func (v1 DataPageHeaderV1) PageType() format.PageType
func (DataPageHeaderV1) RepetitionLevelEncoding ¶
func (v1 DataPageHeaderV1) RepetitionLevelEncoding() format.Encoding
func (DataPageHeaderV1) String ¶
func (v1 DataPageHeaderV1) String() string
type DataPageHeaderV2 ¶
type DataPageHeaderV2 struct {
// contains filtered or unexported fields
}
DataPageHeaderV2 is an implementation of the DataPageHeader interface representing data pages version 2.
func (DataPageHeaderV2) DefinitionLevelEncoding ¶
func (v2 DataPageHeaderV2) DefinitionLevelEncoding() format.Encoding
func (DataPageHeaderV2) DefinitionLevelsByteLength ¶
func (v2 DataPageHeaderV2) DefinitionLevelsByteLength() int64
func (DataPageHeaderV2) Encoding ¶
func (v2 DataPageHeaderV2) Encoding() format.Encoding
func (DataPageHeaderV2) IsCompressed ¶
func (v2 DataPageHeaderV2) IsCompressed() bool
func (DataPageHeaderV2) MaxValue ¶
func (v2 DataPageHeaderV2) MaxValue() []byte
func (DataPageHeaderV2) MinValue ¶
func (v2 DataPageHeaderV2) MinValue() []byte
func (DataPageHeaderV2) NullCount ¶
func (v2 DataPageHeaderV2) NullCount() int64
func (DataPageHeaderV2) NumNulls ¶
func (v2 DataPageHeaderV2) NumNulls() int64
func (DataPageHeaderV2) NumRows ¶
func (v2 DataPageHeaderV2) NumRows() int64
func (DataPageHeaderV2) NumValues ¶
func (v2 DataPageHeaderV2) NumValues() int64
func (DataPageHeaderV2) PageType ¶
func (v2 DataPageHeaderV2) PageType() format.PageType
func (DataPageHeaderV2) RepetitionLevelEncoding ¶
func (v2 DataPageHeaderV2) RepetitionLevelEncoding() format.Encoding
func (DataPageHeaderV2) RepetitionLevelsByteLength ¶
func (v2 DataPageHeaderV2) RepetitionLevelsByteLength() int64
func (DataPageHeaderV2) String ¶
func (v2 DataPageHeaderV2) String() string
type Dictionary ¶
type Dictionary interface { // Returns the type that the dictionary was created from. Type() Type // Returns the number of value indexed in the dictionary. Len() int // Returns the dictionary value at the given index. Index(index int32) Value // Inserts values from the second slice to the dictionary and writes the // indexes at which each value was inserted to the first slice. // // The method panics if the length of the indexes slice is smaller than the // length of the values slice. Insert(indexes []int32, values []Value) // Given an array of dictionary indexes, lookup the values into the array // of values passed as second argument. // // The method panics if len(indexes) > len(values), or one of the indexes // is negative or greater than the highest index in the dictionary. Lookup(indexes []int32, values []Value) // Returns the min and max values found in the given indexes. Bounds(indexes []int32) (min, max Value) // Resets the dictionary to its initial state, removing all values. Reset() // Returns a Page representing the content of the dictionary. // // The returned page shares the underlying memory of the buffer, it remains // valid to use until the dictionary's Reset method is called. Page() Page // contains filtered or unexported methods }
The Dictionary interface represents type-specific implementations of parquet dictionaries.
Programs can instantiate dictionaries by calling the NewDictionary method of a Type object.
The current implementation has a limitation which prevents applications from providing custom versions of this interface because it contains unexported methods. The only way to create Dictionary values is to call the NewDictionary method of Type instances. This limitation may be lifted in future releases.
type DictionaryPageHeader ¶
type DictionaryPageHeader struct {
// contains filtered or unexported fields
}
DictionaryPageHeader is an implementation of the PageHeader interface representing dictionary pages.
func (DictionaryPageHeader) Encoding ¶
func (dict DictionaryPageHeader) Encoding() format.Encoding
func (DictionaryPageHeader) IsSorted ¶
func (dict DictionaryPageHeader) IsSorted() bool
func (DictionaryPageHeader) NumValues ¶
func (dict DictionaryPageHeader) NumValues() int64
func (DictionaryPageHeader) PageType ¶
func (dict DictionaryPageHeader) PageType() format.PageType
func (DictionaryPageHeader) String ¶
func (dict DictionaryPageHeader) String() string
type DoubleReader ¶
type DoubleReader interface { // Read double-precision floating point values into the buffer passed as // argument. // // The method returns io.EOF when all values have been read. ReadDoubles(values []float64) (int, error) }
DoubleReader is an interface implemented by ValueReader instances which expose the content of a column of double-precision floating point values.
type DoubleWriter ¶
type DoubleWriter interface { // Write double-precision floating point values. // // The method returns the number of values written, and any error that // occurred while writing the values. WriteDoubles(values []float64) (int, error) }
DoubleWriter is an interface implemented by ValueWriter instances which support writing columns of double-precision floating point values.
type Field ¶
type Field interface { Node // Returns the name of this field in its parent node. Name() string // Given a reference to the Go value matching the structure of the parent // node, returns the Go value of the field. Value(base reflect.Value) reflect.Value }
Field instances represent fields of a parquet node, which associate a node to their name in their parent node.
type File ¶
type File struct {
// contains filtered or unexported fields
}
File represents a parquet file. The layout of a Parquet file can be found here: https://github.com/apache/parquet-format#file-format
func OpenFile ¶
OpenFile opens a parquet file and reads the content between offset 0 and the given size in r.
Only the parquet magic bytes and footer are read, column chunks and other parts of the file are left untouched; this means that successfully opening a file does not validate that the pages have valid checksums.
func (*File) ColumnIndexes ¶
func (f *File) ColumnIndexes() []format.ColumnIndex
ColumnIndexes returns the page index of the parquet file f.
If the file did not contain a column index, the method returns an empty slice.
func (*File) Lookup ¶
Lookup returns the value associated with the given key in the file key/value metadata.
The ok boolean will be true if the key was found, false otherwise.
func (*File) Metadata ¶
func (f *File) Metadata() *format.FileMetaData
Metadata returns the metadata of f.
func (*File) OffsetIndexes ¶
func (f *File) OffsetIndexes() []format.OffsetIndex
OffsetIndexes returns the page index of the parquet file f.
If the file did not contain an offset index, the method returns an empty slice.
func (*File) ReadAt ¶
ReadAt reads bytes into b from f at the given offset.
The method satisfies the io.ReaderAt interface.
func (*File) ReadPageIndex ¶
func (f *File) ReadPageIndex() ([]format.ColumnIndex, []format.OffsetIndex, error)
ReadPageIndex reads the page index section of the parquet file f.
If the file did not contain a page index, the method returns two empty slices and a nil error.
Only leaf columns have indexes, the returned indexes are arranged using the following layout:
------------------ | col 0: chunk 0 | ------------------ | col 1: chunk 0 | ------------------ | ... | ------------------ | col 0: chunk 1 | ------------------ | col 1: chunk 1 | ------------------ | ... | ------------------
This method is useful in combination with the SkipPageIndex option to delay reading the page index section until after the file was opened. Note that in this case the page index is not cached within the file; programs are expected to make use of it independently from the parquet package.
type FileConfig ¶
type FileConfig struct { SkipPageIndex bool SkipBloomFilters bool ReadBufferSize int ReadMode ReadMode Schema *Schema }
The FileConfig type carries configuration options for parquet files.
FileConfig implements the FileOption interface so it can be used directly as argument to the OpenFile function when needed, for example:
f, err := parquet.OpenFile(reader, size, &parquet.FileConfig{ SkipPageIndex: true, SkipBloomFilters: true, ReadMode: parquet.ReadModeAsync, })
func DefaultFileConfig ¶
func DefaultFileConfig() *FileConfig
DefaultFileConfig returns a new FileConfig value initialized with the default file configuration.
func NewFileConfig ¶
func NewFileConfig(options ...FileOption) (*FileConfig, error)
NewFileConfig constructs a new file configuration applying the options passed as arguments.
The function returns a non-nil error if some of the options carried invalid configuration values.
func (*FileConfig) Apply ¶
func (c *FileConfig) Apply(options ...FileOption)
Apply applies the given list of options to c.
func (*FileConfig) ConfigureFile ¶
func (c *FileConfig) ConfigureFile(config *FileConfig)
ConfigureFile applies configuration options from c to config.
func (*FileConfig) Validate ¶
func (c *FileConfig) Validate() error
Validate returns a non-nil error if the configuration of c is invalid.
type FileOption ¶
type FileOption interface {
ConfigureFile(*FileConfig)
}
FileOption is an interface implemented by types that carry configuration options for parquet files.
func FileReadMode ¶
func FileReadMode(mode ReadMode) FileOption
FileReadMode is a file configuration option which controls the way pages are read. Currently the only two options are ReadModeAsync and ReadModeSync which control whether or not pages are loaded asynchronously. It can be advantageous to use ReadModeAsync if your reader is backed by network storage.
Defaults to ReadModeSync.
func FileSchema ¶
func FileSchema(schema *Schema) FileOption
FileSchema is used to pass a known schema in while opening a Parquet file. This optimization is only useful if your application is currently opening an extremely large number of parquet files with the same, known schema.
Defaults to nil.
func ReadBufferSize ¶
func ReadBufferSize(size int) FileOption
ReadBufferSize is a file configuration option which controls the default buffer sizes for reads made to the provided io.Reader. The default of 4096 is appropriate for disk based access but if your reader is backed by network storage it can be advantageous to increase this value to something more like 4 MiB.
Defaults to 4096.
func SkipBloomFilters ¶
func SkipBloomFilters(skip bool) FileOption
SkipBloomFilters is a file configuration option which prevents automatically reading the bloom filters when opening a parquet file, when set to true. This is useful as an optimization when programs know that they will not need to consume the bloom filters.
Defaults to false.
func SkipPageIndex ¶
func SkipPageIndex(skip bool) FileOption
SkipPageIndex is a file configuration option which prevents automatically reading the page index when opening a parquet file, when set to true. This is useful as an optimization when programs know that they will not need to consume the page index.
Defaults to false.
type FixedLenByteArrayReader ¶
type FixedLenByteArrayReader interface { // Read values into the byte buffer passed as argument, returning the number // of values written to the buffer (not the number of bytes). // // The method returns io.EOF when all values have been read. // // If the buffer was not empty, but too small to hold at least one value, // io.ErrShortBuffer is returned. ReadFixedLenByteArrays(values []byte) (int, error) }
FixedLenByteArrayReader is an interface implemented by ValueReader instances which expose the content of a column of fixed length byte array values.
type FixedLenByteArrayWriter ¶
type FixedLenByteArrayWriter interface { // Writes the fixed length byte array values. // // The size of the values is assumed to be the same as the expected size of // items in the column. The method errors if the length of the input values // is not a multiple of the expected item size. WriteFixedLenByteArrays(values []byte) (int, error) }
FixedLenByteArrayWriter is an interface implemented by ValueWriter instances which support writing columns of fixed length byte array values.
type FloatReader ¶
type FloatReader interface { // Read single-precision floating point values into the buffer passed as // argument. // // The method returns io.EOF when all values have been read. ReadFloats(values []float32) (int, error) }
FloatReader is an interface implemented by ValueReader instances which expose the content of a column of single-precision floating point values.
type FloatWriter ¶
type FloatWriter interface { // Write single-precision floating point values. // // The method returns the number of values written, and any error that // occurred while writing the values. WriteFloats(values []float32) (int, error) }
FloatWriter is an interface implemented by ValueWriter instances which support writing columns of single-precision floating point values.
type GenericBuffer ¶
type GenericBuffer[T any] struct { // contains filtered or unexported fields }
GenericBuffer is similar to a Buffer but uses a type parameter to define the Go type representing the schema of rows in the buffer.
See GenericWriter for details about the benefits over the classic Buffer API.
func NewGenericBuffer ¶
func NewGenericBuffer[T any](options ...RowGroupOption) *GenericBuffer[T]
NewGenericBuffer is like NewBuffer but returns a GenericBuffer[T] suited to write rows of Go type T.
The type parameter T should be a map, struct, or any. Any other types will cause a panic at runtime. Type checking is a lot more effective when the generic parameter is a struct type, using map and interface types is somewhat similar to using a Writer. If using an interface type for the type parameter, then providing a schema at instantiation is required.
The option list may explicitly declare a schema; if it does, the schema must be compatible with the schema generated from T.
func (*GenericBuffer[T]) ColumnBuffers ¶
func (buf *GenericBuffer[T]) ColumnBuffers() []ColumnBuffer
func (*GenericBuffer[T]) ColumnChunks ¶
func (buf *GenericBuffer[T]) ColumnChunks() []ColumnChunk
func (*GenericBuffer[T]) Len ¶
func (buf *GenericBuffer[T]) Len() int
func (*GenericBuffer[T]) Less ¶
func (buf *GenericBuffer[T]) Less(i, j int) bool
func (*GenericBuffer[T]) NumRows ¶
func (buf *GenericBuffer[T]) NumRows() int64
func (*GenericBuffer[T]) Reset ¶
func (buf *GenericBuffer[T]) Reset()
func (*GenericBuffer[T]) Rows ¶
func (buf *GenericBuffer[T]) Rows() Rows
func (*GenericBuffer[T]) Schema ¶
func (buf *GenericBuffer[T]) Schema() *Schema
func (*GenericBuffer[T]) Size ¶
func (buf *GenericBuffer[T]) Size() int64
func (*GenericBuffer[T]) SortingColumns ¶
func (buf *GenericBuffer[T]) SortingColumns() []SortingColumn
func (*GenericBuffer[T]) Swap ¶
func (buf *GenericBuffer[T]) Swap(i, j int)
func (*GenericBuffer[T]) Write ¶
func (buf *GenericBuffer[T]) Write(rows []T) (int, error)
func (*GenericBuffer[T]) WriteRowGroup ¶
func (buf *GenericBuffer[T]) WriteRowGroup(rowGroup RowGroup) (int64, error)
type GenericReader ¶
type GenericReader[T any] struct { // contains filtered or unexported fields }
GenericReader is similar to a Reader but uses a type parameter to define the Go type representing the schema of rows being read.
See GenericWriter for details about the benefits over the classic Reader API.
func NewGenericReader ¶
func NewGenericReader[T any](input io.ReaderAt, options ...ReaderOption) *GenericReader[T]
NewGenericReader is like NewReader but returns a GenericReader[T] suited to read rows of Go type T.
The type parameter T should be a map, struct, or any. Any other types will cause a panic at runtime. Type checking is a lot more effective when the generic parameter is a struct type; using map and interface types is somewhat similar to using a Reader.
The option list may explicitly declare a schema; if it does, that schema must be compatible with the schema generated from T.
func NewGenericRowGroupReader ¶
func NewGenericRowGroupReader[T any](rowGroup RowGroup, options ...ReaderOption) *GenericReader[T]
func (*GenericReader[T]) Close ¶
func (r *GenericReader[T]) Close() error
func (*GenericReader[T]) NumRows ¶
func (r *GenericReader[T]) NumRows() int64
func (*GenericReader[T]) Read ¶
func (r *GenericReader[T]) Read(rows []T) (int, error)
Read reads the next rows from the reader into the given rows slice up to len(rows).
The returned values are safe to reuse across Read calls and do not share memory with the reader's underlying page buffers.
The method returns the number of rows read and io.EOF when no more rows can be read from the reader.
func (*GenericReader[T]) Reset ¶
func (r *GenericReader[T]) Reset()
func (*GenericReader[T]) Schema ¶
func (r *GenericReader[T]) Schema() *Schema
func (*GenericReader[T]) SeekToRow ¶
func (r *GenericReader[T]) SeekToRow(rowIndex int64) error
type GenericWriter ¶
type GenericWriter[T any] struct { // contains filtered or unexported fields }
GenericWriter is similar to a Writer but uses a type parameter to define the Go type representing the schema of rows being written.
Using this type over Writer has multiple advantages:
By leveraging type information, the Go compiler can provide greater guarantees that the code is correct. For example, the parquet.Writer.Write method accepts an argument of type interface{}, which delays type checking until runtime. The parquet.GenericWriter[T].Write method ensures at compile time that the values it receives will be of type T, reducing the risk of introducing errors.
Since type information is known at compile time, the implementation of parquet.GenericWriter[T] can make safe assumptions, removing the need for runtime validation of how the parameters are passed to its methods. Optimizations relying on type information are more effective, some of the writer's state can be precomputed at initialization, which was not possible with parquet.Writer.
The parquet.GenericWriter[T].Write method uses a data-oriented design, accepting a slice of T instead of a single value, creating more opportunities to amortize the runtime cost of abstractions. This optimization is not available for parquet.Writer because its Write method's argument would be of type []interface{}, which would require conversions back and forth from concrete types to empty interfaces (since a []T cannot be interpreted as []interface{} in Go), would make the API more difficult to use and waste compute resources in the type conversions, defeating the purpose of the optimization in the first place.
Note that this type is only available when compiling with Go 1.18 or later.
func NewGenericWriter ¶
func NewGenericWriter[T any](output io.Writer, options ...WriterOption) *GenericWriter[T]
NewGenericWriter is like NewWriter but returns a GenericWriter[T] suited to write rows of Go type T.
The type parameter T should be a map, struct, or any. Any other types will cause a panic at runtime. Type checking is a lot more effective when the generic parameter is a struct type, using map and interface types is somewhat similar to using a Writer.
The option list may explicitly declare a schema; if it does, that schema must be compatible with the schema generated from T.
Sorting columns may be set on the writer to configure the generated row groups metadata. However, rows are always written in the order they were seen, no reordering is performed, the writer expects the application to ensure proper correlation between the order of rows and the list of sorting columns. See SortingWriter[T] for a writer which handles reordering rows based on the configured sorting columns.
func (*GenericWriter[T]) Close ¶
func (w *GenericWriter[T]) Close() error
func (*GenericWriter[T]) Flush ¶
func (w *GenericWriter[T]) Flush() error
func (*GenericWriter[T]) ReadRowsFrom ¶
func (w *GenericWriter[T]) ReadRowsFrom(rows RowReader) (int64, error)
func (*GenericWriter[T]) Reset ¶
func (w *GenericWriter[T]) Reset(output io.Writer)
func (*GenericWriter[T]) Schema ¶
func (w *GenericWriter[T]) Schema() *Schema
func (*GenericWriter[T]) SetKeyValueMetadata ¶
func (w *GenericWriter[T]) SetKeyValueMetadata(key, value string)
SetKeyValueMetadata sets a key/value pair in the Parquet file metadata.
Keys are assumed to be unique; if the same key is repeated multiple times, the last value is retained. While the parquet format does not require unique keys, this design decision was made to optimize for the most common use case where applications leverage this extension mechanism to associate single values to keys. This may create incompatibilities with other parquet libraries, or may cause some key/value pairs to be lost when opening parquet files written with repeated keys. We can revisit this decision if it ever becomes a blocker.
func (*GenericWriter[T]) Write ¶
func (w *GenericWriter[T]) Write(rows []T) (int, error)
func (*GenericWriter[T]) WriteRowGroup ¶
func (w *GenericWriter[T]) WriteRowGroup(rowGroup RowGroup) (int64, error)
type Group ¶
func (Group) Compression ¶
type Int32Reader ¶
type Int32Reader interface { // Read 32 bits integer values into the buffer passed as argument. // // The method returns io.EOF when all values have been read. ReadInt32s(values []int32) (int, error) }
Int32Reader is an interface implemented by ValueReader instances which expose the content of a column of int32 values.
type Int32Writer ¶
type Int32Writer interface { // Write 32 bits signed integer values. // // The method returns the number of values written, and any error that // occurred while writing the values. WriteInt32s(values []int32) (int, error) }
Int32Writer is an interface implemented by ValueWriter instances which support writing columns of 32 bits signed integer values.
type Int64Reader ¶
type Int64Reader interface { // Read 64 bits integer values into the buffer passed as argument. // // The method returns io.EOF when all values have been read. ReadInt64s(values []int64) (int, error) }
Int64Reader is an interface implemented by ValueReader instances which expose the content of a column of int64 values.
type Int64Writer ¶
type Int64Writer interface { // Write 64 bits signed integer values. // // The method returns the number of values written, and any error that // occurred while writing the values. WriteInt64s(values []int64) (int, error) }
Int64Writer is an interface implemented by ValueWriter instances which support writing columns of 64 bits signed integer values.
type Int96Reader ¶
type Int96Reader interface { // Read 96 bits integer values into the buffer passed as argument. // // The method returns io.EOF when all values have been read. ReadInt96s(values []deprecated.Int96) (int, error) }
Int96Reader is an interface implemented by ValueReader instances which expose the content of a column of int96 values.
type Int96Writer ¶
type Int96Writer interface { // Write 96 bits signed integer values. // // The method returns the number of values written, and any error that // occurred while writing the values. WriteInt96s(values []deprecated.Int96) (int, error) }
Int96Writer is an interface implemented by ValueWriter instances which support writing columns of 96 bits signed integer values.
type Kind ¶
type Kind int8
Kind is an enumeration type representing the physical types supported by the parquet type system.
const ( Boolean Kind = Kind(format.Boolean) Int32 Kind = Kind(format.Int32) Int64 Kind = Kind(format.Int64) Int96 Kind = Kind(format.Int96) Float Kind = Kind(format.Float) Double Kind = Kind(format.Double) ByteArray Kind = Kind(format.ByteArray) FixedLenByteArray Kind = Kind(format.FixedLenByteArray) )
type LeafColumn ¶
type LeafColumn struct { Node Node Path []string ColumnIndex int MaxRepetitionLevel int MaxDefinitionLevel int }
LeafColumn is a struct type representing leaf columns of a parquet schema.
type Node ¶
type Node interface { // Returns a human-readable representation of the parquet node. String() string // For leaf nodes, returns the type of values of the parquet column. // // Calling this method on non-leaf nodes will panic. Type() Type // Returns whether the parquet column is optional. Optional() bool // Returns whether the parquet column is repeated. Repeated() bool // Returns whether the parquet column is required. Required() bool // Returns true if this is a leaf node. Leaf() bool // Returns a mapping of the node's fields. // // As an optimization, the same slices may be returned by multiple calls to // this method, programs must treat the returned values as immutable. // // This method returns an empty mapping when called on leaf nodes. Fields() []Field // Returns the encoding used by the node. // // The method may return nil to indicate that no specific encoding was // configured on the node, in which case a default encoding might be used. Encoding() encoding.Encoding // Returns the compression codec used by the node. // // The method may return nil to indicate that no specific compression codec // was configured on the node, in which case a default compression might be // used. Compression() compress.Codec // Returns the Go type that best represents the parquet node. // // For leaf nodes, this will be one of bool, int32, int64, deprecated.Int96, // float32, float64, string, []byte, or [N]byte. // // For groups, the method returns a struct type. // // If the method is called on a repeated node, the method returns a slice of // the underlying type. // // For optional nodes, the method returns a pointer of the underlying type. // // For nodes that were constructed from Go values (e.g. using SchemaOf), the // method returns the original Go type. GoType() reflect.Type }
Node values represent nodes of a parquet schema.
Nodes carry the type of values, as well as properties like whether the values are optional or repeated. Nodes with one or more children represent parquet groups and therefore do not have a logical type.
Nodes are immutable values and therefore safe to use concurrently from multiple goroutines.
func BSON ¶
func BSON() Node
BSON constructs a leaf node of BSON logical type.
https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#bson
func Compressed ¶
Compressed wraps the node passed as argument to use the given compression codec.
If the codec is nil, the node's compression is left unchanged.
The function panics if it is called on a non-leaf node.
func Date ¶
func Date() Node
Date constructs a leaf node of DATE logical type.
https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#date
func Decimal ¶
Decimal constructs a leaf node of decimal logical type with the given scale, precision, and underlying type.
https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#decimal
func Encoded ¶
Encoded wraps the node passed as argument to use the given encoding.
The function panics if it is called on a non-leaf node, or if the encoding does not support the node type.
func Enum ¶
func Enum() Node
Enum constructs a leaf node with a logical type representing enumerations.
https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#enum
func Int ¶
Int constructs a leaf node of signed integer logical type of the given bit width.
The bit width must be one of 8, 16, 32, 64, or the function will panic.
func JSON ¶
func JSON() Node
JSON constructs a leaf node of JSON logical type.
https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#json
func List ¶
List constructs a node of LIST logical type.
https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists
func Map ¶
Map constructs a node of MAP logical type.
https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#maps
func String ¶
func String() Node
String constructs a leaf node of UTF8 logical type.
https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#string
func Time ¶
Time constructs a leaf node of TIME logical type.
https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#time
func Timestamp ¶
Timestamp constructs a leaf node of TIMESTAMP logical type.
https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#timestamp
func UUID ¶
func UUID() Node
UUID constructs a leaf node of UUID logical type.
https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#uuid
type OffsetIndex ¶
type OffsetIndex interface { // NumPages returns the number of pages in the offset index. NumPages() int // Offset returns the offset starting from the beginning of the file for the // page at the given index. Offset(int) int64 // CompressedPageSize returns the size of the page at the given index // (in bytes). CompressedPageSize(int) int64 // FirstRowIndex returns the first row in the page at the given index. // // The returned row index is based on the row group that the page belongs // to, the first row has index zero. FirstRowIndex(int) int64 }
type Page ¶
type Page interface { // Returns the type of values read from this page. // // The returned type can be used to encode the page data, in the case of // an indexed page (which has a dictionary), the type is configured to // encode the indexes stored in the page rather than the plain values. Type() Type // Returns the column index that this page belongs to. Column() int // If the page contains indexed values, calling this method returns the // dictionary in which the values are looked up. Otherwise, the method // returns nil. Dictionary() Dictionary // Returns the number of rows, values, and nulls in the page. The number of // rows may be less than the number of values in the page if the page is // part of a repeated column. NumRows() int64 NumValues() int64 NumNulls() int64 // Returns the page's min and max values. // // The third value is a boolean indicating whether the page bounds were // available. Page bounds may not be known if the page contained no values // or only nulls, or if they were read from a parquet file which had neither // page statistics nor a page index. Bounds() (min, max Value, ok bool) // Returns the size of the page in bytes (uncompressed). Size() int64 // Returns a reader exposing the values contained in the page. // // Depending on the underlying implementation, the returned reader may // support reading an array of typed Go values by implementing interfaces // like parquet.Int32Reader. Applications should use type assertions on // the returned reader to determine whether those optimizations are // available. Values() ValueReader // Returns a new page which is a slice of the receiver between row indexes // i and j. Slice(i, j int64) Page // Expose the lists of repetition and definition levels of the page. // // The returned slices may be empty when the page has no repetition or // definition levels. RepetitionLevels() []byte DefinitionLevels() []byte // Returns the in-memory buffer holding the page values. 
// // The intent is for the returned value to be used as input parameter when // calling the Encode method of the associated Type. // // The slices referenced by the encoding.Values may be the same across // multiple calls to this method, applications must treat the content as // immutable. Data() encoding.Values }
Page values represent sequences of parquet values. From the Parquet documentation: "Column chunks are a chunk of the data for a particular column. They live in a particular row group and are guaranteed to be contiguous in the file. Column chunks are divided up into pages. A page is conceptually an indivisible unit (in terms of compression and encoding). There can be multiple page types which are interleaved in a column chunk."
type PageHeader ¶
type PageHeader interface { // Returns the number of values in the page (including nulls). NumValues() int64 // Returns the page encoding. Encoding() format.Encoding // Returns the parquet format page type. PageType() format.PageType }
PageHeader is an interface implemented by parquet page headers.
type PageReader ¶
type PageReader interface { // Reads and returns the next page from the sequence. When all pages have // been read, or if the sequence was closed, the method returns io.EOF. ReadPage() (Page, error) }
PageReader is an interface implemented by types that support producing a sequence of pages.
type PageWriter ¶
PageWriter is an interface implemented by types that support writing pages to an underlying storage medium.
type Pages ¶
type Pages interface { PageReader RowSeeker io.Closer }
Pages is an interface implemented by page readers returned by calling the Pages method of ColumnChunk instances.
func AsyncPages ¶
AsyncPages wraps the given Pages instance to perform page reads asynchronously in a separate goroutine.
Performing page reads asynchronously is important when the application may be reading pages from a high latency backend, and the last page read may be processed while initiating reading of the next page.
type ReadMode ¶
type ReadMode int
ReadMode is an enum that is used to configure the way that a File reads pages.
type Reader
deprecated
type Reader struct {
// contains filtered or unexported fields
}
Deprecated: Use GenericReader[T] instead. A Reader reads Go values from parquet files.
This example showcases a typical use of parquet readers:
reader := parquet.NewReader(file) rows := []RowType{} for { row := RowType{} err := reader.Read(&row) if err != nil { if err == io.EOF { break } ... } rows = append(rows, row) } if err := reader.Close(); err != nil { ... }
For programs building with Go 1.18 or later, the GenericReader[T] type supersedes this one.
func NewReader ¶
func NewReader(input io.ReaderAt, options ...ReaderOption) *Reader
NewReader constructs a parquet reader reading rows from the given io.ReaderAt.
In order to read parquet rows, the io.ReaderAt must be converted to a parquet.File. If r is already a parquet.File it is used directly; otherwise, the io.ReaderAt value is expected to either have a `Size() int64` method or implement io.Seeker in order to determine its size.
The function panics if the reader configuration is invalid. Programs that cannot guarantee the validity of the options passed to NewReader should construct the reader configuration independently prior to calling this function:
config, err := parquet.NewReaderConfig(options...) if err != nil { // handle the configuration error ... } else { // this call to create a reader is guaranteed not to panic reader := parquet.NewReader(input, config) ... }
func NewRowGroupReader ¶
func NewRowGroupReader(rowGroup RowGroup, options ...ReaderOption) *Reader
NewRowGroupReader constructs a new Reader which reads rows from the RowGroup passed as argument.
func (*Reader) Read ¶
Read reads the next row from r. The type of the row must match the schema of the underlying parquet file or an error will be returned.
The method returns io.EOF when no more rows can be read from r.
func (*Reader) ReadRows ¶
ReadRows reads the next rows from r into the given Row buffer.
The returned values are laid out in the order expected by the parquet.(*Schema).Reconstruct method.
The method returns io.EOF when no more rows can be read from r.
func (*Reader) Reset ¶
func (r *Reader) Reset()
Reset repositions the reader at the beginning of the underlying parquet file.
type ReaderConfig ¶
type ReaderConfig struct {
Schema *Schema
}
The ReaderConfig type carries configuration options for parquet readers.
ReaderConfig implements the ReaderOption interface so it can be used directly as argument to the NewReader function when needed, for example:
reader := parquet.NewReader(input, schema, &parquet.ReaderConfig{ // ... })
func DefaultReaderConfig ¶
func DefaultReaderConfig() *ReaderConfig
DefaultReaderConfig returns a new ReaderConfig value initialized with the default reader configuration.
func NewReaderConfig ¶
func NewReaderConfig(options ...ReaderOption) (*ReaderConfig, error)
NewReaderConfig constructs a new reader configuration applying the options passed as arguments.
The function returns a non-nil error if some of the options carried invalid configuration values.
func (*ReaderConfig) Apply ¶
func (c *ReaderConfig) Apply(options ...ReaderOption)
Apply applies the given list of options to c.
func (*ReaderConfig) ConfigureReader ¶
func (c *ReaderConfig) ConfigureReader(config *ReaderConfig)
ConfigureReader applies configuration options from c to config.
func (*ReaderConfig) Validate ¶
func (c *ReaderConfig) Validate() error
Validate returns a non-nil error if the configuration of c is invalid.
type ReaderOption ¶
type ReaderOption interface {
ConfigureReader(*ReaderConfig)
}
ReaderOption is an interface implemented by types that carry configuration options for parquet readers.
type Row ¶
type Row []Value
Row represents a parquet row as a slice of values.
Each value should embed a column index, repetition level, and definition level allowing the program to determine how to reconstruct the original object from the row.
func AppendRow ¶
AppendRow appends to row the given list of column values.
AppendRow can be used to construct a Row value from columns, while retaining the underlying memory buffer to avoid reallocation; for example:
The function panics if the column indexes of values in each column do not match their position in the argument list.
func MakeRow ¶
MakeRow constructs a Row from a list of column values.
The function panics if the column indexes of values in each column do not match their position in the argument list.
func (Row) Clone ¶
Clone creates a copy of the row which shares no pointers.
This method is useful to capture rows after a call to RowReader.ReadRows when values need to be retained before the next call to ReadRows or after the lifespan of the reader.
type RowBuffer ¶
type RowBuffer[T any] struct { // contains filtered or unexported fields }
RowBuffer is an implementation of the RowGroup interface which stores parquet rows in memory.
Unlike GenericBuffer which uses a column layout to store values in memory buffers, RowBuffer uses a row layout. The use of row layout provides greater efficiency when sorting the buffer, which is the primary use case for the RowBuffer type. Applications which intend to sort rows prior to writing them to a parquet file will often see lower CPU utilization from using a RowBuffer than a GenericBuffer.
RowBuffer values are not safe to use concurrently from multiple goroutines.
func NewRowBuffer ¶
func NewRowBuffer[T any](options ...RowGroupOption) *RowBuffer[T]
NewRowBuffer constructs a new row buffer.
func (*RowBuffer[T]) ColumnChunks ¶
func (buf *RowBuffer[T]) ColumnChunks() []ColumnChunk
ColumnChunks returns a view of the buffer's columns.
Note that reading columns of a RowBuffer will be less efficient than reading columns of a GenericBuffer since the latter uses a column layout. This method is mainly exposed to satisfy the RowGroup interface, applications which need compute-efficient column scans on in-memory buffers should likely use a GenericBuffer instead.
The returned column chunks are snapshots at the time the method is called, they remain valid until the next call to Reset on the buffer.
func (*RowBuffer[T]) Len ¶
Len returns the number of rows in the buffer.
The method contributes to satisfying sort.Interface.
func (*RowBuffer[T]) Less ¶
Less compares the rows at index i and j according to the sorting columns configured on the buffer.
The method contributes to satisfying sort.Interface.
func (*RowBuffer[T]) Reset ¶
func (buf *RowBuffer[T]) Reset()
Reset clears the content of the buffer without releasing its memory.
func (*RowBuffer[T]) Rows ¶
Rows returns a Rows instance exposing rows stored in the buffer.
The rows returned are a snapshot at the time the method is called. The returned rows and values read from it remain valid until the next call to Reset on the buffer.
func (*RowBuffer[T]) SortingColumns ¶
func (buf *RowBuffer[T]) SortingColumns() []SortingColumn
SortingColumns returns the list of columns that rows are expected to be sorted by.
The list of sorting columns is configured when the buffer is created and used when it is sorted.
Note that unless the buffer is explicitly sorted, there are no guarantees that the rows it contains will be in the order specified by the sorting columns.
func (*RowBuffer[T]) Swap ¶
Swap exchanges the rows at index i and j in the buffer.
The method contributes to satisfying sort.Interface.
type RowBuilder ¶
type RowBuilder struct {
// contains filtered or unexported fields
}
RowBuilder is a type which helps build parquet rows incrementally by adding values to columns.
Example ¶
builder := parquet.NewRowBuilder(parquet.Group{ "birth_date": parquet.Optional(parquet.Date()), "first_name": parquet.String(), "last_name": parquet.String(), }) builder.Add(1, parquet.ByteArrayValue([]byte("Luke"))) builder.Add(2, parquet.ByteArrayValue([]byte("Skywalker"))) row := builder.Row() row.Range(func(columnIndex int, columnValues []parquet.Value) bool { fmt.Printf("%+v\n", columnValues[0]) return true })
Output: C:0 D:0 R:0 V:<null> C:1 D:0 R:0 V:Luke C:2 D:0 R:0 V:Skywalker
func NewRowBuilder ¶
func NewRowBuilder(schema Node) *RowBuilder
NewRowBuilder constructs a RowBuilder which builds rows for the parquet schema passed as argument.
func (*RowBuilder) Add ¶
func (b *RowBuilder) Add(columnIndex int, columnValue Value)
Add adds columnValue to the column at columnIndex.
func (*RowBuilder) AppendRow ¶
func (b *RowBuilder) AppendRow(row Row) Row
AppendRow appends the current state of b to row and returns it.
func (*RowBuilder) Next ¶
func (b *RowBuilder) Next(columnIndex int)
Next must be called to indicate the start of a new repeated record for the column at the given index.
If the column index is part of a repeated group, the builder automatically starts a new record for all adjacent columns, the application does not need to call this method for each column of the repeated group.
Next must be called after adding a sequence of records.
func (*RowBuilder) Reset ¶
func (b *RowBuilder) Reset()
Reset clears the internal state of b, making it possible to reuse while retaining the internal buffers.
func (*RowBuilder) Row ¶
func (b *RowBuilder) Row() Row
Row materializes the current state of b into a parquet row.
type RowGroup ¶
type RowGroup interface { // Returns the number of rows in the group. NumRows() int64 // Returns the list of column chunks in this row group. The chunks are // ordered in the order of leaf columns from the row group's schema. // // If the underlying implementation is not read-only, the returned // parquet.ColumnChunk may implement other interfaces: for example, // parquet.ColumnBuffer if the chunk is backed by an in-memory buffer, // or typed writer interfaces like parquet.Int32Writer depending on the // underlying type of values that can be written to the chunk. // // As an optimization, the row group may return the same slice across // multiple calls to this method. Applications should treat the returned // slice as read-only. ColumnChunks() []ColumnChunk // Returns the schema of rows in the group. Schema() *Schema // Returns the list of sorting columns describing how rows are sorted in the // group. // // The method will return an empty slice if the rows are not sorted. SortingColumns() []SortingColumn // Returns a reader exposing the rows of the row group. // // As an optimization, the returned parquet.Rows object may implement // parquet.RowWriterTo, and test the RowWriter it receives for an // implementation of the parquet.RowGroupWriter interface. // // This optimization mechanism is leveraged by the parquet.CopyRows function // to skip the generic row-by-row copy algorithm and delegate the copy logic // to the parquet.Rows object. Rows() Rows }
RowGroup is an interface representing a parquet row group. From the Parquet docs, a RowGroup is "a logical horizontal partitioning of the data into rows. There is no physical structure that is guaranteed for a row group. A row group consists of a column chunk for each column in the dataset."
https://github.com/apache/parquet-format#glossary
func ConvertRowGroup ¶
func ConvertRowGroup(rowGroup RowGroup, conv Conversion) RowGroup
ConvertRowGroup constructs a wrapper of the given row group which applies the given schema conversion to its rows.
func MergeRowGroups ¶
func MergeRowGroups(rowGroups []RowGroup, options ...RowGroupOption) (RowGroup, error)
MergeRowGroups constructs a row group which is a merged view of rowGroups. If rowGroups are sorted and the passed options include sorting, the merged row group will also be sorted.
The function validates the input to ensure that the merge operation is possible, ensuring that the schemas match or can be converted to an optionally configured target schema passed as argument in the option list.
The sorting columns of each row group are also consulted to determine whether the output can be represented. If sorting columns are configured on the merge they must be a prefix of sorting columns of all row groups being merged.
func MultiRowGroup ¶
MultiRowGroup wraps multiple row groups to appear as if it was a single RowGroup. RowGroups must have the same schema or it will error.
type RowGroupConfig ¶
type RowGroupConfig struct { ColumnBufferCapacity int Schema *Schema Sorting SortingConfig }
The RowGroupConfig type carries configuration options for parquet row groups.
RowGroupConfig implements the RowGroupOption interface so it can be used directly as argument to the NewBuffer function when needed, for example:
buffer := parquet.NewBuffer(&parquet.RowGroupConfig{ ColumnBufferCapacity: 10_000, })
func DefaultRowGroupConfig ¶
func DefaultRowGroupConfig() *RowGroupConfig
DefaultRowGroupConfig returns a new RowGroupConfig value initialized with the default row group configuration.
func NewRowGroupConfig ¶
func NewRowGroupConfig(options ...RowGroupOption) (*RowGroupConfig, error)
NewRowGroupConfig constructs a new row group configuration applying the options passed as arguments.
The function returns a non-nil error if some of the options carried invalid configuration values.
func (*RowGroupConfig) Apply ¶
func (c *RowGroupConfig) Apply(options ...RowGroupOption)
func (*RowGroupConfig) ConfigureRowGroup ¶
func (c *RowGroupConfig) ConfigureRowGroup(config *RowGroupConfig)
func (*RowGroupConfig) Validate ¶
func (c *RowGroupConfig) Validate() error
Validate returns a non-nil error if the configuration of c is invalid.
type RowGroupOption ¶
type RowGroupOption interface {
ConfigureRowGroup(*RowGroupConfig)
}
RowGroupOption is an interface implemented by types that carry configuration options for parquet row groups.
func ColumnBufferCapacity ¶
func ColumnBufferCapacity(size int) RowGroupOption
ColumnBufferCapacity creates a configuration option which defines the size of row group column buffers.
Defaults to 16384.
func SortingRowGroupConfig ¶
func SortingRowGroupConfig(options ...SortingOption) RowGroupOption
SortingRowGroupConfig is a row group option which applies configuration specific to sorting row groups.
type RowGroupReader ¶
RowGroupReader is an interface implemented by types that expose sequences of row groups to the application.
type RowGroupWriter ¶
RowGroupWriter is an interface implemented by types that allow the program to write row groups.
type RowReadSeeker ¶
RowReadSeeker is an interface implemented by row readers which support seeking to arbitrary row positions.
type RowReader ¶
type RowReader interface { // ReadRows reads rows from the reader, returning the number of rows read // into the buffer, and any error that occurred. Note that the rows read // into the buffer are not safe for reuse after a subsequent call to // ReadRows. Callers that want to reuse rows must copy the rows using Clone. // // When all rows have been read, the reader returns io.EOF to indicate the // end of the sequence. It is valid for the reader to return both a non-zero // number of rows and a non-nil error (including io.EOF). // // The buffer of rows passed as argument will be used to store values of // each row read from the reader. If the rows are not nil, the backing array // of the slices will be used as an optimization to avoid re-allocating new // arrays. // // The application is expected to handle the case where ReadRows returns // less rows than requested and no error, by looking at the first returned // value from ReadRows, which is the number of rows that were read. ReadRows([]Row) (int, error) }
RowReader reads a sequence of parquet rows.
func DedupeRowReader ¶
DedupeRowReader constructs a row reader which drops duplicated consecutive rows, according to the comparator function passed as argument.
If the underlying reader produces a sequence of rows sorted by the same comparison predicate, the output is guaranteed to produce unique rows only.
func FilterRowReader ¶
FilterRowReader constructs a RowReader which exposes rows from reader for which the predicate has returned true.
func MergeRowReaders ¶
MergeRowReaders constructs a RowReader which creates an ordered sequence of all the readers using the given compare function as the ordering predicate.
func ScanRowReader ¶
ScanRowReader constructs a RowReader which exposes rows from reader until the predicate returns false for one of the rows, or EOF is reached.
func TransformRowReader ¶
TransformRowReader constructs a RowReader which applies the given transform to each row read from reader.
The transformation function appends the transformed src row to dst, returning dst and any error that occurred during the transformation. If dst is returned unchanged, the row is skipped.
type RowReaderFrom ¶
RowReaderFrom reads parquet rows from reader.
type RowReaderFunc ¶
RowReaderFunc is a function type implementing the RowReader interface.
type RowReaderWithSchema ¶
RowReaderWithSchema is an extension of the RowReader interface which advertises the schema of rows returned by ReadRows calls.
func ConvertRowReader ¶
func ConvertRowReader(rows RowReader, conv Conversion) RowReaderWithSchema
ConvertRowReader constructs a wrapper of the given row reader which applies the given schema conversion to the rows.
type RowSeeker ¶
type RowSeeker interface { // Positions the stream on the given row index. // // Some implementations of the interface may only allow seeking forward. // // The method returns io.ErrClosedPipe if the stream had already been closed. SeekToRow(int64) error }
RowSeeker is an interface implemented by readers of parquet rows which can be positioned at a specific row index.
type RowWriter ¶
type RowWriter interface { // Writes rows to the writer, returning the number of rows written and any // error that occurred. // // Because columnar operations operate on independent columns of values, // writes of rows may not be atomic operations, and could result in some // rows being partially written. The method returns the number of rows that // were successfully written, but if an error occurs, values of the row(s) // that failed to be written may have been partially committed to their // columns. For that reason, applications should consider a write error as // fatal and assume that they need to discard the state, they cannot retry // the write nor recover the underlying file. WriteRows([]Row) (int, error) }
RowWriter writes parquet rows to an underlying medium.
func DedupeRowWriter ¶
DedupeRowWriter constructs a row writer which drops duplicated consecutive rows, according to the comparator function passed as argument.
If the writer is given a sequence of rows sorted by the same comparison predicate, the output is guaranteed to contain unique rows only.
func FilterRowWriter ¶
FilterRowWriter constructs a RowWriter which writes rows to writer for which the predicate has returned true.
func MultiRowWriter ¶
MultiRowWriter constructs a RowWriter which dispatches writes to all the writers passed as arguments.
When writing rows, if any of the writers returns an error, the operation is aborted and the error returned. If one of the writers did not error, but did not write all the rows, the operation is aborted and io.ErrShortWrite is returned.
Rows are written sequentially to each writer in the order they are given to this function.
func TransformRowWriter ¶
TransformRowWriter constructs a RowWriter which applies the given transform to each row written to writer.
The transformation function appends the transformed src row to dst, returning dst and any error that occurred during the transformation. If dst is returned unchanged, the row is skipped.
type RowWriterFunc ¶
RowWriterFunc is a function type implementing the RowWriter interface.
type RowWriterTo ¶
RowWriterTo writes parquet rows to a writer.
type RowWriterWithSchema ¶
RowWriterWithSchema is an extension of the RowWriter interface which advertises the schema of rows expected to be passed to WriteRows calls.
type Rows ¶
type Rows interface { RowReaderWithSchema RowSeeker io.Closer }
Rows is an interface implemented by row readers returned by calling the Rows method of RowGroup instances.
Applications should call Close when they are done using a Rows instance in order to release the underlying resources held by the row sequence.
After calling Close, all attempts to read more rows will return io.EOF.
func NewRowGroupRowReader ¶
type Schema ¶
type Schema struct {
// contains filtered or unexported fields
}
Schema represents a parquet schema created from a Go value.
Schema implements the Node interface to represent the root node of a parquet schema.
func NewSchema ¶
NewSchema constructs a new Schema object with the given name and root node.
The function panics if Node contains more leaf columns than supported by the package (see parquet.MaxColumnIndex).
func SchemaOf ¶
func SchemaOf(model interface{}) *Schema
SchemaOf constructs a parquet schema from a Go value.
The function can construct parquet schemas from struct or pointer-to-struct values only. A panic is raised if a Go value of a different type is passed to this function.
When creating a parquet Schema from a Go value, the struct fields may contain a "parquet" tag to describe properties of the parquet node. The "parquet" tag follows the conventional format of Go struct tags: a comma-separated list of values describe the options, with the first one defining the name of the parquet column.
The following options are also supported in the "parquet" struct tag:
optional | make the parquet column optional snappy | sets the parquet column compression codec to snappy gzip | sets the parquet column compression codec to gzip brotli | sets the parquet column compression codec to brotli lz4 | sets the parquet column compression codec to lz4 zstd | sets the parquet column compression codec to zstd plain | enables the plain encoding (no-op default) dict | enables dictionary encoding on the parquet column delta | enables delta encoding on the parquet column list | for slice types, use the parquet LIST logical type enum | for string types, use the parquet ENUM logical type uuid | for string and [16]byte types, use the parquet UUID logical type decimal | for int32, int64 and [n]byte types, use the parquet DECIMAL logical type date | for int32 types use the DATE logical type timestamp | for int64 types use the TIMESTAMP logical type with, by default, millisecond precision split | for float32/float64, use the BYTE_STREAM_SPLIT encoding
The date logical type is an int32 value of the number of days since the unix epoch.
The timestamp precision can be changed by defining which precision to use as an argument. Supported precisions are: nanosecond, millisecond and microsecond. Example:
type Message struct { TimestampMicros int64 `parquet:"timestamp_micros,timestamp(microsecond)"` }
The decimal tag must be followed by two integer parameters, the first integer representing the scale and the second the precision; for example:
type Item struct { Cost int64 `parquet:"cost,decimal(0:3)"` }
Invalid combination of struct tags and Go types, or repeating options will cause the function to panic.
As a special case, if the field tag is "-", the field is omitted from the schema and the data will not be written into the parquet file(s). Note that a field with name "-" can still be generated using the tag "-,".
The configuration of Parquet maps is done via two tags:
- The `parquet-key` tag allows users to configure the key of a map.
- The `parquet-value` tag allows users to configure a map's values, for example to declare their native Parquet types.
When configuring a Parquet map, the `parquet` tag will configure the map itself.
For example, the following will set the int64 key of the map to be a timestamp:
type Actions struct { Action map[int64]string `parquet:"," parquet-key:",timestamp"` }
The schema name is the Go type name of the value.
func (*Schema) Columns ¶
Columns returns the list of column paths available in the schema.
The method always returns the same slice value across calls to Columns; applications should treat it as immutable.
func (*Schema) Comparator ¶
func (s *Schema) Comparator(sortingColumns ...SortingColumn) func(Row, Row) int
Comparator constructs a comparator function which orders rows according to the list of sorting columns passed as arguments.
func (*Schema) Compression ¶
Compression returns the compression codec set on the root node of the parquet schema.
func (*Schema) ConfigureReader ¶
func (s *Schema) ConfigureReader(config *ReaderConfig)
ConfigureReader satisfies the ReaderOption interface, allowing Schema instances to be passed to NewReader to pre-declare the schema of rows read from the reader.
func (*Schema) ConfigureRowGroup ¶
func (s *Schema) ConfigureRowGroup(config *RowGroupConfig)
ConfigureRowGroup satisfies the RowGroupOption interface, allowing Schema instances to be passed to row group constructors to pre-declare the schema of the output parquet file.
func (*Schema) ConfigureWriter ¶
func (s *Schema) ConfigureWriter(config *WriterConfig)
ConfigureWriter satisfies the WriterOption interface, allowing Schema instances to be passed to NewWriter to pre-declare the schema of the output parquet file.
func (*Schema) Deconstruct ¶
Deconstruct deconstructs a Go value and appends it to a row.
The method panics if the structure of the go value does not match the parquet schema.
func (*Schema) Lookup ¶
func (s *Schema) Lookup(path ...string) (LeafColumn, bool)
Lookup returns the leaf column at the given path.
The path is the sequence of column names identifying a leaf column (not including the root).
If the path was not found in the mapping, or if it did not represent a leaf column of the parquet schema, the boolean will be false.
Example ¶
schema := parquet.SchemaOf(struct { FirstName string `parquet:"first_name"` LastName string `parquet:"last_name"` Attributes []struct { Name string `parquet:"name"` Value string `parquet:"value"` } `parquet:"attributes"` }{}) for _, path := range schema.Columns() { leaf, _ := schema.Lookup(path...) fmt.Printf("%d => %q\n", leaf.ColumnIndex, strings.Join(path, ".")) }
Output: 0 => "first_name" 1 => "last_name" 2 => "attributes.name" 3 => "attributes.value"
func (*Schema) Optional ¶
Optional returns false since the root node of a parquet schema is always required.
func (*Schema) Reconstruct ¶
Reconstruct reconstructs a Go value from a row.
The go value passed as first argument must be a non-nil pointer for the row to be decoded into.
The method panics if the structure of the go value and parquet row do not match.
func (*Schema) Repeated ¶
Repeated returns false since the root node of a parquet schema is always required.
func (*Schema) Required ¶
Required returns true since the root node of a parquet schema is always required.
type SortingColumn ¶
type SortingColumn interface { // Returns the path of the column in the row group schema, omitting the name // of the root node. Path() []string // Returns true if the column will sort values in descending order. Descending() bool // Returns true if the column will put null values at the beginning. NullsFirst() bool }
SortingColumn represents a column by which a row group is sorted.
func Ascending ¶
func Ascending(path ...string) SortingColumn
Ascending constructs a SortingColumn value which dictates to sort the column at the path given as argument in ascending order.
func Descending ¶
func Descending(path ...string) SortingColumn
Descending constructs a SortingColumn value which dictates to sort the column at the path given as argument in descending order.
func NullsFirst ¶
func NullsFirst(sortingColumn SortingColumn) SortingColumn
NullsFirst wraps the SortingColumn passed as argument so that it instructs the row group to place null values first in the column.
type SortingConfig ¶
type SortingConfig struct { SortingBuffers BufferPool SortingColumns []SortingColumn DropDuplicatedRows bool }
The SortingConfig type carries configuration options for parquet row groups.
SortingConfig implements the SortingOption interface so it can be used directly as argument to the NewSortingWriter function when needed, for example:
buffer := parquet.NewSortingWriter[Row]( parquet.SortingWriterConfig( parquet.DropDuplicatedRows(true), ), )
func DefaultSortingConfig ¶
func DefaultSortingConfig() *SortingConfig
DefaultSortingConfig returns a new SortingConfig value initialized with the default sorting configuration.
func NewSortingConfig ¶
func NewSortingConfig(options ...SortingOption) (*SortingConfig, error)
NewSortingConfig constructs a new sorting configuration applying the options passed as arguments.
The function returns a non-nil error if some of the options carried invalid configuration values.
func (*SortingConfig) Apply ¶
func (c *SortingConfig) Apply(options ...SortingOption)
func (*SortingConfig) ConfigureSorting ¶
func (c *SortingConfig) ConfigureSorting(config *SortingConfig)
func (*SortingConfig) Validate ¶
func (c *SortingConfig) Validate() error
type SortingOption ¶
type SortingOption interface {
ConfigureSorting(*SortingConfig)
}
SortingOption is an interface implemented by types that carry configuration options for parquet sorting writers.
func DropDuplicatedRows ¶
func DropDuplicatedRows(drop bool) SortingOption
DropDuplicatedRows configures whether a sorting writer will keep or remove duplicated rows.
Two rows are considered duplicates if the values of all their sorting columns are equal.
Defaults to false.
func SortingBuffers ¶
func SortingBuffers(buffers BufferPool) SortingOption
SortingBuffers creates a configuration option which sets the pool of buffers used to hold intermediary state when sorting parquet rows.
Defaults to using in-memory buffers.
func SortingColumns ¶
func SortingColumns(columns ...SortingColumn) SortingOption
SortingColumns creates a configuration option which defines the sorting order of columns in a row group.
The order of sorting columns passed as argument defines the ordering hierarchy; when elements are equal in the first column, the second column is used to order rows, etc...
type SortingWriter ¶
type SortingWriter[T any] struct { // contains filtered or unexported fields }
SortingWriter is a type similar to GenericWriter but it ensures that rows are sorted according to the sorting columns configured on the writer.
The writer accumulates rows in an in-memory buffer which is sorted when it reaches the target number of rows, then written to a temporary row group. When the writer is flushed or closed, the temporary row groups are merged into a row group in the output file, ensuring that rows remain sorted in the final row group.
Because row groups get encoded and compressed, they hold a lot less memory than if all rows were retained in memory. Sorting then merging row chunks also tends to be a lot more efficient than sorting all rows in memory as it results in better CPU cache utilization since sorting multi-megabyte arrays causes a lot of cache misses since the data set cannot be held in CPU caches.
func NewSortingWriter ¶
func NewSortingWriter[T any](output io.Writer, sortRowCount int64, options ...WriterOption) *SortingWriter[T]
NewSortingWriter constructs a new sorting writer which writes a parquet file where rows of each row group are ordered according to the sorting columns configured on the writer.
The sortRowCount argument defines the target number of rows that will be sorted in memory before being written to temporary row groups. The greater this value the more memory is needed to buffer rows in memory. Choosing a value that is too small limits the maximum number of rows that can exist in the output file since the writer cannot create more than 32K temporary row groups to hold the sorted row chunks.
func (*SortingWriter[T]) Close ¶
func (w *SortingWriter[T]) Close() error
func (*SortingWriter[T]) Flush ¶
func (w *SortingWriter[T]) Flush() error
func (*SortingWriter[T]) Reset ¶
func (w *SortingWriter[T]) Reset(output io.Writer)
func (*SortingWriter[T]) Schema ¶
func (w *SortingWriter[T]) Schema() *Schema
func (*SortingWriter[T]) SetKeyValueMetadata ¶
func (w *SortingWriter[T]) SetKeyValueMetadata(key, value string)
func (*SortingWriter[T]) Write ¶
func (w *SortingWriter[T]) Write(rows []T) (int, error)
type TimeUnit ¶
type TimeUnit interface { // Returns the precision of the time unit as a time.Duration value. Duration() time.Duration // Converts the TimeUnit value to its representation in the parquet thrift // format. TimeUnit() format.TimeUnit }
TimeUnit represents units of time in the parquet type system.
type Type ¶
type Type interface { // Returns a human-readable representation of the parquet type. String() string // Returns the Kind value representing the underlying physical type. // // The method panics if it is called on a group type. Kind() Kind // For integer and floating point physical types, the method returns the // size of values in bits. // // For fixed-length byte arrays, the method returns the size of elements // in bytes. // // For other types, the value is zero. Length() int // Returns an estimation of the number of bytes required to hold the given // number of values of this type in memory. // // The method returns zero for group types. EstimateSize(numValues int) int // Returns an estimation of the number of values of this type that can be // held in the given byte size. // // The method returns zero for group types. EstimateNumValues(size int) int // Compares two values and returns a negative integer if a < b, positive if // a > b, or zero if a == b. // // The values' Kind must match the type, otherwise the result is undefined. // // The method panics if it is called on a group type. Compare(a, b Value) int // ColumnOrder returns the type's column order. For group types, this method // returns nil. // // The order describes the comparison logic implemented by the Less method. // // As an optimization, the method may return the same pointer across // multiple calls. Applications must treat the returned value as immutable, // mutating the value will result in undefined behavior. ColumnOrder() *format.ColumnOrder // Returns the physical type as a *format.Type value. For group types, this // method returns nil. // // As an optimization, the method may return the same pointer across // multiple calls. Applications must treat the returned value as immutable, // mutating the value will result in undefined behavior. PhysicalType() *format.Type // Returns the logical type as a *format.LogicalType value. When the logical // type is unknown, the method returns nil. 
// // As an optimization, the method may return the same pointer across // multiple calls. Applications must treat the returned value as immutable, // mutating the value will result in undefined behavior. LogicalType() *format.LogicalType // Returns the logical type's equivalent converted type. When there are // no equivalent converted type, the method returns nil. // // As an optimization, the method may return the same pointer across // multiple calls. Applications must treat the returned value as immutable, // mutating the value will result in undefined behavior. ConvertedType() *deprecated.ConvertedType // Creates a column indexer for values of this type. // // The size limit is a hint to the column indexer that it is allowed to // truncate the page boundaries to the given size. Only BYTE_ARRAY and // FIXED_LEN_BYTE_ARRAY types currently take this value into account. // // A value of zero or less means no limits. // // The method panics if it is called on a group type. NewColumnIndexer(sizeLimit int) ColumnIndexer // Creates a row group buffer column for values of this type. // // Column buffers are created using the index of the column they are // accumulating values in memory for (relative to the parent schema), // and the size of their memory buffer. // // The application may give an estimate of the number of values it expects // to write to the buffer as second argument. This estimate helps set the // initialize buffer capacity but is not a hard limit, the underlying memory // buffer will grown as needed to allow more values to be written. Programs // may use the Size method of the column buffer (or the parent row group, // when relevant) to determine how many bytes are being used, and perform a // flush of the buffers to a storage layer. // // The method panics if it is called on a group type. NewColumnBuffer(columnIndex, numValues int) ColumnBuffer // Creates a dictionary holding values of this type. 
// // The dictionary retains the data buffer, it does not make a copy of it. // If the application needs to share ownership of the memory buffer, it must // ensure that it will not be modified while the page is in use, or it must // make a copy of it prior to creating the dictionary. // // The method panics if the data type does not correspond to the parquet // type it is called on. NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary // Creates a page belonging to a column at the given index, backed by the // data buffer. // // The page retains the data buffer, it does not make a copy of it. If the // application needs to share ownership of the memory buffer, it must ensure // that it will not be modified while the page is in use, or it must make a // copy of it prior to creating the page. // // The method panics if the data type does not correspond to the parquet // type it is called on. NewPage(columnIndex, numValues int, data encoding.Values) Page // Creates an encoding.Values instance backed by the given buffers. // // The offsets is only used by BYTE_ARRAY types, where it represents the // positions of each variable length value in the values buffer. // // The following expression creates an empty instance for any type: // // values := typ.NewValues(nil, nil) // // The method panics if it is called on group types. NewValues(values []byte, offsets []uint32) encoding.Values // Assuming the src buffer contains PLAIN encoded values of the type it is // called on, applies the given encoding and produces the output to the dst // buffer passed as first argument by dispatching the call to one of the // encoding methods. Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) // Assuming the src buffer contains values encoding in the given encoding, // decodes the input and produces the encoded values into the dst output // buffer passed as first argument by dispatching the call to one of the // encoding methods. 
Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) // Returns an estimation of the output size after decoding the values passed // as first argument with the given encoding. // // For most types, this is similar to calling EstimateSize with the known // number of encoded values. For variable size types, using this method may // provide a more precise result since it can inspect the input buffer. EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int // Assigns a Parquet value to a Go value. Returns an error if assignment is // not possible. The source Value must be an expected logical type for the // receiver. This can be accomplished using ConvertValue. AssignValue(dst reflect.Value, src Value) error // Convert a Parquet Value of the given Type into a Parquet Value that is // compatible with the receiver. The returned Value is suitable to be passed // to AssignValue. ConvertValue(val Value, typ Type) (Value, error) }
The Type interface represents logical types of the parquet type system.
Types are immutable and therefore safe to access from multiple goroutines.
func FixedLenByteArrayType ¶
FixedLenByteArrayType constructs a type for fixed-length values of the given size (in bytes).
type Value ¶
type Value struct {
// contains filtered or unexported fields
}
The Value type is similar to the reflect.Value abstraction of Go values, but for parquet values. Value instances wrap underlying Go values mapped to one of the parquet physical types.
Value instances are small, immutable objects, and usually passed by value between function calls.
The zero-value of Value represents the null parquet value.
func BooleanValue ¶
BooleanValue constructs a BOOLEAN parquet value from the bool passed as argument.
func ByteArrayValue ¶
ByteArrayValue constructs a BYTE_ARRAY parquet value from the byte slice passed as argument.
func DoubleValue ¶
DoubleValue constructs a DOUBLE parquet value from the float64 passed as argument.
func FixedLenByteArrayValue ¶
FixedLenByteArrayValue constructs a FIXED_LEN_BYTE_ARRAY parquet value from the byte slice passed as argument.
func FloatValue ¶
FloatValue constructs a FLOAT parquet value from the float32 passed as argument.
func Int32Value ¶
Int32Value constructs an INT32 parquet value from the int32 passed as argument.
func Int64Value ¶
Int64Value constructs an INT64 parquet value from the int64 passed as argument.
func Int96Value ¶
func Int96Value(value deprecated.Int96) Value
Int96Value constructs an INT96 parquet value from the deprecated.Int96 passed as argument.
func NullValue ¶
func NullValue() Value
NullValue constructs a null value, which is the zero-value of the Value type.
func ValueOf ¶
func ValueOf(v interface{}) Value
ValueOf constructs a parquet value from a Go value v.
The physical type of the value is assumed from the Go type of v using the following conversion table:
Go type | Parquet physical type ------- | --------------------- nil | NULL bool | BOOLEAN int8 | INT32 int16 | INT32 int32 | INT32 int64 | INT64 int | INT64 uint8 | INT32 uint16 | INT32 uint32 | INT32 uint64 | INT64 uintptr | INT64 float32 | FLOAT float64 | DOUBLE string | BYTE_ARRAY []byte | BYTE_ARRAY [*]byte | FIXED_LEN_BYTE_ARRAY
When converting a []byte or [*]byte value, the underlying byte array is not copied; instead, the returned parquet value holds a reference to it.
The repetition and definition levels of the returned value are both zero.
The function panics if the Go value cannot be represented in parquet.
func (Value) AppendBytes ¶
AppendBytes appends the binary representation of v to b.
If v is the null value, b is returned unchanged.
func (Value) ByteArray ¶
ByteArray returns v as a []byte, assuming the underlying type is either BYTE_ARRAY or FIXED_LEN_BYTE_ARRAY.
The application must treat the returned byte slice as a read-only value, mutating the content will result in undefined behaviors.
func (Value) Bytes ¶
Bytes returns the binary representation of v.
If v is the null value, a nil byte slice is returned.
func (Value) Column ¶
Column returns the column index within the row that v was created from.
Returns -1 if the value does not carry a column index.
func (Value) DefinitionLevel ¶
DefinitionLevel returns the definition level of v.
func (Value) Format ¶
Format outputs a human-readable representation of v to w, using r as the formatting verb to describe how the value should be printed.
The following formatting options are supported:
%c prints the column index %+c prints the column index, prefixed with "C:" %d prints the definition level %+d prints the definition level, prefixed with "D:" %r prints the repetition level %+r prints the repetition level, prefixed with "R:" %q prints the quoted representation of v %+q prints the quoted representation of v, prefixed with "V:" %s prints the string representation of v %+s prints the string representation of v, prefixed with "V:" %v same as %s %+v prints a verbose representation of v %#v prints a Go value representation of v
Format satisfies the fmt.Formatter interface.
func (Value) Int96 ¶
func (v Value) Int96() deprecated.Int96
Int96 returns v as an int96, assuming the underlying type is INT96.
func (Value) Level ¶
Level returns v with the repetition level, definition level, and column index set to the values passed as arguments.
The method panics if either argument is negative.
func (Value) RepetitionLevel ¶
RepetitionLevel returns the repetition level of v.
type ValueReader ¶
type ValueReader interface { // Read values into the buffer passed as argument and return the number of // values read. When all values have been read, the error will be io.EOF. ReadValues([]Value) (int, error) }
ValueReader is an interface implemented by types that support reading batches of values.
type ValueReaderAt ¶
ValueReaderAt is an interface implemented by types that support reading values at offsets specified by the application.
type ValueReaderFrom ¶
type ValueReaderFrom interface {
ReadValuesFrom(ValueReader) (int64, error)
}
ValueReaderFrom is an interface implemented by value writers to read values from a reader.
type ValueReaderFunc ¶
ValueReaderFunc is a function type implementing the ValueReader interface.
func (ValueReaderFunc) ReadValues ¶
func (f ValueReaderFunc) ReadValues(values []Value) (int, error)
type ValueWriter ¶
type ValueWriter interface { // Write values from the buffer passed as argument and returns the number // of values written. WriteValues([]Value) (int, error) }
ValueWriter is an interface implemented by types that support writing batches of values.
type ValueWriterFunc ¶
ValueWriterFunc is a function type implementing the ValueWriter interface.
func (ValueWriterFunc) WriteValues ¶
func (f ValueWriterFunc) WriteValues(values []Value) (int, error)
type ValueWriterTo ¶
type ValueWriterTo interface {
WriteValuesTo(ValueWriter) (int64, error)
}
ValueWriterTo is an interface implemented by value readers to write values to a writer.
type Writer
deprecated
type Writer struct {
// contains filtered or unexported fields
}
Deprecated: A Writer uses a parquet schema and sequence of Go values to produce a parquet file to an io.Writer.
This example showcases a typical use of parquet writers:
writer := parquet.NewWriter(output) for _, row := range rows { if err := writer.Write(row); err != nil { ... } } if err := writer.Close(); err != nil { ... }
The Writer type optimizes for minimal memory usage, each page is written as soon as it has been filled so only a single page per column needs to be held in memory and as a result, there are no opportunities to sort rows within an entire row group. Programs that need to produce parquet files with sorted row groups should use the Buffer type to buffer and sort the rows prior to writing them to a Writer.
For programs building with Go 1.18 or later, the GenericWriter[T] type supersedes this one.
func NewWriter ¶
func NewWriter(output io.Writer, options ...WriterOption) *Writer
NewWriter constructs a parquet writer writing a file to the given io.Writer.
The function panics if the writer configuration is invalid. Programs that cannot guarantee the validity of the options passed to NewWriter should construct the writer configuration independently prior to calling this function:
config, err := parquet.NewWriterConfig(options...) if err != nil { // handle the configuration error ... } else { // this call to create a writer is guaranteed not to panic writer := parquet.NewWriter(output, config) ... }
func (*Writer) Close ¶
Close must be called after all values were produced to the writer in order to flush all buffers and write the parquet footer.
func (*Writer) Flush ¶
Flush flushes all buffers into a row group to the underlying io.Writer.
Flush is called automatically on Close; calling it explicitly is only useful if the application needs to limit the size of row groups or wants to produce multiple row groups per file.
If the writer attempts to create more than MaxRowGroups row groups the method returns ErrTooManyRowGroups.
func (*Writer) ReadRowsFrom ¶
ReadRowsFrom reads rows from the reader passed as argument and writes them to w.
This is similar to calling WriteRow repeatedly, but will be more efficient if optimizations are supported by the reader.
func (*Writer) Reset ¶
Reset clears the state of the writer without flushing any of the buffers, and sets the output to the io.Writer passed as argument, allowing the writer to be reused to produce another parquet file.
Reset may be called at any time, including after a writer was closed.
func (*Writer) Schema ¶
Schema returns the schema of rows written by w.
The returned value will be nil if no schema has yet been configured on w.
func (*Writer) SetKeyValueMetadata ¶
SetKeyValueMetadata sets a key/value pair in the Parquet file metadata.
Keys are assumed to be unique; if the same key is repeated multiple times, the last value is retained. While the parquet format does not require unique keys, this design decision was made to optimize for the most common use case where applications leverage this extension mechanism to associate single values to keys. This may create incompatibilities with other parquet libraries, or may cause some key/value pairs to be lost when opening parquet files written with repeated keys. We can revisit this decision if it ever becomes a blocker.
func (*Writer) Write ¶
Write is called to write another row to the parquet file.
The method uses the parquet schema configured on w to traverse the Go value and decompose it into a set of columns and values. If no schema was passed to NewWriter, it is deduced from the Go type of the row, which then has to be a struct or pointer to struct.
func (*Writer) WriteRowGroup ¶
WriteRowGroup writes a row group to the parquet file.
Buffered rows will be flushed prior to writing rows from the group, unless the row group was empty in which case nothing is written to the file.
The content of the row group is flushed to the writer; after the method returns successfully, the row group will be empty and ready to be reused.
func (*Writer) WriteRows ¶
WriteRows is called to write rows to the parquet file.
The Writer must have been given a schema when NewWriter was called, otherwise the structure of the parquet file cannot be determined from the row only.
The row is expected to contain values for each column of the writer's schema, in the order produced by the parquet.(*Schema).Deconstruct method.
type WriterConfig ¶
type WriterConfig struct { CreatedBy string ColumnPageBuffers BufferPool ColumnIndexSizeLimit int PageBufferSize int WriteBufferSize int DataPageVersion int DataPageStatistics bool MaxRowsPerRowGroup int64 KeyValueMetadata map[string]string Schema *Schema BloomFilters []BloomFilterColumn Compression compress.Codec Sorting SortingConfig }
The WriterConfig type carries configuration options for parquet writers.
WriterConfig implements the WriterOption interface so it can be used directly as argument to the NewWriter function when needed, for example:
writer := parquet.NewWriter(output, schema, &parquet.WriterConfig{ CreatedBy: "my test program", })
func DefaultWriterConfig ¶
func DefaultWriterConfig() *WriterConfig
DefaultWriterConfig returns a new WriterConfig value initialized with the default writer configuration.
func NewWriterConfig ¶
func NewWriterConfig(options ...WriterOption) (*WriterConfig, error)
NewWriterConfig constructs a new writer configuration applying the options passed as arguments.
The function returns a non-nil error if some of the options carried invalid configuration values.
func (*WriterConfig) Apply ¶
func (c *WriterConfig) Apply(options ...WriterOption)
Apply applies the given list of options to c.
func (*WriterConfig) ConfigureWriter ¶
func (c *WriterConfig) ConfigureWriter(config *WriterConfig)
ConfigureWriter applies configuration options from c to config.
func (*WriterConfig) Validate ¶
func (c *WriterConfig) Validate() error
Validate returns a non-nil error if the configuration of c is invalid.
type WriterOption ¶
type WriterOption interface {
ConfigureWriter(*WriterConfig)
}
WriterOption is an interface implemented by types that carry configuration options for parquet writers.
func BloomFilters ¶
func BloomFilters(filters ...BloomFilterColumn) WriterOption
BloomFilters creates a configuration option which defines the bloom filters that parquet writers should generate.
The compute and memory footprint of generating bloom filters for all columns of a parquet schema can be significant, so by default no filters are created and applications need to explicitly declare the columns that they want to create filters for.
func ColumnIndexSizeLimit ¶
func ColumnIndexSizeLimit(sizeLimit int) WriterOption
ColumnIndexSizeLimit creates a configuration option to customize the size limit of page boundaries recorded in column indexes.
Defaults to 16.
func ColumnPageBuffers ¶
func ColumnPageBuffers(buffers BufferPool) WriterOption
ColumnPageBuffers creates a configuration option to customize the buffer pool used when constructing row groups. This can be used to provide on-disk buffers as swap space to ensure that the parquet file creation will not be bottlenecked on the amount of memory available.
Defaults to using in-memory buffers.
func Compression ¶
func Compression(codec compress.Codec) WriterOption
Compression creates a configuration option which sets the default compression codec used by a writer for columns where none were defined.
func CreatedBy ¶
func CreatedBy(application, version, build string) WriterOption
CreatedBy creates a configuration option which sets the name of the application that created a parquet file.
The option formats the "CreatedBy" file metadata according to the convention described by the parquet spec:
"<application> version <version> (build <build>)"
By default, the option is set to the parquet-go module name, version, and build hash.
func DataPageStatistics ¶
func DataPageStatistics(enabled bool) WriterOption
DataPageStatistics creates a configuration option which defines whether data page statistics are emitted. This option is useful when generating parquet files that intend to be backward compatible with older readers which may not have the ability to load page statistics from the column index.
Defaults to false.
func DataPageVersion ¶
func DataPageVersion(version int) WriterOption
DataPageVersion creates a configuration option which configures the version of data pages used when creating a parquet file.
Defaults to version 2.
func KeyValueMetadata ¶
func KeyValueMetadata(key, value string) WriterOption
KeyValueMetadata creates a configuration option which adds a key/value pair to the metadata of parquet files.
This option is additive, it may be used multiple times to add more than one key/value pair.
Keys are assumed to be unique; if the same key is repeated multiple times, the last value is retained. While the parquet format does not require unique keys, this design decision was made to optimize for the most common use case where applications leverage this extension mechanism to associate single values to keys. This may create incompatibilities with other parquet libraries, or may cause some key/value pairs to be lost when opening parquet files written with repeated keys. We can revisit this decision if it ever becomes a blocker.
func MaxRowsPerRowGroup ¶
func MaxRowsPerRowGroup(numRows int64) WriterOption
MaxRowsPerRowGroup configures the maximum number of rows that a writer will produce in each row group.
This limit is useful to control the size of row groups in both number of rows and byte size. While controlling the byte size of a row group is difficult to achieve with parquet due to column encoding and compression, the number of rows remains a useful proxy.
Defaults to unlimited.
func PageBufferSize ¶
func PageBufferSize(size int) WriterOption
PageBufferSize configures the size of column page buffers on parquet writers.
Note that the page buffer size refers to the in-memory buffers where pages are generated, not the size of pages after encoding and compression. This design choice was made to help control the amount of memory needed to read and write pages rather than controlling the space used by the encoded representation on disk.
Defaults to 256KiB.
func SortingWriterConfig ¶
func SortingWriterConfig(options ...SortingOption) WriterOption
SortingWriterConfig is a writer option which applies configuration specific to sorting writers.
func WriteBufferSize ¶
func WriteBufferSize(size int) WriterOption
WriteBufferSize configures the size of the write buffer.
Setting the write buffer size to zero deactivates buffering; all writes are then immediately sent to the output io.Writer.
Defaults to 32KiB.
Source Files ¶
- allocator.go
- array.go
- array_go18.go
- bitmap.go
- bloom.go
- buffer.go
- buffer_go18.go
- buffer_pool.go
- column.go
- column_buffer.go
- column_buffer_amd64.go
- column_buffer_go18.go
- column_chunk.go
- column_index.go
- column_mapping.go
- column_path.go
- compare.go
- compress.go
- config.go
- convert.go
- dedupe.go
- dictionary.go
- dictionary_amd64.go
- encoding.go
- errors.go
- file.go
- filter.go
- level.go
- limits.go
- merge.go
- multi_row_group.go
- node.go
- null.go
- null_amd64.go
- offset_index.go
- order.go
- order_amd64.go
- page.go
- page_bounds.go
- page_bounds_amd64.go
- page_header.go
- page_max.go
- page_max_amd64.go
- page_min.go
- page_min_amd64.go
- page_values.go
- parquet.go
- parquet_amd64.go
- parquet_go18.go
- print.go
- reader.go
- reader_go18.go
- row.go
- row_buffer.go
- row_builder.go
- row_group.go
- scan.go
- schema.go
- search.go
- sorting.go
- transform.go
- type.go
- value.go
- value_amd64.go
- value_go18.go
- writer.go
- writer_go18.go
Directories ¶
Path | Synopsis |
---|---|
Package bloom implements parquet bloom filters.
|
Package bloom implements parquet bloom filters. |
xxhash
Package xxhash is an extension of github.com/cespare/xxhash which adds routines optimized to hash arrays of fixed size elements.
|
Package xxhash is an extension of github.com/cespare/xxhash which adds routines optimized to hash arrays of fixed size elements. |
Package compress provides the generic APIs implemented by parquet compression codecs.
|
Package compress provides the generic APIs implemented by parquet compression codecs. |
brotli
Package brotli implements the BROTLI parquet compression codec.
|
Package brotli implements the BROTLI parquet compression codec. |
gzip
Package gzip implements the GZIP parquet compression codec.
|
Package gzip implements the GZIP parquet compression codec. |
lz4
Package lz4 implements the LZ4_RAW parquet compression codec.
|
Package lz4 implements the LZ4_RAW parquet compression codec. |
snappy
Package snappy implements the SNAPPY parquet compression codec.
|
Package snappy implements the SNAPPY parquet compression codec. |
uncompressed
Package uncompressed provides implementations of the compression codec interfaces as pass-through without applying any compression nor decompression.
|
Package uncompressed provides implementations of the compression codec interfaces as pass-through without applying any compression nor decompression. |
zstd
Package zstd implements the ZSTD parquet compression codec.
|
Package zstd implements the ZSTD parquet compression codec. |
Package encoding provides the generic APIs implemented by parquet encodings in its sub-packages.
|
Package encoding provides the generic APIs implemented by parquet encodings in its sub-packages. |
fuzz
Package fuzz contains functions to help fuzz test parquet encodings.
|
Package fuzz contains functions to help fuzz test parquet encodings. |
plain
Package plain implements the PLAIN parquet encoding.
|
Package plain implements the PLAIN parquet encoding. |
rle
Package rle implements the hybrid RLE/Bit-Packed encoding employed in repetition and definition levels, dictionary indexed data pages, and boolean values in the PLAIN encoding.
|
Package rle implements the hybrid RLE/Bit-Packed encoding employed in repetition and definition levels, dictionary indexed data pages, and boolean values in the PLAIN encoding. |
Package hashprobe provides implementations of probing tables for various data types.
|
Package hashprobe provides implementations of probing tables for various data types. |
aeshash
Package aeshash implements hashing functions derived from the Go runtime's internal hashing based on the support of AES encryption in CPU instructions.
|
Package aeshash implements hashing functions derived from the Go runtime's internal hashing based on the support of AES encryption in CPU instructions. |
wyhash
Package wyhash implements a hashing algorithm derived from the Go runtime's internal hashing fallback, which uses a variation of the wyhash algorithm.
|
Package wyhash implements a hashing algorithm derived from the Go runtime's internal hashing fallback, which uses a variation of the wyhash algorithm. |
internal
|
|
bitpack
Package bitpack implements efficient bit packing and unpacking routines for integers of various bit widths.
|
Package bitpack implements efficient bit packing and unpacking routines for integers of various bit widths. |
bytealg
Package bytealg contains optimized algorithms operating on byte slices.
|
Package bytealg contains optimized algorithms operating on byte slices. |
unsafecast
Package unsafecast exposes functions to bypass the Go type system and perform conversions between types that would otherwise not be possible.
|
Package unsafecast exposes functions to bypass the Go type system and perform conversions between types that would otherwise not be possible. |
Package sparse contains abstractions to help work on arrays of values in sparse memory locations.
|
Package sparse contains abstractions to help work on arrays of values in sparse memory locations. |