The highest tagged major version is v17.

file

package

v11.0.0 Latest Latest Go to latest Published: Jan 18, 2023 License: Apache-2.0, BSD-2-Clause, BSD-3-Clause, + 8 more Imports: 30 Imported by: 6

Details

Valid go.mod file

The Go module system was introduced in Go 1.11 and is the official dependency management solution for Go.
Redistributable license

Redistributable licenses place minimal restrictions on how software can be used, modified, and redistributed.
Tagged version

Modules with tagged versions give importers more predictable builds.
Stable version

When a project reaches major version v1 it is considered stable.
Learn more about best practices

Repository

github.com/apache/arrow

Links

Open Source Insights

Documentation ¶

Rendered for

Index ¶

func DefLevelsToBitmap(defLevels []int16, info LevelInfo, out *ValidityBitmapInputOutput)
func DefRepLevelsToBitmap(defLevels, repLevels []int16, info LevelInfo, out *ValidityBitmapInputOutput) error
func DefRepLevelsToListInfo(defLevels, repLevels []int16, info LevelInfo, out *ValidityBitmapInputOutput, ...) error
type BinaryRecordReader
type BooleanColumnChunkReader
- func (c *BooleanColumnChunkReader) Descriptor() *schema.Column
- func (c *BooleanColumnChunkReader) Err() error
- func (c *BooleanColumnChunkReader) HasNext() bool
- func (cr *BooleanColumnChunkReader) ReadBatch(batchSize int64, values []bool, defLvls, repLvls []int16) (total int64, valuesRead int, err error)
- func (cr *BooleanColumnChunkReader) Skip(nvalues int64) (int64, error)
- func (c *BooleanColumnChunkReader) Type() parquet.Type
type BooleanColumnChunkWriter
- func NewBooleanColumnChunkWriter(meta *metadata.ColumnChunkMetaDataBuilder, pager PageWriter, useDict bool, ...) *BooleanColumnChunkWriter
- func (w *BooleanColumnChunkWriter) Close() (err error)
- func (w *BooleanColumnChunkWriter) Descr() *schema.Column
- func (w *BooleanColumnChunkWriter) EstimatedBufferedValueBytes() int64
- func (w *BooleanColumnChunkWriter) FlushBufferedDataPages() (err error)
- func (w *BooleanColumnChunkWriter) FlushCurrentPage() error
- func (w *BooleanColumnChunkWriter) HasBitsBuffer() bool
- func (w *BooleanColumnChunkWriter) LevelInfo() LevelInfo
- func (w *BooleanColumnChunkWriter) Properties() *parquet.WriterProperties
- func (w *BooleanColumnChunkWriter) RowsWritten() int
- func (w *BooleanColumnChunkWriter) SetBitsBuffer(buf *memory.Buffer)
- func (w *BooleanColumnChunkWriter) TotalBytesWritten() int64
- func (w *BooleanColumnChunkWriter) TotalCompressedBytes() int64
- func (w *BooleanColumnChunkWriter) Type() parquet.Type
- func (w *BooleanColumnChunkWriter) WriteBatch(values []bool, defLevels, repLevels []int16) (valueOffset int64, err error)
- func (w *BooleanColumnChunkWriter) WriteBatchSpaced(values []bool, defLevels, repLevels []int16, validBits []byte, ...)
- func (w *BooleanColumnChunkWriter) WriteDataPage(page DataPage) error
- func (w *BooleanColumnChunkWriter) WriteDefinitionLevels(levels []int16)
- func (w *BooleanColumnChunkWriter) WriteDictionaryPage() error
- func (w *BooleanColumnChunkWriter) WriteRepetitionLevels(levels []int16)
type BufferedRowGroupWriter
type ByteArrayColumnChunkReader
- func (c *ByteArrayColumnChunkReader) Descriptor() *schema.Column
- func (c *ByteArrayColumnChunkReader) Err() error
- func (c *ByteArrayColumnChunkReader) HasNext() bool
- func (cr *ByteArrayColumnChunkReader) ReadBatch(batchSize int64, values []parquet.ByteArray, defLvls, repLvls []int16) (total int64, valuesRead int, err error)
- func (cr *ByteArrayColumnChunkReader) Skip(nvalues int64) (int64, error)
- func (c *ByteArrayColumnChunkReader) Type() parquet.Type
type ByteArrayColumnChunkWriter
- func NewByteArrayColumnChunkWriter(meta *metadata.ColumnChunkMetaDataBuilder, pager PageWriter, useDict bool, ...) *ByteArrayColumnChunkWriter
- func (w *ByteArrayColumnChunkWriter) Close() (err error)
- func (w *ByteArrayColumnChunkWriter) Descr() *schema.Column
- func (w *ByteArrayColumnChunkWriter) EstimatedBufferedValueBytes() int64
- func (w *ByteArrayColumnChunkWriter) FlushBufferedDataPages() (err error)
- func (w *ByteArrayColumnChunkWriter) FlushCurrentPage() error
- func (w *ByteArrayColumnChunkWriter) HasBitsBuffer() bool
- func (w *ByteArrayColumnChunkWriter) LevelInfo() LevelInfo
- func (w *ByteArrayColumnChunkWriter) Properties() *parquet.WriterProperties
- func (w *ByteArrayColumnChunkWriter) RowsWritten() int
- func (w *ByteArrayColumnChunkWriter) SetBitsBuffer(buf *memory.Buffer)
- func (w *ByteArrayColumnChunkWriter) TotalBytesWritten() int64
- func (w *ByteArrayColumnChunkWriter) TotalCompressedBytes() int64
- func (w *ByteArrayColumnChunkWriter) Type() parquet.Type
- func (w *ByteArrayColumnChunkWriter) WriteBatch(values []parquet.ByteArray, defLevels, repLevels []int16) (valueOffset int64, err error)
- func (w *ByteArrayColumnChunkWriter) WriteBatchSpaced(values []parquet.ByteArray, defLevels, repLevels []int16, validBits []byte, ...)
- func (w *ByteArrayColumnChunkWriter) WriteDataPage(page DataPage) error
- func (w *ByteArrayColumnChunkWriter) WriteDefinitionLevels(levels []int16)
- func (w *ByteArrayColumnChunkWriter) WriteDictionaryPage() error
- func (w *ByteArrayColumnChunkWriter) WriteRepetitionLevels(levels []int16)
type ColumnChunkReader
- func NewColumnReader(descr *schema.Column, pageReader PageReader, mem memory.Allocator, ...) ColumnChunkReader
type ColumnChunkWriter
- func NewColumnChunkWriter(meta *metadata.ColumnChunkMetaDataBuilder, pager PageWriter, ...) ColumnChunkWriter
type CryptoContext
type DataPage
type DataPageV1
- func NewDataPageV1(buffer *memory.Buffer, num int32, ...) *DataPageV1
- func NewDataPageV1WithStats(buffer *memory.Buffer, num int32, ...) *DataPageV1
- func (p *DataPageV1) Data() []byte
- func (d *DataPageV1) DefinitionLevelEncoding() parquet.Encoding
- func (p *DataPageV1) Encoding() format.Encoding
- func (p *DataPageV1) NumValues() int32
- func (d *DataPageV1) Release()
- func (d *DataPageV1) RepetitionLevelEncoding() parquet.Encoding
- func (d *DataPageV1) Statistics() metadata.EncodedStatistics
- func (p *DataPageV1) Type() format.PageType
- func (d *DataPageV1) UncompressedSize() int32
type DataPageV2
- func NewDataPageV2(buffer *memory.Buffer, numValues, numNulls, numRows int32, ...) *DataPageV2
- func NewDataPageV2WithStats(buffer *memory.Buffer, numValues, numNulls, numRows int32, ...) *DataPageV2
- func (p *DataPageV2) Data() []byte
- func (d *DataPageV2) DefinitionLevelByteLen() int32
- func (p *DataPageV2) Encoding() format.Encoding
- func (d *DataPageV2) IsCompressed() bool
- func (d *DataPageV2) NumNulls() int32
- func (d *DataPageV2) NumRows() int32
- func (p *DataPageV2) NumValues() int32
- func (d *DataPageV2) Release()
- func (d *DataPageV2) RepetitionLevelByteLen() int32
- func (d *DataPageV2) Statistics() metadata.EncodedStatistics
- func (p *DataPageV2) Type() format.PageType
- func (d *DataPageV2) UncompressedSize() int32
type DictionaryPage
- func NewDictionaryPage(buffer *memory.Buffer, nvals int32, encoding parquet.Encoding) *DictionaryPage
- func (p *DictionaryPage) Data() []byte
- func (p *DictionaryPage) Encoding() format.Encoding
- func (d *DictionaryPage) IsSorted() bool
- func (p *DictionaryPage) NumValues() int32
- func (d *DictionaryPage) Release()
- func (p *DictionaryPage) Type() format.PageType
type FixedLenByteArrayColumnChunkReader
- func (c *FixedLenByteArrayColumnChunkReader) Descriptor() *schema.Column
- func (c *FixedLenByteArrayColumnChunkReader) Err() error
- func (c *FixedLenByteArrayColumnChunkReader) HasNext() bool
- func (cr *FixedLenByteArrayColumnChunkReader) ReadBatch(batchSize int64, values []parquet.FixedLenByteArray, defLvls, repLvls []int16) (total int64, valuesRead int, err error)
- func (cr *FixedLenByteArrayColumnChunkReader) Skip(nvalues int64) (int64, error)
- func (c *FixedLenByteArrayColumnChunkReader) Type() parquet.Type
type FixedLenByteArrayColumnChunkWriter
- func NewFixedLenByteArrayColumnChunkWriter(meta *metadata.ColumnChunkMetaDataBuilder, pager PageWriter, useDict bool, ...) *FixedLenByteArrayColumnChunkWriter
- func (w *FixedLenByteArrayColumnChunkWriter) Close() (err error)
- func (w *FixedLenByteArrayColumnChunkWriter) Descr() *schema.Column
- func (w *FixedLenByteArrayColumnChunkWriter) EstimatedBufferedValueBytes() int64
- func (w *FixedLenByteArrayColumnChunkWriter) FlushBufferedDataPages() (err error)
- func (w *FixedLenByteArrayColumnChunkWriter) FlushCurrentPage() error
- func (w *FixedLenByteArrayColumnChunkWriter) HasBitsBuffer() bool
- func (w *FixedLenByteArrayColumnChunkWriter) LevelInfo() LevelInfo
- func (w *FixedLenByteArrayColumnChunkWriter) Properties() *parquet.WriterProperties
- func (w *FixedLenByteArrayColumnChunkWriter) RowsWritten() int
- func (w *FixedLenByteArrayColumnChunkWriter) SetBitsBuffer(buf *memory.Buffer)
- func (w *FixedLenByteArrayColumnChunkWriter) TotalBytesWritten() int64
- func (w *FixedLenByteArrayColumnChunkWriter) TotalCompressedBytes() int64
- func (w *FixedLenByteArrayColumnChunkWriter) Type() parquet.Type
- func (w *FixedLenByteArrayColumnChunkWriter) WriteBatch(values []parquet.FixedLenByteArray, defLevels, repLevels []int16) (valueOffset int64, err error)
- func (w *FixedLenByteArrayColumnChunkWriter) WriteBatchSpaced(values []parquet.FixedLenByteArray, defLevels, repLevels []int16, ...)
- func (w *FixedLenByteArrayColumnChunkWriter) WriteDataPage(page DataPage) error
- func (w *FixedLenByteArrayColumnChunkWriter) WriteDefinitionLevels(levels []int16)
- func (w *FixedLenByteArrayColumnChunkWriter) WriteDictionaryPage() error
- func (w *FixedLenByteArrayColumnChunkWriter) WriteRepetitionLevels(levels []int16)
type Float32ColumnChunkReader
- func (c *Float32ColumnChunkReader) Descriptor() *schema.Column
- func (c *Float32ColumnChunkReader) Err() error
- func (c *Float32ColumnChunkReader) HasNext() bool
- func (cr *Float32ColumnChunkReader) ReadBatch(batchSize int64, values []float32, defLvls, repLvls []int16) (total int64, valuesRead int, err error)
- func (cr *Float32ColumnChunkReader) Skip(nvalues int64) (int64, error)
- func (c *Float32ColumnChunkReader) Type() parquet.Type
type Float32ColumnChunkWriter
- func NewFloat32ColumnChunkWriter(meta *metadata.ColumnChunkMetaDataBuilder, pager PageWriter, useDict bool, ...) *Float32ColumnChunkWriter
- func (w *Float32ColumnChunkWriter) Close() (err error)
- func (w *Float32ColumnChunkWriter) Descr() *schema.Column
- func (w *Float32ColumnChunkWriter) EstimatedBufferedValueBytes() int64
- func (w *Float32ColumnChunkWriter) FlushBufferedDataPages() (err error)
- func (w *Float32ColumnChunkWriter) FlushCurrentPage() error
- func (w *Float32ColumnChunkWriter) HasBitsBuffer() bool
- func (w *Float32ColumnChunkWriter) LevelInfo() LevelInfo
- func (w *Float32ColumnChunkWriter) Properties() *parquet.WriterProperties
- func (w *Float32ColumnChunkWriter) RowsWritten() int
- func (w *Float32ColumnChunkWriter) SetBitsBuffer(buf *memory.Buffer)
- func (w *Float32ColumnChunkWriter) TotalBytesWritten() int64
- func (w *Float32ColumnChunkWriter) TotalCompressedBytes() int64
- func (w *Float32ColumnChunkWriter) Type() parquet.Type
- func (w *Float32ColumnChunkWriter) WriteBatch(values []float32, defLevels, repLevels []int16) (valueOffset int64, err error)
- func (w *Float32ColumnChunkWriter) WriteBatchSpaced(values []float32, defLevels, repLevels []int16, validBits []byte, ...)
- func (w *Float32ColumnChunkWriter) WriteDataPage(page DataPage) error
- func (w *Float32ColumnChunkWriter) WriteDefinitionLevels(levels []int16)
- func (w *Float32ColumnChunkWriter) WriteDictionaryPage() error
- func (w *Float32ColumnChunkWriter) WriteRepetitionLevels(levels []int16)
type Float64ColumnChunkReader
- func (c *Float64ColumnChunkReader) Descriptor() *schema.Column
- func (c *Float64ColumnChunkReader) Err() error
- func (c *Float64ColumnChunkReader) HasNext() bool
- func (cr *Float64ColumnChunkReader) ReadBatch(batchSize int64, values []float64, defLvls, repLvls []int16) (total int64, valuesRead int, err error)
- func (cr *Float64ColumnChunkReader) Skip(nvalues int64) (int64, error)
- func (c *Float64ColumnChunkReader) Type() parquet.Type
type Float64ColumnChunkWriter
- func NewFloat64ColumnChunkWriter(meta *metadata.ColumnChunkMetaDataBuilder, pager PageWriter, useDict bool, ...) *Float64ColumnChunkWriter
- func (w *Float64ColumnChunkWriter) Close() (err error)
- func (w *Float64ColumnChunkWriter) Descr() *schema.Column
- func (w *Float64ColumnChunkWriter) EstimatedBufferedValueBytes() int64
- func (w *Float64ColumnChunkWriter) FlushBufferedDataPages() (err error)
- func (w *Float64ColumnChunkWriter) FlushCurrentPage() error
- func (w *Float64ColumnChunkWriter) HasBitsBuffer() bool
- func (w *Float64ColumnChunkWriter) LevelInfo() LevelInfo
- func (w *Float64ColumnChunkWriter) Properties() *parquet.WriterProperties
- func (w *Float64ColumnChunkWriter) RowsWritten() int
- func (w *Float64ColumnChunkWriter) SetBitsBuffer(buf *memory.Buffer)
- func (w *Float64ColumnChunkWriter) TotalBytesWritten() int64
- func (w *Float64ColumnChunkWriter) TotalCompressedBytes() int64
- func (w *Float64ColumnChunkWriter) Type() parquet.Type
- func (w *Float64ColumnChunkWriter) WriteBatch(values []float64, defLevels, repLevels []int16) (valueOffset int64, err error)
- func (w *Float64ColumnChunkWriter) WriteBatchSpaced(values []float64, defLevels, repLevels []int16, validBits []byte, ...)
- func (w *Float64ColumnChunkWriter) WriteDataPage(page DataPage) error
- func (w *Float64ColumnChunkWriter) WriteDefinitionLevels(levels []int16)
- func (w *Float64ColumnChunkWriter) WriteDictionaryPage() error
- func (w *Float64ColumnChunkWriter) WriteRepetitionLevels(levels []int16)
type Int32ColumnChunkReader
- func (c *Int32ColumnChunkReader) Descriptor() *schema.Column
- func (c *Int32ColumnChunkReader) Err() error
- func (c *Int32ColumnChunkReader) HasNext() bool
- func (cr *Int32ColumnChunkReader) ReadBatch(batchSize int64, values []int32, defLvls, repLvls []int16) (total int64, valuesRead int, err error)
- func (cr *Int32ColumnChunkReader) Skip(nvalues int64) (int64, error)
- func (c *Int32ColumnChunkReader) Type() parquet.Type
type Int32ColumnChunkWriter
- func NewInt32ColumnChunkWriter(meta *metadata.ColumnChunkMetaDataBuilder, pager PageWriter, useDict bool, ...) *Int32ColumnChunkWriter
- func (w *Int32ColumnChunkWriter) Close() (err error)
- func (w *Int32ColumnChunkWriter) Descr() *schema.Column
- func (w *Int32ColumnChunkWriter) EstimatedBufferedValueBytes() int64
- func (w *Int32ColumnChunkWriter) FlushBufferedDataPages() (err error)
- func (w *Int32ColumnChunkWriter) FlushCurrentPage() error
- func (w *Int32ColumnChunkWriter) HasBitsBuffer() bool
- func (w *Int32ColumnChunkWriter) LevelInfo() LevelInfo
- func (w *Int32ColumnChunkWriter) Properties() *parquet.WriterProperties
- func (w *Int32ColumnChunkWriter) RowsWritten() int
- func (w *Int32ColumnChunkWriter) SetBitsBuffer(buf *memory.Buffer)
- func (w *Int32ColumnChunkWriter) TotalBytesWritten() int64
- func (w *Int32ColumnChunkWriter) TotalCompressedBytes() int64
- func (w *Int32ColumnChunkWriter) Type() parquet.Type
- func (w *Int32ColumnChunkWriter) WriteBatch(values []int32, defLevels, repLevels []int16) (valueOffset int64, err error)
- func (w *Int32ColumnChunkWriter) WriteBatchSpaced(values []int32, defLevels, repLevels []int16, validBits []byte, ...)
- func (w *Int32ColumnChunkWriter) WriteDataPage(page DataPage) error
- func (w *Int32ColumnChunkWriter) WriteDefinitionLevels(levels []int16)
- func (w *Int32ColumnChunkWriter) WriteDictionaryPage() error
- func (w *Int32ColumnChunkWriter) WriteRepetitionLevels(levels []int16)
type Int64ColumnChunkReader
- func (c *Int64ColumnChunkReader) Descriptor() *schema.Column
- func (c *Int64ColumnChunkReader) Err() error
- func (c *Int64ColumnChunkReader) HasNext() bool
- func (cr *Int64ColumnChunkReader) ReadBatch(batchSize int64, values []int64, defLvls, repLvls []int16) (total int64, valuesRead int, err error)
- func (cr *Int64ColumnChunkReader) Skip(nvalues int64) (int64, error)
- func (c *Int64ColumnChunkReader) Type() parquet.Type
type Int64ColumnChunkWriter
- func NewInt64ColumnChunkWriter(meta *metadata.ColumnChunkMetaDataBuilder, pager PageWriter, useDict bool, ...) *Int64ColumnChunkWriter
- func (w *Int64ColumnChunkWriter) Close() (err error)
- func (w *Int64ColumnChunkWriter) Descr() *schema.Column
- func (w *Int64ColumnChunkWriter) EstimatedBufferedValueBytes() int64
- func (w *Int64ColumnChunkWriter) FlushBufferedDataPages() (err error)
- func (w *Int64ColumnChunkWriter) FlushCurrentPage() error
- func (w *Int64ColumnChunkWriter) HasBitsBuffer() bool
- func (w *Int64ColumnChunkWriter) LevelInfo() LevelInfo
- func (w *Int64ColumnChunkWriter) Properties() *parquet.WriterProperties
- func (w *Int64ColumnChunkWriter) RowsWritten() int
- func (w *Int64ColumnChunkWriter) SetBitsBuffer(buf *memory.Buffer)
- func (w *Int64ColumnChunkWriter) TotalBytesWritten() int64
- func (w *Int64ColumnChunkWriter) TotalCompressedBytes() int64
- func (w *Int64ColumnChunkWriter) Type() parquet.Type
- func (w *Int64ColumnChunkWriter) WriteBatch(values []int64, defLevels, repLevels []int16) (valueOffset int64, err error)
- func (w *Int64ColumnChunkWriter) WriteBatchSpaced(values []int64, defLevels, repLevels []int16, validBits []byte, ...)
- func (w *Int64ColumnChunkWriter) WriteDataPage(page DataPage) error
- func (w *Int64ColumnChunkWriter) WriteDefinitionLevels(levels []int16)
- func (w *Int64ColumnChunkWriter) WriteDictionaryPage() error
- func (w *Int64ColumnChunkWriter) WriteRepetitionLevels(levels []int16)
type Int96ColumnChunkReader
- func (c *Int96ColumnChunkReader) Descriptor() *schema.Column
- func (c *Int96ColumnChunkReader) Err() error
- func (c *Int96ColumnChunkReader) HasNext() bool
- func (cr *Int96ColumnChunkReader) ReadBatch(batchSize int64, values []parquet.Int96, defLvls, repLvls []int16) (total int64, valuesRead int, err error)
- func (cr *Int96ColumnChunkReader) Skip(nvalues int64) (int64, error)
- func (c *Int96ColumnChunkReader) Type() parquet.Type
type Int96ColumnChunkWriter
- func NewInt96ColumnChunkWriter(meta *metadata.ColumnChunkMetaDataBuilder, pager PageWriter, useDict bool, ...) *Int96ColumnChunkWriter
- func (w *Int96ColumnChunkWriter) Close() (err error)
- func (w *Int96ColumnChunkWriter) Descr() *schema.Column
- func (w *Int96ColumnChunkWriter) EstimatedBufferedValueBytes() int64
- func (w *Int96ColumnChunkWriter) FlushBufferedDataPages() (err error)
- func (w *Int96ColumnChunkWriter) FlushCurrentPage() error
- func (w *Int96ColumnChunkWriter) HasBitsBuffer() bool
- func (w *Int96ColumnChunkWriter) LevelInfo() LevelInfo
- func (w *Int96ColumnChunkWriter) Properties() *parquet.WriterProperties
- func (w *Int96ColumnChunkWriter) RowsWritten() int
- func (w *Int96ColumnChunkWriter) SetBitsBuffer(buf *memory.Buffer)
- func (w *Int96ColumnChunkWriter) TotalBytesWritten() int64
- func (w *Int96ColumnChunkWriter) TotalCompressedBytes() int64
- func (w *Int96ColumnChunkWriter) Type() parquet.Type
- func (w *Int96ColumnChunkWriter) WriteBatch(values []parquet.Int96, defLevels, repLevels []int16) (valueOffset int64, err error)
- func (w *Int96ColumnChunkWriter) WriteBatchSpaced(values []parquet.Int96, defLevels, repLevels []int16, validBits []byte, ...)
- func (w *Int96ColumnChunkWriter) WriteDataPage(page DataPage) error
- func (w *Int96ColumnChunkWriter) WriteDefinitionLevels(levels []int16)
- func (w *Int96ColumnChunkWriter) WriteDictionaryPage() error
- func (w *Int96ColumnChunkWriter) WriteRepetitionLevels(levels []int16)
type LevelInfo
- func (l *LevelInfo) Equal(rhs *LevelInfo) bool
- func (l *LevelInfo) HasNullableValues() bool
- func (l *LevelInfo) Increment(n schema.Node)
- func (l *LevelInfo) IncrementOptional()
- func (l *LevelInfo) IncrementRepeated() int16
type Page
type PageReader
- func NewPageReader(r parquet.BufferedReader, nrows int64, compressType compress.Compression, ...) (PageReader, error)
type PageWriter
- func NewPageWriter(sink utils.WriterTell, codec compress.Compression, compressionLevel int, ...) (PageWriter, error)
type ReadOption
- func WithMetadata(m *metadata.FileMetaData) ReadOption
- func WithReadProps(props *parquet.ReaderProperties) ReadOption
type Reader
- func NewParquetReader(r parquet.ReaderAtSeeker, opts ...ReadOption) (*Reader, error)
- func OpenParquetFile(filename string, memoryMap bool, opts ...ReadOption) (*Reader, error)
- func (f *Reader) BufferPool() *sync.Pool
- func (f *Reader) Close() error
- func (f *Reader) MetaData() *metadata.FileMetaData
- func (f *Reader) NumRowGroups() int
- func (f *Reader) NumRows() int64
- func (f *Reader) RowGroup(i int) *RowGroupReader
- func (f *Reader) WriterVersion() *metadata.AppVersion
type RecordReader
- func NewRecordReader(descr *schema.Column, info LevelInfo, readDict bool, mem memory.Allocator, ...) RecordReader
type RowGroupReader
- func (r *RowGroupReader) ByteSize() int64
- func (r *RowGroupReader) Column(i int) (ColumnChunkReader, error)
- func (r *RowGroupReader) GetColumnPageReader(i int) (PageReader, error)
- func (r *RowGroupReader) MetaData() *metadata.RowGroupMetaData
- func (r *RowGroupReader) NumColumns() int
- func (r *RowGroupReader) NumRows() int64
type RowGroupWriter
type SerialRowGroupWriter
type ValidityBitmapInputOutput
type WriteOption
- func WithWriteMetadata(meta metadata.KeyValueMetadata) WriteOption
- func WithWriterProps(props *parquet.WriterProperties) WriteOption
type Writer
- func NewParquetWriter(w io.Writer, sc *schema.GroupNode, opts ...WriteOption) *Writer
- func (fw *Writer) AppendBufferedRowGroup() BufferedRowGroupWriter
- func (fw *Writer) AppendRowGroup() SerialRowGroupWriter
- func (fw *Writer) Close() error
- func (fw *Writer) NumColumns() int
- func (fw *Writer) NumRowGroups() int
- func (fw *Writer) NumRows() int
- func (fw *Writer) Properties() *parquet.WriterProperties

Constants ¶

This section is empty.

Variables ¶

This section is empty.

Functions ¶

func DefLevelsToBitmap ¶

func DefLevelsToBitmap(defLevels []int16, info LevelInfo, out *ValidityBitmapInputOutput)

DefLevelsToBitmap creates a validity bitmap out of the passed-in definition levels and info object.

func DefRepLevelsToBitmap ¶

func DefRepLevelsToBitmap(defLevels, repLevels []int16, info LevelInfo, out *ValidityBitmapInputOutput) error

DefRepLevelsToBitmap constructs a full validity bitmap out of the definition and repetition levels, properly handling nested lists and parents.

func DefRepLevelsToListInfo ¶

func DefRepLevelsToListInfo(defLevels, repLevels []int16, info LevelInfo, out *ValidityBitmapInputOutput, offsets []int32) error

DefRepLevelsToListInfo takes in the definition and repetition levels in order to populate the validity bitmap and properly handle nested lists and update the offsets for them.

Types ¶

type BinaryRecordReader ¶

type BinaryRecordReader interface {
	RecordReader
	GetBuilderChunks() []arrow.Array
}

BinaryRecordReader provides an extra GetBuilderChunks function above and beyond the plain RecordReader to allow for efficiently building chunked arrays.

type BooleanColumnChunkReader ¶

type BooleanColumnChunkReader struct {
	// contains filtered or unexported fields
}

BooleanColumnChunkReader is the Typed Column chunk reader instance for reading Boolean column data.

func (*BooleanColumnChunkReader) Descriptor ¶

func (c *BooleanColumnChunkReader) Descriptor() *schema.Column

func (*BooleanColumnChunkReader) Err ¶

func (c *BooleanColumnChunkReader) Err() error

func (*BooleanColumnChunkReader) HasNext ¶

func (c *BooleanColumnChunkReader) HasNext() bool

HasNext returns whether there is more data to be read in this column and row group.

func (*BooleanColumnChunkReader) ReadBatch ¶

func (cr *BooleanColumnChunkReader) ReadBatch(batchSize int64, values []bool, defLvls, repLvls []int16) (total int64, valuesRead int, err error)

ReadBatch reads batchSize values from the column.

Returns error if values is not at least big enough to hold the number of values that will be read.

defLvls and repLvls can be nil, or will be populated if not nil. If not nil, they must be at least large enough to hold the number of values that will be read.

total is the number of rows that were read; valuesRead is the actual number of physical values that were read, excluding nulls.

func (*BooleanColumnChunkReader) Skip ¶

func (cr *BooleanColumnChunkReader) Skip(nvalues int64) (int64, error)

Skip skips the next nvalues so that the next call to ReadBatch will start reading *after* the skipped values.

func (*BooleanColumnChunkReader) Type ¶

func (c *BooleanColumnChunkReader) Type() parquet.Type

type BooleanColumnChunkWriter ¶

type BooleanColumnChunkWriter struct {
	// contains filtered or unexported fields
}

BooleanColumnChunkWriter is the typed interface for writing columns to a parquet file for Boolean columns.

func NewBooleanColumnChunkWriter ¶

func NewBooleanColumnChunkWriter(meta *metadata.ColumnChunkMetaDataBuilder, pager PageWriter, useDict bool, enc parquet.Encoding, props *parquet.WriterProperties) *BooleanColumnChunkWriter

NewBooleanColumnChunkWriter constructs a new column writer using the given metadata chunk builder, provided Pager, and desired encoding and properties.

This will likely not be often called directly by consumers but rather used internally.

ColumnChunkWriters should be acquired by using fileWriter and RowGroupWriter objects.

func (*BooleanColumnChunkWriter) Close ¶

func (w *BooleanColumnChunkWriter) Close() (err error)

func (*BooleanColumnChunkWriter) Descr ¶

func (w *BooleanColumnChunkWriter) Descr() *schema.Column

func (*BooleanColumnChunkWriter) EstimatedBufferedValueBytes ¶

func (w *BooleanColumnChunkWriter) EstimatedBufferedValueBytes() int64

func (*BooleanColumnChunkWriter) FlushBufferedDataPages ¶

func (w *BooleanColumnChunkWriter) FlushBufferedDataPages() (err error)

func (*BooleanColumnChunkWriter) FlushCurrentPage ¶

func (w *BooleanColumnChunkWriter) FlushCurrentPage() error

func (*BooleanColumnChunkWriter) HasBitsBuffer ¶

func (w *BooleanColumnChunkWriter) HasBitsBuffer() bool

func (*BooleanColumnChunkWriter) LevelInfo ¶

func (w *BooleanColumnChunkWriter) LevelInfo() LevelInfo

func (*BooleanColumnChunkWriter) Properties ¶

func (w *BooleanColumnChunkWriter) Properties() *parquet.WriterProperties

func (*BooleanColumnChunkWriter) RowsWritten ¶

func (w *BooleanColumnChunkWriter) RowsWritten() int

func (*BooleanColumnChunkWriter) SetBitsBuffer ¶

func (w *BooleanColumnChunkWriter) SetBitsBuffer(buf *memory.Buffer)

func (*BooleanColumnChunkWriter) TotalBytesWritten ¶

func (w *BooleanColumnChunkWriter) TotalBytesWritten() int64

func (*BooleanColumnChunkWriter) TotalCompressedBytes ¶

func (w *BooleanColumnChunkWriter) TotalCompressedBytes() int64

func (*BooleanColumnChunkWriter) Type ¶

func (w *BooleanColumnChunkWriter) Type() parquet.Type

func (*BooleanColumnChunkWriter) WriteBatch ¶

func (w *BooleanColumnChunkWriter) WriteBatch(values []bool, defLevels, repLevels []int16) (valueOffset int64, err error)

WriteBatch writes a batch of repetition levels, definition levels, and values to the column. defLevels (resp. repLevels) can be nil if the column's max definition level (resp. max repetition level) is 0. If not nil, each of defLevels and repLevels must have at least `len(values)` elements.

The number of physical values written (taken from `values`) is returned. It can be smaller than `len(values)` if there are some undefined values.

When using DataPageV2 to write a repeated column rows cannot cross data page boundaries. To ensure this the writer ensures that every batch of w.props.BatchSize begins and ends on a row boundary. As a consequence, the first value to WriteBatch must always be the beginning of a row if repLevels is not nil (repLevels[0] should always be 0) and using DataPageV2.

func (*BooleanColumnChunkWriter) WriteBatchSpaced ¶

func (w *BooleanColumnChunkWriter) WriteBatchSpaced(values []bool, defLevels, repLevels []int16, validBits []byte, validBitsOffset int64)

WriteBatchSpaced writes a batch of repetition levels, definition levels, and values to the column.

In comparison to WriteBatch, the length of the repetition and definition levels is the same as the number of values read for max_definition_level == 1. In the case of max_definition_level > 1, the repetition and definition levels are larger than the values, but the values include the null entries with definition_level == (max_definition_level - 1). Thus we have to differentiate in the parameters of this function whether the input has the length of num_values or the _number of rows in the lowest nesting level_.

In the case that the innermost node in the Parquet schema is required, the _number of rows in the lowest nesting level_ is equal to the number of non-null values. If the innermost schema node is optional, the _number of rows in the lowest nesting level_ also includes all values with definition_level == (max_definition_level - 1).

func (*BooleanColumnChunkWriter) WriteDataPage ¶

func (w *BooleanColumnChunkWriter) WriteDataPage(page DataPage) error

func (*BooleanColumnChunkWriter) WriteDefinitionLevels ¶

func (w *BooleanColumnChunkWriter) WriteDefinitionLevels(levels []int16)

func (*BooleanColumnChunkWriter) WriteDictionaryPage ¶

func (w *BooleanColumnChunkWriter) WriteDictionaryPage() error

func (*BooleanColumnChunkWriter) WriteRepetitionLevels ¶

func (w *BooleanColumnChunkWriter) WriteRepetitionLevels(levels []int16)

type BufferedRowGroupWriter ¶

type BufferedRowGroupWriter interface {
	RowGroupWriter
	Column(i int) (ColumnChunkWriter, error)
}

BufferedRowGroupWriter allows writing to multiple columns simultaneously, data will not be flushed to the underlying writer until closing the RowGroupWriter.

All columns must have equal numbers of rows before closing the row group or it will panic.

type ByteArrayColumnChunkReader ¶

type ByteArrayColumnChunkReader struct {
	// contains filtered or unexported fields
}

ByteArrayColumnChunkReader is the Typed Column chunk reader instance for reading ByteArray column data.

func (*ByteArrayColumnChunkReader) Descriptor ¶

func (c *ByteArrayColumnChunkReader) Descriptor() *schema.Column

func (*ByteArrayColumnChunkReader) Err ¶

func (c *ByteArrayColumnChunkReader) Err() error

func (*ByteArrayColumnChunkReader) HasNext ¶

func (c *ByteArrayColumnChunkReader) HasNext() bool

HasNext returns whether there is more data to be read in this column and row group.

func (*ByteArrayColumnChunkReader) ReadBatch ¶

func (cr *ByteArrayColumnChunkReader) ReadBatch(batchSize int64, values []parquet.ByteArray, defLvls, repLvls []int16) (total int64, valuesRead int, err error)

ReadBatch reads batchSize values from the column.

Returns error if values is not at least big enough to hold the number of values that will be read.

defLvls and repLvls can be nil, or will be populated if not nil. If not nil, they must be at least large enough to hold the number of values that will be read.

total is the number of rows that were read; valuesRead is the actual number of physical values that were read, excluding nulls.

func (*ByteArrayColumnChunkReader) Skip ¶

func (cr *ByteArrayColumnChunkReader) Skip(nvalues int64) (int64, error)

Skip skips the next nvalues so that the next call to ReadBatch will start reading *after* the skipped values.

func (*ByteArrayColumnChunkReader) Type ¶

func (c *ByteArrayColumnChunkReader) Type() parquet.Type

type ByteArrayColumnChunkWriter ¶

type ByteArrayColumnChunkWriter struct {
	// contains filtered or unexported fields
}

ByteArrayColumnChunkWriter is the typed interface for writing columns to a parquet file for ByteArray columns.

func NewByteArrayColumnChunkWriter ¶

func NewByteArrayColumnChunkWriter(meta *metadata.ColumnChunkMetaDataBuilder, pager PageWriter, useDict bool, enc parquet.Encoding, props *parquet.WriterProperties) *ByteArrayColumnChunkWriter

NewByteArrayColumnChunkWriter constructs a new column writer using the given metadata chunk builder, the provided PageWriter, and the desired encoding and properties.

This will likely not be often called directly by consumers but rather used internally.

ColumnChunkWriters should be acquired by using fileWriter and RowGroupWriter objects

func (*ByteArrayColumnChunkWriter) Close ¶

func (w *ByteArrayColumnChunkWriter) Close() (err error)

func (*ByteArrayColumnChunkWriter) Descr ¶

func (w *ByteArrayColumnChunkWriter) Descr() *schema.Column

func (*ByteArrayColumnChunkWriter) EstimatedBufferedValueBytes ¶

func (w *ByteArrayColumnChunkWriter) EstimatedBufferedValueBytes() int64

func (*ByteArrayColumnChunkWriter) FlushBufferedDataPages ¶

func (w *ByteArrayColumnChunkWriter) FlushBufferedDataPages() (err error)

func (*ByteArrayColumnChunkWriter) FlushCurrentPage ¶

func (w *ByteArrayColumnChunkWriter) FlushCurrentPage() error

func (*ByteArrayColumnChunkWriter) HasBitsBuffer ¶

func (w *ByteArrayColumnChunkWriter) HasBitsBuffer() bool

func (*ByteArrayColumnChunkWriter) LevelInfo ¶

func (w *ByteArrayColumnChunkWriter) LevelInfo() LevelInfo

func (*ByteArrayColumnChunkWriter) Properties ¶

func (w *ByteArrayColumnChunkWriter) Properties() *parquet.WriterProperties

func (*ByteArrayColumnChunkWriter) RowsWritten ¶

func (w *ByteArrayColumnChunkWriter) RowsWritten() int

func (*ByteArrayColumnChunkWriter) SetBitsBuffer ¶

func (w *ByteArrayColumnChunkWriter) SetBitsBuffer(buf *memory.Buffer)

func (*ByteArrayColumnChunkWriter) TotalBytesWritten ¶

func (w *ByteArrayColumnChunkWriter) TotalBytesWritten() int64

func (*ByteArrayColumnChunkWriter) TotalCompressedBytes ¶

func (w *ByteArrayColumnChunkWriter) TotalCompressedBytes() int64

func (*ByteArrayColumnChunkWriter) Type ¶

func (w *ByteArrayColumnChunkWriter) Type() parquet.Type

func (*ByteArrayColumnChunkWriter) WriteBatch ¶

func (w *ByteArrayColumnChunkWriter) WriteBatch(values []parquet.ByteArray, defLevels, repLevels []int16) (valueOffset int64, err error)

WriteBatch writes a batch of repetition levels, definition levels, and values to the column. `defLevels` (resp. `repLevels`) can be nil if the column's max definition level (resp. max repetition level) is 0. If not nil, each of `defLevels` and `repLevels` must have at least `len(values)` elements.

The number of physical values written (taken from `values`) is returned. It can be smaller than `len(values)` if there are some undefined values.

When using DataPageV2 to write a repeated column, rows cannot cross data page boundaries. To guarantee this, the writer ensures that every batch of w.props.BatchSize begins and ends on a row boundary. As a consequence, when using DataPageV2 and repLevels is not nil, the first value passed to WriteBatch must always be the beginning of a row (repLevels[0] should always be 0).

func (*ByteArrayColumnChunkWriter) WriteBatchSpaced ¶

func (w *ByteArrayColumnChunkWriter) WriteBatchSpaced(values []parquet.ByteArray, defLevels, repLevels []int16, validBits []byte, validBitsOffset int64)

WriteBatchSpaced writes a batch of repetition levels, definition levels, and values to the column.

In comparison to WriteBatch, the length of repetition and definition levels is the same as the number of values read for max_definition_level == 1. In the case of max_definition_level > 1, the repetition and definition levels are larger than the values but the values include the null entries with definition_level == (max_definition_level - 1). Thus we have to differentiate in the parameters of this function if the input has the length of num_values or the _number of rows in the lowest nesting level_.

In the case that the innermost node in the Parquet schema is required, the _number of rows in the lowest nesting level_ is equal to the number of non-null values. If the innermost schema node is optional, the _number of rows in the lowest nesting level_ also includes all values with definition_level == (max_definition_level - 1).

func (*ByteArrayColumnChunkWriter) WriteDataPage ¶

func (w *ByteArrayColumnChunkWriter) WriteDataPage(page DataPage) error

func (*ByteArrayColumnChunkWriter) WriteDefinitionLevels ¶

func (w *ByteArrayColumnChunkWriter) WriteDefinitionLevels(levels []int16)

func (*ByteArrayColumnChunkWriter) WriteDictionaryPage ¶

func (w *ByteArrayColumnChunkWriter) WriteDictionaryPage() error

func (*ByteArrayColumnChunkWriter) WriteRepetitionLevels ¶

func (w *ByteArrayColumnChunkWriter) WriteRepetitionLevels(levels []int16)

type ColumnChunkReader ¶

type ColumnChunkReader interface {
	// HasNext returns whether there is more data to be read in this column
	// and row group.
	HasNext() bool
	// Type returns the underlying physical type of the column
	Type() parquet.Type
	// Descriptor returns the column schema container
	Descriptor() *schema.Column
	// if HasNext returns false because of an error, this will return the error
	// it encountered. Otherwise this will be nil if it's just the end of the
	// column
	Err() error
	// contains filtered or unexported methods
}

ColumnChunkReader is the basic interface for all column readers. It will use a page reader to read all the pages in a column chunk from a row group.

To actually read out the column data, you need to convert to the properly typed ColumnChunkReader type, such as *BooleanColumnChunkReader.

Some things to clarify when working with column readers:

"Values" refers to the physical data values in a data page.

This is separate from the number of "rows" in a column and the total number of "elements" in a column because null values aren't stored physically in the data page but are represented via definition levels, so the number of values in a column can be less than the number of rows.

The total number of "elements" in a column also differs because of potential repeated fields, where you can have multiple values in the page which together make up a single element (such as a list) or depending on the repetition level and definition level, could represent an entire null list or just a null element inside of a list.

func NewColumnReader ¶

func NewColumnReader(descr *schema.Column, pageReader PageReader, mem memory.Allocator, bufferPool *sync.Pool) ColumnChunkReader

NewColumnReader returns a column reader for the provided column initialized with the given pagereader that will provide the pages of data for this column. The type is determined from the column passed in.

In addition to the page reader and allocator, a pointer to a shared sync.Pool is expected to provide buffers for temporary usage to minimize allocations. The bufferPool should provide *memory.Buffer objects that can be resized as necessary, buffers should have `ResizeNoShrink(0)` called on them before being put back into the pool.

type ColumnChunkWriter ¶

type ColumnChunkWriter interface {
	// Close ends this column, returning an error if closing fails
	Close() error
	// Type returns the underlying physical parquet type for this column
	Type() parquet.Type
	// Descr returns the column information for this writer
	Descr() *schema.Column
	// RowsWritten returns the number of rows that have so far been written with this writer
	RowsWritten() int
	// TotalCompressedBytes returns the number of bytes, after compression, that have been written so far
	TotalCompressedBytes() int64
	// TotalBytesWritten includes the bytes for writing dictionary pages, while TotalCompressedBytes is
	// just the data and page headers
	TotalBytesWritten() int64
	// Properties returns the current WriterProperties in use for this writer
	Properties() *parquet.WriterProperties

	LevelInfo() LevelInfo
	SetBitsBuffer(*memory.Buffer)
	HasBitsBuffer() bool
}

ColumnChunkWriter is the base interface for all column writers. To directly write data to the column, you need to assert it to the correctly typed ColumnChunkWriter instance, such as *Int32ColumnChunkWriter.

func NewColumnChunkWriter ¶

func NewColumnChunkWriter(meta *metadata.ColumnChunkMetaDataBuilder, pager PageWriter, props *parquet.WriterProperties) ColumnChunkWriter

NewColumnChunkWriter constructs a column writer of the appropriate type by using the metadata builder and writer properties to determine the correct type of column writer to construct and whether or not to use dictionary encoding.

type CryptoContext ¶

type CryptoContext struct {
	StartDecryptWithDictionaryPage bool
	RowGroupOrdinal                int16
	ColumnOrdinal                  int16
	MetaDecryptor                  encryption.Decryptor
	DataDecryptor                  encryption.Decryptor
}

CryptoContext is a context for keeping track of the current methods for decrypting. It keeps track of the row group and column numbers along with references to the decryptor objects.

type DataPage ¶

type DataPage interface {
	Page
	UncompressedSize() int32
	Statistics() metadata.EncodedStatistics
}

DataPage is the base interface for both DataPageV1 and DataPageV2 of the parquet spec.

type DataPageV1 ¶

type DataPageV1 struct {
	// contains filtered or unexported fields
}

DataPageV1 represents a DataPage version 1 from the parquet.thrift file

func NewDataPageV1 ¶

func NewDataPageV1(buffer *memory.Buffer, num int32, encoding, defEncoding, repEncoding parquet.Encoding, uncompressedSize int32) *DataPageV1

NewDataPageV1 returns a V1 data page with the given buffer as its data and the specified encoding information

Will utilize objects that have been released back into the data page pool and re-use them if available as opposed to creating new objects. Calling Release on the data page object will release it back to the pool for re-use.

func NewDataPageV1WithStats ¶

func NewDataPageV1WithStats(buffer *memory.Buffer, num int32, encoding, defEncoding, repEncoding parquet.Encoding, uncompressedSize int32, stats metadata.EncodedStatistics) *DataPageV1

NewDataPageV1WithStats is the same as NewDataPageV1, but also allows adding the stat info into the created page

func (*DataPageV1) Data ¶

func (p *DataPageV1) Data() []byte

func (*DataPageV1) DefinitionLevelEncoding ¶

func (d *DataPageV1) DefinitionLevelEncoding() parquet.Encoding

DefinitionLevelEncoding returns the encoding utilized for the Definition Levels

func (*DataPageV1) Encoding ¶

func (p *DataPageV1) Encoding() format.Encoding

func (*DataPageV1) NumValues ¶

func (p *DataPageV1) NumValues() int32

func (*DataPageV1) Release ¶

func (d *DataPageV1) Release()

Release this page back into the DataPage object pool so that it can be reused.

After calling this function, the object should not be utilized anymore, otherwise conflicts can arise.

func (*DataPageV1) RepetitionLevelEncoding ¶

func (d *DataPageV1) RepetitionLevelEncoding() parquet.Encoding

RepetitionLevelEncoding returns the encoding utilized for the Repetition Levels

func (*DataPageV1) Statistics ¶

func (d *DataPageV1) Statistics() metadata.EncodedStatistics

Statistics returns the encoded statistics on this data page

func (*DataPageV1) Type ¶

func (p *DataPageV1) Type() format.PageType

func (*DataPageV1) UncompressedSize ¶

func (d *DataPageV1) UncompressedSize() int32

UncompressedSize returns the size of the data in this data page when uncompressed

type DataPageV2 ¶

type DataPageV2 struct {
	// contains filtered or unexported fields
}

DataPageV2 is the representation of the V2 data page from the parquet.thrift spec

func NewDataPageV2 ¶

func NewDataPageV2(buffer *memory.Buffer, numValues, numNulls, numRows int32, encoding parquet.Encoding, defLvlsByteLen, repLvlsByteLen, uncompressed int32, isCompressed bool) *DataPageV2

NewDataPageV2 constructs a new V2 data page with the provided information and a buffer of the raw data.

func NewDataPageV2WithStats ¶

func NewDataPageV2WithStats(buffer *memory.Buffer, numValues, numNulls, numRows int32, encoding parquet.Encoding, defLvlsByteLen, repLvlsByteLen, uncompressed int32, isCompressed bool, stats metadata.EncodedStatistics) *DataPageV2

NewDataPageV2WithStats is the same as NewDataPageV2 but allows providing the encoded stats with the page.

func (*DataPageV2) Data ¶

func (p *DataPageV2) Data() []byte

func (*DataPageV2) DefinitionLevelByteLen ¶

func (d *DataPageV2) DefinitionLevelByteLen() int32

DefinitionLevelByteLen is the number of bytes in the buffer that are used to represent the definition levels

func (*DataPageV2) Encoding ¶

func (p *DataPageV2) Encoding() format.Encoding

func (*DataPageV2) IsCompressed ¶

func (d *DataPageV2) IsCompressed() bool

IsCompressed returns true if the data of this page is compressed

func (*DataPageV2) NumNulls ¶

func (d *DataPageV2) NumNulls() int32

NumNulls is the reported number of nulls in this datapage

func (*DataPageV2) NumRows ¶

func (d *DataPageV2) NumRows() int32

NumRows is the number of rows recorded in the page header

func (*DataPageV2) NumValues ¶

func (p *DataPageV2) NumValues() int32

func (*DataPageV2) Release ¶

func (d *DataPageV2) Release()

Release this page back into the DataPage object pool so that it can be reused.

After calling this function, the object should not be utilized anymore, otherwise conflicts can arise.

func (*DataPageV2) RepetitionLevelByteLen ¶

func (d *DataPageV2) RepetitionLevelByteLen() int32

RepetitionLevelByteLen is the number of bytes in the buffer which are used to represent the repetition Levels

func (*DataPageV2) Statistics ¶

func (d *DataPageV2) Statistics() metadata.EncodedStatistics

Statistics are the encoded statistics in the data page

func (*DataPageV2) Type ¶

func (p *DataPageV2) Type() format.PageType

func (*DataPageV2) UncompressedSize ¶

func (d *DataPageV2) UncompressedSize() int32

UncompressedSize is the size of the raw page when uncompressed. If `IsCompressed` is true, then the raw data in the buffer is expected to be compressed.

type DictionaryPage ¶

type DictionaryPage struct {
	// contains filtered or unexported fields
}

DictionaryPage represents a page of data that uses dictionary encoding.

func NewDictionaryPage ¶

func NewDictionaryPage(buffer *memory.Buffer, nvals int32, encoding parquet.Encoding) *DictionaryPage

NewDictionaryPage constructs a new dictionary page with the provided data buffer and number of values.

func (*DictionaryPage) Data ¶

func (p *DictionaryPage) Data() []byte

func (*DictionaryPage) Encoding ¶

func (p *DictionaryPage) Encoding() format.Encoding

func (*DictionaryPage) IsSorted ¶

func (d *DictionaryPage) IsSorted() bool

IsSorted returns whether the dictionary itself is sorted

func (*DictionaryPage) NumValues ¶

func (p *DictionaryPage) NumValues() int32

func (*DictionaryPage) Release ¶

func (d *DictionaryPage) Release()

Release this page back into the DataPage object pool so that it can be reused.

After calling this function, the object should not be utilized anymore, otherwise conflicts can arise.

func (*DictionaryPage) Type ¶

func (p *DictionaryPage) Type() format.PageType

type FixedLenByteArrayColumnChunkReader ¶

type FixedLenByteArrayColumnChunkReader struct {
	// contains filtered or unexported fields
}

FixedLenByteArrayColumnChunkReader is the Typed Column chunk reader instance for reading FixedLenByteArray column data.

func (*FixedLenByteArrayColumnChunkReader) Descriptor ¶

func (c *FixedLenByteArrayColumnChunkReader) Descriptor() *schema.Column

func (*FixedLenByteArrayColumnChunkReader) Err ¶

func (c *FixedLenByteArrayColumnChunkReader) Err() error

func (*FixedLenByteArrayColumnChunkReader) HasNext ¶

func (c *FixedLenByteArrayColumnChunkReader) HasNext() bool

HasNext returns whether there is more data to be read in this column and row group.

func (*FixedLenByteArrayColumnChunkReader) ReadBatch ¶

func (cr *FixedLenByteArrayColumnChunkReader) ReadBatch(batchSize int64, values []parquet.FixedLenByteArray, defLvls, repLvls []int16) (total int64, valuesRead int, err error)

ReadBatch reads batchSize values from the column.

Returns error if values is not at least big enough to hold the number of values that will be read.

defLvls and repLvls can be nil, or will be populated if not nil. If not nil, they must be at least large enough to hold the number of values that will be read.

total is the number of rows that were read, valuesRead is the actual number of physical values that were read excluding nulls

func (*FixedLenByteArrayColumnChunkReader) Skip ¶

func (cr *FixedLenByteArrayColumnChunkReader) Skip(nvalues int64) (int64, error)

Skip skips the next nvalues so that the next call to ReadBatch will start reading *after* the skipped values.

func (*FixedLenByteArrayColumnChunkReader) Type ¶

func (c *FixedLenByteArrayColumnChunkReader) Type() parquet.Type

type FixedLenByteArrayColumnChunkWriter ¶

type FixedLenByteArrayColumnChunkWriter struct {
	// contains filtered or unexported fields
}

FixedLenByteArrayColumnChunkWriter is the typed interface for writing columns to a parquet file for FixedLenByteArray columns.

func NewFixedLenByteArrayColumnChunkWriter ¶

func NewFixedLenByteArrayColumnChunkWriter(meta *metadata.ColumnChunkMetaDataBuilder, pager PageWriter, useDict bool, enc parquet.Encoding, props *parquet.WriterProperties) *FixedLenByteArrayColumnChunkWriter

NewFixedLenByteArrayColumnChunkWriter constructs a new column writer using the given metadata chunk builder, the provided PageWriter, and the desired encoding and properties.

This will likely not be often called directly by consumers but rather used internally.

ColumnChunkWriters should be acquired by using fileWriter and RowGroupWriter objects

func (*FixedLenByteArrayColumnChunkWriter) Close ¶

func (w *FixedLenByteArrayColumnChunkWriter) Close() (err error)

func (*FixedLenByteArrayColumnChunkWriter) Descr ¶

func (w *FixedLenByteArrayColumnChunkWriter) Descr() *schema.Column

func (*FixedLenByteArrayColumnChunkWriter) EstimatedBufferedValueBytes ¶

func (w *FixedLenByteArrayColumnChunkWriter) EstimatedBufferedValueBytes() int64

func (*FixedLenByteArrayColumnChunkWriter) FlushBufferedDataPages ¶

func (w *FixedLenByteArrayColumnChunkWriter) FlushBufferedDataPages() (err error)

func (*FixedLenByteArrayColumnChunkWriter) FlushCurrentPage ¶

func (w *FixedLenByteArrayColumnChunkWriter) FlushCurrentPage() error

func (*FixedLenByteArrayColumnChunkWriter) HasBitsBuffer ¶

func (w *FixedLenByteArrayColumnChunkWriter) HasBitsBuffer() bool

func (*FixedLenByteArrayColumnChunkWriter) LevelInfo ¶

func (w *FixedLenByteArrayColumnChunkWriter) LevelInfo() LevelInfo

func (*FixedLenByteArrayColumnChunkWriter) Properties ¶

func (w *FixedLenByteArrayColumnChunkWriter) Properties() *parquet.WriterProperties

func (*FixedLenByteArrayColumnChunkWriter) RowsWritten ¶

func (w *FixedLenByteArrayColumnChunkWriter) RowsWritten() int

func (*FixedLenByteArrayColumnChunkWriter) SetBitsBuffer ¶

func (w *FixedLenByteArrayColumnChunkWriter) SetBitsBuffer(buf *memory.Buffer)

func (*FixedLenByteArrayColumnChunkWriter) TotalBytesWritten ¶

func (w *FixedLenByteArrayColumnChunkWriter) TotalBytesWritten() int64

func (*FixedLenByteArrayColumnChunkWriter) TotalCompressedBytes ¶

func (w *FixedLenByteArrayColumnChunkWriter) TotalCompressedBytes() int64

func (*FixedLenByteArrayColumnChunkWriter) Type ¶

func (w *FixedLenByteArrayColumnChunkWriter) Type() parquet.Type

func (*FixedLenByteArrayColumnChunkWriter) WriteBatch ¶

func (w *FixedLenByteArrayColumnChunkWriter) WriteBatch(values []parquet.FixedLenByteArray, defLevels, repLevels []int16) (valueOffset int64, err error)

WriteBatch writes a batch of repetition levels, definition levels, and values to the column. `defLevels` (resp. `repLevels`) can be nil if the column's max definition level (resp. max repetition level) is 0. If not nil, each of `defLevels` and `repLevels` must have at least `len(values)` elements.

The number of physical values written (taken from `values`) is returned. It can be smaller than `len(values)` if there are some undefined values.

When using DataPageV2 to write a repeated column, rows cannot cross data page boundaries. To guarantee this, the writer ensures that every batch of w.props.BatchSize begins and ends on a row boundary. As a consequence, when using DataPageV2 and repLevels is not nil, the first value passed to WriteBatch must always be the beginning of a row (repLevels[0] should always be 0).

func (*FixedLenByteArrayColumnChunkWriter) WriteBatchSpaced ¶

func (w *FixedLenByteArrayColumnChunkWriter) WriteBatchSpaced(values []parquet.FixedLenByteArray, defLevels, repLevels []int16, validBits []byte, validBitsOffset int64)

WriteBatchSpaced writes a batch of repetition levels, definition levels, and values to the column.

In comparison to WriteBatch, the length of repetition and definition levels is the same as the number of values read for max_definition_level == 1. In the case of max_definition_level > 1, the repetition and definition levels are larger than the values but the values include the null entries with definition_level == (max_definition_level - 1). Thus we have to differentiate in the parameters of this function if the input has the length of num_values or the _number of rows in the lowest nesting level_.

In the case that the innermost node in the Parquet schema is required, the _number of rows in the lowest nesting level_ is equal to the number of non-null values. If the innermost schema node is optional, the _number of rows in the lowest nesting level_ also includes all values with definition_level == (max_definition_level - 1).

func (*FixedLenByteArrayColumnChunkWriter) WriteDataPage ¶

func (w *FixedLenByteArrayColumnChunkWriter) WriteDataPage(page DataPage) error

func (*FixedLenByteArrayColumnChunkWriter) WriteDefinitionLevels ¶

func (w *FixedLenByteArrayColumnChunkWriter) WriteDefinitionLevels(levels []int16)

func (*FixedLenByteArrayColumnChunkWriter) WriteDictionaryPage ¶

func (w *FixedLenByteArrayColumnChunkWriter) WriteDictionaryPage() error

func (*FixedLenByteArrayColumnChunkWriter) WriteRepetitionLevels ¶

func (w *FixedLenByteArrayColumnChunkWriter) WriteRepetitionLevels(levels []int16)

type Float32ColumnChunkReader ¶

type Float32ColumnChunkReader struct {
	// contains filtered or unexported fields
}

Float32ColumnChunkReader is the Typed Column chunk reader instance for reading Float32 column data.

func (*Float32ColumnChunkReader) Descriptor ¶

func (c *Float32ColumnChunkReader) Descriptor() *schema.Column

func (*Float32ColumnChunkReader) Err ¶

func (c *Float32ColumnChunkReader) Err() error

func (*Float32ColumnChunkReader) HasNext ¶

func (c *Float32ColumnChunkReader) HasNext() bool

HasNext returns whether there is more data to be read in this column and row group.

func (*Float32ColumnChunkReader) ReadBatch ¶

func (cr *Float32ColumnChunkReader) ReadBatch(batchSize int64, values []float32, defLvls, repLvls []int16) (total int64, valuesRead int, err error)

ReadBatch reads batchSize values from the column.

Returns error if values is not at least big enough to hold the number of values that will be read.

defLvls and repLvls can be nil, or will be populated if not nil. If not nil, they must be at least large enough to hold the number of values that will be read.

total is the number of rows that were read, valuesRead is the actual number of physical values that were read excluding nulls

func (*Float32ColumnChunkReader) Skip ¶

func (cr *Float32ColumnChunkReader) Skip(nvalues int64) (int64, error)

Skip skips the next nvalues so that the next call to ReadBatch will start reading *after* the skipped values.

func (*Float32ColumnChunkReader) Type ¶

func (c *Float32ColumnChunkReader) Type() parquet.Type

type Float32ColumnChunkWriter ¶

type Float32ColumnChunkWriter struct {
	// contains filtered or unexported fields
}

Float32ColumnChunkWriter is the typed interface for writing columns to a parquet file for Float32 columns.

func NewFloat32ColumnChunkWriter ¶

func NewFloat32ColumnChunkWriter(meta *metadata.ColumnChunkMetaDataBuilder, pager PageWriter, useDict bool, enc parquet.Encoding, props *parquet.WriterProperties) *Float32ColumnChunkWriter

NewFloat32ColumnChunkWriter constructs a new column writer using the given metadata chunk builder, the provided PageWriter, and the desired encoding and properties.

This will likely not be often called directly by consumers but rather used internally.

ColumnChunkWriters should be acquired by using fileWriter and RowGroupWriter objects

func (*Float32ColumnChunkWriter) Close ¶

func (w *Float32ColumnChunkWriter) Close() (err error)

func (*Float32ColumnChunkWriter) Descr ¶

func (w *Float32ColumnChunkWriter) Descr() *schema.Column

func (*Float32ColumnChunkWriter) EstimatedBufferedValueBytes ¶

func (w *Float32ColumnChunkWriter) EstimatedBufferedValueBytes() int64

func (*Float32ColumnChunkWriter) FlushBufferedDataPages ¶

func (w *Float32ColumnChunkWriter) FlushBufferedDataPages() (err error)

func (*Float32ColumnChunkWriter) FlushCurrentPage ¶

func (w *Float32ColumnChunkWriter) FlushCurrentPage() error

func (*Float32ColumnChunkWriter) HasBitsBuffer ¶

func (w *Float32ColumnChunkWriter) HasBitsBuffer() bool

func (*Float32ColumnChunkWriter) LevelInfo ¶

func (w *Float32ColumnChunkWriter) LevelInfo() LevelInfo

func (*Float32ColumnChunkWriter) Properties ¶

func (w *Float32ColumnChunkWriter) Properties() *parquet.WriterProperties

func (*Float32ColumnChunkWriter) RowsWritten ¶

func (w *Float32ColumnChunkWriter) RowsWritten() int

func (*Float32ColumnChunkWriter) SetBitsBuffer ¶

func (w *Float32ColumnChunkWriter) SetBitsBuffer(buf *memory.Buffer)

func (*Float32ColumnChunkWriter) TotalBytesWritten ¶

func (w *Float32ColumnChunkWriter) TotalBytesWritten() int64

func (*Float32ColumnChunkWriter) TotalCompressedBytes ¶

func (w *Float32ColumnChunkWriter) TotalCompressedBytes() int64

func (*Float32ColumnChunkWriter) Type ¶

func (w *Float32ColumnChunkWriter) Type() parquet.Type

func (*Float32ColumnChunkWriter) WriteBatch ¶

func (w *Float32ColumnChunkWriter) WriteBatch(values []float32, defLevels, repLevels []int16) (valueOffset int64, err error)

WriteBatch writes a batch of repetition levels, definition levels, and values to the column. `defLevels` (resp. `repLevels`) can be nil if the column's max definition level (resp. max repetition level) is 0. If not nil, each of `defLevels` and `repLevels` must have at least `len(values)` elements.

The number of physical values written (taken from `values`) is returned. It can be smaller than `len(values)` if there are some undefined values.

When using DataPageV2 to write a repeated column, rows cannot cross data page boundaries. To guarantee this, the writer ensures that every batch of w.props.BatchSize begins and ends on a row boundary. As a consequence, when using DataPageV2 and repLevels is not nil, the first value passed to WriteBatch must always be the beginning of a row (repLevels[0] should always be 0).

func (*Float32ColumnChunkWriter) WriteBatchSpaced ¶

func (w *Float32ColumnChunkWriter) WriteBatchSpaced(values []float32, defLevels, repLevels []int16, validBits []byte, validBitsOffset int64)

WriteBatchSpaced writes a batch of repetition levels, definition levels, and values to the column.

In comparison to WriteBatch, the length of repetition and definition levels is the same as the number of values read for max_definition_level == 1. In the case of max_definition_level > 1, the repetition and definition levels are larger than the values but the values include the null entries with definition_level == (max_definition_level - 1). Thus we have to differentiate in the parameters of this function if the input has the length of num_values or the _number of rows in the lowest nesting level_.

In the case that the innermost node in the Parquet schema is required, the _number of rows in the lowest nesting level_ is equal to the number of non-null values. If the innermost schema node is optional, the _number of rows in the lowest nesting level_ also includes all values with definition_level == (max_definition_level - 1).

func (*Float32ColumnChunkWriter) WriteDataPage ¶

func (w *Float32ColumnChunkWriter) WriteDataPage(page DataPage) error

func (*Float32ColumnChunkWriter) WriteDefinitionLevels ¶

func (w *Float32ColumnChunkWriter) WriteDefinitionLevels(levels []int16)

func (*Float32ColumnChunkWriter) WriteDictionaryPage ¶

func (w *Float32ColumnChunkWriter) WriteDictionaryPage() error

func (*Float32ColumnChunkWriter) WriteRepetitionLevels ¶

func (w *Float32ColumnChunkWriter) WriteRepetitionLevels(levels []int16)

type Float64ColumnChunkReader ¶

type Float64ColumnChunkReader struct {
	// contains filtered or unexported fields
}

Float64ColumnChunkReader is the Typed Column chunk reader instance for reading Float64 column data.

func (*Float64ColumnChunkReader) Descriptor ¶

func (c *Float64ColumnChunkReader) Descriptor() *schema.Column

func (*Float64ColumnChunkReader) Err ¶

func (c *Float64ColumnChunkReader) Err() error

func (*Float64ColumnChunkReader) HasNext ¶

func (c *Float64ColumnChunkReader) HasNext() bool

HasNext returns whether there is more data to be read in this column and row group.

func (*Float64ColumnChunkReader) ReadBatch ¶

func (cr *Float64ColumnChunkReader) ReadBatch(batchSize int64, values []float64, defLvls, repLvls []int16) (total int64, valuesRead int, err error)

ReadBatch reads batchSize values from the column.

Returns error if values is not at least big enough to hold the number of values that will be read.

defLvls and repLvls can be nil, or will be populated if not nil. If not nil, they must be at least large enough to hold the number of values that will be read.

total is the number of rows that were read, valuesRead is the actual number of physical values that were read excluding nulls

func (*Float64ColumnChunkReader) Skip ¶

func (cr *Float64ColumnChunkReader) Skip(nvalues int64) (int64, error)

Skip skips the next nvalues so that the next call to ReadBatch will start reading *after* the skipped values.

func (*Float64ColumnChunkReader) Type ¶

func (c *Float64ColumnChunkReader) Type() parquet.Type

type Float64ColumnChunkWriter ¶

type Float64ColumnChunkWriter struct {
	// contains filtered or unexported fields
}

Float64ColumnChunkWriter is the typed interface for writing columns to a parquet file for Float64 columns.

func NewFloat64ColumnChunkWriter ¶

func NewFloat64ColumnChunkWriter(meta *metadata.ColumnChunkMetaDataBuilder, pager PageWriter, useDict bool, enc parquet.Encoding, props *parquet.WriterProperties) *Float64ColumnChunkWriter

NewFloat64ColumnChunkWriter constructs a new column writer using the given metadata chunk builder, the provided PageWriter, and the desired encoding and properties.

This will likely not be often called directly by consumers but rather used internally.

ColumnChunkWriters should be acquired by using fileWriter and RowGroupWriter objects

func (*Float64ColumnChunkWriter) Close ¶

func (w *Float64ColumnChunkWriter) Close() (err error)

func (*Float64ColumnChunkWriter) Descr ¶

func (w *Float64ColumnChunkWriter) Descr() *schema.Column

func (*Float64ColumnChunkWriter) EstimatedBufferedValueBytes ¶

func (w *Float64ColumnChunkWriter) EstimatedBufferedValueBytes() int64

func (*Float64ColumnChunkWriter) FlushBufferedDataPages ¶

func (w *Float64ColumnChunkWriter) FlushBufferedDataPages() (err error)

func (*Float64ColumnChunkWriter) FlushCurrentPage ¶

func (w *Float64ColumnChunkWriter) FlushCurrentPage() error

func (*Float64ColumnChunkWriter) HasBitsBuffer ¶

func (w *Float64ColumnChunkWriter) HasBitsBuffer() bool

func (*Float64ColumnChunkWriter) LevelInfo ¶

func (w *Float64ColumnChunkWriter) LevelInfo() LevelInfo

func (*Float64ColumnChunkWriter) Properties ¶

func (w *Float64ColumnChunkWriter) Properties() *parquet.WriterProperties

func (*Float64ColumnChunkWriter) RowsWritten ¶

func (w *Float64ColumnChunkWriter) RowsWritten() int

func (*Float64ColumnChunkWriter) SetBitsBuffer ¶

func (w *Float64ColumnChunkWriter) SetBitsBuffer(buf *memory.Buffer)

func (*Float64ColumnChunkWriter) TotalBytesWritten ¶

func (w *Float64ColumnChunkWriter) TotalBytesWritten() int64

func (*Float64ColumnChunkWriter) TotalCompressedBytes ¶

func (w *Float64ColumnChunkWriter) TotalCompressedBytes() int64

func (*Float64ColumnChunkWriter) Type ¶

func (w *Float64ColumnChunkWriter) Type() parquet.Type

func (*Float64ColumnChunkWriter) WriteBatch ¶

func (w *Float64ColumnChunkWriter) WriteBatch(values []float64, defLevels, repLevels []int16) (valueOffset int64, err error)

WriteBatch writes a batch of repetition levels, definition levels, and values to the column. `defLevels` (resp. `repLevels`) can be nil if the column's max definition level (resp. max repetition level) is 0. If not nil, each of `defLevels` and `repLevels` must have at least `len(values)` entries.

The number of physical values written (taken from `values`) is returned. It can be smaller than `len(values)` if there are some undefined values.

When using DataPageV2 to write a repeated column rows cannot cross data page boundaries. To ensure this the writer ensures that every batch of w.props.BatchSize begins and ends on a row boundary. As a consequence, the first value to WriteBatch must always be the beginning of a row if repLevels is not nil (repLevels[0] should always be 0) and using DataPageV2.

func (*Float64ColumnChunkWriter) WriteBatchSpaced ¶

func (w *Float64ColumnChunkWriter) WriteBatchSpaced(values []float64, defLevels, repLevels []int16, validBits []byte, validBitsOffset int64)

WriteBatchSpaced writes a batch of repetition levels, definition levels, and values to the column.

In comparison to WriteBatch, the length of the repetition and definition levels is the same as the number of values read for max_definition_level == 1. In the case of max_definition_level > 1, the repetition and definition levels are larger than the values but the values include the null entries with definition_level == (max_definition_level - 1). Thus we have to differentiate in the parameters of this function whether the input has the length of num_values or the _number of rows in the lowest nesting level_.

In the case that the innermost node in the Parquet schema is required, the _number of rows in the lowest nesting level_ is equal to the number of non-null values. If the innermost schema node is optional, the _number of rows in the lowest nesting level_ also includes all values with definition_level == (max_definition_level - 1).

func (*Float64ColumnChunkWriter) WriteDataPage ¶

func (w *Float64ColumnChunkWriter) WriteDataPage(page DataPage) error

func (*Float64ColumnChunkWriter) WriteDefinitionLevels ¶

func (w *Float64ColumnChunkWriter) WriteDefinitionLevels(levels []int16)

func (*Float64ColumnChunkWriter) WriteDictionaryPage ¶

func (w *Float64ColumnChunkWriter) WriteDictionaryPage() error

func (*Float64ColumnChunkWriter) WriteRepetitionLevels ¶

func (w *Float64ColumnChunkWriter) WriteRepetitionLevels(levels []int16)

type Int32ColumnChunkReader ¶

type Int32ColumnChunkReader struct {
	// contains filtered or unexported fields
}

Int32ColumnChunkReader is the Typed Column chunk reader instance for reading Int32 column data.

func (*Int32ColumnChunkReader) Descriptor ¶

func (c *Int32ColumnChunkReader) Descriptor() *schema.Column

func (*Int32ColumnChunkReader) Err ¶

func (c *Int32ColumnChunkReader) Err() error

func (*Int32ColumnChunkReader) HasNext ¶

func (c *Int32ColumnChunkReader) HasNext() bool

HasNext returns whether there is more data to be read in this column and row group.

func (*Int32ColumnChunkReader) ReadBatch ¶

func (cr *Int32ColumnChunkReader) ReadBatch(batchSize int64, values []int32, defLvls, repLvls []int16) (total int64, valuesRead int, err error)

ReadBatch reads batchSize values from the column.

Returns error if values is not at least big enough to hold the number of values that will be read.

defLvls and repLvls can be nil, or will be populated if not nil. If not nil, they must be at least large enough to hold the number of values that will be read.

total is the number of rows that were read; valuesRead is the actual number of physical values that were read, excluding nulls.

func (*Int32ColumnChunkReader) Skip ¶

func (cr *Int32ColumnChunkReader) Skip(nvalues int64) (int64, error)

Skip skips the next nvalues so that the next call to ReadBatch will start reading *after* the skipped values.

func (*Int32ColumnChunkReader) Type ¶

func (c *Int32ColumnChunkReader) Type() parquet.Type

type Int32ColumnChunkWriter ¶

type Int32ColumnChunkWriter struct {
	// contains filtered or unexported fields
}

Int32ColumnChunkWriter is the typed interface for writing columns to a parquet file for Int32 columns.

func NewInt32ColumnChunkWriter ¶

func NewInt32ColumnChunkWriter(meta *metadata.ColumnChunkMetaDataBuilder, pager PageWriter, useDict bool, enc parquet.Encoding, props *parquet.WriterProperties) *Int32ColumnChunkWriter

NewInt32ColumnChunkWriter constructs a new column writer using the given metadata chunk builder, the provided pager, and the desired encoding and properties.

This will likely not be often called directly by consumers but rather used internally.

ColumnChunkWriters should be acquired by using fileWriter and RowGroupWriter objects

func (*Int32ColumnChunkWriter) Close ¶

func (w *Int32ColumnChunkWriter) Close() (err error)

func (*Int32ColumnChunkWriter) Descr ¶

func (w *Int32ColumnChunkWriter) Descr() *schema.Column

func (*Int32ColumnChunkWriter) EstimatedBufferedValueBytes ¶

func (w *Int32ColumnChunkWriter) EstimatedBufferedValueBytes() int64

func (*Int32ColumnChunkWriter) FlushBufferedDataPages ¶

func (w *Int32ColumnChunkWriter) FlushBufferedDataPages() (err error)

func (*Int32ColumnChunkWriter) FlushCurrentPage ¶

func (w *Int32ColumnChunkWriter) FlushCurrentPage() error

func (*Int32ColumnChunkWriter) HasBitsBuffer ¶

func (w *Int32ColumnChunkWriter) HasBitsBuffer() bool

func (*Int32ColumnChunkWriter) LevelInfo ¶

func (w *Int32ColumnChunkWriter) LevelInfo() LevelInfo

func (*Int32ColumnChunkWriter) Properties ¶

func (w *Int32ColumnChunkWriter) Properties() *parquet.WriterProperties

func (*Int32ColumnChunkWriter) RowsWritten ¶

func (w *Int32ColumnChunkWriter) RowsWritten() int

func (*Int32ColumnChunkWriter) SetBitsBuffer ¶

func (w *Int32ColumnChunkWriter) SetBitsBuffer(buf *memory.Buffer)

func (*Int32ColumnChunkWriter) TotalBytesWritten ¶

func (w *Int32ColumnChunkWriter) TotalBytesWritten() int64

func (*Int32ColumnChunkWriter) TotalCompressedBytes ¶

func (w *Int32ColumnChunkWriter) TotalCompressedBytes() int64

func (*Int32ColumnChunkWriter) Type ¶

func (w *Int32ColumnChunkWriter) Type() parquet.Type

func (*Int32ColumnChunkWriter) WriteBatch ¶

func (w *Int32ColumnChunkWriter) WriteBatch(values []int32, defLevels, repLevels []int16) (valueOffset int64, err error)

WriteBatch writes a batch of repetition levels, definition levels, and values to the column. `defLevels` (resp. `repLevels`) can be nil if the column's max definition level (resp. max repetition level) is 0. If not nil, each of `defLevels` and `repLevels` must have at least `len(values)` entries.

The number of physical values written (taken from `values`) is returned. It can be smaller than `len(values)` if there are some undefined values.

When using DataPageV2 to write a repeated column rows cannot cross data page boundaries. To ensure this the writer ensures that every batch of w.props.BatchSize begins and ends on a row boundary. As a consequence, the first value to WriteBatch must always be the beginning of a row if repLevels is not nil (repLevels[0] should always be 0) and using DataPageV2.

func (*Int32ColumnChunkWriter) WriteBatchSpaced ¶

func (w *Int32ColumnChunkWriter) WriteBatchSpaced(values []int32, defLevels, repLevels []int16, validBits []byte, validBitsOffset int64)

WriteBatchSpaced writes a batch of repetition levels, definition levels, and values to the column.

In comparison to WriteBatch, the length of the repetition and definition levels is the same as the number of values read for max_definition_level == 1. In the case of max_definition_level > 1, the repetition and definition levels are larger than the values but the values include the null entries with definition_level == (max_definition_level - 1). Thus we have to differentiate in the parameters of this function whether the input has the length of num_values or the _number of rows in the lowest nesting level_.

In the case that the innermost node in the Parquet schema is required, the _number of rows in the lowest nesting level_ is equal to the number of non-null values. If the innermost schema node is optional, the _number of rows in the lowest nesting level_ also includes all values with definition_level == (max_definition_level - 1).

func (*Int32ColumnChunkWriter) WriteDataPage ¶

func (w *Int32ColumnChunkWriter) WriteDataPage(page DataPage) error

func (*Int32ColumnChunkWriter) WriteDefinitionLevels ¶

func (w *Int32ColumnChunkWriter) WriteDefinitionLevels(levels []int16)

func (*Int32ColumnChunkWriter) WriteDictionaryPage ¶

func (w *Int32ColumnChunkWriter) WriteDictionaryPage() error

func (*Int32ColumnChunkWriter) WriteRepetitionLevels ¶

func (w *Int32ColumnChunkWriter) WriteRepetitionLevels(levels []int16)

type Int64ColumnChunkReader ¶

type Int64ColumnChunkReader struct {
	// contains filtered or unexported fields
}

Int64ColumnChunkReader is the Typed Column chunk reader instance for reading Int64 column data.

func (*Int64ColumnChunkReader) Descriptor ¶

func (c *Int64ColumnChunkReader) Descriptor() *schema.Column

func (*Int64ColumnChunkReader) Err ¶

func (c *Int64ColumnChunkReader) Err() error

func (*Int64ColumnChunkReader) HasNext ¶

func (c *Int64ColumnChunkReader) HasNext() bool

HasNext returns whether there is more data to be read in this column and row group.

func (*Int64ColumnChunkReader) ReadBatch ¶

func (cr *Int64ColumnChunkReader) ReadBatch(batchSize int64, values []int64, defLvls, repLvls []int16) (total int64, valuesRead int, err error)

ReadBatch reads batchSize values from the column.

Returns error if values is not at least big enough to hold the number of values that will be read.

defLvls and repLvls can be nil, or will be populated if not nil. If not nil, they must be at least large enough to hold the number of values that will be read.

total is the number of rows that were read; valuesRead is the actual number of physical values that were read, excluding nulls.

func (*Int64ColumnChunkReader) Skip ¶

func (cr *Int64ColumnChunkReader) Skip(nvalues int64) (int64, error)

Skip skips the next nvalues so that the next call to ReadBatch will start reading *after* the skipped values.

func (*Int64ColumnChunkReader) Type ¶

func (c *Int64ColumnChunkReader) Type() parquet.Type

type Int64ColumnChunkWriter ¶

type Int64ColumnChunkWriter struct {
	// contains filtered or unexported fields
}

Int64ColumnChunkWriter is the typed interface for writing columns to a parquet file for Int64 columns.

func NewInt64ColumnChunkWriter ¶

func NewInt64ColumnChunkWriter(meta *metadata.ColumnChunkMetaDataBuilder, pager PageWriter, useDict bool, enc parquet.Encoding, props *parquet.WriterProperties) *Int64ColumnChunkWriter

NewInt64ColumnChunkWriter constructs a new column writer using the given metadata chunk builder, the provided pager, and the desired encoding and properties.

This will likely not be often called directly by consumers but rather used internally.

ColumnChunkWriters should be acquired by using fileWriter and RowGroupWriter objects

func (*Int64ColumnChunkWriter) Close ¶

func (w *Int64ColumnChunkWriter) Close() (err error)

func (*Int64ColumnChunkWriter) Descr ¶

func (w *Int64ColumnChunkWriter) Descr() *schema.Column

func (*Int64ColumnChunkWriter) EstimatedBufferedValueBytes ¶

func (w *Int64ColumnChunkWriter) EstimatedBufferedValueBytes() int64

func (*Int64ColumnChunkWriter) FlushBufferedDataPages ¶

func (w *Int64ColumnChunkWriter) FlushBufferedDataPages() (err error)

func (*Int64ColumnChunkWriter) FlushCurrentPage ¶

func (w *Int64ColumnChunkWriter) FlushCurrentPage() error

func (*Int64ColumnChunkWriter) HasBitsBuffer ¶

func (w *Int64ColumnChunkWriter) HasBitsBuffer() bool

func (*Int64ColumnChunkWriter) LevelInfo ¶

func (w *Int64ColumnChunkWriter) LevelInfo() LevelInfo

func (*Int64ColumnChunkWriter) Properties ¶

func (w *Int64ColumnChunkWriter) Properties() *parquet.WriterProperties

func (*Int64ColumnChunkWriter) RowsWritten ¶

func (w *Int64ColumnChunkWriter) RowsWritten() int

func (*Int64ColumnChunkWriter) SetBitsBuffer ¶

func (w *Int64ColumnChunkWriter) SetBitsBuffer(buf *memory.Buffer)

func (*Int64ColumnChunkWriter) TotalBytesWritten ¶

func (w *Int64ColumnChunkWriter) TotalBytesWritten() int64

func (*Int64ColumnChunkWriter) TotalCompressedBytes ¶

func (w *Int64ColumnChunkWriter) TotalCompressedBytes() int64

func (*Int64ColumnChunkWriter) Type ¶

func (w *Int64ColumnChunkWriter) Type() parquet.Type

func (*Int64ColumnChunkWriter) WriteBatch ¶

func (w *Int64ColumnChunkWriter) WriteBatch(values []int64, defLevels, repLevels []int16) (valueOffset int64, err error)

WriteBatch writes a batch of repetition levels, definition levels, and values to the column. `defLevels` (resp. `repLevels`) can be nil if the column's max definition level (resp. max repetition level) is 0. If not nil, each of `defLevels` and `repLevels` must have at least `len(values)` entries.

The number of physical values written (taken from `values`) is returned. It can be smaller than `len(values)` if there are some undefined values.

When using DataPageV2 to write a repeated column rows cannot cross data page boundaries. To ensure this the writer ensures that every batch of w.props.BatchSize begins and ends on a row boundary. As a consequence, the first value to WriteBatch must always be the beginning of a row if repLevels is not nil (repLevels[0] should always be 0) and using DataPageV2.

func (*Int64ColumnChunkWriter) WriteBatchSpaced ¶

func (w *Int64ColumnChunkWriter) WriteBatchSpaced(values []int64, defLevels, repLevels []int16, validBits []byte, validBitsOffset int64)

WriteBatchSpaced writes a batch of repetition levels, definition levels, and values to the column.

In comparison to WriteBatch, the length of the repetition and definition levels is the same as the number of values read for max_definition_level == 1. In the case of max_definition_level > 1, the repetition and definition levels are larger than the values but the values include the null entries with definition_level == (max_definition_level - 1). Thus we have to differentiate in the parameters of this function whether the input has the length of num_values or the _number of rows in the lowest nesting level_.

In the case that the innermost node in the Parquet schema is required, the _number of rows in the lowest nesting level_ is equal to the number of non-null values. If the innermost schema node is optional, the _number of rows in the lowest nesting level_ also includes all values with definition_level == (max_definition_level - 1).

func (*Int64ColumnChunkWriter) WriteDataPage ¶

func (w *Int64ColumnChunkWriter) WriteDataPage(page DataPage) error

func (*Int64ColumnChunkWriter) WriteDefinitionLevels ¶

func (w *Int64ColumnChunkWriter) WriteDefinitionLevels(levels []int16)

func (*Int64ColumnChunkWriter) WriteDictionaryPage ¶

func (w *Int64ColumnChunkWriter) WriteDictionaryPage() error

func (*Int64ColumnChunkWriter) WriteRepetitionLevels ¶

func (w *Int64ColumnChunkWriter) WriteRepetitionLevels(levels []int16)

type Int96ColumnChunkReader ¶

type Int96ColumnChunkReader struct {
	// contains filtered or unexported fields
}

Int96ColumnChunkReader is the Typed Column chunk reader instance for reading Int96 column data.

func (*Int96ColumnChunkReader) Descriptor ¶

func (c *Int96ColumnChunkReader) Descriptor() *schema.Column

func (*Int96ColumnChunkReader) Err ¶

func (c *Int96ColumnChunkReader) Err() error

func (*Int96ColumnChunkReader) HasNext ¶

func (c *Int96ColumnChunkReader) HasNext() bool

HasNext returns whether there is more data to be read in this column and row group.

func (*Int96ColumnChunkReader) ReadBatch ¶

func (cr *Int96ColumnChunkReader) ReadBatch(batchSize int64, values []parquet.Int96, defLvls, repLvls []int16) (total int64, valuesRead int, err error)

ReadBatch reads batchSize values from the column.

Returns error if values is not at least big enough to hold the number of values that will be read.

defLvls and repLvls can be nil, or will be populated if not nil. If not nil, they must be at least large enough to hold the number of values that will be read.

total is the number of rows that were read; valuesRead is the actual number of physical values that were read, excluding nulls.

func (*Int96ColumnChunkReader) Skip ¶

func (cr *Int96ColumnChunkReader) Skip(nvalues int64) (int64, error)

Skip skips the next nvalues so that the next call to ReadBatch will start reading *after* the skipped values.

func (*Int96ColumnChunkReader) Type ¶

func (c *Int96ColumnChunkReader) Type() parquet.Type

type Int96ColumnChunkWriter ¶

type Int96ColumnChunkWriter struct {
	// contains filtered or unexported fields
}

Int96ColumnChunkWriter is the typed interface for writing columns to a parquet file for Int96 columns.

func NewInt96ColumnChunkWriter ¶

func NewInt96ColumnChunkWriter(meta *metadata.ColumnChunkMetaDataBuilder, pager PageWriter, useDict bool, enc parquet.Encoding, props *parquet.WriterProperties) *Int96ColumnChunkWriter

NewInt96ColumnChunkWriter constructs a new column writer using the given metadata chunk builder, the provided pager, and the desired encoding and properties.

This will likely not be often called directly by consumers but rather used internally.

ColumnChunkWriters should be acquired by using fileWriter and RowGroupWriter objects

func (*Int96ColumnChunkWriter) Close ¶

func (w *Int96ColumnChunkWriter) Close() (err error)

func (*Int96ColumnChunkWriter) Descr ¶

func (w *Int96ColumnChunkWriter) Descr() *schema.Column

func (*Int96ColumnChunkWriter) EstimatedBufferedValueBytes ¶

func (w *Int96ColumnChunkWriter) EstimatedBufferedValueBytes() int64

func (*Int96ColumnChunkWriter) FlushBufferedDataPages ¶

func (w *Int96ColumnChunkWriter) FlushBufferedDataPages() (err error)

func (*Int96ColumnChunkWriter) FlushCurrentPage ¶

func (w *Int96ColumnChunkWriter) FlushCurrentPage() error

func (*Int96ColumnChunkWriter) HasBitsBuffer ¶

func (w *Int96ColumnChunkWriter) HasBitsBuffer() bool

func (*Int96ColumnChunkWriter) LevelInfo ¶

func (w *Int96ColumnChunkWriter) LevelInfo() LevelInfo

func (*Int96ColumnChunkWriter) Properties ¶

func (w *Int96ColumnChunkWriter) Properties() *parquet.WriterProperties

func (*Int96ColumnChunkWriter) RowsWritten ¶

func (w *Int96ColumnChunkWriter) RowsWritten() int

func (*Int96ColumnChunkWriter) SetBitsBuffer ¶

func (w *Int96ColumnChunkWriter) SetBitsBuffer(buf *memory.Buffer)

func (*Int96ColumnChunkWriter) TotalBytesWritten ¶

func (w *Int96ColumnChunkWriter) TotalBytesWritten() int64

func (*Int96ColumnChunkWriter) TotalCompressedBytes ¶

func (w *Int96ColumnChunkWriter) TotalCompressedBytes() int64

func (*Int96ColumnChunkWriter) Type ¶

func (w *Int96ColumnChunkWriter) Type() parquet.Type

func (*Int96ColumnChunkWriter) WriteBatch ¶

func (w *Int96ColumnChunkWriter) WriteBatch(values []parquet.Int96, defLevels, repLevels []int16) (valueOffset int64, err error)

WriteBatch writes a batch of repetition levels, definition levels, and values to the column. `defLevels` (resp. `repLevels`) can be nil if the column's max definition level (resp. max repetition level) is 0. If not nil, each of `defLevels` and `repLevels` must have at least `len(values)` entries.

The number of physical values written (taken from `values`) is returned. It can be smaller than `len(values)` if there are some undefined values.

When using DataPageV2 to write a repeated column rows cannot cross data page boundaries. To ensure this the writer ensures that every batch of w.props.BatchSize begins and ends on a row boundary. As a consequence, the first value to WriteBatch must always be the beginning of a row if repLevels is not nil (repLevels[0] should always be 0) and using DataPageV2.

func (*Int96ColumnChunkWriter) WriteBatchSpaced ¶

func (w *Int96ColumnChunkWriter) WriteBatchSpaced(values []parquet.Int96, defLevels, repLevels []int16, validBits []byte, validBitsOffset int64)

WriteBatchSpaced writes a batch of repetition levels, definition levels, and values to the column.

In comparison to WriteBatch, the length of the repetition and definition levels is the same as the number of values read for max_definition_level == 1. In the case of max_definition_level > 1, the repetition and definition levels are larger than the values but the values include the null entries with definition_level == (max_definition_level - 1). Thus we have to differentiate in the parameters of this function whether the input has the length of num_values or the _number of rows in the lowest nesting level_.

In the case that the innermost node in the Parquet schema is required, the _number of rows in the lowest nesting level_ is equal to the number of non-null values. If the innermost schema node is optional, the _number of rows in the lowest nesting level_ also includes all values with definition_level == (max_definition_level - 1).

func (*Int96ColumnChunkWriter) WriteDataPage ¶

func (w *Int96ColumnChunkWriter) WriteDataPage(page DataPage) error

func (*Int96ColumnChunkWriter) WriteDefinitionLevels ¶

func (w *Int96ColumnChunkWriter) WriteDefinitionLevels(levels []int16)

func (*Int96ColumnChunkWriter) WriteDictionaryPage ¶

func (w *Int96ColumnChunkWriter) WriteDictionaryPage() error

func (*Int96ColumnChunkWriter) WriteRepetitionLevels ¶

func (w *Int96ColumnChunkWriter) WriteRepetitionLevels(levels []int16)

type LevelInfo ¶

type LevelInfo struct {
	// How many slots an undefined but present (i.e. null) element in
	// parquet consumes when decoding to Arrow.
	// "Slot" is used in the same context as the Arrow specification
	// (i.e. a value holder).
	// This is only ever >1 for descendants of FixedSizeList.
	NullSlotUsage int32
	// The definition level at which the value for the field
	// is considered not null (definition levels greater than
	// or equal to this value indicate a not-null
	// value for the field). For list fields definition levels
	// greater than or equal to this field indicate a present,
	// possibly null, child value.
	DefLevel int16
	// The repetition level corresponding to this element
	// or the closest repeated ancestor.  Any repetition
	// level less than this indicates either a new list OR
	// an empty list (which is determined in conjunction
	// with definition levels).
	RepLevel int16
	// The definition level indicating the level at which the closest
	// repeated ancestor is not empty.  This is used to discriminate
	// between a value less than |def_level| being null or excluded entirely.
	// For instance if we have an arrow schema like:
	// list(struct(f0: int)).  Then there are the following
	// definition levels:
	//   0 = null list
	//   1 = present but empty list.
	//   2 = a null value in the list
	//   3 = a non null struct but null integer.
	//   4 = a present integer.
	// When reconstructing, the struct and integer arrays'
	// repeated_ancestor_def_level would be 2.  Any
	// def_level < 2 indicates that there isn't a corresponding
	// child value in the list.
	// i.e. [null, [], [null], [{f0: null}], [{f0: 1}]]
	// has the def levels [0, 1, 2, 3, 4].  The actual
	// struct array is only of length 3: [not-set, set, set] and
	// the int array is also of length 3: [N/A, null, 1].
	RepeatedAncestorDefLevel int16
}

func (*LevelInfo) Equal ¶

func (l *LevelInfo) Equal(rhs *LevelInfo) bool

func (*LevelInfo) HasNullableValues ¶

func (l *LevelInfo) HasNullableValues() bool

func (*LevelInfo) Increment ¶

func (l *LevelInfo) Increment(n schema.Node)

func (*LevelInfo) IncrementOptional ¶

func (l *LevelInfo) IncrementOptional()

func (*LevelInfo) IncrementRepeated ¶

func (l *LevelInfo) IncrementRepeated() int16

type Page ¶

type Page interface {
	// Returns which kind of page this is
	Type() format.PageType
	// Get the raw bytes of this page
	Data() []byte
	// return the encoding used for this page, Plain/RLE, etc.
	Encoding() format.Encoding
	// get the number of values in this page
	NumValues() int32
	// release this page object back into the page pool for re-use
	Release()
}

Page is an interface for handling DataPages or Dictionary Pages

type PageReader ¶

type PageReader interface {
	// Set the maximum Page header size allowed to be read
	SetMaxPageHeaderSize(int)
	// Return the current page, or nil if there are no more
	Page() Page
	// Fetch the next page, returns false if there are no more pages
	Next() bool
	// if Next returns false, Err will return the error encountered or
	// nil if there was no error and you just hit the end of the page
	Err() error
	// Reset allows reusing a page reader
	Reset(r parquet.BufferedReader, nrows int64, compressType compress.Compression, ctx *CryptoContext)
}

PageReader is the interface used by the columnreader in order to read and handle DataPages and loop through them.

func NewPageReader ¶

func NewPageReader(r parquet.BufferedReader, nrows int64, compressType compress.Compression, mem memory.Allocator, ctx *CryptoContext) (PageReader, error)

NewPageReader returns a page reader for the data which can be read from the provided reader and compression.

type PageWriter ¶

type PageWriter interface {
	// Closes the current page, flushing any buffered data pages/dictionary pages
	// based on the input parameters. Subsequent calls have no effect.
	Close(hasDict, fallback bool) error
	// Write the provided datapage out to the underlying writer
	WriteDataPage(page DataPage) (int64, error)
	// Write the provided dictionary page out to the underlying writer
	WriteDictionaryPage(page *DictionaryPage) (int64, error)
	// returns true if there is a configured compressor for the data
	HasCompressor() bool
	// use the configured compressor and writer properties to compress the data in src
	// using the buffer buf. Returns the slice of the compressed bytes which may be
	// the bytes in the provided buffer
	Compress(buf *bytes.Buffer, src []byte) []byte
	// Allow reuse of the pagewriter object by resetting it using these values instead
	// of having to create a new object.
	Reset(sink utils.WriterTell, codec compress.Compression, compressionLevel int, metadata *metadata.ColumnChunkMetaDataBuilder, rgOrdinal, columnOrdinal int16, metaEncryptor, dataEncryptor encryption.Encryptor) error
}

PageWriter is the interface for both serialized and buffered page writers

func NewPageWriter ¶

func NewPageWriter(sink utils.WriterTell, codec compress.Compression, compressionLevel int, metadata *metadata.ColumnChunkMetaDataBuilder, rowGroupOrdinal, columnChunkOrdinal int16, mem memory.Allocator, buffered bool, metaEncryptor, dataEncryptor encryption.Encryptor) (PageWriter, error)

NewPageWriter returns a page writer using either the buffered or serialized implementations

type ReadOption ¶

type ReadOption func(*Reader)

func WithMetadata ¶

func WithMetadata(m *metadata.FileMetaData) ReadOption

WithMetadata allows providing a specific FileMetaData object rather than reading the file metadata from the file itself.

func WithReadProps ¶

func WithReadProps(props *parquet.ReaderProperties) ReadOption

WithReadProps specifies a specific reader properties instance to use, rather than using the default ReaderProperties.

type Reader ¶

type Reader struct {
	// contains filtered or unexported fields
}

Reader is the main interface for reading a parquet file

func NewParquetReader ¶

func NewParquetReader(r parquet.ReaderAtSeeker, opts ...ReadOption) (*Reader, error)

NewParquetReader returns a FileReader instance that reads a parquet file which can be read from r. This reader needs to support Read, ReadAt and Seeking.

If no read properties are provided then the default ReaderProperties will be used. The WithMetadata option can be used to provide a FileMetaData object rather than reading the file metadata from the file.

func OpenParquetFile ¶

func OpenParquetFile(filename string, memoryMap bool, opts ...ReadOption) (*Reader, error)

OpenParquetFile will return a Reader for the given parquet file on the local file system.

Optionally the file can be memory mapped for faster reading. If no read properties are provided then the default ReaderProperties will be used. The WithMetadata option can be used to provide a FileMetaData object rather than reading the file metadata from the file.

func (*Reader) BufferPool ¶

func (f *Reader) BufferPool() *sync.Pool

BufferPool returns the internal buffer pool being utilized by this reader. This is primarily for use by the pqarrow.FileReader or anything that builds on top of the Reader and constructs their own ColumnReaders (like the RecordReader)

func (*Reader) Close ¶

func (f *Reader) Close() error

Close will close the current reader, and if the underlying reader being used is an `io.Closer` then Close will be called on it too.

func (*Reader) MetaData ¶

func (f *Reader) MetaData() *metadata.FileMetaData

MetaData returns the underlying FileMetadata object

func (*Reader) NumRowGroups ¶

func (f *Reader) NumRowGroups() int

NumRowGroups returns the total number of row groups in this file.

func (*Reader) NumRows ¶

func (f *Reader) NumRows() int64

NumRows returns the total number of rows in this parquet file.

func (*Reader) RowGroup ¶

func (f *Reader) RowGroup(i int) *RowGroupReader

RowGroup returns a reader for the desired (0-based) row group

func (*Reader) WriterVersion ¶

func (f *Reader) WriterVersion() *metadata.AppVersion

WriterVersion returns the Application Version that was written in the file metadata

type RecordReader ¶

type RecordReader interface {
	// DefLevels returns the current crop of definition levels for this record
	DefLevels() []int16
	// LevelsPos is the number of definition / repetition levels (from the decoded ones)
	// which the reader has already consumed.
	LevelsPos() int64
	// RepLevels returns the current decoded repetition levels
	RepLevels() []int16
	// Reset resets the state, clearing consumed values and repetition/definition
	// levels as the result of calling ReadRecords
	Reset()
	// Reserve pre-allocates space for data
	Reserve(int64) error
	// HasMore returns true if there is more internal data which hasn't been
	// processed yet.
	HasMore() bool
	// ReadRecords attempts to read the provided number of records from the
	// column chunk, returning the number of records read and any error.
	ReadRecords(num int64) (int64, error)
	// ValuesWritten is the number of values written internally including any nulls
	ValuesWritten() int
	// ReleaseValidBits transfers the buffer of bits for the validity bitmap
	// to the caller, subsequent calls will allocate a new one in the reader.
	ReleaseValidBits() *memory.Buffer
	// ReleaseValues transfers the buffer of data with the values to the caller,
	// a new buffer will be allocated on subsequent calls.
	ReleaseValues() *memory.Buffer
	// NullCount returns the number of nulls decoded
	NullCount() int64
	// Type returns the parquet physical type of the column
	Type() parquet.Type
	// Values returns the decoded data buffer, including any nulls, without
	// transferring ownership
	Values() []byte
	// SetPageReader allows progressing to the next column chunk while reusing
	// this record reader by providing the page reader for the next chunk.
	SetPageReader(PageReader)
	// Retain increments the ref count by one
	Retain()
	// Release decrements the ref count by one, releasing the internal buffers when
	// the ref count is 0.
	Release()
}

RecordReader is an interface for reading entire records/rows at a time from a parquet file for both flat and nested columns, properly delimiting semantic records according to the definition and repetition levels.

func NewRecordReader ¶

func NewRecordReader(descr *schema.Column, info LevelInfo, readDict bool, mem memory.Allocator, bufferPool *sync.Pool) RecordReader

type RowGroupReader ¶

type RowGroupReader struct {
	// contains filtered or unexported fields
}

RowGroupReader is the primary interface for reading a single row group

func (*RowGroupReader) ByteSize ¶

func (r *RowGroupReader) ByteSize() int64

ByteSize returns the full byte size of this row group as defined in its metadata

func (*RowGroupReader) Column ¶

func (r *RowGroupReader) Column(i int) (ColumnChunkReader, error)

Column returns a column reader for the requested (0-indexed) column

panics if passed a column not in the range [0, NumColumns)

func (*RowGroupReader) GetColumnPageReader ¶

func (r *RowGroupReader) GetColumnPageReader(i int) (PageReader, error)

func (*RowGroupReader) MetaData ¶

func (r *RowGroupReader) MetaData() *metadata.RowGroupMetaData

MetaData returns the metadata of the current Row Group

func (*RowGroupReader) NumColumns ¶

func (r *RowGroupReader) NumColumns() int

NumColumns returns the number of columns of data as defined in the metadata of this row group

func (*RowGroupReader) NumRows ¶

func (r *RowGroupReader) NumRows() int64

NumRows returns the number of rows in just this row group

type RowGroupWriter ¶

type RowGroupWriter interface {
	// Returns the number of columns for this row group writer
	NumColumns() int
	// returns the current number of rows that have been written.
	// Returns an error if they are unequal between columns that have been written so far
	NumRows() (int, error)
	// The total compressed bytes so far
	TotalCompressedBytes() int64
	// the total bytes written and flushed out
	TotalBytesWritten() int64
	// Closes any unclosed column writers and closes the row group, writing out
	// the metadata. Subsequent calls have no effect.
	// Returns an error if columns contain unequal numbers of rows.
	Close() error
	// Buffered returns true if it's a BufferedRowGroupWriter and false for a
	// SerialRowGroupWriter
	Buffered() bool
}

RowGroupWriter is the base interface for writing row groups; the actual writer will be either the SerialRowGroupWriter or the BufferedRowGroupWriter.

type SerialRowGroupWriter ¶

type SerialRowGroupWriter interface {
	RowGroupWriter
	NextColumn() (ColumnChunkWriter, error)
	// returns the current column being built. If buffered, it will equal NumColumns;
	// if serialized, it will return which column is currently being written
	CurrentColumn() int
}

SerialRowGroupWriter expects each column to be written one after the other. Data is flushed every time NextColumn is called, and the writer will panic if there is an unequal number of rows written per column.

type ValidityBitmapInputOutput ¶

type ValidityBitmapInputOutput struct {
	// Input only.
	// The maximum number of values expected to be read (the actual
	// number of values read must be less than or equal to this value).
	// If this number is exceeded, methods will fail (by returning an
	// error or panicking). Exceeding this limit indicates
	// either a corrupt or incorrectly written file.
	ReadUpperBound int64
	// Output only. The number of values read
	// (this is logically the count of the number of elements
	// for an Arrow array).
	Read int64
	// Input/Output. The number of nulls encountered.
	NullCount int64
	// Output only. The validity bitmap to populate. May be nil only
	// for DefRepLevelsToListInfo (if all that is needed is list offsets).
	ValidBits []byte
	// Input only, offset into ValidBits to start at.
	ValidBitsOffset int64
}

Input/Output structure for reconstructed validity bitmaps.

type WriteOption ¶

type WriteOption func(*Writer)

func WithWriteMetadata ¶

func WithWriteMetadata(meta metadata.KeyValueMetadata) WriteOption

func WithWriterProps ¶

func WithWriterProps(props *parquet.WriterProperties) WriteOption

type Writer ¶

type Writer struct {

	// The Schema of this writer
	Schema *schema.Schema
	// The current FileMetadata to write
	FileMetadata *metadata.FileMetaData
	// The current keyvalue metadata
	KeyValueMetadata metadata.KeyValueMetadata
	// contains filtered or unexported fields
}

Writer is the primary interface for writing a parquet file

func NewParquetWriter ¶

func NewParquetWriter(w io.Writer, sc *schema.GroupNode, opts ...WriteOption) *Writer

NewParquetWriter returns a Writer that writes to the provided io.Writer with the given schema.

If no writer properties are supplied via WithWriterProps, the default writer properties will be used. If key-value metadata is supplied via WithWriteMetadata, it will be added to the file.

func (*Writer) AppendBufferedRowGroup ¶

func (fw *Writer) AppendBufferedRowGroup() BufferedRowGroupWriter

AppendBufferedRowGroup appends a rowgroup to the file and returns a writer that buffers the row group in memory allowing writing multiple columns at once to the row group. Data is not flushed out until the row group is closed.

When calling Close, all columns must have the same number of rows written.

func (*Writer) AppendRowGroup ¶

func (fw *Writer) AppendRowGroup() SerialRowGroupWriter

AppendRowGroup appends a row group to the file and returns a writer that writes columns to the row group in serial via calling NextColumn.

When calling NextColumn, the same number of rows need to have been written to each column before moving on. Otherwise the rowgroup writer will panic.

func (*Writer) Close ¶

func (fw *Writer) Close() error

Close closes any open row group writer and writes the file footer. Subsequent calls to close will have no effect.

func (*Writer) NumColumns ¶

func (fw *Writer) NumColumns() int

NumColumns returns the number of columns to write as defined by the schema.

func (*Writer) NumRowGroups ¶

func (fw *Writer) NumRowGroups() int

NumRowGroups returns the current number of row groups that will be written for this file.

func (*Writer) NumRows ¶

func (fw *Writer) NumRows() int

NumRows returns the current number of rows that have been written

func (*Writer) Properties ¶

func (fw *Writer) Properties() *parquet.WriterProperties

Properties returns the writer properties that are in use for this file.

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL