dynparquet

package
v0.0.0-...-969a4e3 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jun 16, 2022 License: Apache-2.0 Imports: 8 Imported by: 0

Documentation

Index

Constants

View Source
const (
	// The size of the column indices in parquet files.
	ColumnIndexSize = 16
)
View Source
const (
	DynamicColumnsKey = "dynamic_columns"
)

Variables

View Source
var ErrMalformedDynamicColumns = errors.New("malformed dynamic columns string")
View Source
var ErrNoDynamicColumns = errors.New("no dynamic columns metadata found, it must be present")

Functions

func FieldByName

func FieldByName(schema *parquet.Schema, name string) parquet.Field

func FindChildIndex

func FindChildIndex(schema *parquet.Schema, name string) int

func ValuesForIndex

func ValuesForIndex(row parquet.Row, index int) []parquet.Value

Types

type Buffer

type Buffer struct {
	// contains filtered or unexported fields
}

Buffer represents a batch of rows with a concrete set of dynamic column names representing how its parquet schema was created off of a dynamic parquet schema.

func (*Buffer) Clone

func (b *Buffer) Clone() (*Buffer, error)

func (*Buffer) ColumnChunks

func (b *Buffer) ColumnChunks() []parquet.ColumnChunk

ColumnChunks returns the list of parquet.ColumnChunk of the buffer. Each chunk contains all the pages associated with one of this row group's columns. Implements the parquet.RowGroup interface.

func (*Buffer) DynamicColumns

func (b *Buffer) DynamicColumns() map[string][]string

DynamicColumns returns the concrete dynamic column names of the buffer. It implements the DynamicRowGroup interface.

func (*Buffer) DynamicRows

func (b *Buffer) DynamicRows() DynamicRowReader

DynamicRows returns an iterator for the rows in the buffer. It implements the DynamicRowGroup interface.

func (*Buffer) NumRows

func (b *Buffer) NumRows() int64

NumRows returns the number of rows in the buffer. Implements the parquet.RowGroup interface.

func (*Buffer) Rows

func (b *Buffer) Rows() parquet.Rows

Rows returns an iterator for the rows in the buffer. It implements the parquet.RowGroup interface.

func (*Buffer) Schema

func (b *Buffer) Schema() *parquet.Schema

Schema returns the concrete parquet.Schema of the buffer. Implements the parquet.RowGroup interface.

func (*Buffer) Sort

func (b *Buffer) Sort()

func (*Buffer) SortingColumns

func (b *Buffer) SortingColumns() []parquet.SortingColumn

SortingColumns returns the concrete slice of parquet.SortingColumns of the buffer. Implements the parquet.RowGroup interface.

func (*Buffer) WriteRowGroup

func (b *Buffer) WriteRowGroup(rg parquet.RowGroup) (int64, error)

WriteRowGroup writes the rows of the given row group to the buffer.

func (*Buffer) WriteRows

func (b *Buffer) WriteRows(rows []parquet.Row) (int, error)

WriteRows writes the given rows to the buffer.

type ColumnDefinition

type ColumnDefinition struct {
	Name          string
	StorageLayout parquet.Node
	Dynamic       bool
}

ColumnDefinition describes a column in a dynamic parquet schema.

type DynamicRow

type DynamicRow struct {
	Row            parquet.Row
	Schema         *parquet.Schema
	DynamicColumns map[string][]string
}

type DynamicRowGroup

type DynamicRowGroup interface {
	parquet.RowGroup
	// DynamicColumns returns the concrete dynamic column names that were used
	// to create its concrete parquet schema with a dynamic parquet schema.
	DynamicColumns() map[string][]string
	// DynamicRows returns an iterator over the rows in the row group.
	DynamicRows() DynamicRowReader
}

DynamicRowGroup is a parquet.RowGroup that can describe the concrete dynamic columns.

func Concat

func Concat(drg ...DynamicRowGroup) DynamicRowGroup

type DynamicRowReader

type DynamicRowReader interface {
	ReadRows(*DynamicRows) (int, error)
	Close() error
}

DynamicRowReader is an iterator over the rows in a DynamicRowGroup.

type DynamicRows

type DynamicRows struct {
	Rows           []parquet.Row
	Schema         *parquet.Schema
	DynamicColumns map[string][]string
}

func (*DynamicRows) Get

func (r *DynamicRows) Get(i int) *DynamicRow

func (*DynamicRows) GetCopy

func (r *DynamicRows) GetCopy(i int) *DynamicRow

type Label

type Label struct {
	Name  string
	Value string
}

type MergedRowGroup

type MergedRowGroup struct {
	parquet.RowGroup
	DynCols map[string][]string
}

MergedRowGroup allows wrapping any parquet.RowGroup to implement the DynamicRowGroup interface by specifying the concrete dynamic column names the RowGroup's schema contains.

func (*MergedRowGroup) DynamicColumns

func (r *MergedRowGroup) DynamicColumns() map[string][]string

DynamicColumns returns the concrete dynamic column names that were used to create its concrete parquet schema with a dynamic parquet schema. Implements the DynamicRowGroup interface.

func (*MergedRowGroup) DynamicRows

func (r *MergedRowGroup) DynamicRows() DynamicRowReader

DynamicRows returns an iterator over the rows in the row group. Implements the DynamicRowGroup interface.

type NilColumnChunk

type NilColumnChunk struct {
	// contains filtered or unexported fields
}

NilColumnChunk is a column chunk that contains a single page with all null values of the given type, given length and column index of the parent schema. It implements the parquet.ColumnChunk interface.

func NewNilColumnChunk

func NewNilColumnChunk(typ parquet.Type, columnIndex, numValues int) *NilColumnChunk

NewNilColumnChunk creates a new column chunk configured with the given type, column index and number of values in the page.

func (*NilColumnChunk) BloomFilter

func (c *NilColumnChunk) BloomFilter() parquet.BloomFilter

BloomFilter returns the bloomfilter of the column chunk. Since the NilColumnChunk is a virtual column chunk only for in-memory purposes, it returns nil. Implements the parquet.ColumnChunk interface.

func (*NilColumnChunk) Column

func (c *NilColumnChunk) Column() int

Column returns the index of the column chunk within the parent schema. Implements the parquet.ColumnChunk interface.

func (*NilColumnChunk) ColumnIndex

func (c *NilColumnChunk) ColumnIndex() parquet.ColumnIndex

ColumnIndex returns the column index of the column chunk. Since the NilColumnChunk is a virtual column chunk only for in-memory purposes, it returns nil. Implements the parquet.ColumnChunk interface.

func (*NilColumnChunk) NumValues

func (c *NilColumnChunk) NumValues() int64

NumValues returns the number of values in the column chunk. Implements the parquet.ColumnChunk interface.

func (*NilColumnChunk) OffsetIndex

func (c *NilColumnChunk) OffsetIndex() parquet.OffsetIndex

OffsetIndex returns the offset index of the column chunk. Since the NilColumnChunk is a virtual column chunk only for in-memory purposes, it returns nil. Implements the parquet.ColumnChunk interface.

func (*NilColumnChunk) Pages

func (c *NilColumnChunk) Pages() parquet.Pages

Pages returns an iterator for all pages within the column chunk. This iterator will only ever return a single page filled with all null values of the configured amount. Implements the parquet.ColumnChunk interface.

func (*NilColumnChunk) Type

func (c *NilColumnChunk) Type() parquet.Type

Type returns the type of the column chunk. Implements the parquet.ColumnChunk interface.

type Sample

type Sample struct {
	ExampleType string
	Labels      []Label
	Stacktrace  []uuid.UUID
	Timestamp   int64
	Value       int64
}

func (Sample) ToParquetRow

func (s Sample) ToParquetRow(labelNames []string) parquet.Row

type Samples

type Samples []Sample

func NewTestSamples

func NewTestSamples() Samples

func (Samples) SampleLabelNames

func (s Samples) SampleLabelNames() []string

func (Samples) ToBuffer

func (s Samples) ToBuffer(schema *Schema) (*Buffer, error)

type Schema

type Schema struct {
	// contains filtered or unexported fields
}

Schema is a dynamic parquet schema. It extends a parquet schema so that, for any column definition that is dynamic, concrete columns are created dynamically the first time their column name is seen.

func NewSampleSchema

func NewSampleSchema() *Schema

func NewSchema

func NewSchema(
	name string,
	columns []ColumnDefinition,
	sortingColumns []SortingColumn,
) *Schema

NewSchema creates a new dynamic parquet schema with the given name, column definitions and sorting columns. The order of the sorting columns is important as it determines the order in which data is written to a file or laid out in memory.

func (*Schema) ColumnByName

func (s *Schema) ColumnByName(name string) (ColumnDefinition, bool)

func (*Schema) Columns

func (s *Schema) Columns() []ColumnDefinition

func (*Schema) MergeDynamicRowGroups

func (s *Schema) MergeDynamicRowGroups(rowGroups []DynamicRowGroup) (DynamicRowGroup, error)

MergeDynamicRowGroups merges the given dynamic row groups into a single dynamic row group. It merges the parquet schema in a non-conflicting way by merging all the concrete dynamic column names and generating a superset parquet schema that all given dynamic row groups are compatible with.

func (*Schema) NewBuffer

func (s *Schema) NewBuffer(dynamicColumns map[string][]string) (*Buffer, error)

NewBuffer returns a new buffer with a concrete parquet schema generated using the given concrete dynamic column names.

func (*Schema) NewWriter

func (s *Schema) NewWriter(w io.Writer, dynamicColumns map[string][]string) (*parquet.Writer, error)

NewWriter returns a new parquet writer with a concrete parquet schema generated using the given concrete dynamic column names.

func (*Schema) RowLessThan

func (s *Schema) RowLessThan(a, b *DynamicRow) bool

func (*Schema) SerializeBuffer

func (s *Schema) SerializeBuffer(buffer *Buffer) ([]byte, error)

SerializeBuffer serializes the given buffer and returns the result as a byte slice.

type SerializedBuffer

type SerializedBuffer struct {
	// contains filtered or unexported fields
}

func NewSerializedBuffer

func NewSerializedBuffer(f *parquet.File) (*SerializedBuffer, error)

func ReaderFromBytes

func ReaderFromBytes(buf []byte) (*SerializedBuffer, error)

func (*SerializedBuffer) DynamicColumns

func (b *SerializedBuffer) DynamicColumns() map[string][]string

func (*SerializedBuffer) DynamicRowGroup

func (b *SerializedBuffer) DynamicRowGroup(i int) DynamicRowGroup

func (*SerializedBuffer) DynamicRows

func (b *SerializedBuffer) DynamicRows() DynamicRowReader

func (*SerializedBuffer) NumRowGroups

func (b *SerializedBuffer) NumRowGroups() int

func (*SerializedBuffer) NumRows

func (b *SerializedBuffer) NumRows() int64

func (*SerializedBuffer) ParquetFile

func (b *SerializedBuffer) ParquetFile() *parquet.File

func (*SerializedBuffer) Reader

func (b *SerializedBuffer) Reader() *parquet.Reader

type SortingColumn

type SortingColumn interface {
	parquet.SortingColumn
	ColumnName() string
}

SortingColumn describes a column to sort by in a dynamic parquet schema.

func Ascending

func Ascending(column string) SortingColumn

Ascending constructs a SortingColumn value which dictates to sort by the column in ascending order.

func Descending

func Descending(column string) SortingColumn

Descending constructs a SortingColumn value which dictates to sort by the column in descending order.

func NullsFirst

func NullsFirst(sortingColumn SortingColumn) SortingColumn

NullsFirst wraps the SortingColumn passed as argument so that it instructs the row group to place null values first in the column.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL