Documentation ¶
Overview ¶
Package schema provides types and functions for manipulating and building parquet file schemas.
Some of the utilities provided include building a schema using Struct Tags on a struct type, getting Column Paths from a node, and dealing with the converted and logical types for Parquet.
Logical types specify ways to interpret the primitive types, which keeps the number of primitive types small and allows efficient encodings to be reused. For instance, a "string" is just a ByteArray column with a UTF-8 annotation, or "String Logical Type".
For more information about Logical and Converted Types, check: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md
Index ¶
- Variables
- func ColumnPathFromNode(n Node) parquet.ColumnPath
- func NewStructFromSchema(sc *Schema) (t reflect.Type, err error)
- func PrintSchema(n Node, w io.Writer, indentWidth int)
- func ToThrift(schema *GroupNode) []*format.SchemaElement
- type BSONLogicalType
- func (BSONLogicalType) Equals(rhs LogicalType) bool
- func (BSONLogicalType) IsApplicable(t parquet.Type, _ int32) bool
- func (BSONLogicalType) IsCompatible(c ConvertedType, dec DecimalMetadata) bool
- func (BSONLogicalType) IsNested() bool
- func (BSONLogicalType) IsNone() bool
- func (BSONLogicalType) IsSerialized() bool
- func (BSONLogicalType) IsValid() bool
- func (BSONLogicalType) MarshalJSON() ([]byte, error)
- func (BSONLogicalType) SortOrder() SortOrder
- func (BSONLogicalType) String() string
- func (BSONLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata)
- type Column
- func (c *Column) ColumnOrder() parquet.ColumnOrder
- func (c *Column) ColumnPath() parquet.ColumnPath
- func (c *Column) ConvertedType() ConvertedType
- func (c *Column) Equals(rhs *Column) bool
- func (c *Column) LogicalType() LogicalType
- func (c *Column) MaxDefinitionLevel() int16
- func (c *Column) MaxRepetitionLevel() int16
- func (c *Column) Name() string
- func (c *Column) Path() string
- func (c *Column) PhysicalType() parquet.Type
- func (c *Column) SchemaNode() Node
- func (c *Column) SortOrder() SortOrder
- func (c *Column) String() string
- func (c *Column) TypeLength() int
- type ConvertedType
- type DateLogicalType
- func (DateLogicalType) Equals(rhs LogicalType) bool
- func (DateLogicalType) IsApplicable(t parquet.Type, _ int32) bool
- func (DateLogicalType) IsCompatible(t ConvertedType, dec DecimalMetadata) bool
- func (DateLogicalType) IsNested() bool
- func (DateLogicalType) IsNone() bool
- func (DateLogicalType) IsSerialized() bool
- func (DateLogicalType) IsValid() bool
- func (DateLogicalType) MarshalJSON() ([]byte, error)
- func (DateLogicalType) SortOrder() SortOrder
- func (DateLogicalType) String() string
- func (DateLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata)
- type DecimalLogicalType
- func (t DecimalLogicalType) Equals(rhs LogicalType) bool
- func (t DecimalLogicalType) IsApplicable(typ parquet.Type, tlen int32) bool
- func (t DecimalLogicalType) IsCompatible(c ConvertedType, dec DecimalMetadata) bool
- func (DecimalLogicalType) IsNested() bool
- func (DecimalLogicalType) IsNone() bool
- func (DecimalLogicalType) IsSerialized() bool
- func (DecimalLogicalType) IsValid() bool
- func (t DecimalLogicalType) MarshalJSON() ([]byte, error)
- func (t DecimalLogicalType) Precision() int32
- func (t DecimalLogicalType) Scale() int32
- func (DecimalLogicalType) SortOrder() SortOrder
- func (t DecimalLogicalType) String() string
- func (t DecimalLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata)
- type DecimalMetadata
- type EnumLogicalType
- func (EnumLogicalType) Equals(rhs LogicalType) bool
- func (EnumLogicalType) IsApplicable(t parquet.Type, _ int32) bool
- func (EnumLogicalType) IsCompatible(t ConvertedType, dec DecimalMetadata) bool
- func (EnumLogicalType) IsNested() bool
- func (EnumLogicalType) IsNone() bool
- func (EnumLogicalType) IsSerialized() bool
- func (EnumLogicalType) IsValid() bool
- func (EnumLogicalType) MarshalJSON() ([]byte, error)
- func (EnumLogicalType) SortOrder() SortOrder
- func (EnumLogicalType) String() string
- func (EnumLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata)
- type FieldList
- type GroupNode
- func GroupNodeFromThrift(elem *format.SchemaElement, fields FieldList) (*GroupNode, error)
- func ListOf(n Node, rep parquet.Repetition, fieldID int32) (*GroupNode, error)
- func MapOf(name string, key Node, value Node, mapRep parquet.Repetition, fieldID int32) (*GroupNode, error)
- func MustGroup(n Node, err error) *GroupNode
- func NewGroupNode(name string, repetition parquet.Repetition, fields FieldList, fieldID int32) (*GroupNode, error)
- func NewGroupNodeConverted(name string, repetition parquet.Repetition, fields FieldList, ...) (n *GroupNode, err error)
- func NewGroupNodeLogical(name string, repetition parquet.Repetition, fields FieldList, ...) (n *GroupNode, err error)
- func (n *GroupNode) ConvertedType() ConvertedType
- func (g *GroupNode) Equals(rhs Node) bool
- func (g *GroupNode) Field(i int) Node
- func (n *GroupNode) FieldID() int32
- func (g *GroupNode) FieldIndexByField(n Node) int
- func (g *GroupNode) FieldIndexByName(name string) int
- func (g *GroupNode) HasRepeatedFields() bool
- func (n *GroupNode) LogicalType() LogicalType
- func (n *GroupNode) Name() string
- func (g *GroupNode) NumFields() int
- func (n *GroupNode) Parent() Node
- func (n *GroupNode) Path() string
- func (n *GroupNode) RepetitionType() parquet.Repetition
- func (n *GroupNode) SetParent(p Node)
- func (n *GroupNode) Type() NodeType
- func (g *GroupNode) Visit(v Visitor)
- type IntLogicalType
- func (t IntLogicalType) BitWidth() int8
- func (t IntLogicalType) Equals(rhs LogicalType) bool
- func (t IntLogicalType) IsApplicable(typ parquet.Type, _ int32) bool
- func (t IntLogicalType) IsCompatible(c ConvertedType, dec DecimalMetadata) bool
- func (IntLogicalType) IsNested() bool
- func (IntLogicalType) IsNone() bool
- func (IntLogicalType) IsSerialized() bool
- func (t IntLogicalType) IsSigned() bool
- func (IntLogicalType) IsValid() bool
- func (t IntLogicalType) MarshalJSON() ([]byte, error)
- func (t IntLogicalType) SortOrder() SortOrder
- func (t IntLogicalType) String() string
- func (t IntLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata)
- type IntervalLogicalType
- func (IntervalLogicalType) Equals(rhs LogicalType) bool
- func (IntervalLogicalType) IsApplicable(t parquet.Type, tlen int32) bool
- func (IntervalLogicalType) IsCompatible(c ConvertedType, dec DecimalMetadata) bool
- func (IntervalLogicalType) IsNested() bool
- func (IntervalLogicalType) IsNone() bool
- func (IntervalLogicalType) IsSerialized() bool
- func (IntervalLogicalType) IsValid() bool
- func (IntervalLogicalType) MarshalJSON() ([]byte, error)
- func (IntervalLogicalType) SortOrder() SortOrder
- func (IntervalLogicalType) String() string
- func (IntervalLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata)
- type JSONLogicalType
- func (JSONLogicalType) Equals(rhs LogicalType) bool
- func (JSONLogicalType) IsApplicable(t parquet.Type, _ int32) bool
- func (JSONLogicalType) IsCompatible(c ConvertedType, dec DecimalMetadata) bool
- func (JSONLogicalType) IsNested() bool
- func (JSONLogicalType) IsNone() bool
- func (JSONLogicalType) IsSerialized() bool
- func (JSONLogicalType) IsValid() bool
- func (JSONLogicalType) MarshalJSON() ([]byte, error)
- func (JSONLogicalType) SortOrder() SortOrder
- func (JSONLogicalType) String() string
- func (JSONLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata)
- type ListLogicalType
- func (ListLogicalType) Equals(rhs LogicalType) bool
- func (ListLogicalType) IsApplicable(parquet.Type, int32) bool
- func (ListLogicalType) IsCompatible(t ConvertedType, dec DecimalMetadata) bool
- func (ListLogicalType) IsNested() bool
- func (ListLogicalType) IsNone() bool
- func (ListLogicalType) IsSerialized() bool
- func (ListLogicalType) IsValid() bool
- func (ListLogicalType) MarshalJSON() ([]byte, error)
- func (ListLogicalType) SortOrder() SortOrder
- func (ListLogicalType) String() string
- func (ListLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata)
- type LogicalType
- func NewDecimalLogicalType(precision int32, scale int32) LogicalType
- func NewIntLogicalType(bitWidth int8, signed bool) LogicalType
- func NewListLogicalType() LogicalType
- func NewTimeLogicalType(isAdjustedToUTC bool, unit TimeUnitType) LogicalType
- func NewTimestampLogicalType(isAdjustedToUTC bool, unit TimeUnitType) LogicalType
- func NewTimestampLogicalTypeForce(isAdjustedToUTC bool, unit TimeUnitType) LogicalType
- type MapLogicalType
- func (MapLogicalType) Equals(rhs LogicalType) bool
- func (MapLogicalType) IsApplicable(parquet.Type, int32) bool
- func (MapLogicalType) IsCompatible(t ConvertedType, dec DecimalMetadata) bool
- func (MapLogicalType) IsNested() bool
- func (MapLogicalType) IsNone() bool
- func (MapLogicalType) IsSerialized() bool
- func (MapLogicalType) IsValid() bool
- func (MapLogicalType) MarshalJSON() ([]byte, error)
- func (MapLogicalType) SortOrder() SortOrder
- func (MapLogicalType) String() string
- func (MapLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata)
- type NoLogicalType
- func (NoLogicalType) Equals(rhs LogicalType) bool
- func (NoLogicalType) IsApplicable(parquet.Type, int32) bool
- func (NoLogicalType) IsCompatible(c ConvertedType, dec DecimalMetadata) bool
- func (NoLogicalType) IsNested() bool
- func (NoLogicalType) IsNone() bool
- func (NoLogicalType) IsSerialized() bool
- func (NoLogicalType) IsValid() bool
- func (NoLogicalType) MarshalJSON() ([]byte, error)
- func (NoLogicalType) SortOrder() SortOrder
- func (NoLogicalType) String() string
- func (NoLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata)
- type Node
- type NodeType
- type NullLogicalType
- func (NullLogicalType) Equals(rhs LogicalType) bool
- func (NullLogicalType) IsApplicable(parquet.Type, int32) bool
- func (NullLogicalType) IsCompatible(c ConvertedType, dec DecimalMetadata) bool
- func (NullLogicalType) IsNested() bool
- func (NullLogicalType) IsNone() bool
- func (NullLogicalType) IsSerialized() bool
- func (NullLogicalType) IsValid() bool
- func (NullLogicalType) MarshalJSON() ([]byte, error)
- func (NullLogicalType) SortOrder() SortOrder
- func (NullLogicalType) String() string
- func (NullLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata)
- type PrimitiveNode
- func MustPrimitive(n Node, err error) *PrimitiveNode
- func NewBooleanNode(name string, rep parquet.Repetition, fieldID int32) *PrimitiveNode
- func NewByteArrayNode(name string, rep parquet.Repetition, fieldID int32) *PrimitiveNode
- func NewFixedLenByteArrayNode(name string, rep parquet.Repetition, length int32, fieldID int32) *PrimitiveNode
- func NewFloat32Node(name string, rep parquet.Repetition, fieldID int32) *PrimitiveNode
- func NewFloat64Node(name string, rep parquet.Repetition, fieldID int32) *PrimitiveNode
- func NewInt32Node(name string, rep parquet.Repetition, fieldID int32) *PrimitiveNode
- func NewInt64Node(name string, rep parquet.Repetition, fieldID int32) *PrimitiveNode
- func NewInt96Node(name string, rep parquet.Repetition, fieldID int32) *PrimitiveNode
- func NewPrimitiveNode(name string, repetition parquet.Repetition, typ parquet.Type, ...) (*PrimitiveNode, error)
- func NewPrimitiveNodeConverted(name string, repetition parquet.Repetition, typ parquet.Type, ...) (*PrimitiveNode, error)
- func NewPrimitiveNodeLogical(name string, repetition parquet.Repetition, logicalType LogicalType, ...) (*PrimitiveNode, error)
- func PrimitiveNodeFromThrift(elem *format.SchemaElement) (*PrimitiveNode, error)
- func (n *PrimitiveNode) ConvertedType() ConvertedType
- func (p *PrimitiveNode) DecimalMetadata() DecimalMetadata
- func (p *PrimitiveNode) Equals(rhs Node) bool
- func (n *PrimitiveNode) FieldID() int32
- func (n *PrimitiveNode) LogicalType() LogicalType
- func (n *PrimitiveNode) Name() string
- func (n *PrimitiveNode) Parent() Node
- func (n *PrimitiveNode) Path() string
- func (p *PrimitiveNode) PhysicalType() parquet.Type
- func (n *PrimitiveNode) RepetitionType() parquet.Repetition
- func (n *PrimitiveNode) SetParent(p Node)
- func (p *PrimitiveNode) SetTypeLength(length int)
- func (n *PrimitiveNode) Type() NodeType
- func (p *PrimitiveNode) TypeLength() int
- func (p *PrimitiveNode) Visit(v Visitor)
- type Schema
- func (s *Schema) Column(i int) *Column
- func (s *Schema) ColumnIndexByName(nodePath string) int
- func (s *Schema) ColumnIndexByNode(n Node) int
- func (s *Schema) ColumnRoot(i int) Node
- func (s *Schema) Equals(rhs *Schema) bool
- func (s *Schema) HasRepeatedFields() bool
- func (s *Schema) NumColumns() int
- func (s *Schema) Root() *GroupNode
- func (s *Schema) String() string
- func (s *Schema) UpdateColumnOrders(orders []parquet.ColumnOrder) error
- type SortOrder
- type StringLogicalType
- func (StringLogicalType) Equals(rhs LogicalType) bool
- func (StringLogicalType) IsApplicable(t parquet.Type, _ int32) bool
- func (StringLogicalType) IsCompatible(t ConvertedType, dec DecimalMetadata) bool
- func (StringLogicalType) IsNested() bool
- func (StringLogicalType) IsNone() bool
- func (StringLogicalType) IsSerialized() bool
- func (StringLogicalType) IsValid() bool
- func (StringLogicalType) MarshalJSON() ([]byte, error)
- func (StringLogicalType) SortOrder() SortOrder
- func (StringLogicalType) String() string
- func (StringLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata)
- type TemporalLogicalType
- type TimeLogicalType
- func (t TimeLogicalType) Equals(rhs LogicalType) bool
- func (t TimeLogicalType) IsAdjustedToUTC() bool
- func (t TimeLogicalType) IsApplicable(typ parquet.Type, _ int32) bool
- func (t TimeLogicalType) IsCompatible(c ConvertedType, dec DecimalMetadata) bool
- func (TimeLogicalType) IsNested() bool
- func (TimeLogicalType) IsNone() bool
- func (TimeLogicalType) IsSerialized() bool
- func (TimeLogicalType) IsValid() bool
- func (t TimeLogicalType) MarshalJSON() ([]byte, error)
- func (TimeLogicalType) SortOrder() SortOrder
- func (t TimeLogicalType) String() string
- func (t TimeLogicalType) TimeUnit() TimeUnitType
- func (t TimeLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata)
- type TimeUnitType
- type TimestampLogicalType
- func (t TimestampLogicalType) Equals(rhs LogicalType) bool
- func (t TimestampLogicalType) IsAdjustedToUTC() bool
- func (TimestampLogicalType) IsApplicable(t parquet.Type, _ int32) bool
- func (t TimestampLogicalType) IsCompatible(c ConvertedType, dec DecimalMetadata) bool
- func (t TimestampLogicalType) IsFromConvertedType() bool
- func (TimestampLogicalType) IsNested() bool
- func (TimestampLogicalType) IsNone() bool
- func (t TimestampLogicalType) IsSerialized() bool
- func (TimestampLogicalType) IsValid() bool
- func (t TimestampLogicalType) MarshalJSON() ([]byte, error)
- func (TimestampLogicalType) SortOrder() SortOrder
- func (t TimestampLogicalType) String() string
- func (t TimestampLogicalType) TimeUnit() TimeUnitType
- func (t TimestampLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata)
- type UUIDLogicalType
- func (UUIDLogicalType) Equals(rhs LogicalType) bool
- func (UUIDLogicalType) IsApplicable(t parquet.Type, tlen int32) bool
- func (UUIDLogicalType) IsCompatible(c ConvertedType, dec DecimalMetadata) bool
- func (UUIDLogicalType) IsNested() bool
- func (UUIDLogicalType) IsNone() bool
- func (UUIDLogicalType) IsSerialized() bool
- func (UUIDLogicalType) IsValid() bool
- func (UUIDLogicalType) MarshalJSON() ([]byte, error)
- func (UUIDLogicalType) SortOrder() SortOrder
- func (UUIDLogicalType) String() string
- func (UUIDLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata)
- type UnknownLogicalType
- func (UnknownLogicalType) Equals(rhs LogicalType) bool
- func (UnknownLogicalType) IsApplicable(parquet.Type, int32) bool
- func (UnknownLogicalType) IsCompatible(c ConvertedType, dec DecimalMetadata) bool
- func (UnknownLogicalType) IsNested() bool
- func (UnknownLogicalType) IsNone() bool
- func (UnknownLogicalType) IsSerialized() bool
- func (UnknownLogicalType) IsValid() bool
- func (UnknownLogicalType) MarshalJSON() ([]byte, error)
- func (UnknownLogicalType) SortOrder() SortOrder
- func (UnknownLogicalType) String() string
- func (UnknownLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata)
- type Visitor
Examples ¶
Constants ¶
This section is empty.
Variables ¶
var ( // ConvertedTypes is a struct containing the constants for the types // to make it easy to reference them while making it clear what they are ConvertedTypes = struct { None ConvertedType UTF8 ConvertedType Map ConvertedType MapKeyValue ConvertedType List ConvertedType Enum ConvertedType Decimal ConvertedType Date ConvertedType TimeMillis ConvertedType TimeMicros ConvertedType TimestampMillis ConvertedType TimestampMicros ConvertedType Uint8 ConvertedType Uint16 ConvertedType Uint32 ConvertedType Uint64 ConvertedType Int8 ConvertedType Int16 ConvertedType Int32 ConvertedType Int64 ConvertedType JSON ConvertedType BSON ConvertedType Interval ConvertedType NA ConvertedType }{ None: -1, UTF8: ConvertedType(format.ConvertedType_UTF8), Map: ConvertedType(format.ConvertedType_MAP), MapKeyValue: ConvertedType(format.ConvertedType_MAP_KEY_VALUE), List: ConvertedType(format.ConvertedType_LIST), Enum: ConvertedType(format.ConvertedType_ENUM), Decimal: ConvertedType(format.ConvertedType_DECIMAL), Date: ConvertedType(format.ConvertedType_DATE), TimeMillis: ConvertedType(format.ConvertedType_TIME_MILLIS), TimeMicros: ConvertedType(format.ConvertedType_TIME_MICROS), TimestampMillis: ConvertedType(format.ConvertedType_TIMESTAMP_MILLIS), TimestampMicros: ConvertedType(format.ConvertedType_TIMESTAMP_MICROS), Uint8: ConvertedType(format.ConvertedType_UINT_8), Uint16: ConvertedType(format.ConvertedType_UINT_16), Uint32: ConvertedType(format.ConvertedType_UINT_32), Uint64: ConvertedType(format.ConvertedType_UINT_64), Int8: ConvertedType(format.ConvertedType_INT_8), Int16: ConvertedType(format.ConvertedType_INT_16), Int32: ConvertedType(format.ConvertedType_INT_32), Int64: ConvertedType(format.ConvertedType_INT_64), JSON: ConvertedType(format.ConvertedType_JSON), BSON: ConvertedType(format.ConvertedType_BSON), Interval: ConvertedType(format.ConvertedType_INTERVAL), NA: 24, } )
Functions ¶
func ColumnPathFromNode ¶
func ColumnPathFromNode(n Node) parquet.ColumnPath
ColumnPathFromNode walks the parents of the given node to construct its column path.
func NewStructFromSchema ¶
NewStructFromSchema generates a struct type as a reflect.Type from the schema by using the appropriate physical types and making things either pointers or slices based on whether they are repeated/optional/required. It does not use the logical or converted types to change the physical storage so that it is more efficient to use the resulting type for reading without having to do conversions.
It will use maps for map types and slices for list types, but otherwise ignores the converted and logical types of the nodes. Group nodes that are not List or Map will be nested structs.
func PrintSchema ¶
PrintSchema writes a string representation of the tree to w using the indent width provided.
Example ¶
package main import ( "os" "github.com/apache/arrow/go/v10/parquet" "github.com/apache/arrow/go/v10/parquet/schema" ) func main() { fields := schema.FieldList{schema.NewInt32Node("a" /* name */, parquet.Repetitions.Required, 1 /* fieldID */)} item1 := schema.NewInt64Node("item1" /* name */, parquet.Repetitions.Optional, 4 /* fieldID */) item2 := schema.NewBooleanNode("item2" /* name */, parquet.Repetitions.Required, 5 /* fieldID */) list := schema.MustGroup(schema.NewGroupNodeConverted("b" /* name */, parquet.Repetitions.Repeated, schema.FieldList{item1, item2}, schema.ConvertedTypes.List, 3 /* fieldID */)) bag := schema.MustGroup(schema.NewGroupNode("bag" /* name */, parquet.Repetitions.Optional, schema.FieldList{list}, 2 /* fieldID */)) fields = append(fields, bag) fields = append(fields, schema.MustPrimitive(schema.NewPrimitiveNodeConverted("c" /* name */, parquet.Repetitions.Required, parquet.Types.Int32, schema.ConvertedTypes.Decimal, 0 /* type len */, 3 /* precision */, 2 /* scale */, 6 /* fieldID */)), schema.MustPrimitive(schema.NewPrimitiveNodeLogical("d" /* name */, parquet.Repetitions.Required, schema.NewDecimalLogicalType(10 /* precision */, 5 /* scale */), parquet.Types.Int64, -1 /* type len */, 7 /* fieldID */))) sc := schema.MustGroup(schema.NewGroupNode("schema" /* name */, parquet.Repetitions.Repeated, fields, 0 /* fieldID */)) schema.PrintSchema(sc, os.Stdout, 2) }
Output: repeated group field_id=0 schema { required int32 field_id=1 a; optional group field_id=2 bag { repeated group field_id=3 b (List) { optional int64 field_id=4 item1; required boolean field_id=5 item2; } } required int32 field_id=6 c (Decimal(precision=3, scale=2)); required int64 field_id=7 d (Decimal(precision=10, scale=5)); }
func ToThrift ¶
func ToThrift(schema *GroupNode) []*format.SchemaElement
ToThrift converts a GroupNode to a slice of SchemaElements which is used for thrift serialization.
Types ¶
type BSONLogicalType ¶
type BSONLogicalType struct {
// contains filtered or unexported fields
}
BSONLogicalType represents a binary JSON string in the byte array
func (BSONLogicalType) Equals ¶
func (BSONLogicalType) Equals(rhs LogicalType) bool
func (BSONLogicalType) IsApplicable ¶
func (BSONLogicalType) IsApplicable(t parquet.Type, _ int32) bool
func (BSONLogicalType) IsCompatible ¶
func (BSONLogicalType) IsCompatible(c ConvertedType, dec DecimalMetadata) bool
func (BSONLogicalType) IsSerialized ¶
func (BSONLogicalType) IsSerialized() bool
func (BSONLogicalType) MarshalJSON ¶
func (BSONLogicalType) MarshalJSON() ([]byte, error)
func (BSONLogicalType) SortOrder ¶
func (BSONLogicalType) SortOrder() SortOrder
func (BSONLogicalType) String ¶
func (BSONLogicalType) String() string
func (BSONLogicalType) ToConvertedType ¶
func (BSONLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata)
type Column ¶
type Column struct {
// contains filtered or unexported fields
}
Column encapsulates the information necessary to interpret primitive column data in the context of a particular schema. We have to examine the node structure of a column's path to the root in the schema tree to be able to reassemble the nested structure from the repetition and definition levels.
func NewColumn ¶
func NewColumn(n *PrimitiveNode, maxDefinitionLvl, maxRepetitionLvl int16) *Column
NewColumn returns a new column object for the given node with the provided maximum definition and repetition levels.
func (*Column) ColumnOrder ¶
func (c *Column) ColumnOrder() parquet.ColumnOrder
func (*Column) ColumnPath ¶
func (c *Column) ColumnPath() parquet.ColumnPath
ColumnPath returns the full path to this column from the root of the schema
func (*Column) ConvertedType ¶
func (c *Column) ConvertedType() ConvertedType
func (*Column) Equals ¶
Equals will return true if the rhs Column has the same Max Repetition and Definition levels along with having the same node definition.
func (*Column) LogicalType ¶
func (c *Column) LogicalType() LogicalType
func (*Column) MaxDefinitionLevel ¶
func (*Column) MaxRepetitionLevel ¶
func (*Column) Path ¶
Path is equivalent to ColumnPath().String() returning the dot-string version of the path
func (*Column) PhysicalType ¶
func (*Column) SchemaNode ¶
SchemaNode returns the underlying Node in the schema tree for this column.
func (*Column) SortOrder ¶
SortOrder returns the sort order of this column's statistics based on the Logical and Converted types.
func (*Column) TypeLength ¶
TypeLength is -1 if not a FixedLenByteArray, otherwise it is the length of elements in the column
type ConvertedType ¶
type ConvertedType format.ConvertedType
ConvertedType corresponds to the ConvertedType in the parquet.Thrift, with added values of None and NA for handling when these values are not set in the metadata
func (ConvertedType) String ¶
func (p ConvertedType) String() string
func (ConvertedType) ToLogicalType ¶
func (p ConvertedType) ToLogicalType(convertedDecimal DecimalMetadata) LogicalType
ToLogicalType returns the correct LogicalType for the given ConvertedType, using the decimal metadata provided to define the precision/scale if necessary
type DateLogicalType ¶
type DateLogicalType struct {
// contains filtered or unexported fields
}
DateLogicalType is an int32 representing the number of days since the Unix Epoch (1 January 1970).
func (DateLogicalType) Equals ¶
func (DateLogicalType) Equals(rhs LogicalType) bool
func (DateLogicalType) IsApplicable ¶
func (DateLogicalType) IsApplicable(t parquet.Type, _ int32) bool
func (DateLogicalType) IsCompatible ¶
func (DateLogicalType) IsCompatible(t ConvertedType, dec DecimalMetadata) bool
func (DateLogicalType) IsSerialized ¶
func (DateLogicalType) IsSerialized() bool
func (DateLogicalType) MarshalJSON ¶
func (DateLogicalType) MarshalJSON() ([]byte, error)
func (DateLogicalType) SortOrder ¶
func (DateLogicalType) SortOrder() SortOrder
func (DateLogicalType) String ¶
func (DateLogicalType) String() string
func (DateLogicalType) ToConvertedType ¶
func (DateLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata)
type DecimalLogicalType ¶
type DecimalLogicalType struct {
// contains filtered or unexported fields
}
DecimalLogicalType is used to represent a decimal value of a given precision and scale
func (DecimalLogicalType) Equals ¶
func (t DecimalLogicalType) Equals(rhs LogicalType) bool
func (DecimalLogicalType) IsApplicable ¶
func (t DecimalLogicalType) IsApplicable(typ parquet.Type, tlen int32) bool
func (DecimalLogicalType) IsCompatible ¶
func (t DecimalLogicalType) IsCompatible(c ConvertedType, dec DecimalMetadata) bool
func (DecimalLogicalType) IsSerialized ¶
func (DecimalLogicalType) IsSerialized() bool
func (DecimalLogicalType) MarshalJSON ¶
func (t DecimalLogicalType) MarshalJSON() ([]byte, error)
func (DecimalLogicalType) Precision ¶
func (t DecimalLogicalType) Precision() int32
func (DecimalLogicalType) Scale ¶
func (t DecimalLogicalType) Scale() int32
func (DecimalLogicalType) SortOrder ¶
func (DecimalLogicalType) SortOrder() SortOrder
func (DecimalLogicalType) String ¶
func (t DecimalLogicalType) String() string
func (DecimalLogicalType) ToConvertedType ¶
func (t DecimalLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata)
type DecimalMetadata ¶
DecimalMetadata is a struct for managing scale and precision information between converted and logical types.
type EnumLogicalType ¶
type EnumLogicalType struct {
// contains filtered or unexported fields
}
EnumLogicalType is for representing an enum, which should be a byte array type
func (EnumLogicalType) Equals ¶
func (EnumLogicalType) Equals(rhs LogicalType) bool
func (EnumLogicalType) IsApplicable ¶
func (EnumLogicalType) IsApplicable(t parquet.Type, _ int32) bool
func (EnumLogicalType) IsCompatible ¶
func (EnumLogicalType) IsCompatible(t ConvertedType, dec DecimalMetadata) bool
func (EnumLogicalType) IsSerialized ¶
func (EnumLogicalType) IsSerialized() bool
func (EnumLogicalType) MarshalJSON ¶
func (EnumLogicalType) MarshalJSON() ([]byte, error)
func (EnumLogicalType) SortOrder ¶
func (EnumLogicalType) SortOrder() SortOrder
func (EnumLogicalType) String ¶
func (EnumLogicalType) String() string
func (EnumLogicalType) ToConvertedType ¶
func (EnumLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata)
type GroupNode ¶
type GroupNode struct {
// contains filtered or unexported fields
}
GroupNode is for managing nested nodes like List, Map, etc.
func GroupNodeFromThrift ¶
func GroupNodeFromThrift(elem *format.SchemaElement, fields FieldList) (*GroupNode, error)
func ListOf ¶
ListOf is a convenience helper function to create a properly structured list structure according to the Parquet Spec.
<list-repetition> group <name> (LIST) { repeated group list { <element-repetition> <element-type> element; } }
<list-repetition> can only be optional or required. panics if repeated. <element-repetition> can only be optional or required. panics if repeated.
func MapOf ¶
func MapOf(name string, key Node, value Node, mapRep parquet.Repetition, fieldID int32) (*GroupNode, error)
MapOf is a convenience helper function to create a properly structured parquet map node setup according to the Parquet Spec.
<map-repetition> group <name> (MAP) { repeated group key_value { required <key-type> key; <value-repetition> <value-type> value; } }
key node will be renamed to "key", value node if not nil will be renamed to "value"
<map-repetition> must be only optional or required. panics if repeated is passed.
the key node *must* be required repetition. panics if optional or repeated
value node can be nil (omitted) or have a repetition of required or optional *only*. panics if value node is not nil and has a repetition of repeated.
func MustGroup ¶
MustGroup is like Must, except it casts the node to a *GroupNode, which will panic if it is a primitive node.
func NewGroupNode ¶
func NewGroupNode(name string, repetition parquet.Repetition, fields FieldList, fieldID int32) (*GroupNode, error)
NewGroupNode constructs a new group node with the provided fields, but with converted type None and No Logical Type
func NewGroupNodeConverted ¶
func NewGroupNodeConverted(name string, repetition parquet.Repetition, fields FieldList, converted ConvertedType, id int32) (n *GroupNode, err error)
NewGroupNodeConverted constructs a group node with the provided fields and converted type, determining the logical type from that converted type.
func NewGroupNodeLogical ¶
func NewGroupNodeLogical(name string, repetition parquet.Repetition, fields FieldList, logical LogicalType, id int32) (n *GroupNode, err error)
NewGroupNodeLogical constructs a group node with the provided fields and logical type, determining the converted type from the provided logical type.
func (*GroupNode) ConvertedType ¶
func (n *GroupNode) ConvertedType() ConvertedType
func (*GroupNode) Equals ¶
Equals will compare this node to the provided node and only return true if this node and all of its children are the same as the passed in node and its children.
func (*GroupNode) Field ¶
Field returns the node in the field list which is of the provided (0-based) index
func (*GroupNode) FieldIndexByField ¶
FieldIndexByField looks up the index of n among the children of this node. Returns -1 if n isn't a child of this group.
func (*GroupNode) FieldIndexByName ¶
FieldIndexByName provides the index for the field of the given name. Returns -1 if not found.
If there is more than one field with this name, it returns the index of the first one.
func (*GroupNode) HasRepeatedFields ¶
HasRepeatedFields returns true if any of the children of this node has Repeated as its repetition type.
This is recursive and will check the children of any group nodes that are children.
func (*GroupNode) LogicalType ¶
func (n *GroupNode) LogicalType() LogicalType
func (*GroupNode) NumFields ¶
NumFields returns the number of direct child fields for this group node
func (*GroupNode) RepetitionType ¶
func (n *GroupNode) RepetitionType() parquet.Repetition
type IntLogicalType ¶
type IntLogicalType struct {
// contains filtered or unexported fields
}
IntLogicalType represents an integer type of a specific bit width and is either signed or unsigned.
func (IntLogicalType) BitWidth ¶
func (t IntLogicalType) BitWidth() int8
func (IntLogicalType) Equals ¶
func (t IntLogicalType) Equals(rhs LogicalType) bool
func (IntLogicalType) IsApplicable ¶
func (t IntLogicalType) IsApplicable(typ parquet.Type, _ int32) bool
func (IntLogicalType) IsCompatible ¶
func (t IntLogicalType) IsCompatible(c ConvertedType, dec DecimalMetadata) bool
func (IntLogicalType) IsSerialized ¶
func (IntLogicalType) IsSerialized() bool
func (IntLogicalType) IsSigned ¶
func (t IntLogicalType) IsSigned() bool
func (IntLogicalType) MarshalJSON ¶
func (t IntLogicalType) MarshalJSON() ([]byte, error)
func (IntLogicalType) SortOrder ¶
func (t IntLogicalType) SortOrder() SortOrder
func (IntLogicalType) String ¶
func (t IntLogicalType) String() string
func (IntLogicalType) ToConvertedType ¶
func (t IntLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata)
type IntervalLogicalType ¶
type IntervalLogicalType struct {
// contains filtered or unexported fields
}
IntervalLogicalType is not yet in the thrift spec, but represents an interval time and needs to be a fixed length byte array of 12 bytes
func (IntervalLogicalType) Equals ¶
func (IntervalLogicalType) Equals(rhs LogicalType) bool
func (IntervalLogicalType) IsApplicable ¶
func (IntervalLogicalType) IsApplicable(t parquet.Type, tlen int32) bool
func (IntervalLogicalType) IsCompatible ¶
func (IntervalLogicalType) IsCompatible(c ConvertedType, dec DecimalMetadata) bool
func (IntervalLogicalType) IsSerialized ¶
func (IntervalLogicalType) IsSerialized() bool
func (IntervalLogicalType) MarshalJSON ¶
func (IntervalLogicalType) MarshalJSON() ([]byte, error)
func (IntervalLogicalType) SortOrder ¶
func (IntervalLogicalType) SortOrder() SortOrder
func (IntervalLogicalType) String ¶
func (IntervalLogicalType) String() string
func (IntervalLogicalType) ToConvertedType ¶
func (IntervalLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata)
type JSONLogicalType ¶
type JSONLogicalType struct {
// contains filtered or unexported fields
}
JSONLogicalType represents a byte array column which is to be interpreted as a JSON string.
func (JSONLogicalType) Equals ¶
func (JSONLogicalType) Equals(rhs LogicalType) bool
func (JSONLogicalType) IsApplicable ¶
func (JSONLogicalType) IsApplicable(t parquet.Type, _ int32) bool
func (JSONLogicalType) IsCompatible ¶
func (JSONLogicalType) IsCompatible(c ConvertedType, dec DecimalMetadata) bool
func (JSONLogicalType) IsSerialized ¶
func (JSONLogicalType) IsSerialized() bool
func (JSONLogicalType) MarshalJSON ¶
func (JSONLogicalType) MarshalJSON() ([]byte, error)
func (JSONLogicalType) SortOrder ¶
func (JSONLogicalType) SortOrder() SortOrder
func (JSONLogicalType) String ¶
func (JSONLogicalType) String() string
func (JSONLogicalType) ToConvertedType ¶
func (JSONLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata)
type ListLogicalType ¶
type ListLogicalType struct {
// contains filtered or unexported fields
}
ListLogicalType is used for columns which are themselves nested lists
func (ListLogicalType) Equals ¶
func (ListLogicalType) Equals(rhs LogicalType) bool
func (ListLogicalType) IsApplicable ¶
func (ListLogicalType) IsApplicable(parquet.Type, int32) bool
func (ListLogicalType) IsCompatible ¶
func (ListLogicalType) IsCompatible(t ConvertedType, dec DecimalMetadata) bool
func (ListLogicalType) IsNested ¶
func (ListLogicalType) IsNested() bool
func (ListLogicalType) IsSerialized ¶
func (ListLogicalType) IsSerialized() bool
func (ListLogicalType) MarshalJSON ¶
func (ListLogicalType) MarshalJSON() ([]byte, error)
func (ListLogicalType) SortOrder ¶
func (ListLogicalType) SortOrder() SortOrder
func (ListLogicalType) String ¶
func (ListLogicalType) String() string
func (ListLogicalType) ToConvertedType ¶
func (ListLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata)
type LogicalType ¶
type LogicalType interface { // Returns true if a nested type like List or Map IsNested() bool // Returns true if this type can be serialized, ie: not Unknown/NoType/Interval IsSerialized() bool // Returns true if not NoLogicalType IsValid() bool // Returns true if it is NoType IsNone() bool // returns a string representation of the Logical Type String() string // Return the equivalent ConvertedType for legacy Parquet systems ToConvertedType() (ConvertedType, DecimalMetadata) // Returns true if the specified ConvertedType is compatible with this // logical type IsCompatible(ConvertedType, DecimalMetadata) bool // Returns true if this logical type can be used with the provided physical type IsApplicable(t parquet.Type, tlen int32) bool // Returns true if the logical types are the same Equals(LogicalType) bool // Returns the default stat sort order for this logical type SortOrder() SortOrder // contains filtered or unexported methods }
LogicalType is the descriptor that defines the usage of a physical primitive type in the schema, such as an Interval, Date, etc.
func NewDecimalLogicalType ¶
func NewDecimalLogicalType(precision int32, scale int32) LogicalType
NewDecimalLogicalType returns a Decimal logical type with the given precision and scale.
Panics if precision < 1 or scale is not in the range (0, precision)
func NewIntLogicalType ¶
func NewIntLogicalType(bitWidth int8, signed bool) LogicalType
NewIntLogicalType creates an integer logical type of the desired bitwidth and whether it is signed or not.
Bit width must be exactly 8, 16, 32 or 64 for an integer logical type
func NewListLogicalType ¶
func NewListLogicalType() LogicalType
func NewTimeLogicalType ¶
func NewTimeLogicalType(isAdjustedToUTC bool, unit TimeUnitType) LogicalType
NewTimeLogicalType returns a time type of the given unit.
func NewTimestampLogicalType ¶
func NewTimestampLogicalType(isAdjustedToUTC bool, unit TimeUnitType) LogicalType
NewTimestampLogicalType returns a logical timestamp type with "forceConverted" set to false
func NewTimestampLogicalTypeForce ¶
func NewTimestampLogicalTypeForce(isAdjustedToUTC bool, unit TimeUnitType) LogicalType
NewTimestampLogicalTypeForce returns a timestamp logical type with "forceConverted" set to true
type MapLogicalType ¶
type MapLogicalType struct {
// contains filtered or unexported fields
}
MapLogicalType represents a mapped type
func (MapLogicalType) Equals ¶
func (MapLogicalType) Equals(rhs LogicalType) bool
func (MapLogicalType) IsApplicable ¶
func (MapLogicalType) IsApplicable(parquet.Type, int32) bool
func (MapLogicalType) IsCompatible ¶
func (MapLogicalType) IsCompatible(t ConvertedType, dec DecimalMetadata) bool
func (MapLogicalType) IsNested ¶
func (MapLogicalType) IsNested() bool
func (MapLogicalType) IsSerialized ¶
func (MapLogicalType) IsSerialized() bool
func (MapLogicalType) MarshalJSON ¶
func (MapLogicalType) MarshalJSON() ([]byte, error)
func (MapLogicalType) SortOrder ¶
func (MapLogicalType) SortOrder() SortOrder
func (MapLogicalType) String ¶
func (MapLogicalType) String() string
func (MapLogicalType) ToConvertedType ¶
func (MapLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata)
type NoLogicalType ¶
type NoLogicalType struct {
// contains filtered or unexported fields
}
func (NoLogicalType) Equals ¶
func (NoLogicalType) Equals(rhs LogicalType) bool
func (NoLogicalType) IsApplicable ¶
func (NoLogicalType) IsApplicable(parquet.Type, int32) bool
func (NoLogicalType) IsCompatible ¶
func (NoLogicalType) IsCompatible(c ConvertedType, dec DecimalMetadata) bool
func (NoLogicalType) IsNone ¶
func (NoLogicalType) IsNone() bool
func (NoLogicalType) IsSerialized ¶
func (NoLogicalType) IsSerialized() bool
func (NoLogicalType) MarshalJSON ¶
func (NoLogicalType) MarshalJSON() ([]byte, error)
func (NoLogicalType) SortOrder ¶
func (NoLogicalType) SortOrder() SortOrder
func (NoLogicalType) String ¶
func (NoLogicalType) String() string
func (NoLogicalType) ToConvertedType ¶
func (NoLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata)
type Node ¶
type Node interface { Name() string Type() NodeType RepetitionType() parquet.Repetition ConvertedType() ConvertedType LogicalType() LogicalType FieldID() int32 Parent() Node SetParent(Node) Path() string Equals(Node) bool Visit(v Visitor) // contains filtered or unexported methods }
Node is the interface for both Group and Primitive Nodes. A logical schema type has a name, repetition level, and optionally a logical type (converted type is the deprecated version of the logical type concept, which is maintained for forward compatibility)
func FromParquet ¶
func FromParquet(elems []*format.SchemaElement) (Node, error)
FromParquet converts a slice of thrift Schema Elements to the correct node type
type NullLogicalType ¶
type NullLogicalType struct {
// contains filtered or unexported fields
}
func (NullLogicalType) Equals ¶
func (NullLogicalType) Equals(rhs LogicalType) bool
func (NullLogicalType) IsApplicable ¶
func (NullLogicalType) IsApplicable(parquet.Type, int32) bool
func (NullLogicalType) IsCompatible ¶
func (NullLogicalType) IsCompatible(c ConvertedType, dec DecimalMetadata) bool
func (NullLogicalType) IsSerialized ¶
func (NullLogicalType) IsSerialized() bool
func (NullLogicalType) MarshalJSON ¶
func (NullLogicalType) MarshalJSON() ([]byte, error)
func (NullLogicalType) SortOrder ¶
func (NullLogicalType) SortOrder() SortOrder
func (NullLogicalType) String ¶
func (NullLogicalType) String() string
func (NullLogicalType) ToConvertedType ¶
func (NullLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata)
type PrimitiveNode ¶
type PrimitiveNode struct { ColumnOrder parquet.ColumnOrder // contains filtered or unexported fields }
A PrimitiveNode is a type that is one of the primitive Parquet storage types. In addition to the other type metadata (name, repetition level, logical type), also has the physical storage type and their type-specific metadata (byte width, decimal parameters)
func MustPrimitive ¶
func MustPrimitive(n Node, err error) *PrimitiveNode
MustPrimitive is like Must except it casts the node to *PrimitiveNode which will panic if it is a group node.
func NewBooleanNode ¶
func NewBooleanNode(name string, rep parquet.Repetition, fieldID int32) *PrimitiveNode
NewBooleanNode is a convenience factory for constructing a Boolean Primitive Node
func NewByteArrayNode ¶
func NewByteArrayNode(name string, rep parquet.Repetition, fieldID int32) *PrimitiveNode
NewByteArrayNode is a convenience factory for constructing a Byte Array Primitive Node
func NewFixedLenByteArrayNode ¶
func NewFixedLenByteArrayNode(name string, rep parquet.Repetition, length int32, fieldID int32) *PrimitiveNode
NewFixedLenByteArrayNode is a convenience factory for constructing a Fixed Length Byte Array Primitive Node of the given length
func NewFloat32Node ¶
func NewFloat32Node(name string, rep parquet.Repetition, fieldID int32) *PrimitiveNode
NewFloat32Node is a convenience factory for constructing a Float Primitive Node
func NewFloat64Node ¶
func NewFloat64Node(name string, rep parquet.Repetition, fieldID int32) *PrimitiveNode
NewFloat64Node is a convenience factory for constructing a Double Primitive Node
func NewInt32Node ¶
func NewInt32Node(name string, rep parquet.Repetition, fieldID int32) *PrimitiveNode
NewInt32Node is a convenience factory for constructing an Int32 Primitive Node
func NewInt64Node ¶
func NewInt64Node(name string, rep parquet.Repetition, fieldID int32) *PrimitiveNode
NewInt64Node is a convenience factory for constructing an Int64 Primitive Node
func NewInt96Node ¶
func NewInt96Node(name string, rep parquet.Repetition, fieldID int32) *PrimitiveNode
NewInt96Node is a convenience factory for constructing an Int96 Primitive Node
func NewPrimitiveNode ¶
func NewPrimitiveNode(name string, repetition parquet.Repetition, typ parquet.Type, fieldID, typeLength int32) (*PrimitiveNode, error)
NewPrimitiveNode constructs a primitive node with the ConvertedType of None and no logical type.
Use NewPrimitiveNodeLogical and NewPrimitiveNodeConverted to specify the logical or converted type.
func NewPrimitiveNodeConverted ¶
func NewPrimitiveNodeConverted(name string, repetition parquet.Repetition, typ parquet.Type, converted ConvertedType, typeLen, precision, scale int, id int32) (*PrimitiveNode, error)
NewPrimitiveNodeConverted constructs a primitive node from the given physical type and converted type, determining the logical type from the converted type.
func NewPrimitiveNodeLogical ¶
func NewPrimitiveNodeLogical(name string, repetition parquet.Repetition, logicalType LogicalType, physicalType parquet.Type, typeLen int, id int32) (*PrimitiveNode, error)
NewPrimitiveNodeLogical constructs a Primitive node using the provided logical type for a given physical type and type length.
func PrimitiveNodeFromThrift ¶
func PrimitiveNodeFromThrift(elem *format.SchemaElement) (*PrimitiveNode, error)
func (*PrimitiveNode) ConvertedType ¶
func (n *PrimitiveNode) ConvertedType() ConvertedType
func (*PrimitiveNode) DecimalMetadata ¶
func (p *PrimitiveNode) DecimalMetadata() DecimalMetadata
DecimalMetadata returns the current metadata for the node. If not a decimal typed column, the return should have IsSet == false.
func (*PrimitiveNode) Equals ¶
func (p *PrimitiveNode) Equals(rhs Node) bool
Equals returns true if both nodes are primitive nodes with the same physical and converted/logical types.
func (*PrimitiveNode) LogicalType ¶
func (n *PrimitiveNode) LogicalType() LogicalType
func (*PrimitiveNode) PhysicalType ¶
func (p *PrimitiveNode) PhysicalType() parquet.Type
PhysicalType returns the proper Physical parquet.Type primitive that is used to store the values in this column.
func (*PrimitiveNode) RepetitionType ¶
func (n *PrimitiveNode) RepetitionType() parquet.Repetition
func (*PrimitiveNode) SetTypeLength ¶
func (p *PrimitiveNode) SetTypeLength(length int)
SetTypeLength will change the type length of the node; it has no effect if the physical type is not FixedLength Byte Array
func (*PrimitiveNode) TypeLength ¶
func (p *PrimitiveNode) TypeLength() int
TypeLength will be -1 if not a FixedLenByteArray column, otherwise will be the length of the FixedLen Byte Array
func (*PrimitiveNode) Visit ¶
func (p *PrimitiveNode) Visit(v Visitor)
Visit is for implementing a Visitor pattern handler to walk a schema's tree. One example is the Schema Printer which walks the tree to print out the schema in order.
type Schema ¶
type Schema struct {
// contains filtered or unexported fields
}
Schema is the container for the converted Parquet schema with computed information from the schema analysis needed for file reading
* Column index to Node
* Max repetition / definition levels for each primitive node
The ColumnDescriptor objects produced by this class can be used to assist in the reconstruction of fully materialized data structures from the repetition-definition level encoding of nested data
func NewSchema ¶
NewSchema constructs a new Schema object from a root group node.
Any fields with a field-id of -1 will be given an appropriate field number based on their order.
func NewSchemaFromStruct ¶
NewSchemaFromStruct generates a schema from an object type via reflection of the type and reading struct tags for "parquet".
Rules ¶
Everything defaults to Required repetition, unless otherwise specified. Pointer types become Optional repetition. Arrays and Slices become logical List types unless using the tag `repetition=repeated`.
A length specified byte field (like [5]byte) becomes a fixed_len_byte_array of that length unless otherwise specified by tags.
string and []byte both become ByteArray unless otherwise specified.
Integer types will default to having a logical type of the appropriate bit width and signedness rather than having no logical type, ie: an int8 will become an int32 node with logical type Int(bitWidth=8, signed=true).
Structs will become group nodes with the fields of the struct as the fields of the group, recursively creating the nodes.
maps will become appropriate Map structures in the schema of the defined key and values.
Available Tags ¶
name: by default the node will have the same name as the field, this tag lets you specify a name
type: Specify the physical type instead of using the field type
length: specify the type length of the node, only relevant for fixed_len_byte_array
scale: specify the scale for a decimal field
precision: specify the precision for a decimal field
fieldid: specify the field ID for that node, defaults to -1 which means it is not set in the parquet file.
repetition: specify the repetition as something other than what is determined by the type
converted: specify the Converted Type of the field
logical: specify the logical type of the field. If using decimal, then the scale and precision will be determined by the precision and scale fields, or by the logical.precision / logical.scale fields, with the logical.-prefixed versions taking precedence. For Time or Timestamp logical types, use logical.unit=<millis|micros|nanos> and logical.isadjustedutc=<true|false> to set those. Unit is required; isadjustedutc defaults to true. For Integer logical type, use logical.bitwidth and logical.signed to specify those values, with bitwidth being required, and signed defaulting to true.
All tags other than name can use a prefix of "key<tagname>=<value>" to refer to the type of the key for a map and "value<tagname>=<value>" to refer to the value type of a map or the element of a list (such as the type of a slice)
Example (Convertedtypes) ¶
package main import ( "log" "os" "github.com/apache/arrow/go/v10/parquet" "github.com/apache/arrow/go/v10/parquet/schema" ) func main() { type ConvertedSchema struct { Utf8 string `parquet:"name=utf8, converted=UTF8"` Uint32 uint32 `parquet:"converted=INT_32"` Date int32 `parquet:"name=date, converted=date"` TimeMilli int32 `parquet:"name=timemilli, converted=TIME_MILLIS"` TimeMicro int64 `parquet:"name=timemicro, converted=time_micros"` TimeStampMilli int64 `parquet:"converted=timestamp_millis"` TimeStampMicro int64 `parquet:"converted=timestamp_micros"` Interval parquet.Int96 `parquet:"converted=INTERVAL"` Decimal1 int32 `parquet:"converted=decimal, scale=2, precision=9"` Decimal2 int64 `parquet:"converted=decimal, scale=2, precision=18"` Decimal3 [12]byte `parquet:"converted=decimal, scale=2, precision=10"` Decimal4 string `parquet:"converted=decimal, scale=2, precision=20"` } sc, err := schema.NewSchemaFromStruct(&ConvertedSchema{}) if err != nil { log.Fatal(err) } schema.PrintSchema(sc.Root(), os.Stdout, 2) }
Output: repeated group field_id=-1 ConvertedSchema { required byte_array field_id=-1 utf8 (String); required int32 field_id=-1 Uint32 (Int(bitWidth=32, isSigned=true)); required int32 field_id=-1 date (Date); required int32 field_id=-1 timemilli (Time(isAdjustedToUTC=true, timeUnit=milliseconds)); required int64 field_id=-1 timemicro (Time(isAdjustedToUTC=true, timeUnit=microseconds)); required int64 field_id=-1 TimeStampMilli (Timestamp(isAdjustedToUTC=true, timeUnit=milliseconds, is_from_converted_type=true, force_set_converted_type=false)); required int64 field_id=-1 TimeStampMicro (Timestamp(isAdjustedToUTC=true, timeUnit=microseconds, is_from_converted_type=true, force_set_converted_type=false)); required int96 field_id=-1 Interval; required int32 field_id=-1 Decimal1 (Decimal(precision=9, scale=2)); required int64 field_id=-1 Decimal2 (Decimal(precision=18, scale=2)); required fixed_len_byte_array field_id=-1 Decimal3 (Decimal(precision=10, scale=2)); required byte_array field_id=-1 Decimal4 (Decimal(precision=20, scale=2)); }
Example (Logicaltypes) ¶
package main import ( "log" "os" "github.com/apache/arrow/go/v10/parquet/schema" ) func main() { type LogicalTypes struct { String []byte `parquet:"logical=String"` Enum string `parquet:"logical=enum"` Date int32 `parquet:"logical=date"` Decimal1 int32 `parquet:"logical=decimal, precision=9, scale=2"` Decimal2 int32 `parquet:"logical=decimal, logical.precision=9, scale=2"` Decimal3 int32 `parquet:"logical=decimal, precision=5, logical.precision=9, scale=1, logical.scale=3"` TimeMilliUTC int32 `parquet:"logical=TIME, logical.unit=millis"` TimeMilli int32 `parquet:"logical=Time, logical.unit=millis, logical.isadjustedutc=false"` TimeMicros int64 `parquet:"logical=time, logical.unit=micros, logical.isadjustedutc=false"` TimeMicrosUTC int64 `parquet:"logical=time, logical.unit=micros, logical.isadjustedutc=true"` TimeNanos int64 `parquet:"logical=time, logical.unit=nanos"` TimestampMilli int64 `parquet:"logical=timestamp, logical.unit=millis"` TimestampMicrosNotUTC int64 `parquet:"logical=timestamp, logical.unit=micros, logical.isadjustedutc=false"` TimestampNanos int64 `parquet:"logical=timestamp, logical.unit=nanos"` JSON string `parquet:"logical=json"` BSON []byte `parquet:"logical=BSON"` UUID [16]byte `parquet:"logical=uuid"` } sc, err := schema.NewSchemaFromStruct(LogicalTypes{}) if err != nil { log.Fatal(err) } schema.PrintSchema(sc.Root(), os.Stdout, 2) }
Output: repeated group field_id=-1 LogicalTypes { required byte_array field_id=-1 String (String); required byte_array field_id=-1 Enum (Enum); required int32 field_id=-1 Date (Date); required int32 field_id=-1 Decimal1 (Decimal(precision=9, scale=2)); required int32 field_id=-1 Decimal2 (Decimal(precision=9, scale=2)); required int32 field_id=-1 Decimal3 (Decimal(precision=9, scale=3)); required int32 field_id=-1 TimeMilliUTC (Time(isAdjustedToUTC=true, timeUnit=milliseconds)); required int32 field_id=-1 TimeMilli (Time(isAdjustedToUTC=false, timeUnit=milliseconds)); required int64 field_id=-1 TimeMicros (Time(isAdjustedToUTC=false, timeUnit=microseconds)); required int64 field_id=-1 TimeMicrosUTC (Time(isAdjustedToUTC=true, timeUnit=microseconds)); required int64 field_id=-1 TimeNanos (Time(isAdjustedToUTC=true, timeUnit=nanoseconds)); required int64 field_id=-1 TimestampMilli (Timestamp(isAdjustedToUTC=true, timeUnit=milliseconds, is_from_converted_type=false, force_set_converted_type=false)); required int64 field_id=-1 TimestampMicrosNotUTC (Timestamp(isAdjustedToUTC=false, timeUnit=microseconds, is_from_converted_type=false, force_set_converted_type=false)); required int64 field_id=-1 TimestampNanos (Timestamp(isAdjustedToUTC=true, timeUnit=nanoseconds, is_from_converted_type=false, force_set_converted_type=false)); required byte_array field_id=-1 JSON (JSON); required byte_array field_id=-1 BSON (BSON); required fixed_len_byte_array field_id=-1 UUID (UUID); }
Example (Nestedtypes) ¶
package main import ( "log" "os" "github.com/apache/arrow/go/v10/parquet/schema" ) func main() { type Other struct { OptionalMap *map[string]*string `parquet:"valuerepetition=required, keylogical=String, valueconverted=BSON"` } type MyMap map[int32]string type Nested struct { SimpleMap map[int32]string FixedLenMap map[string][]byte `parquet:"keytype=fixed_len_byte_array, keyfieldid=10, valuefieldid=11, keylength=10"` DecimalMap map[int32]string `parquet:"logical=map, keyconverted=DECIMAL, keyscale=3, keyprecision=7, valuetype=fixed_len_byte_array, valuelength=4, valuelogical=decimal, valuelogical.precision=9, valuescale=2"` OtherList []*Other OtherRepeated []Other `parquet:"repetition=repeated"` DateArray [5]int32 `parquet:"valuelogical=date, logical=list"` DateMap MyMap `parquet:"keylogical=TIME, keylogical.unit=MILLIS, keylogical.isadjustedutc=false, valuelogical=enum"` } sc, err := schema.NewSchemaFromStruct(Nested{}) if err != nil { log.Fatal(err) } schema.PrintSchema(sc.Root(), os.Stdout, 2) }
Output: repeated group field_id=-1 Nested { required group field_id=-1 SimpleMap (Map) { repeated group field_id=-1 key_value { required int32 field_id=-1 key (Int(bitWidth=32, isSigned=true)); required byte_array field_id=-1 value; } } required group field_id=-1 FixedLenMap (Map) { repeated group field_id=-1 key_value { required fixed_len_byte_array field_id=10 key; required byte_array field_id=11 value; } } required group field_id=-1 DecimalMap (Map) { repeated group field_id=-1 key_value { required int32 field_id=-1 key (Decimal(precision=7, scale=3)); required fixed_len_byte_array field_id=-1 value (Decimal(precision=9, scale=2)); } } required group field_id=-1 OtherList (List) { repeated group field_id=-1 list { optional group field_id=-1 element { optional group field_id=-1 OptionalMap (Map) { repeated group field_id=-1 key_value { required byte_array field_id=-1 key (String); required byte_array field_id=-1 value (BSON); } } } } } repeated group field_id=-1 OtherRepeated { optional group field_id=-1 OptionalMap (Map) { repeated group field_id=-1 key_value { required byte_array field_id=-1 key (String); required byte_array field_id=-1 value (BSON); } } } required group field_id=-1 DateArray (List) { repeated group field_id=-1 list { required int32 field_id=-1 element (Date); } } required group field_id=-1 DateMap (Map) { repeated group field_id=-1 key_value { required int32 field_id=-1 key (Time(isAdjustedToUTC=false, timeUnit=milliseconds)); required byte_array field_id=-1 value (Enum); } } }
Example (Physicaltype) ¶
package main import ( "log" "os" "github.com/apache/arrow/go/v10/parquet/schema" ) func main() { type ChangeTypes struct { Int32 int64 `parquet:"type=int32"` FixedLen string `parquet:"type=fixed_len_byte_array, length=10"` SliceAsFixed []byte `parquet:"type=fixed_len_byte_array, length=12"` Int int `parquet:"type=int32"` } sc, err := schema.NewSchemaFromStruct(ChangeTypes{}) if err != nil { log.Fatal(err) } schema.PrintSchema(sc.Root(), os.Stdout, 2) }
Output: repeated group field_id=-1 ChangeTypes { required int32 field_id=-1 Int32 (Int(bitWidth=32, isSigned=true)); required fixed_len_byte_array field_id=-1 FixedLen; required fixed_len_byte_array field_id=-1 SliceAsFixed; required int32 field_id=-1 Int (Int(bitWidth=32, isSigned=true)); }
Example (Primitives) ¶
package main import ( "log" "os" "github.com/apache/arrow/go/v10/parquet" "github.com/apache/arrow/go/v10/parquet/schema" ) func main() { type Schema struct { Bool bool Int8 int8 Uint16 uint16 Int32 int32 Int64 int64 Int96 parquet.Int96 Float float32 Double float64 ByteArray string FixedLenByteArray [10]byte } sc, err := schema.NewSchemaFromStruct(Schema{}) if err != nil { log.Fatal(err) } schema.PrintSchema(sc.Root(), os.Stdout, 2) }
Output: repeated group field_id=-1 Schema { required boolean field_id=-1 Bool; required int32 field_id=-1 Int8 (Int(bitWidth=8, isSigned=true)); required int32 field_id=-1 Uint16 (Int(bitWidth=16, isSigned=false)); required int32 field_id=-1 Int32 (Int(bitWidth=32, isSigned=true)); required int64 field_id=-1 Int64 (Int(bitWidth=64, isSigned=true)); required int96 field_id=-1 Int96; required float field_id=-1 Float; required double field_id=-1 Double; required byte_array field_id=-1 ByteArray; required fixed_len_byte_array field_id=-1 FixedLenByteArray; }
Example (Repetition) ¶
package main import ( "log" "os" "github.com/apache/arrow/go/v10/parquet/schema" ) func main() { type RepetitionSchema struct { List []int64 `parquet:"fieldid=1"` Repeated []int64 `parquet:"repetition=repeated, fieldid=2"` Optional *int64 `parquet:"fieldid=3"` Required *int64 `parquet:"repetition=REQUIRED, fieldid=4"` Opt int64 `parquet:"repetition=OPTIONAL, fieldid=5"` } sc, err := schema.NewSchemaFromStruct(RepetitionSchema{}) if err != nil { log.Fatal(err) } schema.PrintSchema(sc.Root(), os.Stdout, 2) }
Output: repeated group field_id=-1 RepetitionSchema { required group field_id=1 List (List) { repeated group field_id=-1 list { required int64 field_id=-1 element (Int(bitWidth=64, isSigned=true)); } } repeated int64 field_id=2 Repeated (Int(bitWidth=64, isSigned=true)); optional int64 field_id=3 Optional (Int(bitWidth=64, isSigned=true)); required int64 field_id=4 Required (Int(bitWidth=64, isSigned=true)); optional int64 field_id=5 Opt (Int(bitWidth=64, isSigned=true)); }
func (*Schema) ColumnIndexByName ¶
ColumnIndexByName looks up the column by its full dot-separated node path. If there are multiple columns that match, it returns the first one.
Returns -1 if not found.
func (*Schema) ColumnIndexByNode ¶
ColumnIndexByNode returns the index of the column represented by this node.
Returns -1 if not found.
func (*Schema) ColumnRoot ¶
ColumnRoot returns the root node of a given column if it is under a nested group node, providing that root group node.
func (*Schema) Equals ¶
Equals returns true as long as the leaf columns are equal, doesn't take into account the groups and only checks whether the schemas are compatible at the physical storage level.
func (*Schema) HasRepeatedFields ¶
HasRepeatedFields returns true if any node in the schema has a repeated field type.
func (*Schema) NumColumns ¶
NumColumns returns the number of leaf nodes that are the actual primitive columns in this schema.
func (*Schema) UpdateColumnOrders ¶
func (s *Schema) UpdateColumnOrders(orders []parquet.ColumnOrder) error
UpdateColumnOrders must get a slice that is the same length as the number of leaf columns and is used to update the schema metadata Column Orders. len(orders) must equal s.NumColumns()
type SortOrder ¶
type SortOrder int8
SortOrder mirrors the parquet.thrift sort order type
Constants for the Stat sort order definitions
func DefaultSortOrder ¶
DefaultSortOrder returns the default stat sort order for the given physical type
func GetLogicalSortOrder ¶
func GetLogicalSortOrder(logical LogicalType, primitive format.Type) SortOrder
GetLogicalSortOrder returns the default sort order for this logical type or falls back to the default sort order for the physical type if not valid
func GetSortOrder ¶
func GetSortOrder(convert ConvertedType, primitive format.Type) SortOrder
GetSortOrder defaults to the sort order based on the physical type if convert is ConvertedTypes.None, otherwise determines the sort order by the converted type.
type StringLogicalType ¶
type StringLogicalType struct {
// contains filtered or unexported fields
}
StringLogicalType is a UTF8 string, only usable with ByteArray and FixedLenByteArray
func (StringLogicalType) Equals ¶
func (StringLogicalType) Equals(rhs LogicalType) bool
func (StringLogicalType) IsApplicable ¶
func (StringLogicalType) IsApplicable(t parquet.Type, _ int32) bool
func (StringLogicalType) IsCompatible ¶
func (StringLogicalType) IsCompatible(t ConvertedType, dec DecimalMetadata) bool
func (StringLogicalType) IsSerialized ¶
func (StringLogicalType) IsSerialized() bool
func (StringLogicalType) MarshalJSON ¶
func (StringLogicalType) MarshalJSON() ([]byte, error)
func (StringLogicalType) SortOrder ¶
func (StringLogicalType) SortOrder() SortOrder
func (StringLogicalType) String ¶
func (StringLogicalType) String() string
func (StringLogicalType) ToConvertedType ¶
func (StringLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata)
type TemporalLogicalType ¶
type TemporalLogicalType interface { LogicalType IsAdjustedToUTC() bool TimeUnit() TimeUnitType }
TemporalLogicalType is a smaller interface for Time based logical types like Time / Timestamp
type TimeLogicalType ¶
type TimeLogicalType struct {
// contains filtered or unexported fields
}
TimeLogicalType is a time type without a date and must be an int32 for milliseconds, or an int64 for micro or nano seconds.
func (TimeLogicalType) Equals ¶
func (t TimeLogicalType) Equals(rhs LogicalType) bool
func (TimeLogicalType) IsAdjustedToUTC ¶
func (t TimeLogicalType) IsAdjustedToUTC() bool
func (TimeLogicalType) IsApplicable ¶
func (t TimeLogicalType) IsApplicable(typ parquet.Type, _ int32) bool
func (TimeLogicalType) IsCompatible ¶
func (t TimeLogicalType) IsCompatible(c ConvertedType, dec DecimalMetadata) bool
func (TimeLogicalType) IsSerialized ¶
func (TimeLogicalType) IsSerialized() bool
func (TimeLogicalType) MarshalJSON ¶
func (t TimeLogicalType) MarshalJSON() ([]byte, error)
func (TimeLogicalType) SortOrder ¶
func (TimeLogicalType) SortOrder() SortOrder
func (TimeLogicalType) String ¶
func (t TimeLogicalType) String() string
func (TimeLogicalType) TimeUnit ¶
func (t TimeLogicalType) TimeUnit() TimeUnitType
func (TimeLogicalType) ToConvertedType ¶
func (t TimeLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata)
type TimeUnitType ¶
type TimeUnitType int
TimeUnitType is an enum for denoting whether a time based logical type is using milliseconds, microseconds or nanoseconds.
const ( TimeUnitMillis TimeUnitType = iota TimeUnitMicros TimeUnitNanos TimeUnitUnknown )
Constants for the TimeUnitType
type TimestampLogicalType ¶
type TimestampLogicalType struct {
// contains filtered or unexported fields
}
TimestampLogicalType represents an int64 number that can be decoded into a year, month, day, hour, minute, second, and subsecond
func (TimestampLogicalType) Equals ¶
func (t TimestampLogicalType) Equals(rhs LogicalType) bool
func (TimestampLogicalType) IsAdjustedToUTC ¶
func (t TimestampLogicalType) IsAdjustedToUTC() bool
func (TimestampLogicalType) IsApplicable ¶
func (TimestampLogicalType) IsApplicable(t parquet.Type, _ int32) bool
func (TimestampLogicalType) IsCompatible ¶
func (t TimestampLogicalType) IsCompatible(c ConvertedType, dec DecimalMetadata) bool
func (TimestampLogicalType) IsFromConvertedType ¶
func (t TimestampLogicalType) IsFromConvertedType() bool
func (TimestampLogicalType) IsSerialized ¶
func (t TimestampLogicalType) IsSerialized() bool
func (TimestampLogicalType) MarshalJSON ¶
func (t TimestampLogicalType) MarshalJSON() ([]byte, error)
func (TimestampLogicalType) SortOrder ¶
func (TimestampLogicalType) SortOrder() SortOrder
func (TimestampLogicalType) String ¶
func (t TimestampLogicalType) String() string
func (TimestampLogicalType) TimeUnit ¶
func (t TimestampLogicalType) TimeUnit() TimeUnitType
func (TimestampLogicalType) ToConvertedType ¶
func (t TimestampLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata)
type UUIDLogicalType ¶
type UUIDLogicalType struct {
// contains filtered or unexported fields
}
UUIDLogicalType can only be used with a fixed-length byte array column that is exactly 16 bytes long.
func (UUIDLogicalType) Equals ¶
func (UUIDLogicalType) Equals(rhs LogicalType) bool
func (UUIDLogicalType) IsApplicable ¶
func (UUIDLogicalType) IsApplicable(t parquet.Type, tlen int32) bool
func (UUIDLogicalType) IsCompatible ¶
func (UUIDLogicalType) IsCompatible(c ConvertedType, dec DecimalMetadata) bool
func (UUIDLogicalType) IsSerialized ¶
func (UUIDLogicalType) IsSerialized() bool
func (UUIDLogicalType) MarshalJSON ¶
func (UUIDLogicalType) MarshalJSON() ([]byte, error)
func (UUIDLogicalType) SortOrder ¶
func (UUIDLogicalType) SortOrder() SortOrder
func (UUIDLogicalType) String ¶
func (UUIDLogicalType) String() string
func (UUIDLogicalType) ToConvertedType ¶
func (UUIDLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata)
type UnknownLogicalType ¶
type UnknownLogicalType struct {
// contains filtered or unexported fields
}
UnknownLogicalType is a placeholder type used when the logical type is not known.
func (UnknownLogicalType) Equals ¶
func (UnknownLogicalType) Equals(rhs LogicalType) bool
func (UnknownLogicalType) IsApplicable ¶
func (UnknownLogicalType) IsApplicable(parquet.Type, int32) bool
func (UnknownLogicalType) IsCompatible ¶
func (UnknownLogicalType) IsCompatible(c ConvertedType, dec DecimalMetadata) bool
func (UnknownLogicalType) IsSerialized ¶
func (UnknownLogicalType) IsSerialized() bool
func (UnknownLogicalType) IsValid ¶
func (UnknownLogicalType) IsValid() bool
func (UnknownLogicalType) MarshalJSON ¶
func (UnknownLogicalType) MarshalJSON() ([]byte, error)
func (UnknownLogicalType) SortOrder ¶
func (UnknownLogicalType) SortOrder() SortOrder
func (UnknownLogicalType) String ¶
func (UnknownLogicalType) String() string
func (UnknownLogicalType) ToConvertedType ¶
func (UnknownLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata)
type Visitor ¶
Visitor is an interface for creating functionality to walk the schema tree.
A visitor can be passed to the Visit function of a Node in order to walk the tree. VisitPre is called the first time a node is encountered. If the node is a group node, the return value of VisitPre is checked and, if it is false, the node's children are skipped.
VisitPost is called after visiting any children.