schema

package
v17.0.0 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jul 11, 2024 License: Apache-2.0, BSD-2-Clause, BSD-3-Clause, + 7 more Imports: 14 Imported by: 4

Documentation

Overview

Package schema provides types and functions for manipulating and building parquet file schemas.

Some of the utilities provided include building a schema using Struct Tags on a struct type, getting Column Paths from a node, and dealing with the converted and logical types for Parquet.

Logical types specify ways to interpret the primitive types allowing the number of primitive types to be smaller and reuse efficient encodings. For instance a "string" is just a ByteArray column with a UTF-8 annotation or "String Logical Type".

For more information about Logical and Converted Types, check: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md

Index

Examples

Constants

This section is empty.

Variables

View Source
var (
	// ConvertedTypes is a struct containing the constants for the types
	// to make it easy to reference them while making it clear what they are
	// (for example ConvertedTypes.UTF8 or ConvertedTypes.Decimal).
	//
	// Every field other than None and NA is the matching constant from the
	// format package converted to ConvertedType. None (-1) and NA (24) are
	// assigned literal values here rather than being taken from a format
	// constant.
	ConvertedTypes = struct {
		None            ConvertedType
		UTF8            ConvertedType
		Map             ConvertedType
		MapKeyValue     ConvertedType
		List            ConvertedType
		Enum            ConvertedType
		Decimal         ConvertedType
		Date            ConvertedType
		TimeMillis      ConvertedType
		TimeMicros      ConvertedType
		TimestampMillis ConvertedType
		TimestampMicros ConvertedType
		Uint8           ConvertedType
		Uint16          ConvertedType
		Uint32          ConvertedType
		Uint64          ConvertedType
		Int8            ConvertedType
		Int16           ConvertedType
		Int32           ConvertedType
		Int64           ConvertedType
		JSON            ConvertedType
		BSON            ConvertedType
		Interval        ConvertedType
		NA              ConvertedType
	}{
		None:            -1,
		UTF8:            ConvertedType(format.ConvertedType_UTF8),
		Map:             ConvertedType(format.ConvertedType_MAP),
		MapKeyValue:     ConvertedType(format.ConvertedType_MAP_KEY_VALUE),
		List:            ConvertedType(format.ConvertedType_LIST),
		Enum:            ConvertedType(format.ConvertedType_ENUM),
		Decimal:         ConvertedType(format.ConvertedType_DECIMAL),
		Date:            ConvertedType(format.ConvertedType_DATE),
		TimeMillis:      ConvertedType(format.ConvertedType_TIME_MILLIS),
		TimeMicros:      ConvertedType(format.ConvertedType_TIME_MICROS),
		TimestampMillis: ConvertedType(format.ConvertedType_TIMESTAMP_MILLIS),
		TimestampMicros: ConvertedType(format.ConvertedType_TIMESTAMP_MICROS),
		Uint8:           ConvertedType(format.ConvertedType_UINT_8),
		Uint16:          ConvertedType(format.ConvertedType_UINT_16),
		Uint32:          ConvertedType(format.ConvertedType_UINT_32),
		Uint64:          ConvertedType(format.ConvertedType_UINT_64),
		Int8:            ConvertedType(format.ConvertedType_INT_8),
		Int16:           ConvertedType(format.ConvertedType_INT_16),
		Int32:           ConvertedType(format.ConvertedType_INT_32),
		Int64:           ConvertedType(format.ConvertedType_INT_64),
		JSON:            ConvertedType(format.ConvertedType_JSON),
		BSON:            ConvertedType(format.ConvertedType_BSON),
		Interval:        ConvertedType(format.ConvertedType_INTERVAL),
		NA:              24,
	}
)

Functions

func ColumnPathFromNode

func ColumnPathFromNode(n Node) parquet.ColumnPath

ColumnPathFromNode walks the parents of the given node to construct its column path

func NewStructFromSchema

func NewStructFromSchema(sc *Schema) (t reflect.Type, err error)

NewStructFromSchema generates a struct type as a reflect.Type from the schema by using the appropriate physical types and making things either pointers or slices based on whether they are repeated/optional/required. It does not use the logical or converted types to change the physical storage so that it is more efficient to use the resulting type for reading without having to do conversions.

It will use maps for map types and slices for list types, but otherwise ignores the converted and logical types of the nodes. Group nodes that are not List or Map will be nested structs.

func PrintSchema

func PrintSchema(n Node, w io.Writer, indentWidth int)

PrintSchema writes a string representation of the tree to w using the indent width provided.

Example
package main

import (
	"os"

	"github.com/apache/arrow/go/v17/parquet"
	"github.com/apache/arrow/go/v17/parquet/schema"
)

func main() {
	// Leaf columns that live inside the repeated group "b".
	int64Leaf := schema.NewInt64Node("item1" /* name */, parquet.Repetitions.Optional, 4 /* fieldID */)
	boolLeaf := schema.NewBooleanNode("item2" /* name */, parquet.Repetitions.Required, 5 /* fieldID */)

	// "b" is a repeated group carrying the List converted type; "bag" is an
	// optional plain group whose only child is "b".
	listGroup := schema.MustGroup(schema.NewGroupNodeConverted(
		"b", /* name */
		parquet.Repetitions.Repeated,
		schema.FieldList{int64Leaf, boolLeaf},
		schema.ConvertedTypes.List,
		3, /* fieldID */
	))
	bagGroup := schema.MustGroup(schema.NewGroupNode(
		"bag", /* name */
		parquet.Repetitions.Optional,
		schema.FieldList{listGroup},
		2, /* fieldID */
	))

	// Two decimal columns: "c" is declared through a converted type, "d"
	// through a logical type.
	decimalConverted := schema.MustPrimitive(schema.NewPrimitiveNodeConverted(
		"c", /* name */
		parquet.Repetitions.Required,
		parquet.Types.Int32,
		schema.ConvertedTypes.Decimal,
		0, /* type len */
		3, /* precision */
		2, /* scale */
		6, /* fieldID */
	))
	decimalLogical := schema.MustPrimitive(schema.NewPrimitiveNodeLogical(
		"d", /* name */
		parquet.Repetitions.Required,
		schema.NewDecimalLogicalType(10 /* precision */, 5 /* scale */),
		parquet.Types.Int64,
		-1, /* type len */
		7,  /* fieldID */
	))

	// Assemble the root's children in the order they should be printed.
	children := schema.FieldList{
		schema.NewInt32Node("a" /* name */, parquet.Repetitions.Required, 1 /* fieldID */),
		bagGroup,
		decimalConverted,
		decimalLogical,
	}

	root := schema.MustGroup(schema.NewGroupNode("schema" /* name */, parquet.Repetitions.Repeated, children, 0 /* fieldID */))
	schema.PrintSchema(root, os.Stdout, 2)
}
Output:

repeated group field_id=0 schema {
  required int32 field_id=1 a;
  optional group field_id=2 bag {
    repeated group field_id=3 b (List) {
      optional int64 field_id=4 item1;
      required boolean field_id=5 item2;
    }
  }
  required int32 field_id=6 c (Decimal(precision=3, scale=2));
  required int64 field_id=7 d (Decimal(precision=10, scale=5));
}

func ToThrift

func ToThrift(schema *GroupNode) []*format.SchemaElement

ToThrift converts a GroupNode to a slice of SchemaElements which is used for thrift serialization.

Types

type BSONLogicalType

type BSONLogicalType struct {
	// contains filtered or unexported fields
}

BSONLogicalType represents a binary JSON string in the byte array

func (BSONLogicalType) Equals

func (BSONLogicalType) Equals(rhs LogicalType) bool

func (BSONLogicalType) IsApplicable

func (BSONLogicalType) IsApplicable(t parquet.Type, _ int32) bool

func (BSONLogicalType) IsCompatible

func (BSONLogicalType) IsCompatible(c ConvertedType, dec DecimalMetadata) bool

func (BSONLogicalType) IsNested

func (BSONLogicalType) IsNested() bool

func (BSONLogicalType) IsNone

func (BSONLogicalType) IsNone() bool

func (BSONLogicalType) IsSerialized

func (BSONLogicalType) IsSerialized() bool

func (BSONLogicalType) IsValid

func (BSONLogicalType) IsValid() bool

func (BSONLogicalType) MarshalJSON

func (BSONLogicalType) MarshalJSON() ([]byte, error)

func (BSONLogicalType) SortOrder

func (BSONLogicalType) SortOrder() SortOrder

func (BSONLogicalType) String

func (BSONLogicalType) String() string

func (BSONLogicalType) ToConvertedType

func (BSONLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata)

type Column

type Column struct {
	// contains filtered or unexported fields
}

Column encapsulates the information necessary to interpret primitive column data in the context of a particular schema. We have to examine the node structure of a column's path to the root in the schema tree to be able to reassemble the nested structure from the repetition and definition levels.

func NewColumn

func NewColumn(n *PrimitiveNode, maxDefinitionLvl, maxRepetitionLvl int16) *Column

NewColumn returns a new column object for the given node with the provided maximum definition and repetition levels.

func (*Column) ColumnOrder

func (c *Column) ColumnOrder() parquet.ColumnOrder

func (*Column) ColumnPath

func (c *Column) ColumnPath() parquet.ColumnPath

ColumnPath returns the full path to this column from the root of the schema

func (*Column) ConvertedType

func (c *Column) ConvertedType() ConvertedType

func (*Column) Equals

func (c *Column) Equals(rhs *Column) bool

Equals will return true if the rhs Column has the same Max Repetition and Definition levels along with having the same node definition.

func (*Column) LogicalType

func (c *Column) LogicalType() LogicalType

func (*Column) MaxDefinitionLevel

func (c *Column) MaxDefinitionLevel() int16

func (*Column) MaxRepetitionLevel

func (c *Column) MaxRepetitionLevel() int16

func (*Column) Name

func (c *Column) Name() string

Name is the column's name

func (*Column) Path

func (c *Column) Path() string

Path is equivalent to ColumnPath().String() returning the dot-string version of the path

func (*Column) PhysicalType

func (c *Column) PhysicalType() parquet.Type

func (*Column) SchemaNode

func (c *Column) SchemaNode() Node

SchemaNode returns the underlying Node in the schema tree for this column.

func (*Column) SortOrder

func (c *Column) SortOrder() SortOrder

SortOrder returns the sort order of this column's statistics based on the Logical and Converted types.

func (*Column) String

func (c *Column) String() string

func (*Column) TypeLength

func (c *Column) TypeLength() int

TypeLength is -1 if not a FixedLenByteArray, otherwise it is the length of elements in the column

type ConvertedType

type ConvertedType format.ConvertedType

ConvertedType corresponds to the ConvertedType in the parquet.Thrift, with added values of None and NA for handling when these values are not set in the metadata

func (ConvertedType) String

func (p ConvertedType) String() string

func (ConvertedType) ToLogicalType

func (p ConvertedType) ToLogicalType(convertedDecimal DecimalMetadata) LogicalType

ToLogicalType returns the correct LogicalType for the given ConvertedType, using the decimal metadata provided to define the precision/scale if necessary

type DateLogicalType

type DateLogicalType struct {
	// contains filtered or unexported fields
}

DateLogicalType is an int32 representing the number of days since the Unix Epoch 1 January 1970

func (DateLogicalType) Equals

func (DateLogicalType) Equals(rhs LogicalType) bool

func (DateLogicalType) IsApplicable

func (DateLogicalType) IsApplicable(t parquet.Type, _ int32) bool

func (DateLogicalType) IsCompatible

func (DateLogicalType) IsCompatible(t ConvertedType, dec DecimalMetadata) bool

func (DateLogicalType) IsNested

func (DateLogicalType) IsNested() bool

func (DateLogicalType) IsNone

func (DateLogicalType) IsNone() bool

func (DateLogicalType) IsSerialized

func (DateLogicalType) IsSerialized() bool

func (DateLogicalType) IsValid

func (DateLogicalType) IsValid() bool

func (DateLogicalType) MarshalJSON

func (DateLogicalType) MarshalJSON() ([]byte, error)

func (DateLogicalType) SortOrder

func (DateLogicalType) SortOrder() SortOrder

func (DateLogicalType) String

func (DateLogicalType) String() string

func (DateLogicalType) ToConvertedType

func (DateLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata)

type DecimalLogicalType

type DecimalLogicalType struct {
	// contains filtered or unexported fields
}

DecimalLogicalType is used to represent a decimal value of a given precision and scale

func (DecimalLogicalType) Equals

func (t DecimalLogicalType) Equals(rhs LogicalType) bool

func (DecimalLogicalType) IsApplicable

func (t DecimalLogicalType) IsApplicable(typ parquet.Type, tlen int32) bool

func (DecimalLogicalType) IsCompatible

func (t DecimalLogicalType) IsCompatible(c ConvertedType, dec DecimalMetadata) bool

func (DecimalLogicalType) IsNested

func (DecimalLogicalType) IsNested() bool

func (DecimalLogicalType) IsNone

func (DecimalLogicalType) IsNone() bool

func (DecimalLogicalType) IsSerialized

func (DecimalLogicalType) IsSerialized() bool

func (DecimalLogicalType) IsValid

func (DecimalLogicalType) IsValid() bool

func (DecimalLogicalType) MarshalJSON

func (t DecimalLogicalType) MarshalJSON() ([]byte, error)

func (DecimalLogicalType) Precision

func (t DecimalLogicalType) Precision() int32

func (DecimalLogicalType) Scale

func (t DecimalLogicalType) Scale() int32

func (DecimalLogicalType) SortOrder

func (DecimalLogicalType) SortOrder() SortOrder

func (DecimalLogicalType) String

func (t DecimalLogicalType) String() string

func (DecimalLogicalType) ToConvertedType

func (t DecimalLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata)

type DecimalMetadata

type DecimalMetadata struct {
	IsSet     bool
	Scale     int32
	Precision int32
}

DecimalMetadata is a struct for managing scale and precision information between converted and logical types.

type EnumLogicalType

type EnumLogicalType struct {
	// contains filtered or unexported fields
}

EnumLogicalType is for representing an enum, which should be a byte array type

func (EnumLogicalType) Equals

func (EnumLogicalType) Equals(rhs LogicalType) bool

func (EnumLogicalType) IsApplicable

func (EnumLogicalType) IsApplicable(t parquet.Type, _ int32) bool

func (EnumLogicalType) IsCompatible

func (EnumLogicalType) IsCompatible(t ConvertedType, dec DecimalMetadata) bool

func (EnumLogicalType) IsNested

func (EnumLogicalType) IsNested() bool

func (EnumLogicalType) IsNone

func (EnumLogicalType) IsNone() bool

func (EnumLogicalType) IsSerialized

func (EnumLogicalType) IsSerialized() bool

func (EnumLogicalType) IsValid

func (EnumLogicalType) IsValid() bool

func (EnumLogicalType) MarshalJSON

func (EnumLogicalType) MarshalJSON() ([]byte, error)

func (EnumLogicalType) SortOrder

func (EnumLogicalType) SortOrder() SortOrder

func (EnumLogicalType) String

func (EnumLogicalType) String() string

func (EnumLogicalType) ToConvertedType

func (EnumLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata)

type FieldList

type FieldList []Node

FieldList is an alias for a slice of Nodes

func (FieldList) Len

func (f FieldList) Len() int

Len is equivalent to len(fieldlist)

type Float16LogicalType

type Float16LogicalType struct {
	// contains filtered or unexported fields
}

Float16LogicalType can only be used with a FixedLength byte array column that is exactly 2 bytes long

func (Float16LogicalType) Equals

func (Float16LogicalType) Equals(rhs LogicalType) bool

func (Float16LogicalType) IsApplicable

func (Float16LogicalType) IsApplicable(t parquet.Type, tlen int32) bool

func (Float16LogicalType) IsCompatible

func (Float16LogicalType) IsNested

func (Float16LogicalType) IsNested() bool

func (Float16LogicalType) IsNone

func (Float16LogicalType) IsNone() bool

func (Float16LogicalType) IsSerialized

func (Float16LogicalType) IsSerialized() bool

func (Float16LogicalType) IsValid

func (Float16LogicalType) IsValid() bool

func (Float16LogicalType) MarshalJSON

func (Float16LogicalType) MarshalJSON() ([]byte, error)

func (Float16LogicalType) SortOrder

func (Float16LogicalType) SortOrder() SortOrder

func (Float16LogicalType) String

func (Float16LogicalType) String() string

func (Float16LogicalType) ToConvertedType

func (Float16LogicalType) ToConvertedType() (ConvertedType, DecimalMetadata)

type GroupNode

type GroupNode struct {
	// contains filtered or unexported fields
}

GroupNode is for managing nested nodes like List, Map, etc.

func GroupNodeFromThrift

func GroupNodeFromThrift(elem *format.SchemaElement, fields FieldList) (*GroupNode, error)

func ListOf

func ListOf(n Node, rep parquet.Repetition, fieldID int32) (*GroupNode, error)

ListOf is a convenience helper function to create a properly structured list structure according to the Parquet Spec.

<list-repetition> group <name> (LIST) {
  repeated group list {
    <element-repetition> <element-type> element;
  }
}

<list-repetition> can only be optional or required. <element-repetition> can only be optional or required.

func ListOfWithName

func ListOfWithName(listName string, element Node, rep parquet.Repetition, fieldID int32) (*GroupNode, error)

ListOfWithName is a convenience helper function to create a properly structured list structure according to the Parquet Spec.

<list-repetition> group <name> (LIST) {
  repeated group list {
    <element-repetition> <element-type> element;
  }
}

<list-repetition> can only be optional or required. <element-repetition> can only be optional or required.

func MapOf

func MapOf(name string, key Node, value Node, mapRep parquet.Repetition, fieldID int32) (*GroupNode, error)

MapOf is a convenience helper function to create a properly structured parquet map node setup according to the Parquet Spec.

<map-repetition> group <name> (MAP) {
  repeated group key_value {
    required <key-type> key;
    <value-repetition> <value-type> value;
  }
}

key node will be renamed to "key", value node if not nil will be renamed to "value"

<map-repetition> must be only optional or required. panics if repeated is passed.

the key node *must* be required repetition. panics if optional or repeated

value node can be nil (omitted) or have a repetition of required or optional *only*.

func MustGroup

func MustGroup(n Node, err error) *GroupNode

MustGroup is like Must, except it casts the node to a *GroupNode, which will panic if it is a primitive node.

func NewGroupNode

func NewGroupNode(name string, repetition parquet.Repetition, fields FieldList, fieldID int32) (*GroupNode, error)

NewGroupNode constructs a new group node with the provided fields, but with converted type None and No Logical Type

func NewGroupNodeConverted

func NewGroupNodeConverted(name string, repetition parquet.Repetition, fields FieldList, converted ConvertedType, id int32) (n *GroupNode, err error)

NewGroupNodeConverted constructs a group node with the provided fields and converted type, determining the logical type from that converted type.

func NewGroupNodeLogical

func NewGroupNodeLogical(name string, repetition parquet.Repetition, fields FieldList, logical LogicalType, id int32) (n *GroupNode, err error)

NewGroupNodeLogical constructs a group node with the provided fields and logical type, determining the converted type from the provided logical type.

func (*GroupNode) ConvertedType

func (n *GroupNode) ConvertedType() ConvertedType

func (*GroupNode) Equals

func (g *GroupNode) Equals(rhs Node) bool

Equals will compare this node to the provided node and only return true if this node and all of its children are the same as the passed in node and its children.

func (*GroupNode) Field

func (g *GroupNode) Field(i int) Node

Field returns the node in the field list which is of the provided (0-based) index

func (*GroupNode) FieldID

func (n *GroupNode) FieldID() int32

func (*GroupNode) FieldIndexByField

func (g *GroupNode) FieldIndexByField(n Node) int

FieldIndexByField looks up the index of the given child of this node. Returns -1 if n isn't a child of this group

func (*GroupNode) FieldIndexByName

func (g *GroupNode) FieldIndexByName(name string) int

FieldIndexByName provides the index for the field of the given name. Returns -1 if not found.

If there is more than one field with this name, it returns the index of the first one.

func (*GroupNode) HasRepeatedFields

func (g *GroupNode) HasRepeatedFields() bool

HasRepeatedFields returns true if any of the children of this node have Repeated as its repetition type.

This is recursive and will check the children of any group nodes that are children.

func (*GroupNode) LogicalType

func (n *GroupNode) LogicalType() LogicalType

func (*GroupNode) Name

func (n *GroupNode) Name() string

func (*GroupNode) NumFields

func (g *GroupNode) NumFields() int

NumFields returns the number of direct child fields for this group node

func (*GroupNode) Parent

func (n *GroupNode) Parent() Node

func (*GroupNode) Path

func (n *GroupNode) Path() string

func (*GroupNode) RepetitionType

func (n *GroupNode) RepetitionType() parquet.Repetition

func (*GroupNode) SetParent

func (n *GroupNode) SetParent(p Node)

func (*GroupNode) Type

func (n *GroupNode) Type() NodeType

func (*GroupNode) Visit

func (g *GroupNode) Visit(v Visitor)

Visit is for implementing a Visitor pattern handler to walk a schema's tree. One example is the Schema Printer which walks the tree to print out the schema in order.

type IntLogicalType

type IntLogicalType struct {
	// contains filtered or unexported fields
}

IntLogicalType represents an integer type of a specific bit width and is either signed or unsigned.

func (IntLogicalType) BitWidth

func (t IntLogicalType) BitWidth() int8

func (IntLogicalType) Equals

func (t IntLogicalType) Equals(rhs LogicalType) bool

func (IntLogicalType) IsApplicable

func (t IntLogicalType) IsApplicable(typ parquet.Type, _ int32) bool

func (IntLogicalType) IsCompatible

func (t IntLogicalType) IsCompatible(c ConvertedType, dec DecimalMetadata) bool

func (IntLogicalType) IsNested

func (IntLogicalType) IsNested() bool

func (IntLogicalType) IsNone

func (IntLogicalType) IsNone() bool

func (IntLogicalType) IsSerialized

func (IntLogicalType) IsSerialized() bool

func (IntLogicalType) IsSigned

func (t IntLogicalType) IsSigned() bool

func (IntLogicalType) IsValid

func (IntLogicalType) IsValid() bool

func (IntLogicalType) MarshalJSON

func (t IntLogicalType) MarshalJSON() ([]byte, error)

func (IntLogicalType) SortOrder

func (t IntLogicalType) SortOrder() SortOrder

func (IntLogicalType) String

func (t IntLogicalType) String() string

func (IntLogicalType) ToConvertedType

func (t IntLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata)

type IntervalLogicalType

type IntervalLogicalType struct {
	// contains filtered or unexported fields
}

IntervalLogicalType is not yet in the thrift spec, but represents an interval time and needs to be a fixed length byte array of 12 bytes

func (IntervalLogicalType) Equals

func (IntervalLogicalType) Equals(rhs LogicalType) bool

func (IntervalLogicalType) IsApplicable

func (IntervalLogicalType) IsApplicable(t parquet.Type, tlen int32) bool

func (IntervalLogicalType) IsCompatible

func (IntervalLogicalType) IsNested

func (IntervalLogicalType) IsNested() bool

func (IntervalLogicalType) IsNone

func (IntervalLogicalType) IsNone() bool

func (IntervalLogicalType) IsSerialized

func (IntervalLogicalType) IsSerialized() bool

func (IntervalLogicalType) IsValid

func (IntervalLogicalType) IsValid() bool

func (IntervalLogicalType) MarshalJSON

func (IntervalLogicalType) MarshalJSON() ([]byte, error)

func (IntervalLogicalType) SortOrder

func (IntervalLogicalType) SortOrder() SortOrder

func (IntervalLogicalType) String

func (IntervalLogicalType) String() string

func (IntervalLogicalType) ToConvertedType

func (IntervalLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata)

type JSONLogicalType

type JSONLogicalType struct {
	// contains filtered or unexported fields
}

JSONLogicalType represents a byte array column which is to be interpreted as a JSON string.

func (JSONLogicalType) Equals

func (JSONLogicalType) Equals(rhs LogicalType) bool

func (JSONLogicalType) IsApplicable

func (JSONLogicalType) IsApplicable(t parquet.Type, _ int32) bool

func (JSONLogicalType) IsCompatible

func (JSONLogicalType) IsCompatible(c ConvertedType, dec DecimalMetadata) bool

func (JSONLogicalType) IsNested

func (JSONLogicalType) IsNested() bool

func (JSONLogicalType) IsNone

func (JSONLogicalType) IsNone() bool

func (JSONLogicalType) IsSerialized

func (JSONLogicalType) IsSerialized() bool

func (JSONLogicalType) IsValid

func (JSONLogicalType) IsValid() bool

func (JSONLogicalType) MarshalJSON

func (JSONLogicalType) MarshalJSON() ([]byte, error)

func (JSONLogicalType) SortOrder

func (JSONLogicalType) SortOrder() SortOrder

func (JSONLogicalType) String

func (JSONLogicalType) String() string

func (JSONLogicalType) ToConvertedType

func (JSONLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata)

type ListLogicalType

type ListLogicalType struct {
	// contains filtered or unexported fields
}

ListLogicalType is used for columns which are themselves nested lists

func (ListLogicalType) Equals

func (ListLogicalType) Equals(rhs LogicalType) bool

func (ListLogicalType) IsApplicable

func (ListLogicalType) IsApplicable(parquet.Type, int32) bool

func (ListLogicalType) IsCompatible

func (ListLogicalType) IsCompatible(t ConvertedType, dec DecimalMetadata) bool

func (ListLogicalType) IsNested

func (ListLogicalType) IsNested() bool

func (ListLogicalType) IsNone

func (ListLogicalType) IsNone() bool

func (ListLogicalType) IsSerialized

func (ListLogicalType) IsSerialized() bool

func (ListLogicalType) IsValid

func (ListLogicalType) IsValid() bool

func (ListLogicalType) MarshalJSON

func (ListLogicalType) MarshalJSON() ([]byte, error)

func (ListLogicalType) SortOrder

func (ListLogicalType) SortOrder() SortOrder

func (ListLogicalType) String

func (ListLogicalType) String() string

func (ListLogicalType) ToConvertedType

func (ListLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata)

type LogicalType

type LogicalType interface {
	// Returns true if a nested type like List or Map
	IsNested() bool
	// Returns true if this type can be serialized, ie: not Unknown/NoType/Interval
	IsSerialized() bool
	// Returns true if not NoLogicalType
	IsValid() bool
	// Returns true if it is NoType
	IsNone() bool
	// returns a string representation of the Logical Type
	String() string

	// Return the equivalent ConvertedType for legacy Parquet systems
	ToConvertedType() (ConvertedType, DecimalMetadata)
	// Returns true if the specified ConvertedType is compatible with this
	// logical type
	IsCompatible(ConvertedType, DecimalMetadata) bool
	// Returns true if this logical type can be used with the provided physical type
	IsApplicable(t parquet.Type, tlen int32) bool
	// Returns true if the logical types are the same
	Equals(LogicalType) bool
	// Returns the default stat sort order for this logical type
	SortOrder() SortOrder
	// contains filtered or unexported methods
}

LogicalType is the descriptor that defines the usage of a physical primitive type in the schema, such as an Interval, Date, etc.

func NewDecimalLogicalType

func NewDecimalLogicalType(precision int32, scale int32) LogicalType

NewDecimalLogicalType returns a Decimal logical type with the given precision and scale.

Panics if precision < 1 or if scale is outside the inclusive range [0, precision] (that is, scale < 0 or scale > precision)

func NewIntLogicalType

func NewIntLogicalType(bitWidth int8, signed bool) LogicalType

NewIntLogicalType creates an integer logical type of the desired bitwidth and whether it is signed or not.

Bit width must be exactly 8, 16, 32 or 64 for an integer logical type

func NewListLogicalType

func NewListLogicalType() LogicalType

func NewTimeLogicalType

func NewTimeLogicalType(isAdjustedToUTC bool, unit TimeUnitType) LogicalType

NewTimeLogicalType returns a time type of the given unit.

func NewTimestampLogicalType

func NewTimestampLogicalType(isAdjustedToUTC bool, unit TimeUnitType) LogicalType

NewTimestampLogicalType returns a logical timestamp type with "forceConverted" set to false

func NewTimestampLogicalTypeForce

func NewTimestampLogicalTypeForce(isAdjustedToUTC bool, unit TimeUnitType) LogicalType

NewTimestampLogicalTypeForce returns a timestamp logical type with "forceConverted" set to true

func NewTimestampLogicalTypeWithOpts

func NewTimestampLogicalTypeWithOpts(opts ...TimestampOpt) LogicalType

NewTimestampLogicalTypeWithOpts creates a new TimestampLogicalType with the provided options.

TimestampType Unit defaults to milliseconds (TimeUnitMillis)

type MapLogicalType

type MapLogicalType struct {
	// contains filtered or unexported fields
}

MapLogicalType represents a mapped type

func (MapLogicalType) Equals

func (MapLogicalType) Equals(rhs LogicalType) bool

func (MapLogicalType) IsApplicable

func (MapLogicalType) IsApplicable(parquet.Type, int32) bool

func (MapLogicalType) IsCompatible

func (MapLogicalType) IsCompatible(t ConvertedType, dec DecimalMetadata) bool

func (MapLogicalType) IsNested

func (MapLogicalType) IsNested() bool

func (MapLogicalType) IsNone

func (MapLogicalType) IsNone() bool

func (MapLogicalType) IsSerialized

func (MapLogicalType) IsSerialized() bool

func (MapLogicalType) IsValid

func (MapLogicalType) IsValid() bool

func (MapLogicalType) MarshalJSON

func (MapLogicalType) MarshalJSON() ([]byte, error)

func (MapLogicalType) SortOrder

func (MapLogicalType) SortOrder() SortOrder

func (MapLogicalType) String

func (MapLogicalType) String() string

func (MapLogicalType) ToConvertedType

func (MapLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata)

type NoLogicalType

type NoLogicalType struct {
	// contains filtered or unexported fields
}

func (NoLogicalType) Equals

func (NoLogicalType) Equals(rhs LogicalType) bool

func (NoLogicalType) IsApplicable

func (NoLogicalType) IsApplicable(parquet.Type, int32) bool

func (NoLogicalType) IsCompatible

func (NoLogicalType) IsCompatible(c ConvertedType, dec DecimalMetadata) bool

func (NoLogicalType) IsNested

func (NoLogicalType) IsNested() bool

func (NoLogicalType) IsNone

func (NoLogicalType) IsNone() bool

func (NoLogicalType) IsSerialized

func (NoLogicalType) IsSerialized() bool

func (NoLogicalType) IsValid

func (NoLogicalType) IsValid() bool

func (NoLogicalType) MarshalJSON

func (NoLogicalType) MarshalJSON() ([]byte, error)

func (NoLogicalType) SortOrder

func (NoLogicalType) SortOrder() SortOrder

func (NoLogicalType) String

func (NoLogicalType) String() string

func (NoLogicalType) ToConvertedType

func (NoLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata)

type Node

type Node interface {
	Name() string
	Type() NodeType
	RepetitionType() parquet.Repetition
	ConvertedType() ConvertedType
	LogicalType() LogicalType
	FieldID() int32
	Parent() Node
	SetParent(Node)
	Path() string
	Equals(Node) bool
	Visit(v Visitor)
	// contains filtered or unexported methods
}

Node is the interface for both Group and Primitive Nodes. A logical schema type has a name, repetition level, and optionally a logical type (converted type is the deprecated version of the logical type concept, which is maintained for forward compatibility)

func FromParquet

func FromParquet(elems []*format.SchemaElement) (Node, error)

FromParquet converts a slice of thrift Schema Elements to the correct node type

func Must

func Must(n Node, err error) Node

Must is a convenience function for the NewNode functions that return a Node and an error, panic'ing if err != nil or returning the node

type NodeType

type NodeType int

NodeType describes whether the Node is a Primitive or Group node

const (
	Primitive NodeType = iota
	Group
)

the available constants for NodeType

type NullLogicalType

type NullLogicalType struct {
	// contains filtered or unexported fields
}

func (NullLogicalType) Equals

func (NullLogicalType) Equals(rhs LogicalType) bool

func (NullLogicalType) IsApplicable

func (NullLogicalType) IsApplicable(parquet.Type, int32) bool

func (NullLogicalType) IsCompatible

func (NullLogicalType) IsCompatible(c ConvertedType, dec DecimalMetadata) bool

func (NullLogicalType) IsNested

func (NullLogicalType) IsNested() bool

func (NullLogicalType) IsNone

func (NullLogicalType) IsNone() bool

func (NullLogicalType) IsSerialized

func (NullLogicalType) IsSerialized() bool

func (NullLogicalType) IsValid

func (NullLogicalType) IsValid() bool

func (NullLogicalType) MarshalJSON

func (NullLogicalType) MarshalJSON() ([]byte, error)

func (NullLogicalType) SortOrder

func (NullLogicalType) SortOrder() SortOrder

func (NullLogicalType) String

func (NullLogicalType) String() string

func (NullLogicalType) ToConvertedType

func (NullLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata)

type PrimitiveNode

type PrimitiveNode struct {
	ColumnOrder parquet.ColumnOrder
	// contains filtered or unexported fields
}

A PrimitiveNode is a type that is one of the primitive Parquet storage types. In addition to the other type metadata (name, repetition level, logical type), also has the physical storage type and their type-specific metadata (byte width, decimal parameters)

func MustPrimitive

func MustPrimitive(n Node, err error) *PrimitiveNode

MustPrimitive is like Must except it casts the node to *PrimitiveNode which will panic if it is a group node.

func NewBooleanNode

func NewBooleanNode(name string, rep parquet.Repetition, fieldID int32) *PrimitiveNode

NewBooleanNode is a convenience factory for constructing a Boolean Primitive Node

func NewByteArrayNode

func NewByteArrayNode(name string, rep parquet.Repetition, fieldID int32) *PrimitiveNode

NewByteArrayNode is a convenience factory for constructing a Byte Array Primitive Node

func NewFixedLenByteArrayNode

func NewFixedLenByteArrayNode(name string, rep parquet.Repetition, length int32, fieldID int32) *PrimitiveNode

NewFixedLenByteArrayNode is a convenience factory for constructing a Fixed Length Byte Array Primitive Node of the given length

func NewFloat32Node

func NewFloat32Node(name string, rep parquet.Repetition, fieldID int32) *PrimitiveNode

NewFloat32Node is a convenience factory for constructing a Float Primitive Node

func NewFloat64Node

func NewFloat64Node(name string, rep parquet.Repetition, fieldID int32) *PrimitiveNode

NewFloat64Node is a convenience factory for constructing a Double Primitive Node

func NewInt32Node

func NewInt32Node(name string, rep parquet.Repetition, fieldID int32) *PrimitiveNode

NewInt32Node is a convenience factory for constructing an Int32 Primitive Node

func NewInt64Node

func NewInt64Node(name string, rep parquet.Repetition, fieldID int32) *PrimitiveNode

NewInt64Node is a convenience factory for constructing an Int64 Primitive Node

func NewInt96Node

func NewInt96Node(name string, rep parquet.Repetition, fieldID int32) *PrimitiveNode

NewInt96Node is a convenience factory for constructing an Int96 Primitive Node

func NewPrimitiveNode

func NewPrimitiveNode(name string, repetition parquet.Repetition, typ parquet.Type, fieldID, typeLength int32) (*PrimitiveNode, error)

NewPrimitiveNode constructs a primitive node with the ConvertedType of None and no logical type.

Use NewPrimitiveNodeLogical and NewPrimitiveNodeConverted to specify the logical or converted type.

func NewPrimitiveNodeConverted

func NewPrimitiveNodeConverted(name string, repetition parquet.Repetition, typ parquet.Type, converted ConvertedType, typeLen, precision, scale int, id int32) (*PrimitiveNode, error)

NewPrimitiveNodeConverted constructs a primitive node from the given physical type and converted type, determining the logical type from the converted type.

func NewPrimitiveNodeLogical

func NewPrimitiveNodeLogical(name string, repetition parquet.Repetition, logicalType LogicalType, physicalType parquet.Type, typeLen int, id int32) (*PrimitiveNode, error)

NewPrimitiveNodeLogical constructs a Primitive node using the provided logical type for a given physical type and typelength.

func PrimitiveNodeFromThrift

func PrimitiveNodeFromThrift(elem *format.SchemaElement) (*PrimitiveNode, error)

func (*PrimitiveNode) ConvertedType

func (n *PrimitiveNode) ConvertedType() ConvertedType

func (*PrimitiveNode) DecimalMetadata

func (p *PrimitiveNode) DecimalMetadata() DecimalMetadata

DecimalMetadata returns the current metadata for the node. If not a decimal typed column, the return should have IsSet == false.

func (*PrimitiveNode) Equals

func (p *PrimitiveNode) Equals(rhs Node) bool

Equals returns true if both nodes are primitive nodes with the same physical and converted/logical types.

func (*PrimitiveNode) FieldID

func (n *PrimitiveNode) FieldID() int32

func (*PrimitiveNode) LogicalType

func (n *PrimitiveNode) LogicalType() LogicalType

func (*PrimitiveNode) Name

func (n *PrimitiveNode) Name() string

func (*PrimitiveNode) Parent

func (n *PrimitiveNode) Parent() Node

func (*PrimitiveNode) Path

func (n *PrimitiveNode) Path() string

func (*PrimitiveNode) PhysicalType

func (p *PrimitiveNode) PhysicalType() parquet.Type

PhysicalType returns the proper Physical parquet.Type primitive that is used to store the values in this column.

func (*PrimitiveNode) RepetitionType

func (n *PrimitiveNode) RepetitionType() parquet.Repetition

func (*PrimitiveNode) SetParent

func (n *PrimitiveNode) SetParent(p Node)

func (*PrimitiveNode) SetTypeLength

func (p *PrimitiveNode) SetTypeLength(length int)

SetTypeLength will change the type length of the node. It has no effect if the physical type is not Fixed Length Byte Array.

func (*PrimitiveNode) Type

func (n *PrimitiveNode) Type() NodeType

func (*PrimitiveNode) TypeLength

func (p *PrimitiveNode) TypeLength() int

TypeLength will be -1 if not a FixedLenByteArray column, otherwise will be the length of the FixedLen Byte Array

func (*PrimitiveNode) Visit

func (p *PrimitiveNode) Visit(v Visitor)

Visit is for implementing a Visitor pattern handler to walk a schema's tree. One example is the Schema Printer which walks the tree to print out the schema in order.

type Schema

type Schema struct {
	// contains filtered or unexported fields
}

Schema is the container for the converted Parquet schema, along with computed information from the schema analysis that is needed for file reading:

* Column index to Node

* Max repetition / definition levels for each primitive node

The ColumnDescriptor objects produced by this type can be used to assist in the reconstruction of fully materialized data structures from the repetition-definition level encoding of nested data.

func NewSchema

func NewSchema(root *GroupNode) *Schema

NewSchema constructs a new Schema object from a root group node.

Any fields with a field-id of -1 will be given an appropriate field number based on their order.

func NewSchemaFromStruct

func NewSchemaFromStruct(obj interface{}) (sc *Schema, err error)

NewSchemaFromStruct generates a schema from an object type via reflection of the type and reading struct tags for "parquet".

Rules

Everything defaults to Required repetition, unless otherwise specified. Pointer types become Optional repetition. Arrays and Slices become logical List types unless using the tag `repetition=repeated`.

A length specified byte field (like [5]byte) becomes a fixed_len_byte_array of that length unless otherwise specified by tags.

string and []byte both become ByteArray unless otherwise specified.

Integer types will default to having a logical type of the appropriate bit width and signedness rather than having no logical type, ie: an int8 will become an int32 node with logical type Int(bitWidth=8, signed=true).

Structs will become group nodes with the fields of the struct as the fields of the group, recursively creating the nodes.

maps will become appropriate Map structures in the schema of the defined key and values.

Available Tags

name: by default the node will have the same name as the field; this tag lets you specify a different name

type: Specify the physical type instead of using the field type

length: specify the type length of the node, only relevant for fixed_len_byte_array

scale: specify the scale for a decimal field

precision: specify the precision for a decimal field

fieldid: specify the field ID for that node, defaults to -1 which means it is not set in the parquet file.

repetition: specify the repetition as something other than what is determined by the type

converted: specify the Converted Type of the field

logical: specify the logical type of the field. If using decimal, the scale and precision will be determined by the precision and scale fields, or by the logical.precision / logical.scale fields, with the "logical."-prefixed versions taking precedence. For Time or Timestamp logical types, use logical.unit=<millis|micros|nanos> and logical.isadjustedutc=<true|false> to set those; unit is required and isadjustedutc defaults to true. For the Integer logical type, use logical.bitwidth and logical.signed to specify those values, with bitwidth being required and signed defaulting to true.

All tags other than name can use a prefix of "key<tagname>=<value>" to refer to the type of the key for a map and "value<tagname>=<value>" to refer to the value type of a map or the element of a list (such as the type of a slice)

Example (Convertedtypes)
package main

import (
	"log"
	"os"

	"github.com/apache/arrow/go/v17/parquet"
	"github.com/apache/arrow/go/v17/parquet/schema"
)

func main() {
	type ConvertedSchema struct {
		Utf8           string        `parquet:"name=utf8, converted=UTF8"`
		Uint32         uint32        `parquet:"converted=INT_32"`
		Date           int32         `parquet:"name=date, converted=date"`
		TimeMilli      int32         `parquet:"name=timemilli, converted=TIME_MILLIS"`
		TimeMicro      int64         `parquet:"name=timemicro, converted=time_micros"`
		TimeStampMilli int64         `parquet:"converted=timestamp_millis"`
		TimeStampMicro int64         `parquet:"converted=timestamp_micros"`
		Interval       parquet.Int96 `parquet:"converted=INTERVAL"`
		Decimal1       int32         `parquet:"converted=decimal, scale=2, precision=9"`
		Decimal2       int64         `parquet:"converted=decimal, scale=2, precision=18"`
		Decimal3       [12]byte      `parquet:"converted=decimal, scale=2, precision=10"`
		Decimal4       string        `parquet:"converted=decimal, scale=2, precision=20"`
	}

	sc, err := schema.NewSchemaFromStruct(&ConvertedSchema{})
	if err != nil {
		log.Fatal(err)
	}

	schema.PrintSchema(sc.Root(), os.Stdout, 2)

}
Output:

repeated group field_id=-1 ConvertedSchema {
  required byte_array field_id=-1 utf8 (String);
  required int32 field_id=-1 Uint32 (Int(bitWidth=32, isSigned=true));
  required int32 field_id=-1 date (Date);
  required int32 field_id=-1 timemilli (Time(isAdjustedToUTC=true, timeUnit=milliseconds));
  required int64 field_id=-1 timemicro (Time(isAdjustedToUTC=true, timeUnit=microseconds));
  required int64 field_id=-1 TimeStampMilli (Timestamp(isAdjustedToUTC=true, timeUnit=milliseconds, is_from_converted_type=true, force_set_converted_type=false));
  required int64 field_id=-1 TimeStampMicro (Timestamp(isAdjustedToUTC=true, timeUnit=microseconds, is_from_converted_type=true, force_set_converted_type=false));
  required int96 field_id=-1 Interval;
  required int32 field_id=-1 Decimal1 (Decimal(precision=9, scale=2));
  required int64 field_id=-1 Decimal2 (Decimal(precision=18, scale=2));
  required fixed_len_byte_array field_id=-1 Decimal3 (Decimal(precision=10, scale=2));
  required byte_array field_id=-1 Decimal4 (Decimal(precision=20, scale=2));
}
Example (Logicaltypes)
package main

import (
	"log"
	"os"

	"github.com/apache/arrow/go/v17/arrow/float16"
	"github.com/apache/arrow/go/v17/parquet/schema"
)

func main() {
	type LogicalTypes struct {
		String                []byte   `parquet:"logical=String"`
		Enum                  string   `parquet:"logical=enum"`
		Date                  int32    `parquet:"logical=date"`
		Decimal1              int32    `parquet:"logical=decimal, precision=9, scale=2"`
		Decimal2              int32    `parquet:"logical=decimal, logical.precision=9, scale=2"`
		Decimal3              int32    `parquet:"logical=decimal, precision=5, logical.precision=9, scale=1, logical.scale=3"`
		TimeMilliUTC          int32    `parquet:"logical=TIME, logical.unit=millis"`
		TimeMilli             int32    `parquet:"logical=Time, logical.unit=millis, logical.isadjustedutc=false"`
		TimeMicros            int64    `parquet:"logical=time, logical.unit=micros, logical.isadjustedutc=false"`
		TimeMicrosUTC         int64    `parquet:"logical=time, logical.unit=micros, logical.isadjustedutc=true"`
		TimeNanos             int64    `parquet:"logical=time, logical.unit=nanos"`
		TimestampMilli        int64    `parquet:"logical=timestamp, logical.unit=millis"`
		TimestampMicrosNotUTC int64    `parquet:"logical=timestamp, logical.unit=micros, logical.isadjustedutc=false"`
		TimestampNanos        int64    `parquet:"logical=timestamp, logical.unit=nanos"`
		JSON                  string   `parquet:"logical=json"`
		BSON                  []byte   `parquet:"logical=BSON"`
		UUID                  [16]byte `parquet:"logical=uuid"`
		Float16               [2]byte  `parquet:"logical=float16"`
		Float16Optional       *[2]byte `parquet:"logical=float16"`
		Float16Num            float16.Num
	}

	sc, err := schema.NewSchemaFromStruct(LogicalTypes{})
	if err != nil {
		log.Fatal(err)
	}

	schema.PrintSchema(sc.Root(), os.Stdout, 2)

}
Output:

repeated group field_id=-1 LogicalTypes {
  required byte_array field_id=-1 String (String);
  required byte_array field_id=-1 Enum (Enum);
  required int32 field_id=-1 Date (Date);
  required int32 field_id=-1 Decimal1 (Decimal(precision=9, scale=2));
  required int32 field_id=-1 Decimal2 (Decimal(precision=9, scale=2));
  required int32 field_id=-1 Decimal3 (Decimal(precision=9, scale=3));
  required int32 field_id=-1 TimeMilliUTC (Time(isAdjustedToUTC=true, timeUnit=milliseconds));
  required int32 field_id=-1 TimeMilli (Time(isAdjustedToUTC=false, timeUnit=milliseconds));
  required int64 field_id=-1 TimeMicros (Time(isAdjustedToUTC=false, timeUnit=microseconds));
  required int64 field_id=-1 TimeMicrosUTC (Time(isAdjustedToUTC=true, timeUnit=microseconds));
  required int64 field_id=-1 TimeNanos (Time(isAdjustedToUTC=true, timeUnit=nanoseconds));
  required int64 field_id=-1 TimestampMilli (Timestamp(isAdjustedToUTC=true, timeUnit=milliseconds, is_from_converted_type=false, force_set_converted_type=false));
  required int64 field_id=-1 TimestampMicrosNotUTC (Timestamp(isAdjustedToUTC=false, timeUnit=microseconds, is_from_converted_type=false, force_set_converted_type=false));
  required int64 field_id=-1 TimestampNanos (Timestamp(isAdjustedToUTC=true, timeUnit=nanoseconds, is_from_converted_type=false, force_set_converted_type=false));
  required byte_array field_id=-1 JSON (JSON);
  required byte_array field_id=-1 BSON (BSON);
  required fixed_len_byte_array field_id=-1 UUID (UUID);
  required fixed_len_byte_array field_id=-1 Float16 (Float16);
  optional fixed_len_byte_array field_id=-1 Float16Optional (Float16);
  required fixed_len_byte_array field_id=-1 Float16Num (Float16);
}
Example (Nestedtypes)
package main

import (
	"log"
	"os"

	"github.com/apache/arrow/go/v17/parquet/schema"
)

func main() {
	type Other struct {
		OptionalMap *map[string]*string `parquet:"valuerepetition=required, keylogical=String, valueconverted=BSON"`
	}

	type MyMap map[int32]string

	type Nested struct {
		SimpleMap     map[int32]string
		FixedLenMap   map[string][]byte `parquet:"keytype=fixed_len_byte_array, keyfieldid=10, valuefieldid=11, keylength=10"`
		DecimalMap    map[int32]string  `parquet:"logical=map, keyconverted=DECIMAL, keyscale=3, keyprecision=7, valuetype=fixed_len_byte_array, valuelength=4, valuelogical=decimal, valuelogical.precision=9, valuescale=2"`
		OtherList     []*Other
		OtherRepeated []Other  `parquet:"repetition=repeated"`
		DateArray     [5]int32 `parquet:"valuelogical=date, logical=list"`
		DateMap       MyMap    `parquet:"keylogical=TIME, keylogical.unit=MILLIS, keylogical.isadjustedutc=false, valuelogical=enum"`
	}

	sc, err := schema.NewSchemaFromStruct(Nested{})
	if err != nil {
		log.Fatal(err)
	}

	schema.PrintSchema(sc.Root(), os.Stdout, 2)

}
Output:

repeated group field_id=-1 Nested {
  required group field_id=-1 SimpleMap (Map) {
    repeated group field_id=-1 key_value {
      required int32 field_id=-1 key (Int(bitWidth=32, isSigned=true));
      required byte_array field_id=-1 value;
    }
  }
  required group field_id=-1 FixedLenMap (Map) {
    repeated group field_id=-1 key_value {
      required fixed_len_byte_array field_id=10 key;
      required byte_array field_id=11 value;
    }
  }
  required group field_id=-1 DecimalMap (Map) {
    repeated group field_id=-1 key_value {
      required int32 field_id=-1 key (Decimal(precision=7, scale=3));
      required fixed_len_byte_array field_id=-1 value (Decimal(precision=9, scale=2));
    }
  }
  required group field_id=-1 OtherList (List) {
    repeated group field_id=-1 list {
      optional group field_id=-1 element {
        optional group field_id=-1 OptionalMap (Map) {
          repeated group field_id=-1 key_value {
            required byte_array field_id=-1 key (String);
            required byte_array field_id=-1 value (BSON);
          }
        }
      }
    }
  }
  repeated group field_id=-1 OtherRepeated {
    optional group field_id=-1 OptionalMap (Map) {
      repeated group field_id=-1 key_value {
        required byte_array field_id=-1 key (String);
        required byte_array field_id=-1 value (BSON);
      }
    }
  }
  required group field_id=-1 DateArray (List) {
    repeated group field_id=-1 list {
      required int32 field_id=-1 element (Date);
    }
  }
  required group field_id=-1 DateMap (Map) {
    repeated group field_id=-1 key_value {
      required int32 field_id=-1 key (Time(isAdjustedToUTC=false, timeUnit=milliseconds));
      required byte_array field_id=-1 value (Enum);
    }
  }
}
Example (Physicaltype)
package main

import (
	"log"
	"os"

	"github.com/apache/arrow/go/v17/parquet/schema"
)

func main() {
	type ChangeTypes struct {
		Int32        int64  `parquet:"type=int32"`
		FixedLen     string `parquet:"type=fixed_len_byte_array, length=10"`
		SliceAsFixed []byte `parquet:"type=fixed_len_byte_array, length=12"`
		Int          int    `parquet:"type=int32"`
	}

	sc, err := schema.NewSchemaFromStruct(ChangeTypes{})
	if err != nil {
		log.Fatal(err)
	}

	schema.PrintSchema(sc.Root(), os.Stdout, 2)

}
Output:

repeated group field_id=-1 ChangeTypes {
  required int32 field_id=-1 Int32 (Int(bitWidth=32, isSigned=true));
  required fixed_len_byte_array field_id=-1 FixedLen;
  required fixed_len_byte_array field_id=-1 SliceAsFixed;
  required int32 field_id=-1 Int (Int(bitWidth=32, isSigned=true));
}
Example (Primitives)
package main

import (
	"log"
	"os"

	"github.com/apache/arrow/go/v17/parquet"
	"github.com/apache/arrow/go/v17/parquet/schema"
)

func main() {
	type Schema struct {
		Bool              bool
		Int8              int8
		Uint16            uint16
		Int32             int32
		Int64             int64
		Int96             parquet.Int96
		Float             float32
		Double            float64
		ByteArray         string
		FixedLenByteArray [10]byte
	}

	sc, err := schema.NewSchemaFromStruct(Schema{})
	if err != nil {
		log.Fatal(err)
	}

	schema.PrintSchema(sc.Root(), os.Stdout, 2)

}
Output:

repeated group field_id=-1 Schema {
  required boolean field_id=-1 Bool;
  required int32 field_id=-1 Int8 (Int(bitWidth=8, isSigned=true));
  required int32 field_id=-1 Uint16 (Int(bitWidth=16, isSigned=false));
  required int32 field_id=-1 Int32 (Int(bitWidth=32, isSigned=true));
  required int64 field_id=-1 Int64 (Int(bitWidth=64, isSigned=true));
  required int96 field_id=-1 Int96;
  required float field_id=-1 Float;
  required double field_id=-1 Double;
  required byte_array field_id=-1 ByteArray;
  required fixed_len_byte_array field_id=-1 FixedLenByteArray;
}
Example (Repetition)
package main

import (
	"log"
	"os"

	"github.com/apache/arrow/go/v17/parquet/schema"
)

func main() {
	type RepetitionSchema struct {
		List     []int64 `parquet:"fieldid=1"`
		Repeated []int64 `parquet:"repetition=repeated, fieldid=2"`
		Optional *int64  `parquet:"fieldid=3"`
		Required *int64  `parquet:"repetition=REQUIRED, fieldid=4"`
		Opt      int64   `parquet:"repetition=OPTIONAL, fieldid=5"`
	}

	sc, err := schema.NewSchemaFromStruct(RepetitionSchema{})
	if err != nil {
		log.Fatal(err)
	}

	schema.PrintSchema(sc.Root(), os.Stdout, 2)

}
Output:

repeated group field_id=-1 RepetitionSchema {
  required group field_id=1 List (List) {
    repeated group field_id=-1 list {
      required int64 field_id=-1 element (Int(bitWidth=64, isSigned=true));
    }
  }
  repeated int64 field_id=2 Repeated (Int(bitWidth=64, isSigned=true));
  optional int64 field_id=3 Optional (Int(bitWidth=64, isSigned=true));
  required int64 field_id=4 Required (Int(bitWidth=64, isSigned=true));
  optional int64 field_id=5 Opt (Int(bitWidth=64, isSigned=true));
}

func (*Schema) Column

func (s *Schema) Column(i int) *Column

Column returns the (0-indexed) column of the provided index.

func (*Schema) ColumnIndexByName

func (s *Schema) ColumnIndexByName(nodePath string) int

ColumnIndexByName looks up the column by its full dot-separated node path. If there are multiple columns that match, it returns the first one.

Returns -1 if not found.

func (*Schema) ColumnIndexByNode

func (s *Schema) ColumnIndexByNode(n Node) int

ColumnIndexByNode returns the index of the column represented by this node.

Returns -1 if not found.

func (*Schema) ColumnRoot

func (s *Schema) ColumnRoot(i int) Node

ColumnRoot returns the root node of a given column if it is under a nested group node, providing that root group node.

func (*Schema) Equals

func (s *Schema) Equals(rhs *Schema) bool

Equals returns true as long as the leaf columns are equal. It doesn't take the groups into account and only checks whether the schemas are compatible at the physical storage level.

func (*Schema) HasRepeatedFields

func (s *Schema) HasRepeatedFields() bool

HasRepeatedFields returns true if any node in the schema has a repeated field type.

func (*Schema) NumColumns

func (s *Schema) NumColumns() int

NumColumns returns the number of leaf nodes that are the actual primitive columns in this schema.

func (*Schema) Root

func (s *Schema) Root() *GroupNode

Root returns the group node that is the root of this schema

func (*Schema) String

func (s *Schema) String() string

func (*Schema) UpdateColumnOrders

func (s *Schema) UpdateColumnOrders(orders []parquet.ColumnOrder) error

UpdateColumnOrders must get a slice that is the same length as the number of leaf columns and is used to update the schema metadata Column Orders. len(orders) must equal s.NumColumns()

type SortOrder

type SortOrder int8

SortOrder mirrors the parquet.thrift sort order type

const (
	SortSIGNED SortOrder = iota
	SortUNSIGNED
	SortUNKNOWN
)

Constants for the Stat sort order definitions

func DefaultSortOrder

func DefaultSortOrder(primitive format.Type) SortOrder

DefaultSortOrder returns the default stat sort order for the given physical type

func GetLogicalSortOrder

func GetLogicalSortOrder(logical LogicalType, primitive format.Type) SortOrder

GetLogicalSortOrder returns the default sort order for this logical type or falls back to the default sort order for the physical type if not valid

func GetSortOrder

func GetSortOrder(convert ConvertedType, primitive format.Type) SortOrder

GetSortOrder defaults to the sort order based on the physical type if convert is ConvertedTypes.None, otherwise determines the sort order by the converted type.

type StringLogicalType

type StringLogicalType struct {
	// contains filtered or unexported fields
}

StringLogicalType is a UTF8 string, only usable with ByteArray and FixedLenByteArray

func (StringLogicalType) Equals

func (StringLogicalType) Equals(rhs LogicalType) bool

func (StringLogicalType) IsApplicable

func (StringLogicalType) IsApplicable(t parquet.Type, _ int32) bool

func (StringLogicalType) IsCompatible

func (StringLogicalType) IsCompatible(t ConvertedType, dec DecimalMetadata) bool

func (StringLogicalType) IsNested

func (StringLogicalType) IsNested() bool

func (StringLogicalType) IsNone

func (StringLogicalType) IsNone() bool

func (StringLogicalType) IsSerialized

func (StringLogicalType) IsSerialized() bool

func (StringLogicalType) IsValid

func (StringLogicalType) IsValid() bool

func (StringLogicalType) MarshalJSON

func (StringLogicalType) MarshalJSON() ([]byte, error)

func (StringLogicalType) SortOrder

func (StringLogicalType) SortOrder() SortOrder

func (StringLogicalType) String

func (StringLogicalType) String() string

func (StringLogicalType) ToConvertedType

func (StringLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata)

type TemporalLogicalType

type TemporalLogicalType interface {
	LogicalType
	IsAdjustedToUTC() bool
	TimeUnit() TimeUnitType
}

TemporalLogicalType is a smaller interface for Time based logical types like Time / Timestamp

type TimeLogicalType

type TimeLogicalType struct {
	// contains filtered or unexported fields
}

TimeLogicalType is a time type without a date and must be an int32 for milliseconds, or an int64 for micro or nano seconds.

func (TimeLogicalType) Equals

func (t TimeLogicalType) Equals(rhs LogicalType) bool

func (TimeLogicalType) IsAdjustedToUTC

func (t TimeLogicalType) IsAdjustedToUTC() bool

func (TimeLogicalType) IsApplicable

func (t TimeLogicalType) IsApplicable(typ parquet.Type, _ int32) bool

func (TimeLogicalType) IsCompatible

func (t TimeLogicalType) IsCompatible(c ConvertedType, dec DecimalMetadata) bool

func (TimeLogicalType) IsNested

func (TimeLogicalType) IsNested() bool

func (TimeLogicalType) IsNone

func (TimeLogicalType) IsNone() bool

func (TimeLogicalType) IsSerialized

func (TimeLogicalType) IsSerialized() bool

func (TimeLogicalType) IsValid

func (TimeLogicalType) IsValid() bool

func (TimeLogicalType) MarshalJSON

func (t TimeLogicalType) MarshalJSON() ([]byte, error)

func (TimeLogicalType) SortOrder

func (TimeLogicalType) SortOrder() SortOrder

func (TimeLogicalType) String

func (t TimeLogicalType) String() string

func (TimeLogicalType) TimeUnit

func (t TimeLogicalType) TimeUnit() TimeUnitType

func (TimeLogicalType) ToConvertedType

func (t TimeLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata)

type TimeUnitType

type TimeUnitType int

TimeUnitType is an enum for denoting whether a time based logical type is using milliseconds, microseconds or nanoseconds.

const (
	TimeUnitMillis TimeUnitType = iota
	TimeUnitMicros
	TimeUnitNanos
	TimeUnitUnknown
)

Constants for the TimeUnitType

type TimestampLogicalType

type TimestampLogicalType struct {
	// contains filtered or unexported fields
}

TimestampLogicalType represents an int64 number that can be decoded into a year, month, day, hour, minute, second, and subsecond

func (TimestampLogicalType) Equals

func (t TimestampLogicalType) Equals(rhs LogicalType) bool

func (TimestampLogicalType) IsAdjustedToUTC

func (t TimestampLogicalType) IsAdjustedToUTC() bool

func (TimestampLogicalType) IsApplicable

func (TimestampLogicalType) IsApplicable(t parquet.Type, _ int32) bool

func (TimestampLogicalType) IsCompatible

func (t TimestampLogicalType) IsCompatible(c ConvertedType, dec DecimalMetadata) bool

func (TimestampLogicalType) IsFromConvertedType

func (t TimestampLogicalType) IsFromConvertedType() bool

func (TimestampLogicalType) IsNested

func (TimestampLogicalType) IsNested() bool

func (TimestampLogicalType) IsNone

func (TimestampLogicalType) IsNone() bool

func (TimestampLogicalType) IsSerialized

func (t TimestampLogicalType) IsSerialized() bool

func (TimestampLogicalType) IsValid

func (TimestampLogicalType) IsValid() bool

func (TimestampLogicalType) MarshalJSON

func (t TimestampLogicalType) MarshalJSON() ([]byte, error)

func (TimestampLogicalType) SortOrder

func (TimestampLogicalType) SortOrder() SortOrder

func (TimestampLogicalType) String

func (t TimestampLogicalType) String() string

func (TimestampLogicalType) TimeUnit

func (t TimestampLogicalType) TimeUnit() TimeUnitType

func (TimestampLogicalType) ToConvertedType

func (t TimestampLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata)

type TimestampOpt

type TimestampOpt func(*TimestampLogicalType)

TimestampOpt is an option function used when constructing a new Timestamp logical type.

func WithTSForceConverted

func WithTSForceConverted() TimestampOpt

WithTSForceConverted enables force-converted mode.

func WithTSFromConverted

func WithTSFromConverted() TimestampOpt

WithTSFromConverted enables the timestamp logical type to be constructed from a converted type.

func WithTSIsAdjustedToUTC

func WithTSIsAdjustedToUTC() TimestampOpt

WithTSIsAdjustedToUTC sets the IsAdjustedToUTC field of the timestamp type.

func WithTSTimeUnitType

func WithTSTimeUnitType(unit TimeUnitType) TimestampOpt

WithTSTimeUnitType sets the time unit for the timestamp type

type UUIDLogicalType

type UUIDLogicalType struct {
	// contains filtered or unexported fields
}

UUIDLogicalType can only be used with a FixedLength byte array column that is exactly 16 bytes long

func (UUIDLogicalType) Equals

func (UUIDLogicalType) Equals(rhs LogicalType) bool

func (UUIDLogicalType) IsApplicable

func (UUIDLogicalType) IsApplicable(t parquet.Type, tlen int32) bool

func (UUIDLogicalType) IsCompatible

func (UUIDLogicalType) IsCompatible(c ConvertedType, dec DecimalMetadata) bool

func (UUIDLogicalType) IsNested

func (UUIDLogicalType) IsNested() bool

func (UUIDLogicalType) IsNone

func (UUIDLogicalType) IsNone() bool

func (UUIDLogicalType) IsSerialized

func (UUIDLogicalType) IsSerialized() bool

func (UUIDLogicalType) IsValid

func (UUIDLogicalType) IsValid() bool

func (UUIDLogicalType) MarshalJSON

func (UUIDLogicalType) MarshalJSON() ([]byte, error)

func (UUIDLogicalType) SortOrder

func (UUIDLogicalType) SortOrder() SortOrder

func (UUIDLogicalType) String

func (UUIDLogicalType) String() string

func (UUIDLogicalType) ToConvertedType

func (UUIDLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata)

type UnknownLogicalType

type UnknownLogicalType struct {
	// contains filtered or unexported fields
}

UnknownLogicalType is a type that is essentially a placeholder for when we don't know the type.

func (UnknownLogicalType) Equals

func (UnknownLogicalType) Equals(rhs LogicalType) bool

func (UnknownLogicalType) IsApplicable

func (UnknownLogicalType) IsApplicable(parquet.Type, int32) bool

func (UnknownLogicalType) IsCompatible

func (UnknownLogicalType) IsCompatible(c ConvertedType, dec DecimalMetadata) bool

func (UnknownLogicalType) IsNested

func (UnknownLogicalType) IsNested() bool

func (UnknownLogicalType) IsNone

func (UnknownLogicalType) IsNone() bool

func (UnknownLogicalType) IsSerialized

func (UnknownLogicalType) IsSerialized() bool

func (UnknownLogicalType) IsValid

func (UnknownLogicalType) IsValid() bool

func (UnknownLogicalType) MarshalJSON

func (UnknownLogicalType) MarshalJSON() ([]byte, error)

func (UnknownLogicalType) SortOrder

func (UnknownLogicalType) SortOrder() SortOrder

func (UnknownLogicalType) String

func (UnknownLogicalType) String() string

func (UnknownLogicalType) ToConvertedType

func (UnknownLogicalType) ToConvertedType() (ConvertedType, DecimalMetadata)

type Visitor

type Visitor interface {
	VisitPre(Node) bool
	VisitPost(Node)
}

Visitor is an interface for creating functionality to walk the schema tree.

A visitor can be passed to the Visit function of a Node in order to walk the tree. VisitPre is called the first time a node is encountered. If it is a group node, the return is checked and if it is false, the children will be skipped.

VisitPost is called after visiting any children

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL