schema

package
v1.0.7 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Mar 13, 2024 License: Apache-2.0 Imports: 10 Imported by: 0

Documentation

Index

Constants

View Source
// String keys for parquet field metadata.
// NOTE(review): these look like the keys of parquet-go field tags
// (e.g. `name=..., type=...`) — confirm against the tag parser.
const (
	ParquetFieldName          = "name"
	ParquetType               = "type"
	ParquetConvertedType      = "convertedtype"
	ParquetValueType          = "valuetype"
	ParquetRepetitionType     = "repetitiontype"
	ParquetKeyType            = "keytype"
	ParquetKeyConvertedType   = "keyconvertedtype"
	ParquetValueConvertedType = "valueconvertedtype"
	ParquetLogicalType        = "logicaltype"
)

// RepetitionTypeOptional is the string form of the OPTIONAL repetition type.
const RepetitionTypeOptional = "OPTIONAL"

// Parquet type names, see
// https://github.com/xitongsys/parquet-go?tab=readme-ov-file#type
const (
	ParquetTypeBoolean           = "BOOLEAN"
	ParquetTypeInt32             = "INT32"
	ParquetTypeInt64             = "INT64"
	ParquetTypeInt96             = "INT96"
	ParquetTypeFloat             = "FLOAT"
	ParquetTypeDouble            = "DOUBLE"
	ParquetTypeByteArray         = "BYTE_ARRAY"
	ParquetTypeFixedLenByteArray = "FIXED_LEN_BYTE_ARRAY"
)

// Type names for nested (non-primitive) values.
const (
	ParquetTypeMap    = "MAP"
	ParquetTypeList   = "LIST"
	ParquetTypeStruct = "STRUCT"
)

// Parquet converted type names, see
// https://github.com/apache/parquet-format/blob/97ed3ba484d3b5a7b58678457eceb518b037ee04/LogicalTypes.md#L136
//
// Several identifiers are misspelled relative to their values
// (ConvertedTypeTimeMills and ConvertedTypeTimestampMills for "..._MILLIS";
// ConvertedTypeUnint16/32/64 for "UINT_..."). They are exported, so the
// names are kept as-is for backward compatibility.
const (
	ConvertedTypeUtf8            = "UTF8"
	ConvertedTypeMap             = "MAP"
	ConvertedTypeList            = "LIST"
	ConvertedTypeEnum            = "ENUM"
	ConvertedTypeDecimal         = "DECIMAL"
	ConvertedTypeDate            = "DATE"
	ConvertedTypeTimeMills       = "TIME_MILLIS"
	ConvertedTypeTimeMicros      = "TIME_MICROS"
	ConvertedTypeTimestampMills  = "TIMESTAMP_MILLIS"
	ConvertedTypeTimestampMicros = "TIMESTAMP_MICROS"
	ConvertedTypeUint8           = "UINT_8"
	ConvertedTypeUnint16         = "UINT_16"
	ConvertedTypeUnint32         = "UINT_32"
	ConvertedTypeUnint64         = "UINT_64"
	ConvertedTypeInt8            = "INT_8"
	ConvertedTypeInt16           = "INT_16"
	ConvertedTypeInt32           = "INT_32"
	ConvertedTypeInt64           = "INT_64"
	ConvertedTypeJson            = "JSON"
	ConvertedTypeBson            = "BSON"
	ConvertedTypeInterval        = "INTERVAL"
)

// LogicalTypeString is the parquet logical type name for strings.
const LogicalTypeString = "STRING"

// Method names "Enum" and "String".
// NOTE(review): presumably looked up by reflection on protobuf-generated
// types — confirm against the callers.
const (
	ProtoEnumMethodName   = "Enum"
	ProtoStringMethodName = "String"
)

Variables

View Source
// ProtoTimestampType is the reflect.Type of the timestamppb.Timestamp struct
// (the value type itself, not a pointer to it).
var ProtoTimestampType = reflect.TypeOf(timestamppb.Timestamp{})

Functions

func ConvertArrowToParquetSchema

func ConvertArrowToParquetSchema(schema *arrow.Schema) ([]string, error)

ConvertArrowToParquetSchema converts an arrow schema to a representation understandable by the parquet-go library. We need this conversion and can't directly use the arrow format because the Go parquet type contains metadata which the base writer uses to determine the size of the objects.

func GenerateFieldTag

func GenerateFieldTag(field reflect.StructField, value reflect.Value) (tag string, err error)

Generate the tag for the struct field when there is no predefined tag

func GetDefinedTypeTag

func GetDefinedTypeTag(typ reflect.Type, value reflect.Value) (tags map[string]string, err error)

GetDefinedTypeTag gets the tags for the given reflect type using a switch statement, which performs better than a map lookup. Because this function is called once per event, its performance matters when there are many events to process.

func IsInternalField added in v1.0.3

func IsInternalField(name string) bool

func IsPointerGoTypeKind

func IsPointerGoTypeKind(kind reflect.Kind) bool

func IsPointerOrInterface added in v1.0.3

func IsPointerOrInterface(kind reflect.Kind) bool

func IsPrimitiveGoTypeKind

func IsPrimitiveGoTypeKind(kind reflect.Kind) bool

func IsPrimitiveOrPointerGoKind

func IsPrimitiveOrPointerGoKind(kind reflect.Kind) bool

func IsPrimitiveParquetType

func IsPrimitiveParquetType(typ string) bool

Types

type Item

// Item groups a reflected Go type (GoType), a reflected Go value (GoValue)
// and a pointer to its common.Tag (Info).
// NOTE(review): presumably one work item of the struct-to-schema traversal —
// confirm against the callers of NewItem.
type Item struct {
	GoType  reflect.Type
	GoValue reflect.Value
	Info    *common.Tag
}

func NewItem

func NewItem() *Item

type JSONSchemaItemType

// JSONSchemaItemType is one node of a JSON-encoded schema tree.
//
// Tag is serialized under the JSON key "Tag". Fields holds the nested child
// nodes under the JSON key "Fields" and is omitted from the encoded output
// when empty (omitempty).
// NOTE(review): Tag presumably carries a parquet tag string for the node —
// confirm against NewSchemaHandlerFromJSON.
type JSONSchemaItemType struct {
	Tag    string                `json:"Tag"`
	Fields []*JSONSchemaItemType `json:"Fields,omitempty"`
}

func NewJSONSchemaItem

func NewJSONSchemaItem() *JSONSchemaItemType

type PathMapType

// PathMapType is a tree node that records a path (Path) together with its
// child nodes (Children), which are keyed by string.
// NOTE(review): the Children keys are presumably the next path segment —
// confirm against Add.
type PathMapType struct {
	Path     string
	Children map[string]*PathMapType
}

PathMapType records a path and its children. It is used in Marshal to improve performance.

func NewPathMap

func NewPathMap(path string) *PathMapType

func (*PathMapType) Add

func (pmt *PathMapType) Add(path []string)

type SchemaHandler

// SchemaHandler stores the schema data.
//
// Fields:
//   - SchemaElements: the list of parquet schema elements.
//   - MapIndex / IndexMap: a string-to-int32 lookup and an int32-to-string
//     lookup. NOTE(review): presumably path -> element index and its
//     inverse — confirm.
//   - PathMap: pointer to a PathMapType tree node.
//   - Infos: a list of *common.Tag. NOTE(review): presumably parallel to
//     SchemaElements — confirm.
//   - InPathToExPath / ExPathToInPath: string-to-string maps between "In"
//     and "Ex" paths. NOTE(review): presumably internal vs. external path
//     names, populated by CreateInExMap — confirm.
//   - ValueColumns: a list of strings. NOTE(review): presumably the paths
//     of the value (leaf) columns — confirm.
type SchemaHandler struct {
	SchemaElements []*parquet.SchemaElement
	MapIndex       map[string]int32
	IndexMap       map[int32]string
	PathMap        *PathMapType
	Infos          []*common.Tag

	InPathToExPath map[string]string
	ExPathToInPath map[string]string

	ValueColumns []string
}

SchemaHandler stores the schema data

func NewSchemaHandlerFromArrow

func NewSchemaHandlerFromArrow(arrowSchema *arrow.Schema) (
	*SchemaHandler, error)

NewSchemaHandlerFromArrow creates a schema handler from arrow format. This handler is needed since the base ParquetWriter does not understand arrow schema and we need to translate it to the native format which the parquet-go library understands.

func NewSchemaHandlerFromJSON

func NewSchemaHandlerFromJSON(str string) (sh *SchemaHandler, err error)

func NewSchemaHandlerFromMetadata

func NewSchemaHandlerFromMetadata(mds []string) (*SchemaHandler, error)

Create a schema handler from CSV metadata

func NewSchemaHandlerFromSchemaHandler

func NewSchemaHandlerFromSchemaHandler(sh *SchemaHandler) *SchemaHandler

func NewSchemaHandlerFromSchemaList

func NewSchemaHandlerFromSchemaList(schemas []*parquet.SchemaElement) *SchemaHandler

NewSchemaHandlerFromSchemaList creates schema handler from schema list

func NewSchemaHandlerFromStruct

func NewSchemaHandlerFromStruct(obj interface{}, skipNoTagField bool) (sh *SchemaHandler, err error)

NewSchemaHandlerFromStruct performs a DFS traversal of obj's underlying struct type. If a field is a struct, it recursively visits its sub-fields. It generates the schemas for each non-struct field and creates a SchemaHandler from the schema list. A primitive type generates just one schema. Slices have their own special handling: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists Maps also require special handling: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#maps

func (*SchemaHandler) ConvertToInPathStr

func (sh *SchemaHandler) ConvertToInPathStr(pathStr string) (string, error)

Convert a path to internal path

func (*SchemaHandler) CreateInExMap

func (sh *SchemaHandler) CreateInExMap()

func (*SchemaHandler) GetColumnNum

func (sh *SchemaHandler) GetColumnNum() int64

func (*SchemaHandler) GetExName

func (sh *SchemaHandler) GetExName(index int) string

func (*SchemaHandler) GetInName

func (sh *SchemaHandler) GetInName(index int) string

func (*SchemaHandler) GetRepetitionLevelIndex

func (sh *SchemaHandler) GetRepetitionLevelIndex(path []string, rl int32) (int32, error)

GetRepetitionLevelIndex returns the index corresponding to repetition level rl for a column identified by its schema path.

func (*SchemaHandler) GetRepetitionType

func (sh *SchemaHandler) GetRepetitionType(path []string) (parquet.FieldRepetitionType, error)

GetRepetitionType returns the repetition type of a column, given its schema path.

func (*SchemaHandler) GetRootExName

func (sh *SchemaHandler) GetRootExName() string

func (*SchemaHandler) GetRootInName

func (sh *SchemaHandler) GetRootInName() string

Get root name from the schema handler

func (*SchemaHandler) GetType

func (sh *SchemaHandler) GetType(prefixPath string) (reflect.Type, error)

func (*SchemaHandler) GetTypes

func (sh *SchemaHandler) GetTypes() []reflect.Type

Get object type from schema by reflect

func (*SchemaHandler) MaxDefinitionLevel

func (sh *SchemaHandler) MaxDefinitionLevel(path []string) (int32, error)

MaxDefinitionLevel returns the maximum definition level of a column, given its schema path.

func (*SchemaHandler) MaxRepetitionLevel

func (sh *SchemaHandler) MaxRepetitionLevel(path []string) (int32, error)

MaxRepetitionLevel returns the maximum repetition level of a column, given its schema path.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL