Documentation ¶
Index ¶
- Constants
- Variables
- func ConvertArrowToParquetSchema(schema *arrow.Schema) ([]string, error)
- func GenerateFieldTag(field reflect.StructField, value reflect.Value) (tag string, err error)
- func GetDefinedTypeTag(typ reflect.Type, value reflect.Value) (tags map[string]string, err error)
- func IsInternalField(name string) bool
- func IsPointerGoTypeKind(kind reflect.Kind) bool
- func IsPointerOrInterface(kind reflect.Kind) bool
- func IsPrimitiveGoTypeKind(kind reflect.Kind) bool
- func IsPrimitiveOrPointerGoKind(kind reflect.Kind) bool
- func IsPrimitiveParquetType(typ string) bool
- type Item
- type JSONSchemaItemType
- type PathMapType
- type SchemaHandler
- func NewSchemaHandlerFromArrow(arrowSchema *arrow.Schema) (*SchemaHandler, error)
- func NewSchemaHandlerFromJSON(str string) (sh *SchemaHandler, err error)
- func NewSchemaHandlerFromMetadata(mds []string) (*SchemaHandler, error)
- func NewSchemaHandlerFromSchemaHandler(sh *SchemaHandler) *SchemaHandler
- func NewSchemaHandlerFromSchemaList(schemas []*parquet.SchemaElement) *SchemaHandler
- func NewSchemaHandlerFromStruct(obj interface{}, skipNoTagField bool) (sh *SchemaHandler, err error)
- func (sh *SchemaHandler) ConvertToInPathStr(pathStr string) (string, error)
- func (sh *SchemaHandler) CreateInExMap()
- func (sh *SchemaHandler) GetColumnNum() int64
- func (sh *SchemaHandler) GetExName(index int) string
- func (sh *SchemaHandler) GetInName(index int) string
- func (sh *SchemaHandler) GetRepetitionLevelIndex(path []string, rl int32) (int32, error)
- func (sh *SchemaHandler) GetRepetitionType(path []string) (parquet.FieldRepetitionType, error)
- func (sh *SchemaHandler) GetRootExName() string
- func (sh *SchemaHandler) GetRootInName() string
- func (sh *SchemaHandler) GetType(prefixPath string) (reflect.Type, error)
- func (sh *SchemaHandler) GetTypes() []reflect.Type
- func (sh *SchemaHandler) MaxDefinitionLevel(path []string) (int32, error)
- func (sh *SchemaHandler) MaxRepetitionLevel(path []string) (int32, error)
Constants ¶
const ( ParquetFieldName = "name" ParquetType = "type" ParquetConvertedType = "convertedtype" ParquetValueType = "valuetype" ParquetRepetitionType = "repetitiontype" ParquetKeyType = "keytype" ParquetKeyConvertedType = "keyconvertedtype" ParquetValueConvertedType = "valueconvertedtype" ParquetLogicalType = "logicaltype" RepetitionTypeOptional = "OPTIONAL" // Parquet type https://github.com/xitongsys/parquet-go?tab=readme-ov-file#type ParquetTypeBoolean = "BOOLEAN" ParquetTypeInt32 = "INT32" ParquetTypeInt64 = "INT64" ParquetTypeInt96 = "INT96" ParquetTypeFloat = "FLOAT" ParquetTypeDouble = "DOUBLE" ParquetTypeByteArray = "BYTE_ARRAY" ParquetTypeFixedLenByteArray = "FIXED_LEN_BYTE_ARRAY" ParquetTypeMap = "MAP" ParquetTypeList = "LIST" ParquetTypeStruct = "STRUCT" // Parquet converted type https://github.com/apache/parquet-format/blob/97ed3ba484d3b5a7b58678457eceb518b037ee04/LogicalTypes.md#L136 ConvertedTypeUtf8 = "UTF8" ConvertedTypeMap = "MAP" ConvertedTypeList = "LIST" ConvertedTypeEnum = "ENUM" ConvertedTypeDecimal = "DECIMAL" ConvertedTypeDate = "DATE" ConvertedTypeTimeMills = "TIME_MILLIS" ConvertedTypeTimeMicros = "TIME_MICROS" ConvertedTypeTimestampMills = "TIMESTAMP_MILLIS" ConvertedTypeTimestampMicros = "TIMESTAMP_MICROS" ConvertedTypeUint8 = "UINT_8" ConvertedTypeUnint16 = "UINT_16" ConvertedTypeUnint32 = "UINT_32" ConvertedTypeUnint64 = "UINT_64" ConvertedTypeInt8 = "INT_8" ConvertedTypeInt16 = "INT_16" ConvertedTypeInt32 = "INT_32" ConvertedTypeInt64 = "INT_64" ConvertedTypeJson = "JSON" ConvertedTypeBson = "BSON" ConvertedTypeInterval = "INTERVAL" // Parquet logical type LogicalTypeString = "STRING" ProtoEnumMethodName = "Enum" ProtoStringMethodName = "String" )
Variables ¶
var ProtoTimestampType = reflect.TypeOf(timestamppb.Timestamp{})
Functions ¶
func ConvertArrowToParquetSchema ¶
ConvertArrowToParquetSchema converts an arrow schema to a representation understandable by the parquet-go library. We need this conversion and can't directly use the arrow format because the go parquet type contains metadata which the base writer uses to determine the size of the objects.
func GenerateFieldTag ¶
Generate the tag for the struct field when there is no predefined tag
func GetDefinedTypeTag ¶
Get the tags for the given reflect type using a switch statement, which performs better than a map lookup. Because this function is called once per event, its performance matters when there are many events to process.
func IsInternalField ¶ added in v1.0.3
func IsPointerGoTypeKind ¶
func IsPointerOrInterface ¶ added in v1.0.3
func IsPrimitiveGoTypeKind ¶
func IsPrimitiveParquetType ¶
Types ¶
type JSONSchemaItemType ¶
type JSONSchemaItemType struct { Tag string `json:"Tag"` Fields []*JSONSchemaItemType `json:"Fields,omitempty"` }
func NewJSONSchemaItem ¶
func NewJSONSchemaItem() *JSONSchemaItemType
type PathMapType ¶
type PathMapType struct { Path string Children map[string]*PathMapType }
PathMapType records the path and its children. It is used in Marshal to improve performance.
func NewPathMap ¶
func NewPathMap(path string) *PathMapType
func (*PathMapType) Add ¶
func (pmt *PathMapType) Add(path []string)
type SchemaHandler ¶
type SchemaHandler struct { SchemaElements []*parquet.SchemaElement MapIndex map[string]int32 IndexMap map[int32]string PathMap *PathMapType Infos []*common.Tag InPathToExPath map[string]string ExPathToInPath map[string]string ValueColumns []string }
SchemaHandler stores the schema data
func NewSchemaHandlerFromArrow ¶
func NewSchemaHandlerFromArrow(arrowSchema *arrow.Schema) ( *SchemaHandler, error)
NewSchemaHandlerFromArrow creates a schema handler from arrow format. This handler is needed since the base ParquetWriter does not understand arrow schema and we need to translate it to the native format which the parquet-go library understands.
func NewSchemaHandlerFromJSON ¶
func NewSchemaHandlerFromJSON(str string) (sh *SchemaHandler, err error)
func NewSchemaHandlerFromMetadata ¶
func NewSchemaHandlerFromMetadata(mds []string) (*SchemaHandler, error)
Create a schema handler from CSV metadata
func NewSchemaHandlerFromSchemaHandler ¶
func NewSchemaHandlerFromSchemaHandler(sh *SchemaHandler) *SchemaHandler
func NewSchemaHandlerFromSchemaList ¶
func NewSchemaHandlerFromSchemaList(schemas []*parquet.SchemaElement) *SchemaHandler
NewSchemaHandlerFromSchemaList creates schema handler from schema list
func NewSchemaHandlerFromStruct ¶
func NewSchemaHandlerFromStruct(obj interface{}, skipNoTagField bool) (sh *SchemaHandler, err error)
DFS traverse the struct type underlying obj. If a field is a struct, recursively visit its sub-fields. Generate the schemas for each non-struct field and create a SchemaHandler from the schema list. A primitive type just generates one schema. Slice has its own special handling: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists Map also requires special handling: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#maps
func (*SchemaHandler) ConvertToInPathStr ¶
func (sh *SchemaHandler) ConvertToInPathStr(pathStr string) (string, error)
Convert a path to internal path
func (*SchemaHandler) CreateInExMap ¶
func (sh *SchemaHandler) CreateInExMap()
func (*SchemaHandler) GetColumnNum ¶
func (sh *SchemaHandler) GetColumnNum() int64
func (*SchemaHandler) GetExName ¶
func (sh *SchemaHandler) GetExName(index int) string
func (*SchemaHandler) GetInName ¶
func (sh *SchemaHandler) GetInName(index int) string
func (*SchemaHandler) GetRepetitionLevelIndex ¶
func (sh *SchemaHandler) GetRepetitionLevelIndex(path []string, rl int32) (int32, error)
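GetRepetitionLevelIndex looks up an index within the given schema path for repetition level rl. NOTE(review): the original comment here duplicated the MaxRepetitionLevel description; confirm the exact semantics against the implementation.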
func (*SchemaHandler) GetRepetitionType ¶
func (sh *SchemaHandler) GetRepetitionType(path []string) (parquet.FieldRepetitionType, error)
GetRepetitionType returns the repetition type of a column by its schema path
func (*SchemaHandler) GetRootExName ¶
func (sh *SchemaHandler) GetRootExName() string
func (*SchemaHandler) GetRootInName ¶
func (sh *SchemaHandler) GetRootInName() string
Get root name from the schema handler
func (*SchemaHandler) GetType ¶
func (sh *SchemaHandler) GetType(prefixPath string) (reflect.Type, error)
func (*SchemaHandler) GetTypes ¶
func (sh *SchemaHandler) GetTypes() []reflect.Type
Get the object types from the schema using reflection
func (*SchemaHandler) MaxDefinitionLevel ¶
func (sh *SchemaHandler) MaxDefinitionLevel(path []string) (int32, error)
MaxDefinitionLevel returns the max definition level of a column by its schema path
func (*SchemaHandler) MaxRepetitionLevel ¶
func (sh *SchemaHandler) MaxRepetitionLevel(path []string) (int32, error)
MaxRepetitionLevel returns the max repetition level of a column by its schema path