parquet

package
v1.3.0 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: May 24, 2019 License: Apache-2.0 Imports: 5 Imported by: 352

Documentation

Index

Constants

This section is empty.

Variables

View Source
var ColumnChunk_FilePath_DEFAULT string
View Source
var ColumnMetaData_DictionaryPageOffset_DEFAULT int64
View Source
var ColumnMetaData_EncodingStats_DEFAULT []*PageEncodingStats
View Source
var ColumnMetaData_IndexPageOffset_DEFAULT int64
View Source
var ColumnMetaData_KeyValueMetadata_DEFAULT []*KeyValue
View Source
var DataPageHeaderV2_IsCompressed_DEFAULT bool = true
View Source
var DictionaryPageHeader_IsSorted_DEFAULT bool
View Source
var FileMetaData_CreatedBy_DEFAULT string
View Source
var FileMetaData_KeyValueMetadata_DEFAULT []*KeyValue
View Source
var GoUnusedProtection__ int
View Source
var KeyValue_Value_DEFAULT string
View Source
var PageHeader_Crc_DEFAULT int32
View Source
var RowGroup_SortingColumns_DEFAULT []*SortingColumn
View Source
var SchemaElement_FieldID_DEFAULT int32
View Source
var SchemaElement_NumChildren_DEFAULT int32
View Source
var SchemaElement_Precision_DEFAULT int32
View Source
var SchemaElement_Scale_DEFAULT int32
View Source
var SchemaElement_TypeLength_DEFAULT int32
View Source
var Statistics_DistinctCount_DEFAULT int64
View Source
var Statistics_Max_DEFAULT []byte
View Source
var Statistics_Min_DEFAULT []byte
View Source
var Statistics_NullCount_DEFAULT int64

Functions

This section is empty.

Types

type ColumnChunk

type ColumnChunk struct {
	FilePath   *string         `thrift:"file_path,1" db:"file_path" json:"file_path,omitempty"`
	FileOffset int64           `thrift:"file_offset,2,required" db:"file_offset" json:"file_offset"`
	MetaData   *ColumnMetaData `thrift:"meta_data,3" db:"meta_data" json:"meta_data,omitempty"`
}

Attributes:

  • FilePath: File where column data is stored. If not set, assumed to be same file as

metadata. This path is relative to the current file.

  • FileOffset: Byte offset in file_path to the ColumnMetaData *
  • MetaData: Column metadata for this chunk. This is the same content as what is at

file_path/file_offset. Having it here has it replicated in the file metadata.

func NewColumnChunk

func NewColumnChunk() *ColumnChunk

func (*ColumnChunk) GetFileOffset

func (p *ColumnChunk) GetFileOffset() int64

func (*ColumnChunk) GetFilePath

func (p *ColumnChunk) GetFilePath() string

func (*ColumnChunk) GetMetaData

func (p *ColumnChunk) GetMetaData() *ColumnMetaData

func (*ColumnChunk) IsSetFilePath

func (p *ColumnChunk) IsSetFilePath() bool

func (*ColumnChunk) IsSetMetaData

func (p *ColumnChunk) IsSetMetaData() bool

func (*ColumnChunk) Read

func (p *ColumnChunk) Read(iprot thrift.TProtocol) error

func (*ColumnChunk) ReadField1

func (p *ColumnChunk) ReadField1(iprot thrift.TProtocol) error

func (*ColumnChunk) ReadField2

func (p *ColumnChunk) ReadField2(iprot thrift.TProtocol) error

func (*ColumnChunk) ReadField3

func (p *ColumnChunk) ReadField3(iprot thrift.TProtocol) error

func (*ColumnChunk) String

func (p *ColumnChunk) String() string

func (*ColumnChunk) Write

func (p *ColumnChunk) Write(oprot thrift.TProtocol) error

type ColumnMetaData

type ColumnMetaData struct {
	Type                  Type                 `thrift:"type,1,required" db:"type" json:"type"`
	Encodings             []Encoding           `thrift:"encodings,2,required" db:"encodings" json:"encodings"`
	PathInSchema          []string             `thrift:"path_in_schema,3,required" db:"path_in_schema" json:"path_in_schema"`
	Codec                 CompressionCodec     `thrift:"codec,4,required" db:"codec" json:"codec"`
	NumValues             int64                `thrift:"num_values,5,required" db:"num_values" json:"num_values"`
	TotalUncompressedSize int64                `thrift:"total_uncompressed_size,6,required" db:"total_uncompressed_size" json:"total_uncompressed_size"`
	TotalCompressedSize   int64                `thrift:"total_compressed_size,7,required" db:"total_compressed_size" json:"total_compressed_size"`
	KeyValueMetadata      []*KeyValue          `thrift:"key_value_metadata,8" db:"key_value_metadata" json:"key_value_metadata,omitempty"`
	DataPageOffset        int64                `thrift:"data_page_offset,9,required" db:"data_page_offset" json:"data_page_offset"`
	IndexPageOffset       *int64               `thrift:"index_page_offset,10" db:"index_page_offset" json:"index_page_offset,omitempty"`
	DictionaryPageOffset  *int64               `thrift:"dictionary_page_offset,11" db:"dictionary_page_offset" json:"dictionary_page_offset,omitempty"`
	Statistics            *Statistics          `thrift:"statistics,12" db:"statistics" json:"statistics,omitempty"`
	EncodingStats         []*PageEncodingStats `thrift:"encoding_stats,13" db:"encoding_stats" json:"encoding_stats,omitempty"`
}

Description for column metadata

Attributes:

  • Type: Type of this column *
  • Encodings: Set of all encodings used for this column. The purpose is to validate

whether we can decode those pages. *

  • PathInSchema: Path in schema *
  • Codec: Compression codec *
  • NumValues: Number of values in this column *
  • TotalUncompressedSize: total byte size of all uncompressed pages in this column chunk (including the headers) *
  • TotalCompressedSize: total byte size of all compressed pages in this column chunk (including the headers) *
  • KeyValueMetadata: Optional key/value metadata *
  • DataPageOffset: Byte offset from beginning of file to first data page *
  • IndexPageOffset: Byte offset from beginning of file to root index page *
  • DictionaryPageOffset: Byte offset from the beginning of file to first (only) dictionary page *
  • Statistics: optional statistics for this column chunk
  • EncodingStats: Set of all encodings used for pages in this column chunk.

This information can be used to determine if all data pages are dictionary encoded for example *

var ColumnChunk_MetaData_DEFAULT *ColumnMetaData

func NewColumnMetaData

func NewColumnMetaData() *ColumnMetaData

func (*ColumnMetaData) GetCodec

func (p *ColumnMetaData) GetCodec() CompressionCodec

func (*ColumnMetaData) GetDataPageOffset

func (p *ColumnMetaData) GetDataPageOffset() int64

func (*ColumnMetaData) GetDictionaryPageOffset

func (p *ColumnMetaData) GetDictionaryPageOffset() int64

func (*ColumnMetaData) GetEncodingStats

func (p *ColumnMetaData) GetEncodingStats() []*PageEncodingStats

func (*ColumnMetaData) GetEncodings

func (p *ColumnMetaData) GetEncodings() []Encoding

func (*ColumnMetaData) GetIndexPageOffset

func (p *ColumnMetaData) GetIndexPageOffset() int64

func (*ColumnMetaData) GetKeyValueMetadata

func (p *ColumnMetaData) GetKeyValueMetadata() []*KeyValue

func (*ColumnMetaData) GetNumValues

func (p *ColumnMetaData) GetNumValues() int64

func (*ColumnMetaData) GetPathInSchema

func (p *ColumnMetaData) GetPathInSchema() []string

func (*ColumnMetaData) GetStatistics

func (p *ColumnMetaData) GetStatistics() *Statistics

func (*ColumnMetaData) GetTotalCompressedSize

func (p *ColumnMetaData) GetTotalCompressedSize() int64

func (*ColumnMetaData) GetTotalUncompressedSize

func (p *ColumnMetaData) GetTotalUncompressedSize() int64

func (*ColumnMetaData) GetType

func (p *ColumnMetaData) GetType() Type

func (*ColumnMetaData) IsSetDictionaryPageOffset

func (p *ColumnMetaData) IsSetDictionaryPageOffset() bool

func (*ColumnMetaData) IsSetEncodingStats

func (p *ColumnMetaData) IsSetEncodingStats() bool

func (*ColumnMetaData) IsSetIndexPageOffset

func (p *ColumnMetaData) IsSetIndexPageOffset() bool

func (*ColumnMetaData) IsSetKeyValueMetadata

func (p *ColumnMetaData) IsSetKeyValueMetadata() bool

func (*ColumnMetaData) IsSetStatistics

func (p *ColumnMetaData) IsSetStatistics() bool

func (*ColumnMetaData) Read

func (p *ColumnMetaData) Read(iprot thrift.TProtocol) error

func (*ColumnMetaData) ReadField1

func (p *ColumnMetaData) ReadField1(iprot thrift.TProtocol) error

func (*ColumnMetaData) ReadField10

func (p *ColumnMetaData) ReadField10(iprot thrift.TProtocol) error

func (*ColumnMetaData) ReadField11

func (p *ColumnMetaData) ReadField11(iprot thrift.TProtocol) error

func (*ColumnMetaData) ReadField12

func (p *ColumnMetaData) ReadField12(iprot thrift.TProtocol) error

func (*ColumnMetaData) ReadField13

func (p *ColumnMetaData) ReadField13(iprot thrift.TProtocol) error

func (*ColumnMetaData) ReadField2

func (p *ColumnMetaData) ReadField2(iprot thrift.TProtocol) error

func (*ColumnMetaData) ReadField3

func (p *ColumnMetaData) ReadField3(iprot thrift.TProtocol) error

func (*ColumnMetaData) ReadField4

func (p *ColumnMetaData) ReadField4(iprot thrift.TProtocol) error

func (*ColumnMetaData) ReadField5

func (p *ColumnMetaData) ReadField5(iprot thrift.TProtocol) error

func (*ColumnMetaData) ReadField6

func (p *ColumnMetaData) ReadField6(iprot thrift.TProtocol) error

func (*ColumnMetaData) ReadField7

func (p *ColumnMetaData) ReadField7(iprot thrift.TProtocol) error

func (*ColumnMetaData) ReadField8

func (p *ColumnMetaData) ReadField8(iprot thrift.TProtocol) error

func (*ColumnMetaData) ReadField9

func (p *ColumnMetaData) ReadField9(iprot thrift.TProtocol) error

func (*ColumnMetaData) String

func (p *ColumnMetaData) String() string

func (*ColumnMetaData) Write

func (p *ColumnMetaData) Write(oprot thrift.TProtocol) error

type CompressionCodec

type CompressionCodec int64

Supported compression algorithms.

const (
	CompressionCodec_UNCOMPRESSED CompressionCodec = 0
	CompressionCodec_SNAPPY       CompressionCodec = 1
	CompressionCodec_GZIP         CompressionCodec = 2
	CompressionCodec_LZO          CompressionCodec = 3
)

func CompressionCodecFromString

func CompressionCodecFromString(s string) (CompressionCodec, error)

func CompressionCodecPtr

func CompressionCodecPtr(v CompressionCodec) *CompressionCodec

func (CompressionCodec) MarshalText

func (p CompressionCodec) MarshalText() ([]byte, error)

func (*CompressionCodec) Scan

func (p *CompressionCodec) Scan(value interface{}) error

func (CompressionCodec) String

func (p CompressionCodec) String() string

func (*CompressionCodec) UnmarshalText

func (p *CompressionCodec) UnmarshalText(text []byte) error

func (*CompressionCodec) Value

func (p *CompressionCodec) Value() (driver.Value, error)

type ConvertedType

type ConvertedType int64

Common types used by frameworks (e.g. hive, pig) using parquet. This helps map between types in those frameworks to the base types in parquet. This is only metadata and not needed to read or write the data.

const (
	ConvertedType_UTF8             ConvertedType = 0
	ConvertedType_MAP              ConvertedType = 1
	ConvertedType_MAP_KEY_VALUE    ConvertedType = 2
	ConvertedType_LIST             ConvertedType = 3
	ConvertedType_ENUM             ConvertedType = 4
	ConvertedType_DECIMAL          ConvertedType = 5
	ConvertedType_DATE             ConvertedType = 6
	ConvertedType_TIME_MILLIS      ConvertedType = 7
	ConvertedType_TIME_MICROS      ConvertedType = 8
	ConvertedType_TIMESTAMP_MILLIS ConvertedType = 9
	ConvertedType_TIMESTAMP_MICROS ConvertedType = 10
	ConvertedType_UINT_8           ConvertedType = 11
	ConvertedType_UINT_16          ConvertedType = 12
	ConvertedType_UINT_32          ConvertedType = 13
	ConvertedType_UINT_64          ConvertedType = 14
	ConvertedType_INT_8            ConvertedType = 15
	ConvertedType_INT_16           ConvertedType = 16
	ConvertedType_INT_32           ConvertedType = 17
	ConvertedType_INT_64           ConvertedType = 18
	ConvertedType_JSON             ConvertedType = 19
	ConvertedType_BSON             ConvertedType = 20
	ConvertedType_INTERVAL         ConvertedType = 21
)
var SchemaElement_ConvertedType_DEFAULT ConvertedType

func ConvertedTypeFromString

func ConvertedTypeFromString(s string) (ConvertedType, error)

func ConvertedTypePtr

func ConvertedTypePtr(v ConvertedType) *ConvertedType

func (ConvertedType) MarshalText

func (p ConvertedType) MarshalText() ([]byte, error)

func (*ConvertedType) Scan

func (p *ConvertedType) Scan(value interface{}) error

func (ConvertedType) String

func (p ConvertedType) String() string

func (*ConvertedType) UnmarshalText

func (p *ConvertedType) UnmarshalText(text []byte) error

func (*ConvertedType) Value

func (p *ConvertedType) Value() (driver.Value, error)

type DataPageHeader

type DataPageHeader struct {
	NumValues               int32       `thrift:"num_values,1,required" db:"num_values" json:"num_values"`
	Encoding                Encoding    `thrift:"encoding,2,required" db:"encoding" json:"encoding"`
	DefinitionLevelEncoding Encoding    `thrift:"definition_level_encoding,3,required" db:"definition_level_encoding" json:"definition_level_encoding"`
	RepetitionLevelEncoding Encoding    `thrift:"repetition_level_encoding,4,required" db:"repetition_level_encoding" json:"repetition_level_encoding"`
	Statistics              *Statistics `thrift:"statistics,5" db:"statistics" json:"statistics,omitempty"`
}

Data page header

Attributes:

  • NumValues: Number of values, including NULLs, in this data page. *
  • Encoding: Encoding used for this data page *
  • DefinitionLevelEncoding: Encoding used for definition levels *
  • RepetitionLevelEncoding: Encoding used for repetition levels *
  • Statistics: Optional statistics for the data in this page*
var PageHeader_DataPageHeader_DEFAULT *DataPageHeader

func NewDataPageHeader

func NewDataPageHeader() *DataPageHeader

func (*DataPageHeader) GetDefinitionLevelEncoding

func (p *DataPageHeader) GetDefinitionLevelEncoding() Encoding

func (*DataPageHeader) GetEncoding

func (p *DataPageHeader) GetEncoding() Encoding

func (*DataPageHeader) GetNumValues

func (p *DataPageHeader) GetNumValues() int32

func (*DataPageHeader) GetRepetitionLevelEncoding

func (p *DataPageHeader) GetRepetitionLevelEncoding() Encoding

func (*DataPageHeader) GetStatistics

func (p *DataPageHeader) GetStatistics() *Statistics

func (*DataPageHeader) IsSetStatistics

func (p *DataPageHeader) IsSetStatistics() bool

func (*DataPageHeader) Read

func (p *DataPageHeader) Read(iprot thrift.TProtocol) error

func (*DataPageHeader) ReadField1

func (p *DataPageHeader) ReadField1(iprot thrift.TProtocol) error

func (*DataPageHeader) ReadField2

func (p *DataPageHeader) ReadField2(iprot thrift.TProtocol) error

func (*DataPageHeader) ReadField3

func (p *DataPageHeader) ReadField3(iprot thrift.TProtocol) error

func (*DataPageHeader) ReadField4

func (p *DataPageHeader) ReadField4(iprot thrift.TProtocol) error

func (*DataPageHeader) ReadField5

func (p *DataPageHeader) ReadField5(iprot thrift.TProtocol) error

func (*DataPageHeader) String

func (p *DataPageHeader) String() string

func (*DataPageHeader) Write

func (p *DataPageHeader) Write(oprot thrift.TProtocol) error

type DataPageHeaderV2

type DataPageHeaderV2 struct {
	NumValues                  int32       `thrift:"num_values,1,required" db:"num_values" json:"num_values"`
	NumNulls                   int32       `thrift:"num_nulls,2,required" db:"num_nulls" json:"num_nulls"`
	NumRows                    int32       `thrift:"num_rows,3,required" db:"num_rows" json:"num_rows"`
	Encoding                   Encoding    `thrift:"encoding,4,required" db:"encoding" json:"encoding"`
	DefinitionLevelsByteLength int32       `thrift:"definition_levels_byte_length,5,required" db:"definition_levels_byte_length" json:"definition_levels_byte_length"`
	RepetitionLevelsByteLength int32       `thrift:"repetition_levels_byte_length,6,required" db:"repetition_levels_byte_length" json:"repetition_levels_byte_length"`
	IsCompressed               bool        `thrift:"is_compressed,7" db:"is_compressed" json:"is_compressed,omitempty"`
	Statistics                 *Statistics `thrift:"statistics,8" db:"statistics" json:"statistics,omitempty"`
}

New page format allowing reading levels without decompressing the data. Repetition and definition levels are uncompressed. The remaining section containing the data is compressed if is_compressed is true.

Attributes:

  • NumValues: Number of values, including NULLs, in this data page. *
  • NumNulls: Number of NULL values, in this data page.

Number of non-null = num_values - num_nulls which is also the number of values in the data section *

  • NumRows: Number of rows in this data page, which means pages change on record boundaries (r = 0) *
  • Encoding: Encoding used for data in this page *
  • DefinitionLevelsByteLength: length of the definition levels
  • RepetitionLevelsByteLength: length of the repetition levels
  • IsCompressed: whether the values are compressed.

Which means the section of the page between definition_levels_byte_length + repetition_levels_byte_length + 1 and compressed_page_size (included) is compressed with the compression_codec. If missing, it is considered compressed.

  • Statistics: optional statistics for the data in this page
var PageHeader_DataPageHeaderV2_DEFAULT *DataPageHeaderV2

func NewDataPageHeaderV2

func NewDataPageHeaderV2() *DataPageHeaderV2

func (*DataPageHeaderV2) GetDefinitionLevelsByteLength

func (p *DataPageHeaderV2) GetDefinitionLevelsByteLength() int32

func (*DataPageHeaderV2) GetEncoding

func (p *DataPageHeaderV2) GetEncoding() Encoding

func (*DataPageHeaderV2) GetIsCompressed

func (p *DataPageHeaderV2) GetIsCompressed() bool

func (*DataPageHeaderV2) GetNumNulls

func (p *DataPageHeaderV2) GetNumNulls() int32

func (*DataPageHeaderV2) GetNumRows

func (p *DataPageHeaderV2) GetNumRows() int32

func (*DataPageHeaderV2) GetNumValues

func (p *DataPageHeaderV2) GetNumValues() int32

func (*DataPageHeaderV2) GetRepetitionLevelsByteLength

func (p *DataPageHeaderV2) GetRepetitionLevelsByteLength() int32

func (*DataPageHeaderV2) GetStatistics

func (p *DataPageHeaderV2) GetStatistics() *Statistics

func (*DataPageHeaderV2) IsSetIsCompressed

func (p *DataPageHeaderV2) IsSetIsCompressed() bool

func (*DataPageHeaderV2) IsSetStatistics

func (p *DataPageHeaderV2) IsSetStatistics() bool

func (*DataPageHeaderV2) Read

func (p *DataPageHeaderV2) Read(iprot thrift.TProtocol) error

func (*DataPageHeaderV2) ReadField1

func (p *DataPageHeaderV2) ReadField1(iprot thrift.TProtocol) error

func (*DataPageHeaderV2) ReadField2

func (p *DataPageHeaderV2) ReadField2(iprot thrift.TProtocol) error

func (*DataPageHeaderV2) ReadField3

func (p *DataPageHeaderV2) ReadField3(iprot thrift.TProtocol) error

func (*DataPageHeaderV2) ReadField4

func (p *DataPageHeaderV2) ReadField4(iprot thrift.TProtocol) error

func (*DataPageHeaderV2) ReadField5

func (p *DataPageHeaderV2) ReadField5(iprot thrift.TProtocol) error

func (*DataPageHeaderV2) ReadField6

func (p *DataPageHeaderV2) ReadField6(iprot thrift.TProtocol) error

func (*DataPageHeaderV2) ReadField7

func (p *DataPageHeaderV2) ReadField7(iprot thrift.TProtocol) error

func (*DataPageHeaderV2) ReadField8

func (p *DataPageHeaderV2) ReadField8(iprot thrift.TProtocol) error

func (*DataPageHeaderV2) String

func (p *DataPageHeaderV2) String() string

func (*DataPageHeaderV2) Write

func (p *DataPageHeaderV2) Write(oprot thrift.TProtocol) error

type DictionaryPageHeader

type DictionaryPageHeader struct {
	NumValues int32    `thrift:"num_values,1,required" db:"num_values" json:"num_values"`
	Encoding  Encoding `thrift:"encoding,2,required" db:"encoding" json:"encoding"`
	IsSorted  *bool    `thrift:"is_sorted,3" db:"is_sorted" json:"is_sorted,omitempty"`
}

TODO: *

Attributes:

  • NumValues: Number of values in the dictionary *
  • Encoding: Encoding using this dictionary page *
  • IsSorted: If true, the entries in the dictionary are sorted in ascending order *
var PageHeader_DictionaryPageHeader_DEFAULT *DictionaryPageHeader

func NewDictionaryPageHeader

func NewDictionaryPageHeader() *DictionaryPageHeader

func (*DictionaryPageHeader) GetEncoding

func (p *DictionaryPageHeader) GetEncoding() Encoding

func (*DictionaryPageHeader) GetIsSorted

func (p *DictionaryPageHeader) GetIsSorted() bool

func (*DictionaryPageHeader) GetNumValues

func (p *DictionaryPageHeader) GetNumValues() int32

func (*DictionaryPageHeader) IsSetIsSorted

func (p *DictionaryPageHeader) IsSetIsSorted() bool

func (*DictionaryPageHeader) Read

func (p *DictionaryPageHeader) Read(iprot thrift.TProtocol) error

func (*DictionaryPageHeader) ReadField1

func (p *DictionaryPageHeader) ReadField1(iprot thrift.TProtocol) error

func (*DictionaryPageHeader) ReadField2

func (p *DictionaryPageHeader) ReadField2(iprot thrift.TProtocol) error

func (*DictionaryPageHeader) ReadField3

func (p *DictionaryPageHeader) ReadField3(iprot thrift.TProtocol) error

func (*DictionaryPageHeader) String

func (p *DictionaryPageHeader) String() string

func (*DictionaryPageHeader) Write

func (p *DictionaryPageHeader) Write(oprot thrift.TProtocol) error

type Encoding

type Encoding int64

Encodings supported by Parquet. Not all encodings are valid for all types. These enums are also used to specify the encoding of definition and repetition levels. See the accompanying doc for the details of the more complicated encodings.

const (
	Encoding_PLAIN                   Encoding = 0
	Encoding_PLAIN_DICTIONARY        Encoding = 2
	Encoding_RLE                     Encoding = 3
	Encoding_BIT_PACKED              Encoding = 4
	Encoding_DELTA_BINARY_PACKED     Encoding = 5
	Encoding_DELTA_LENGTH_BYTE_ARRAY Encoding = 6
	Encoding_DELTA_BYTE_ARRAY        Encoding = 7
	Encoding_RLE_DICTIONARY          Encoding = 8
)

func EncodingFromString

func EncodingFromString(s string) (Encoding, error)

func EncodingPtr

func EncodingPtr(v Encoding) *Encoding

func (Encoding) MarshalText

func (p Encoding) MarshalText() ([]byte, error)

func (*Encoding) Scan

func (p *Encoding) Scan(value interface{}) error

func (Encoding) String

func (p Encoding) String() string

func (*Encoding) UnmarshalText

func (p *Encoding) UnmarshalText(text []byte) error

func (*Encoding) Value

func (p *Encoding) Value() (driver.Value, error)

type FieldRepetitionType

type FieldRepetitionType int64

Representation of Schemas

const (
	FieldRepetitionType_REQUIRED FieldRepetitionType = 0
	FieldRepetitionType_OPTIONAL FieldRepetitionType = 1
	FieldRepetitionType_REPEATED FieldRepetitionType = 2
)
var SchemaElement_RepetitionType_DEFAULT FieldRepetitionType

func FieldRepetitionTypeFromString

func FieldRepetitionTypeFromString(s string) (FieldRepetitionType, error)

func FieldRepetitionTypePtr

func FieldRepetitionTypePtr(v FieldRepetitionType) *FieldRepetitionType

func (FieldRepetitionType) MarshalText

func (p FieldRepetitionType) MarshalText() ([]byte, error)

func (*FieldRepetitionType) Scan

func (p *FieldRepetitionType) Scan(value interface{}) error

func (FieldRepetitionType) String

func (p FieldRepetitionType) String() string

func (*FieldRepetitionType) UnmarshalText

func (p *FieldRepetitionType) UnmarshalText(text []byte) error

func (*FieldRepetitionType) Value

func (p *FieldRepetitionType) Value() (driver.Value, error)

type FileMetaData

type FileMetaData struct {
	Version          int32            `thrift:"version,1,required" db:"version" json:"version"`
	Schema           []*SchemaElement `thrift:"schema,2,required" db:"schema" json:"schema"`
	NumRows          int64            `thrift:"num_rows,3,required" db:"num_rows" json:"num_rows"`
	RowGroups        []*RowGroup      `thrift:"row_groups,4,required" db:"row_groups" json:"row_groups"`
	KeyValueMetadata []*KeyValue      `thrift:"key_value_metadata,5" db:"key_value_metadata" json:"key_value_metadata,omitempty"`
	CreatedBy        *string          `thrift:"created_by,6" db:"created_by" json:"created_by,omitempty"`
}

Description for file metadata

Attributes:

  • Version: Version of this file *
  • Schema: Parquet schema for this file. This schema contains metadata for all the columns.

The schema is represented as a tree with a single root. The nodes of the tree are flattened to a list by doing a depth-first traversal. The column metadata contains the path in the schema for that column which can be used to map columns to nodes in the schema. The first element is the root *

  • NumRows: Number of rows in this file *
  • RowGroups: Row groups in this file *
  • KeyValueMetadata: Optional key/value metadata *
  • CreatedBy: String for application that wrote this file. This should be in the format

<Application> version <App Version> (build <App Build Hash>). e.g. impala version 1.0 (build 6cf94d29b2b7115df4de2c06e2ab4326d721eb55)

func NewFileMetaData

func NewFileMetaData() *FileMetaData

func (*FileMetaData) GetCreatedBy

func (p *FileMetaData) GetCreatedBy() string

func (*FileMetaData) GetKeyValueMetadata

func (p *FileMetaData) GetKeyValueMetadata() []*KeyValue

func (*FileMetaData) GetNumRows

func (p *FileMetaData) GetNumRows() int64

func (*FileMetaData) GetRowGroups

func (p *FileMetaData) GetRowGroups() []*RowGroup

func (*FileMetaData) GetSchema

func (p *FileMetaData) GetSchema() []*SchemaElement

func (*FileMetaData) GetVersion

func (p *FileMetaData) GetVersion() int32

func (*FileMetaData) IsSetCreatedBy

func (p *FileMetaData) IsSetCreatedBy() bool

func (*FileMetaData) IsSetKeyValueMetadata

func (p *FileMetaData) IsSetKeyValueMetadata() bool

func (*FileMetaData) Read

func (p *FileMetaData) Read(iprot thrift.TProtocol) error

func (*FileMetaData) ReadField1

func (p *FileMetaData) ReadField1(iprot thrift.TProtocol) error

func (*FileMetaData) ReadField2

func (p *FileMetaData) ReadField2(iprot thrift.TProtocol) error

func (*FileMetaData) ReadField3

func (p *FileMetaData) ReadField3(iprot thrift.TProtocol) error

func (*FileMetaData) ReadField4

func (p *FileMetaData) ReadField4(iprot thrift.TProtocol) error

func (*FileMetaData) ReadField5

func (p *FileMetaData) ReadField5(iprot thrift.TProtocol) error

func (*FileMetaData) ReadField6

func (p *FileMetaData) ReadField6(iprot thrift.TProtocol) error

func (*FileMetaData) String

func (p *FileMetaData) String() string

func (*FileMetaData) Write

func (p *FileMetaData) Write(oprot thrift.TProtocol) error

type IndexPageHeader

type IndexPageHeader struct {
}
var PageHeader_IndexPageHeader_DEFAULT *IndexPageHeader

func NewIndexPageHeader

func NewIndexPageHeader() *IndexPageHeader

func (*IndexPageHeader) Read

func (p *IndexPageHeader) Read(iprot thrift.TProtocol) error

func (*IndexPageHeader) String

func (p *IndexPageHeader) String() string

func (*IndexPageHeader) Write

func (p *IndexPageHeader) Write(oprot thrift.TProtocol) error

type KeyValue

type KeyValue struct {
	Key   string  `thrift:"key,1,required" db:"key" json:"key"`
	Value *string `thrift:"value,2" db:"value" json:"value,omitempty"`
}

Wrapper struct to store key values

Attributes:

  • Key
  • Value

func NewKeyValue

func NewKeyValue() *KeyValue

func (*KeyValue) GetKey

func (p *KeyValue) GetKey() string

func (*KeyValue) GetValue

func (p *KeyValue) GetValue() string

func (*KeyValue) IsSetValue

func (p *KeyValue) IsSetValue() bool

func (*KeyValue) Read

func (p *KeyValue) Read(iprot thrift.TProtocol) error

func (*KeyValue) ReadField1

func (p *KeyValue) ReadField1(iprot thrift.TProtocol) error

func (*KeyValue) ReadField2

func (p *KeyValue) ReadField2(iprot thrift.TProtocol) error

func (*KeyValue) String

func (p *KeyValue) String() string

func (*KeyValue) Write

func (p *KeyValue) Write(oprot thrift.TProtocol) error

type PageEncodingStats

type PageEncodingStats struct {
	PageType PageType `thrift:"page_type,1,required" db:"page_type" json:"page_type"`
	Encoding Encoding `thrift:"encoding,2,required" db:"encoding" json:"encoding"`
	Count    int32    `thrift:"count,3,required" db:"count" json:"count"`
}

statistics of a given page type and encoding

Attributes:

  • PageType: the page type (data/dic/...) *
  • Encoding: encoding of the page *
  • Count: number of pages of this type with this encoding *

func NewPageEncodingStats

func NewPageEncodingStats() *PageEncodingStats

func (*PageEncodingStats) GetCount

func (p *PageEncodingStats) GetCount() int32

func (*PageEncodingStats) GetEncoding

func (p *PageEncodingStats) GetEncoding() Encoding

func (*PageEncodingStats) GetPageType

func (p *PageEncodingStats) GetPageType() PageType

func (*PageEncodingStats) Read

func (p *PageEncodingStats) Read(iprot thrift.TProtocol) error

func (*PageEncodingStats) ReadField1

func (p *PageEncodingStats) ReadField1(iprot thrift.TProtocol) error

func (*PageEncodingStats) ReadField2

func (p *PageEncodingStats) ReadField2(iprot thrift.TProtocol) error

func (*PageEncodingStats) ReadField3

func (p *PageEncodingStats) ReadField3(iprot thrift.TProtocol) error

func (*PageEncodingStats) String

func (p *PageEncodingStats) String() string

func (*PageEncodingStats) Write

func (p *PageEncodingStats) Write(oprot thrift.TProtocol) error
type PageHeader struct {
	Type                 PageType              `thrift:"type,1,required" db:"type" json:"type"`
	UncompressedPageSize int32                 `thrift:"uncompressed_page_size,2,required" db:"uncompressed_page_size" json:"uncompressed_page_size"`
	CompressedPageSize   int32                 `thrift:"compressed_page_size,3,required" db:"compressed_page_size" json:"compressed_page_size"`
	Crc                  *int32                `thrift:"crc,4" db:"crc" json:"crc,omitempty"`
	DataPageHeader       *DataPageHeader       `thrift:"data_page_header,5" db:"data_page_header" json:"data_page_header,omitempty"`
	IndexPageHeader      *IndexPageHeader      `thrift:"index_page_header,6" db:"index_page_header" json:"index_page_header,omitempty"`
	DictionaryPageHeader *DictionaryPageHeader `thrift:"dictionary_page_header,7" db:"dictionary_page_header" json:"dictionary_page_header,omitempty"`
	DataPageHeaderV2     *DataPageHeaderV2     `thrift:"data_page_header_v2,8" db:"data_page_header_v2" json:"data_page_header_v2,omitempty"`
}

Attributes:

  • Type: the type of the page: indicates which of the *_header fields is set *
  • UncompressedPageSize: Uncompressed page size in bytes (not including this header) *
  • CompressedPageSize: Compressed page size in bytes (not including this header) *
  • Crc: 32bit crc for the data below. This allows for disabling checksumming in HDFS

if only a few pages need to be read

  • DataPageHeader
  • IndexPageHeader
  • DictionaryPageHeader
  • DataPageHeaderV2

func NewPageHeader

func NewPageHeader() *PageHeader

func (*PageHeader) GetCompressedPageSize

func (p *PageHeader) GetCompressedPageSize() int32

func (*PageHeader) GetCrc

func (p *PageHeader) GetCrc() int32

func (*PageHeader) GetDataPageHeader

func (p *PageHeader) GetDataPageHeader() *DataPageHeader

func (*PageHeader) GetDataPageHeaderV2

func (p *PageHeader) GetDataPageHeaderV2() *DataPageHeaderV2

func (*PageHeader) GetDictionaryPageHeader

func (p *PageHeader) GetDictionaryPageHeader() *DictionaryPageHeader

func (*PageHeader) GetIndexPageHeader

func (p *PageHeader) GetIndexPageHeader() *IndexPageHeader

func (*PageHeader) GetType

func (p *PageHeader) GetType() PageType

func (*PageHeader) GetUncompressedPageSize

func (p *PageHeader) GetUncompressedPageSize() int32

func (*PageHeader) IsSetCrc

func (p *PageHeader) IsSetCrc() bool

func (*PageHeader) IsSetDataPageHeader

func (p *PageHeader) IsSetDataPageHeader() bool

func (*PageHeader) IsSetDataPageHeaderV2

func (p *PageHeader) IsSetDataPageHeaderV2() bool

func (*PageHeader) IsSetDictionaryPageHeader

func (p *PageHeader) IsSetDictionaryPageHeader() bool

func (*PageHeader) IsSetIndexPageHeader

func (p *PageHeader) IsSetIndexPageHeader() bool

func (*PageHeader) Read

func (p *PageHeader) Read(iprot thrift.TProtocol) error

func (*PageHeader) ReadField1

func (p *PageHeader) ReadField1(iprot thrift.TProtocol) error

func (*PageHeader) ReadField2

func (p *PageHeader) ReadField2(iprot thrift.TProtocol) error

func (*PageHeader) ReadField3

func (p *PageHeader) ReadField3(iprot thrift.TProtocol) error

func (*PageHeader) ReadField4

func (p *PageHeader) ReadField4(iprot thrift.TProtocol) error

func (*PageHeader) ReadField5

func (p *PageHeader) ReadField5(iprot thrift.TProtocol) error

func (*PageHeader) ReadField6

func (p *PageHeader) ReadField6(iprot thrift.TProtocol) error

func (*PageHeader) ReadField7

func (p *PageHeader) ReadField7(iprot thrift.TProtocol) error

func (*PageHeader) ReadField8

func (p *PageHeader) ReadField8(iprot thrift.TProtocol) error

func (*PageHeader) String

func (p *PageHeader) String() string

func (*PageHeader) Write

func (p *PageHeader) Write(oprot thrift.TProtocol) error

type PageType

type PageType int64
const (
	PageType_DATA_PAGE       PageType = 0
	PageType_INDEX_PAGE      PageType = 1
	PageType_DICTIONARY_PAGE PageType = 2
	PageType_DATA_PAGE_V2    PageType = 3
)

func PageTypeFromString

func PageTypeFromString(s string) (PageType, error)

func PageTypePtr

func PageTypePtr(v PageType) *PageType

func (PageType) MarshalText

func (p PageType) MarshalText() ([]byte, error)

func (*PageType) Scan

func (p *PageType) Scan(value interface{}) error

func (PageType) String

func (p PageType) String() string

func (*PageType) UnmarshalText

func (p *PageType) UnmarshalText(text []byte) error

func (*PageType) Value

func (p *PageType) Value() (driver.Value, error)

type RowGroup

type RowGroup struct {
	Columns        []*ColumnChunk   `thrift:"columns,1,required" db:"columns" json:"columns"`
	TotalByteSize  int64            `thrift:"total_byte_size,2,required" db:"total_byte_size" json:"total_byte_size"`
	NumRows        int64            `thrift:"num_rows,3,required" db:"num_rows" json:"num_rows"`
	SortingColumns []*SortingColumn `thrift:"sorting_columns,4" db:"sorting_columns" json:"sorting_columns,omitempty"`
}

Attributes:

  • Columns: Metadata for each column chunk in this row group.

This list must have the same order as the SchemaElement list in FileMetaData.

  • TotalByteSize: Total byte size of all the uncompressed column data in this row group
  • NumRows: Number of rows in this row group
  • SortingColumns: If set, specifies a sort ordering of the rows in this RowGroup.

The sorting columns can be a subset of all the columns.

func NewRowGroup

func NewRowGroup() *RowGroup

func (*RowGroup) GetColumns

func (p *RowGroup) GetColumns() []*ColumnChunk

func (*RowGroup) GetNumRows

func (p *RowGroup) GetNumRows() int64

func (*RowGroup) GetSortingColumns

func (p *RowGroup) GetSortingColumns() []*SortingColumn

func (*RowGroup) GetTotalByteSize

func (p *RowGroup) GetTotalByteSize() int64

func (*RowGroup) IsSetSortingColumns

func (p *RowGroup) IsSetSortingColumns() bool

func (*RowGroup) Read

func (p *RowGroup) Read(iprot thrift.TProtocol) error

func (*RowGroup) ReadField1

func (p *RowGroup) ReadField1(iprot thrift.TProtocol) error

func (*RowGroup) ReadField2

func (p *RowGroup) ReadField2(iprot thrift.TProtocol) error

func (*RowGroup) ReadField3

func (p *RowGroup) ReadField3(iprot thrift.TProtocol) error

func (*RowGroup) ReadField4

func (p *RowGroup) ReadField4(iprot thrift.TProtocol) error

func (*RowGroup) String

func (p *RowGroup) String() string

func (*RowGroup) Write

func (p *RowGroup) Write(oprot thrift.TProtocol) error

type SchemaElement

type SchemaElement struct {
	Type           *Type                `thrift:"type,1" db:"type" json:"type,omitempty"`
	TypeLength     *int32               `thrift:"type_length,2" db:"type_length" json:"type_length,omitempty"`
	RepetitionType *FieldRepetitionType `thrift:"repetition_type,3" db:"repetition_type" json:"repetition_type,omitempty"`
	Name           string               `thrift:"name,4,required" db:"name" json:"name"`
	NumChildren    *int32               `thrift:"num_children,5" db:"num_children" json:"num_children,omitempty"`
	ConvertedType  *ConvertedType       `thrift:"converted_type,6" db:"converted_type" json:"converted_type,omitempty"`
	Scale          *int32               `thrift:"scale,7" db:"scale" json:"scale,omitempty"`
	Precision      *int32               `thrift:"precision,8" db:"precision" json:"precision,omitempty"`
	FieldID        *int32               `thrift:"field_id,9" db:"field_id" json:"field_id,omitempty"`
}

Represents an element inside a schema definition.

  • if it is a group (inner node) then type is undefined and num_children is defined
  • if it is a primitive type (leaf) then type is defined and num_children is undefined

the nodes are listed in depth first traversal order.

Attributes:

  • Type: Data type for this field. Not set if the current element is a non-leaf node
  • TypeLength: If type is FIXED_LEN_BYTE_ARRAY, this is the byte length of the values.

Otherwise, if specified, this is the maximum bit length to store any of the values. (e.g. a low cardinality INT col could have this set to 3). Note that this is in the schema, and therefore fixed for the entire file.

  • RepetitionType: repetition of the field. The root of the schema does not have a repetition_type.

All other nodes must have one

  • Name: Name of the field in the schema
  • NumChildren: Nested fields. Since thrift does not support nested fields,

the nesting is flattened to a single list by a depth-first traversal. The children count is used to construct the nested relationship. This field is not set when the element is a primitive type

  • ConvertedType: When the schema is the result of a conversion from another model,

this is used to record the original type to help with cross conversion.

  • Scale: Used when this column contains decimal data.

See the DECIMAL converted type for more details.

  • Precision
  • FieldID: When the original schema supports field ids, this will save the

original field id in the parquet schema

func NewSchemaElement

func NewSchemaElement() *SchemaElement

func (*SchemaElement) GetConvertedType

func (p *SchemaElement) GetConvertedType() ConvertedType

func (*SchemaElement) GetFieldID

func (p *SchemaElement) GetFieldID() int32

func (*SchemaElement) GetName

func (p *SchemaElement) GetName() string

func (*SchemaElement) GetNumChildren

func (p *SchemaElement) GetNumChildren() int32

func (*SchemaElement) GetPrecision

func (p *SchemaElement) GetPrecision() int32

func (*SchemaElement) GetRepetitionType

func (p *SchemaElement) GetRepetitionType() FieldRepetitionType

func (*SchemaElement) GetScale

func (p *SchemaElement) GetScale() int32

func (*SchemaElement) GetType

func (p *SchemaElement) GetType() Type

func (*SchemaElement) GetTypeLength

func (p *SchemaElement) GetTypeLength() int32

func (*SchemaElement) IsSetConvertedType

func (p *SchemaElement) IsSetConvertedType() bool

func (*SchemaElement) IsSetFieldID

func (p *SchemaElement) IsSetFieldID() bool

func (*SchemaElement) IsSetNumChildren

func (p *SchemaElement) IsSetNumChildren() bool

func (*SchemaElement) IsSetPrecision

func (p *SchemaElement) IsSetPrecision() bool

func (*SchemaElement) IsSetRepetitionType

func (p *SchemaElement) IsSetRepetitionType() bool

func (*SchemaElement) IsSetScale

func (p *SchemaElement) IsSetScale() bool

func (*SchemaElement) IsSetType

func (p *SchemaElement) IsSetType() bool

func (*SchemaElement) IsSetTypeLength

func (p *SchemaElement) IsSetTypeLength() bool

func (*SchemaElement) Read

func (p *SchemaElement) Read(iprot thrift.TProtocol) error

func (*SchemaElement) ReadField1

func (p *SchemaElement) ReadField1(iprot thrift.TProtocol) error

func (*SchemaElement) ReadField2

func (p *SchemaElement) ReadField2(iprot thrift.TProtocol) error

func (*SchemaElement) ReadField3

func (p *SchemaElement) ReadField3(iprot thrift.TProtocol) error

func (*SchemaElement) ReadField4

func (p *SchemaElement) ReadField4(iprot thrift.TProtocol) error

func (*SchemaElement) ReadField5

func (p *SchemaElement) ReadField5(iprot thrift.TProtocol) error

func (*SchemaElement) ReadField6

func (p *SchemaElement) ReadField6(iprot thrift.TProtocol) error

func (*SchemaElement) ReadField7

func (p *SchemaElement) ReadField7(iprot thrift.TProtocol) error

func (*SchemaElement) ReadField8

func (p *SchemaElement) ReadField8(iprot thrift.TProtocol) error

func (*SchemaElement) ReadField9

func (p *SchemaElement) ReadField9(iprot thrift.TProtocol) error

func (*SchemaElement) String

func (p *SchemaElement) String() string

func (*SchemaElement) Write

func (p *SchemaElement) Write(oprot thrift.TProtocol) error

type SortingColumn

type SortingColumn struct {
	ColumnIdx  int32 `thrift:"column_idx,1,required" db:"column_idx" json:"column_idx"`
	Descending bool  `thrift:"descending,2,required" db:"descending" json:"descending"`
	NullsFirst bool  `thrift:"nulls_first,3,required" db:"nulls_first" json:"nulls_first"`
}

Wrapper struct to specify sort order

Attributes:

  • ColumnIdx: The column index (in this row group)
  • Descending: If true, indicates this column is sorted in descending order.
  • NullsFirst: If true, nulls will come before non-null values, otherwise,

nulls go at the end.

func NewSortingColumn

func NewSortingColumn() *SortingColumn

func (*SortingColumn) GetColumnIdx

func (p *SortingColumn) GetColumnIdx() int32

func (*SortingColumn) GetDescending

func (p *SortingColumn) GetDescending() bool

func (*SortingColumn) GetNullsFirst

func (p *SortingColumn) GetNullsFirst() bool

func (*SortingColumn) Read

func (p *SortingColumn) Read(iprot thrift.TProtocol) error

func (*SortingColumn) ReadField1

func (p *SortingColumn) ReadField1(iprot thrift.TProtocol) error

func (*SortingColumn) ReadField2

func (p *SortingColumn) ReadField2(iprot thrift.TProtocol) error

func (*SortingColumn) ReadField3

func (p *SortingColumn) ReadField3(iprot thrift.TProtocol) error

func (*SortingColumn) String

func (p *SortingColumn) String() string

func (*SortingColumn) Write

func (p *SortingColumn) Write(oprot thrift.TProtocol) error

type Statistics

type Statistics struct {
	Max           []byte `thrift:"max,1" db:"max" json:"max,omitempty"`
	Min           []byte `thrift:"min,2" db:"min" json:"min,omitempty"`
	NullCount     *int64 `thrift:"null_count,3" db:"null_count" json:"null_count,omitempty"`
	DistinctCount *int64 `thrift:"distinct_count,4" db:"distinct_count" json:"distinct_count,omitempty"`
}

Statistics per row group and per page. All fields are optional.

Attributes:

  • Max: min and max value of the column, encoded in PLAIN encoding
  • Min
  • NullCount: count of null values in the column
  • DistinctCount: count of distinct values occurring
var ColumnMetaData_Statistics_DEFAULT *Statistics
var DataPageHeaderV2_Statistics_DEFAULT *Statistics
var DataPageHeader_Statistics_DEFAULT *Statistics

func NewStatistics

func NewStatistics() *Statistics

func (*Statistics) GetDistinctCount

func (p *Statistics) GetDistinctCount() int64

func (*Statistics) GetMax

func (p *Statistics) GetMax() []byte

func (*Statistics) GetMin

func (p *Statistics) GetMin() []byte

func (*Statistics) GetNullCount

func (p *Statistics) GetNullCount() int64

func (*Statistics) IsSetDistinctCount

func (p *Statistics) IsSetDistinctCount() bool

func (*Statistics) IsSetMax

func (p *Statistics) IsSetMax() bool

func (*Statistics) IsSetMin

func (p *Statistics) IsSetMin() bool

func (*Statistics) IsSetNullCount

func (p *Statistics) IsSetNullCount() bool

func (*Statistics) Read

func (p *Statistics) Read(iprot thrift.TProtocol) error

func (*Statistics) ReadField1

func (p *Statistics) ReadField1(iprot thrift.TProtocol) error

func (*Statistics) ReadField2

func (p *Statistics) ReadField2(iprot thrift.TProtocol) error

func (*Statistics) ReadField3

func (p *Statistics) ReadField3(iprot thrift.TProtocol) error

func (*Statistics) ReadField4

func (p *Statistics) ReadField4(iprot thrift.TProtocol) error

func (*Statistics) String

func (p *Statistics) String() string

func (*Statistics) Write

func (p *Statistics) Write(oprot thrift.TProtocol) error

type Type

type Type int64

Types supported by Parquet. These types are intended to be used in combination with the encodings to control the on disk storage format. For example INT16 is not included as a type since a good encoding of INT32 would handle this.

const (
	Type_BOOLEAN              Type = 0
	Type_INT32                Type = 1
	Type_INT64                Type = 2
	Type_INT96                Type = 3
	Type_FLOAT                Type = 4
	Type_DOUBLE               Type = 5
	Type_BYTE_ARRAY           Type = 6
	Type_FIXED_LEN_BYTE_ARRAY Type = 7
)
var SchemaElement_Type_DEFAULT Type

func TypeFromString

func TypeFromString(s string) (Type, error)

func TypePtr

func TypePtr(v Type) *Type

func (Type) MarshalText

func (p Type) MarshalText() ([]byte, error)

func (*Type) Scan

func (p *Type) Scan(value interface{}) error

func (Type) String

func (p Type) String() string

func (*Type) UnmarshalText

func (p *Type) UnmarshalText(text []byte) error

func (*Type) Value

func (p *Type) Value() (driver.Value, error)

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL