Documentation ¶
Index ¶
- Constants
- Variables
- func ApplyNameMapping(schemaWithoutIDs *iceberg.Schema, nameMapping NameMapping) (*iceberg.Schema, error)
- func ArrowSchemaToIceberg(sc *arrow.Schema, downcastNsTimestamp bool, nameMapping NameMapping) (*iceberg.Schema, error)
- func ArrowTypeToIceberg(dt arrow.DataType, downcastNsTimestamp bool) (iceberg.Type, error)
- func SchemaToArrowSchema(sc *iceberg.Schema, metadata map[string]string, ...) (*arrow.Schema, error)
- func ToRequestedSchema(requested, fileSchema *iceberg.Schema, batch arrow.Record, ...) (arrow.Record, error)
- func TypeToArrowType(t iceberg.Type, includeFieldIDs bool) (arrow.DataType, error)
- func VisitArrowSchema[T any](sc *arrow.Schema, visitor ArrowSchemaVisitor[T]) (res T, err error)
- func VisitMappedFields[S, T any](fields []MappedField, visitor NameMappingVisitor[S, T]) (res S, err error)
- func VisitNameMapping[S, T any](obj NameMapping, visitor NameMappingVisitor[S, T]) (res S, err error)
- type ArrowSchemaVisitor
- type FileScanTask
- type Identifier
- type MappedField
- type Metadata
- type MetadataLogEntry
- type MetadataV1
- func (c *MetadataV1) CurrentSchema() *iceberg.Schema
- func (c *MetadataV1) CurrentSnapshot() *Snapshot
- func (c *MetadataV1) DefaultPartitionSpec() int
- func (m *MetadataV1) Equals(other Metadata) bool
- func (c *MetadataV1) LastColumnID() int
- func (c *MetadataV1) LastPartitionSpecID() *int
- func (c *MetadataV1) LastUpdatedMillis() int64
- func (c *MetadataV1) Location() string
- func (c *MetadataV1) PartitionSpec() iceberg.PartitionSpec
- func (c *MetadataV1) PartitionSpecs() []iceberg.PartitionSpec
- func (c *MetadataV1) Properties() iceberg.Properties
- func (c *MetadataV1) Schemas() []*iceberg.Schema
- func (c *MetadataV1) SnapshotByID(id int64) *Snapshot
- func (c *MetadataV1) SnapshotByName(name string) *Snapshot
- func (c *MetadataV1) Snapshots() []Snapshot
- func (c *MetadataV1) SortOrder() SortOrder
- func (c *MetadataV1) SortOrders() []SortOrder
- func (c *MetadataV1) TableUUID() uuid.UUID
- func (m *MetadataV1) ToV2() MetadataV2
- func (m *MetadataV1) UnmarshalJSON(b []byte) error
- func (c *MetadataV1) Version() int
- type MetadataV2
- func (c *MetadataV2) CurrentSchema() *iceberg.Schema
- func (c *MetadataV2) CurrentSnapshot() *Snapshot
- func (c *MetadataV2) DefaultPartitionSpec() int
- func (m *MetadataV2) Equals(other Metadata) bool
- func (c *MetadataV2) LastColumnID() int
- func (c *MetadataV2) LastPartitionSpecID() *int
- func (c *MetadataV2) LastUpdatedMillis() int64
- func (c *MetadataV2) Location() string
- func (c *MetadataV2) PartitionSpec() iceberg.PartitionSpec
- func (c *MetadataV2) PartitionSpecs() []iceberg.PartitionSpec
- func (c *MetadataV2) Properties() iceberg.Properties
- func (c *MetadataV2) Schemas() []*iceberg.Schema
- func (c *MetadataV2) SnapshotByID(id int64) *Snapshot
- func (c *MetadataV2) SnapshotByName(name string) *Snapshot
- func (c *MetadataV2) Snapshots() []Snapshot
- func (c *MetadataV2) SortOrder() SortOrder
- func (c *MetadataV2) SortOrders() []SortOrder
- func (c *MetadataV2) TableUUID() uuid.UUID
- func (m *MetadataV2) UnmarshalJSON(b []byte) error
- func (c *MetadataV2) Version() int
- type NameMapping
- type NameMappingAccessor
- func (n NameMappingAccessor) FieldPartner(partnerStruct *MappedField, _ int, fieldName string) *MappedField
- func (n NameMappingAccessor) ListElementPartner(partnerList *MappedField) *MappedField
- func (n NameMappingAccessor) MapKeyPartner(partnerMap *MappedField) *MappedField
- func (n NameMappingAccessor) MapValuePartner(partnerMap *MappedField) *MappedField
- func (NameMappingAccessor) SchemaPartner(partner *MappedField) *MappedField
- type NameMappingVisitor
- type NullOrder
- type Operation
- type RefType
- type Scan
- func (scan *Scan) PlanFiles(ctx context.Context) ([]FileScanTask, error)
- func (scan *Scan) Projection() (*iceberg.Schema, error)
- func (scan *Scan) Snapshot() *Snapshot
- func (scan *Scan) ToArrowRecords(ctx context.Context) (*arrow.Schema, iter.Seq2[arrow.Record, error], error)
- func (scan *Scan) ToArrowTable(ctx context.Context) (arrow.Table, error)
- func (scan *Scan) UseRef(name string) (*Scan, error)
- func (scan *Scan) UseRowLimit(n int64) *Scan
- type ScanOption
- func WitMaxConcurrency(n int) ScanOption
- func WithCaseSensitive(b bool) ScanOption
- func WithLimit(n int64) ScanOption
- func WithOptions(opts iceberg.Properties) ScanOption
- func WithRowFilter(e iceberg.BooleanExpression) ScanOption
- func WithSelectedFields(fields ...string) ScanOption
- func WithSnapshotID(n int64) ScanOption
- type Snapshot
- type SnapshotLogEntry
- type SnapshotRef
- type SortDirection
- type SortField
- type SortOrder
- type Summary
- type Table
- func (t Table) CurrentSnapshot() *Snapshot
- func (t Table) Equals(other Table) bool
- func (t Table) FS() io.IO
- func (t Table) Identifier() Identifier
- func (t Table) Location() string
- func (t Table) Metadata() Metadata
- func (t Table) MetadataLocation() string
- func (t Table) Properties() iceberg.Properties
- func (t Table) Scan(opts ...ScanOption) *Scan
- func (t Table) Schema() *iceberg.Schema
- func (t Table) Schemas() map[int]*iceberg.Schema
- func (t Table) SnapshotByID(id int64) *Snapshot
- func (t Table) SnapshotByName(name string) *Snapshot
- func (t Table) SortOrder() SortOrder
- func (t Table) Spec() iceberg.PartitionSpec
Constants ¶
const (
	ArrowFieldDocKey = "doc"
	// Arrow schemas that are generated from the Parquet library will utilize
	// this key to identify the field id of the source Parquet field.
	// We use this when converting to Iceberg to provide field IDs
	ArrowParquetFieldIDKey = "PARQUET:field_id"
)
Constants to look for as keys in Arrow field metadata
const (
	InitialSortOrderID  = 1
	UnsortedSortOrderID = 0
)
const MainBranch = "main"
const ScanNoLimit = -1
const (
ScanOptionArrowUseLargeTypes = "arrow.use_large_types"
)
Variables ¶
var (
	ErrInvalidMetadataFormatVersion = errors.New("invalid or missing format-version in table metadata")
	ErrInvalidMetadata              = errors.New("invalid metadata")
)
var (
	ErrInvalidOperation = errors.New("invalid operation value")
	ErrMissingOperation = errors.New("missing operation key")
)
var (
	ErrInvalidSortDirection = errors.New("invalid sort direction, must be 'asc' or 'desc'")
	ErrInvalidNullOrder     = errors.New("invalid null order, must be 'nulls-first' or 'nulls-last'")
)
var (
ErrInvalidRefType = errors.New("invalid snapshot ref type, should be 'branch' or 'tag'")
)
var UnsortedSortOrder = SortOrder{OrderID: UnsortedSortOrderID, Fields: []SortField{}}
A default Sort Order indicating no sort order at all
Functions ¶
func ApplyNameMapping ¶
func ApplyNameMapping(schemaWithoutIDs *iceberg.Schema, nameMapping NameMapping) (*iceberg.Schema, error)
func ArrowSchemaToIceberg ¶
func ArrowSchemaToIceberg(sc *arrow.Schema, downcastNsTimestamp bool, nameMapping NameMapping) (*iceberg.Schema, error)
func ArrowTypeToIceberg ¶
func ArrowTypeToIceberg(dt arrow.DataType, downcastNsTimestamp bool) (iceberg.Type, error)
func SchemaToArrowSchema ¶
func SchemaToArrowSchema(sc *iceberg.Schema, metadata map[string]string, includeFieldIDs, useLargeTypes bool) (*arrow.Schema, error)
SchemaToArrowSchema converts an Iceberg schema to an Arrow schema. If the metadata parameter is non-nil, it will be included as the top-level metadata in the schema. If includeFieldIDs is true, then each field of the schema will contain a metadata key PARQUET:field_id set to the field id from the iceberg schema.
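A minimal sketch of calling SchemaToArrowSchema. It assumes this package lives at github.com/apache/iceberg-go/table and that the core iceberg package provides the NewSchema, NestedField, and PrimitiveTypes helpers used to build the input; adjust the import paths and helpers to whatever your go.mod actually pins.

package main

import (
	"fmt"

	"github.com/apache/iceberg-go"
	"github.com/apache/iceberg-go/table"
)

func main() {
	// A two-column Iceberg schema with explicit field IDs (schema-id 0).
	sc := iceberg.NewSchema(0,
		iceberg.NestedField{ID: 1, Name: "id", Type: iceberg.PrimitiveTypes.Int64, Required: true},
		iceberg.NestedField{ID: 2, Name: "name", Type: iceberg.PrimitiveTypes.String},
	)

	// includeFieldIDs=true attaches the PARQUET:field_id metadata key to each
	// Arrow field; useLargeTypes=false keeps string/binary as non-large types.
	arrowSchema, err := table.SchemaToArrowSchema(sc, nil, true, false)
	if err != nil {
		panic(err)
	}
	fmt.Println(arrowSchema)
}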
func ToRequestedSchema ¶
func ToRequestedSchema(requested, fileSchema *iceberg.Schema, batch arrow.Record, downcastTimestamp, includeFieldIDs, useLargeTypes bool) (arrow.Record, error)
ToRequestedSchema constructs a new record batch matching the requested Iceberg schema, casting columns as necessary.
func TypeToArrowType ¶
func TypeToArrowType(t iceberg.Type, includeFieldIDs bool) (arrow.DataType, error)
TypeToArrowType converts a given Iceberg type into the equivalent Arrow data type. For nested fields (List, Struct, Map), if includeFieldIDs is true, the child fields will contain a metadata key PARQUET:field_id set to the field id.
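As a companion sketch, a single type can be converted in isolation. The ListType construction below is an assumption about the core iceberg package's type helpers; only the TypeToArrowType call itself comes from the signature above.

package main

import (
	"fmt"

	"github.com/apache/iceberg-go"
	"github.com/apache/iceberg-go/table"
)

func main() {
	// A list<string> Iceberg type; includeFieldIDs=true asks for
	// PARQUET:field_id metadata on the element field.
	listType := &iceberg.ListType{
		ElementID:       3,
		Element:         iceberg.PrimitiveTypes.String,
		ElementRequired: false,
	}

	dt, err := table.TypeToArrowType(listType, true)
	if err != nil {
		panic(err)
	}
	fmt.Println(dt) // the equivalent Arrow list-like data type
}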
func VisitArrowSchema ¶
func VisitArrowSchema[T any](sc *arrow.Schema, visitor ArrowSchemaVisitor[T]) (res T, err error)
func VisitMappedFields ¶
func VisitMappedFields[S, T any](fields []MappedField, visitor NameMappingVisitor[S, T]) (res S, err error)
func VisitNameMapping ¶
func VisitNameMapping[S, T any](obj NameMapping, visitor NameMappingVisitor[S, T]) (res S, err error)
Types ¶
type ArrowSchemaVisitor ¶
type ArrowSchemaVisitor[T any] interface {
	Schema(*arrow.Schema, T) T
	Struct(*arrow.StructType, []T) T
	Field(arrow.Field, T) T
	List(arrow.ListLikeType, T) T
	Map(mt *arrow.MapType, keyResult T, valueResult T) T
	Primitive(arrow.DataType) T
}
ArrowSchemaVisitor is an interface that can be implemented and passed to VisitArrowSchema to iterate over an Arrow schema.
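A minimal sketch of implementing the visitor with T = int to count primitive leaf fields. The arrow-go import path is an assumption; match it to the major version this module depends on. VisitArrowSchema drives the traversal and passes each child's result back up through the visitor methods.

package main

import (
	"fmt"

	"github.com/apache/arrow-go/v18/arrow"
	"github.com/apache/iceberg-go/table"
)

type primitiveCounter struct{}

func (primitiveCounter) Schema(_ *arrow.Schema, structResult int) int { return structResult }

func (primitiveCounter) Struct(_ *arrow.StructType, fieldResults []int) int {
	total := 0
	for _, r := range fieldResults {
		total += r
	}
	return total
}

func (primitiveCounter) Field(_ arrow.Field, fieldResult int) int { return fieldResult }

func (primitiveCounter) List(_ arrow.ListLikeType, elemResult int) int { return elemResult }

func (primitiveCounter) Map(_ *arrow.MapType, keyResult, valueResult int) int {
	return keyResult + valueResult
}

func (primitiveCounter) Primitive(arrow.DataType) int { return 1 }

func main() {
	sc := arrow.NewSchema([]arrow.Field{
		{Name: "id", Type: arrow.PrimitiveTypes.Int64},
		{Name: "name", Type: arrow.BinaryTypes.String},
	}, nil)

	// The type parameter must be given explicitly; it cannot be inferred
	// from a concrete visitor type.
	n, err := table.VisitArrowSchema[int](sc, primitiveCounter{})
	if err != nil {
		panic(err)
	}
	fmt.Println("primitive fields:", n) // 2
}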
type FileScanTask ¶
type FileScanTask struct {
	File          iceberg.DataFile
	DeleteFiles   []iceberg.DataFile
	Start, Length int64
}
type Identifier ¶
type Identifier = []string
type MappedField ¶
type MappedField struct {
	Names []string `json:"names"`

	// iceberg spec says this is optional, but I don't see any examples
	// of this being left empty. Does pyiceberg need to be updated or should
	// the spec not say field-id is optional?
	FieldID *int `json:"field-id,omitempty"`

	Fields []MappedField `json:"fields,omitempty"`
}
func (*MappedField) Len ¶
func (m *MappedField) Len() int
func (*MappedField) String ¶
func (m *MappedField) String() string
type Metadata ¶
type Metadata interface {
	// Version indicates the version of this metadata, 1 for V1, 2 for V2, etc.
	Version() int
	// TableUUID returns a UUID that identifies the table, generated when the
	// table is created. Implementations must throw an exception if a table's
	// UUID does not match the expected UUID after refreshing metadata.
	TableUUID() uuid.UUID
	// Location is the table's base location. This is used by writers to determine
	// where to store data files, manifest files, and table metadata files.
	Location() string
	// LastUpdatedMillis is the timestamp in milliseconds from the unix epoch when
	// the table was last updated. Each table metadata file should update this
	// field just before writing.
	LastUpdatedMillis() int64
	// LastColumnID returns the highest assigned column ID for the table.
	// This is used to ensure fields are always assigned an unused ID when
	// evolving schemas.
	LastColumnID() int
	// Schemas returns the list of schemas, stored as objects with their
	// schema-id.
	Schemas() []*iceberg.Schema
	// CurrentSchema returns the table's current schema.
	CurrentSchema() *iceberg.Schema
	// PartitionSpecs returns the list of all partition specs in the table.
	PartitionSpecs() []iceberg.PartitionSpec
	// PartitionSpec returns the current partition spec that the table is using.
	PartitionSpec() iceberg.PartitionSpec
	// DefaultPartitionSpec is the ID of the current spec that writers should
	// use by default.
	DefaultPartitionSpec() int
	// LastPartitionSpecID is the highest assigned partition field ID across
	// all partition specs for the table. This is used to ensure partition
	// fields are always assigned an unused ID when evolving specs.
	LastPartitionSpecID() *int
	// Snapshots returns the list of valid snapshots. Valid snapshots are
	// snapshots for which all data files exist in the file system. A data
	// file must not be deleted from the file system until the last snapshot
	// in which it was listed is garbage collected.
	Snapshots() []Snapshot
	// SnapshotByID finds and returns a specific snapshot by its ID. Returns
	// nil if the ID is not found in the list of snapshots.
	SnapshotByID(int64) *Snapshot
	// SnapshotByName searches the list of snapshots for a snapshot with a given
	// ref name. Returns nil if there's no ref with this name for a snapshot.
	SnapshotByName(name string) *Snapshot
	// CurrentSnapshot returns the table's current snapshot.
	CurrentSnapshot() *Snapshot
	// SortOrder returns the table's current sort order, i.e. the one with the
	// ID that matches the default-sort-order-id.
	SortOrder() SortOrder
	// SortOrders returns the list of sort orders in the table.
	SortOrders() []SortOrder
	// Properties is a string to string map of table properties. This is used
	// to control settings that affect reading and writing and is not intended
	// to be used for arbitrary metadata. For example, commit.retry.num-retries
	// is used to control the number of commit retries.
	Properties() iceberg.Properties

	Equals(Metadata) bool
}
Metadata for an iceberg table as specified in the Iceberg spec
https://iceberg.apache.org/spec/#iceberg-table-spec
func ParseMetadata ¶
ParseMetadata parses JSON metadata provided by the passed-in reader, returning an error if one is encountered.
func ParseMetadataBytes ¶
ParseMetadataBytes is like ParseMetadataString but for a byte slice.
func ParseMetadataString ¶
ParseMetadataString is like ParseMetadata, but for a string rather than an io.Reader.
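A hedged sketch of loading metadata from a local file. The exact signatures of the Parse* helpers are not shown on this page; the call below assumes ParseMetadataBytes(b []byte) (Metadata, error), and "v2.metadata.json" is a hypothetical path. Everything read from meta uses the Metadata interface documented above.

package main

import (
	"fmt"
	"os"

	"github.com/apache/iceberg-go/table"
)

func main() {
	raw, err := os.ReadFile("v2.metadata.json") // hypothetical metadata file
	if err != nil {
		panic(err)
	}

	// Assumed signature: ParseMetadataBytes(b []byte) (Metadata, error).
	meta, err := table.ParseMetadataBytes(raw)
	if err != nil {
		panic(err)
	}

	// All accessors below come from the Metadata interface.
	fmt.Println("format version:", meta.Version())
	fmt.Println("table uuid:", meta.TableUUID())
	fmt.Println("location:", meta.Location())
	if snap := meta.CurrentSnapshot(); snap != nil {
		fmt.Println("current snapshot:", snap.SnapshotID)
	}
}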
type MetadataLogEntry ¶
type MetadataV1 ¶
type MetadataV1 struct {
	Schema    iceberg.Schema           `json:"schema"`
	Partition []iceberg.PartitionField `json:"partition-spec"`
	// contains filtered or unexported fields
}
func (*MetadataV1) CurrentSchema ¶
func (c *MetadataV1) CurrentSchema() *iceberg.Schema
func (*MetadataV1) CurrentSnapshot ¶
func (c *MetadataV1) CurrentSnapshot() *Snapshot
func (*MetadataV1) DefaultPartitionSpec ¶
func (c *MetadataV1) DefaultPartitionSpec() int
func (*MetadataV1) Equals ¶
func (m *MetadataV1) Equals(other Metadata) bool
func (*MetadataV1) LastColumnID ¶
func (c *MetadataV1) LastColumnID() int
func (*MetadataV1) LastPartitionSpecID ¶
func (c *MetadataV1) LastPartitionSpecID() *int
func (*MetadataV1) LastUpdatedMillis ¶
func (c *MetadataV1) LastUpdatedMillis() int64
func (*MetadataV1) PartitionSpec ¶
func (c *MetadataV1) PartitionSpec() iceberg.PartitionSpec
func (*MetadataV1) PartitionSpecs ¶
func (c *MetadataV1) PartitionSpecs() []iceberg.PartitionSpec
func (*MetadataV1) Properties ¶
func (c *MetadataV1) Properties() iceberg.Properties
func (*MetadataV1) SnapshotByID ¶
func (c *MetadataV1) SnapshotByID(id int64) *Snapshot
func (*MetadataV1) SnapshotByName ¶
func (c *MetadataV1) SnapshotByName(name string) *Snapshot
func (*MetadataV1) SortOrders ¶
func (c *MetadataV1) SortOrders() []SortOrder
func (*MetadataV1) ToV2 ¶
func (m *MetadataV1) ToV2() MetadataV2
func (*MetadataV1) UnmarshalJSON ¶
func (m *MetadataV1) UnmarshalJSON(b []byte) error
type MetadataV2 ¶
type MetadataV2 struct {
	LastSequenceNumber int `json:"last-sequence-number"`
	// contains filtered or unexported fields
}
func (*MetadataV2) CurrentSchema ¶
func (c *MetadataV2) CurrentSchema() *iceberg.Schema
func (*MetadataV2) CurrentSnapshot ¶
func (c *MetadataV2) CurrentSnapshot() *Snapshot
func (*MetadataV2) DefaultPartitionSpec ¶
func (c *MetadataV2) DefaultPartitionSpec() int
func (*MetadataV2) Equals ¶
func (m *MetadataV2) Equals(other Metadata) bool
func (*MetadataV2) LastColumnID ¶
func (c *MetadataV2) LastColumnID() int
func (*MetadataV2) LastPartitionSpecID ¶
func (c *MetadataV2) LastPartitionSpecID() *int
func (*MetadataV2) LastUpdatedMillis ¶
func (c *MetadataV2) LastUpdatedMillis() int64
func (*MetadataV2) PartitionSpec ¶
func (c *MetadataV2) PartitionSpec() iceberg.PartitionSpec
func (*MetadataV2) PartitionSpecs ¶
func (c *MetadataV2) PartitionSpecs() []iceberg.PartitionSpec
func (*MetadataV2) Properties ¶
func (c *MetadataV2) Properties() iceberg.Properties
func (*MetadataV2) SnapshotByID ¶
func (c *MetadataV2) SnapshotByID(id int64) *Snapshot
func (*MetadataV2) SnapshotByName ¶
func (c *MetadataV2) SnapshotByName(name string) *Snapshot
func (*MetadataV2) SortOrders ¶
func (c *MetadataV2) SortOrders() []SortOrder
func (*MetadataV2) UnmarshalJSON ¶
func (m *MetadataV2) UnmarshalJSON(b []byte) error
type NameMapping ¶
type NameMapping []MappedField
func (NameMapping) String ¶
func (nm NameMapping) String() string
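Since MappedField carries json tags (names, field-id, fields), a name mapping can be decoded directly from its JSON representation. A small sketch; the sample JSON and field IDs below are illustrative only.

package main

import (
	"encoding/json"
	"fmt"

	"github.com/apache/iceberg-go/table"
)

func main() {
	const mappingJSON = `[
		{"field-id": 1, "names": ["id", "record_id"]},
		{"field-id": 2, "names": ["data"], "fields": [
			{"field-id": 3, "names": ["measurement"]}
		]}
	]`

	var nm table.NameMapping
	if err := json.Unmarshal([]byte(mappingJSON), &nm); err != nil {
		panic(err)
	}
	fmt.Println(nm) // uses NameMapping.String()
}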
type NameMappingAccessor ¶
type NameMappingAccessor struct{}
func (NameMappingAccessor) FieldPartner ¶
func (n NameMappingAccessor) FieldPartner(partnerStruct *MappedField, _ int, fieldName string) *MappedField
func (NameMappingAccessor) ListElementPartner ¶
func (n NameMappingAccessor) ListElementPartner(partnerList *MappedField) *MappedField
func (NameMappingAccessor) MapKeyPartner ¶
func (n NameMappingAccessor) MapKeyPartner(partnerMap *MappedField) *MappedField
func (NameMappingAccessor) MapValuePartner ¶
func (n NameMappingAccessor) MapValuePartner(partnerMap *MappedField) *MappedField
func (NameMappingAccessor) SchemaPartner ¶
func (NameMappingAccessor) SchemaPartner(partner *MappedField) *MappedField
type NameMappingVisitor ¶
type NameMappingVisitor[S, T any] interface {
	Mapping(nm NameMapping, fieldResults S) S
	Fields(st []MappedField, fieldResults []T) S
	Field(field MappedField, fieldResult S) T
}
type Operation ¶
type Operation string
func ValidOperation ¶
ValidOperation ensures that a given string is one of the valid operation types: append, replace, overwrite, delete.
type Scan ¶
type Scan struct {
// contains filtered or unexported fields
}
func (*Scan) Projection ¶
func (scan *Scan) Projection() (*iceberg.Schema, error)
func (*Scan) ToArrowRecords ¶
func (scan *Scan) ToArrowRecords(ctx context.Context) (*arrow.Schema, iter.Seq2[arrow.Record, error], error)
ToArrowRecords returns the Arrow schema of the expected records and an iterator that can be used with a range expression to read the records as they become available. If an error is encountered during planning and setup, it is returned directly. If an error occurs while iterating the records, it is returned by the iterator.
The purpose of returning the schema up front is to handle the case where no rows are returned: the resulting Arrow schema of the projection is still known.
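A sketch of consuming the iterator with a range-over-func loop (iter.Seq2 requires Go 1.23+). It assumes scan is a *Scan obtained from Table.Scan; each arrow.Record should be released once processed.

package example

import (
	"context"
	"fmt"

	"github.com/apache/iceberg-go/table"
)

func consumeScan(ctx context.Context, scan *table.Scan) error {
	schema, records, err := scan.ToArrowRecords(ctx)
	if err != nil {
		// Planning/setup failures are returned directly.
		return err
	}
	fmt.Println("projected schema:", schema)

	for rec, err := range records {
		if err != nil {
			// Errors hit while reading are yielded by the iterator.
			return err
		}
		fmt.Println("rows in batch:", rec.NumRows())
		rec.Release()
	}
	return nil
}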
func (*Scan) ToArrowTable ¶
func (scan *Scan) ToArrowTable(ctx context.Context) (arrow.Table, error)
ToArrowTable calls ToArrowRecords, gathers all of the records together, and returns an arrow.Table made from those records.
func (*Scan) UseRowLimit ¶
func (scan *Scan) UseRowLimit(n int64) *Scan
type ScanOption ¶
type ScanOption func(*Scan)
func WitMaxConcurrency ¶
func WitMaxConcurrency(n int) ScanOption
WitMaxConcurrency sets the maximum concurrency for table scan and plan operations. When unset, it defaults to runtime.GOMAXPROCS.
func WithCaseSensitive ¶
func WithCaseSensitive(b bool) ScanOption
func WithLimit ¶
func WithLimit(n int64) ScanOption
func WithOptions ¶
func WithOptions(opts iceberg.Properties) ScanOption
func WithRowFilter ¶
func WithRowFilter(e iceberg.BooleanExpression) ScanOption
func WithSelectedFields ¶
func WithSelectedFields(fields ...string) ScanOption
func WithSnapshotID ¶
func WithSnapshotID(n int64) ScanOption
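A sketch of composing the options above into a scan and materializing it. It assumes tbl is a Table obtained elsewhere (for example from a catalog or NewFromLocation) and that the selected columns exist; the arrow-go import path should match the version this module uses.

package example

import (
	"context"

	"github.com/apache/arrow-go/v18/arrow"
	"github.com/apache/iceberg-go/table"
)

func scanIDAndName(ctx context.Context, tbl table.Table) (arrow.Table, error) {
	scan := tbl.Scan(
		table.WithSelectedFields("id", "name"), // project only these columns
		table.WithLimit(1000),                  // stop after 1000 rows
		table.WithCaseSensitive(false),         // match column names case-insensitively
		table.WitMaxConcurrency(4),             // option name as documented above
	)

	// ToArrowTable plans the files, reads the records, and gathers them
	// into a single arrow.Table.
	return scan.ToArrowTable(ctx)
}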
type Snapshot ¶
type Snapshot struct {
	SnapshotID       int64    `json:"snapshot-id"`
	ParentSnapshotID *int64   `json:"parent-snapshot-id,omitempty"`
	SequenceNumber   int64    `json:"sequence-number"`
	TimestampMs      int64    `json:"timestamp-ms"`
	ManifestList     string   `json:"manifest-list,omitempty"`
	Summary          *Summary `json:"summary,omitempty"`
	SchemaID         *int     `json:"schema-id,omitempty"`
}
type SnapshotLogEntry ¶
type SnapshotRef ¶
type SnapshotRef struct {
	SnapshotID         int64   `json:"snapshot-id"`
	SnapshotRefType    RefType `json:"type"`
	MinSnapshotsToKeep *int    `json:"min-snapshots-to-keep,omitempty"`
	MaxSnapshotAgeMs   *int64  `json:"max-snapshot-age-ms,omitempty"`
	MaxRefAgeMs        *int64  `json:"max-ref-age-ms,omitempty"`
}
SnapshotRef represents the reference information for a specific snapshot
func (*SnapshotRef) Equals ¶
func (s *SnapshotRef) Equals(rhs SnapshotRef) bool
func (*SnapshotRef) UnmarshalJSON ¶
func (s *SnapshotRef) UnmarshalJSON(b []byte) error
type SortDirection ¶
type SortDirection string
const (
	SortASC  SortDirection = "asc"
	SortDESC SortDirection = "desc"
)
type SortField ¶
type SortField struct {
	// SourceID is the source column id from the table's schema
	SourceID int `json:"source-id"`
	// Transform is the transformation used to produce values to be
	// sorted on from the source column.
	Transform iceberg.Transform `json:"transform"`
	// Direction is an enum indicating ascending or descending direction.
	Direction SortDirection `json:"direction"`
	// NullOrder describes the order of null values when sorting;
	// it must be either the nulls-first or nulls-last enum value.
	NullOrder NullOrder `json:"null-order"`
}
SortField describes a field used in a sort order definition.
func (*SortField) MarshalJSON ¶
func (*SortField) UnmarshalJSON ¶
type SortOrder ¶
SortOrder describes how the data is sorted within the table.
Data can be sorted within partitions by columns to improve performance. The order of the sort fields within the list defines the order in which the sort is applied to the data.
func (*SortOrder) UnmarshalJSON ¶
type Summary ¶
Summary stores the summary information for a snapshot: the operation that created the snapshot and any additional properties that exist in the summary.
func (*Summary) MarshalJSON ¶
func (*Summary) UnmarshalJSON ¶
type Table ¶
type Table struct {
// contains filtered or unexported fields
}
func NewFromLocation ¶
func (Table) CurrentSnapshot ¶
func (t Table) CurrentSnapshot() *Snapshot
func (Table) Identifier ¶
func (t Table) Identifier() Identifier
func (Table) MetadataLocation ¶
func (t Table) MetadataLocation() string
func (Table) Properties ¶
func (t Table) Properties() iceberg.Properties
func (Table) Scan ¶
func (t Table) Scan(opts ...ScanOption) *Scan
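Finally, a sketch of inspecting a Table using only the accessors listed above; how the Table value is obtained (catalog lookup, NewFromLocation, ...) is outside the scope of this snippet.

package example

import (
	"fmt"

	"github.com/apache/iceberg-go/table"
)

func describe(tbl table.Table) {
	fmt.Println("identifier:", tbl.Identifier())
	fmt.Println("location:", tbl.Location())
	fmt.Println("metadata file:", tbl.MetadataLocation())
	fmt.Println("partition spec:", tbl.Spec())
	fmt.Println("sort order:", tbl.SortOrder())

	if snap := tbl.CurrentSnapshot(); snap != nil {
		fmt.Println("current snapshot:", snap.SnapshotID)
	}
	for id, sc := range tbl.Schemas() {
		fmt.Println("schema", id, ":", sc)
	}
}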