datastore

package
v0.4.14-rc14 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Sep 30, 2024 License: Apache-2.0 Imports: 31 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

View Source
var ErrDBDatasetExists = errors.New("dataset already exists in database")
View Source
var ErrDBDocumentNotFound = errors.New("document not found in database")

ErrDBDocumentNotFound is returned when a document is not found in the database.

View Source
var ErrDBFileNotFound = errors.New("file not found in database")

ErrDBFileNotFound is returned when a file is not found in the database.

View Source
var IsDuplicateFuncs = map[string]IsDuplicateFunc{
	"file_metadata": DedupeByFileMetadata,
	"dummy":         DummyDedupe,
	"none":          DummyDedupe,
	"ignore":        DummyDedupe,
	"upsert":        DedupeUpsert,
}

IsDuplicateFuncs is a map of deduplication functions by name.

Functions

func DedupeByFileMetadata

func DedupeByFileMetadata(ctx context.Context, d *Datastore, datasetID string, content []byte, opts IngestOpts) (bool, error)

DedupeByFileMetadata is a deduplication function that checks if the document is a duplicate based on the file metadata.

func DedupeUpsert added in v0.4.3

func DedupeUpsert(ctx context.Context, d *Datastore, datasetID string, content []byte, opts IngestOpts) (bool, error)

func DummyDedupe

func DummyDedupe(ctx context.Context, d *Datastore, datasetID string, content []byte, opts IngestOpts) (bool, error)

DummyDedupe is a dummy deduplication function that always returns false (i.e. "No Duplicate").

func GetDatastorePaths

func GetDatastorePaths(dsn, vectordbPath string) (string, string, bool, error)

GetDatastorePaths returns the paths for the datastore and vectorstore databases. In addition, it returns a boolean indicating whether the datastore is an archive.

func LogEmbeddingFunc added in v0.4.14

func LogEmbeddingFunc(embeddingFunc cg.EmbeddingFunc) cg.EmbeddingFunc

Types

type Datastore

type Datastore struct {
	LLM                    llm.LLM
	Index                  *index.DB
	Vectorstore            vectorstore.VectorStore
	EmbeddingConfig        config.EmbeddingsConfig
	EmbeddingModelProvider etypes.EmbeddingModelProvider
}

func NewDatastore

func NewDatastore(dsn string, automigrate bool, vectorDBPath string, embeddingProvider etypes.EmbeddingModelProvider) (*Datastore, error)

func (*Datastore) DeleteDataset

func (s *Datastore) DeleteDataset(ctx context.Context, datasetID string) error

func (*Datastore) DeleteDocument

func (s *Datastore) DeleteDocument(ctx context.Context, documentID, datasetID string) error

func (*Datastore) DeleteFile

func (s *Datastore) DeleteFile(ctx context.Context, datasetID, fileID string) error

func (*Datastore) ExportDatasetsToFile added in v0.1.7

func (s *Datastore) ExportDatasetsToFile(ctx context.Context, path string, datasets ...string) error

func (*Datastore) GetDataset

func (s *Datastore) GetDataset(ctx context.Context, datasetID string) (*index.Dataset, error)

func (*Datastore) GetDocuments added in v0.4.11

func (s *Datastore) GetDocuments(ctx context.Context, datasetID string, where map[string]string, whereDocument []chromem.WhereDocument) ([]vectorstore.Document, error)

func (*Datastore) ImportDatasetsFromFile added in v0.1.7

func (s *Datastore) ImportDatasetsFromFile(ctx context.Context, path string, datasets ...string) error

func (*Datastore) Ingest

func (s *Datastore) Ingest(ctx context.Context, datasetID string, name string, content []byte, opts IngestOpts) ([]string, error)

Ingest loads a document from the given content bytes and adds it to the dataset.

func (*Datastore) ListDatasets

func (s *Datastore) ListDatasets(ctx context.Context) ([]index.Dataset, error)

func (*Datastore) NewDataset

func (s *Datastore) NewDataset(ctx context.Context, dataset index.Dataset) error

func (*Datastore) PruneFiles added in v0.4.3

func (s *Datastore) PruneFiles(ctx context.Context, datasetID string, pathPrefix string, keep []string) ([]index.File, error)

func (*Datastore) Retrieve

func (s *Datastore) Retrieve(ctx context.Context, datasetIDs []string, query string, opts RetrieveOpts) (*types.RetrievalResponse, error)

func (*Datastore) SimilaritySearch added in v0.1.8

func (s *Datastore) SimilaritySearch(ctx context.Context, query string, numDocuments int, datasetID string, where map[string]string, whereDocument []chromem.WhereDocument) ([]vectorstore.Document, error)

func (*Datastore) UpdateDataset added in v0.1.8

func (s *Datastore) UpdateDataset(ctx context.Context, updatedDataset index.Dataset, opts *UpdateDatasetOpts) (*index.Dataset, error)

type IngestOpts

type IngestOpts struct {
	FileMetadata        *index.FileMetadata
	IsDuplicateFuncName string
	IsDuplicateFunc     IsDuplicateFunc
	TextSplitterOpts    *textsplitter.TextSplitterOpts
	IngestionFlows      []flows.IngestionFlow
	ExtraMetadata       map[string]any
}

type IsDuplicateFunc

type IsDuplicateFunc func(ctx context.Context, d *Datastore, datasetID string, content []byte, opts IngestOpts) (bool, error)

IsDuplicateFunc is a function that determines whether a document is a duplicate or whether it should be ingested. The function should return true if the document is a duplicate (and thus should not be ingested) and false otherwise.

type RetrieveOpts added in v0.1.6

type RetrieveOpts struct {
	TopK          int
	Keywords      []string
	RetrievalFlow *flows.RetrievalFlow
}

type UpdateDatasetOpts added in v0.1.8

type UpdateDatasetOpts struct {
	ReplaceMedata bool
}

Directories

Path Synopsis
lib
Package postprocessors is basically the same as package transformers, but used at a different stage of the RAG pipeline

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL