datastore

package
v0.1.5 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: May 13, 2024 | License: Apache-2.0 | Imports: 27 | Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

View Source
var ErrDBDocumentNotFound = errors.New("document not found in database")

ErrDBDocumentNotFound is returned when a document is not found in the database.

View Source
var ErrDBFileNotFound = errors.New("file not found in database")

ErrDBFileNotFound is returned when a file is not found in the database.

View Source
var IsDuplicateFuncs = map[string]IsDuplicateFunc{
	"file_metadata": DedupeByFileMetadata,
	"dummy":         DummyDedupe,
	"none":          DummyDedupe,
	"ignore":        DummyDedupe,
}

IsDuplicateFuncs is a map of deduplication functions by name.

Functions

func DedupeByFileMetadata

func DedupeByFileMetadata(ctx context.Context, d *Datastore, datasetID string, content []byte, opts IngestOpts) (bool, error)

DedupeByFileMetadata is a deduplication function that checks if the document is a duplicate based on the file metadata.

func DummyDedupe

func DummyDedupe(ctx context.Context, d *Datastore, datasetID string, content []byte, opts IngestOpts) (bool, error)

DummyDedupe is a dummy deduplication function that always returns false (i.e. "No Duplicate").

func GetDatastorePaths

func GetDatastorePaths(dsn, vectordbPath string) (string, string, error)

func GetDocuments

func GetDocuments(ctx context.Context, filename, filetype string, reader io.Reader, textSplitterOpts *TextSplitterOpts) ([]vs.Document, error)

func NewLcgoMarkdownSplitter added in v0.1.5

func NewLcgoMarkdownSplitter(opts TextSplitterOpts) *lcgosplitter.MarkdownTextSplitter

func NewLcgoTextSplitter added in v0.1.5

func NewLcgoTextSplitter(opts TextSplitterOpts) lcgosplitter.TokenSplitter

NewLcgoTextSplitter returns a new langchain-go text splitter.

Types

type Datastore

type Datastore struct {
	Index       *index.DB
	Vectorstore vectorstore.VectorStore
}

func NewDatastore

func NewDatastore(dsn string, automigrate bool, vectorDBPath string, openAIConfig config.OpenAIConfig) (*Datastore, error)

func (*Datastore) DeleteDataset

func (s *Datastore) DeleteDataset(ctx context.Context, datasetID string) error

func (*Datastore) DeleteDocument

func (s *Datastore) DeleteDocument(ctx context.Context, documentID, datasetID string) error

func (*Datastore) DeleteFile

func (s *Datastore) DeleteFile(ctx context.Context, datasetID, fileID string) error

func (*Datastore) GetDataset

func (s *Datastore) GetDataset(ctx context.Context, datasetID string) (*index.Dataset, error)

func (*Datastore) Ingest

func (s *Datastore) Ingest(ctx context.Context, datasetID string, content []byte, opts IngestOpts) ([]string, error)

Ingest loads a document from the given content bytes and adds it to the dataset identified by datasetID.

func (*Datastore) ListDatasets

func (s *Datastore) ListDatasets(ctx context.Context) ([]index.Dataset, error)

func (*Datastore) NewDataset

func (s *Datastore) NewDataset(ctx context.Context, dataset index.Dataset) error

func (*Datastore) Retrieve

func (s *Datastore) Retrieve(ctx context.Context, datasetID string, query string, topk int) ([]vectorstore.Document, error)

type IngestOpts

type IngestOpts struct {
	Filename            *string
	FileMetadata        *index.FileMetadata
	IsDuplicateFuncName string
	IsDuplicateFunc     IsDuplicateFunc
	TextSplitterOpts    *TextSplitterOpts
}

type IsDuplicateFunc

type IsDuplicateFunc func(ctx context.Context, d *Datastore, datasetID string, content []byte, opts IngestOpts) (bool, error)

IsDuplicateFunc is a function that determines whether a document is a duplicate and should therefore be skipped instead of ingested. The function should return true if the document is a duplicate (and thus should not be ingested) and false otherwise.

type TextSplitterOpts added in v0.1.5

type TextSplitterOpts struct {
	ChunkSize    int    `usage:"Textsplitter Chunk Size" default:"1024" env:"KNOW_TEXTSPLITTER_CHUNK_SIZE" name:"textsplitter-chunk-size"`
	ChunkOverlap int    `usage:"Textsplitter Chunk Overlap" default:"256" env:"KNOW_TEXTSPLITTER_CHUNK_OVERLAP" name:"textsplitter-chunk-overlap"`
	ModelName    string `usage:"Textsplitter Model Name" default:"gpt-4" env:"KNOW_TEXTSPLITTER_MODEL_NAME" name:"textsplitter-model-name"`
	EncodingName string `` /* 128-byte string literal not displayed */
}

func NewTextSplitterOpts added in v0.1.5

func NewTextSplitterOpts() TextSplitterOpts

NewTextSplitterOpts returns the default options for a text splitter.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL