Documentation ¶
Index ¶
- func NewDirectoryIndex() *pb.DirectoryIndex
- type DiskCachedEmbeddingIndex
- func (this *DiskCachedEmbeddingIndex) ClearPath(ctx context.Context, path string) error
- func (this *DiskCachedEmbeddingIndex) ClearPaths(ctx context.Context, paths []string) error
- func (this *DiskCachedEmbeddingIndex) EmbedFile(ctx context.Context, path string, chunkSize, maxChunks int) (*pb.FileEmbeddings, error)
- func (this *DiskCachedEmbeddingIndex) FilterUnindexablefiles(path string, files []os.FileInfo, forceUpdate bool, ...) []os.FileInfo
- func (this *DiskCachedEmbeddingIndex) IndexPath(ctx context.Context, path string, forceUpdate bool, chunkSize, maxChunks int) error
- func (this *DiskCachedEmbeddingIndex) IndexPaths(ctx context.Context, paths []string, forceUpdate bool, ...) error
- func (this *DiskCachedEmbeddingIndex) IndexableDirectory(path string) bool
- func (this *DiskCachedEmbeddingIndex) IndexableFile(path string, file os.FileInfo, forceUpdate bool, ...) bool
- func (this *DiskCachedEmbeddingIndex) IndexedFiles() []string
- func (this *DiskCachedEmbeddingIndex) LoadDotfile(dotfile string) error
- func (this *DiskCachedEmbeddingIndex) LoadPath(ctx context.Context, path string) error
- func (this *DiskCachedEmbeddingIndex) LoadPaths(ctx context.Context, paths []string) error
- func (this *DiskCachedEmbeddingIndex) PopulateSearchResults(ctx context.Context, results []*VectorSearchResult) error
- func (this *DiskCachedEmbeddingIndex) SavePath(path string) error
- func (this *DiskCachedEmbeddingIndex) SavePaths(paths []string) error
- func (this *DiskCachedEmbeddingIndex) Search(ctx context.Context, query string, numResults int) ([]*VectorSearchResult, error)
- func (this *DiskCachedEmbeddingIndex) SearchWithVector(ctx context.Context, queryVector []float32, numResults int) ([]*VectorSearchResult, error)
- func (this *DiskCachedEmbeddingIndex) SetDefaultConfig()
- func (this *DiskCachedEmbeddingIndex) SetEmbedder(embedder Embedder)
- func (this *DiskCachedEmbeddingIndex) SetOutput(out io.Writer)
- func (this *DiskCachedEmbeddingIndex) SetVerbosity(verbosity int)
- func (this *DiskCachedEmbeddingIndex) Vectorize(ctx context.Context, content string) ([]float32, error)
- type Embedder
- type FileEmbeddingIndex
- type VectorSearchResult
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func NewDirectoryIndex ¶
func NewDirectoryIndex() *pb.DirectoryIndex
Types ¶
type DiskCachedEmbeddingIndex ¶
type DiskCachedEmbeddingIndex struct { // maps absolute path of directory to a directory Index Index map[string]*pb.DirectoryIndex // Interface to an Embedder used to embed chunks of documents Embedder Embedder // A filesystem interface, used when reading and writing files. // We use an interface here so that we can mock the filesystem during testing. Fs afero.Fs // The output stream to use for logging Out io.Writer // The Verbosity level of the output stream // 0 - no output // 1 - most important calls // 2 - more detail about embeddings Verbosity int // The name of the file to cache the index on disk DotfileName string // When we call the embedder we batch chunks together into a single call, // this is the number of chunks to batch together ChunksPerCall int // When we embed a path we skip these directories IgnoreDirs []string // When we embed a path we skip these files IgnoreFiles []string }
func NewDiskCachedEmbeddingIndex ¶
func NewDiskCachedEmbeddingIndex(embedder Embedder, writer io.Writer) *DiskCachedEmbeddingIndex
func (*DiskCachedEmbeddingIndex) ClearPath ¶
func (this *DiskCachedEmbeddingIndex) ClearPath(ctx context.Context, path string) error
Clear out embeddings at a given path, both in memory and on disk. We do this by first locating all dotfiles in the path, then deleting the in-memory copy, and finally deleting the dotfiles.
func (*DiskCachedEmbeddingIndex) ClearPaths ¶
func (this *DiskCachedEmbeddingIndex) ClearPaths(ctx context.Context, paths []string) error
func (*DiskCachedEmbeddingIndex) EmbedFile ¶
func (this *DiskCachedEmbeddingIndex) EmbedFile(ctx context.Context, path string, chunkSize, maxChunks int) (*pb.FileEmbeddings, error)
EmbedFile takes a path to a file, splits the file into chunks, and calls the embedding API for each chunk
func (*DiskCachedEmbeddingIndex) FilterUnindexablefiles ¶
func (this *DiskCachedEmbeddingIndex) FilterUnindexablefiles(path string, files []os.FileInfo, forceUpdate bool, dirIndex *pb.DirectoryIndex) []os.FileInfo
func (*DiskCachedEmbeddingIndex) IndexPath ¶
func (this *DiskCachedEmbeddingIndex) IndexPath(ctx context.Context, path string, forceUpdate bool, chunkSize, maxChunks int) error
forceUpdate means that we will re-index a file even if it hasn't changed since it was last indexed.
func (*DiskCachedEmbeddingIndex) IndexPaths ¶
func (*DiskCachedEmbeddingIndex) IndexableDirectory ¶
func (this *DiskCachedEmbeddingIndex) IndexableDirectory(path string) bool
func (*DiskCachedEmbeddingIndex) IndexableFile ¶
func (this *DiskCachedEmbeddingIndex) IndexableFile(path string, file os.FileInfo, forceUpdate bool, previousEmbeddings *pb.FileEmbeddings) bool
Return true if this is a file we want to index/embed. We use several predicates to determine this.
- The file must be a non-hidden file (i.e. not starting with a dot)
- The file must not be a directory (handled separately)
- The file must be text, not binary, checked by extension/mime-type and by checking the first few bytes of the file if the extension check passes
- The file must have been updated since the last indexing, unless forceUpdate is true
func (*DiskCachedEmbeddingIndex) IndexedFiles ¶
func (this *DiskCachedEmbeddingIndex) IndexedFiles() []string
func (*DiskCachedEmbeddingIndex) LoadDotfile ¶
func (this *DiskCachedEmbeddingIndex) LoadDotfile(dotfile string) error
Assumes dotfile is the path to a valid butterfish index file.
func (*DiskCachedEmbeddingIndex) LoadPath ¶
func (this *DiskCachedEmbeddingIndex) LoadPath(ctx context.Context, path string) error
func (*DiskCachedEmbeddingIndex) LoadPaths ¶
func (this *DiskCachedEmbeddingIndex) LoadPaths(ctx context.Context, paths []string) error
func (*DiskCachedEmbeddingIndex) PopulateSearchResults ¶
func (this *DiskCachedEmbeddingIndex) PopulateSearchResults(ctx context.Context, results []*VectorSearchResult) error
Given an array of VectorSearchResults, fetch the file contents for each result and store it in the result's Content field.
func (*DiskCachedEmbeddingIndex) SavePath ¶
func (this *DiskCachedEmbeddingIndex) SavePath(path string) error
func (*DiskCachedEmbeddingIndex) SavePaths ¶
func (this *DiskCachedEmbeddingIndex) SavePaths(paths []string) error
func (*DiskCachedEmbeddingIndex) Search ¶
func (this *DiskCachedEmbeddingIndex) Search(ctx context.Context, query string, numResults int) ([]*VectorSearchResult, error)
Search the vectors that have been loaded into memory by embedding the query string and then searching for the closest vectors based on cosine distance. This method calls the following methods in succession: 1. Vectorize() 2. SearchWithVector() 3. PopulateSearchResults()
func (*DiskCachedEmbeddingIndex) SearchWithVector ¶
func (this *DiskCachedEmbeddingIndex) SearchWithVector(ctx context.Context, queryVector []float32, numResults int) ([]*VectorSearchResult, error)
Super naive vector search operation.
- First we brute force search by iterating over all stored vectors and calculating cosine distance
- Next we sort based on score
func (*DiskCachedEmbeddingIndex) SetDefaultConfig ¶
func (this *DiskCachedEmbeddingIndex) SetDefaultConfig()
func (*DiskCachedEmbeddingIndex) SetEmbedder ¶
func (this *DiskCachedEmbeddingIndex) SetEmbedder(embedder Embedder)
func (*DiskCachedEmbeddingIndex) SetOutput ¶
func (this *DiskCachedEmbeddingIndex) SetOutput(out io.Writer)
func (*DiskCachedEmbeddingIndex) SetVerbosity ¶
func (this *DiskCachedEmbeddingIndex) SetVerbosity(verbosity int)
type FileEmbeddingIndex ¶
type FileEmbeddingIndex interface { SetEmbedder(embedder Embedder) Search(ctx context.Context, query string, numResults int) ([]*VectorSearchResult, error) Vectorize(ctx context.Context, content string) ([]float32, error) SearchWithVector(ctx context.Context, queryVector []float32, k int) ([]*VectorSearchResult, error) PopulateSearchResults(ctx context.Context, embeddings []*VectorSearchResult) error ClearPaths(ctx context.Context, paths []string) error ClearPath(ctx context.Context, path string) error LoadPaths(ctx context.Context, paths []string) error LoadPath(ctx context.Context, path string) error IndexPaths(ctx context.Context, paths []string, forceUpdate bool, chunkSize, maxChunks int) error IndexPath(ctx context.Context, path string, forceUpdate bool, chunkSize, maxChunks int) error IndexedFiles() []string }