Documentation ¶
Index ¶
- Constants
- func FindMatchingLines(r io.Reader, query string, limit int) []string
- func GetFill(doc []bool) float64
- func HashBloom(word []byte) []uint64
- func Itemise(tokens []string) []bool
- func Ngrams(text string, size int) []string
- func RemoveUInt64Duplicates(s []uint64) []uint64
- func Trigrams(text string) []string
- func TrigramsDancantos(text string) []string
- func TrigramsFfmiruz(text string) []string
- func TrigramsMerovius(text string) []string
- type Archive
- type File
- type Index
- func (idx *Index) Add(item []bool) error
- func (idx *Index) Archive() *Archive
- func (idx *Index) Close() error
- func (idx *Index) ExportArchive(ctx context.Context, wr io.Writer) error
- func (idx *Index) ExportArchiveWithURI(ctx context.Context, archive_uri string) error
- func (idx *Index) IdToFile(id uint32) *File
- func (idx *Index) ImportArchive(ctx context.Context, r io.Reader) error
- func (idx *Index) ImportArchiveWithURI(ctx context.Context, archive_uri string) error
- func (idx *Index) IndexBuckets(ctx context.Context, bucket_uris ...string) error
- func (idx *Index) IndexObject(ctx context.Context, b *blob.Bucket, bucket_id uint32, obj *blob.ListObject) error
- func (idx *Index) OpenFile(ctx context.Context, id uint32) (io.ReadCloser, error)
- func (idx *Index) PrintIndex()
- func (idx *Index) Queryise(query string) []uint64
- func (idx *Index) Search(queryBits []uint64) []uint32
- func (idx *Index) Tokenize(text string) []string
- type IndexOptions
- type Trigram
Constants ¶
const ( BloomSize = 4096 DocumentsPerBlock = 64 )
Variables ¶
This section is empty.
Functions ¶
func FindMatchingLines ¶
Given a file and a query, try to open the file, then look through its lines and see if any of them match something from the query, up to a limit. Note this will return partial matches: if any term matches, it's considered a match, and there is no accounting for better matches... In other words, it's a very dumb way of doing this and probably has horrible runtime performance to match.
func GetFill ¶
GetFill returns the % value of how much this doc was filled, allowing for determining if the index will be overfilled for this document
func HashBloom ¶
HashBloom hashes a single token/word 3 times to give us the entry locations we need for our bloom filter.
func Itemise ¶
Itemise, given some content, will turn it into tokens and then use those to create the bit positions we need to set for our bloom filter index.
func Ngrams ¶
Ngrams, given input, splits it according to the requested size such that you can get trigrams or whatever else is required.
func RemoveUInt64Duplicates ¶
RemoveUInt64Duplicates removes duplicate values from uint64 slice
func Trigrams ¶
Trigrams takes in text and returns its trigrams. Attempts to be as efficient as possible.
func TrigramsDancantos ¶
TrigramsDancantos takes in text and returns its trigrams.
func TrigramsFfmiruz ¶
func TrigramsMerovius ¶
Types ¶
type Archive ¶
type Archive struct { BloomFilter []uint64 `json:"bloom_filter"` IdToFile []*File `json:"id_to_file"` BucketURIs map[string]uint32 `json:"bucket_uris"` }
Archive implements a struct containing data for serializing and deserializing `Index` instances
type Index ¶
type Index struct {
// contains filtered or unexported fields
}
Index implements a bloom filter based search index
func NewIndexWithOptions ¶
func NewIndexWithOptions(opts *IndexOptions) *Index
func (*Index) Add ¶
Add adds items into the internal bloomFilter used later for pre-screening documents. Note that it fills the filter from right to left, which might not be what you expect.
func (*Index) ExportArchive ¶
func (*Index) ExportArchiveWithURI ¶
func (*Index) ImportArchive ¶
func (*Index) ImportArchiveWithURI ¶
func (*Index) IndexBuckets ¶
func (*Index) IndexObject ¶
func (*Index) PrintIndex ¶
func (idx *Index) PrintIndex()
PrintIndex prints out the index which can be useful from time to time to ensure that bits are being set correctly.
func (*Index) Queryise ¶
Queryise, given some content, will turn it into tokens, hash them, and store the resulting values in a slice which we can use to query the bloom filter.
type IndexOptions ¶
func DefaultIndexOptions ¶
func DefaultIndexOptions() *IndexOptions
type Trigram ¶
type Trigram [3]rune
func TrigramsJamesrom ¶
TrigramsJamesrom takes in text and returns its trigrams.
func (Trigram) Bytes ¶
Bytes is the simplest way to turn an array of runes into a slice of bytes. There is a faster way to do this, but not needed for this demo. See: https://stackoverflow.com/questions/29255746/how-encode-rune-into-byte-using-utf8