Documentation ¶
Index ¶
- Constants
- Variables
- func NewLucene41PostingsReader(dir store.Directory, fis FieldInfos, si *SegmentInfo, ctx store.IOContext, ...) (r PostingsReaderBase, err error)
- type DataInput
- type ForUtil
- type IndexOutput
- type Lucene41PostingsFormat
- type Lucene41PostingsReader
- func (r *Lucene41PostingsReader) Close() error
- func (r *Lucene41PostingsReader) DecodeTerm(longs []int64, in util.DataInput, fieldInfo *FieldInfo, ...) (err error)
- func (r *Lucene41PostingsReader) Docs(fieldInfo *FieldInfo, termState *BlockTermState, liveDocs util.Bits, ...) (de DocsEnum, err error)
- func (r *Lucene41PostingsReader) Init(termsIn store.IndexInput) error
- func (r *Lucene41PostingsReader) NewTermState() *BlockTermState
- type Lucene41PostingsWriter
- func (w *Lucene41PostingsWriter) AddPosition(position int, payload []byte, startOffset, endOffset int) error
- func (w *Lucene41PostingsWriter) Close() (err error)
- func (w *Lucene41PostingsWriter) EncodeTerm(longs []int64, out util.DataOutput, fieldInfo *FieldInfo, ...) (err error)
- func (w *Lucene41PostingsWriter) FinishDoc() error
- func (w *Lucene41PostingsWriter) FinishTerm(_state *BlockTermState) error
- func (w *Lucene41PostingsWriter) Init(termsOut store.IndexOutput) error
- func (w *Lucene41PostingsWriter) NewTermState() *BlockTermState
- func (w *Lucene41PostingsWriter) SetField(fieldInfo *FieldInfo) int
- func (w *Lucene41PostingsWriter) StartDoc(docId, termDocFreq int) error
- func (w *Lucene41PostingsWriter) StartTerm() error
- type Lucene41StoredFieldsFormat
- type SkipWriter
- func (w *SkipWriter) BufferSkip(doc, numDocs int, posFP, payFP int64, posBufferUpto, payloadByteUpto int) error
- func (w *SkipWriter) ResetSkip()
- func (w *SkipWriter) SetField(fieldHasPositions, fieldHasOffsets, fieldHasPayloads bool)
- func (w *SkipWriter) WriteSkipData(level int, skipBuffer store.IndexOutput) error
Constants ¶
const (
	// Special number of bits per value used whenever all values to encode are equal.
	ALL_VALUES_EQUAL = 0
	// Upper limit of the number of bytes that might be required to store
	// LUCENE41_BLOCK_SIZE encoded values.
	MAX_ENCODED_SIZE = LUCENE41_BLOCK_SIZE * 4
)
const (
	LUCENE41_DOC_EXTENSION = "doc"
	LUCENE41_POS_EXTENSION = "pos"
	LUCENE41_PAY_EXTENSION = "pay"
	LUCENE41_BLOCK_SIZE    = 128
)
const (
	LUCENE41_TERMS_CODEC = "Lucene41PostingsWriterTerms"
	LUCENE41_DOC_CODEC   = "Lucene41PostingsWriterDoc"
	LUCENE41_POS_CODEC   = "Lucene41PostingsWriterPos"
	LUCENE41_PAY_CODEC   = "Lucene41PostingsWriterPay"

	LUCENE41_VERSION_START      = 0
	LUCENE41_VERSION_META_ARRAY = 1
	LUCENE41_VERSION_CHECKSUM   = 2
	LUCENE41_VERSION_CURRENT    = LUCENE41_VERSION_CHECKSUM
)
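The extension constants name the three per-segment postings files; the codec strings and version constants are written into each file's header. A small sketch of how the extensions map to file names (segmentFileName is a hypothetical local helper mirroring Lucene's "<segment>.<ext>" convention, not part of this package's documented API; the .pos/.pay files only exist when a field indexes positions/payloads):

// segmentFileName mirrors Lucene's "<segment>.<ext>" naming and is a
// local helper for illustration, not part of this package's API.
func segmentFileName(segment, ext string) string {
	return segment + "." + ext
}

func postingsFiles(segment string) (doc, pos, pay string) {
	doc = segmentFileName(segment, LUCENE41_DOC_EXTENSION) // e.g. "_0.doc"
	pos = segmentFileName(segment, LUCENE41_POS_EXTENSION) // e.g. "_0.pos"
	pay = segmentFileName(segment, LUCENE41_PAY_EXTENSION) // e.g. "_0.pay"
	return
}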
Variables ¶
var MAX_DATA_SIZE int = computeMaxDataSize()
Upper limit of the number of values that might be decoded in a single call to readBlock. Although values after BLOCK_SIZE are garbage, it is necessary to allocate value buffers whose size is >= MAX_DATA_SIZE to avoid index-out-of-range panics.
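A minimal sketch of the sizing rule this implies for buffers handed to the (unexported) block decoder; the element types are assumptions, since the decoder's signature is not documented here:

func newDecodeBuffers() (encoded []byte, decoded []int32) {
	// The decoder may touch entries past LUCENE41_BLOCK_SIZE, so a
	// slice of exactly BLOCK_SIZE values would panic with an
	// index-out-of-range error (Go's analogue of Java's
	// ArrayIndexOutOfBoundsException).
	encoded = make([]byte, MAX_ENCODED_SIZE) // worst-case encoded block
	decoded = make([]int32, MAX_DATA_SIZE)   // NOT just LUCENE41_BLOCK_SIZE
	return
}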
Functions ¶
Types ¶
type ForUtil ¶
type ForUtil struct {
// contains filtered or unexported fields
}
Encodes all values in the normal area with a fixed bit width, which is determined by the max value in this block.
func NewForUtilFrom ¶
Restore a ForUtil from a DataInput.
func NewForUtilInto ¶
func NewForUtilInto(acceptableOverheadRatio float32, out util.DataOutput) (*ForUtil, error)
Create a new ForUtil instance and save state into out.
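A sketch of the intended pairing: the writer persists the ForUtil configuration into the stream header, and the reader restores it from the same header. The 0 overhead ratio is an illustrative value (most compact packing), and the NewForUtilFrom signature is assumed from the description "Restore a ForUtil from a DataInput" above:

// Writer side: create a ForUtil and save its configuration into out.
func saveForUtil(out util.DataOutput) (*ForUtil, error) {
	// 0 requests the most compact packed encoding (illustrative
	// acceptableOverheadRatio, not a requirement of this package).
	return NewForUtilInto(0, out)
}

// Reader side: restore the matching ForUtil from the same header.
func loadForUtil(in util.DataInput) (*ForUtil, error) {
	return NewForUtilFrom(in)
}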
type IndexOutput ¶
type Lucene41PostingsFormat ¶
type Lucene41PostingsFormat struct {
// contains filtered or unexported fields
}
func NewLucene41PostingsFormat ¶
func NewLucene41PostingsFormat() *Lucene41PostingsFormat
Creates Lucene41PostingsFormat with default settings.
func NewLucene41PostingsFormatWith ¶
func NewLucene41PostingsFormatWith(minTermBlockSize, maxTermBlockSize int) *Lucene41PostingsFormat
Creates Lucene41PostingsFormat with custom values for minBlockSize and maxBlockSize passed to the block terms dictionary.
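For example (25 and 48 are Lucene's usual BlockTree defaults, used here purely as illustrative values):

func exampleFormats() {
	pf := NewLucene41PostingsFormat() // stock sizing

	// Custom sizing: each on-disk block of the terms dictionary
	// holds between 25 and 48 entries.
	custom := NewLucene41PostingsFormatWith(25, 48)
	_, _ = pf, custom
}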
func (*Lucene41PostingsFormat) FieldsConsumer ¶
func (f *Lucene41PostingsFormat) FieldsConsumer(state *SegmentWriteState) (FieldsConsumer, error)
func (*Lucene41PostingsFormat) FieldsProducer ¶
func (f *Lucene41PostingsFormat) FieldsProducer(state SegmentReadState) (FieldsProducer, error)
func (*Lucene41PostingsFormat) Name ¶
func (f *Lucene41PostingsFormat) Name() string
func (*Lucene41PostingsFormat) String ¶
func (f *Lucene41PostingsFormat) String() string
type Lucene41PostingsReader ¶
type Lucene41PostingsReader struct {
// contains filtered or unexported fields
}
Concrete type that reads the docId (and optionally freq, pos, offset, payload) lists written in this postings format.
func (*Lucene41PostingsReader) Close ¶
func (r *Lucene41PostingsReader) Close() error
func (*Lucene41PostingsReader) DecodeTerm ¶
func (*Lucene41PostingsReader) Docs ¶
func (r *Lucene41PostingsReader) Docs(fieldInfo *FieldInfo, termState *BlockTermState, liveDocs util.Bits, reuse DocsEnum, flags int) (de DocsEnum, err error)
func (*Lucene41PostingsReader) Init ¶
func (r *Lucene41PostingsReader) Init(termsIn store.IndexInput) error
func (*Lucene41PostingsReader) NewTermState ¶
func (r *Lucene41PostingsReader) NewTermState() *BlockTermState
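A sketch of the reader lifecycle these methods suggest: Init consumes the codec header of the terms dictionary, and Docs materializes a DocsEnum for one term. Here ts is assumed to be a term state previously filled via DecodeTerm, and flags=0 stands for "docIDs only", since the flag constants are not spelled out on this page:

func readDocs(r *Lucene41PostingsReader, termsIn store.IndexInput,
	fi *FieldInfo, ts *BlockTermState, live util.Bits) error {

	// Consume and verify the codec header of the terms dictionary.
	if err := r.Init(termsIn); err != nil {
		return err
	}
	// reuse=nil allocates a fresh enum; flags=0 requests docIDs only
	// (assumption about the flag encoding).
	de, err := r.Docs(fi, ts, live, nil, 0)
	if err != nil {
		return err
	}
	_ = de // iterate with the DocsEnum API (elided here)
	return nil
}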
type Lucene41PostingsWriter ¶
type Lucene41PostingsWriter struct {
// contains filtered or unexported fields
}
Concrete type that writes the docId (and optionally freq, pos, offset, payload) lists in this postings format.
The postings list for each term is stored separately.
func (*Lucene41PostingsWriter) AddPosition ¶
func (w *Lucene41PostingsWriter) AddPosition(position int, payload []byte, startOffset, endOffset int) error
Adds a new position and payload.
func (*Lucene41PostingsWriter) Close ¶
func (w *Lucene41PostingsWriter) Close() (err error)
func (*Lucene41PostingsWriter) EncodeTerm ¶
func (w *Lucene41PostingsWriter) EncodeTerm(longs []int64, out util.DataOutput, fieldInfo *FieldInfo, _state *BlockTermState, absolute bool) (err error)
func (*Lucene41PostingsWriter) FinishDoc ¶
func (w *Lucene41PostingsWriter) FinishDoc() error
func (*Lucene41PostingsWriter) FinishTerm ¶
func (w *Lucene41PostingsWriter) FinishTerm(_state *BlockTermState) error
Called when we are done adding docs to this term.
func (*Lucene41PostingsWriter) Init ¶
func (w *Lucene41PostingsWriter) Init(termsOut store.IndexOutput) error
func (*Lucene41PostingsWriter) NewTermState ¶
func (w *Lucene41PostingsWriter) NewTermState() *BlockTermState
func (*Lucene41PostingsWriter) SetField ¶
func (w *Lucene41PostingsWriter) SetField(fieldInfo *FieldInfo) int
func (*Lucene41PostingsWriter) StartDoc ¶
func (w *Lucene41PostingsWriter) StartDoc(docId, termDocFreq int) error
func (*Lucene41PostingsWriter) StartTerm ¶
func (w *Lucene41PostingsWriter) StartTerm() error
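The methods above form a strict per-term protocol: SetField once per field, then StartTerm / StartDoc / AddPosition... / FinishDoc / FinishTerm. A sketch of one term occurring twice in one document; the docID, positions, and the "-1 offsets, nil payload mean absent" convention are illustrative assumptions:

func writeOneTerm(w *Lucene41PostingsWriter, fi *FieldInfo) error {
	w.SetField(fi) // select which streams (doc/pos/pay) this field uses

	if err := w.StartTerm(); err != nil {
		return err
	}
	// One document (docID 5) in which the term occurs twice.
	if err := w.StartDoc(5, 2); err != nil {
		return err
	}
	if err := w.AddPosition(3, nil, -1, -1); err != nil {
		return err
	}
	if err := w.AddPosition(9, nil, -1, -1); err != nil {
		return err
	}
	if err := w.FinishDoc(); err != nil {
		return err
	}
	ts := w.NewTermState()
	return w.FinishTerm(ts) // records this term's file pointers/stats
}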
type Lucene41StoredFieldsFormat ¶
type Lucene41StoredFieldsFormat struct {
*compressing.CompressingStoredFieldsFormat
}
Lucene 4.1 stored fields format.
Principle ¶
This StoredFieldsFormat compresses blocks of 16KB of documents in order to improve the compression ratio compared to document-level compression. It uses the LZ4 compression algorithm, which is fast to compress and very fast to decompress data. Although the compression method that is used focuses more on speed than on compression ratio, it should provide interesting compression ratios for redundant inputs (such as log files, HTML or plain text).
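A schematic of the buffering policy just described: documents accumulate in memory until the buffer reaches 16KB, then the whole chunk is compressed at once. Everything here is a stand-in; the real compressor lives in the compressing sub-package:

const chunkSize = 16 << 10 // the 16KB trigger described above

type chunkBuffer struct {
	pending []byte // serialized documents awaiting compression
	numDocs int
}

func (c *chunkBuffer) add(serializedDoc []byte) {
	c.pending = append(c.pending, serializedDoc...)
	c.numDocs++
	if len(c.pending) >= chunkSize {
		c.flush()
	}
}

func (c *chunkBuffer) flush() {
	// Real code first writes chunk metadata (doc base, field counts,
	// lengths), then the LZ4-compressed bytes.
	_ = lz4Compress(c.pending)
	c.pending = c.pending[:0]
	c.numDocs = 0
}

func lz4Compress(b []byte) []byte { return b } // placeholder, not real LZ4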
File formats ¶
Stored fields are represented by two files:
1. field_data
A fields data file (extension .fdt). This file stores a compact representation of documents in compressed blocks of 16KB or more. When writing a segment, documents are appended to an in-memory []byte buffer. When its size reaches 16KB or more, some metadata about the documents is flushed to disk, immediately followed by a compressed representation of the buffer using the [LZ4](http://code.google.com/p/lz4/) [compression format](http://fastcompression.blogspot.ru/2011/05/lz4-explained.html).
Here is a more detailed description of the field data file format:
- FieldData (.fdt) --> <Header>, PackedIntsVersion, <Chunk>^ChunkCount
- Header --> CodecHeader
- PackedIntsVersion --> PackedInts.VERSION_CURRENT as a VInt
- ChunkCount is not known in advance and is the number of chunks necessary to store all documents of the segment
- Chunk --> DocBase, ChunkDocs, DocFieldCounts, DocLengths, <CompressedDocs>
- DocBase --> the ID of the first document of the chunk as a VInt
- ChunkDocs --> the number of documents in the chunk as a VInt
- DocFieldCounts --> the number of stored fields of every document in the chunk, encoded as follows (see the decoding sketch after this list):
- if ChunkDocs=1, the unique value is encoded as a VInt
- else read VInt (let's call it bitsRequired)
- if bitsRequired is 0 then all values are equal, and the common value is the following VInt
- else bitsRequired is the number of bits required to store any value, and values are stored in a packed array where every value is stored on exactly bitsRequired bits
- DocLengths --> the lengths of all documents in the chunk, encoded with the same method as DocFieldCounts
- CompressedDocs --> a compressed representation of <Docs> using the LZ4 compression format
- Docs --> <Doc>^ChunkDocs
- Doc --> <FieldNumAndType, Value>^DocFieldCount
- FieldNumAndType --> a VLong, whose 3 last bits are Type and other bits are FieldNum
- Type -->
- 0: Value is string
- 1: Value is BinaryValue
- 2: Value is int
- 3: Value is float32
- 4: Value is int64
- 5: Value is float64
- 6, 7: unused
- FieldNum --> an ID of the field
- Value --> string | BinaryValue | int | float32 | int64 | float64, depending on Type
- BinaryValue --> ValueLength <Byte>^ValueLength
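The decoding sketch referenced above, covering the DocFieldCounts/DocLengths scheme (both use the same rule). ReadVInt is assumed to mirror Lucene's DataInput.readVInt; the packed-array branch is left as a comment, since the PackedInts reader is not documented on this page:

func readChunkValues(in util.DataInput, chunkDocs int) ([]int32, error) {
	if chunkDocs == 1 {
		// Single document: the unique value is one VInt.
		v, err := in.ReadVInt()
		if err != nil {
			return nil, err
		}
		return []int32{v}, nil
	}
	bitsRequired, err := in.ReadVInt()
	if err != nil {
		return nil, err
	}
	values := make([]int32, chunkDocs)
	if bitsRequired == 0 {
		// All values equal: the common value follows as a VInt.
		common, err := in.ReadVInt()
		if err != nil {
			return nil, err
		}
		for i := range values {
			values[i] = common
		}
		return values, nil
	}
	// bitsRequired > 0: read chunkDocs values, each stored on exactly
	// bitsRequired bits in a packed array (reader elided).
	return values, nil
}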
Notes ¶
- If documents are larger than 16KB then chunks will likely contain only one document. However, documents can never spread across several chunks (all fields of a single document are in the same chunk).
- When at least one document in a chunk is large enough so that the chunk is larger than 32KB, the chunk will actually be compressed in several LZ4 blocks of 16KB. This allows StoredFieldsVisitors which are only interested in the first fields of a document to not have to decompress 10MB of data if the document is 10MB, but only 16KB.
- Given that the original lengths are written in the metadata of the chunk, the decompressor can leverage this information to stop decoding as soon as enough data has been decompressed.
- In case documents are incompressible, CompressedDocs will be less than 0.5% larger than Docs.
2. field_index
A fields index file (extension .fdx).
- FieldsIndex (.fdx) --> <Header>, <ChunkIndex>
- Header --> CodecHeader
- ChunkIndex: See CompressingStoredFieldsIndexWriter
Known limitations ¶
This StoredFieldsFormat does not support individual documents larger than (2^31 - 2^14) bytes. In case this is a problem, you should use another format, such as Lucene40StoredFieldsFormat.
func NewLucene41StoredFieldsFormat ¶
func NewLucene41StoredFieldsFormat() *Lucene41StoredFieldsFormat
type SkipWriter ¶
type SkipWriter struct {
	*store.MultiLevelSkipListWriter
	// contains filtered or unexported fields
}
func NewSkipWriter ¶
func NewSkipWriter(maxSkipLevels, blockSize, docCount int, docOut, posOut, payOut store.IndexOutput) *SkipWriter
func (*SkipWriter) BufferSkip ¶
func (w *SkipWriter) BufferSkip(doc, numDocs int, posFP, payFP int64, posBufferUpto, payloadByteUpto int) error
Sets the values for the current skip data.
func (*SkipWriter) ResetSkip ¶
func (w *SkipWriter) ResetSkip()
func (*SkipWriter) SetField ¶
func (w *SkipWriter) SetField(fieldHasPositions, fieldHasOffsets, fieldHasPayloads bool)
func (*SkipWriter) WriteSkipData ¶
func (w *SkipWriter) WriteSkipData(level int, skipBuffer store.IndexOutput) error
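A sketch of how the postings writer presumably drives SkipWriter while emitting a long postings list: ResetSkip once per term, then one BufferSkip call after each completed block of LUCENE41_BLOCK_SIZE documents. The zero file pointers and buffer offsets are placeholders for the writer's real .pos/.pay positions:

func bufferSkips(sw *SkipWriter, docIDs []int) error {
	sw.ResetSkip() // start a fresh skip tower for this term
	for n, doc := range docIDs {
		// After every complete 128-doc block, buffer one skip entry
		// recording the doc and the current stream positions.
		if (n+1)%LUCENE41_BLOCK_SIZE == 0 {
			if err := sw.BufferSkip(doc, n+1, 0, 0, 0, 0); err != nil {
				return err
			}
		}
	}
	return nil
}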