doclib

package

v0.0.0 Latest Latest Go to latest Published: Oct 28, 2019 License: MIT Imports: 31 Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/papercutsoftware/pdfsearch

Links

Open Source Insights

README ¶

doclib

doclib implements the bleve + unidoc interfaces

Documentation ¶

Overview ¶

* This source implements the main function IndexPdfReaders().
* IndexPdfFiles() is a convenience function that opens files and calls IndexPdfReaders().

* This source file implements the main doclib function IndexPdfReaders().
* IndexPdfFiles() is a convenience function that opens files and calls IndexPdfReaders().

Index ¶

Constants
Variables
func ExportBleveMem(index bleve.Index) ([]byte, error)
func ExtractPageTextMarks(page *model.PdfPage) (string, *extractor.TextMarkArray, error)
func ImportBleveMem(data []byte) (bleve.Index, error)
func PageSizePt(page *model.PdfPage) (width, height float64, err error)
func PdfOpenFile(inPath string) (*model.PdfReader, error)
func PdfOpenFileLazy(inPath string) (*os.File, *model.PdfReader, error)
func PdfOpenReader(rs io.ReadSeeker, lazy bool) (*model.PdfReader, error)
func ProcessPDFPagesFile(inPath string, processPage func(pageNum uint32, page *model.PdfPage) error) error
func ProcessPDFPagesReader(inPath string, rs io.ReadSeeker, ...) error
type BlevePdf
- func BlevePdfFromHIPDs(hipds []serial.HashIndexPathDoc) (BlevePdf, error)
- func IndexPdfFilesOrReaders(pathList []string, rsList []io.ReadSeeker, persistDir string, forceCreate bool, ...) (*BlevePdf, bleve.Index, int, int, time.Duration, time.Duration, error)
- func IndexPdfFilesUsingReaders(pathList []string, persistDir string, forceCreate bool, report func(string)) (*BlevePdf, bleve.Index, int, int, time.Duration, time.Duration, error)
- func (blevePdf *BlevePdf) Equals(other *BlevePdf) bool
- func (blevePdf BlevePdf) Len() int
- func (blevePdf *BlevePdf) SearchBleveIndex(index bleve.Index, term0 string, maxResults int) (PdfMatchSet, error)
- func (blevePdf BlevePdf) String() string
- func (blevePdf BlevePdf) ToHIPDs() ([]serial.HashIndexPathDoc, error)
type DocPageText
type DocPositions
- func (docPos *DocPositions) AddDocPage(pageNum uint32, ppos PagePositions, text string) (uint32, error)
- func (docPos *DocPositions) Close() error
- func (docPos *DocPositions) Equals(e *DocPositions) bool
- func (docPos DocPositions) Len() int
- func (docPos *DocPositions) Save() error
- func (docPos DocPositions) String() string
type ExtractList
- func CreateExtractList(maxPages, maxPerPage int) *ExtractList
- func (l *ExtractList) AddRect(inPath string, pageNum uint32, r model.PdfRectangle)
- func (l *ExtractList) SaveOutputPdf(outPath string) error
- func (l ExtractList) String() string
type IDText
type PDFPageProcessor
- func CreatePDFPageProcessorFile(inPath string) (*PDFPageProcessor, error)
- func CreatePDFPageProcessorReader(inPath string, rs io.ReadSeeker) (*PDFPageProcessor, error)
- func (p *PDFPageProcessor) Close() error
- func (p PDFPageProcessor) NumPages() (uint32, error)
- func (p *PDFPageProcessor) Process(processPage func(pageNum uint32, page *model.PdfPage) error) (err error)
type PagePositions
- func PagePositionsFromTextMarks(textMarks *extractor.TextMarkArray) PagePositions
- func (ppos PagePositions) BBox(start, end uint32) (model.PdfRectangle, bool)
- func (ppos PagePositions) Empty() bool
- func (ppos PagePositions) Equals(epl PagePositions) bool
- func (ppos PagePositions) String() string
type PdfMatchSet
- func SearchPersistentPdfIndex(persistDir, term string, maxResults int) (PdfMatchSet, error)
- func (p PdfMatchSet) Best() PdfMatchSet
- func (p PdfMatchSet) Equals(q PdfMatchSet) bool
- func (s PdfMatchSet) Files() []string
- func (s PdfMatchSet) String() string
type PdfPageMatch
- func (p PdfPageMatch) String() string
type Phrase
type Span

Constants ¶

View Source

const (
	// BorderWidth is the width of rectangle sides in points
	BorderWidth = 1.0
	// ShadowWidth is the width of the shadow on the inside and outside of the rectangles
	ShadowWidth = 0.2
)

Variables ¶

View Source

var (
	// Debug can be set true to enable debug level logging.
	Debug bool
	// Trace can be set true to enable trace level logging.
	Trace bool
	// ExposeErrors can be set to true to not recover from errors in library functions.
	ExposeErrors bool
)

View Source

var CheckConsistency = false

CheckConsistency should be set true to regularly check the BlevePdf consistency.

View Source

var ErrNoMatch = errors.New("no match for hit")

ErrNoMatch indicates there was no match for a bleve hit. It is not a real error.

View Source

var ErrNoPositions = errors.New("no match for hit")

ErrNoPositions indicates that no text positions were found for a bleve hit. It is not a real error.

Functions ¶

func ExportBleveMem ¶

func ExportBleveMem(index bleve.Index) ([]byte, error)

ExportBleveMem serializes bleve index `index` to a byte slice.

func ExtractPageTextMarks ¶

func ExtractPageTextMarks(page *model.PdfPage) (string, *extractor.TextMarkArray, error)

ExtractPageTextMarks returns the extracted text and corresponding TextMarks on page `page`.

func ImportBleveMem ¶

func ImportBleveMem(data []byte) (bleve.Index, error)

ImportBleveMem deserializes `data` to a bleve.Index.

func PageSizePt ¶

func PageSizePt(page *model.PdfPage) (width, height float64, err error)

PageSizePt returns the width and height of `page` in points.

func PdfOpenFile ¶

func PdfOpenFile(inPath string) (*model.PdfReader, error)

PdfOpenFile opens PDF file `inPath` and attempts to handle null encryption schemes.

func PdfOpenFileLazy ¶

func PdfOpenFileLazy(inPath string) (*os.File, *model.PdfReader, error)

PdfOpenFileLazy opens PDF file `inPath` lazily and attempts to handle null encryption schemes. Caller must close the returned file handle if there are no errors.

func PdfOpenReader ¶

func PdfOpenReader(rs io.ReadSeeker, lazy bool) (*model.PdfReader, error)

PdfOpenReader opens the PDF file accessed by `rs` and attempts to handle null encryption schemes. If `lazy` is true, a lazy PDF reader is opened.

func ProcessPDFPagesFile ¶

func ProcessPDFPagesFile(inPath string, processPage func(pageNum uint32, page *model.PdfPage) error) error

ProcessPDFPagesFile runs `processPage` on every page in PDF file `inPath`. It is a convenience function.

func ProcessPDFPagesReader ¶

func ProcessPDFPagesReader(inPath string, rs io.ReadSeeker,
	processPage func(pageNum uint32, page *model.PdfPage) error) error

ProcessPDFPagesReader runs `processPage` on every page in PDF file opened in `rs`. It is a convenience function.

Types ¶

type BlevePdf ¶

type BlevePdf struct {
	// contains filtered or unexported fields
}

BlevePdf links a bleve index over texts to the PDF files that the texts were extracted from, using the hashDoc {file hash: DocPositions} map. For each PDF file, the DocPositions maps extracted text to the location of the text on the PDF page it was extracted from. A BlevePdf can be optionally saved to and retrieved from disk, in which case isMem() returns false. BlevePdf is intentionally opaque.

func BlevePdfFromHIPDs ¶

func BlevePdfFromHIPDs(hipds []serial.HashIndexPathDoc) (BlevePdf, error)

BlevePdfFromHIPDs creates a BlevePdf from its serialized form `hipds`. It is used to deserialize a BlevePdf. !@#$ Round trip test BlevePdfFromHIPDs + ToHIPDs

func IndexPdfFilesOrReaders ¶

func IndexPdfFilesOrReaders(pathList []string, rsList []io.ReadSeeker, persistDir string,
	forceCreate bool, report func(string)) (*BlevePdf, bleve.Index,
	int, int, time.Duration, time.Duration, error)

IndexPdfFilesOrReaders returns a BlevePdf and a bleve.Index over

the PDF contents referenced by the io.ReadSeeker's in `rsList` if `rsList` is not empty, or
the PDF filenames in `pathList` if `rsList` is empty.

If `persistDir` is empty, the index is stored in memory. If `persistDir` is not empty, the index is stored on disk in `persistDir`. `report` is a supplied function that is called to report progress. Returns: (blevePdf, index, numFiles, totalPages, dtPdf, dtBleve, err) where

blevePdf: mapping of a bleve index to PDF pages and text coordinates
index: a bleve index
numFiles: number of PDF files successfully indexed
totalPages: number of PDF pages successfully indexed
dtPdf: time spent building blevePdf
dtBleve: time spent building index
err: error, if one occurred

NOTE: If you have access to your PDF files then use `pathList` and set `rsList` to nil, as a long list of file handles may exhaust system resources.

func IndexPdfFilesUsingReaders ¶

func IndexPdfFilesUsingReaders(pathList []string, persistDir string, forceCreate bool,
	report func(string)) (*BlevePdf, bleve.Index, int, int, time.Duration, time.Duration, error)

IndexPdfFilesUsingReaders creates a bleve+BlevePdf index for `pathList`. If `persistDir` is not empty, the index is written to this directory. If `forceCreate` is true and `persistDir` is not empty, a new directory is always created; otherwise the existing bleve index will be appended to. `report` is a supplied function that is called to report progress. NOTE: This is for testing only. It doesn't make sense to access IndexPdfFilesOrReaders() with a list of opened files as this can exhaust available file handles.

func (*BlevePdf) Equals ¶

func (blevePdf *BlevePdf) Equals(other *BlevePdf) bool

Equals returns true if `blevePdf` contains the same information as `other`.

func (BlevePdf) Len ¶

func (blevePdf BlevePdf) Len() int

Len returns the number of documents in `blevePdf`.

func (*BlevePdf) SearchBleveIndex ¶

func (blevePdf *BlevePdf) SearchBleveIndex(index bleve.Index, term0 string, maxResults int) (
	PdfMatchSet, error)

SearchBleveIndex performs a bleve search on `index` for `term0` and returns up to `maxResults` matches. It maps the results to PDF page names, page numbers, line numbers and page locations using `blevePdf`.

func (BlevePdf) String ¶

func (blevePdf BlevePdf) String() string

String returns a string describing `blevePdf`.

func (BlevePdf) ToHIPDs ¶

func (blevePdf BlevePdf) ToHIPDs() ([]serial.HashIndexPathDoc, error)

ToHIPDs converts `blevePdf` to a serial.HashIndexPathDoc. blevePdf.Check() is run before saving to avoid empty serializations.

type DocPageText ¶

type DocPageText struct {
	DocIdx  uint64 // Doc index (0-offset) into BlevePdf.fileList .
	PageIdx uint32 // Page index (0-offset) into DocPositions.index .
	PageNum uint32 // Page number in PDF file (1-offset)
	Text    string // Extracted page text.
}

DocPageText contains doc:page indexes, the PDF page number and the text extracted from a PDF page.

type DocPositions ¶

type DocPositions struct {
	// contains filtered or unexported fields
}

DocPositions is used to link the per-document data in a bleve index to the PDF file that the data was extracted from. There is one DocPositions per PDF file.

func (*DocPositions) AddDocPage ¶

func (docPos *DocPositions) AddDocPage(pageNum uint32, ppos PagePositions, text string) (
	uint32, error)

AddDocPage adds a page with (1-offset) page number `pageNum` and contents `ppos` to `docPos`. It returns the page index, that can be used to access this page from ReadPagePositions() !@#$ Remove `text` param. ^^^ !@#$ ^^^

func (*DocPositions) Close ¶

func (docPos *DocPositions) Close() error

Close closes `docPos`'s open files if it is persistent.

func (*DocPositions) Equals ¶

func (docPos *DocPositions) Equals(e *DocPositions) bool

Equals returns true if `docPos` contains the same information as `e`.

func (DocPositions) Len ¶

func (docPos DocPositions) Len() int

Len returns the number of pages in `docPos`.

func (*DocPositions) Save ¶

func (docPos *DocPositions) Save() error

Save saves `docPos` to disk if it is persistent.

func (DocPositions) String ¶

func (docPos DocPositions) String() string

String returns a human readable string describing `docPos`.

type ExtractList ¶

type ExtractList struct {
	// contains filtered or unexported fields
}

ExtractList is a list of PDF file:page inputs that are to be marked up then combined in a specified order. If i is the (0-offset) ith page, then content is the contents to be added to this page:

	src := sources[i]
	content := contents[src.inPath][src.pageNum]

func CreateExtractList ¶

func CreateExtractList(maxPages, maxPerPage int) *ExtractList

CreateExtractList returns an empty *ExtractList with `maxPages` maximum number of pages and `maxPerPage` maximum rectangles per page.

func (*ExtractList) AddRect ¶

func (l *ExtractList) AddRect(inPath string, pageNum uint32, r model.PdfRectangle)

AddRect adds to `l`, instructions to draw rectangle `r` on (1-offset) page number `pageNum` of PDF file `inPath`

func (*ExtractList) SaveOutputPdf ¶

func (l *ExtractList) SaveOutputPdf(outPath string) error

SaveOutputPdf is called to markup a PDF file with the locations of text. `l` contains the input PDF names and the pages and coordinates to mark. The resulting PDF is written to `outPath`.

func (ExtractList) String ¶

func (l ExtractList) String() string

String returns a string describing `l`.

type IDText ¶

type IDText struct {
	// ID identifies the document + page index.
	ID string
	// Text is the text that bleve indexes.
	Text string
}

IDText is what bleve sees for each page of a PDF file.

type PDFPageProcessor ¶

type PDFPageProcessor struct {
	// contains filtered or unexported fields
}

PDFPageProcessor is used for processing a PDF file one page at a time. It is an opaque struct.

func CreatePDFPageProcessorFile ¶

func CreatePDFPageProcessorFile(inPath string) (*PDFPageProcessor, error)

CreatePDFPageProcessorFile creates a PDFPageProcessor for reading the PDF file `inPath`.

func CreatePDFPageProcessorReader ¶

func CreatePDFPageProcessorReader(inPath string, rs io.ReadSeeker) (*PDFPageProcessor, error)

CreatePDFPageProcessorReader creates a PDFPageProcessor for reading the PDF file referenced by `rs`. `inPath` is provided for logging only. It is expected to be the path referenced by `rs`.

func (*PDFPageProcessor) Close ¶

func (p *PDFPageProcessor) Close() error

Close closes file handles opened by CreatePDFPageProcessorFile.

func (PDFPageProcessor) NumPages ¶

func (p PDFPageProcessor) NumPages() (uint32, error)

NumPages returns the number of pages in the PDF file referenced by `p`.

func (*PDFPageProcessor) Process ¶

func (p *PDFPageProcessor) Process(processPage func(pageNum uint32, page *model.PdfPage) error) (
	err error)

Process runs `processPage` on every page in PDF file `p.inPath`. It can recover from errors in the libraries it calls if `ExposeErrors` is false.

type PagePositions ¶

type PagePositions struct {
	// contains filtered or unexported fields
}

PagePositions is used to link per-document data in a bleve index to the PDF file the data was extracted from. There is one PagePositions per PDF page. PagePositions stores the locations of text fragments on a page. The search index includes a binary copy of PagePositions, so our goal is to make PagePositions compact. !@#$ Which search index?

func PagePositionsFromTextMarks ¶

func PagePositionsFromTextMarks(textMarks *extractor.TextMarkArray) PagePositions

PagePositionsFromTextMarks converts extractor.TextMarkArray `textMarks` to a more compact PagePositions. We do this because PagePositions is stored in our index which we want to be small.

func (PagePositions) BBox ¶

func (ppos PagePositions) BBox(start, end uint32) (model.PdfRectangle, bool)

BBox returns a rectangle that bounds the text with offsets `start` and `end`. ofs: `start` <= ofs < `end` on the PDF page indexed by `ppos`. Caller must check that ppos.offsetBBoxes is not empty.

func (PagePositions) Empty ¶

func (ppos PagePositions) Empty() bool

Empty returns true if `ppos` has no entries.

func (PagePositions) Equals ¶

func (ppos PagePositions) Equals(epl PagePositions) bool

Equals returns true if `ppos` contains the same information as `epl`.

func (PagePositions) String ¶

func (ppos PagePositions) String() string

String returns a string describing PagePositions `ppos`.

type PdfMatchSet ¶

type PdfMatchSet struct {
	TotalMatches   int            // Total number of matches.
	SearchDuration time.Duration  // The time it took to perform the search.
	Matches        []PdfPageMatch // The per-page matches which may come from different PDFs.
}

PdfMatchSet is the result of a search over a PdfIndex.

func SearchPersistentPdfIndex ¶

func SearchPersistentPdfIndex(persistDir, term string, maxResults int) (PdfMatchSet, error)

SearchPersistentPdfIndex performs a bleve search on the persistent index in `persistDir`/bleve for `term` and returns up to `maxResults` matches. It maps the results to PDF page names, page numbers, line numbers and page locations using the BlevePdf that was saved in directory `persistDir` by IndexPdfReaders().

func (PdfMatchSet) Best ¶

func (p PdfMatchSet) Best() PdfMatchSet

Best returns a copy of `p` trimmed to the results with the highest score.

func (PdfMatchSet) Equals ¶

func (p PdfMatchSet) Equals(q PdfMatchSet) bool

Equals returns true if `p` contains the same results as `q`.

func (PdfMatchSet) Files ¶

func (s PdfMatchSet) Files() []string

Files returns the PDF file names in PdfMatchSet `s`. These are all the PDFs that contained at least one match of the search term.

func (PdfMatchSet) String ¶

func (s PdfMatchSet) String() string

String returns a human readable description of `s`.

type PdfPageMatch ¶

type PdfPageMatch struct {
	InPath        string   // Path of the PDF file that was matched. (A name stored in the index.)
	PageNum       uint32   // 1-offset page number of the PDF page containing the matched text.
	LineNums      []int    // 1-offset line number of the matched text within the extracted page text.
	Lines         []string // The contents of the line containing the matched text.
	PagePositions          // This is used to find the bounding box of the match text on the PDF page.
	// contains filtered or unexported fields
}

PdfPageMatch describes the search results for a PDF page returned from a search over a PDF index. It is the analog of a bleve search.DocumentMatch.

func (PdfPageMatch) String ¶

func (p PdfPageMatch) String() string

type Phrase ¶

type Phrase struct {
	// contains filtered or unexported fields
}

type Span ¶

type Span struct {
	Start uint32  // Offset of the start of the bleve match in the page.
	End   uint32  // Offset of the end of the bleve match in the page.
	Score float64 // Score for this match
}

Span gives the offsets in extracted text that span a phrase.

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL