textractor

package module
v0.0.4 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jan 1, 2024 License: MIT Imports: 10 Imported by: 1

README

📄 go-textractor

Build Status Go Reference goreportcard codecov

Amazon Textract response parser written in Go.

Installation

Use Go modules to include go-textractor in your project:

go get github.com/hupe1980/go-textractor

Usage

package main

import (
	"context"
	"fmt"
	"io"
	"log"
	"os"

	"github.com/aws/aws-sdk-go-v2/config"
	"github.com/aws/aws-sdk-go-v2/service/textract"
	"github.com/aws/aws-sdk-go-v2/service/textract/types"
	"github.com/hupe1980/go-textractor"
)

// main demonstrates the typical go-textractor workflow: read a local PDF,
// send it to the Textract AnalyzeDocument API with the TABLES and FORMS
// features enabled, parse the response with textractor.ParseDocumentAPIOutput,
// and print the lines, words, table cells, and key/value pairs of every page.
//
// Any failure terminates the program via log.Fatal.
func main() {
	ctx := context.Background()

	file, err := os.Open("examples/analyze_document/testfile.pdf")
	if err != nil {
		log.Fatal(err)
	}

	// NOTE: log.Fatal exits the process without running deferred calls; that is
	// acceptable here because the OS reclaims the descriptor on exit.
	defer file.Close()

	b, err := io.ReadAll(file)
	if err != nil {
		log.Fatal(err)
	}

	// Fail early on a broken AWS configuration (missing region, unreadable
	// profile, ...) instead of discarding the error and failing later inside
	// the API call with a less helpful message.
	cfg, err := config.LoadDefaultConfig(ctx)
	if err != nil {
		log.Fatal(err)
	}

	client := textract.NewFromConfig(cfg)

	output, err := client.AnalyzeDocument(ctx, &textract.AnalyzeDocumentInput{
		Document: &types.Document{
			Bytes: b,
		},
		FeatureTypes: []types.FeatureType{
			types.FeatureTypeTables, types.FeatureTypeForms,
		},
	})
	if err != nil {
		log.Fatal(err)
	}

	doc, err := textractor.ParseDocumentAPIOutput(&textractor.DocumentAPIOutput{
		DocumentMetadata: output.DocumentMetadata,
		Blocks:           output.Blocks,
	})
	if err != nil {
		log.Fatal(err)
	}

	// Iterate over elements in the document
	for _, p := range doc.Pages() {
		// Print lines and words
		for _, l := range p.Lines() {
			fmt.Printf("Line: %s (%f)\n", l.Text(), l.Confidence())
			for _, w := range l.Words() {
				fmt.Printf("Word: %s (%f)\n", w.Text(), w.Confidence())
			}
		}

		// Print tables
		for _, t := range p.Tables() {
			for r, row := range t.Rows() {
				for c, cell := range row.Cells() {
					fmt.Printf("Table[%d][%d] = %s (%f)\n", r, c, cell.Text(), cell.Confidence())
				}
			}
		}

		// Print key values
		for _, kv := range p.KeyValues() {
			fmt.Printf("Key: %s, Value: %s\n", kv.Key(), kv.Value())
		}
	}
}

For more example usage, see examples.

Contributing

Contributions are welcome! Feel free to open an issue or submit a pull request for any improvements or new features you would like to see.

License

This project is licensed under the MIT License. See the LICENSE file for details.

Documentation

Index

Constants

This section is empty.

Variables

View Source
// DefaultLinerizationOptions is a TextLinearizationOptions value populated with
// default settings: tables and key/values are linearized, no layout type is
// hidden, most prefixes/suffixes are empty, and selection elements render as
// "[X]" / "[ ]".
//
// The identifier is spelled "Linerization" (sic); it is part of the exported
// API, so the spelling must be kept for backward compatibility.
//
// NOTE(review): presumably this is the baseline that the optFns arguments of
// the Text/TextAndWords methods modify — confirm against the implementation.
var DefaultLinerizationOptions = TextLinearizationOptions{
	LinearizeTables:                true,
	LinearizeKeyValues:             true,
	RemoveNewLinesInLeafElements:   true,
	MaxNumberOfConsecutiveNewLines: 2,
	HideHeaderLayout:               false,
	HideFooterLayout:               false,
	HideFigureLayout:               false,
	HidePageNumberLayout:           false,
	PageNumberPrefix:               "",
	PageNumberSuffix:               "",
	SameParagraphSeparator:         " ",
	LayoutElementSeparator:         "\n\n",
	ListElementSeparator:           "\n",
	ListLayoutPrefix:               "",
	ListLayoutSuffix:               "",
	ListElementPrefix:              "",
	ListElementSuffix:              "",
	TitlePrefix:                    "",
	TitleSuffix:                    "",
	TableLayoutPrefix:              "\n\n",
	TableLayoutSuffix:              "\n",
	TableRemoveColumnHeaders:       false,
	TableLinearizationFormat:       "plaintext",
	TableTabulateFormat:            "github",
	TableMinTableWords:             0,
	TableColumnSeparator:           "\t",
	TablePrefix:                    "",
	TableSuffix:                    "",
	TableRowSeparator:              "\n",
	TableRowPrefix:                 "",
	TableRowSuffix:                 "",
	TableCellPrefix:                "",
	TableCellSuffix:                "",
	SectionHeaderPrefix:            "",
	SectionHeaderSuffix:            "",
	TextPrefix:                     "",
	TextSuffix:                     "",
	KeyValueLayoutPrefix:           "\n\n",
	KeyValueLayoutSuffix:           "",
	KeyValuePrefix:                 "",
	KeyValueSuffix:                 "",
	ValuePrefix:                    "",
	ValueSuffix:                    "",
	SelectionElementSelected:       "[X]",
	SelectionElementNotSelected:    "[ ]",
	HeuristicHTolerance:            0.3,
	HeuristicLineBreakThreshold:    0.9,
	HeuristicOverlapRatio:          0.5,
	SignatureToken:                 "[SIGNATURE]",
	AddPrefixesAndSuffixesAsWords:  false,
	AddPrefixesAndSuffixesInText:   true,
}

Functions

This section is empty.

Types

type AnalyzeIDOutput added in v0.0.4

// AnalyzeIDOutput bundles the document metadata and the identity documents
// that ParseAnalyzeIDOutput takes as input. Both fields (un)marshal under
// their own names as JSON keys.
//
// NOTE(review): presumably mirrors the Textract AnalyzeID response shape — confirm.
type AnalyzeIDOutput struct {
	DocumentMetadata  *types.DocumentMetadata  `json:"DocumentMetadata"`
	IdentityDocuments []types.IdentityDocument `json:"IdentityDocuments"`
}

type BoundingBox

type BoundingBox struct {
	// contains filtered or unexported fields
}

func NewEnclosingBoundingBox added in v0.0.4

func NewEnclosingBoundingBox[T BoundingBoxAccessor](accessors ...T) *BoundingBox

NewEnclosingBoundingBox returns a new bounding box that represents the union of multiple bounding boxes.

func (*BoundingBox) Area added in v0.0.4

func (bb *BoundingBox) Area() float32

Area calculates and returns the area of the bounding box. If either the width or height of the bounding box is less than zero, the area is considered zero to prevent negative area values.

func (*BoundingBox) Bottom

func (bb *BoundingBox) Bottom() float32

Bottom returns the bottom coordinate of the bounding box.

func (*BoundingBox) Height

func (bb *BoundingBox) Height() float32

func (*BoundingBox) HorizontalCenter

func (bb *BoundingBox) HorizontalCenter() float32

HorizontalCenter returns the horizontal center coordinate of the bounding box.

func (*BoundingBox) Intersection

func (bb *BoundingBox) Intersection(other *BoundingBox) *BoundingBox

Intersection returns a new bounding box that represents the intersection of two bounding boxes.

func (*BoundingBox) Left

func (bb *BoundingBox) Left() float32

func (*BoundingBox) Right

func (bb *BoundingBox) Right() float32

Right returns the right coordinate of the bounding box.

func (*BoundingBox) String added in v0.0.2

func (bb *BoundingBox) String() string

String returns a string representation of the bounding box.

func (*BoundingBox) Top

func (bb *BoundingBox) Top() float32

func (*BoundingBox) VerticalCenter

func (bb *BoundingBox) VerticalCenter() float32

VerticalCenter returns the vertical center coordinate of the bounding box.

func (*BoundingBox) Width

func (bb *BoundingBox) Width() float32

type BoundingBoxAccessor added in v0.0.4

// BoundingBoxAccessor is implemented by any element that can report its
// bounding box. It is the type constraint of NewEnclosingBoundingBox.
type BoundingBoxAccessor interface {
	// BoundingBox returns the element's bounding box.
	BoundingBox() *BoundingBox
}

type Document

type Document struct {
	// contains filtered or unexported fields
}

func ParseDocumentAPIOutput added in v0.0.4

func ParseDocumentAPIOutput(output *DocumentAPIOutput) (*Document, error)

func (*Document) KeyValues added in v0.0.4

func (d *Document) KeyValues() []*KeyValue

func (*Document) Lines added in v0.0.4

func (d *Document) Lines() []*Line

func (*Document) Pages

func (d *Document) Pages() []*Page

func (*Document) Signatures added in v0.0.4

func (d *Document) Signatures() []*Signature

func (*Document) Tables added in v0.0.4

func (d *Document) Tables() []*Table

func (*Document) Text added in v0.0.4

func (d *Document) Text(optFns ...func(*TextLinearizationOptions)) string

func (*Document) Words added in v0.0.4

func (d *Document) Words() []*Word

type DocumentAPIOutput added in v0.0.4

// DocumentAPIOutput bundles the document metadata and the blocks that
// ParseDocumentAPIOutput takes as input. It can be filled directly from the
// DocumentMetadata and Blocks fields of a Textract AnalyzeDocument response,
// and both fields (un)marshal under their own names as JSON keys.
type DocumentAPIOutput struct {
	DocumentMetadata *types.DocumentMetadata `json:"DocumentMetadata"`
	Blocks           []types.Block           `json:"Blocks"`
}

type IdentityDocument added in v0.0.2

type IdentityDocument struct {
	// contains filtered or unexported fields
}

func ParseAnalyzeIDOutput added in v0.0.4

func ParseAnalyzeIDOutput(output *AnalyzeIDOutput) ([]*IdentityDocument, error)

func (*IdentityDocument) Document added in v0.0.4

func (id *IdentityDocument) Document() *Document

func (*IdentityDocument) FieldByType added in v0.0.3

func (*IdentityDocument) Fields added in v0.0.3

func (id *IdentityDocument) Fields() []*IdentityDocumentField

func (*IdentityDocument) IdentityDocumentType added in v0.0.4

func (id *IdentityDocument) IdentityDocumentType() IdentityDocumentType

type IdentityDocumentField added in v0.0.3

type IdentityDocumentField struct {
	// contains filtered or unexported fields
}

IdentityDocumentField represents a field extracted from an identity document by Textract.

func (*IdentityDocumentField) Confidence added in v0.0.3

func (idf *IdentityDocumentField) Confidence() float32

Confidence returns the confidence score associated with the field extraction.

func (*IdentityDocumentField) FieldType added in v0.0.4

FieldType returns the type of the identity document field.

func (*IdentityDocumentField) IsNormalized added in v0.0.3

func (idf *IdentityDocumentField) IsNormalized() bool

IsNormalized checks if the field value is normalized.

func (*IdentityDocumentField) NormalizedValue added in v0.0.3

NormalizedValue returns the normalized value of the identity document field.

func (*IdentityDocumentField) Value added in v0.0.3

func (idf *IdentityDocumentField) Value() string

Value returns the value of the identity document field.

type IdentityDocumentFieldType added in v0.0.3

type IdentityDocumentFieldType string

IdentityDocumentFieldType represents the type of fields in an identity document.

// Known IdentityDocumentFieldType values.
//
// NOTE(review): the values of IdentityDocumentFieldTypeSuffix ("Suffix") and
// IdentityDocumentFieldTypeOther ("Other") are mixed-case while every other
// value is upper snake case. Verify against the keys Textract AnalyzeID
// actually returns before relying on exact string comparison with them.
const (
	IdentityDocumentFieldTypeFirstName        IdentityDocumentFieldType = "FIRST_NAME"
	IdentityDocumentFieldTypeLastName         IdentityDocumentFieldType = "LAST_NAME"
	IdentityDocumentFieldTypeMiddleName       IdentityDocumentFieldType = "MIDDLE_NAME"
	IdentityDocumentFieldTypeSuffix           IdentityDocumentFieldType = "Suffix"
	IdentityDocumentFieldTypeCityInAddress    IdentityDocumentFieldType = "CITY_IN_ADDRESS"
	IdentityDocumentFieldTypeZipCodeInAddress IdentityDocumentFieldType = "ZIP_CODE_IN_ADDRESS"
	IdentityDocumentFieldTypeStateInAddress   IdentityDocumentFieldType = "STATE_IN_ADDRESS"
	IdentityDocumentFieldTypeStateName        IdentityDocumentFieldType = "STATE_NAME"
	IdentityDocumentFieldTypeDocumentNumber   IdentityDocumentFieldType = "DOCUMENT_NUMBER"
	IdentityDocumentFieldTypeExpirationDate   IdentityDocumentFieldType = "EXPIRATION_DATE"
	IdentityDocumentFieldTypeDateOfBirth      IdentityDocumentFieldType = "DATE_OF_BIRTH"
	IdentityDocumentFieldTypeDateOfIssue      IdentityDocumentFieldType = "DATE_OF_ISSUE"
	IdentityDocumentFieldTypeIDType           IdentityDocumentFieldType = "ID_TYPE"
	IdentityDocumentFieldTypeEndorsements     IdentityDocumentFieldType = "ENDORSEMENTS"
	IdentityDocumentFieldTypeVeteran          IdentityDocumentFieldType = "VETERAN"
	IdentityDocumentFieldTypeRestrictions     IdentityDocumentFieldType = "RESTRICTIONS"
	IdentityDocumentFieldTypeClass            IdentityDocumentFieldType = "CLASS"
	IdentityDocumentFieldTypeAddress          IdentityDocumentFieldType = "ADDRESS"
	IdentityDocumentFieldTypeCounty           IdentityDocumentFieldType = "COUNTY"
	IdentityDocumentFieldTypePlaceOfBirth     IdentityDocumentFieldType = "PLACE_OF_BIRTH"
	IdentityDocumentFieldTypeOther            IdentityDocumentFieldType = "Other"
)

type IdentityDocumentType added in v0.0.3

type IdentityDocumentType string

IdentityDocumentType represents the type of an identity document.

// Known IdentityDocumentType values.
//
// NOTE(review): IdentityDocumentTypeDrivingLicense carries the literal
// "DRIVER LICENSE FRONT"; no constant for the back side of a license is
// declared here — confirm whether such documents are reported as OTHER.
const (
	IdentityDocumentTypeDrivingLicense IdentityDocumentType = "DRIVER LICENSE FRONT"
	IdentityDocumentTypePassport       IdentityDocumentType = "PASSPORT"
	IdentityDocumentTypeOther          IdentityDocumentType = "OTHER"
)

type Key added in v0.0.4

type Key struct {
	// contains filtered or unexported fields
}

func (*Key) BlockType added in v0.0.4

func (b *Key) BlockType() types.BlockType

BlockType returns the type of the block.

func (*Key) BoundingBox added in v0.0.4

func (b *Key) BoundingBox() *BoundingBox

BoundingBox returns the bounding box information of the block.

func (*Key) Confidence added in v0.0.4

func (b *Key) Confidence() float32

Confidence returns the confidence of the block.

func (*Key) ID added in v0.0.4

func (b *Key) ID() string

ID returns the identifier of the block.

func (*Key) PageNumber added in v0.0.4

func (b *Key) PageNumber() int

PageNumber returns the page number associated with the block.

func (*Key) Polygon added in v0.0.4

func (b *Key) Polygon() []*Point

Polygon returns the polygon information of the block.

func (*Key) Raw added in v0.0.4

func (b *Key) Raw() types.Block

Raw returns the raw block data.

func (*Key) String added in v0.0.4

func (k *Key) String() string

String returns the string representation of the key.

func (*Key) Text added in v0.0.4

func (k *Key) Text() string

func (*Key) Words added in v0.0.4

func (k *Key) Words() []*Word

type KeyValue added in v0.0.4

type KeyValue struct {
	// contains filtered or unexported fields
}

func (*KeyValue) BlockType added in v0.0.4

func (b *KeyValue) BlockType() types.BlockType

BlockType returns the type of the block.

func (*KeyValue) BoundingBox added in v0.0.4

func (kv *KeyValue) BoundingBox() *BoundingBox

func (*KeyValue) Confidence added in v0.0.4

func (kv *KeyValue) Confidence() float32

Confidence calculates the confidence score for a key value.

func (*KeyValue) ID added in v0.0.4

func (b *KeyValue) ID() string

ID returns the identifier of the block.

func (*KeyValue) Key added in v0.0.4

func (kv *KeyValue) Key() *Key

func (*KeyValue) PageNumber added in v0.0.4

func (b *KeyValue) PageNumber() int

PageNumber returns the page number associated with the block.

func (*KeyValue) Polygon added in v0.0.4

func (kv *KeyValue) Polygon() []*Point

func (*KeyValue) Raw added in v0.0.4

func (b *KeyValue) Raw() types.Block

Raw returns the raw block data.

func (*KeyValue) TextAndWords added in v0.0.4

func (kv *KeyValue) TextAndWords(optFns ...func(*TextLinearizationOptions)) (string, []*Word)

func (*KeyValue) Value added in v0.0.4

func (kv *KeyValue) Value() *Value

func (*KeyValue) Words added in v0.0.4

func (kv *KeyValue) Words() []*Word

type Layout added in v0.0.4

type Layout struct {
	// contains filtered or unexported fields
}

func (*Layout) AddChildren added in v0.0.4

func (l *Layout) AddChildren(children ...LayoutChild)

func (*Layout) BlockType added in v0.0.4

func (b *Layout) BlockType() types.BlockType

BlockType returns the type of the block.

func (*Layout) BoundingBox added in v0.0.4

func (b *Layout) BoundingBox() *BoundingBox

BoundingBox returns the bounding box information of the block.

func (*Layout) Confidence added in v0.0.4

func (b *Layout) Confidence() float32

Confidence returns the confidence of the block.

func (*Layout) ID added in v0.0.4

func (b *Layout) ID() string

ID returns the identifier of the block.

func (*Layout) PageNumber added in v0.0.4

func (b *Layout) PageNumber() int

PageNumber returns the page number associated with the block.

func (*Layout) Polygon added in v0.0.4

func (b *Layout) Polygon() []*Point

Polygon returns the polygon information of the block.

func (*Layout) Raw added in v0.0.4

func (b *Layout) Raw() types.Block

Raw returns the raw block data.

func (*Layout) Text added in v0.0.4

func (l *Layout) Text(optFns ...func(*TextLinearizationOptions)) string

func (*Layout) TextAndWords added in v0.0.4

func (l *Layout) TextAndWords(optFns ...func(*TextLinearizationOptions)) (string, []*Word)

type LayoutChild added in v0.0.4

// LayoutChild is the contract for elements that can be attached to a Layout
// via Layout.AddChildren: each child exposes an identifier, its linearized
// text together with the words it consists of, and its bounding box.
type LayoutChild interface {
	// ID returns the identifier of the child element.
	ID() string
	// TextAndWords returns the child's text and its words; optFns may adjust
	// the TextLinearizationOptions used to produce the text.
	TextAndWords(optFns ...func(*TextLinearizationOptions)) (string, []*Word)
	// BoundingBox returns the bounding box of the child element.
	BoundingBox() *BoundingBox
}

type Line

type Line struct {
	// contains filtered or unexported fields
}

func (*Line) BlockType added in v0.0.4

func (b *Line) BlockType() types.BlockType

BlockType returns the type of the block.

func (*Line) BoundingBox added in v0.0.4

func (b *Line) BoundingBox() *BoundingBox

BoundingBox returns the bounding box information of the block.

func (*Line) Confidence

func (b *Line) Confidence() float32

Confidence returns the confidence of the block.

func (*Line) ID

func (b *Line) ID() string

ID returns the identifier of the block.

func (*Line) PageNumber added in v0.0.4

func (b *Line) PageNumber() int

PageNumber returns the page number associated with the block.

func (*Line) Polygon added in v0.0.4

func (b *Line) Polygon() []*Point

Polygon returns the polygon information of the block.

func (*Line) Raw added in v0.0.4

func (b *Line) Raw() types.Block

Raw returns the raw block data.

func (*Line) Text

func (l *Line) Text(_ ...func(*TextLinearizationOptions)) string

func (*Line) TextAndWords added in v0.0.4

func (l *Line) TextAndWords(optFns ...func(*TextLinearizationOptions)) (string, []*Word)

func (*Line) Words

func (l *Line) Words() []*Word

type NormalizedIdentityDocumentFieldValue added in v0.0.4

type NormalizedIdentityDocumentFieldValue struct {
	// contains filtered or unexported fields
}

NormalizedIdentityDocumentFieldValue represents a normalized value of an identity document field.

func (NormalizedIdentityDocumentFieldValue) DateValue added in v0.0.4

func (nidfv NormalizedIdentityDocumentFieldValue) DateValue() (time.Time, error)

DateValue returns the time representation of the normalized date value.

func (NormalizedIdentityDocumentFieldValue) Value added in v0.0.4

Value returns the string representation of the normalized value.

func (NormalizedIdentityDocumentFieldValue) ValueType added in v0.0.4

ValueType returns the type of the normalized value.

type Orientation

type Orientation struct {
	// contains filtered or unexported fields
}

Orientation represents the orientation of a geometric element.

func (*Orientation) Degrees

func (o *Orientation) Degrees() float32

Degrees returns the orientation in degrees.

func (*Orientation) Radians

func (o *Orientation) Radians() float32

Radians returns the orientation in radians.

type Page

type Page struct {
	// contains filtered or unexported fields
}

func (*Page) AddLayouts added in v0.0.4

func (p *Page) AddLayouts(layouts ...*Layout)

func (*Page) Height added in v0.0.4

func (p *Page) Height() float32

func (*Page) ID

func (p *Page) ID() string

func (*Page) KeyValues added in v0.0.4

func (p *Page) KeyValues() []*KeyValue

func (*Page) Layouts added in v0.0.4

func (p *Page) Layouts() []*Layout

func (*Page) Lines

func (p *Page) Lines() []*Line

func (*Page) Number added in v0.0.4

func (p *Page) Number() int

func (*Page) Queries added in v0.0.2

func (p *Page) Queries() []*Query

func (*Page) SearchValueByKey added in v0.0.4

func (p *Page) SearchValueByKey(key string) []*KeyValue

func (*Page) Signatures added in v0.0.2

func (p *Page) Signatures() []*Signature

func (*Page) Tables

func (p *Page) Tables() []*Table

func (*Page) Text

func (p *Page) Text(optFns ...func(*TextLinearizationOptions)) string

func (*Page) Width added in v0.0.4

func (p *Page) Width() float32

func (*Page) Words added in v0.0.4

func (p *Page) Words() []*Word

type Point

type Point struct {
	// contains filtered or unexported fields
}

Point represents a 2D point.

func (*Point) String added in v0.0.2

func (p *Point) String() string

String returns a string representation of the Point, including its X and Y coordinates.

func (*Point) X

func (p *Point) X() float32

X returns the X coordinate of the point.

func (*Point) Y

func (p *Point) Y() float32

Y returns the Y coordinate of the point.

type Query added in v0.0.2

type Query struct {
	// contains filtered or unexported fields
}

Query represents a query with associated information, including an identifier, text, alias, query pages, results, a page, and raw block data.

func (*Query) Alias added in v0.0.2

func (q *Query) Alias() string

Alias returns the alias for the query.

func (*Query) HasResult added in v0.0.4

func (q *Query) HasResult() bool

func (*Query) ResultsByConfidence added in v0.0.2

func (q *Query) ResultsByConfidence() []*QueryResult

ResultsByConfidence lists this query instance's results, sorted from most to least confident.

func (*Query) Text added in v0.0.2

func (q *Query) Text() string

Text returns the text associated with the query.

func (*Query) TopResult added in v0.0.2

func (q *Query) TopResult() *QueryResult

TopResult retrieves the top result by confidence score, if any are available.

type QueryResult added in v0.0.2

type QueryResult struct {
	// contains filtered or unexported fields
}

QueryResult represents the result of a parsed query.

func (*QueryResult) BlockType added in v0.0.4

func (b *QueryResult) BlockType() types.BlockType

BlockType returns the type of the block.

func (*QueryResult) BoundingBox added in v0.0.4

func (b *QueryResult) BoundingBox() *BoundingBox

BoundingBox returns the bounding box information of the block.

func (*QueryResult) Confidence added in v0.0.2

func (qr *QueryResult) Confidence() float32

Confidence returns the confidence level of the query result.

func (*QueryResult) ID added in v0.0.2

func (b *QueryResult) ID() string

ID returns the identifier of the block.

func (*QueryResult) PageNumber added in v0.0.4

func (b *QueryResult) PageNumber() int

PageNumber returns the page number associated with the block.

func (*QueryResult) Polygon added in v0.0.4

func (b *QueryResult) Polygon() []*Point

Polygon returns the polygon information of the block.

func (*QueryResult) Raw added in v0.0.4

func (b *QueryResult) Raw() types.Block

Raw returns the raw block data.

func (*QueryResult) Text added in v0.0.2

func (qr *QueryResult) Text() string

Text returns the extracted text from the query result.

type SelectionElement

type SelectionElement struct {
	// contains filtered or unexported fields
}

func (*SelectionElement) BlockType added in v0.0.4

func (b *SelectionElement) BlockType() types.BlockType

BlockType returns the type of the block.

func (*SelectionElement) BoundingBox added in v0.0.4

func (b *SelectionElement) BoundingBox() *BoundingBox

BoundingBox returns the bounding box information of the block.

func (*SelectionElement) Confidence

func (b *SelectionElement) Confidence() float32

Confidence returns the confidence of the block.

func (*SelectionElement) ID

func (b *SelectionElement) ID() string

ID returns the identifier of the block.

func (*SelectionElement) IsSelected

func (se *SelectionElement) IsSelected() bool

IsSelected checks if the element is selected.

func (*SelectionElement) PageNumber added in v0.0.4

func (b *SelectionElement) PageNumber() int

PageNumber returns the page number associated with the block.

func (*SelectionElement) Polygon added in v0.0.4

func (b *SelectionElement) Polygon() []*Point

Polygon returns the polygon information of the block.

func (*SelectionElement) Raw added in v0.0.4

func (b *SelectionElement) Raw() types.Block

Raw returns the raw block data.

func (*SelectionElement) Status

func (se *SelectionElement) Status() types.SelectionStatus

Status returns the selection status of the element.

func (*SelectionElement) TextAndWords added in v0.0.4

func (se *SelectionElement) TextAndWords(optFns ...func(*TextLinearizationOptions)) (string, []*Word)

func (*SelectionElement) Words added in v0.0.4

func (se *SelectionElement) Words() []*Word

type Signature added in v0.0.2

type Signature struct {
	// contains filtered or unexported fields
}

func (*Signature) BlockType added in v0.0.4

func (b *Signature) BlockType() types.BlockType

BlockType returns the type of the block.

func (*Signature) BoundingBox added in v0.0.4

func (b *Signature) BoundingBox() *BoundingBox

BoundingBox returns the bounding box information of the block.

func (*Signature) Confidence added in v0.0.2

func (b *Signature) Confidence() float32

Confidence returns the confidence of the block.

func (*Signature) ID added in v0.0.2

func (b *Signature) ID() string

ID returns the identifier of the block.

func (*Signature) PageNumber added in v0.0.4

func (b *Signature) PageNumber() int

PageNumber returns the page number associated with the block.

func (*Signature) Polygon added in v0.0.4

func (b *Signature) Polygon() []*Point

Polygon returns the polygon information of the block.

func (*Signature) Raw added in v0.0.4

func (b *Signature) Raw() types.Block

Raw returns the raw block data.

func (*Signature) Text added in v0.0.4

func (s *Signature) Text(optFns ...func(*TextLinearizationOptions)) string

func (*Signature) TextAndWords added in v0.0.4

func (s *Signature) TextAndWords(optFns ...func(*TextLinearizationOptions)) (string, []*Word)

func (*Signature) Words added in v0.0.4

func (s *Signature) Words() []*Word

type Table

type Table struct {
	// contains filtered or unexported fields
}

func (*Table) BlockType added in v0.0.4

func (b *Table) BlockType() types.BlockType

BlockType returns the type of the block.

func (*Table) BoundingBox added in v0.0.4

func (b *Table) BoundingBox() *BoundingBox

BoundingBox returns the bounding box information of the block.

func (*Table) Confidence

func (b *Table) Confidence() float32

Confidence returns the confidence of the block.

func (*Table) ID

func (b *Table) ID() string

ID returns the identifier of the block.

func (*Table) PageNumber added in v0.0.4

func (b *Table) PageNumber() int

PageNumber returns the page number associated with the block.

func (*Table) Polygon added in v0.0.4

func (b *Table) Polygon() []*Point

Polygon returns the polygon information of the block.

func (*Table) Raw added in v0.0.4

func (b *Table) Raw() types.Block

Raw returns the raw block data.

func (*Table) Rows

func (t *Table) Rows() []*TableRow

type TableCell added in v0.0.4

type TableCell struct {
	// contains filtered or unexported fields
}

func (*TableCell) BlockType added in v0.0.4

func (b *TableCell) BlockType() types.BlockType

BlockType returns the type of the block.

func (*TableCell) BoundingBox added in v0.0.4

func (b *TableCell) BoundingBox() *BoundingBox

BoundingBox returns the bounding box information of the block.

func (*TableCell) Confidence added in v0.0.4

func (tc *TableCell) Confidence() float32

func (*TableCell) ID added in v0.0.4

func (b *TableCell) ID() string

ID returns the identifier of the block.

func (*TableCell) IsColumnHeader added in v0.0.4

func (tc *TableCell) IsColumnHeader() bool

func (*TableCell) IsTableFooter added in v0.0.4

func (tc *TableCell) IsTableFooter() bool

func (*TableCell) IsTableSectionTitle added in v0.0.4

func (tc *TableCell) IsTableSectionTitle() bool

func (*TableCell) IsTableSummary added in v0.0.4

func (tc *TableCell) IsTableSummary() bool

func (*TableCell) IsTableTitle added in v0.0.4

func (tc *TableCell) IsTableTitle() bool

func (*TableCell) PageNumber added in v0.0.4

func (b *TableCell) PageNumber() int

PageNumber returns the page number associated with the block.

func (*TableCell) Polygon added in v0.0.4

func (b *TableCell) Polygon() []*Point

Polygon returns the polygon information of the block.

func (*TableCell) Raw added in v0.0.4

func (b *TableCell) Raw() types.Block

Raw returns the raw block data.

func (*TableCell) Text added in v0.0.4

func (tc *TableCell) Text() string

type TableFooter added in v0.0.4

type TableFooter struct {
	// contains filtered or unexported fields
}

func (*TableFooter) BlockType added in v0.0.4

func (b *TableFooter) BlockType() types.BlockType

BlockType returns the type of the block.

func (*TableFooter) BoundingBox added in v0.0.4

func (b *TableFooter) BoundingBox() *BoundingBox

BoundingBox returns the bounding box information of the block.

func (*TableFooter) Confidence added in v0.0.4

func (b *TableFooter) Confidence() float32

Confidence returns the confidence of the block.

func (*TableFooter) ID added in v0.0.4

func (b *TableFooter) ID() string

ID returns the identifier of the block.

func (*TableFooter) PageNumber added in v0.0.4

func (b *TableFooter) PageNumber() int

PageNumber returns the page number associated with the block.

func (*TableFooter) Polygon added in v0.0.4

func (b *TableFooter) Polygon() []*Point

Polygon returns the polygon information of the block.

func (*TableFooter) Raw added in v0.0.4

func (b *TableFooter) Raw() types.Block

Raw returns the raw block data.

func (*TableFooter) Text added in v0.0.4

func (tf *TableFooter) Text() string

type TableRow added in v0.0.4

type TableRow struct {
	// contains filtered or unexported fields
}

func (*TableRow) Cells added in v0.0.4

func (tr *TableRow) Cells() []*TableCell

type TableTitle added in v0.0.4

type TableTitle struct {
	// contains filtered or unexported fields
}

func (*TableTitle) BlockType added in v0.0.4

func (b *TableTitle) BlockType() types.BlockType

BlockType returns the type of the block.

func (*TableTitle) BoundingBox added in v0.0.4

func (b *TableTitle) BoundingBox() *BoundingBox

BoundingBox returns the bounding box information of the block.

func (*TableTitle) Confidence added in v0.0.4

func (b *TableTitle) Confidence() float32

Confidence returns the confidence of the block.

func (*TableTitle) ID added in v0.0.4

func (b *TableTitle) ID() string

ID returns the identifier of the block.

func (*TableTitle) PageNumber added in v0.0.4

func (b *TableTitle) PageNumber() int

PageNumber returns the page number associated with the block.

func (*TableTitle) Polygon added in v0.0.4

func (b *TableTitle) Polygon() []*Point

Polygon returns the polygon information of the block.

func (*TableTitle) Raw added in v0.0.4

func (b *TableTitle) Raw() types.Block

Raw returns the raw block data.

func (*TableTitle) Text added in v0.0.4

func (tt *TableTitle) Text() string

type TextLinearizationOptions added in v0.0.4

type TextLinearizationOptions struct {
	// LinearizeTables includes tables in the linearized output.
	LinearizeTables bool

	// LinearizeKeyValues includes form key and values in the linearized output.
	LinearizeKeyValues bool

	// RemoveNewLinesInLeafElements removes new lines in leaf layout elements, removing extra whitespace.
	RemoveNewLinesInLeafElements bool

	// MaxNumberOfConsecutiveNewLines sets the maximum number of consecutive new lines to keep, removing extra whitespace.
	MaxNumberOfConsecutiveNewLines int

	// HideHeaderLayout hides headers in the linearized output.
	HideHeaderLayout bool

	// HideFooterLayout hides footers in the linearized output.
	HideFooterLayout bool

	// HideFigureLayout hides figures in the linearized output.
	HideFigureLayout bool

	// HidePageNumberLayout hides page numbers in the linearized output.
	HidePageNumberLayout bool

	// PageNumberPrefix is the prefix for page number layout elements.
	PageNumberPrefix string

	// PageNumberSuffix is the suffix for page number layout elements.
	PageNumberSuffix string

	// SameParagraphSeparator is the separator to use when combining elements within a text block.
	SameParagraphSeparator string

	// LayoutElementSeparator is the separator to use when combining linearized layout elements.
	LayoutElementSeparator string

	// ListElementSeparator is the separator for elements in a list layout.
	ListElementSeparator string

	// ListLayoutPrefix is the prefix for list layout elements (parent).
	ListLayoutPrefix string

	// ListLayoutSuffix is the suffix for list layout elements (parent).
	ListLayoutSuffix string

	// ListElementPrefix is the prefix for elements in a list layout (children).
	ListElementPrefix string

	// ListElementSuffix is the suffix for elements in a list layout (children).
	ListElementSuffix string

	// TitlePrefix is the prefix for title layout elements.
	TitlePrefix string

	// TitleSuffix is the suffix for title layout elements.
	TitleSuffix string

	// TableLayoutPrefix is the prefix for table elements.
	TableLayoutPrefix string

	// TableLayoutSuffix is the suffix for table elements.
	TableLayoutSuffix string

	// TableRemoveColumnHeaders removes column headers from tables.
	TableRemoveColumnHeaders bool

	// TableLinearizationFormat sets how to represent tables in the linearized output. Choices are plaintext or markdown.
	TableLinearizationFormat string

	// TableTabulateFormat is the markdown tabulate format to use when tables are linearized as markdown.
	TableTabulateFormat string

	// TableMinTableWords is the threshold below which tables will be rendered as words instead of using table layout.
	TableMinTableWords int

	// TableColumnSeparator is the table column separator used when linearizing layout tables. It is not used if AnalyzeDocument was called with the TABLES feature.
	TableColumnSeparator string

	// TablePrefix is the prefix for table layout.
	TablePrefix string

	// TableSuffix is the suffix for table layout.
	TableSuffix string

	// TableRowSeparator is the table row separator.
	TableRowSeparator string

	// TableRowPrefix is the prefix for table row.
	TableRowPrefix string

	// TableRowSuffix is the suffix for table row.
	TableRowSuffix string

	// TableCellPrefix is the prefix for table cell.
	TableCellPrefix string

	// TableCellSuffix is the suffix for table cell.
	TableCellSuffix string

	// SectionHeaderPrefix is the prefix for section header layout elements.
	SectionHeaderPrefix string

	// SectionHeaderSuffix is the suffix for section header layout elements.
	SectionHeaderSuffix string

	// TextPrefix is the prefix for text layout elements.
	TextPrefix string

	// TextSuffix is the suffix for text layout elements.
	TextSuffix string

	// KeyValueLayoutPrefix is the prefix for key_value layout elements (not for individual key-value elements).
	KeyValueLayoutPrefix string

	// KeyValueLayoutSuffix is the suffix for key_value layout elements (not for individual key-value elements).
	KeyValueLayoutSuffix string

	// KeyValuePrefix is the prefix for key-value elements.
	KeyValuePrefix string

	// KeyValueSuffix is the suffix for key-value elements.
	KeyValueSuffix string

	// KeyPrefix is the prefix for key elements.
	KeyPrefix string

	// KeySuffix is the suffix for key elements.
	KeySuffix string

	// ValuePrefix is the prefix for value elements.
	ValuePrefix string

	// ValueSuffix is the suffix for value elements.
	ValueSuffix string

	// SelectionElementSelected is the representation for selection elements when selected.
	SelectionElementSelected string

	// SelectionElementNotSelected is the representation for selection elements when not selected.
	SelectionElementNotSelected string

	// HeuristicHTolerance sets how much the line below and above the current line should differ in width to be separated.
	HeuristicHTolerance float32

	// HeuristicLineBreakThreshold sets how much space is acceptable between two lines before splitting them. Expressed as a multiple of the minimum height.
	HeuristicLineBreakThreshold float32

	// HeuristicOverlapRatio sets how much vertical overlap is tolerated between two subsequent lines before merging them into a single line.
	HeuristicOverlapRatio float32

	// SignatureToken is the signature representation in the linearized text.
	SignatureToken string

	// AddPrefixesAndSuffixesAsWords controls whether the prefixes/suffixes are inserted into the words returned by TextAndWords.
	AddPrefixesAndSuffixesAsWords bool

	// AddPrefixesAndSuffixesInText controls whether the prefixes/suffixes are added to the linearized text.
	AddPrefixesAndSuffixesInText bool
}

TextLinearizationOptions defines how a document is linearized into a text string.

type Value added in v0.0.4

type Value struct {
	// contains filtered or unexported fields
}

func (*Value) BlockType added in v0.0.4

func (b *Value) BlockType() types.BlockType

BlockType returns the type of the block.

func (*Value) BoundingBox added in v0.0.4

func (b *Value) BoundingBox() *BoundingBox

BoundingBox returns the bounding box information of the block.

func (*Value) Confidence added in v0.0.4

func (b *Value) Confidence() float32

Confidence returns the confidence of the block.

func (*Value) ID added in v0.0.4

func (b *Value) ID() string

ID returns the identifier of the block.

func (*Value) PageNumber added in v0.0.4

func (b *Value) PageNumber() int

PageNumber returns the page number associated with the block.

func (*Value) Polygon added in v0.0.4

func (b *Value) Polygon() []*Point

Polygon returns the polygon information of the block.

func (*Value) Raw added in v0.0.4

func (b *Value) Raw() types.Block

Raw returns the raw block data.

func (*Value) String added in v0.0.4

func (v *Value) String() string

String returns the string representation of the value.

func (*Value) Text added in v0.0.4

func (v *Value) Text() string

func (*Value) TextAndWords added in v0.0.4

func (v *Value) TextAndWords(optFns ...func(*TextLinearizationOptions)) (string, []*Word)

func (*Value) Words added in v0.0.4

func (v *Value) Words() []*Word

type Word

type Word struct {
	// contains filtered or unexported fields
}

Word represents a word extracted by Textract.

func (*Word) BlockType added in v0.0.4

func (b *Word) BlockType() types.BlockType

BlockType returns the type of the block.

func (*Word) BoundingBox added in v0.0.4

func (b *Word) BoundingBox() *BoundingBox

BoundingBox returns the bounding box information of the block.

func (*Word) Confidence

func (b *Word) Confidence() float32

Confidence returns the confidence of the block.

func (*Word) ID

func (b *Word) ID() string

ID returns the identifier of the block.

func (*Word) IsHandwriting

func (w *Word) IsHandwriting() bool

IsHandwriting checks if the word is handwriting.

func (*Word) IsPrinted

func (w *Word) IsPrinted() bool

IsPrinted checks if the word is printed text.

func (*Word) PageNumber added in v0.0.4

func (b *Word) PageNumber() int

PageNumber returns the page number associated with the block.

func (*Word) Polygon added in v0.0.4

func (b *Word) Polygon() []*Point

Polygon returns the polygon information of the block.

func (*Word) Raw added in v0.0.4

func (b *Word) Raw() types.Block

Raw returns the raw block data.

func (*Word) Text

func (w *Word) Text() string

Text returns the text content of the word.

func (*Word) TextType

func (w *Word) TextType() types.TextType

TextType returns the text type of the word.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL