textractor

package module
v0.0.2 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Dec 26, 2023 License: MIT Imports: 9 Imported by: 1

README

📄 go-textractor

Build Status Go Reference goreportcard codecov

Amazon Textract response parser written in Go.

Installation

Use Go modules to include go-textractor in your project:

go get github.com/hupe1980/go-textractor

Usage

package main

import (
	"context"
	"fmt"
	"io"
	"log"
	"os"

	"github.com/aws/aws-sdk-go-v2/config"
	"github.com/aws/aws-sdk-go-v2/service/textract"
	"github.com/aws/aws-sdk-go-v2/service/textract/types"
	"github.com/hupe1980/go-textractor"
)

// main reads a local PDF, sends it to Amazon Textract for table and form
// analysis, wraps the response in a textractor Document and prints the
// lines, words, table cells and form fields found on each page.
func main() {
	file, err := os.Open("examples/analyze_document/testfile.pdf")
	if err != nil {
		log.Fatal(err)
	}

	defer file.Close()

	b, err := io.ReadAll(file)
	if err != nil {
		log.Fatal(err)
	}

	ctx := context.Background()

	// Do not discard the configuration error: stop here so the failure is
	// reported at its source rather than building a client from a zero-value
	// config.
	cfg, err := config.LoadDefaultConfig(ctx)
	if err != nil {
		log.Fatal(err)
	}

	client := textract.NewFromConfig(cfg)

	res, err := client.AnalyzeDocument(ctx, &textract.AnalyzeDocumentInput{
		Document: &types.Document{
			Bytes: b,
		},
		FeatureTypes: []types.FeatureType{
			types.FeatureTypeTables, types.FeatureTypeForms,
		},
	})
	if err != nil {
		log.Fatal(err)
	}

	doc := textractor.NewDocument(&textractor.AnalyzeDocumentPage{Blocks: res.Blocks})

	// Iterate over elements in the document
	for _, p := range doc.Pages() {
		// Print lines and words
		for _, l := range p.Lines() {
			fmt.Printf("Line: %s (%f)\n", l.Text(), l.Confidence())
			for _, w := range l.Words() {
				fmt.Printf("Word: %s (%f)\n", w.Text(), w.Confidence())
			}
		}

		// Print tables
		for _, t := range p.Tables() {
			for r, row := range t.Rows() {
				for c, cell := range row.Cells() {
					fmt.Printf("Table[%d][%d] = %s (%f)\n", r, c, cell.Text(), cell.Confidence())
				}
			}
		}

		// Print fields
		for _, f := range p.Form().Fields() {
			fmt.Printf("Field: Key: %s, Value: %s\n", f.Key(), f.Value())
		}
	}
}

For more example usage, see examples.

Contributing

Contributions are welcome! Feel free to open an issue or submit a pull request for any improvements or new features you would like to see.

License

This project is licensed under the MIT License. See the LICENSE file for details.

Documentation

Overview

Package textractor provides functionality to work with the Amazon Textract service.

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

This section is empty.

Types

type AnalyzeDocumentPage added in v0.0.2

// AnalyzeDocumentPage carries the blocks of one Textract AnalyzeDocument
// response. One or more values are passed to NewDocument to build a Document.
type AnalyzeDocumentPage struct {
	// Blocks is the list of Textract blocks taken from the response.
	Blocks []types.Block
}

func NewAnalyzeDocumentPageFromJSON added in v0.0.2

func NewAnalyzeDocumentPageFromJSON(data []byte) (*AnalyzeDocumentPage, error)

type AnalyzeDocumentSchema added in v0.0.2

// AnalyzeDocumentSchema describes the JSON layout of an AnalyzeDocument
// response: document metadata plus a flat list of blocks.
// NOTE(review): presumably the decode target used by
// NewAnalyzeDocumentPageFromJSON — confirm against the implementation.
type AnalyzeDocumentSchema struct {
	// DocumentMetadata holds the "DocumentMetadata" object of the response.
	DocumentMetadata struct {
		Pages *int32 `json:"Pages"`
	} `json:"DocumentMetadata"`
	// Blocks holds one entry per element of the "Blocks" array.
	Blocks []struct {
		BlockType   string   `json:"BlockType"`
		ColumnIndex *int32   `json:"ColumnIndex"`
		ColumnSpan  *int32   `json:"ColumnSpan"`
		ID          *string  `json:"Id"`
		Confidence  *float32 `json:"Confidence"`
		Text        *string  `json:"Text"`
		EntityTypes []string `json:"EntityTypes"`
		// Geometry holds the block's bounding box and polygon points.
		Geometry    struct {
			BoundingBox struct {
				Width  float32 `json:"Width"`
				Height float32 `json:"Height"`
				Left   float32 `json:"Left"`
				Top    float32 `json:"Top"`
			} `json:"BoundingBox"`
			Polygon []struct {
				X float32 `json:"X"`
				Y float32 `json:"Y"`
			} `json:"Polygon"`
		} `json:"Geometry"`
		// Relationships lists, per relationship type, the IDs of related blocks
		// (read from the "Ids" JSON key).
		Relationships []struct {
			Type string   `json:"Type"`
			IDs  []string `json:"Ids"`
		} `json:"Relationships"`
	} `json:"Blocks"`
}

type AnalyzeExpenseJSONResponse added in v0.0.2

// AnalyzeExpenseJSONResponse describes the JSON layout of an AnalyzeExpense
// response: document metadata plus a list of expense documents.
//
// LineItems and SummaryFields use the empty struct as their element type, so
// this type records how many entries each array has but none of their content.
type AnalyzeExpenseJSONResponse struct {
	// DocumentMetadata holds the "DocumentMetadata" object of the response.
	DocumentMetadata struct {
		Pages *int32 `json:"Pages"`
	} `json:"DocumentMetadata"`
	// ExpenseDocuments holds one entry per element of "ExpenseDocuments".
	ExpenseDocuments []struct {
		LineItemGroups []struct {
			LineItemGroupIndex *int32     `json:"LineItemGroupIndex"`
			LineItems          []struct{} `json:"LineItems"`
		} `json:"LineItemGroups"`
		SummaryFields []struct{} `json:"SummaryFields"`
	} `json:"ExpenseDocuments"`
}

type AnalyzeExpensePage added in v0.0.2

// AnalyzeExpensePage carries the expense documents of one Textract
// AnalyzeExpense response.
type AnalyzeExpensePage struct {
	// ExpenseDocuments is the list of expense documents from the response.
	ExpenseDocuments []types.ExpenseDocument
}

type BoundingBox

type BoundingBox struct {
	// contains filtered or unexported fields
}

BoundingBox represents the bounding box of a geometry.

func NewBoundingBox

func NewBoundingBox(boundingBox *types.BoundingBox) *BoundingBox

NewBoundingBox creates a new BoundingBox instance.

func (*BoundingBox) Bottom

func (bb *BoundingBox) Bottom() float32

Bottom returns the bottom coordinate of the bounding box.

func (*BoundingBox) Height

func (bb *BoundingBox) Height() float32

Height returns the height of the bounding box.

func (*BoundingBox) HorizontalCenter

func (bb *BoundingBox) HorizontalCenter() float32

HorizontalCenter returns the horizontal center coordinate of the bounding box.

func (*BoundingBox) Intersection

func (bb *BoundingBox) Intersection(other *BoundingBox) *BoundingBox

Intersection returns a new bounding box that represents the intersection of two bounding boxes.

func (*BoundingBox) Left

func (bb *BoundingBox) Left() float32

Left returns the left coordinate of the bounding box.

func (*BoundingBox) Right

func (bb *BoundingBox) Right() float32

Right returns the right coordinate of the bounding box.

func (*BoundingBox) String added in v0.0.2

func (bb *BoundingBox) String() string

String returns a string representation of the bounding box.

func (*BoundingBox) Top

func (bb *BoundingBox) Top() float32

Top returns the top coordinate of the bounding box.

func (*BoundingBox) Union

func (bb *BoundingBox) Union(other *BoundingBox) *BoundingBox

Union returns a new bounding box that represents the union of two bounding boxes.

func (*BoundingBox) VerticalCenter

func (bb *BoundingBox) VerticalCenter() float32

VerticalCenter returns the vertical center coordinate of the bounding box.

func (*BoundingBox) Width

func (bb *BoundingBox) Width() float32

Width returns the width of the bounding box.

type Cell

type Cell struct {
	// contains filtered or unexported fields
}

func NewCell

func NewCell(block types.Block, blockMap map[string]types.Block) *Cell

func (*Cell) Block

func (c *Cell) Block() types.Block

Block returns the underlying types.Block of the content.

func (*Cell) ColumnIndex

func (c *Cell) ColumnIndex() int32

func (*Cell) ColumnSpan

func (c *Cell) ColumnSpan() int32

func (*Cell) Confidence

func (c *Cell) Confidence() float32

Confidence returns the confidence level of the content.

func (*Cell) Geometry

func (c *Cell) Geometry() *Geometry

Geometry returns the geometry information of the content.

func (*Cell) ID

func (c *Cell) ID() string

ID returns the ID of the content.

func (*Cell) RowIndex

func (c *Cell) RowIndex() int32

func (*Cell) RowSpan

func (c *Cell) RowSpan() int32

func (*Cell) Text

func (c *Cell) Text() string

type Content

// Content is an interface for document content elements.
type Content interface {
	// ID returns the ID of the content.
	ID() string
	// Confidence returns the confidence level of the content.
	Confidence() float32
	// Geometry returns the geometry information of the content.
	Geometry() *Geometry
	// Block returns the underlying types.Block of the content.
	Block() types.Block
}

Content is an interface for document content elements.

type Document

type Document struct {
	// contains filtered or unexported fields
}

Document represents a Textract document containing pages.

func NewDocument

func NewDocument(responsePages ...*AnalyzeDocumentPage) *Document

NewDocument creates a new Document instance using response pages from Textract.

func (*Document) BlockByID

func (doc *Document) BlockByID(id string) *types.Block

BlockByID retrieves a block by its ID.

func (*Document) PageCount

func (doc *Document) PageCount() int

PageCount returns the total number of pages in the document.

func (*Document) PageNumber

func (doc *Document) PageNumber(n int) *Page

PageNumber retrieves a page by its page number.

func (*Document) Pages

func (doc *Document) Pages() []*Page

Pages returns all pages in the document.

type Field

type Field struct {
	// contains filtered or unexported fields
}

Field represents a form field, consisting of a key and a value.

func NewField

func NewField(block types.Block, blockMap map[string]types.Block) *Field

NewField creates a new Field instance.

func (*Field) Confidence

func (f *Field) Confidence() float32

Confidence calculates the confidence score for the form field.

func (*Field) Key

func (f *Field) Key() *FieldKey

Key returns the key part of the form field.

func (*Field) OCRConfidence added in v0.0.2

func (f *Field) OCRConfidence() *OCRConfidence

OCRConfidence calculates the OCR confidence for the form field.

func (*Field) Value

func (f *Field) Value() *FieldValue

Value returns the value part of the form field.

type FieldKey

type FieldKey struct {
	// contains filtered or unexported fields
}

FieldKey represents the key part of a form field.

func NewFieldKey

func NewFieldKey(block types.Block, ids []string, blockMap map[string]types.Block) *FieldKey

NewFieldKey creates a new FieldKey instance.

func (*FieldKey) Block

func (c *FieldKey) Block() types.Block

Block returns the underlying types.Block of the content.

func (*FieldKey) Confidence

func (c *FieldKey) Confidence() float32

Confidence returns the confidence level of the content.

func (*FieldKey) Geometry

func (c *FieldKey) Geometry() *Geometry

Geometry returns the geometry information of the content.

func (*FieldKey) ID

func (c *FieldKey) ID() string

ID returns the ID of the content.

func (*FieldKey) OCRConfidence added in v0.0.2

func (fk *FieldKey) OCRConfidence() *OCRConfidence

OCRConfidence calculates the OCR confidence for the field key.

func (*FieldKey) String

func (fk *FieldKey) String() string

String returns the string representation of the field key.

func (*FieldKey) Text

func (fk *FieldKey) Text() string

Text returns the text representation of the field key.

func (*FieldKey) Words added in v0.0.2

func (fk *FieldKey) Words() []*Word

Words returns the words constituting the field key.

type FieldValue

type FieldValue struct {
	// contains filtered or unexported fields
}

FieldValue represents the value part of a form field.

func NewFieldValue

func NewFieldValue(block types.Block, ids []string, blockMap map[string]types.Block) *FieldValue

NewFieldValue creates a new FieldValue instance.

func (*FieldValue) Block

func (c *FieldValue) Block() types.Block

Block returns the underlying types.Block of the content.

func (*FieldValue) Confidence

func (c *FieldValue) Confidence() float32

Confidence returns the confidence level of the content.

func (*FieldValue) Geometry

func (c *FieldValue) Geometry() *Geometry

Geometry returns the geometry information of the content.

func (*FieldValue) ID

func (c *FieldValue) ID() string

ID returns the ID of the content.

func (*FieldValue) OCRConfidence added in v0.0.2

func (fv *FieldValue) OCRConfidence() *OCRConfidence

OCRConfidence calculates the OCR confidence for the field value.

func (*FieldValue) SelectionElement added in v0.0.2

func (fv *FieldValue) SelectionElement() *SelectionElement

SelectionElement returns the selection element associated with the field value.

func (*FieldValue) String

func (fv *FieldValue) String() string

String returns the string representation of the field value.

func (*FieldValue) Text

func (fv *FieldValue) Text() string

Text returns the text representation of the field value.

func (*FieldValue) Words added in v0.0.2

func (fv *FieldValue) Words() []*Word

Words returns the words constituting the field value.

type Form

type Form struct {
	// contains filtered or unexported fields
}

Form represents a form extracted from a document.

func NewForm

func NewForm() *Form

NewForm creates a new Form instance.

func (*Form) AddField

func (f *Form) AddField(field *Field)

AddField adds a field to the form. If a field with the same key already exists and has lower confidence, it is replaced.

func (*Form) FieldByKey

func (f *Form) FieldByKey(key string) *Field

FieldByKey retrieves a field from the form by its key.

func (*Form) Fields

func (f *Form) Fields() []*Field

Fields returns all fields in the form.

func (*Form) SearchFieldByKey added in v0.0.2

func (f *Form) SearchFieldByKey(key string) []*Field

SearchFieldByKey searches for fields in the form with a key containing the specified string. It performs a case-insensitive search on the key text.

type Geometry

type Geometry struct {
	// contains filtered or unexported fields
}

Geometry represents the geometric properties of an element.

func NewGeometry

func NewGeometry(geometry *types.Geometry) *Geometry

NewGeometry creates a new Geometry instance.

func (*Geometry) BoundingBox

func (g *Geometry) BoundingBox() *BoundingBox

BoundingBox returns the bounding box of the geometry.

func (*Geometry) Orientation

func (g *Geometry) Orientation() *Orientation

Orientation returns the orientation of the geometry.

func (*Geometry) Polygon

func (g *Geometry) Polygon() []*Point

Polygon returns the polygon of the geometry.

type IdentityDocument added in v0.0.2

type IdentityDocument struct{}

func NewIndentiyDocument added in v0.0.2

func NewIndentiyDocument() *IdentityDocument

type Line

type Line struct {
	// contains filtered or unexported fields
}

Line represents a line of text in the document.

func NewLine

func NewLine(block types.Block, blockMap map[string]types.Block) *Line

NewLine creates a new Line instance.

func (*Line) Block

func (c *Line) Block() types.Block

Block returns the underlying types.Block of the content.

func (*Line) Confidence

func (c *Line) Confidence() float32

Confidence returns the confidence level of the content.

func (*Line) Geometry

func (c *Line) Geometry() *Geometry

Geometry returns the geometry information of the content.

func (*Line) ID

func (c *Line) ID() string

ID returns the ID of the content.

func (*Line) Text

func (l *Line) Text() string

Text returns the text content of the line.

func (*Line) Words

func (l *Line) Words() []*Word

Words returns the words in the line.

type OCRConfidence added in v0.0.2

type OCRConfidence struct {
	// contains filtered or unexported fields
}

OCRConfidence represents the OCR confidence scores, including mean, max, and min values.

func NewOCRConfidenceFromScores added in v0.0.2

func NewOCRConfidenceFromScores(scores []float32) *OCRConfidence

NewOCRConfidenceFromScores creates a new OCRConfidence instance from a slice of confidence scores. If the scores slice is empty, it returns nil.

func (*OCRConfidence) Max added in v0.0.2

func (c *OCRConfidence) Max() float32

Max returns the maximum OCR confidence score.

func (*OCRConfidence) Mean added in v0.0.2

func (c *OCRConfidence) Mean() float32

Mean returns the mean (average) OCR confidence score.

func (*OCRConfidence) Min added in v0.0.2

func (c *OCRConfidence) Min() float32

Min returns the minimum OCR confidence score.

type Orientation

type Orientation struct {
	// contains filtered or unexported fields
}

Orientation represents the orientation of a geometric element.

func NewOrientation

func NewOrientation(point0, point1 *Point) *Orientation

NewOrientation creates a new Orientation instance.

func (*Orientation) Degrees

func (o *Orientation) Degrees() float32

Degrees returns the orientation in degrees.

func (*Orientation) Radians

func (o *Orientation) Radians() float32

Radians returns the orientation in radians.

type Page

type Page struct {
	// contains filtered or unexported fields
}

Page represents a page in the document.

func NewPage

func NewPage(pageBlock types.Block, blocks []types.Block, blockMap map[string]types.Block) *Page

NewPage creates a new Page instance using Textract page blocks and a block map.

func (*Page) Blocks

func (p *Page) Blocks() []types.Block

Blocks returns all blocks in the page.

func (*Page) Form

func (p *Page) Form() *Form

Form returns the form information on the page.

func (*Page) Geometry

func (p *Page) Geometry() *Geometry

Geometry returns the geometry of the page.

func (*Page) ID

func (p *Page) ID() string

ID returns the ID of the page block.

func (*Page) LineAtIndex

func (p *Page) LineAtIndex(i int) *Line

LineAtIndex returns the line at the specified index.

func (*Page) LineCount

func (p *Page) LineCount() int

LineCount returns the total number of lines in the page.

func (*Page) Lines

func (p *Page) Lines() []*Line

Lines returns all lines in the page.

func (*Page) Queries added in v0.0.2

func (p *Page) Queries() Queries

Queries returns the queries for the page.

func (*Page) Signatures added in v0.0.2

func (p *Page) Signatures() []*Signature

Signatures returns the signatures on the page.

func (*Page) TableAtIndex

func (p *Page) TableAtIndex(i int) *Table

TableAtIndex returns the table at the specified index.

func (*Page) TableCount

func (p *Page) TableCount() int

TableCount returns the total number of tables in the page.

func (*Page) Tables

func (p *Page) Tables() []*Table

Tables returns all tables in the page.

func (*Page) Text

func (p *Page) Text() string

Text returns the concatenated text from all lines in the page.

type Point

type Point struct {
	// contains filtered or unexported fields
}

Point represents a 2D point.

func NewPoint

func NewPoint(point types.Point) *Point

NewPoint creates a new Point instance.

func (*Point) String added in v0.0.2

func (p *Point) String() string

String returns a string representation of the Point, including its X and Y coordinates.

func (*Point) X

func (p *Point) X() float32

X returns the X coordinate of the point.

func (*Point) Y

func (p *Point) Y() float32

Y returns the Y coordinate of the point.

type Queries added in v0.0.2

type Queries []*Query

Queries represents a slice of Query instances.

type Query added in v0.0.2

type Query struct {
	// contains filtered or unexported fields
}

Query represents a Textract query.

func NewQuery added in v0.0.2

func NewQuery(block types.Block, blockMap map[string]types.Block) *Query

NewQuery creates a new Query instance.

func (*Query) Alias added in v0.0.2

func (q *Query) Alias() string

Alias returns the alias of the query.

func (*Query) ResultsByConfidence added in v0.0.2

func (q *Query) ResultsByConfidence() []*QueryResult

ResultsByConfidence lists this query instance's results, sorted from most to least confident.

func (*Query) Text added in v0.0.2

func (q *Query) Text() string

Text returns the text content of the query.

func (*Query) TopResult added in v0.0.2

func (q *Query) TopResult() *QueryResult

TopResult retrieves the top result by confidence score, if any are available.

type QueryResult added in v0.0.2

type QueryResult struct {
	// contains filtered or unexported fields
}

QueryResult represents the result of a Textract query.

func NewQueryResult added in v0.0.2

func NewQueryResult(block types.Block) *QueryResult

NewQueryResult creates a new QueryResult instance.

func (*QueryResult) Block added in v0.0.2

func (c *QueryResult) Block() types.Block

Block returns the underlying types.Block of the content.

func (*QueryResult) Confidence added in v0.0.2

func (c *QueryResult) Confidence() float32

Confidence returns the confidence level of the content.

func (*QueryResult) Geometry added in v0.0.2

func (c *QueryResult) Geometry() *Geometry

Geometry returns the geometry information of the content.

func (*QueryResult) ID added in v0.0.2

func (c *QueryResult) ID() string

ID returns the ID of the content.

func (*QueryResult) Text added in v0.0.2

func (qr *QueryResult) Text() string

Text returns the text content of the query result.

type Row

type Row struct {
	// contains filtered or unexported fields
}

func NewRow

func NewRow() *Row

func (*Row) AddCell

func (r *Row) AddCell(cell *Cell)

func (Row) Block

func (c Row) Block() types.Block

Block returns the underlying types.Block of the content.

func (*Row) CellAt

func (r *Row) CellAt(i int) *Cell

func (*Row) CellCount

func (r *Row) CellCount() int

func (*Row) Cells

func (r *Row) Cells() []*Cell

func (Row) Confidence

func (c Row) Confidence() float32

Confidence returns the confidence level of the content.

func (Row) Geometry

func (c Row) Geometry() *Geometry

Geometry returns the geometry information of the content.

func (Row) ID

func (c Row) ID() string

ID returns the ID of the content.

type SelectionElement

type SelectionElement struct {
	// contains filtered or unexported fields
}

SelectionElement represents a selectable element in the document.

func NewSelectionElement

func NewSelectionElement(block types.Block) *SelectionElement

NewSelectionElement creates a new SelectionElement instance.

func (*SelectionElement) Block

func (c *SelectionElement) Block() types.Block

Block returns the underlying types.Block of the content.

func (*SelectionElement) Confidence

func (c *SelectionElement) Confidence() float32

Confidence returns the confidence level of the content.

func (*SelectionElement) Geometry

func (c *SelectionElement) Geometry() *Geometry

Geometry returns the geometry information of the content.

func (*SelectionElement) ID

func (c *SelectionElement) ID() string

ID returns the ID of the content.

func (*SelectionElement) IsSelected

func (se *SelectionElement) IsSelected() bool

IsSelected checks if the element is selected.

func (*SelectionElement) Status

func (se *SelectionElement) Status() types.SelectionStatus

Status returns the selection status of the element.

type Signature added in v0.0.2

type Signature struct {
	// contains filtered or unexported fields
}

Signature represents a signature in a document.

func NewSignature added in v0.0.2

func NewSignature(block types.Block) *Signature

NewSignature creates a new Signature instance.

func (*Signature) Block added in v0.0.2

func (c *Signature) Block() types.Block

Block returns the underlying types.Block of the content.

func (*Signature) Confidence added in v0.0.2

func (c *Signature) Confidence() float32

Confidence returns the confidence level of the content.

func (*Signature) Geometry added in v0.0.2

func (c *Signature) Geometry() *Geometry

Geometry returns the geometry information of the content.

func (*Signature) ID added in v0.0.2

func (c *Signature) ID() string

ID returns the ID of the content.

type Table

type Table struct {
	// contains filtered or unexported fields
}

func NewTable

func NewTable(block types.Block, blockMap map[string]types.Block) *Table

func (Table) Block

func (c Table) Block() types.Block

Block returns the underlying types.Block of the content.

func (*Table) CellAt

func (t *Table) CellAt(rowIndex, columnIndex int) *Cell

func (Table) Confidence

func (c Table) Confidence() float32

Confidence returns the confidence level of the content.

func (Table) Geometry

func (c Table) Geometry() *Geometry

Geometry returns the geometry information of the content.

func (Table) ID

func (c Table) ID() string

ID returns the ID of the content.

func (*Table) RowAt

func (t *Table) RowAt(rowIndex int) *Row

func (*Table) RowCount

func (t *Table) RowCount() int

func (*Table) Rows

func (t *Table) Rows() []*Row

type Word

type Word struct {
	// contains filtered or unexported fields
}

Word represents a word in the document.

func NewWord

func NewWord(block types.Block) *Word

NewWord creates a new Word instance.

func (*Word) Block

func (c *Word) Block() types.Block

Block returns the underlying types.Block of the content.

func (*Word) Confidence

func (c *Word) Confidence() float32

Confidence returns the confidence level of the content.

func (*Word) Geometry

func (c *Word) Geometry() *Geometry

Geometry returns the geometry information of the content.

func (*Word) ID

func (c *Word) ID() string

ID returns the ID of the content.

func (*Word) IsHandwriting

func (w *Word) IsHandwriting() bool

IsHandwriting checks if the word is handwriting.

func (*Word) IsPrinted

func (w *Word) IsPrinted() bool

IsPrinted checks if the word is printed text.

func (*Word) Text

func (w *Word) Text() string

Text returns the text content of the word.

func (*Word) TextType

func (w *Word) TextType() types.TextType

TextType returns the text type of the word.

Directories

Path Synopsis
examples

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL