textractor

package module
v0.0.1 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Dec 26, 2023 License: MIT Imports: 8 Imported by: 1

README

📄 go-textractor

Build Status Go Reference goreportcard codecov

Amazon Textract response parser written in Go.

Installation

Use Go modules to include go-textractor in your project:

go get github.com/hupe1980/go-textractor

Usage

package main

import (
	"context"
	"fmt"
	"io"
	"log"
	"os"

	"github.com/aws/aws-sdk-go-v2/config"
	"github.com/aws/aws-sdk-go-v2/service/textract"
	"github.com/aws/aws-sdk-go-v2/service/textract/types"
	"github.com/hupe1980/go-textractor"
)

// main reads example/testfile.pdf, sends its bytes to the Textract
// AnalyzeDocument API, wraps the returned blocks in a textractor.Document and
// prints the lines, words, table cells and form fields of every page.
// Any error terminates the program via log.Fatal.
func main() {
	ctx := context.Background()

	file, err := os.Open("example/testfile.pdf")
	if err != nil {
		log.Fatal(err)
	}

	defer file.Close()

	b, err := io.ReadAll(file)
	if err != nil {
		log.Fatal(err)
	}

	// Fail early on a broken AWS configuration instead of discarding the
	// error and hitting a less obvious failure on the first request.
	cfg, err := config.LoadDefaultConfig(ctx)
	if err != nil {
		log.Fatal(err)
	}

	client := textract.NewFromConfig(cfg)

	res, err := client.AnalyzeDocument(ctx, &textract.AnalyzeDocumentInput{
		Document: &types.Document{
			Bytes: b,
		},
		// Both features are requested because the loops below print tables
		// as well as form fields; without FORMS no key-value blocks are
		// returned and the field loop would never print anything.
		FeatureTypes: []types.FeatureType{
			types.FeatureTypeTables,
			types.FeatureTypeForms,
		},
	})
	if err != nil {
		log.Fatal(err)
	}

	doc := textractor.NewDocument(&textractor.ResponsePage{Blocks: res.Blocks})

	// Iterate over elements in the document
	for _, p := range doc.Pages() {
		// Print lines and words
		for _, l := range p.Lines() {
			fmt.Printf("Line: %s (%f)\n", l.Text(), l.Confidence())
			for _, w := range l.Words() {
				fmt.Printf("Word: %s (%f)\n", w.Text(), w.Confidence())
			}
		}

		// Print tables
		for _, t := range p.Tables() {
			for r, row := range t.Rows() {
				for c, cell := range row.Cells() {
					fmt.Printf("Table[%d][%d] = %s (%f)\n", r, c, cell.Text(), cell.Confidence())
				}
			}
		}

		// Print fields
		for _, f := range p.Form().Fields() {
			fmt.Printf("Field: Key: %s, Value: %s\n", f.Key(), f.Value())
		}
	}
}

For more example usage, see examples.

Contributing

Contributions are welcome! Feel free to open an issue or submit a pull request for any improvements or new features you would like to see.

License

This project is licensed under the MIT License. See the LICENSE file for details.

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

This section is empty.

Types

type BoundingBox

type BoundingBox struct {
	// contains filtered or unexported fields
}

BoundingBox represents the bounding box of a geometry.

func NewBoundingBox

func NewBoundingBox(boundingBox *types.BoundingBox) *BoundingBox

NewBoundingBox creates a new BoundingBox instance.

func (*BoundingBox) Bottom

func (bb *BoundingBox) Bottom() float32

Bottom returns the bottom coordinate of the bounding box.

func (*BoundingBox) Height

func (bb *BoundingBox) Height() float32

Height returns the height of the bounding box.

func (*BoundingBox) HorizontalCenter

func (bb *BoundingBox) HorizontalCenter() float32

HorizontalCenter returns the horizontal center coordinate of the bounding box.

func (*BoundingBox) Intersection

func (bb *BoundingBox) Intersection(other *BoundingBox) *BoundingBox

Intersection returns a new bounding box that represents the intersection of two bounding boxes.

func (*BoundingBox) Left

func (bb *BoundingBox) Left() float32

Left returns the left coordinate of the bounding box.

func (*BoundingBox) Right

func (bb *BoundingBox) Right() float32

Right returns the right coordinate of the bounding box.

func (*BoundingBox) Top

func (bb *BoundingBox) Top() float32

Top returns the top coordinate of the bounding box.

func (*BoundingBox) Union

func (bb *BoundingBox) Union(other *BoundingBox) *BoundingBox

Union returns a new bounding box that represents the union of two bounding boxes.

func (*BoundingBox) VerticalCenter

func (bb *BoundingBox) VerticalCenter() float32

VerticalCenter returns the vertical center coordinate of the bounding box.

func (*BoundingBox) Width

func (bb *BoundingBox) Width() float32

Width returns the width of the bounding box.

type Cell

type Cell struct {
	// contains filtered or unexported fields
}

func NewCell

func NewCell(block types.Block, blockMap map[string]types.Block) *Cell

func (*Cell) Block

func (c *Cell) Block() types.Block

Block returns the underlying types.Block of the content.

func (*Cell) ColumnIndex

func (c *Cell) ColumnIndex() int32

func (*Cell) ColumnSpan

func (c *Cell) ColumnSpan() int32

func (*Cell) Confidence

func (c *Cell) Confidence() float32

Confidence returns the confidence level of the content.

func (*Cell) Geometry

func (c *Cell) Geometry() *Geometry

Geometry returns the geometry information of the content.

func (*Cell) ID

func (c *Cell) ID() string

ID returns the ID of the content.

func (*Cell) RowIndex

func (c *Cell) RowIndex() int32

func (*Cell) RowSpan

func (c *Cell) RowSpan() int32

func (*Cell) Text

func (c *Cell) Text() string

type Content

// Content is an interface for document content elements.
type Content interface {
	// ID returns the ID of the content.
	ID() string
	// Confidence returns the confidence level of the content.
	Confidence() float32
	// Geometry returns the geometry information of the content.
	Geometry() *Geometry
	// Block returns the underlying types.Block of the content.
	Block() types.Block
}

Content is an interface for document content elements.

type Document

type Document struct {
	// contains filtered or unexported fields
}

func NewDocument

func NewDocument(responsePages ...*ResponsePage) *Document

func (*Document) BlockByID

func (doc *Document) BlockByID(id string) *types.Block

func (*Document) PageCount

func (doc *Document) PageCount() int

func (*Document) PageNumber

func (doc *Document) PageNumber(n int) (*Page, error)

func (*Document) Pages

func (doc *Document) Pages() []*Page

type Field

type Field struct {
	// contains filtered or unexported fields
}

func NewField

func NewField(block types.Block, blockMap map[string]types.Block) *Field

func (*Field) Confidence

func (f *Field) Confidence() float32

func (*Field) Key

func (f *Field) Key() *FieldKey

func (*Field) Value

func (f *Field) Value() *FieldValue

type FieldKey

type FieldKey struct {
	// contains filtered or unexported fields
}

func NewFieldKey

func NewFieldKey(block types.Block, ids []string, blockMap map[string]types.Block) *FieldKey

func (*FieldKey) Block

func (c *FieldKey) Block() types.Block

Block returns the underlying types.Block of the content.

func (*FieldKey) Confidence

func (c *FieldKey) Confidence() float32

Confidence returns the confidence level of the content.

func (*FieldKey) Geometry

func (c *FieldKey) Geometry() *Geometry

Geometry returns the geometry information of the content.

func (*FieldKey) ID

func (c *FieldKey) ID() string

ID returns the ID of the content.

func (*FieldKey) String

func (fk *FieldKey) String() string

func (*FieldKey) Text

func (fk *FieldKey) Text() string

type FieldValue

type FieldValue struct {
	// contains filtered or unexported fields
}

func NewFieldValue

func NewFieldValue(block types.Block, ids []string, blockMap map[string]types.Block) *FieldValue

func (*FieldValue) Block

func (c *FieldValue) Block() types.Block

Block returns the underlying types.Block of the content.

func (*FieldValue) Confidence

func (c *FieldValue) Confidence() float32

Confidence returns the confidence level of the content.

func (*FieldValue) Geometry

func (c *FieldValue) Geometry() *Geometry

Geometry returns the geometry information of the content.

func (*FieldValue) ID

func (c *FieldValue) ID() string

ID returns the ID of the content.

func (*FieldValue) String

func (fv *FieldValue) String() string

func (*FieldValue) Text

func (fv *FieldValue) Text() string

type Form

type Form struct {
	// contains filtered or unexported fields
}

func NewForm

func NewForm() *Form

func (*Form) AddField

func (f *Form) AddField(field *Field)

func (*Form) FieldByKey

func (f *Form) FieldByKey(key string) *Field

func (*Form) Fields

func (f *Form) Fields() []*Field

type Geometry

type Geometry struct {
	// contains filtered or unexported fields
}

Geometry represents the geometric properties of an element.

func NewGeometry

func NewGeometry(geometry *types.Geometry) *Geometry

NewGeometry creates a new Geometry instance.

func (*Geometry) BoundingBox

func (g *Geometry) BoundingBox() *BoundingBox

BoundingBox returns the bounding box of the geometry.

func (*Geometry) Orientation

func (g *Geometry) Orientation() *Orientation

Orientation returns the orientation of the geometry.

func (*Geometry) Polygon

func (g *Geometry) Polygon() []*Point

Polygon returns the polygon of the geometry.

type JSONResponse

// JSONResponse describes a JSON document with a "DocumentMetadata" object and
// a "Blocks" array; each field is bound to the JSON key named in its struct
// tag. Pointer-typed fields remain nil when the key is missing or null.
// NOTE(review): presumably the format decoded by NewResponsePageFromJSON — confirm.
type JSONResponse struct {
	// DocumentMetadata holds the "DocumentMetadata" object.
	DocumentMetadata struct {
		// Pages is read from the "Pages" key.
		Pages *int32 `json:"Pages"`
	} `json:"DocumentMetadata"`
	// Blocks holds the entries of the "Blocks" array.
	Blocks []struct {
		BlockType   string   `json:"BlockType"`
		ColumnIndex *int32   `json:"ColumnIndex"`
		ColumnSpan  *int32   `json:"ColumnSpan"`
		// ID is read from the JSON key "Id" (note the different casing).
		ID          *string  `json:"Id"`
		Confidence  *float32 `json:"Confidence"`
		Text        *string  `json:"Text"`
		EntityTypes []string `json:"EntityTypes"`
		// Geometry holds the block's bounding box and polygon points.
		Geometry    struct {
			BoundingBox struct {
				Width  float32 `json:"Width"`
				Height float32 `json:"Height"`
				Left   float32 `json:"Left"`
				Top    float32 `json:"Top"`
			} `json:"BoundingBox"`
			Polygon []struct {
				X float32 `json:"X"`
				Y float32 `json:"Y"`
			} `json:"Polygon"`
		} `json:"Geometry"`
		// Relationships lists typed references to other blocks by ID.
		Relationships []struct {
			Type string   `json:"Type"`
			// IDs is read from the JSON key "Ids".
			IDs  []string `json:"Ids"`
		} `json:"Relationships"`
	} `json:"Blocks"`
}

type Line

type Line struct {
	// contains filtered or unexported fields
}

Line represents a line of text in the document.

func NewLine

func NewLine(block types.Block, blockMap map[string]types.Block) *Line

NewLine creates a new Line instance.

func (*Line) Block

func (c *Line) Block() types.Block

Block returns the underlying types.Block of the content.

func (*Line) Confidence

func (c *Line) Confidence() float32

Confidence returns the confidence level of the content.

func (*Line) Geometry

func (c *Line) Geometry() *Geometry

Geometry returns the geometry information of the content.

func (*Line) ID

func (c *Line) ID() string

ID returns the ID of the content.

func (*Line) Text

func (l *Line) Text() string

Text returns the text content of the line.

func (*Line) Words

func (l *Line) Words() []*Word

Words returns the words in the line.

type Orientation

type Orientation struct {
	// contains filtered or unexported fields
}

Orientation represents the orientation of a geometric element.

func NewOrientation

func NewOrientation(point0, point1 *Point) *Orientation

NewOrientation creates a new Orientation instance.

func (*Orientation) Degrees

func (o *Orientation) Degrees() float32

Degrees returns the orientation in degrees.

func (*Orientation) Radians

func (o *Orientation) Radians() float32

Radians returns the orientation in radians.

type Page

type Page struct {
	// contains filtered or unexported fields
}

Page represents a page in the document.

func NewPage

func NewPage(pageBlock types.Block, blocks []types.Block, blockMap map[string]types.Block) *Page

NewPage creates a new Page instance.

func (*Page) Blocks

func (p *Page) Blocks() []types.Block

func (*Page) Form

func (p *Page) Form() *Form

func (*Page) Geometry

func (p *Page) Geometry() *Geometry

func (*Page) ID

func (p *Page) ID() string

func (*Page) LineAtIndex

func (p *Page) LineAtIndex(i int) (*Line, error)

func (*Page) LineCount

func (p *Page) LineCount() int

func (*Page) Lines

func (p *Page) Lines() []*Line

func (*Page) TableAtIndex

func (p *Page) TableAtIndex(i int) *Table

func (*Page) TableCount

func (p *Page) TableCount() int

func (*Page) Tables

func (p *Page) Tables() []*Table

func (*Page) Text

func (p *Page) Text() string

type Point

type Point struct {
	// contains filtered or unexported fields
}

Point represents a 2D point.

func NewPoint

func NewPoint(point types.Point) *Point

NewPoint creates a new Point instance.

func (*Point) X

func (p *Point) X() float32

X returns the X coordinate of the point.

func (*Point) Y

func (p *Point) Y() float32

Y returns the Y coordinate of the point.

type ResponsePage

// ResponsePage carries the blocks of a single Textract response. One or more
// ResponsePage values are passed to NewDocument to build a Document.
type ResponsePage struct {
	// Blocks are the blocks of the response, for example the Blocks field of
	// an AnalyzeDocument result.
	Blocks []types.Block
}

func NewResponsePageFromJSON

func NewResponsePageFromJSON(data []byte) (*ResponsePage, error)

type Row

type Row struct {
	// contains filtered or unexported fields
}

func NewRow

func NewRow() *Row

func (*Row) AddCell

func (r *Row) AddCell(cell *Cell)

func (Row) Block

func (c Row) Block() types.Block

Block returns the underlying types.Block of the content.

func (*Row) CellAt

func (r *Row) CellAt(i int) *Cell

func (*Row) CellCount

func (r *Row) CellCount() int

func (*Row) Cells

func (r *Row) Cells() []*Cell

func (Row) Confidence

func (c Row) Confidence() float32

Confidence returns the confidence level of the content.

func (Row) Geometry

func (c Row) Geometry() *Geometry

Geometry returns the geometry information of the content.

func (Row) ID

func (c Row) ID() string

ID returns the ID of the content.

type SelectionElement

type SelectionElement struct {
	// contains filtered or unexported fields
}

SelectionElement represents a selectable element in the document.

func NewSelectionElement

func NewSelectionElement(block types.Block) *SelectionElement

NewSelectionElement creates a new SelectionElement instance.

func (*SelectionElement) Block

func (c *SelectionElement) Block() types.Block

Block returns the underlying types.Block of the content.

func (*SelectionElement) Confidence

func (c *SelectionElement) Confidence() float32

Confidence returns the confidence level of the content.

func (*SelectionElement) Geometry

func (c *SelectionElement) Geometry() *Geometry

Geometry returns the geometry information of the content.

func (*SelectionElement) ID

func (c *SelectionElement) ID() string

ID returns the ID of the content.

func (*SelectionElement) IsSelected

func (se *SelectionElement) IsSelected() bool

IsSelected checks if the element is selected.

func (*SelectionElement) Status

func (se *SelectionElement) Status() types.SelectionStatus

Status returns the selection status of the element.

type Table

type Table struct {
	// contains filtered or unexported fields
}

func NewTable

func NewTable(block types.Block, blockMap map[string]types.Block) *Table

func (Table) Block

func (c Table) Block() types.Block

Block returns the underlying types.Block of the content.

func (*Table) CellAt

func (t *Table) CellAt(rowIndex, columnIndex int) *Cell

func (Table) Confidence

func (c Table) Confidence() float32

Confidence returns the confidence level of the content.

func (Table) Geometry

func (c Table) Geometry() *Geometry

Geometry returns the geometry information of the content.

func (Table) ID

func (c Table) ID() string

ID returns the ID of the content.

func (*Table) RowAt

func (t *Table) RowAt(rowIndex int) *Row

func (*Table) RowCount

func (t *Table) RowCount() int

func (*Table) Rows

func (t *Table) Rows() []*Row

type Word

type Word struct {
	// contains filtered or unexported fields
}

Word represents a word in the document.

func NewWord

func NewWord(block types.Block) *Word

NewWord creates a new Word instance.

func (*Word) Block

func (c *Word) Block() types.Block

Block returns the underlying types.Block of the content.

func (*Word) Confidence

func (c *Word) Confidence() float32

Confidence returns the confidence level of the content.

func (*Word) Geometry

func (c *Word) Geometry() *Geometry

Geometry returns the geometry information of the content.

func (*Word) ID

func (c *Word) ID() string

ID returns the ID of the content.

func (*Word) IsHandwriting

func (w *Word) IsHandwriting() bool

IsHandwriting checks if the word is handwriting.

func (*Word) IsPrinted

func (w *Word) IsPrinted() bool

IsPrinted checks if the word is printed text.

func (*Word) Text

func (w *Word) Text() string

Text returns the text content of the word.

func (*Word) TextType

func (w *Word) TextType() types.TextType

TextType returns the text type of the word.

Directories

Path Synopsis
examples

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL