documentloader

package

v0.0.102 Latest Latest Go to latest Published: Mar 9, 2024 License: MIT Imports: 24 Imported by: 1

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/hupe1980/golc

Links

Open Source Insights

Documentation ¶

Overview ¶

Package documentloader provides functionality for loading and processing documents.

Index ¶

Variables
func DefaultLinerizationOptions() textractor.TextLinearizationOptions
type AmazonTextract
- func NewAmazonTextractFromOutput(output *textractor.DocumentAPIOutput, optFns ...func(o *AmazonTextractOptions)) *AmazonTextract
- func NewAmazonTextractFromReader(client AmazonTextractClient, r io.Reader, ...) *AmazonTextract
- func NewAmazonTextractFromS3Object(client AmazonTextractClient, s3Object *types.S3Object, ...) *AmazonTextract
- func (l *AmazonTextract) Load(ctx context.Context) ([]schema.Document, error)
- func (l *AmazonTextract) LoadAndSplit(ctx context.Context, splitter schema.TextSplitter) ([]schema.Document, error)
type AmazonTextractClient
type AmazonTextractOptions
type CSV
- func NewCSV(r io.Reader, optFns ...func(o *CSVOptions)) *CSV
- func (l *CSV) Load(ctx context.Context) ([]schema.Document, error)
- func (l *CSV) LoadAndSplit(ctx context.Context, splitter schema.TextSplitter) ([]schema.Document, error)
type CSVOptions
type FileFilter
type Git
- func NewGit(r *git.Repository, optFns ...func(o *GitOptions)) *Git
- func NewGitFromCloneURL(url string, optFns ...func(o *GitCloneURLOptions)) (*Git, error)
- func NewGitFromCodeCommitURL(url string, creds aws.Credentials, optFns ...func(o *GitOptions)) (*Git, error)
- func NewGitFromPath(path string, optFns ...func(o *GitOptions)) (*Git, error)
- func (l *Git) Load(ctx context.Context) ([]schema.Document, error)
- func (l *Git) LoadAndSplit(ctx context.Context, splitter schema.TextSplitter) ([]schema.Document, error)
type GitCloneURLOptions
type GitOptions
type HTML
- func NewHTML(r io.Reader, optFns ...func(o *HTMLOptions)) *HTML
- func (l *HTML) Load(ctx context.Context) ([]schema.Document, error)
- func (l *HTML) LoadAndSplit(ctx context.Context, splitter schema.TextSplitter) ([]schema.Document, error)
type HTMLOptions
type Notebook
- func NewNotebook(r io.Reader, optFns ...func(o *NotebookOptions)) *Notebook
- func (l *Notebook) Load(ctx context.Context) ([]schema.Document, error)
- func (l *Notebook) LoadAndSplit(ctx context.Context, splitter schema.TextSplitter) ([]schema.Document, error)
type NotebookOptions
type PDF
- func NewPDF(f io.ReaderAt, size int64, optFns ...func(o *PDFOptions)) (*PDF, error)
- func NewPDFFromFile(f *os.File, optFns ...func(o *PDFOptions)) (*PDF, error)
- func (l *PDF) Load(ctx context.Context) ([]schema.Document, error)
- func (l *PDF) LoadAndSplit(ctx context.Context, splitter schema.TextSplitter) ([]schema.Document, error)
type PDFOptions
type Text
- func NewText(r io.Reader) *Text
- func (l *Text) Load(ctx context.Context) ([]schema.Document, error)
- func (l *Text) LoadAndSplit(ctx context.Context, splitter schema.TextSplitter) ([]schema.Document, error)
type UniDocDOCX
- func NewUniDocDOCX(parser UniDocParser, r io.ReaderAt, size int64, ...) *UniDocDOCX
- func NewUniDocDOCXFromFile(parser UniDocParser, f *os.File, optFns ...func(o *UniDocDOCXOptions)) *UniDocDOCX
- func (l *UniDocDOCX) Load(ctx context.Context) ([]schema.Document, error)
- func (l *UniDocDOCX) LoadAndSplit(ctx context.Context, splitter schema.TextSplitter) ([]schema.Document, error)
type UniDocDOCXOptions
type UniDocParser
type Unstructured
- func NewUnstructured(apiKey string, file *os.File) *Unstructured
- func (l *Unstructured) Load(ctx context.Context) ([]schema.Document, error)
- func (l *Unstructured) LoadAndSplit(ctx context.Context, splitter schema.TextSplitter) ([]schema.Document, error)

Constants ¶

This section is empty.

Variables ¶

View Source

var DefaultGitOptions = GitOptions{
	Branch:     "main",
	FileFilter: func(f *object.File) bool { return true },
}

DefaultGitOptions provides default Git options.

Functions ¶

func DefaultLinerizationOptions ¶ added in v0.0.101

func DefaultLinerizationOptions() textractor.TextLinearizationOptions

DefaultLinerizationOptions returns the default linearization options for Amazon Textract.

Types ¶

type AmazonTextract ¶ added in v0.0.101

type AmazonTextract struct {
	// contains filtered or unexported fields
}

AmazonTextract represents a document loader for Amazon Textract.

func NewAmazonTextractFromOutput ¶ added in v0.0.101

func NewAmazonTextractFromOutput(output *textractor.DocumentAPIOutput, optFns ...func(o *AmazonTextractOptions)) *AmazonTextract

NewAmazonTextractFromOutput creates a new AmazonTextract instance from a Textract output.

func NewAmazonTextractFromReader ¶ added in v0.0.101

func NewAmazonTextractFromReader(client AmazonTextractClient, r io.Reader, optFns ...func(o *AmazonTextractOptions)) *AmazonTextract

NewAmazonTextractFromReader creates a new AmazonTextract instance from a reader.

func NewAmazonTextractFromS3Object ¶ added in v0.0.101

func NewAmazonTextractFromS3Object(client AmazonTextractClient, s3Object *types.S3Object, optFns ...func(o *AmazonTextractOptions)) *AmazonTextract

NewAmazonTextractFromS3Object creates a new AmazonTextract instance from an S3 object.

func (*AmazonTextract) Load ¶ added in v0.0.101

func (l *AmazonTextract) Load(ctx context.Context) ([]schema.Document, error)

Load reads the content from the reader and returns it as a single document.

func (*AmazonTextract) LoadAndSplit ¶ added in v0.0.101

func (l *AmazonTextract) LoadAndSplit(ctx context.Context, splitter schema.TextSplitter) ([]schema.Document, error)

LoadAndSplit reads the content from the reader and splits it into multiple documents using the provided splitter.

type AmazonTextractClient ¶ added in v0.0.101

type AmazonTextractClient interface {
	// AnalyzeDocument performs document analysis using Amazon Textract.
	AnalyzeDocument(ctx context.Context, params *textract.AnalyzeDocumentInput, optFns ...func(*textract.Options)) (*textract.AnalyzeDocumentOutput, error)
}

AmazonTextractClient is an interface representing the methods required for interacting with Amazon Textract.

type AmazonTextractOptions ¶ added in v0.0.101

type AmazonTextractOptions struct {
	textractor.TextLinearizationOptions
	FeatureTypes []types.FeatureType
}

AmazonTextractOptions represents options for loading documents using Amazon Textract.

type CSV ¶ added in v0.0.39

type CSV struct {
	// contains filtered or unexported fields
}

CSV represents a CSV document loader.

func NewCSV ¶ added in v0.0.39

func NewCSV(r io.Reader, optFns ...func(o *CSVOptions)) *CSV

NewCSV creates a new CSV loader with an io.Reader and optional configuration options. It returns a pointer to the created CSV loader.

func (*CSV) Load ¶ added in v0.0.39

func (l *CSV) Load(ctx context.Context) ([]schema.Document, error)

Load loads CSV documents from the provided reader.

func (*CSV) LoadAndSplit ¶ added in v0.0.39

func (l *CSV) LoadAndSplit(ctx context.Context, splitter schema.TextSplitter) ([]schema.Document, error)

LoadAndSplit loads CSV documents from the provided reader and splits them using the specified text splitter.

type CSVOptions ¶ added in v0.0.41

type CSVOptions struct {
	// Separator is the rune used to separate fields in the CSV file.
	Separator rune

	// LazyQuotes controls whether the CSV reader should use lazy quotes mode.
	LazyQuotes bool

	// Columns is a list of column names to filter and include in the loaded documents.
	Columns []string
}

CSVOptions contains options for configuring the CSV loader.

type FileFilter ¶ added in v0.0.52

type FileFilter func(f *object.File) bool

FileFilter is a function that filters files based on specific criteria.

type Git ¶ added in v0.0.52

type Git struct {
	// contains filtered or unexported fields
}

Git is a Git-based implementation of the DocumentLoader interface.

func NewGit ¶ added in v0.0.52

func NewGit(r *git.Repository, optFns ...func(o *GitOptions)) *Git

NewGit creates a Git document loader from an existing Git repository and returns it. The options can be customized using functional options.

func NewGitFromCloneURL ¶ added in v0.0.52

func NewGitFromCloneURL(url string, optFns ...func(o *GitCloneURLOptions)) (*Git, error)

NewGitFromCloneURL clones a Git repository from a URL and returns a Git document loader. The options can be customized using functional options.

func NewGitFromCodeCommitURL ¶ added in v0.0.52

func NewGitFromCodeCommitURL(url string, creds aws.Credentials, optFns ...func(o *GitOptions)) (*Git, error)

NewGitFromCodeCommitURL clones a Git repository from an AWS CodeCommit URL using the provided AWS credentials, and returns a Git document loader. The options can be customized using functional options.

func NewGitFromPath ¶ added in v0.0.52

func NewGitFromPath(path string, optFns ...func(o *GitOptions)) (*Git, error)

NewGitFromPath opens an existing Git repository from a local path and returns a Git document loader. The options can be customized using functional options.

func (*Git) Load ¶ added in v0.0.52

func (l *Git) Load(ctx context.Context) ([]schema.Document, error)

Load retrieves documents from the Git repository and returns them as a slice of schema.Document.

func (*Git) LoadAndSplit ¶ added in v0.0.52

func (l *Git) LoadAndSplit(ctx context.Context, splitter schema.TextSplitter) ([]schema.Document, error)

LoadAndSplit retrieves documents from the Git repository, splits them using the provided TextSplitter, and returns the split documents as a slice of schema.Document.

type GitCloneURLOptions ¶ added in v0.0.52

type GitCloneURLOptions struct {
	GitOptions
	Auth transport.AuthMethod
}

GitCloneURLOptions holds options for Git repositories cloned from a URL.

type GitOptions ¶ added in v0.0.52

type GitOptions struct {
	Branch     string
	FileFilter FileFilter
}

GitOptions holds options for the Git document loader.

type HTML ¶ added in v0.0.55

type HTML struct {
	// contains filtered or unexported fields
}

HTML implements the DocumentLoader interface for HTML documents.

func NewHTML ¶ added in v0.0.55

func NewHTML(r io.Reader, optFns ...func(o *HTMLOptions)) *HTML

NewHTML creates a new HTML document loader with an io.Reader and optional configuration options. It returns a pointer to the created HTML loader.

func (*HTML) Load ¶ added in v0.0.55

func (l *HTML) Load(ctx context.Context) ([]schema.Document, error)

Load loads the HTML document from the reader and extracts the text content. It returns a list of schema.Document containing the extracted content and the title as metadata.

func (*HTML) LoadAndSplit ¶ added in v0.0.55

func (l *HTML) LoadAndSplit(ctx context.Context, splitter schema.TextSplitter) ([]schema.Document, error)

LoadAndSplit loads HTML documents from the provided reader and splits them using the specified text splitter.

type HTMLOptions ¶ added in v0.0.55

type HTMLOptions struct {
	// TagFilter is a list of HTML tags to be filtered from the document content.
	TagFilter []string
}

HTMLOptions contains options for the HTML document loader.

type Notebook ¶ added in v0.0.55

type Notebook struct {
	// contains filtered or unexported fields
}

Notebook represents a Jupyter Notebook document loader.

func NewNotebook ¶ added in v0.0.55

func NewNotebook(r io.Reader, optFns ...func(o *NotebookOptions)) *Notebook

NewNotebook creates a new instance of Notebook with the given reader and optional functions to set options.

func (*Notebook) Load ¶ added in v0.0.55

func (l *Notebook) Load(ctx context.Context) ([]schema.Document, error)

Load reads and parses the Jupyter Notebook from the provided reader. It returns a slice of schema.Document representing the notebook's content and any error encountered.

func (*Notebook) LoadAndSplit ¶ added in v0.0.55

func (l *Notebook) LoadAndSplit(ctx context.Context, splitter schema.TextSplitter) ([]schema.Document, error)

LoadAndSplit loads Notebook documents from the provided reader and splits them using the specified text splitter. It returns a slice of schema.Document representing the notebook's content and any error encountered.

type NotebookOptions ¶ added in v0.0.55

type NotebookOptions struct {
	// Include outputs (cell execution results) in the document content.
	IncludeOutputs bool

	// Include traceback information for cells with errors.
	Traceback bool

	// Maximum length of output text to include in the document.
	MaxOutputLength uint
}

NotebookOptions represents the options for loading a Jupyter Notebook.

type PDF ¶ added in v0.0.55

type PDF struct {
	// contains filtered or unexported fields
}

PDF represents a PDF document loader that implements the DocumentLoader interface.

func NewPDF ¶ added in v0.0.55

func NewPDF(f io.ReaderAt, size int64, optFns ...func(o *PDFOptions)) (*PDF, error)

NewPDF creates a new PDF loader with the given options.

func NewPDFFromFile ¶ added in v0.0.100

func NewPDFFromFile(f *os.File, optFns ...func(o *PDFOptions)) (*PDF, error)

NewPDFFromFile creates a new PDF loader with the given options.

func (*PDF) Load ¶ added in v0.0.55

func (l *PDF) Load(ctx context.Context) ([]schema.Document, error)

Load loads the PDF document and returns a slice of schema.Document containing the page contents and metadata.

func (*PDF) LoadAndSplit ¶ added in v0.0.55

func (l *PDF) LoadAndSplit(ctx context.Context, splitter schema.TextSplitter) ([]schema.Document, error)

LoadAndSplit loads PDF documents from the provided reader and splits them using the specified text splitter.

type PDFOptions ¶ added in v0.0.55

type PDFOptions struct {
	// Password for encrypted PDF files.
	Password string

	// Page number to start loading from (default is 1).
	StartPage uint

	// Maximum number of pages to load (0 for all pages).
	MaxPages uint

	// Source is the name of the PDF document.
	Source string
}

type Text ¶ added in v0.0.22

type Text struct {
	// contains filtered or unexported fields
}

func NewText ¶ added in v0.0.22

func NewText(r io.Reader) *Text

NewText creates a new Text document loader with the given reader.

func (*Text) Load ¶ added in v0.0.22

func (l *Text) Load(ctx context.Context) ([]schema.Document, error)

Load reads the content from the reader and returns it as a single document.

func (*Text) LoadAndSplit ¶ added in v0.0.22

func (l *Text) LoadAndSplit(ctx context.Context, splitter schema.TextSplitter) ([]schema.Document, error)

LoadAndSplit reads the content from the reader and splits it into multiple documents using the provided splitter.

type UniDocDOCX ¶ added in v0.0.97

type UniDocDOCX struct {
	// contains filtered or unexported fields
}

UniDocDOCX is a document loader for DOCX files using UniDoc.

func NewUniDocDOCX ¶ added in v0.0.97

func NewUniDocDOCX(parser UniDocParser, r io.ReaderAt, size int64, optFns ...func(o *UniDocDOCXOptions)) *UniDocDOCX

func NewUniDocDOCXFromFile ¶ added in v0.0.100

func NewUniDocDOCXFromFile(parser UniDocParser, f *os.File, optFns ...func(o *UniDocDOCXOptions)) *UniDocDOCX

NewUniDocDOCXFromFile creates a new instance of the UniDocDOCX loader from the given file.

func (*UniDocDOCX) Load ¶ added in v0.0.97

func (l *UniDocDOCX) Load(ctx context.Context) ([]schema.Document, error)

Load reads the document and extracts its content into schema.Document format.

func (*UniDocDOCX) LoadAndSplit ¶ added in v0.0.97

func (l *UniDocDOCX) LoadAndSplit(ctx context.Context, splitter schema.TextSplitter) ([]schema.Document, error)

LoadAndSplit loads DOCX documents from the provided reader and splits them using the specified text splitter.

type UniDocDOCXOptions ¶ added in v0.0.97

type UniDocDOCXOptions struct {
	IgnoreTables bool

	// Source is the name of the DOCX document.
	Source string
}

UniDocDOCXOptions contains options for configuring the UniDocDOCX loader.

type UniDocParser ¶ added in v0.0.97

type UniDocParser interface {
	ReadDocument(r io.ReaderAt, size int64) (unidoc.Document, error)
}

UniDocParser defines an interface for parsing documents using UniDoc.

type Unstructured ¶ added in v0.0.85

type Unstructured struct {
	// contains filtered or unexported fields
}

func NewUnstructured ¶ added in v0.0.85

func NewUnstructured(apiKey string, file *os.File) *Unstructured

func (*Unstructured) Load ¶ added in v0.0.85

func (l *Unstructured) Load(ctx context.Context) ([]schema.Document, error)

func (*Unstructured) LoadAndSplit ¶ added in v0.0.85

func (l *Unstructured) LoadAndSplit(ctx context.Context, splitter schema.TextSplitter) ([]schema.Document, error)

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL