documentloader

package

Version: v0.0.98 (latest) | Published: Dec 21, 2023 | License: MIT | Imports: 21 | Imported by: 1

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/hupe1980/golc

Links

Open Source Insights

Documentation ¶

Overview ¶

Package documentloader provides functionality for loading and processing documents.

Index ¶

Variables
type CSV
- func NewCSV(r io.Reader, optFns ...func(o *CSVOptions)) *CSV
- func (l *CSV) Load(ctx context.Context) ([]schema.Document, error)
- func (l *CSV) LoadAndSplit(ctx context.Context, splitter schema.TextSplitter) ([]schema.Document, error)
type CSVOptions
type FileFilter
type Git
- func NewGit(r *git.Repository, optFns ...func(o *GitOptions)) *Git
- func NewGitFromCloneURL(url string, optFns ...func(o *GitCloneURLOptions)) (*Git, error)
- func NewGitFromCodeCommitURL(url string, creds aws.Credentials, optFns ...func(o *GitOptions)) (*Git, error)
- func NewGitFromPath(path string, optFns ...func(o *GitOptions)) (*Git, error)
- func (l *Git) Load(ctx context.Context) ([]schema.Document, error)
- func (l *Git) LoadAndSplit(ctx context.Context, splitter schema.TextSplitter) ([]schema.Document, error)
type GitCloneURLOptions
type GitOptions
type HTML
- func NewHTML(r io.Reader, optFns ...func(o *HTMLOptions)) *HTML
- func (l *HTML) Load(ctx context.Context) ([]schema.Document, error)
- func (l *HTML) LoadAndSplit(ctx context.Context, splitter schema.TextSplitter) ([]schema.Document, error)
type HTMLOptions
type Notebook
- func NewNotebook(r io.Reader, optFns ...func(o *NotebookOptions)) *Notebook
- func (l *Notebook) Load(ctx context.Context) ([]schema.Document, error)
- func (l *Notebook) LoadAndSplit(ctx context.Context, splitter schema.TextSplitter) ([]schema.Document, error)
type NotebookOptions
type PDF
- func NewPDF(f *os.File, optFns ...func(o *PDFOptions)) (*PDF, error)
- func (l *PDF) Load(ctx context.Context) ([]schema.Document, error)
- func (l *PDF) LoadAndSplit(ctx context.Context, splitter schema.TextSplitter) ([]schema.Document, error)
type PDFOptions
type Text
- func NewText(r io.Reader) *Text
- func (l *Text) Load(ctx context.Context) ([]schema.Document, error)
- func (l *Text) LoadAndSplit(ctx context.Context, splitter schema.TextSplitter) ([]schema.Document, error)
type UniDocDOCX
- func NewUniDocDOCX(parser UniDocParser, f *os.File, optFns ...func(o *UniDocDOCXOptions)) *UniDocDOCX
- func (l *UniDocDOCX) Load(ctx context.Context) ([]schema.Document, error)
- func (l *UniDocDOCX) LoadAndSplit(ctx context.Context, splitter schema.TextSplitter) ([]schema.Document, error)
type UniDocDOCXOptions
type UniDocParser
type Unstructured
- func NewUnstructured(apiKey string, file *os.File) *Unstructured
- func (l *Unstructured) Load(ctx context.Context) ([]schema.Document, error)
- func (l *Unstructured) LoadAndSplit(ctx context.Context, splitter schema.TextSplitter) ([]schema.Document, error)

Constants ¶

This section is empty.

Variables ¶

View Source

var DefaultGitOptions = GitOptions{
	Branch:     "main",
	FileFilter: func(f *object.File) bool { return true },
}

DefaultGitOptions provides default Git options.

Functions ¶

This section is empty.

Types ¶

type CSV ¶ added in v0.0.39

type CSV struct {
	// contains filtered or unexported fields
}

CSV represents a CSV document loader.

func NewCSV ¶ added in v0.0.39

func NewCSV(r io.Reader, optFns ...func(o *CSVOptions)) *CSV

NewCSV creates a new CSV loader with an io.Reader and optional configuration options. It returns a pointer to the created CSV loader.

func (*CSV) Load ¶ added in v0.0.39

func (l *CSV) Load(ctx context.Context) ([]schema.Document, error)

Load loads CSV documents from the provided reader.

func (*CSV) LoadAndSplit ¶ added in v0.0.39

func (l *CSV) LoadAndSplit(ctx context.Context, splitter schema.TextSplitter) ([]schema.Document, error)

LoadAndSplit loads CSV documents from the provided reader and splits them using the specified text splitter.

type CSVOptions ¶ added in v0.0.41

type CSVOptions struct {
	// Separator is the rune used to separate fields in the CSV file.
	Separator rune

	// LazyQuotes controls whether the CSV reader should use lazy quotes mode.
	LazyQuotes bool

	// Columns is a list of column names to filter and include in the loaded documents.
	Columns []string
}

CSVOptions contains options for configuring the CSV loader.

type FileFilter ¶ added in v0.0.52

type FileFilter func(f *object.File) bool

FileFilter is a function that filters files based on specific criteria.

type Git ¶ added in v0.0.52

type Git struct {
	// contains filtered or unexported fields
}

Git is a Git-based implementation of the DocumentLoader interface.

func NewGit ¶ added in v0.0.52

func NewGit(r *git.Repository, optFns ...func(o *GitOptions)) *Git

NewGit creates a Git document loader from an existing Git repository and returns it. The options can be customized using functional options.

func NewGitFromCloneURL ¶ added in v0.0.52

func NewGitFromCloneURL(url string, optFns ...func(o *GitCloneURLOptions)) (*Git, error)

NewGitFromCloneURL clones a Git repository from a URL and returns a Git document loader. The options can be customized using functional options.

func NewGitFromCodeCommitURL ¶ added in v0.0.52

func NewGitFromCodeCommitURL(url string, creds aws.Credentials, optFns ...func(o *GitOptions)) (*Git, error)

NewGitFromCodeCommitURL clones a Git repository from an AWS CodeCommit URL using the provided AWS credentials, and returns a Git document loader. The options can be customized using functional options.

func NewGitFromPath ¶ added in v0.0.52

func NewGitFromPath(path string, optFns ...func(o *GitOptions)) (*Git, error)

NewGitFromPath opens an existing Git repository from a local path and returns a Git document loader. The options can be customized using functional options.

func (*Git) Load ¶ added in v0.0.52

func (l *Git) Load(ctx context.Context) ([]schema.Document, error)

Load retrieves documents from the Git repository and returns them as a slice of schema.Document.

func (*Git) LoadAndSplit ¶ added in v0.0.52

func (l *Git) LoadAndSplit(ctx context.Context, splitter schema.TextSplitter) ([]schema.Document, error)

LoadAndSplit retrieves documents from the Git repository, splits them using the provided TextSplitter, and returns the split documents as a slice of schema.Document.

type GitCloneURLOptions ¶ added in v0.0.52

type GitCloneURLOptions struct {
	GitOptions
	Auth transport.AuthMethod
}

GitCloneURLOptions holds options for Git repositories cloned from a URL.

type GitOptions ¶ added in v0.0.52

type GitOptions struct {
	Branch     string
	FileFilter FileFilter
}

GitOptions holds options for the Git document loader.

type HTML ¶ added in v0.0.55

type HTML struct {
	// contains filtered or unexported fields
}

HTML implements the DocumentLoader interface for HTML documents.

func NewHTML ¶ added in v0.0.55

func NewHTML(r io.Reader, optFns ...func(o *HTMLOptions)) *HTML

NewHTML creates a new HTML document loader with an io.Reader and optional configuration options. It returns a pointer to the created HTML loader.

func (*HTML) Load ¶ added in v0.0.55

func (l *HTML) Load(ctx context.Context) ([]schema.Document, error)

Load loads the HTML document from the reader and extracts the text content. It returns a slice of schema.Document containing the extracted content and the title as metadata.

func (*HTML) LoadAndSplit ¶ added in v0.0.55

func (l *HTML) LoadAndSplit(ctx context.Context, splitter schema.TextSplitter) ([]schema.Document, error)

LoadAndSplit loads HTML documents from the provided reader and splits them using the specified text splitter.

type HTMLOptions ¶ added in v0.0.55

type HTMLOptions struct {
	// TagFilter is a list of HTML tags to be filtered from the document content.
	TagFilter []string
}

HTMLOptions contains options for the HTML document loader.

type Notebook ¶ added in v0.0.55

type Notebook struct {
	// contains filtered or unexported fields
}

Notebook represents a Jupyter Notebook document loader.

func NewNotebook ¶ added in v0.0.55

func NewNotebook(r io.Reader, optFns ...func(o *NotebookOptions)) *Notebook

NewNotebook creates a new instance of Notebook with the given reader and optional functions to set options.

func (*Notebook) Load ¶ added in v0.0.55

func (l *Notebook) Load(ctx context.Context) ([]schema.Document, error)

Load reads and parses the Jupyter Notebook from the provided reader. It returns a slice of schema.Document representing the notebook's content and any error encountered.

func (*Notebook) LoadAndSplit ¶ added in v0.0.55

func (l *Notebook) LoadAndSplit(ctx context.Context, splitter schema.TextSplitter) ([]schema.Document, error)

LoadAndSplit loads Notebook documents from the provided reader and splits them using the specified text splitter. It returns a slice of schema.Document representing the notebook's content and any error encountered.

type NotebookOptions ¶ added in v0.0.55

type NotebookOptions struct {
	// Include outputs (cell execution results) in the document content.
	IncludeOutputs bool

	// Include traceback information for cells with errors.
	Traceback bool

	// Maximum length of output text to include in the document.
	MaxOutputLength uint
}

NotebookOptions represents the options for loading a Jupyter Notebook.

type PDF ¶ added in v0.0.55

type PDF struct {
	// contains filtered or unexported fields
}

PDF represents a PDF document loader that implements the DocumentLoader interface.

func NewPDF ¶ added in v0.0.55

func NewPDF(f *os.File, optFns ...func(o *PDFOptions)) (*PDF, error)

NewPDF creates a new PDF loader with the given options.

func (*PDF) Load ¶ added in v0.0.55

func (l *PDF) Load(ctx context.Context) ([]schema.Document, error)

Load loads the PDF document and returns a slice of schema.Document containing the page contents and metadata.

func (*PDF) LoadAndSplit ¶ added in v0.0.55

func (l *PDF) LoadAndSplit(ctx context.Context, splitter schema.TextSplitter) ([]schema.Document, error)

LoadAndSplit loads PDF documents from the provided reader and splits them using the specified text splitter.

type PDFOptions ¶ added in v0.0.55

type PDFOptions struct {
	// Password for encrypted PDF files.
	Password string

	// Page number to start loading from (default is 1).
	StartPage uint

	// Maximum number of pages to load (0 for all pages).
	MaxPages uint
}

type Text ¶ added in v0.0.22

type Text struct {
	// contains filtered or unexported fields
}

func NewText ¶ added in v0.0.22

func NewText(r io.Reader) *Text

NewText creates a new Text document loader with the given reader.

func (*Text) Load ¶ added in v0.0.22

func (l *Text) Load(ctx context.Context) ([]schema.Document, error)

Load reads the content from the reader and returns it as a single document.

func (*Text) LoadAndSplit ¶ added in v0.0.22

func (l *Text) LoadAndSplit(ctx context.Context, splitter schema.TextSplitter) ([]schema.Document, error)

LoadAndSplit reads the content from the reader and splits it into multiple documents using the provided splitter.

type UniDocDOCX ¶ added in v0.0.97

type UniDocDOCX struct {
	// contains filtered or unexported fields
}

UniDocDOCX is a document loader for DOCX files using UniDoc.

func NewUniDocDOCX ¶ added in v0.0.97

func NewUniDocDOCX(parser UniDocParser, f *os.File, optFns ...func(o *UniDocDOCXOptions)) *UniDocDOCX

NewUniDocDOCX creates a new instance of UniDocDOCX loader.

func (*UniDocDOCX) Load ¶ added in v0.0.97

func (l *UniDocDOCX) Load(ctx context.Context) ([]schema.Document, error)

Load reads the document and extracts its content into schema.Document format.

func (*UniDocDOCX) LoadAndSplit ¶ added in v0.0.97

func (l *UniDocDOCX) LoadAndSplit(ctx context.Context, splitter schema.TextSplitter) ([]schema.Document, error)

LoadAndSplit loads DOCX documents from the provided reader and splits them using the specified text splitter.

type UniDocDOCXOptions ¶ added in v0.0.97

type UniDocDOCXOptions struct {
	IgnoreTables bool
}

UniDocDOCXOptions contains options for configuring the UniDocDOCX loader.

type UniDocParser ¶ added in v0.0.97

type UniDocParser interface {
	ReadDocument(f *os.File) (unidoc.Document, error)
}

UniDocParser defines an interface for parsing documents using UniDoc.

type Unstructured ¶ added in v0.0.85

type Unstructured struct {
	// contains filtered or unexported fields
}

func NewUnstructured ¶ added in v0.0.85

func NewUnstructured(apiKey string, file *os.File) *Unstructured

func (*Unstructured) Load ¶ added in v0.0.85

func (l *Unstructured) Load(ctx context.Context) ([]schema.Document, error)

func (*Unstructured) LoadAndSplit ¶ added in v0.0.85

func (l *Unstructured) LoadAndSplit(ctx context.Context, splitter schema.TextSplitter) ([]schema.Document, error)

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL