webdoc

package
v0.0.0-...-25b8d04 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Sep 26, 2024 License: MIT Imports: 10 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func CanBeNested

func CanBeNested(tagName string) bool

Types

type BaseElement

type BaseElement struct {
	// contains filtered or unexported fields
}

BaseElement is the base of every other element.

func (*BaseElement) IsContent

func (be *BaseElement) IsContent() bool

func (*BaseElement) SetIsContent

func (be *BaseElement) SetIsContent(b bool)

type Document

type Document struct {
	Elements []Element
}

Document is a simplified view of the underlying webpage. It contains the logical elements (blocks of text, image + caption, video, etc.).

func NewDocument

func NewDocument() *Document

func (*Document) AddElements

func (doc *Document) AddElements(elements ...Element)

func (*Document) CreateTextDocument

func (doc *Document) CreateTextDocument() *TextDocument

CreateTextDocument generates a web document to be processed by the distiller. Text groups have been introduced to help retain element order when adding images and embeds.

func (*Document) GenerateOutput

func (doc *Document) GenerateOutput(textOnly bool) string

func (*Document) GetImageURLs

func (doc *Document) GetImageURLs() []string

GetImageURLs returns the list of source URLs of all images inside the document.

type DocumentBuilder

type DocumentBuilder interface {
	SkipNode(e *html.Node)
	StartNode(e *html.Node)
	EndNode()
	AddTextNode(textNode *html.Node)
	AddLineBreak(node *html.Node)
	AddDataTable(e *html.Node)
	AddTag(tag *Tag)
	AddEmbed(embed Element)
}

type Element

type Element interface {
	// GenerateOutput generates HTML output for this Element.
	GenerateOutput(textOnly bool) string
	IsContent() bool
	SetIsContent(bool)
	ElementType() string
	String() string
}

Element is some logical part of a web document (text block, image, video, table, etc.).

type ElementAction

type ElementAction struct {
	Flush           bool
	IsAnchor        bool
	ChangesTagLevel bool
	Labels          []string
}

func GetActionForElement

func GetActionForElement(element *html.Node) ElementAction

type Embed

type Embed struct {
	BaseElement

	Element *html.Node
	ID      string
	Type    string
	Params  map[string]string
}

Embed is the base type for many site-specific embedded elements (Twitter, YouTube, etc.).

func (*Embed) ElementType

func (e *Embed) ElementType() string

func (*Embed) GenerateOutput

func (e *Embed) GenerateOutput(textOnly bool) string

func (*Embed) String

func (e *Embed) String() string

type Figure

type Figure struct {
	Image
	Caption *html.Node
}

func (*Figure) ElementType

func (f *Figure) ElementType() string

func (*Figure) GenerateOutput

func (f *Figure) GenerateOutput(textOnly bool) string

type Image

type Image struct {
	BaseElement
	Element *html.Node // node for the image
	PageURL *nurl.URL  // url of page where image is placed
	// contains filtered or unexported fields
}

func (*Image) ElementType

func (i *Image) ElementType() string

func (*Image) GenerateOutput

func (i *Image) GenerateOutput(textOnly bool) string

func (*Image) GetURLs

func (i *Image) GetURLs() []string

GetURLs returns the list of source URLs of this image.

func (*Image) String

func (i *Image) String() string

type Table

type Table struct {
	BaseElement

	Element *html.Node
	PageURL *nurl.URL
	// contains filtered or unexported fields
}

func (*Table) ElementType

func (t *Table) ElementType() string

func (*Table) GenerateOutput

func (t *Table) GenerateOutput(textOnly bool) string

func (*Table) GetImageURLs

func (t *Table) GetImageURLs() []string

GetImageURLs returns the list of source URLs of all images inside the table.

func (*Table) String

func (t *Table) String() string

type Tag

type Tag struct {
	BaseElement
	Name string
	Type TagType
}

Tag represents an HTML tag that needs to be preserved.

func NewTag

func NewTag(name string, tagType TagType) *Tag

func (*Tag) ElementType

func (t *Tag) ElementType() string

func (*Tag) GenerateOutput

func (t *Tag) GenerateOutput(textOnly bool) string

func (*Tag) String

func (t *Tag) String() string

type TagType

type TagType uint
const (
	TagStart TagType = iota
	TagEnd
)

type Text

type Text struct {
	BaseElement

	Text           string
	NumWords       int
	NumLinkedWords int
	Labels         map[string]struct{}
	TagLevel       int
	OffsetBlock    int
	GroupNumber    int
	PageURL        *nurl.URL

	TextNodes     []*html.Node
	Start         int
	End           int
	FirstWordNode int
	LastWordNode  int
}

func (*Text) AddLabel

func (t *Text) AddLabel(s string)

func (*Text) ElementType

func (t *Text) ElementType() string

func (Text) FirstNonWhitespaceTextNode

func (t Text) FirstNonWhitespaceTextNode() *html.Node

func (*Text) GenerateOutput

func (t *Text) GenerateOutput(textOnly bool) string

func (Text) GetTextNodes

func (t Text) GetTextNodes() []*html.Node

func (*Text) HasLabel

func (t *Text) HasLabel(s string) bool

func (Text) LastNonWhitespaceTextNode

func (t Text) LastNonWhitespaceTextNode() *html.Node

func (*Text) String

func (t *Text) String() string

func (*Text) TakeLabels

func (t *Text) TakeLabels() map[string]struct{}

type TextBlock

type TextBlock struct {
	TextElements     []*Text
	Text             string
	Labels           map[string]struct{}
	NumWords         int
	NumWordsInAnchor int
	LinkDensity      float64
	TagLevel         int
	// contains filtered or unexported fields
}

TextBlock describes a block of text. A block can be an "atomic" text node (i.e., a sequence of text that is not interrupted by any HTML markup) or a compound of such atomic elements.

func NewTextBlock

func NewTextBlock(textElements ...*Text) *TextBlock

func (*TextBlock) AddLabels

func (tb *TextBlock) AddLabels(labels ...string)

func (*TextBlock) ApplyToModel

func (tb *TextBlock) ApplyToModel()

func (*TextBlock) FirstNonWhitespaceTextNode

func (tb *TextBlock) FirstNonWhitespaceTextNode() *html.Node

func (*TextBlock) HasLabel

func (tb *TextBlock) HasLabel(label string) bool

func (*TextBlock) IsContent

func (tb *TextBlock) IsContent() bool

func (*TextBlock) LastNonWhitespaceTextNode

func (tb *TextBlock) LastNonWhitespaceTextNode() *html.Node

func (*TextBlock) MergeNext

func (tb *TextBlock) MergeNext(other *TextBlock)

func (*TextBlock) OffsetBlocksEnd

func (tb *TextBlock) OffsetBlocksEnd() int

func (*TextBlock) OffsetBlocksStart

func (tb *TextBlock) OffsetBlocksStart() int

func (*TextBlock) RemoveLabels

func (tb *TextBlock) RemoveLabels(labels ...string)

func (*TextBlock) SetIsContent

func (tb *TextBlock) SetIsContent(isContent bool) bool

SetIsContent sets the value of isContent. It returns true if the value of isContent changed.

func (*TextBlock) String

func (tb *TextBlock) String() string

type TextBuilder

type TextBuilder struct {
	// contains filtered or unexported fields
}

func NewTextBuilder

func NewTextBuilder(wc stringutil.WordCounter) *TextBuilder

func (*TextBuilder) AddLineBreak

func (tb *TextBuilder) AddLineBreak(node *html.Node)

func (*TextBuilder) AddTextNode

func (tb *TextBuilder) AddTextNode(textNode *html.Node, tagLevel int)

func (*TextBuilder) Build

func (tb *TextBuilder) Build(offsetBlock int) *Text

func (*TextBuilder) EnterAnchor

func (tb *TextBuilder) EnterAnchor()

func (*TextBuilder) ExitAnchor

func (tb *TextBuilder) ExitAnchor()

func (*TextBuilder) Reset

func (tb *TextBuilder) Reset()

type TextDocument

type TextDocument struct {
	TextBlocks []*TextBlock
}

TextDocument is a text document consisting of one or more TextBlocks.

func NewTextDocument

func NewTextDocument(textBlocks []*TextBlock) *TextDocument

func (*TextDocument) ApplyToModel

func (td *TextDocument) ApplyToModel()

func (*TextDocument) CountWordsInContent

func (td *TextDocument) CountWordsInContent() int

CountWordsInContent returns the total number of words in content blocks.

func (*TextDocument) DebugString

func (td *TextDocument) DebugString() string

DebugString returns detailed debugging information about the contained TextBlocks.

type Video

type Video struct {
	BaseElement

	// TODO: Handle multiple nested "source" and "track" tags.
	Element *html.Node
	Width   int
	Height  int
	PageURL *nurl.URL
}

func NewVideo

func NewVideo(node *html.Node, pageURL *nurl.URL, width, height int) *Video

func (*Video) ElementType

func (v *Video) ElementType() string

func (*Video) GenerateOutput

func (v *Video) GenerateOutput(textOnly bool) string

func (*Video) String

func (v *Video) String() string

type WebDocumentBuilder

type WebDocumentBuilder struct {
	// contains filtered or unexported fields
}

func NewWebDocumentBuilder

func NewWebDocumentBuilder(wc stringutil.WordCounter, pageURL *nurl.URL) *WebDocumentBuilder

func (*WebDocumentBuilder) AddDataTable

func (db *WebDocumentBuilder) AddDataTable(table *html.Node)

func (*WebDocumentBuilder) AddEmbed

func (db *WebDocumentBuilder) AddEmbed(embed Element)

func (*WebDocumentBuilder) AddLineBreak

func (db *WebDocumentBuilder) AddLineBreak(br *html.Node)

func (*WebDocumentBuilder) AddTag

func (db *WebDocumentBuilder) AddTag(tag *Tag)

func (*WebDocumentBuilder) AddTextNode

func (db *WebDocumentBuilder) AddTextNode(textNode *html.Node)

func (*WebDocumentBuilder) Build

func (db *WebDocumentBuilder) Build() *Document

func (*WebDocumentBuilder) EndNode

func (db *WebDocumentBuilder) EndNode()

func (*WebDocumentBuilder) SkipNode

func (db *WebDocumentBuilder) SkipNode(e *html.Node)

func (*WebDocumentBuilder) StartNode

func (db *WebDocumentBuilder) StartNode(e *html.Node)

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL