Documentation ¶
Index ¶
- func CanBeNested(tagName string) bool
- type BaseElement
- type Document
- type DocumentBuilder
- type Element
- type ElementAction
- type Embed
- type Figure
- type Image
- type Table
- type Tag
- type TagType
- type Text
- func (t *Text) AddLabel(s string)
- func (t *Text) ElementType() string
- func (t Text) FirstNonWhitespaceTextNode() *html.Node
- func (t *Text) GenerateOutput(textOnly bool) string
- func (t Text) GetTextNodes() []*html.Node
- func (t *Text) HasLabel(s string) bool
- func (t Text) LastNonWhitespaceTextNode() *html.Node
- func (t *Text) String() string
- func (t *Text) TakeLabels() map[string]struct{}
- type TextBlock
- func (tb *TextBlock) AddLabels(labels ...string)
- func (tb *TextBlock) ApplyToModel()
- func (tb *TextBlock) FirstNonWhitespaceTextNode() *html.Node
- func (tb *TextBlock) HasLabel(label string) bool
- func (tb *TextBlock) IsContent() bool
- func (tb *TextBlock) LastNonWhitespaceTextNode() *html.Node
- func (tb *TextBlock) MergeNext(other *TextBlock)
- func (tb *TextBlock) OffsetBlocksEnd() int
- func (tb *TextBlock) OffsetBlocksStart() int
- func (tb *TextBlock) RemoveLabels(labels ...string)
- func (tb *TextBlock) SetIsContent(isContent bool) bool
- func (tb *TextBlock) String() string
- type TextBuilder
- type TextDocument
- type Video
- type WebDocumentBuilder
- func (db *WebDocumentBuilder) AddDataTable(table *html.Node)
- func (db *WebDocumentBuilder) AddEmbed(embed Element)
- func (db *WebDocumentBuilder) AddLineBreak(br *html.Node)
- func (db *WebDocumentBuilder) AddTag(tag *Tag)
- func (db *WebDocumentBuilder) AddTextNode(textNode *html.Node)
- func (db *WebDocumentBuilder) Build() *Document
- func (db *WebDocumentBuilder) EndNode()
- func (db *WebDocumentBuilder) SkipNode(e *html.Node)
- func (db *WebDocumentBuilder) StartNode(e *html.Node)
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func CanBeNested ¶
Types ¶
type BaseElement ¶
type BaseElement struct {
// contains filtered or unexported fields
}
BaseElement is the base of every other element.
func (*BaseElement) IsContent ¶
func (be *BaseElement) IsContent() bool
func (*BaseElement) SetIsContent ¶
func (be *BaseElement) SetIsContent(b bool)
type Document ¶
type Document struct {
Elements []Element
}
Document is a simplified view of the underlying webpage. It contains the logical elements (blocks of text, image + caption, video, etc.).
func NewDocument ¶
func NewDocument() *Document
func (*Document) AddElements ¶
func (*Document) CreateTextDocument ¶
func (doc *Document) CreateTextDocument() *TextDocument
CreateTextDocument generates a text document (TextDocument) to be processed by the distiller. Text groups have been introduced to help retain element order when adding images and embeds.
func (*Document) GenerateOutput ¶
func (*Document) GetImageURLs ¶
GetImageURLs returns a list of the source URLs of all images inside the document.
type DocumentBuilder ¶
type Element ¶
type Element interface {
// GenerateOutput generates HTML output for this Element.
GenerateOutput(textOnly bool) string
IsContent() bool
SetIsContent(bool)
ElementType() string
String() string
}
Element is some logical part of a web document (text block, image, video, table, etc.)
type ElementAction ¶
func GetActionForElement ¶
func GetActionForElement(element *html.Node) ElementAction
type Embed ¶
Embed is the base class for many site-specific embedded elements (Twitter, YouTube, etc.).
func (*Embed) ElementType ¶
func (*Embed) GenerateOutput ¶
type Image ¶
type Image struct {
BaseElement
Element *html.Node // node for the image
PageURL *nurl.URL // url of page where image is placed
// contains filtered or unexported fields
}
func (*Image) ElementType ¶
func (*Image) GenerateOutput ¶
type Table ¶
type Table struct {
BaseElement
Element *html.Node
PageURL *nurl.URL
// contains filtered or unexported fields
}
func (*Table) ElementType ¶
func (*Table) GenerateOutput ¶
func (*Table) GetImageURLs ¶
GetImageURLs returns a list of the source URLs of all images inside the table.
type Tag ¶
type Tag struct { BaseElement Name string Type TagType }
Tag represents an HTML tag that needs to be preserved.
func (*Tag) ElementType ¶
func (*Tag) GenerateOutput ¶
type Text ¶
type Text struct { BaseElement Text string NumWords int NumLinkedWords int Labels map[string]struct{} TagLevel int OffsetBlock int GroupNumber int PageURL *nurl.URL TextNodes []*html.Node Start int End int FirstWordNode int LastWordNode int }
func (*Text) ElementType ¶
func (Text) FirstNonWhitespaceTextNode ¶
func (*Text) GenerateOutput ¶
func (Text) GetTextNodes ¶
func (Text) LastNonWhitespaceTextNode ¶
func (*Text) TakeLabels ¶
type TextBlock ¶
type TextBlock struct {
TextElements []*Text
Text string
Labels map[string]struct{}
NumWords int
NumWordsInAnchor int
LinkDensity float64
TagLevel int
// contains filtered or unexported fields
}
TextBlock describes a block of text. A block can be an "atomic" text node (i.e., a sequence of text that is not interrupted by any HTML markup) or a compound of such atomic elements.
func NewTextBlock ¶
func (*TextBlock) ApplyToModel ¶
func (tb *TextBlock) ApplyToModel()
func (*TextBlock) FirstNonWhitespaceTextNode ¶
func (*TextBlock) LastNonWhitespaceTextNode ¶
func (*TextBlock) OffsetBlocksEnd ¶
func (*TextBlock) OffsetBlocksStart ¶
func (*TextBlock) RemoveLabels ¶
func (*TextBlock) SetIsContent ¶
SetIsContent sets the value of isContent. It returns true if the value of isContent changed.
type TextBuilder ¶
type TextBuilder struct {
// contains filtered or unexported fields
}
func NewTextBuilder ¶
func NewTextBuilder(wc stringutil.WordCounter) *TextBuilder
func (*TextBuilder) AddLineBreak ¶
func (tb *TextBuilder) AddLineBreak(node *html.Node)
func (*TextBuilder) AddTextNode ¶
func (tb *TextBuilder) AddTextNode(textNode *html.Node, tagLevel int)
func (*TextBuilder) Build ¶
func (tb *TextBuilder) Build(offsetBlock int) *Text
func (*TextBuilder) EnterAnchor ¶
func (tb *TextBuilder) EnterAnchor()
func (*TextBuilder) ExitAnchor ¶
func (tb *TextBuilder) ExitAnchor()
func (*TextBuilder) Reset ¶
func (tb *TextBuilder) Reset()
type TextDocument ¶
type TextDocument struct {
TextBlocks []*TextBlock
}
TextDocument is a text document consisting of one or more TextBlocks.
func NewTextDocument ¶
func NewTextDocument(textBlocks []*TextBlock) *TextDocument
func (*TextDocument) ApplyToModel ¶
func (td *TextDocument) ApplyToModel()
func (*TextDocument) CountWordsInContent ¶
func (td *TextDocument) CountWordsInContent() int
CountWordsInContent returns the total number of words in all content blocks.
func (*TextDocument) DebugString ¶
func (td *TextDocument) DebugString() string
DebugString returns detailed debugging information about the contained TextBlocks.
type Video ¶
type Video struct {
BaseElement
// TODO: Handle multiple nested "source" and "track" tags.
Element *html.Node
Width int
Height int
PageURL *nurl.URL
}
func (*Video) ElementType ¶
func (*Video) GenerateOutput ¶
type WebDocumentBuilder ¶
type WebDocumentBuilder struct {
// contains filtered or unexported fields
}
func NewWebDocumentBuilder ¶
func NewWebDocumentBuilder(wc stringutil.WordCounter, pageURL *nurl.URL) *WebDocumentBuilder
func (*WebDocumentBuilder) AddDataTable ¶
func (db *WebDocumentBuilder) AddDataTable(table *html.Node)
func (*WebDocumentBuilder) AddEmbed ¶
func (db *WebDocumentBuilder) AddEmbed(embed Element)
func (*WebDocumentBuilder) AddLineBreak ¶
func (db *WebDocumentBuilder) AddLineBreak(br *html.Node)
func (*WebDocumentBuilder) AddTag ¶
func (db *WebDocumentBuilder) AddTag(tag *Tag)
func (*WebDocumentBuilder) AddTextNode ¶
func (db *WebDocumentBuilder) AddTextNode(textNode *html.Node)
func (*WebDocumentBuilder) Build ¶
func (db *WebDocumentBuilder) Build() *Document
func (*WebDocumentBuilder) EndNode ¶
func (db *WebDocumentBuilder) EndNode()
func (*WebDocumentBuilder) SkipNode ¶
func (db *WebDocumentBuilder) SkipNode(e *html.Node)
func (*WebDocumentBuilder) StartNode ¶
func (db *WebDocumentBuilder) StartNode(e *html.Node)