Documentation ¶
Index ¶
- Variables
- type AuthorRule
- type BaseResult
- type BaseRule
- type CanonicalRule
- type DateRule
- type DescriptionRule
- type ExtractFunc
- type ExtractResult
- type ExtractionStrategy
- type FaviconRule
- type FeedRule
- type LangRule
- type LeadImageRule
- type MultiStringResult
- type NoResult
- type PublisherRule
- type ReadableResult
- type ReadableRule
- type ReadableValue
- type Rule
- type SelectorInfo
- type SiteNameRule
- type StringResult
- type TitleRule
Constants ¶
This section is empty.
Variables ¶
var ErrInvalidImageFormat = errors.New("invalid image format")
var ErrValueNotFound = errors.New("no value found")
Functions ¶
This section is empty.
Types ¶
type AuthorRule ¶
type AuthorRule struct {
BaseRule
}
AuthorRule is the rule for extracting the author information from a page.
func NewAuthorRule ¶
func NewAuthorRule(strategies ...ExtractionStrategy) *AuthorRule
type BaseResult ¶
type BaseResult struct {
// contains filtered or unexported fields
}
func (*BaseResult) ApplyMetadata ¶
func (*BaseResult) Found ¶
func (r *BaseResult) Found() bool
func (*BaseResult) SelectorInfo ¶
func (r *BaseResult) SelectorInfo() SelectorInfo
type BaseRule ¶
type BaseRule struct {
Strategies []ExtractionStrategy
}
BaseRule is the base rule for all rules.
type CanonicalRule ¶
type CanonicalRule struct {
BaseRule
}
CanonicalRule is the rule for extracting the canonical URL of a page.
func NewCanonicalRule ¶
func NewCanonicalRule() *CanonicalRule
func (*CanonicalRule) Extract ¶
func (cr *CanonicalRule) Extract(node *html.Node, targetURL *url.URL) (ExtractResult, error)
type DateRule ¶
type DateRule struct {
BaseRule
}
DateRule is the rule for extracting the date information from a page.
func NewDateRule ¶
func NewDateRule(strategies ...ExtractionStrategy) *DateRule
type DescriptionRule ¶
type DescriptionRule struct {
BaseRule
}
DescriptionRule is the rule for extracting the description information from a page.
func NewDescriptionRule ¶
func NewDescriptionRule() *DescriptionRule
type ExtractFunc ¶
ExtractFunc is the function signature for all extractors that can be used in a strategy. It accepts the node to extract from, the target URL, and the selectors to use. It returns the value as an array of strings, a string indicating where it was found, and a boolean indicating if the value was found.
func ExtractAttr ¶
func ExtractAttr(attribute string) ExtractFunc
ExtractAttr extracts a selector from the given document using the given attribute.
type ExtractResult ¶
type ExtractResult interface {
ApplyMetadata(key string, u *url.URL, m *metadata.Metadata)
Found() bool
SelectorInfo() SelectorInfo
Value() any
}
ExtractResult is the result of an extraction.
func ExtractCSS ¶
ExtractCSS extracts the given CSS selector from the given document.
func ExtractJSONLD ¶
ExtractJSONLD extracts the given JSON-LD attribute from the given document.
func ExtractMeta ¶
ExtractMeta extracts the given meta tag from the given document.
type ExtractionStrategy ¶
type ExtractionStrategy struct {
Selectors []string
Extractor ExtractFunc
}
ExtractionStrategy is the strategy for extracting a value.
type FaviconRule ¶
type FaviconRule struct {
BaseRule
}
FaviconRule is the rule for extracting the favicon URL of a page.
func NewFaviconRule ¶
func NewFaviconRule() *FaviconRule
func (*FaviconRule) Extract ¶
func (r *FaviconRule) Extract(node *html.Node, targetURL *url.URL) (ExtractResult, error)
type FeedRule ¶
type FeedRule struct {
BaseRule
}
FeedRule is the rule for extracting the feed URL of a page. It will respond with an array of feed URLs it found.
func NewFeedRule ¶
func NewFeedRule() *FeedRule
type LangRule ¶
type LangRule struct {
BaseRule
}
LangRule is the rule for extracting the language information from a page.
func NewLangRule ¶
func NewLangRule() *LangRule
type LeadImageRule ¶
type LeadImageRule struct {
BaseRule
}
LeadImageRule is the rule for extracting the lead image from a page.
func NewLeadImageRule ¶
func NewLeadImageRule() *LeadImageRule
type MultiStringResult ¶
type MultiStringResult struct {
*BaseResult
// contains filtered or unexported fields
}
func NewMultiStringResult ¶
func NewMultiStringResult(value []string, selectorInfo SelectorInfo, found bool) *MultiStringResult
func (*MultiStringResult) ApplyMetadata ¶
func (*MultiStringResult) Found ¶
func (r *MultiStringResult) Found() bool
func (*MultiStringResult) Value ¶
func (r *MultiStringResult) Value() any
type NoResult ¶
type NoResult struct {
*BaseResult
}
func NewNoResult ¶
func NewNoResult() *NoResult
type PublisherRule ¶
type PublisherRule struct {
BaseRule
}
PublisherRule is the rule for extracting the publisher information from a page.
func NewPublisherRule ¶
func NewPublisherRule() *PublisherRule
type ReadableResult ¶
type ReadableResult struct {
*BaseResult
// contains filtered or unexported fields
}
func NewReadableResult ¶
func NewReadableResult(value ReadableValue, selectorInfo SelectorInfo, found bool) *ReadableResult
func (*ReadableResult) ApplyMetadata ¶
func (*ReadableResult) Value ¶
func (r *ReadableResult) Value() any
type ReadableRule ¶
type ReadableRule struct {
BaseRule
}
ReadableRule is the rule for extracting the readable content.
func NewReadableRule ¶
func NewReadableRule() *ReadableRule
NewReadableRule creates a new ReadableRule.
type ReadableValue ¶
type Rule ¶
type Rule interface {
// Extract extracts the value from the node
Extract(node *html.Node, targetURL *url.URL) (ExtractResult, error)
}
A Rule is a rule for extracting a value from a node. It encapsulates multiple strategies for extracting a value. Each strategy is tried in order of priority until a value is found, or all strategies have been tried.
type SelectorInfo ¶
type SiteNameRule ¶
type SiteNameRule struct {
BaseRule
}
SiteNameRule is the rule for extracting the site name information from a page.
func NewSiteNameRule ¶
func NewSiteNameRule() *SiteNameRule
func (*SiteNameRule) Extract ¶
func (r *SiteNameRule) Extract(node *html.Node, targetURL *url.URL) (ExtractResult, error)
type StringResult ¶
type StringResult struct {
*BaseResult
// contains filtered or unexported fields
}
func NewStringResult ¶
func NewStringResult(value string, selectorInfo SelectorInfo, found bool) *StringResult
func (*StringResult) ApplyMetadata ¶
func (*StringResult) Value ¶
func (r *StringResult) Value() any