Documentation ¶
Index ¶
Constants ¶
This section is empty.
Variables ¶
View Source
var ( // Source sets target source. Source = func(v string) Option { return func(c *Config) { c.Source = v } } // Language ... Language = func(v string) Option { return func(c *Config) { c.Lang = v } } // Query sets CSS query for the target. Query = func(v string) Option { return func(c *Config) { c.Query = v } } // Timeout sets the overall deadline for the operation. Timeout = func(d time.Duration) Option { return func(c *Config) { c.Timeout = d } } // WaitFor sets CSS query for the target of In-Out. WaitFor = func(query string) Option { return func(c *Config) { c.WaitFor = query } } // WaitUntil sets page load duration to wait for. WaitUntil = func(d time.Duration) Option { return func(c *Config) { c.WaitUntil = d } } // Screenshot captures screenshot, Reader will ImgBytes of the image populated. Screenshot = func(v bool) Option { return func(c *Config) { c.Screenshot = v } } // UserAgent allows to specify custom User Agent of the HTTP headless calls. UserAgent = func(ua string) Option { return func(c *Config) { c.UserAgent = ua } } // Content sets content of the target. Content = func(v string) Option { return func(c *Config) { c.Content = v } } // TargetType sets content type of the target. TargetType = func(v ContentType) Option { return func(c *Config) { c.ContentType = v } } // Limit sets the limit of tags for the target. Limit = func(v int) Option { return func(c *Config) { c.Limit = v } } // Verbose enables high verbosity. Verbose = func(v bool) Option { return func(c *Config) { c.Verbose = v } } // NoStopWords enables stop-words exclusion from the output. NoStopWords = func(v bool) Option { return func(c *Config) { c.NoStopWords = v } } // StopWords allows to provide a custom set of stop-words. StopWords = func(v []string) Option { return func(c *Config) { c.StopWords = stopwords.Setup(stopwords.WordsSlice(v)) } } // ContentOnly ignores all none content related parts of the HTML page. 
ContentOnly = func(v bool) Option { return func(c *Config) { c.ContentOnly = v } } // FullSite tells parser to process full site (HTML only). FullSite = func(v bool) Option { return func(c *Config) { c.FullSite = v } } // TagWeightsString ... TagWeightsString = func(v string) Option { return func(c *Config) { c.TagWeights = ParseTagWeights(strings.NewReader(v), String) } } // TagWeightsJSON ... TagWeightsJSON = func(v string) Option { return func(c *Config) { f, err := os.Open(v) if err != nil { println(fmt.Errorf("error: can't open JSON file [%s]: %w", v, err)) return } r := bufio.NewReader(f) c.TagWeights = ParseTagWeights(r, JSON) f.Close() } } // ExtraTagWeightsString ... ExtraTagWeightsString = func(v string) Option { return func(c *Config) { c.ExtraTagWeights = ParseTagWeights(strings.NewReader(v), String) } } // TagWeightsJSON ... ExtraTagWeightsJSON = func(v string) Option { return func(c *Config) { f, err := os.Open(v) if err != nil { println(fmt.Errorf("error: can't open JSON file [%s]: %w", v, err)) return } r := bufio.NewReader(f) c.ExtraTagWeights = ParseTagWeights(r, JSON) f.Close() } } // ExcludeTagsString ... ExcludeTagsString = func(v string) Option { return func(c *Config) { c.ExcludeTags = ParseTagWeights(strings.NewReader(v), String) } } // AllTagWeights ... AllTagWeights = func(v bool) Option { return func(c *Config) { c.AllTagWeights = v } } AdjustScores = func(v bool) Option { return func(c *Config) { c.AdjustScores = v } } Extensions = func(v []extension.Extension) Option { return func(c *Config) { c.Extensions = make([]extension.Extension, len(v)) copy(c.Extensions, v) } } )
View Source
// ContentTypes holds the display names of the content types, listed in the
// same order as the ContentType constants (Unknown, Text, HTML, Markdown).
var ContentTypes = [...]string{"Unknown", "Text", "HTML", "Markdown"}
Functions ¶
func BytesToStrings ¶ added in v0.60.1
func DetectLang ¶ added in v0.59.0
DetectLang detects the language and sets up the stop words for it.
Types ¶
type Config ¶
type Config struct { Source string Lang string ContentType Content string Timeout time.Duration // headless Query string WaitFor string WaitUntil time.Duration Screenshot bool UserAgent string // misc Limit int Verbose bool NoStopWords bool SkipLang bool StopWords *stopwords.Register ContentOnly bool FullSite bool // weighing AllTagWeights bool TagWeights ExtraTagWeights TagWeights ExcludeTags TagWeights AdjustScores bool Extensions []extension.Extension // contains filtered or unexported fields }
Config ...
type ContentType ¶
// ContentType identifies the kind of content a target holds; see the
// Unknown, Text, HTML and Markdown constants and ContentTypeOf.
type ContentType byte
ContentType ...
const ( Unknown ContentType = iota Text HTML Markdown )
Content types
func ContentTypeOf ¶
func ContentTypeOf(contentType string) ContentType
ContentTypeOf returns ContentType based on string value.
type DefaultSegmenter ¶ added in v0.60.1
// DefaultSegmenter is built from a *Config by NewDefaultSegmenter and splits
// a byte slice into a slice of byte slices through its Segment method.
type DefaultSegmenter struct {
// contains filtered or unexported fields
}
func NewDefaultSegmenter ¶ added in v0.60.1
func NewDefaultSegmenter(c *Config) *DefaultSegmenter
func (*DefaultSegmenter) Segment ¶ added in v0.60.1
func (s *DefaultSegmenter) Segment(text []byte) [][]byte
type TagWeights ¶
TagWeights ...
func ParseTagWeights ¶ added in v0.50.0
func ParseTagWeights(reader io.Reader, readerType TagWeightsType) TagWeights
type TagWeightsType ¶ added in v0.50.0
// TagWeightsType selects the input format that ParseTagWeights expects from
// its reader; see the String and JSON constants.
type TagWeightsType byte
TagWeightsType ...
const ( String TagWeightsType = iota // <tagName1>:<tagScore1>|<tagName2>:<tagScore2> JSON // { "<tagName1>": <tagScore1>, "<tagName2>": <tagScore2> } )
Weight input types
Click to show internal directories.
Click to hide internal directories.