Documentation ¶
Index ¶
Constants ¶
View Source
const (
	// HTMLParseEndErrorMsg is the message text describing that the HTML
	// parser is exiting because it received a stop command.
	// NOTE(review): presumably the text returned by HTMLParseEndError.Error — confirm.
	HTMLParseEndErrorMsg = "received stop command, exiting HTML parser"
)
Variables ¶
View Source
var ParseHTML model.ParseFunc = func(c *config.Config, reader io.ReadCloser) *model.Result { defer reader.Close() if c.Verbose { fmt.Println("--> parsing HTML...") } var err error var contents *HTMLContents var parseFn parseFunc = parseHTML exts := extHTML(c.Extensions) if c.TagWeights == nil { c.TagWeights = defaultTagWeights } if c.ExtraTagWeights != nil { for k, v := range c.ExtraTagWeights { c.TagWeights[k] = v } } if c.FullSite && c.Source != "" { var crawler *webCrawler crawler, err = newWebCrawler(parseFn, exts, c.Source, c.Verbose) if err != nil { return model.ErrResult(err) } contents = crawler.run(reader) } else { contents = parseFn(reader, c, exts, nil) } if c.Verbose { fmt.Println("--> parsed") } if err != nil { return &model.Result{Err: err} } if len(contents.lines) == 0 { return model.EmptyResult() } tags, title, lang := tagifyHTML(contents, c, exts) return &model.Result{ Meta: &model.Meta{ ContentType: config.HTML, DocTitle: title, DocHash: fmt.Sprintf("%x", contents.hash()), Lang: lang, }, RawTags: tags, Extensions: extension.MapResults(c.Extensions), } }
ParseHTML receives lines of raw HTML markup text from the Web and returns simple text, plus a list of prioritised tags (if tagify == true) based on the importance of the HTML tags which wrap sentences.
Example:
<h1>A story about foo <p> Foo was a good guy but, had a quite poor time management skills, therefore he had issues with shipping all his tasks. Though foo had heaps of other amazing skills, which gained him a fortune.
Result:
foo: 2 + 1 = 3, story: 2, management: 1 + 1 = 2, skills: 1 + 1 = 2.
Returns a *model.Result: the slice of tags is in RawTags, while Meta carries the title of the page (DocTitle), a version of the document based on the hashed contents (DocHash) and the language (Lang).
Functions ¶
This section is empty.
Types ¶
type HTMLContents ¶ added in v0.50.0
// HTMLContents stores text from target tags.
type HTMLContents struct {
// contains filtered or unexported fields
}
HTMLContents stores text from target tags.
func (*HTMLContents) Append ¶ added in v0.50.0
func (cnt *HTMLContents) Append(lineIndex int, tag string, data []byte)
func (*HTMLContents) Len ¶ added in v0.50.0
func (cnt *HTMLContents) Len() int
func (*HTMLContents) String ¶ added in v0.50.0
func (cnt *HTMLContents) String() string
func (*HTMLContents) Weigh ¶ added in v0.50.0
func (cnt *HTMLContents) Weigh(lineIndex int, weight float64)
type HTMLExtParseTag ¶ added in v0.50.0
type HTMLExtParseTag interface { HTMLExt // ParseTag returns true in case if the contents have been appended and false otherwise. ParseTag(cfg *config.Config, token *html.Token, lineIdx int, cnts *HTMLContents) (bool, error) }
HTMLExtParseTag is executed at the HTML parsing phase when dealing with an HTML tag.
type HTMLExtParseText ¶ added in v0.50.0
type HTMLExtParseText interface { HTMLExt // ParseText ... ParseText(cfg *config.Config, tagName, text string, lineIdx int) error }
HTMLExtParseText is executed at the HTML parsing phase when dealing with the text inside an HTML tag.
type HTMLExtTagify ¶ added in v0.50.0
type HTMLExtTagify interface { HTMLExt Tagify(cfg *config.Config, line *HTMLLine, tokenIndex map[string]*model.Tag) error }
HTMLExtTagify is executed during the token counting phase.
type HTMLParseEndError ¶ added in v0.50.0
// HTMLParseEndError is a field-less error type for the HTML parser.
// NOTE(review): presumably signals that the parser exited after receiving a
// stop command (see HTMLParseEndErrorMsg) — confirm against Error().
type HTMLParseEndError struct{}
func NewHTMLParseEndError ¶ added in v0.50.0
func NewHTMLParseEndError() *HTMLParseEndError
func (*HTMLParseEndError) Error ¶ added in v0.50.0
func (e *HTMLParseEndError) Error() string
Click to show internal directories.
Click to hide internal directories.