Documentation ¶
Index ¶
Constants ¶
This section is empty.
Variables ¶
var ( // Verbose enables high verbosity. Verbose = func(verbose bool) ParseOption { return func(c *parseConfig) { c.verbose = verbose } } // NoStopWords enables stop-words exclusion from the output. NoStopWords = func(noStopWords bool) ParseOption { return func(c *parseConfig) { c.noStopWords = noStopWords } } // ContentOnly ignores all none content related parts of the HTML page (HTML only). ContentOnly = func(v bool) ParseOption { return func(c *parseConfig) { c.contentOnly = v } } // FullSite tells parser to process full site (HTML only). FullSite = func(v bool) ParseOption { return func(c *parseConfig) { c.fullSite = v } } // Source of the parser. Source = func(v string) ParseOption { return func(c *parseConfig) { c.source = v } } )
Functions ¶
func SplitToSentences ¶ added in v0.24.0
SplitToSentences splits the given text into a slice of sentences.
Types ¶
type InputReader ¶ added in v0.22.0
// InputReader provides line-based access to an input source and embeds
// io.ReadCloser so the source can also be streamed and closed.
type InputReader interface {
	// ReadLines returns the input as a slice of lines.
	// NOTE(review): whether this reads the whole input or a chunk is not
	// visible here — confirm against implementations.
	ReadLines() ([]string, error)
	io.ReadCloser
}
InputReader provides line-based reading over a closable input source.
type ParseFunc ¶ added in v0.40.0
// ParseFunc represents an arbitrary parse handler, which goes through the
// given reader and produces tags wrapped in a *ParseOutput. Implementations
// in this package (ParseHTML, ParseMD, ParseText) close the reader themselves.
type ParseFunc func(reader io.ReadCloser, options ...ParseOption) *ParseOutput
ParseFunc represents an arbitrary handler, which goes through given reader and produces tags.
var ParseHTML ParseFunc = func(reader io.ReadCloser, options ...ParseOption) *ParseOutput { defer reader.Close() c := &parseConfig{} for _, option := range options { option(c) } if c.verbose { fmt.Println("--> parsing HTML...") } var err error var contents *htmlContents var parseFn parseFunc = parseHTML if c.fullSite && c.source != "" { var crawler *webCrawler crawler, err = newWebCrawler(parseFn, c.source, c.verbose) if err != nil { return &ParseOutput{Err: err} } contents = crawler.run(reader) } else { contents = parseFn(reader, nil) } if c.verbose { fmt.Println("--> parsed") fmt.Printf("%s\n", contents) } if err != nil { return &ParseOutput{Err: err} } if len(contents.lines) == 0 { return &ParseOutput{} } tags, title := tagifyHTML(contents, c.verbose, c.noStopWords, c.contentOnly) return &ParseOutput{Tags: tags, DocTitle: title, DocHash: contents.hash()} }
ParseHTML receives lines of raw HTML markup text from the Web and returns simple text, plus a list of prioritised tags based on the importance of the HTML tags which wrap sentences.
Example:
<h1>A story about foo <p> Foo was a good guy but, had a quite poor time management skills, therefore he had issues with shipping all his tasks. Though foo had heaps of other amazing skills, which gained him a fortune.
Result:
foo: 2 + 1 = 3, story: 2, management: 1 + 1 = 2, skills: 1 + 1 = 2.
The returned ParseOutput carries a slice of tags, the title of the page, and a version of the document derived from a hash of its contents.
var ParseMD ParseFunc = func(in io.ReadCloser, options ...ParseOption) *ParseOutput { c := &parseConfig{} for _, option := range options { option(c) } if c.verbose { fmt.Println("--> parsing Markdown...") } defer in.Close() contents := parseMD(in) if c.verbose { fmt.Println("--> parsed") fmt.Printf("%s\n", contents) } tags, title := tagifyMD(contents, c.verbose, c.noStopWords) return &ParseOutput{Tags: tags, DocTitle: title, DocHash: contents.hash()} }
ParseMD parses the given Markdown document input into a slice of tags.
var ParseText ParseFunc = func(in io.ReadCloser, options ...ParseOption) *ParseOutput { c := &parseConfig{} for _, option := range options { option(c) } if c.verbose { fmt.Println("parsing plain text...") } var docsCount int defer in.Close() buf := new(bytes.Buffer) _, _ = buf.ReadFrom(in) inStr := buf.String() lines := strings.FieldsFunc(inStr, func(r rune) bool { return r == '\n' }) if c.verbose { fmt.Printf("got %d lines\n", len(lines)) } if len(lines) == 0 { return &ParseOutput{} } tokenIndex := make(map[string]*Tag) tokens := make([]string, 0) for _, l := range lines { sentences := SplitToSentences([]byte(l)) for _, s := range sentences { docsCount++ tokens = append(tokens, sanitize(bytes.Fields(s), c.noStopWords)...) visited := map[string]bool{} for _, token := range tokens { visited[token] = true item, ok := tokenIndex[token] if !ok { item = &Tag{Value: token} tokenIndex[token] = item } item.Score++ item.Count++ } for token := range visited { tokenIndex[token].Docs++ } } } for _, v := range tokenIndex { v.DocsCount = docsCount } return &ParseOutput{Tags: flatten(tokenIndex), DocHash: hashTokens(tokens)} }
ParseText parses the given lines of text into a slice of tags.
type ParseOption ¶ added in v0.40.0
// ParseOption is a functional option that mutates the parser configuration
// (verbosity, stop-word handling, content-only, full-site, source).
type ParseOption func(*parseConfig)
ParseOption allows customising the `Tagger` configuration.
type ParseOutput ¶ added in v0.40.0
ParseOutput is a result of the `ParseFunc`.
type Tag ¶
// Tag holds some arbitrary string value (e.g. a word) along with
// statistics describing its importance within the parsed text.
type Tag struct {
	// Value of the tag, i.e. a word.
	Value string
	// Score used to represent importance of the tag.
	Score float64
	// Count is the number of times the tag appeared in a text.
	Count int
	// Docs is the number of documents in a text in which the tag appeared.
	Docs int
	// DocsCount is the total number of documents in a text.
	DocsCount int
}
Tag holds some arbitrary string value (e.g. a word) along with some extra data about it.