processor

package
v0.45.0 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Apr 6, 2021 License: Apache-2.0 Imports: 19 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

View Source
var (
	// Verbose switches the diagnostic output of the parser on or off.
	Verbose = func(verbose bool) ParseOption {
		return func(cfg *parseConfig) { cfg.verbose = verbose }
	}

	// NoStopWords controls whether stop-words are excluded from the output.
	NoStopWords = func(noStopWords bool) ParseOption {
		return func(cfg *parseConfig) { cfg.noStopWords = noStopWords }
	}

	// ContentOnly makes the parser ignore all non-content related parts
	// of the HTML page (HTML only).
	ContentOnly = func(v bool) ParseOption {
		return func(cfg *parseConfig) { cfg.contentOnly = v }
	}

	// FullSite tells the parser to process the full site (HTML only).
	FullSite = func(v bool) ParseOption {
		return func(cfg *parseConfig) { cfg.fullSite = v }
	}

	// Source sets the source of the parsed input.
	Source = func(v string) ParseOption {
		return func(cfg *parseConfig) { cfg.source = v }
	}
)

Functions

func SplitToSentences added in v0.24.0

func SplitToSentences(text []byte) [][]byte

SplitToSentences splits the given text into a slice of sentences.

func ToStrings added in v0.8.0

func ToStrings(items []*Tag) []string

ToStrings transforms the given list of tags into a list of strings.

Types

type InputReader added in v0.22.0

// InputReader is an io.ReadCloser which can additionally hand out its
// contents as a slice of strings.
type InputReader interface {
	// ReadLines returns the input as a slice of strings, or an error.
	// NOTE(review): presumably one element per line of input - confirm
	// against the implementations.
	ReadLines() ([]string, error)
	io.ReadCloser
}

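InputReader is an io.ReadCloser that additionally exposes ReadLines, which returns the input as a slice of strings.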

type ParseFunc added in v0.40.0

// ParseFunc is the signature shared by ParseHTML, ParseMD and ParseText:
// it takes a reader plus optional ParseOption values and returns the
// outcome as a *ParseOutput.
type ParseFunc func(reader io.ReadCloser, options ...ParseOption) *ParseOutput

ParseFunc represents an arbitrary handler, which goes through given reader and produces tags.

// ParseHTML parses HTML markup read from reader into a list of prioritised tags.
//
// When the FullSite option is set and Source is non-empty, the input is
// handed to a web crawler created for that source; otherwise only the given
// reader is parsed. The reader is closed before ParseHTML returns.
//
// The returned ParseOutput holds the tags, the document title and the hash of
// the parsed contents. Err is set when the web crawler cannot be created, and
// an empty ParseOutput is returned when parsing produced no lines.
var ParseHTML ParseFunc = func(reader io.ReadCloser, options ...ParseOption) *ParseOutput {

	defer reader.Close()

	c := &parseConfig{}

	for _, option := range options {
		option(c)
	}

	if c.verbose {
		fmt.Println("--> parsing HTML...")
	}

	var contents *htmlContents
	var parseFn parseFunc = parseHTML

	if c.fullSite && c.source != "" {
		// Creating the crawler is the only step which can fail, so the error
		// is scoped to this branch and handled right away.
		crawler, err := newWebCrawler(parseFn, c.source, c.verbose)
		if err != nil {
			return &ParseOutput{Err: err}
		}
		contents = crawler.run(reader)
	} else {
		contents = parseFn(reader, nil)
	}

	// Nothing was parsed: there is nothing to tagify or to hash.
	if len(contents.lines) == 0 {
		return &ParseOutput{}
	}

	tags, title := tagifyHTML(contents, c.verbose, c.noStopWords, c.contentOnly)

	return &ParseOutput{Tags: tags, DocTitle: title, DocHash: contents.hash()}
}

ParseHTML receives raw HTML markup text from the Web through the given reader and produces a list of prioritised tags, scored by the importance of the HTML tags which wrap the sentences.

Example:

<h1>A story about foo
<p> Foo was a good guy but, had a quite poor time management skills,
therefore he had issues with shipping all his tasks. Though foo had heaps
of other amazing skills, which gained him a fortune.

Result:

foo: 2 + 1 = 3, story: 2, management: 1 + 1 = 2, skills: 1 + 1 = 2.

Returns a ParseOutput which holds the slice of tags (Tags), the title of the page (DocTitle) and a version of the document based on the hashed contents (DocHash). If the web crawler cannot be created, only Err is set.

// ParseMD parses the Markdown document read from in into a slice of tags.
//
// The reader is closed before ParseMD returns. The returned ParseOutput holds
// the tags, the document title and the hash of the parsed contents.
var ParseMD ParseFunc = func(in io.ReadCloser, options ...ParseOption) *ParseOutput {
	cfg := new(parseConfig)
	for i := range options {
		options[i](cfg)
	}

	// The options have been applied, so the flag is stable from here on.
	verbose := cfg.verbose
	if verbose {
		fmt.Println("--> parsing Markdown...")
	}

	defer in.Close()

	doc := parseMD(in)
	if verbose {
		fmt.Println("--> parsed")
		fmt.Printf("%s\n", doc)
	}

	tags, title := tagifyMD(doc, verbose, cfg.noStopWords)

	out := &ParseOutput{}
	out.Tags = tags
	out.DocTitle = title
	out.DocHash = doc.hash()
	return out
}

ParseMD parses given Markdown document input into a slice of tags.

// ParseText parses the plain text read from in into a slice of tags.
//
// The input is split into non-empty lines and every line into sentences.
// Each sentence is treated as a separate document: Tag.Docs is the number of
// sentences a token appears in and Tag.DocsCount is the total number of
// sentences. Tag.Count and Tag.Score are incremented once per occurrence of
// the token.
//
// The reader is closed before ParseText returns. If reading from in fails,
// the error is returned in ParseOutput.Err. Empty input yields an empty
// ParseOutput.
var ParseText ParseFunc = func(in io.ReadCloser, options ...ParseOption) *ParseOutput {

	c := &parseConfig{}

	for _, option := range options {
		option(c)
	}

	if c.verbose {
		fmt.Println("parsing plain text...")
	}

	defer in.Close()

	var buf bytes.Buffer
	if _, err := buf.ReadFrom(in); err != nil {
		return &ParseOutput{Err: err}
	}

	lines := strings.FieldsFunc(buf.String(), func(r rune) bool {
		return r == '\n'
	})

	if c.verbose {
		fmt.Printf("got %d lines\n", len(lines))
	}

	if len(lines) == 0 {
		return &ParseOutput{}
	}

	var docsCount int

	tokenIndex := make(map[string]*Tag)
	// tokens accumulates the tokens of the whole text; it is only used for
	// hashing the document.
	tokens := make([]string, 0)
	for _, l := range lines {
		for _, s := range SplitToSentences([]byte(l)) {
			docsCount++

			// Count only the tokens of the current sentence. Iterating over
			// the accumulated tokens would count the earlier sentences again
			// on every iteration.
			sentenceTokens := sanitize(bytes.Fields(s), c.noStopWords)
			tokens = append(tokens, sentenceTokens...)

			visited := make(map[string]struct{}, len(sentenceTokens))
			for _, token := range sentenceTokens {
				item, ok := tokenIndex[token]
				if !ok {
					item = &Tag{Value: token}
					tokenIndex[token] = item
				}
				item.Score++
				item.Count++

				// A token counts once per sentence towards Docs.
				if _, seen := visited[token]; !seen {
					visited[token] = struct{}{}
					item.Docs++
				}
			}
		}
	}

	for _, v := range tokenIndex {
		v.DocsCount = docsCount
	}

	return &ParseOutput{Tags: flatten(tokenIndex), DocHash: hashTokens(tokens)}
}

ParseText parses the given lines of plain text into a slice of tags.

type ParseOption added in v0.40.0

// ParseOption sets a value on the parser configuration. ParseHTML, ParseMD
// and ParseText each apply the given options to a fresh parseConfig before
// parsing.
type ParseOption func(*parseConfig)

ParseOption customises the configuration used by a `ParseFunc`.

type ParseOutput added in v0.40.0

// ParseOutput is the result returned by a ParseFunc.
type ParseOutput struct {
	// Tags is the list of tags produced from the input.
	Tags     []*Tag
	// DocTitle is the title of the document; ParseHTML and ParseMD set it,
	// ParseText leaves it empty.
	DocTitle string
	// DocHash is a hash of the parsed contents.
	DocHash  []byte
	// Err is set by ParseHTML when the web crawler cannot be created.
	Err      error
}

ParseOutput is a result of the `ParseFunc`.

type Tag

// Tag holds a string value (e.g. a word) along with the statistics
// collected for it.
type Tag struct {
	// Value of the tag, i.e. a word.
	Value string
	// Score represents the importance of the tag.
	Score float64
	// Count is the number of times the tag appeared in a text.
	Count int
	// Docs is the number of documents in a text in which the tag appeared
	// (ParseText counts every sentence as one document).
	Docs int
	// DocsCount is the total number of documents in a text.
	DocsCount int
}

Tag holds some arbitrary string value (e.g. a word) along with some extra data about it.

func Run

func Run(items []*Tag, limit int) []*Tag

Run first sorts the given list, then iterates over it and de-dupes the items by merging inflections, then sorts the de-duped list again and returns only the requested number of items (limit), or everything if the result is smaller than limit.

nolint: gocyclo

func (*Tag) String

func (t *Tag) String() string

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL