text

package
v0.62.0 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Mar 30, 2024 License: Apache-2.0 Imports: 8 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

View Source
// ProcessText reads all of in as plain text, splits it into lines and then
// into sentences, and indexes the tokens of every sentence into tags.
//
// Each sentence is treated as one document. For every distinct token the
// resulting tag carries:
//   - Score and Count: incremented once per occurrence of the token,
//   - Docs: the number of sentences in which the token occurs,
//   - DocsCount: the total number of sentences processed.
//
// The returned Result holds the tag index in RawTags and a Meta with the text
// content type, the hex-formatted result of hashTokens over all tokens in
// input order, and c.Lang. An empty Result is returned when the input
// contains no non-empty lines. The reader is closed before returning.
var ProcessText model.ProcessFunc = func(c *config.Config, in io.ReadCloser) *model.Result {

	if c.Verbose {
		fmt.Println("parsing plain text...")
	}

	defer in.Close()

	buf := new(bytes.Buffer)
	// The signature visible here offers no way to report an error, so a failed
	// read is deliberately best-effort: whatever was buffered before the
	// failure is still processed.
	_, _ = buf.ReadFrom(in)

	// FieldsFunc never yields empty fields, so blank lines are dropped here.
	lines := strings.FieldsFunc(buf.String(), func(r rune) bool {
		return r == '\n'
	})

	if c.Verbose {
		fmt.Printf("got %d lines\n", len(lines))
	}

	if len(lines) == 0 {
		return &model.Result{}
	}

	var docsCount int
	tokenIndex := make(map[string]*model.Tag)
	// tokens accumulates every token of the input in order; it is only used
	// for the document hash at the end.
	tokens := make([]string, 0)

	for _, l := range lines {
		// Checked on every line because DetectLang receives c and presumably
		// updates it — NOTE(review): confirm against config.DetectLang.
		if !c.SkipLang && c.StopWords == nil {
			config.DetectLang(c, l)
		}

		for _, s := range util.SplitToSentences([]byte(l)) {
			docsCount++

			// Count only the tokens of the current sentence. Ranging over the
			// accumulated tokens slice here would re-count every earlier
			// sentence again, inflating Score, Count and Docs.
			sentenceTokens := util.SplitToTokens(s, c)
			tokens = append(tokens, sentenceTokens...)

			seen := make(map[string]struct{}, len(sentenceTokens))
			for _, token := range sentenceTokens {
				item, ok := tokenIndex[token]
				if !ok {
					item = &model.Tag{Value: token}
					tokenIndex[token] = item
				}
				item.Score++
				item.Count++

				// Docs grows at most once per sentence for a given token.
				if _, dup := seen[token]; !dup {
					seen[token] = struct{}{}
					item.Docs++
				}
			}
		}
	}

	for _, v := range tokenIndex {
		v.DocsCount = docsCount
	}

	return &model.Result{
		RawTags: tokenIndex,
		Meta: &model.Meta{
			ContentType: config.Text,
			DocHash:     fmt.Sprintf("%x", hashTokens(tokens)),
			Lang:        c.Lang,
		},
	}
}

ProcessText parses the given plain text, line by line and sentence by sentence, into a set of tags.

Functions

This section is empty.

Types

This section is empty.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL