text

package
v0.58.0 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Aug 2, 2022 License: Apache-2.0 Imports: 10 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

View Source
// ProcessText reads everything from in as plain text, splits it into
// newline-separated lines and then into sentences (util.SplitToSentences),
// and tallies the tokens that util.Sanitize returns for each sentence.
//
// For every distinct token a model.Tag is kept in the result's RawTags map:
//   - Score and Count are incremented once per occurrence of the token;
//   - Docs is incremented once per sentence in which the token occurs;
//   - DocsCount is set to the total number of sentences processed.
//
// While c.StopWords is nil, the language is detected from the current line
// with whatlanggo and handed to c.SetStopWords; the detected language name is
// reported in Meta.Lang. When c.NoStopWords is set, c.StopWords is passed on
// to util.Sanitize (presumably so that stop words are filtered out — confirm
// against util.Sanitize).
//
// in is always closed before returning. An input without any non-empty line
// yields an empty Result.
var ProcessText model.ProcessFunc = func(c *config.Config, in io.ReadCloser) *model.Result {
	defer in.Close()

	if c.Verbose {
		fmt.Println("parsing plain text...")
	}

	buf := new(bytes.Buffer)
	if _, err := buf.ReadFrom(in); err != nil && c.Verbose {
		// A ProcessFunc has no error return, so a failed read can only be
		// reported; whatever was read before the failure is still processed.
		fmt.Printf("error reading text: %v\n", err)
	}

	// FieldsFunc drops empty fields, so blank lines never reach the loop below.
	lines := strings.FieldsFunc(buf.String(), func(r rune) bool {
		return r == '\n'
	})

	if c.Verbose {
		fmt.Printf("got %d lines\n", len(lines))
	}

	if len(lines) == 0 {
		return &model.Result{}
	}

	var (
		lang      string
		reg       *stopwords.Register
		docsCount int
	)
	tokenIndex := make(map[string]*model.Tag)
	tokens := make([]string, 0)

	for _, l := range lines {
		if c.StopWords == nil && l != "" {
			info := whatlanggo.Detect(l)
			lang = info.Lang.String()
			c.SetStopWords(info.Lang.Iso6391())
			if c.Verbose {
				fmt.Printf("detected language: %s [%s] [%s]\n",
					info.Lang.String(), info.Lang.Iso6391(), info.Lang.Iso6393())
			}
		}
		// Pick up the stop-word register outside of the detection branch so
		// that a register configured by the caller up front is honoured too.
		if c.NoStopWords {
			reg = c.StopWords
		}

		for _, s := range util.SplitToSentences([]byte(l)) {
			docsCount++

			// Count only the tokens of this sentence; ranging over the
			// accumulated slice would re-count every earlier sentence.
			sentenceTokens := util.Sanitize(bytes.Fields(s), reg)
			tokens = append(tokens, sentenceTokens...)

			seen := make(map[string]struct{}, len(sentenceTokens))
			for _, token := range sentenceTokens {
				item, ok := tokenIndex[token]
				if !ok {
					item = &model.Tag{Value: token}
					tokenIndex[token] = item
				}
				item.Score++
				item.Count++

				// Each sentence counts as one document for a token, no
				// matter how many times the token occurs in it.
				if _, dup := seen[token]; !dup {
					seen[token] = struct{}{}
					item.Docs++
				}
			}
		}
	}

	for _, tag := range tokenIndex {
		tag.DocsCount = docsCount
	}

	return &model.Result{
		RawTags: tokenIndex,
		Meta: &model.Meta{
			ContentType: config.Text,
			// The hash covers the tokens of all sentences, in input order.
			DocHash: fmt.Sprintf("%x", hashTokens(tokens)),
			Lang:    lang,
		},
	}
}

ProcessText parses the given lines of plain text into a set of tags.

Functions

This section is empty.

Types

This section is empty.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL