tokenizers

package
v0.0.0-...-62718c5 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: May 15, 2021 License: MIT Imports: 9 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

View Source
var (
	EMOTICONS = []string{
		"(?:",
		"[<>]?",
		"[:;=8]",
		"[\\-o*\"]?",
		"[)\\](\\[dDpP/:}{@|\"]",
		"|",
		"[)\\](\\[dDpP/:}{@|\"]",
		"[\\-o*\"]?",
		"[:;=8]",
		"[<>]?",
		"|",
		"<3",
		")",
	}

	// URL pattern due to John Gruber, modified by Tom Winzig. See
	// https://gist.github.com/winzig/8894715
	URLS = []string{
		"(?:",
		"https?:",
		"(?:",
		"\\/{1,3}",
		"|",
		"[a-z0-9%]",
		")",
		"|",
		"[a-z0-9.\\-]+[.]",
		"(?:[a-z]{2,13})",
		"\\/",
		")",
		"(?:",
		"[^\\s()<>{}\\[\\]]+",
		"|",
		"\\([^\\s()]*?\\([^\\s()]+\\)[^\\s()]*?\\)",
		"|",
		"\\([^\\s]+?\\)",
		")+",
		"(?:",
		"\\([^\\s()]*?\\([^\\s()]+\\)[^\\s()]*?\\)",
		"|",
		"\\([^\\s]+?\\)",
		"|",
		"[^\\s`!()\\[\\]{};:\".,<>?«»“”‘’]",
		")",
		"|",
		"(?:",

		"[a-z0-9]+",
		"(?:[.\\-][a-z0-9]+)*",
		"[.]",
		"(?:[a-z]{2,13})",
		"\\b",
		"\\/?",
		"(?!@)",
		")",
	}
)
View Source
var DefaultTokenizerName = "DefaultTokenizer"
View Source
var LineTokenizerName = "LineTokenizer"
View Source
var NGramTokenizerName = "NGramTokenizer"
View Source
var ParagraphTokenizerName = "ParagraphTokenizer"
View Source
var SentenceTokenizerName = "SentenceTokenizer"
View Source
var WhitespaceTokenizerName = "WhitespaceTokenizer"
View Source
var WordTokenizerName = "WordTokenizer"

Functions

func ReadLines

func ReadLines(file string) ([]string, error)

func ReadSplitter

func ReadSplitter(file string, splitter byte) (lines []string, err error)

func Tweet

func Tweet(tweet string) []string

Types

type DefaultTokenizer

type DefaultTokenizer struct {
	RemoveStopWords bool
}

func (*DefaultTokenizer) GetName

func (d *DefaultTokenizer) GetName() string

func (*DefaultTokenizer) Tokenize

func (d *DefaultTokenizer) Tokenize(text string) []string

  • Given an input string, tokenize it into an array of word tokens.
  • This is the default tokenization function used if the user does not provide one in `options`.
  • @param {String} text
  • @return {Array}
  • [Author]: Bruce Mubangwa

type LineTokenizer

type LineTokenizer struct {
}

func (*LineTokenizer) GetName

func (l *LineTokenizer) GetName() string

func (*LineTokenizer) Tokenize

func (l *LineTokenizer) Tokenize(sentence string) []string

type NGramTokenizer

type NGramTokenizer struct {

	/**
	 * The minimum number of contiguous words to a single token.
	 *
	 * @var int
	 */
	Min int

	/**
	 * The maximum number of contiguous words to a single token.
	 *
	 * @var int
	 */
	Max int
}

func (NGramTokenizer) GetName

func (ng NGramTokenizer) GetName() string

func (NGramTokenizer) Tokenize

func (ng NGramTokenizer) Tokenize(text string) []string

  • Tokenize a block of text.

type ParagraphTokenizer

type ParagraphTokenizer struct{}

func (*ParagraphTokenizer) GetName

func (w *ParagraphTokenizer) GetName() string

func (*ParagraphTokenizer) Tokenize

func (w *ParagraphTokenizer) Tokenize(text string) []string

type SentenceTokenizer

type SentenceTokenizer struct {
	RemoveStopWords bool
}

func (*SentenceTokenizer) GetName

func (s *SentenceTokenizer) GetName() string

func (*SentenceTokenizer) Tokenize

func (s *SentenceTokenizer) Tokenize(text string) []string

type Tokenizer

type Tokenizer interface {
	// Compute the output value.
	Tokenize(string) []string
}

func GetTokenizer

func GetTokenizer(name string) Tokenizer

type WhitespaceTokenizer

type WhitespaceTokenizer struct {
	RemoveStopWords bool
	// contains filtered or unexported fields
}

func (WhitespaceTokenizer) GetName

func (wp WhitespaceTokenizer) GetName() string

func (WhitespaceTokenizer) Tokenize

func (wp WhitespaceTokenizer) Tokenize(sentence string) []string

type WordTokenizer

type WordTokenizer struct {
	RemoveStopWords bool
}

  • Word
  • This tokenizer matches words with one or more characters.
  • @category Machine Learning
  • [Author]: Bruce Mubangwa

WordTokenizer is the primary interface for tokenizing words

func (WordTokenizer) GetName

func (wt WordTokenizer) GetName() string

func (WordTokenizer) Tokenize

func (wt WordTokenizer) Tokenize(text string) []string

Directories

Path Synopsis
sentences

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL