tokenizers

package
v0.0.0-...-62718c5 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: May 15, 2021 License: MIT Imports: 9 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

View Source
var (
	EMOTICONS = []string{
		"(?:",
		"[<>]?",
		"[:;=8]",
		"[\\-o*\"]?",
		"[)\\](\\[dDpP/:}{@|\"]",
		"|",
		"[)\\](\\[dDpP/:}{@|\"]",
		"[\\-o*\"]?",
		"[:;=8]",
		"[<>]?",
		"|",
		"<3",
		")",
	}

	// URL pattern due to John Gruber, modified by Tom Winzig. See
	// https://gist.github.com/winzig/8894715
	URLS = []string{
		"(?:",
		"https?:",
		"(?:",
		"\\/{1,3}",
		"|",
		"[a-z0-9%]",
		")",
		"|",
		"[a-z0-9.\\-]+[.]",
		"(?:[a-z]{2,13})",
		"\\/",
		")",
		"(?:",
		"[^\\s()<>{}\\[\\]]+",
		"|",
		"\\([^\\s()]*?\\([^\\s()]+\\)[^\\s()]*?\\)",
		"|",
		"\\([^\\s]+?\\)",
		")+",
		"(?:",
		"\\([^\\s()]*?\\([^\\s()]+\\)[^\\s()]*?\\)",
		"|",
		"\\([^\\s]+?\\)",
		"|",
		"[^\\s`!()\\[\\]{};:\".,<>?«»“”‘’]",
		")",
		"|",
		"(?:",

		"[a-z0-9]+",
		"(?:[.\\-][a-z0-9]+)*",
		"[.]",
		"(?:[a-z]{2,13})",
		"\\b",
		"\\/?",
		"(?!@)",
		")",
	}
)
View Source
var DefaultTokenizerName = "DefaultTokenizer"
View Source
var LineTokenizerName = "LineTokenizer"
View Source
var NGramTokenizerName = "NGramTokenizer"
View Source
var ParagraphTokenizerName = "ParagraphTokenizer"
View Source
var SentenceTokenizerName = "SentenceTokenizer"
View Source
var WhitespaceTokenizerName = "WhitespaceTokenizer"
View Source
var WordTokenizerName = "WordTokenizer"

Functions

func ReadLines

func ReadLines(file string) ([]string, error)

func ReadSplitter

func ReadSplitter(file string, splitter byte) (lines []string, err error)

func Tweet

func Tweet(tweet string) []string

Types

type DefaultTokenizer

type DefaultTokenizer struct {
	RemoveStopWords bool
}

func (*DefaultTokenizer) GetName

func (d *DefaultTokenizer) GetName() string

func (*DefaultTokenizer) Tokenize

func (d *DefaultTokenizer) Tokenize(text string) []string

  • Given an input string, tokenize it into an array of word tokens.
  • This is the default tokenization function used if the user does not provide one in `options`.
  • @param {String} text
  • @return {Array}
  • [Author]: Bruce Mubangwa

type LineTokenizer

type LineTokenizer struct {
}

func (*LineTokenizer) GetName

func (l *LineTokenizer) GetName() string

func (*LineTokenizer) Tokenize

func (l *LineTokenizer) Tokenize(sentence string) []string

type NGramTokenizer

type NGramTokenizer struct {

	/**
	 * The minimum number of contiguous words to a single token.
	 *
	 * @var int
	 */
	Min int

	/**
	 * The maximum number of contiguous words to a single token.
	 *
	 * @var int
	 */
	Max int
}

func (NGramTokenizer) GetName

func (ng NGramTokenizer) GetName() string

func (NGramTokenizer) Tokenize

func (ng NGramTokenizer) Tokenize(text string) []string

  • Tokenize a block of text.

type ParagraphTokenizer

type ParagraphTokenizer struct{}

func (*ParagraphTokenizer) GetName

func (w *ParagraphTokenizer) GetName() string

func (*ParagraphTokenizer) Tokenize

func (w *ParagraphTokenizer) Tokenize(text string) []string

type SentenceTokenizer

type SentenceTokenizer struct {
	RemoveStopWords bool
}

func (*SentenceTokenizer) GetName

func (s *SentenceTokenizer) GetName() string

func (*SentenceTokenizer) Tokenize

func (s *SentenceTokenizer) Tokenize(text string) []string

type Tokenizer

type Tokenizer interface {
	// Compute the output value.
	Tokenize(string) []string
}

func GetTokenizer

func GetTokenizer(name string) Tokenizer

type WhitespaceTokenizer

type WhitespaceTokenizer struct {
	RemoveStopWords bool
	// contains filtered or unexported fields
}

func (WhitespaceTokenizer) GetName

func (wp WhitespaceTokenizer) GetName() string

func (WhitespaceTokenizer) Tokenize

func (wp WhitespaceTokenizer) Tokenize(sentence string) []string

type WordTokenizer

type WordTokenizer struct {
	RemoveStopWords bool
}

  • Word
  • This tokenizer matches words with one or more characters.
  • @category Machine Learning
  • [Author]: Bruce Mubangwa

WordTokenizer is the primary interface for tokenizing words

func (WordTokenizer) GetName

func (wt WordTokenizer) GetName() string

func (WordTokenizer) Tokenize

func (wt WordTokenizer) Tokenize(text string) []string

Directories

Path Synopsis
sentences

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL