Documentation ¶
Index ¶
- Variables
- func ReadLines(file string) ([]string, error)
- func ReadSplitter(file string, splitter byte) (lines []string, err error)
- func Tweet(tweet string) []string
- type DefaultTokenizer
- type LineTokenizer
- type NGramTokenizer
- type ParagraphTokenizer
- type SentenceTokenizer
- type Tokenizer
- type WhitespaceTokenizer
- type WordTokenizer
Constants ¶
This section is empty.
Variables ¶
var (
	EMOTICONS = []string{
		"(?:",
		"[<>]?", "[:;=8]", "[\\-o*\"]?", "[)\\](\\[dDpP/:}{@|\"]",
		"|",
		"[)\\](\\[dDpP/:}{@|\"]", "[\\-o*\"]?", "[:;=8]", "[<>]?",
		"|",
		"<3",
		")",
	}

	// URL pattern due to John Gruber, modified by Tom Winzig. See
	// https://gist.github.com/winzig/8894715
	URLS = []string{
		"(?:",
		"https?:",
		"(?:", "\\/{1,3}", "|", "[a-z0-9%]", ")",
		"|",
		"[a-z0-9.\\-]+[.]", "(?:[a-z]{2,13})", "\\/",
		")",
		"(?:",
		"[^\\s()<>{}\\[\\]]+",
		"|",
		"\\([^\\s()]*?\\([^\\s()]+\\)[^\\s()]*?\\)",
		"|",
		"\\([^\\s]+?\\)",
		")+",
		"(?:",
		"\\([^\\s()]*?\\([^\\s()]+\\)[^\\s()]*?\\)",
		"|",
		"\\([^\\s]+?\\)",
		"|",
		"[^\\s`!()\\[\\]{};:\".,<>?«»“”‘’]",
		")",
		"|",
		"(?:",
		"[a-z0-9]+",
		"(?:[.\\-][a-z0-9]+)*",
		"[.]",
		"(?:[a-z]{2,13})",
		"\\b",
		"\\/?",
		"(?!@)",
		")",
	}
)
var DefaultTokenizerName = "DefaultTokenizer"
var LineTokenizerName = "LineTokenizer"
var NGramTokenizerName = "NGramTokenizer"
var ParagraphTokenizerName = "ParagraphTokenizer"
var SentenceTokenizerName = "SentenceTokenizer"
var WhitespaceTokenizerName = "WhitespaceTokenizer"
var WordTokenizerName = "WordTokenizer"
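The EMOTICONS and URLS slices above are regular-expression fragments meant to be concatenated into single patterns. Below is a minimal sketch of compiling the emoticon pattern with the standard library; the package's import path is not shown on this page, so "tokenize" is a placeholder:

package main

import (
	"fmt"
	"regexp"
	"strings"

	tokenize "example.com/tokenize" // placeholder import path; substitute the real one
)

func main() {
	// Concatenating the EMOTICONS fragments yields one alternation that
	// matches sideways smileys such as ":-)" as well as the heart "<3".
	re := regexp.MustCompile(strings.Join(tokenize.EMOTICONS, ""))
	fmt.Println(re.FindAllString("great work :-) thanks <3", -1))

	// Note: the URLS fragments end with the lookahead "(?!@)", which Go's
	// RE2-based regexp package does not support, so they cannot be compiled
	// verbatim with regexp.MustCompile.
}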
Functions ¶
func ReadLines ¶
func ReadLines(file string) ([]string, error)
func ReadSplitter ¶
func ReadSplitter(file string, splitter byte) (lines []string, err error)
func Tweet ¶
func Tweet(tweet string) []string
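ReadLines, ReadSplitter and Tweet carry no documentation beyond their signatures. A short usage sketch inside a main like the one above, assuming ReadLines returns one string per line of the file, ReadSplitter is the general form that splits on an arbitrary byte, and Tweet returns tweet-aware tokens that keep emoticons and URLs whole:

lines, err := tokenize.ReadLines("corpus.txt")
if err != nil {
	log.Fatal(err)
}
fmt.Println(len(lines))

records, err := tokenize.ReadSplitter("corpus.txt", ';')
if err != nil {
	log.Fatal(err)
}
fmt.Println(len(records))

fmt.Println(tokenize.Tweet("loving this release :-) https://example.com/v2"))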
Types ¶
type DefaultTokenizer ¶
type DefaultTokenizer struct {
RemoveStopWords bool
}
func (*DefaultTokenizer) GetName ¶
func (d *DefaultTokenizer) GetName() string
func (*DefaultTokenizer) Tokenize ¶
func (d *DefaultTokenizer) Tokenize(text string) []string
Tokenize splits the input string into a slice of word tokens. This is the default tokenization used when the caller does not supply a tokenizer of their own in the options.

Author: Bruce Mubangwa
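A short usage sketch (placeholder import as above; the exact token output is not documented on this page):

d := &tokenize.DefaultTokenizer{RemoveStopWords: true} // drop common stop words from the result
fmt.Println(d.GetName()) // presumably the value of DefaultTokenizerName
fmt.Println(d.Tokenize("The quick brown fox jumps over the lazy dog"))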
type LineTokenizer ¶
type LineTokenizer struct{}
func (*LineTokenizer) GetName ¶
func (l *LineTokenizer) GetName() string
func (*LineTokenizer) Tokenize ¶
func (l *LineTokenizer) Tokenize(sentence string) []string
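A sketch, assuming LineTokenizer splits its input on line breaks (the behaviour is not spelled out above):

lt := &tokenize.LineTokenizer{}
fmt.Println(lt.Tokenize("first line\nsecond line\nthird line"))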
type NGramTokenizer ¶
type NGramTokenizer struct {
	// Min is the minimum number of contiguous words in a single token.
	Min int

	// Max is the maximum number of contiguous words in a single token.
	Max int
}
func (NGramTokenizer) GetName ¶
func (ng NGramTokenizer) GetName() string
func (NGramTokenizer) Tokenize ¶
func (ng NGramTokenizer) Tokenize(text string) []string
Tokenize a block of text.
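Min and Max bound the sizes of the word n-grams that Tokenize emits. A sketch (placeholder import as above), assuming that Min: 1, Max: 2 produces both unigrams and bigrams:

ng := tokenize.NGramTokenizer{Min: 1, Max: 2}
fmt.Println(ng.Tokenize("natural language processing"))
// Expected shape (an assumption): unigrams "natural", "language", "processing"
// plus bigrams "natural language" and "language processing".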
type ParagraphTokenizer ¶
type ParagraphTokenizer struct{}
func (*ParagraphTokenizer) GetName ¶
func (w *ParagraphTokenizer) GetName() string
func (*ParagraphTokenizer) Tokenize ¶
func (w *ParagraphTokenizer) Tokenize(text string) []string
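A sketch, assuming paragraphs are delimited by blank lines:

pt := &tokenize.ParagraphTokenizer{}
fmt.Println(pt.Tokenize("First paragraph.\n\nSecond paragraph."))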
type SentenceTokenizer ¶
type SentenceTokenizer struct {
RemoveStopWords bool
}
func (*SentenceTokenizer) GetName ¶
func (s *SentenceTokenizer) GetName() string
func (*SentenceTokenizer) Tokenize ¶
func (s *SentenceTokenizer) Tokenize(text string) []string
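A sketch, assuming sentences are split on end-of-sentence punctuation and that RemoveStopWords filters common words out of each sentence:

st := &tokenize.SentenceTokenizer{RemoveStopWords: false}
fmt.Println(st.Tokenize("Go is expressive. It is also concise!"))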
type Tokenizer ¶
func GetTokenizer ¶
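Neither the Tokenizer definition nor the GetTokenizer signature is shown on this page. A hedged sketch, assuming Tokenizer is the interface satisfied by the concrete tokenizers documented here (GetName() string and Tokenize(string) []string), with GetTokenizer presumably returning one when given a *TokenizerName value:

// countTokens accepts any tokenizer, relying only on the two methods that
// every concrete type in this package documents.
func countTokens(t tokenize.Tokenizer, text string) int {
	return len(t.Tokenize(text))
}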
type WhitespaceTokenizer ¶
type WhitespaceTokenizer struct {
	RemoveStopWords bool
	// contains filtered or unexported fields
}
func (WhitespaceTokenizer) GetName ¶
func (wp WhitespaceTokenizer) GetName() string
func (WhitespaceTokenizer) Tokenize ¶
func (wp WhitespaceTokenizer) Tokenize(sentence string) []string
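A sketch, assuming Tokenize splits on runs of whitespace; the struct has unexported fields, so the zero value used here may need further initialisation:

wt := tokenize.WhitespaceTokenizer{RemoveStopWords: false}
fmt.Println(wt.Tokenize("split\ton   any\nwhitespace"))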
type WordTokenizer ¶
type WordTokenizer struct {
RemoveStopWords bool
}
WordTokenizer is the primary tokenizer for word tokens; it matches words of one or more characters.

Author: Bruce Mubangwa
func (WordTokenizer) GetName ¶
func (wt WordTokenizer) GetName() string
func (WordTokenizer) Tokenize ¶
func (wt WordTokenizer) Tokenize(text string) []string
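A sketch (placeholder import as above); how punctuation is handled is an assumption, since only "matches words of one or more characters" is documented:

w := tokenize.WordTokenizer{RemoveStopWords: true}
fmt.Println(w.GetName()) // presumably the value of WordTokenizerName
fmt.Println(w.Tokenize("Tokenizers split text, producing word tokens."))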
Source Files ¶