Documentation ¶
Overview ¶
Basic text preprocessing tasks are: 1. Remove HTML tags 2. Remove extra whitespaces 3. Convert accented characters to ASCII characters 4. Expand contractions 5. Remove special characters 6. Lowercase all texts 7. Convert number words to numeric form 8. Remove numbers 9. Remove stopwords 10. Lemmatization
Index ¶
- Constants
- func BytesToChar(s string, byteRange []int) (retVal []int)
- func CharToBytes(s string, charRange []int) (retVal []int)
- func FindAllStringIndex(re *re2.Regexp, s string, n int) [][]int
- func IsBertPunctuation(c rune) bool
- func IsBertWhitespace(c rune) bool
- func IsChinese(c rune) bool
- func IsPunctuation(c rune) bool
- func IsWhitespace(c rune) bool
- func RangeOf(s string, r []int) (retVal string)
- type BertNormalizer
- type ChangeMap
- type DefaultNormalizer
- type DefaultOption
- type FnPattern
- type IndexOn
- type Invert
- type NFC
- type NFD
- type NFKC
- type NFKD
- type NormFn
- type NormalizedString
- func (n *NormalizedString) Alignments() (retVal [][]int)
- func (n *NormalizedString) AlignmentsOriginal() (retVal [][]int)
- func (n *NormalizedString) Append(s string) (retVal *NormalizedString)
- func (n *NormalizedString) Clear()
- func (n *NormalizedString) ConvertOffset(inputRange *Range) (retVal *Range)
- func (n *NormalizedString) Filter(fn func(rune) bool) (retVal *NormalizedString)
- func (n *NormalizedString) ForEach(nfn NormFn) (retVal *NormalizedString)
- func (n *NormalizedString) GetNormalized() string
- func (n *NormalizedString) GetOriginal() string
- func (n *NormalizedString) IsEmpty() bool
- func (n *NormalizedString) LStrip() (retVal *NormalizedString)
- func (n *NormalizedString) Len() int
- func (n *NormalizedString) LenOriginal() int
- func (n *NormalizedString) Lowercase() (retVal *NormalizedString)
- func (n *NormalizedString) Map(nfn NormFn) (retVal *NormalizedString)
- func (n *NormalizedString) NFC() (retVal *NormalizedString)
- func (n *NormalizedString) NFD() (retVal *NormalizedString)
- func (n *NormalizedString) NFKC() (retVal *NormalizedString)
- func (n *NormalizedString) NFKD() (retVal *NormalizedString)
- func (n *NormalizedString) OffsetsOriginal() []int
- func (n *NormalizedString) Prepend(s string) (retVal *NormalizedString)
- func (n *NormalizedString) RStrip() (retVal *NormalizedString)
- func (n *NormalizedString) Range(r *Range) (retVal string)
- func (n *NormalizedString) RangeOriginal(r *Range) (retVal string)
- func (n *NormalizedString) RemoveAccents() (retVal *NormalizedString)
- func (n *NormalizedString) Replace(pattern Pattern, content string) (retVal *NormalizedString)
- func (n *NormalizedString) Shift() int
- func (n *NormalizedString) Slice(inputRange *Range) (retVal *NormalizedString)
- func (n *NormalizedString) Split(pattern Pattern, behavior SplitDelimiterBehavior) (retVal []NormalizedString)
- func (n *NormalizedString) Strip() (retVal *NormalizedString)
- func (n *NormalizedString) Transform(m []ChangeMap, initialOffset int) (retVal *NormalizedString)
- func (n *NormalizedString) TransformRange(inputRange *Range, changeMap []ChangeMap, initialOffset int) (retVal *NormalizedString)
- func (n *NormalizedString) Uppercase() (retVal *NormalizedString)
- type Normalizer
- type OffsetsMatch
- type OffsetsRemove
- type Option
- type Pattern
- type PatternFn
- type Precompiled
- type Prepend
- type Range
- type RegexpPattern
- type Replace
- type ReplacePattern
- type RunePattern
- type Sequence
- type SplitDelimiterBehavior
- type StringPattern
- type Strip
- type StripAccents
- type UnicodeNormalizer
Constants ¶
const ( RemovedBehavior = iota IsolatedBehavior MergedWithPreviousBehavior MergedWithNextBehavior ContiguousBehavior )
const ( OriginalTarget = iota NormalizedTarget )
Variables ¶
This section is empty.
Functions ¶
func BytesToChar ¶
BytesToChar converts a given range from bytes to `char`
func CharToBytes ¶
CharToBytes converts a given range from `char` to bytes
func IsBertPunctuation ¶
IsBertPunctuation checks whether an input rune is a BERT punctuation
func IsBertWhitespace ¶
IsBertWhitespace checks whether an input rune is a BERT whitespace
func IsPunctuation ¶
IsPunctuation returns whether input rune is a punctuation or not.
func IsWhitespace ¶
IsWhitespace checks whether an input rune is a whitespace
Types ¶
type BertNormalizer ¶
type BertNormalizer struct { CleanText bool `json:"clean_text"` // Whether to remove Control characters and all sorts of whitespaces replaced with single ` ` space Lowercase bool `json:"lowercase"` // Whether to do lowercase HandleChineseChars bool `json:"handle_chinese_chars"` // Whether to put spaces around chinese characters so they get split StripAccents bool `json:"strip_accents"` // whether to remove accents }
func NewBertNormalizer ¶
func NewBertNormalizer(cleanText, lowercase, handleChineseChars, stripAccents bool) *BertNormalizer
func (*BertNormalizer) Normalize ¶
func (bn *BertNormalizer) Normalize(n *NormalizedString) (*NormalizedString, error)
Normalize implements Normalizer interface for BertNormalizer
type DefaultNormalizer ¶
type DefaultNormalizer struct {
// contains filtered or unexported fields
}
func NewDefaultNormalizer ¶
func NewDefaultNormalizer(opts ...DefaultOption) *DefaultNormalizer
func (*DefaultNormalizer) Normalize ¶
func (dn *DefaultNormalizer) Normalize(n *NormalizedString) (*NormalizedString, error)
type DefaultOption ¶
type DefaultOption func(*DefaultNormalizer)
func WithLowercase ¶
func WithLowercase(lowercase bool) DefaultOption
func WithStrip ¶
func WithStrip(strip bool) DefaultOption
type FnPattern ¶
type FnPattern struct {
// contains filtered or unexported fields
}
func NewFnPattern ¶
func (*FnPattern) FindMatches ¶
func (fp *FnPattern) FindMatches(inside string) []OffsetsMatch
FindMatches implements Pattern interface for FnPattern
type IndexOn ¶
type IndexOn int
IndexOn is an enum-like type representing which string (original or normalized) the range indexes on.
type Invert ¶
type Invert struct {
Pattern Pattern
}
Invert the `is_match` flags for the wrapped Pattern. This is useful, for example, when we use a regex that matches words instead of a delimiter, and we want to match the delimiter.
func NewInvertPattern ¶
func (*Invert) FindMatches ¶
func (i *Invert) FindMatches(inside string) []OffsetsMatch
FindMatches implements Pattern interface for Invert
type NFC ¶
type NFC struct{}
func (*NFC) Normalize ¶
func (n *NFC) Normalize(norm *NormalizedString) (*NormalizedString, error)
type NFD ¶
type NFD struct{}
func (*NFD) Normalize ¶
func (n *NFD) Normalize(norm *NormalizedString) (*NormalizedString, error)
type NFKC ¶
type NFKC struct{}
func (*NFKC) Normalize ¶
func (n *NFKC) Normalize(norm *NormalizedString) (*NormalizedString, error)
type NFKD ¶
type NFKD struct{}
func (*NFKD) Normalize ¶
func (n *NFKD) Normalize(norm *NormalizedString) (*NormalizedString, error)
type NormalizedString ¶
type NormalizedString struct {
// contains filtered or unexported fields
}
A `NormalizedString` takes care of processing an "original" string to modify it and obtain a "normalized" string. It keeps both version of the string, alignments information between both and provides an interface to retrieve ranges of each string, using offsets from any of them.
It is possible to retrieve a part of the original string, by indexing it with offsets from the normalized one, and the other way around too. It is also possible to convert offsets from one referential to the other one easily.
func NewNormalizedFrom ¶
func NewNormalizedFrom(s string) (retVal *NormalizedString)
NewNormalizedFrom creates a Normalized instance from string input
func NewNormalizedString ¶
func NewNormalizedString(original, normalized string, alignments, alignmentsOriginal [][]int, originalShift int) *NormalizedString
func (*NormalizedString) Alignments ¶
func (n *NormalizedString) Alignments() (retVal [][]int)
Alignments returns alignments mapping normalized string to original string
func (*NormalizedString) AlignmentsOriginal ¶
func (n *NormalizedString) AlignmentsOriginal() (retVal [][]int)
AlignmentsOriginal returns original alignments mapping to original string
func (*NormalizedString) Append ¶
func (n *NormalizedString) Append(s string) (retVal *NormalizedString)
Append adds given string to the end of NormalizedString
func (*NormalizedString) Clear ¶
func (n *NormalizedString) Clear()
Clear clears the normalized part of the string
func (*NormalizedString) ConvertOffset ¶
func (n *NormalizedString) ConvertOffset(inputRange *Range) (retVal *Range)
ConvertOffset converts the given offsets range from one referential to the other one: `Original => Normalized` or `Normalized => Original`
Returns `nil` when targeting something that is outside range
func (*NormalizedString) Filter ¶
func (n *NormalizedString) Filter(fn func(rune) bool) (retVal *NormalizedString)
Filter applies filtering on NormalizedString
func (*NormalizedString) ForEach ¶
func (n *NormalizedString) ForEach(nfn NormFn) (retVal *NormalizedString)
ForEach applies function on each `char` of normalized string. Similar to Map???
func (*NormalizedString) GetNormalized ¶
func (n *NormalizedString) GetNormalized() string
GetNormalized returns the normalized string
func (*NormalizedString) GetOriginal ¶
func (n *NormalizedString) GetOriginal() string
GetOriginal returns the original string
func (*NormalizedString) IsEmpty ¶
func (n *NormalizedString) IsEmpty() bool
IsEmpty returns whether the normalized string is empty
func (*NormalizedString) LStrip ¶
func (n *NormalizedString) LStrip() (retVal *NormalizedString)
LStrip removes leading spaces
func (*NormalizedString) Len ¶
func (n *NormalizedString) Len() int
Len returns length (in bytes) of normalized string
func (*NormalizedString) LenOriginal ¶
func (n *NormalizedString) LenOriginal() int
LenOriginal returns the length of Original string in bytes
func (*NormalizedString) Lowercase ¶
func (n *NormalizedString) Lowercase() (retVal *NormalizedString)
Lowercase transforms string to lowercase
func (*NormalizedString) Map ¶
func (n *NormalizedString) Map(nfn NormFn) (retVal *NormalizedString)
Map maps and applies function to each `char` of normalized string
func (*NormalizedString) NFC ¶
func (n *NormalizedString) NFC() (retVal *NormalizedString)
func (*NormalizedString) NFD ¶
func (n *NormalizedString) NFD() (retVal *NormalizedString)
func (*NormalizedString) NFKC ¶
func (n *NormalizedString) NFKC() (retVal *NormalizedString)
func (*NormalizedString) NFKD ¶
func (n *NormalizedString) NFKD() (retVal *NormalizedString)
func (*NormalizedString) OffsetsOriginal ¶
func (n *NormalizedString) OffsetsOriginal() []int
OffsetsOriginal returns the original offsets
func (*NormalizedString) Prepend ¶
func (n *NormalizedString) Prepend(s string) (retVal *NormalizedString)
Prepend adds given string to the beginning of NormalizedString
func (*NormalizedString) RStrip ¶
func (n *NormalizedString) RStrip() (retVal *NormalizedString)
RStrip removes trailing spaces
func (*NormalizedString) Range ¶
func (n *NormalizedString) Range(r *Range) (retVal string)
Range returns a substring of the NORMALIZED string
func (*NormalizedString) RangeOriginal ¶
func (n *NormalizedString) RangeOriginal(r *Range) (retVal string)
RangeOriginal returns substring of ORIGINAL string
func (*NormalizedString) RemoveAccents ¶
func (n *NormalizedString) RemoveAccents() (retVal *NormalizedString)
RemoveAccents removes all Unicode Mn group (M non-spacing)
func (*NormalizedString) Replace ¶
func (n *NormalizedString) Replace(pattern Pattern, content string) (retVal *NormalizedString)
func (*NormalizedString) Shift ¶
func (n *NormalizedString) Shift() int
Shift returns original shift
func (*NormalizedString) Slice ¶
func (n *NormalizedString) Slice(inputRange *Range) (retVal *NormalizedString)
Slice returns a slice of the current NormalizedString. If the range is not on char boundaries, it returns `nil`.
func (*NormalizedString) Split ¶
func (n *NormalizedString) Split(pattern Pattern, behavior SplitDelimiterBehavior) (retVal []NormalizedString)
Split the current string in many subparts. Specify what to do with the delimiter.
This method will always ensure that the entire `NormalizedString` is covered in the produced subparts. This means that the delimiter parts will also be included, and will appear empty if we don't want to include them (their `original` part will still be present). It should always be possible to merge all the subparts back to the original `NormalizedString`.
## Splitting Behavior for the delimiter
The behavior can be one of the following. When splitting on `'-'` for example, with input `the-final--countdown`:
- RemovedBehavior => `[ "the", "", "final", "", "", "countdown" ]`
- IsolatedBehavior => `[ "the", "-", "final", "-", "-", "countdown" ]`
- MergedWithPreviousBehavior => `[ "the-", "final-", "-", "countdown" ]`
- MergedWithNextBehavior => `[ "the", "-final", "-", "-countdown" ]`
- ContiguousBehavior => `[ "the", "-", "final", "--", "countdown" ]`
func (*NormalizedString) Strip ¶
func (n *NormalizedString) Strip() (retVal *NormalizedString)
Strip removes leading and trailing spaces
func (*NormalizedString) Transform ¶
func (n *NormalizedString) Transform(m []ChangeMap, initialOffset int) (retVal *NormalizedString)
Transform applies transformations to the current normalized version, updating the current alignments with the new ones. This method expects an iterator yielding each rune of the new normalized string with a `change` integer equal to:
- `1` if this is a new rune
- `-N` if the char is right before N removed runes
- `0` if this rune represents the old one (even if changed)
Since it is possible that the normalized string doesn't include some of the `characters` (runes) at the beginning of the original one, we need an `initial_offset` which represents the number of removed runes at the very beginning.
`change` should never be more than `1`. If multiple runes are added, each of them has a `change` of `1`, but more doesn't make any sense. We treat any value above `1` as `1`.
E.g. string `élégant` Before NFD(): [{233 0} {108 1} {233 2} {103 3} {97 4} {110 5} {116 6}] After NFD(): [{101 0} {769 1} {108 2} {101 3} {769 4} {103 5} {97 6} {110 7} {116 8}] New Alignments: {0, 1}, {0, 1}, {1, 2}, {2, 3}, {2, 3}, {3, 4}, {4, 5}, {5, 6}, {6, 7},
func (*NormalizedString) TransformRange ¶
func (n *NormalizedString) TransformRange(inputRange *Range, changeMap []ChangeMap, initialOffset int) (retVal *NormalizedString)
This method expects an iterator yielding each `char` of the new normalized string with a `change` of int type equal to:
- `1` if this is a new char
- `-N` if the char is right before N removed chars
- `0` if the char is replacing the existing one
Since it is possible that the normalized string doesn't include some of the characters at the beginning of the original one, we need an `initialOffset` which represents the number of removed chars at the very beginning.
func (*NormalizedString) Uppercase ¶
func (n *NormalizedString) Uppercase() (retVal *NormalizedString)
Uppercase transforms string to uppercase
type Normalizer ¶
type Normalizer interface {
Normalize(normalized *NormalizedString) (*NormalizedString, error)
}
func NewNormalizer ¶
func NewNormalizer(opts ...Option) Normalizer
type OffsetsMatch ¶
OffsetsMatch contains a combination of Offsets position and a boolean indicating whether this is a match or not.
type OffsetsRemove ¶
type Option ¶
type Option func(*normalizer)
func WithBertNormalizer ¶
WithBertNormalizer creates normalizer with BERT normalization features.
func WithUnicodeNormalizer ¶
WithUnicodeNormalizer creates normalizer with one of unicode NFD, NFC, NFKD, or NFKC normalization feature.
type Pattern ¶
type Pattern interface { // FindMatches slices the given string in a list of pattern match positions, with // a boolean indicating whether this is a match or not. // // NOTE. This method *must* cover the whole string in its outputs, with // contiguous ordered slices. FindMatches(inside string) []OffsetsMatch }
Pattern is used to split a NormalizedString
type Precompiled ¶
type Precompiled struct {
*spm.Precompiled
}
func (*Precompiled) Normalize ¶
func (m *Precompiled) Normalize(normalized *NormalizedString) (*NormalizedString, error)
Implement Normalizer for spm.Precompiled
type Prepend ¶
type Prepend struct {
Prepend string `json:"prepend"`
}
Prepend is a normalizer that prepends the configured string to the normalized string in place.
func NewPrepend ¶
func (*Prepend) Normalize ¶
func (p *Prepend) Normalize(normalized *NormalizedString) (*NormalizedString, error)
Implement Normalizer for Prepend
type Range ¶
type Range struct {
// contains filtered or unexported fields
}
Range is a slice of indexes on either the normalized string or the original string. It is INCLUSIVE start and EXCLUSIVE end.
func (*Range) IntoFullRange ¶
IntoFullRange converts the current range to cover the case where the original provided range was out of bound. maxLen is the maximal length of the string in `chars` (runes).
type RegexpPattern ¶
type RegexpPattern struct {
// contains filtered or unexported fields
}
func NewRegexpPattern ¶
func NewRegexpPattern(s string) *RegexpPattern
func (*RegexpPattern) FindMatches ¶
func (rp *RegexpPattern) FindMatches(inside string) []OffsetsMatch
FindMatches implements Pattern interface for RegexpPattern
type Replace ¶
type Replace struct { PatternType ReplacePattern `json:"pattern_type"` Pattern Pattern `json:"pattern"` Content string `json:"content"` }
func NewReplace ¶
func NewReplace(patternType ReplacePattern, pattern string, content string) *Replace
func (*Replace) DecodeChain ¶
Implement Decoder for Replace
func (*Replace) Normalize ¶
func (r *Replace) Normalize(normalized *NormalizedString) (*NormalizedString, error)
Implement Normalizer for Replace
type ReplacePattern ¶
type ReplacePattern int
Enum of different patterns that Replace can use.
const ( String ReplacePattern = iota Regex )
type RunePattern ¶
type RunePattern struct {
// contains filtered or unexported fields
}
RunePattern is a wrapper of primitive rune so that it can implement `Pattern` interface
func NewRunePattern ¶
func NewRunePattern(r rune) *RunePattern
func (*RunePattern) FindMatches ¶
func (r *RunePattern) FindMatches(inside string) []OffsetsMatch
FindMatches implements Pattern interface for RunePattern
type Sequence ¶
type Sequence struct {
Normalizers []Normalizer `json:"normalizers"`
}
Sequence wraps a slice of normalizers to normalize string in sequence.
func NewSequence ¶
func NewSequence(norms []Normalizer) *Sequence
func (*Sequence) Normalize ¶
func (s *Sequence) Normalize(normalized *NormalizedString) (*NormalizedString, error)
Implement Normalizer for Sequence
type SplitDelimiterBehavior ¶
type SplitDelimiterBehavior int
SplitDelimiterBehavior is an enum-like type. It defines the expected behavior for the delimiter of a Split Pattern. When splitting on `'-'` for example, with input `the-final--countdown`:
- RemovedBehavior => `[ "the", "final", "countdown" ]`
- IsolatedBehavior => `[ "the", "-", "final", "-", "-", "countdown" ]`
- MergedWithPreviousBehavior => `[ "the-", "final-", "-", "countdown" ]`
- MergedWithNextBehavior => `[ "the", "-final", "-", "-countdown" ]`
- ContiguousBehavior => `[ "the", "-", "final", "--", "countdown" ]`
type StringPattern ¶
type StringPattern struct {
// contains filtered or unexported fields
}
StringPattern is a wrapper of primitive string so that it can implement `Pattern` interface
func NewStringPattern ¶
func NewStringPattern(s string) *StringPattern
func (*StringPattern) FindMatches ¶
func (s *StringPattern) FindMatches(inside string) []OffsetsMatch
type Strip ¶
type Strip struct {
// contains filtered or unexported fields
}
func (*Strip) Normalize ¶
func (s *Strip) Normalize(normalized *NormalizedString) (*NormalizedString, error)
type StripAccents ¶
type StripAccents struct{}
func NewStripAccents ¶
func NewStripAccents() *StripAccents
func (*StripAccents) Normalize ¶
func (sa *StripAccents) Normalize(normalized *NormalizedString) (*NormalizedString, error)
type UnicodeNormalizer ¶
func NewUnicodeNormalizer ¶
func NewUnicodeNormalizer(form norm.Form) *UnicodeNormalizer
func (*UnicodeNormalizer) Normalize ¶
func (un *UnicodeNormalizer) Normalize(n *NormalizedString) (*NormalizedString, error)