normalizer

package

v1.0.0 Latest Latest Go to latest Published: Sep 20, 2024 License: Apache-2.0 Imports: 16 Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/danmolitor/tokenizer

Links

Open Source Insights

Documentation ¶

Overview ¶

Basic text preprocessing tasks are: 1. Remove HTML tags 2. Remove extra whitespaces 3. Convert accented characters to ASCII characters 4. Expand contractions 5. Remove special characters 6. Lowercase all texts 7. Convert number words to numeric form 8. Remove numbers 9. Remove stopwords 10. Lemmatization

Index ¶

Constants
func BytesToChar(s string, byteRange []int) (retVal []int)
func CharToBytes(s string, charRange []int) (retVal []int)
func FindAllStringIndex(re *re2.Regexp, s string, n int) [][]int
func IsBertPunctuation(c rune) bool
func IsBertWhitespace(c rune) bool
func IsChinese(c rune) bool
func IsPunctuation(c rune) bool
func IsWhitespace(c rune) bool
func RangeOf(s string, r []int) (retVal string)
type BertNormalizer
- func NewBertNormalizer(cleanText, lowercase, handleChineseChars, stripAccents bool) *BertNormalizer
- func (bn *BertNormalizer) Normalize(n *NormalizedString) (*NormalizedString, error)
type ChangeMap
type DefaultNormalizer
- func NewDefaultNormalizer(opts ...DefaultOption) *DefaultNormalizer
- func (dn *DefaultNormalizer) Normalize(n *NormalizedString) (*NormalizedString, error)
type DefaultOption
- func WithLowercase(lowercase bool) DefaultOption
- func WithStrip(strip bool) DefaultOption
type FnPattern
- func NewFnPattern(fn PatternFn) *FnPattern
- func (fp *FnPattern) FindMatches(inside string) []OffsetsMatch
type IndexOn
type Invert
- func NewInvertPattern(p Pattern) *Invert
- func (i *Invert) FindMatches(inside string) []OffsetsMatch
type NFC
- func NewNFC() *NFC
- func (n *NFC) Normalize(norm *NormalizedString) (*NormalizedString, error)
type NFD
- func NewNFD() *NFD
- func (n *NFD) Normalize(norm *NormalizedString) (*NormalizedString, error)
type NFKC
- func NewNFKC() *NFKC
- func (n *NFKC) Normalize(norm *NormalizedString) (*NormalizedString, error)
type NFKD
- func NewNFKD() *NFKD
- func (n *NFKD) Normalize(norm *NormalizedString) (*NormalizedString, error)
type NormFn
type NormalizedString
- func NewNormalizedFrom(s string) (retVal *NormalizedString)
- func NewNormalizedString(original, normalized string, alignments, alignmentsOriginal [][]int, ...) *NormalizedString
- func (n *NormalizedString) Alignments() (retVal [][]int)
- func (n *NormalizedString) AlignmentsOriginal() (retVal [][]int)
- func (n *NormalizedString) Append(s string) (retVal *NormalizedString)
- func (n *NormalizedString) Clear()
- func (n *NormalizedString) ConvertOffset(inputRange *Range) (retVal *Range)
- func (n *NormalizedString) Filter(fn func(rune) bool) (retVal *NormalizedString)
- func (n *NormalizedString) ForEach(nfn NormFn) (retVal *NormalizedString)
- func (n *NormalizedString) GetNormalized() string
- func (n *NormalizedString) GetOriginal() string
- func (n *NormalizedString) IsEmpty() bool
- func (n *NormalizedString) LStrip() (retVal *NormalizedString)
- func (n *NormalizedString) Len() int
- func (n *NormalizedString) LenOriginal() int
- func (n *NormalizedString) Lowercase() (retVal *NormalizedString)
- func (n *NormalizedString) Map(nfn NormFn) (retVal *NormalizedString)
- func (n *NormalizedString) NFC() (retVal *NormalizedString)
- func (n *NormalizedString) NFD() (retVal *NormalizedString)
- func (n *NormalizedString) NFKC() (retVal *NormalizedString)
- func (n *NormalizedString) NFKD() (retVal *NormalizedString)
- func (n *NormalizedString) OffsetsOriginal() []int
- func (n *NormalizedString) Prepend(s string) (retVal *NormalizedString)
- func (n *NormalizedString) RStrip() (retVal *NormalizedString)
- func (n *NormalizedString) Range(r *Range) (retVal string)
- func (n *NormalizedString) RangeOriginal(r *Range) (retVal string)
- func (n *NormalizedString) RemoveAccents() (retVal *NormalizedString)
- func (n *NormalizedString) Replace(pattern Pattern, content string) (retVal *NormalizedString)
- func (n *NormalizedString) Shift() int
- func (n *NormalizedString) Slice(inputRange *Range) (retVal *NormalizedString)
- func (n *NormalizedString) Split(pattern Pattern, behavior SplitDelimiterBehavior) (retVal []NormalizedString)
- func (n *NormalizedString) Strip() (retVal *NormalizedString)
- func (n *NormalizedString) Transform(m []ChangeMap, initialOffset int) (retVal *NormalizedString)
- func (n *NormalizedString) TransformRange(inputRange *Range, changeMap []ChangeMap, initialOffset int) (retVal *NormalizedString)
- func (n *NormalizedString) Uppercase() (retVal *NormalizedString)
type Normalizer
- func Lowercase() Normalizer
- func NewNormalizer(opts ...Option) Normalizer
type OffsetsMatch
type OffsetsRemove
type Option
- func WithBertNormalizer(cleanText, lowercase, handleChineseChars, stripAccents bool) Option
- func WithUnicodeNormalizer(form norm.Form) Option
type Pattern
type PatternFn
type Precompiled
- func (m *Precompiled) Normalize(normalized *NormalizedString) (*NormalizedString, error)
type Prepend
- func NewPrepend(prepend string) *Prepend
- func (p *Prepend) Normalize(normalized *NormalizedString) (*NormalizedString, error)
type Range
- func NewRange(start int, end int, indexOn IndexOn) (retVal *Range)
- func (r *Range) End() (retVal int)
- func (r *Range) IntoFullRange(maxLen int) (retVal *Range)
- func (r *Range) Len() int
- func (r *Range) On() IndexOn
- func (r *Range) Start() (retVal int)
- func (r *Range) Values() []int
type RegexpPattern
- func NewRegexpPattern(s string) *RegexpPattern
- func (rp *RegexpPattern) FindMatches(inside string) []OffsetsMatch
type Replace
- func NewReplace(patternType ReplacePattern, pattern string, content string) *Replace
- func (r *Replace) Decode(tokens []string) string
- func (r *Replace) DecodeChain(tokens []string) []string
- func (r *Replace) Normalize(normalized *NormalizedString) (*NormalizedString, error)
type ReplacePattern
type RunePattern
- func NewRunePattern(r rune) *RunePattern
- func (r *RunePattern) FindMatches(inside string) []OffsetsMatch
type Sequence
- func NewSequence(norms []Normalizer) *Sequence
- func (s *Sequence) Normalize(normalized *NormalizedString) (*NormalizedString, error)
type SplitDelimiterBehavior
type StringPattern
- func NewStringPattern(s string) *StringPattern
- func (s *StringPattern) FindMatches(inside string) []OffsetsMatch
type Strip
- func NewStrip(stripLeft, stripRight bool) *Strip
- func (s *Strip) Normalize(normalized *NormalizedString) (*NormalizedString, error)
type StripAccents
- func NewStripAccents() *StripAccents
- func (sa *StripAccents) Normalize(normalized *NormalizedString) (*NormalizedString, error)
type UnicodeNormalizer
- func NewUnicodeNormalizer(form norm.Form) *UnicodeNormalizer
- func (un *UnicodeNormalizer) Normalize(n *NormalizedString) (*NormalizedString, error)

Constants ¶

View Source

const (
	RemovedBehavior = iota
	IsolatedBehavior
	MergedWithPreviousBehavior
	MergedWithNextBehavior
	ContiguousBehavior
)

View Source

const (
	OriginalTarget = iota
	NormalizedTarget
)

Variables ¶

This section is empty.

Functions ¶

func BytesToChar ¶

func BytesToChar(s string, byteRange []int) (retVal []int)

BytesToChar converts a given range from bytes to `char`

func CharToBytes ¶

func CharToBytes(s string, charRange []int) (retVal []int)

CharToBytes converts a given range from `char` to bytes

func FindAllStringIndex ¶

func FindAllStringIndex(re *re2.Regexp, s string, n int) [][]int

func IsBertPunctuation ¶

func IsBertPunctuation(c rune) bool

IsBertPunctuation checks whether an input rune is a BERT punctuation

func IsBertWhitespace ¶

func IsBertWhitespace(c rune) bool

IsBertWhitespace checks whether an input rune is a BERT whitespace

func IsChinese ¶

func IsChinese(c rune) bool

IsChinese validates that rune c is in the CJK range according to BERT spec

func IsPunctuation ¶

func IsPunctuation(c rune) bool

IsPunctuation returns whether input rune is a punctuation or not.

func IsWhitespace ¶

func IsWhitespace(c rune) bool

IsWhitespace checks whether an input rune is a whitespace

func RangeOf ¶

func RangeOf(s string, r []int) (retVal string)

RangeOf returns a range of the normalized string. It returns an empty string if the input range is out of bounds.

Types ¶

type BertNormalizer ¶

type BertNormalizer struct {
	CleanText          bool `json:"clean_text"`           // Whether to remove Control characters and all sorts of whitespaces replaced with single ` ` space
	Lowercase          bool `json:"lowercase"`            // Whether to do lowercase
	HandleChineseChars bool `json:"handle_chinese_chars"` // Whether to put spaces around chinese characters so they get split
	StripAccents       bool `json:"strip_accents"`        // whether to remove accents
}

func NewBertNormalizer ¶

func NewBertNormalizer(cleanText, lowercase, handleChineseChars, stripAccents bool) *BertNormalizer

func (*BertNormalizer) Normalize ¶

func (bn *BertNormalizer) Normalize(n *NormalizedString) (*NormalizedString, error)

Normalize implements Normalizer interface for BertNormalizer

type ChangeMap ¶

type ChangeMap struct {
	RuneVal string
	Changes int
}

type DefaultNormalizer ¶

type DefaultNormalizer struct {
	// contains filtered or unexported fields
}

func NewDefaultNormalizer ¶

func NewDefaultNormalizer(opts ...DefaultOption) *DefaultNormalizer

func (*DefaultNormalizer) Normalize ¶

func (dn *DefaultNormalizer) Normalize(n *NormalizedString) (*NormalizedString, error)

type DefaultOption ¶

type DefaultOption func(*DefaultNormalizer)

func WithLowercase ¶

func WithLowercase(lowercase bool) DefaultOption

func WithStrip ¶

func WithStrip(strip bool) DefaultOption

type FnPattern ¶

type FnPattern struct {
	// contains filtered or unexported fields
}

func NewFnPattern ¶

func NewFnPattern(fn PatternFn) *FnPattern

func (*FnPattern) FindMatches ¶

func (fp *FnPattern) FindMatches(inside string) []OffsetsMatch

FindMatches implements Pattern interface for FnPattern

type IndexOn ¶

type IndexOn int

IndexOn is an enum-like type representing which string (original or normalized) the range indexes on.

type Invert ¶

type Invert struct {
	Pattern Pattern
}

Invert inverts the `is_match` flags for the wrapped Pattern. This is useful, for example, when we use a regex that matches words instead of a delimiter, and we want to match the delimiter.

func NewInvertPattern ¶

func NewInvertPattern(p Pattern) *Invert

func (*Invert) FindMatches ¶

func (i *Invert) FindMatches(inside string) []OffsetsMatch

FindMatches implements Pattern interface for Invert

type NFC ¶

type NFC struct{}

func NewNFC ¶

func NewNFC() *NFC

func (*NFC) Normalize ¶

func (n *NFC) Normalize(norm *NormalizedString) (*NormalizedString, error)

type NFD ¶

type NFD struct{}

func NewNFD ¶

func NewNFD() *NFD

func (*NFD) Normalize ¶

func (n *NFD) Normalize(norm *NormalizedString) (*NormalizedString, error)

type NFKC ¶

type NFKC struct{}

func NewNFKC ¶

func NewNFKC() *NFKC

func (*NFKC) Normalize ¶

func (n *NFKC) Normalize(norm *NormalizedString) (*NormalizedString, error)

type NFKD ¶

type NFKD struct{}

func NewNFKD ¶

func NewNFKD() *NFKD

func (*NFKD) Normalize ¶

func (n *NFKD) Normalize(norm *NormalizedString) (*NormalizedString, error)

type NormFn ¶

type NormFn func(rune) rune

NormFn is a convenient function type for applying on each `char` of normalized string

type NormalizedString ¶

type NormalizedString struct {
	// contains filtered or unexported fields
}

A `NormalizedString` takes care of processing an "original" string to modify it and obtain a "normalized" string. It keeps both versions of the string and the alignment information between them, and provides an interface to retrieve ranges of each string, using offsets from either of them.

It is possible to retrieve a part of the original string, by indexing it with offsets from the normalized one, and the other way around too. It is also possible to convert offsets from one referential to the other one easily.

func NewNormalizedFrom ¶

func NewNormalizedFrom(s string) (retVal *NormalizedString)

NewNormalizedFrom creates a Normalized instance from string input

func NewNormalizedString ¶

func NewNormalizedString(original, normalized string, alignments, alignmentsOriginal [][]int, originalShift int) *NormalizedString

func (*NormalizedString) Alignments ¶

func (n *NormalizedString) Alignments() (retVal [][]int)

Alignments returns alignments mapping normalized string to original string

func (*NormalizedString) AlignmentsOriginal ¶

func (n *NormalizedString) AlignmentsOriginal() (retVal [][]int)

AlignmentsOriginal returns original alignments mapping to original string

func (*NormalizedString) Append ¶

func (n *NormalizedString) Append(s string) (retVal *NormalizedString)

Append adds given string to the end of NormalizedString

func (*NormalizedString) Clear ¶

func (n *NormalizedString) Clear()

Clear clears the normalized part of the string

func (*NormalizedString) ConvertOffset ¶

func (n *NormalizedString) ConvertOffset(inputRange *Range) (retVal *Range)

ConvertOffset converts the given offsets range from one referential to the other one: `Original => Normalized` or `Normalized => Original`

Returns `nil` when targeting something that is outside range

func (*NormalizedString) Filter ¶

func (n *NormalizedString) Filter(fn func(rune) bool) (retVal *NormalizedString)

Filter applies filtering on NormalizedString

func (*NormalizedString) ForEach ¶

func (n *NormalizedString) ForEach(nfn NormFn) (retVal *NormalizedString)

ForEach applies the given function to each `char` of the normalized string (similar to Map?).

func (*NormalizedString) GetNormalized ¶

func (n *NormalizedString) GetNormalized() string

GetNormalized returns the normalized string

func (*NormalizedString) GetOriginal ¶

func (n *NormalizedString) GetOriginal() string

GetOriginal returns the original string

func (*NormalizedString) IsEmpty ¶

func (n *NormalizedString) IsEmpty() bool

IsEmpty returns whether the normalized string is empty

func (*NormalizedString) LStrip ¶

func (n *NormalizedString) LStrip() (retVal *NormalizedString)

LStrip removes leading spaces

func (*NormalizedString) Len ¶

func (n *NormalizedString) Len() int

Len returns length (in bytes) of normalized string

func (*NormalizedString) LenOriginal ¶

func (n *NormalizedString) LenOriginal() int

LenOriginal returns the length of Original string in bytes

func (*NormalizedString) Lowercase ¶

func (n *NormalizedString) Lowercase() (retVal *NormalizedString)

Lowercase transforms string to lowercase

func (*NormalizedString) Map ¶

func (n *NormalizedString) Map(nfn NormFn) (retVal *NormalizedString)

Map maps and applies function to each `char` of normalized string

func (*NormalizedString) NFC ¶

func (n *NormalizedString) NFC() (retVal *NormalizedString)

func (*NormalizedString) NFD ¶

func (n *NormalizedString) NFD() (retVal *NormalizedString)

func (*NormalizedString) NFKC ¶

func (n *NormalizedString) NFKC() (retVal *NormalizedString)

func (*NormalizedString) NFKD ¶

func (n *NormalizedString) NFKD() (retVal *NormalizedString)

func (*NormalizedString) OffsetsOriginal ¶

func (n *NormalizedString) OffsetsOriginal() []int

OffsetsOriginal returns the original offsets

func (*NormalizedString) Prepend ¶

func (n *NormalizedString) Prepend(s string) (retVal *NormalizedString)

Prepend adds given string to the beginning of NormalizedString

func (*NormalizedString) RStrip ¶

func (n *NormalizedString) RStrip() (retVal *NormalizedString)

RStrip removes trailing spaces

func (*NormalizedString) Range ¶

func (n *NormalizedString) Range(r *Range) (retVal string)

Range returns a substring of the NORMALIZED string

func (*NormalizedString) RangeOriginal ¶

func (n *NormalizedString) RangeOriginal(r *Range) (retVal string)

RangeOriginal returns substring of ORIGINAL string

func (*NormalizedString) RemoveAccents ¶

func (n *NormalizedString) RemoveAccents() (retVal *NormalizedString)

RemoveAccents removes all Unicode Mn group (M non-spacing)

func (*NormalizedString) Replace ¶

func (n *NormalizedString) Replace(pattern Pattern, content string) (retVal *NormalizedString)

func (*NormalizedString) Shift ¶

func (n *NormalizedString) Shift() int

Shift returns original shift

func (*NormalizedString) Slice ¶

func (n *NormalizedString) Slice(inputRange *Range) (retVal *NormalizedString)

Slice returns a slice of the current NormalizedString. If the range is not on char boundaries, it returns `nil`.

func (*NormalizedString) Split ¶

func (n *NormalizedString) Split(pattern Pattern, behavior SplitDelimiterBehavior) (retVal []NormalizedString)

Split the current string in many subparts. Specify what to do with the delimiter.

This method will always ensure that the entire `NormalizedString` is covered in the produced subparts. This means that the delimiter parts will also be included, and will appear empty if we don't want to include them (their `original` part will still be present). It should always be possible to merge all the subparts back to the original `NormalizedString`.

## Splitting Behavior for the delimiter

The behavior can be one of the following. When splitting on `'-'` for example, with input `the-final--countdown`:

RemovedBehavior => `[ "the", "", "final", "", "", "countdown" ]`
IsolatedBehavior => `[ "the", "-", "final", "-", "-", "countdown" ]`
MergedWithPreviousBehavior => `[ "the-", "final-", "-", "countdown" ]`
MergedWithNextBehavior => `[ "the", "-final", "-", "-countdown" ]`
ContiguousBehavior => `[ "the", "-", "final", "--", "countdown" ]`

func (*NormalizedString) Strip ¶

func (n *NormalizedString) Strip() (retVal *NormalizedString)

Strip removes leading and trailing spaces

func (*NormalizedString) Transform ¶

func (n *NormalizedString) Transform(m []ChangeMap, initialOffset int) (retVal *NormalizedString)

Transform applies transformations to the current normalized version, updating the current alignments with the new ones. This method expects an iterator yielding each rune of the new normalized string with a `change` integer equal to:

`1` if this is a new rune
`-N` if the char is right before N removed runes
`0` if this rune represents the old one (even if changed)

Since it is possible that the normalized string doesn't include some of the `characters` (runes) at the beginning of the original one, we need an `initial_offset` which represents the number of removed runes at the very beginning.

`change` should never be more than `1`. If multiple runes are added, each of them has a `change` of `1`, but more doesn't make any sense. We treat any value above `1` as `1`.

E.g. string `élégant` Before NFD(): [{233 0} {108 1} {233 2} {103 3} {97 4} {110 5} {116 6}] After NFD(): [{101 0} {769 1} {108 2} {101 3} {769 4} {103 5} {97 6} {110 7} {116 8}] New Alignments: {0, 1}, {0, 1}, {1, 2}, {2, 3}, {2, 3}, {3, 4}, {4, 5}, {5, 6}, {6, 7},

func (*NormalizedString) TransformRange ¶

func (n *NormalizedString) TransformRange(inputRange *Range, changeMap []ChangeMap, initialOffset int) (retVal *NormalizedString)

TransformRange expects an iterator yielding each `char` of the new normalized string with a `change` of int type equal to:

`1` if this is a new char
`-N` if the char is right before N removed chars
`0` if the char is replacing the existing one

Since it is possible that the normalized string doesn't include some of the characters at the beginning of the original one, we need an `initialOffset` which represents the number of removed chars at the very beginning.

func (*NormalizedString) Uppercase ¶

func (n *NormalizedString) Uppercase() (retVal *NormalizedString)

Uppercase transforms string to uppercase

type Normalizer ¶

type Normalizer interface {
	Normalize(normalized *NormalizedString) (*NormalizedString, error)
}

func Lowercase ¶

func Lowercase() Normalizer

Lowercase creates a lowercase normalizer

func NewNormalizer ¶

func NewNormalizer(opts ...Option) Normalizer

type OffsetsMatch ¶

type OffsetsMatch struct {
	Offsets []int // slice of 2 elements (start, end)
	Match   bool
}

OffsetsMatch contains a pair of offsets (start, end) and a boolean indicating whether this is a match or not.

type OffsetsRemove ¶

type OffsetsRemove struct {
	Offsets      []int
	ShouldRemove bool
}

type Option ¶

type Option func(*normalizer)

func WithBertNormalizer ¶

func WithBertNormalizer(cleanText, lowercase, handleChineseChars, stripAccents bool) Option

WithBertNormalizer creates normalizer with BERT normalization features.

func WithUnicodeNormalizer ¶

func WithUnicodeNormalizer(form norm.Form) Option

WithUnicodeNormalizer creates normalizer with one of unicode NFD, NFC, NFKD, or NFKC normalization feature.

type Pattern ¶

type Pattern interface {
	// FindMatches slices the given string in a list of pattern match positions, with
	// a boolean indicating whether this is a match or not.
	//
	// NOTE. This method *must* cover the whole string in its outputs, with
	// contiguous ordered slices.
	FindMatches(inside string) []OffsetsMatch
}

Pattern is used to split a NormalizedString

type PatternFn ¶

type PatternFn func(rune) bool

PatternFn is a func type to apply pattern

type Precompiled ¶

type Precompiled struct {
	*spm.Precompiled
}

func (*Precompiled) Normalize ¶

func (m *Precompiled) Normalize(normalized *NormalizedString) (*NormalizedString, error)

Implement Normalizer for spm.Precompiled

type Prepend ¶

type Prepend struct {
	Prepend string `json:"prepend"`
}

Prepend is a normalizer that prepends the given string to the normalized string in place.

func NewPrepend ¶

func NewPrepend(prepend string) *Prepend

func (*Prepend) Normalize ¶

func (p *Prepend) Normalize(normalized *NormalizedString) (*NormalizedString, error)

Implement Normalizer for Prepend

type Range ¶

type Range struct {
	// contains filtered or unexported fields
}

Range is a slice of indexes on either the normalized string or the original string. It is INCLUSIVE of start and EXCLUSIVE of end.

func NewRange ¶

func NewRange(start int, end int, indexOn IndexOn) (retVal *Range)

func (*Range) End ¶

func (r *Range) End() (retVal int)

func (*Range) IntoFullRange ¶

func (r *Range) IntoFullRange(maxLen int) (retVal *Range)

IntoFullRange converts the current range to cover the case where the originally provided range was out of bounds. maxLen is the maximal length of the string in `chars` (runes).

func (*Range) Len ¶

func (r *Range) Len() int

Len returns the length of the current Range if not unbounded

func (*Range) On ¶

func (r *Range) On() IndexOn

On returns the target (original or normalized string) that the range indexes on

func (*Range) Start ¶

func (r *Range) Start() (retVal int)

func (*Range) Values ¶

func (r *Range) Values() []int

Values returns range values (start, end)

type RegexpPattern ¶

type RegexpPattern struct {
	// contains filtered or unexported fields
}

func NewRegexpPattern ¶

func NewRegexpPattern(s string) *RegexpPattern

func (*RegexpPattern) FindMatches ¶

func (rp *RegexpPattern) FindMatches(inside string) []OffsetsMatch

FindMatches implements Pattern interface for RegexpPattern

type Replace ¶

type Replace struct {
	PatternType ReplacePattern `json:"pattern_type"`
	Pattern     Pattern        `json:"pattern"`
	Content     string         `json:"content"`
}

func NewReplace ¶

func NewReplace(patternType ReplacePattern, pattern string, content string) *Replace

func (*Replace) Decode ¶

func (r *Replace) Decode(tokens []string) string

func (*Replace) DecodeChain ¶

func (r *Replace) DecodeChain(tokens []string) []string

Implement Decoder for Replace

func (*Replace) Normalize ¶

func (r *Replace) Normalize(normalized *NormalizedString) (*NormalizedString, error)

Implement Normalizer for Replace

type ReplacePattern ¶

type ReplacePattern int

Enum of different patterns that Replace can use.

const (
	String ReplacePattern = iota
	Regex
)

type RunePattern ¶

type RunePattern struct {
	// contains filtered or unexported fields
}

RunePattern is a wrapper of primitive rune so that it can implement `Pattern` interface

func NewRunePattern ¶

func NewRunePattern(r rune) *RunePattern

func (*RunePattern) FindMatches ¶

func (r *RunePattern) FindMatches(inside string) []OffsetsMatch

FindMatches implements Pattern interface for RunePattern

type Sequence ¶

type Sequence struct {
	Normalizers []Normalizer `json:"normalizers"`
}

Sequence wraps a slice of normalizers to normalize string in sequence.

func NewSequence ¶

func NewSequence(norms []Normalizer) *Sequence

func (*Sequence) Normalize ¶

func (s *Sequence) Normalize(normalized *NormalizedString) (*NormalizedString, error)

Implement Normalizer for Sequence

type SplitDelimiterBehavior ¶

type SplitDelimiterBehavior int

SplitDelimiterBehavior is an enum-like type. It defines the expected behavior for the delimiter of a Split Pattern. When splitting on `'-'` for example, with input `the-final--countdown`:

RemovedBehavior => `[ "the", "final", "countdown" ]`
IsolatedBehavior => `[ "the", "-", "final", "-", "-", "countdown" ]`
MergedWithPreviousBehavior => `[ "the-", "final-", "-", "countdown" ]`
MergedWithNextBehavior => `[ "the", "-final", "-", "-countdown" ]`
ContiguousBehavior => `[ "the", "-", "final", "--", "countdown" ]`

type StringPattern ¶

type StringPattern struct {
	// contains filtered or unexported fields
}

StringPattern is a wrapper of primitive string so that it can implement `Pattern` interface

func NewStringPattern ¶

func NewStringPattern(s string) *StringPattern

func (*StringPattern) FindMatches ¶

func (s *StringPattern) FindMatches(inside string) []OffsetsMatch

type Strip ¶

type Strip struct {
	// contains filtered or unexported fields
}

func NewStrip ¶

func NewStrip(stripLeft, stripRight bool) *Strip

func (*Strip) Normalize ¶

func (s *Strip) Normalize(normalized *NormalizedString) (*NormalizedString, error)

type StripAccents ¶

type StripAccents struct{}

func NewStripAccents ¶

func NewStripAccents() *StripAccents

func (*StripAccents) Normalize ¶

func (sa *StripAccents) Normalize(normalized *NormalizedString) (*NormalizedString, error)

type UnicodeNormalizer ¶

type UnicodeNormalizer struct {
	Form norm.Form
}

func NewUnicodeNormalizer ¶

func NewUnicodeNormalizer(form norm.Form) *UnicodeNormalizer

func (*UnicodeNormalizer) Normalize ¶

func (un *UnicodeNormalizer) Normalize(n *NormalizedString) (*NormalizedString, error)

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL