Documentation
¶
Index ¶
- Constants
- Variables
- func ReleaseCtx[T byteseq.Byteseq](ctx *Ctx[T])
- type Bigram
- type Cleaner
- type Ctx
- func (ctx *Ctx[T]) Clean() *Ctx[T]
- func (ctx *Ctx[T]) DetectScript() *Ctx[T]
- func (ctx *Ctx[T]) DetectScriptProba() *Ctx[T]
- func (ctx *Ctx[T]) GetError() error
- func (ctx *Ctx[T]) GetOriginText() T
- func (ctx *Ctx[T]) GetRunes() []rune
- func (ctx *Ctx[T]) GetScript() Script
- func (ctx *Ctx[T]) GetScriptProba() ScriptProba
- func (ctx *Ctx[T]) GetScriptsLimit() []Script
- func (ctx *Ctx[T]) GetText() T
- func (ctx *Ctx[T]) GetTokens() Tokens
- func (ctx *Ctx[T]) LimitScripts(list []Script) *Ctx[T]
- func (ctx *Ctx[T]) Modify() *Ctx[T]
- func (ctx *Ctx[T]) Reset() *Ctx[T]
- func (ctx *Ctx[T]) ResetCleaners() *Ctx[T]
- func (ctx *Ctx[T]) ResetModifiers() *Ctx[T]
- func (ctx *Ctx[T]) ResetScriptDetector() *Ctx[T]
- func (ctx *Ctx[T]) ResetTokenizer() *Ctx[T]
- func (ctx *Ctx[T]) SetText(text T) *Ctx[T]
- func (ctx *Ctx[T]) Tokenize() *Ctx[T]
- func (ctx *Ctx[T]) WithCleaner(cln Cleaner[T]) *Ctx[T]
- func (ctx *Ctx[T]) WithModifier(mod Modifier[T]) *Ctx[T]
- func (ctx *Ctx[T]) WithScriptDetector(ds ScriptDetector[T]) *Ctx[T]
- func (ctx *Ctx[T]) WithTokenizer(tkn Tokenizer[T]) *Ctx[T]
- type DummyModifier
- type Fivegram
- type Language
- type Modifier
- type NGModel
- func (m *NGModel[T]) AddBigram(ng Bigram) *NGModel[T]
- func (m *NGModel[T]) AddFivegram(ng Fivegram) *NGModel[T]
- func (m *NGModel[T]) AddQuadrigram(ng Quadrigram) *NGModel[T]
- func (m *NGModel[T]) AddTrigram(ng Trigram) *NGModel[T]
- func (m *NGModel[T]) AddUnigram(ng Unigram) *NGModel[T]
- func (m *NGModel[T]) LoadFile(path string) error
- func (m *NGModel[T]) Parse(text T) *NGModel[T]
- func (m *NGModel[T]) Read(r io.Reader) (n int, err error)
- func (m *NGModel[T]) Stat() (int, int, int, int, int)
- func (m *NGModel[T]) Write(w io.Writer) (n int, err error)
- type Quadrigram
- type SRE
- type Script
- type ScriptDetectAlgo
- type ScriptDetector
- type ScriptProba
- type ScriptScore
- type StringTokenizer
- type Token
- type Tokenizer
- type TokenizerBlankLines
- type Tokens
- type Trigram
- type UnicodeCleaner
- type UnicodeScriptDetector
- type Unigram
Constants ¶
const ( CleanControl = 1 << iota CleanMark CleanPunct CleanSpace CleanDigit CleanNumber CleanSymbol CleanLetter CleanPrint CleanGraphic DefaultCleanMask = CleanControl | CleanMark | CleanSymbol | CleanNumber | CleanPunct )
Variables ¶
var ( ErrEmptyInput = errors.New("input text is empty") ErrBadVersion = errors.New("incompatible version") )
Functions ¶
func ReleaseCtx ¶
Types ¶
type Ctx ¶
type Ctx[T byteseq.Byteseq] struct { bitset.Bitset BufSP ScriptProba // contains filtered or unexported fields }
func AcquireCtx ¶
func (*Ctx[T]) DetectScript ¶
func (*Ctx[T]) DetectScriptProba ¶
func (*Ctx[T]) GetOriginText ¶
func (ctx *Ctx[T]) GetOriginText() T
func (*Ctx[T]) GetScriptProba ¶
func (ctx *Ctx[T]) GetScriptProba() ScriptProba
func (*Ctx[T]) GetScriptsLimit ¶
func (*Ctx[T]) LimitScripts ¶
func (*Ctx[T]) ResetCleaners ¶
func (*Ctx[T]) ResetModifiers ¶
func (*Ctx[T]) ResetScriptDetector ¶
func (*Ctx[T]) ResetTokenizer ¶
func (*Ctx[T]) WithCleaner ¶
func (*Ctx[T]) WithModifier ¶
func (*Ctx[T]) WithScriptDetector ¶
func (ctx *Ctx[T]) WithScriptDetector(ds ScriptDetector[T]) *Ctx[T]
func (*Ctx[T]) WithTokenizer ¶
type DummyModifier ¶
func (DummyModifier[T]) AppendModify ¶
func (DummyModifier[T]) AppendModify(dst []rune, _ T) []rune
func (DummyModifier[T]) Modify ¶
func (DummyModifier[T]) Modify(x T) T
type Fivegram ¶
type Fivegram struct {
// contains filtered or unexported fields
}
func NewFivegram ¶
type Language ¶
type Language uint
Language describes a language type that allows retrieving different forms of the language's name.
Use the generated language_repo.go for fast access to the names repository. This is similar to the stringer approach but ~2-3 times faster. See https://github.com/koykov/versus/tree/master/stringer for comparison benchmarks.
const ( LanguageAbaza Language = iota LanguageAbenaki_Penobscot LanguageAbkhaz LanguageAdyghe LanguageAfar LanguageAfrikaans LanguageAghul LanguageAhtna LanguageAinu LanguageAlsatian LanguageArabic_Romanized LanguageArabic LanguageArmenian LanguageAromanian LanguageAkan LanguageAlbanian LanguageAmharic LanguageAzerbaijani LanguageBalkar LanguageBalochi LanguageBambara LanguageBashkir LanguageBasque LanguageBlackfoot LanguageBolivian_Quechua LanguageBonan LanguageBhojpuri LanguageBelarusian LanguageBengali LanguageBosnian LanguageBulgarian LanguageBurmese LanguageBrazilian_Portuguese LanguageBrazilian_Veneto LanguageBreton LanguageBudukh LanguageBuryat LanguageCantonese LanguageCatalan LanguageCebuano LanguageChti LanguageCherokee LanguageCheyenne LanguageChewa LanguageChichewa LanguageChinese LanguageChoctaw LanguageChulym LanguageChuukese LanguageChuvash LanguageCornish LanguageCroatian LanguageCzech LanguageDalmatian LanguageDanish LanguageDaur LanguageDolgan LanguageDongxiang LanguageDutch LanguageEnets LanguageEnglish LanguageEsperanto LanguageEstonian LanguageEvenki LanguageFaroese LanguageFinnish LanguageFrench LanguageFulfulde LanguageGagauz LanguageGanda LanguageGbari LanguageGeorgian LanguageGerman LanguageGothic LanguageGreenlandic LanguageGreek LanguageGujarati LanguageGwichin LanguageHaida LanguageHaitian_Creole LanguageHan LanguageHausa LanguageHebrew LanguageHindi LanguageHmar LanguageHmong LanguageHungarian LanguageIndonesian LanguageInuit LanguageIrish LanguageIgbo LanguageIcelandic LanguageIlocano LanguageItalian LanguageJapanese LanguageJavanese LanguageJerriais LanguageJingpho LanguageJurchen LanguageKabardian LanguageKabyle LanguageKalmyk LanguageKamas LanguageKaraim LanguageKarakalpak LanguageKarakhanid LanguageKashubian LanguageKaska LanguageKazakh LanguageKannada LanguageKet LanguageKhakas LanguageKhalaj LanguageKhanty LanguageKhmer LanguageKhowar LanguageKiche LanguageKinyarwanda LanguageKipsigis LanguageKirghiz LanguageKiribati 
LanguageKoine_Greek LanguageKomi LanguageKorean LanguageKorean_Hangul LanguageKosraean LanguageKott LanguageKryts LanguageKumyk LanguageKurdish LanguageKutchi LanguageLak LanguageLakota LanguageLao LanguageLatin LanguageLatvian LanguageLaz LanguageLepcha LanguageLezgi LanguageLigurian LanguageLingala LanguageLithuanian LanguageMacedonian LanguageMalagasy LanguageMalay LanguageMalayalam LanguageMaori LanguageMarathi LanguageMongolian LanguageMaithili LanguageNepali LanguageNorwegian_Bokmal LanguageNorwegian_Nynorsk LanguageOromo LanguageOriya LanguagePunjabi LanguagePersian LanguagePolish LanguagePortuguese LanguageKirundi LanguageRomanian LanguageRussian LanguageSaraiki LanguageSerbian LanguageShona LanguageSinhalese LanguageSlovak LanguageSlovenian LanguageSomali LanguageSotho LanguageSpanish LanguageSwahili LanguageSwedish LanguageTamil LanguageTelugu LanguageThai LanguageTigrinya LanguageTurkmen LanguageTagalog LanguageTswana LanguageTurkish LanguageTsonga LanguageUyghur LanguageUkrainian LanguageUrdu LanguageUzbek LanguageVietnamese LanguageWelsh LanguageXhosa LanguageYiddish LanguageYoruba LanguageZulu )
type NGModel ¶
type NGModel[T byteseq.Byteseq] struct { Version uint64 Tokenizer Tokenizer[T] // contains filtered or unexported fields }
func (*NGModel[T]) AddFivegram ¶
func (*NGModel[T]) AddQuadrigram ¶
func (m *NGModel[T]) AddQuadrigram(ng Quadrigram) *NGModel[T]
func (*NGModel[T]) AddTrigram ¶
func (*NGModel[T]) AddUnigram ¶
type Quadrigram ¶
type Quadrigram uint64
func NewQuadrigram ¶
func NewQuadrigram(a, b, c, d rune) (n Quadrigram)
func (Quadrigram) AppendTo ¶
func (q Quadrigram) AppendTo(dst []byte) []byte
func (Quadrigram) String ¶
func (q Quadrigram) String() string
type SRE ¶
SRE is a script rune evaluator. For the performance of the nested-functions approach, see https://github.com/koykov/lab/tree/master/call_perf
type Script ¶
type Script uint
const ( ScriptLatin Script = iota ScriptArabic ScriptCyrillic ScriptDevanagari ScriptEthiopic ScriptHan ScriptTagalog ScriptPhags_Pa ScriptTelugu ScriptHebrew ScriptBopomofo ScriptMyanmar ScriptBengali ScriptDeseret ScriptShavian ScriptDuployan ScriptGeorgian ScriptRunic ScriptGreek ScriptGujarati ScriptArmenian ScriptMahajani ScriptOgham ScriptSyriac ScriptHiragana ScriptKatakana ScriptJavanese ScriptKannada ScriptKhmer ScriptHangul ScriptMalayalam ScriptModi ScriptMongolian ScriptTirhuta ScriptElbasan ScriptGurmukhi ScriptSinhala ScriptOsmanya ScriptTamil ScriptThai )
func ScriptsSupported ¶
func ScriptsSupported() []Script
func (Script) Evaluate ¶
Evaluate checks whether the given rune r is written in script s. It uses a precompiled SRE (script rune evaluator) to speed up evaluation. See the performance tests at https://github.com/koykov/versus/blob/master/nlp_script/evaluate_test.go
type ScriptDetectAlgo ¶
type ScriptDetectAlgo uint
const ( ScriptDetectAlgoHalf ScriptDetectAlgo = iota ScriptDetectAlgoDistributed ScriptDetectAlgoFull )
type ScriptDetector ¶
type ScriptProba ¶
type ScriptProba []ScriptScore
func (ScriptProba) Len ¶
func (s ScriptProba) Len() int
func (ScriptProba) Less ¶
func (s ScriptProba) Less(i, j int) bool
func (*ScriptProba) Swap ¶
func (s *ScriptProba) Swap(i, j int)
type ScriptScore ¶
type StringTokenizer ¶
type StringTokenizer[T byteseq.Byteseq] struct { Separator string BlankLines TokenizerBlankLines }
func NewStringTokenizer ¶
func NewStringTokenizer[T byteseq.Byteseq](sep string, blankLines TokenizerBlankLines) StringTokenizer[T]
func (StringTokenizer[T]) AppendTokenize ¶
func (t StringTokenizer[T]) AppendTokenize(dst Tokens, x T) Tokens
func (StringTokenizer[T]) Tokenize ¶
func (t StringTokenizer[T]) Tokenize(x T) Tokens
type TokenizerBlankLines ¶
type TokenizerBlankLines int
const ( TokenizerBlankLinesDiscard TokenizerBlankLines = iota TokenizerBlankLinesKeep TokenizerBlankLinesDiscardEOF DefaultTokenSeparator = " \n\t" )
type UnicodeCleaner ¶
func NewUnicodeCleaner ¶
func NewUnicodeCleaner[T byteseq.Byteseq](mask uint32) UnicodeCleaner[T]
func (UnicodeCleaner[T]) AppendClean ¶
func (c UnicodeCleaner[T]) AppendClean(dst []rune, x T) []rune
func (UnicodeCleaner[T]) Clean ¶
func (c UnicodeCleaner[T]) Clean(x T) []rune
type UnicodeScriptDetector ¶
UnicodeScriptDetector is a built-in detector of writing scripts.
func NewUnicodeScriptDetector ¶
func NewUnicodeScriptDetector[T byteseq.Byteseq]() UnicodeScriptDetector[T]
func NewUnicodeScriptDetectorWithAlgo ¶
func NewUnicodeScriptDetectorWithAlgo[T byteseq.Byteseq](algo ScriptDetectAlgo) UnicodeScriptDetector[T]
func (UnicodeScriptDetector[T]) Detect ¶
func (d UnicodeScriptDetector[T]) Detect(ctx *Ctx[T]) (Script, error)
func (UnicodeScriptDetector[T]) DetectProba ¶
func (d UnicodeScriptDetector[T]) DetectProba(ctx *Ctx[T]) (ScriptProba, error)