Documentation ¶
Index ¶
Constants ¶
const ( DatePattern = `(?i)(?:(?:tgl)?\d{1,2}[^0-9^:]\d{1,2}[^0-9^:](?:19|20)?\d{2})|(?:(?:19|20)?\d{2}[^0-9^:]\d{1,2}[^0-9^:]\d{1,2})` TimePattern = `(?:(?:0?|[12])\d\s*:+\s*[0-5]\d(?:\s*:+\s*[0-5]\d(?:\.\d+)?(?:\s*(?:\+|-)(?:0?\d|1[0-2]):?(?:0|3)0)?)?)` //TimePattern = `(?is)((?:0?|[12])\d\s*:+\s*[0-5]\d(?:\s*:+\s*[0-5]\d)?(?:\s*[,:.]*\s*(?:am|pm))?|(?:0?|[12])\d\s*[.\s]+\s*[0-5]\d(?:\s*[,:.]*\s*(?:am|pm))+)` PhonePattern = `` /* 133-byte string literal not displayed */ PhonesWithExtsPattern = `` /* 273-byte string literal not displayed */ LinkPattern = `(?i)(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w\.-]*)*\/?` EmailPattern = `(?i)([A-Za-z0-9!#$%&'*+\/=?^_{|.}~-]+@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)` IPv4Pattern = `(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)` CreditCardPattern = `(?i)(?:(?:(?:[\d\*x]{4}[- ]?){3}[\d\*x]{4}|[\d\*x]{15,16}))` VISACreditCardPattern = `4\d{3}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}` MCCreditCardPattern = `5[1-5]\d{2}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}` BtcAddressPattern = `[13][a-km-zA-HJ-NP-Z1-9]{25,34}` SSNPattern = `(?:\d{3}-\d{2}-\d{4})` MD5HexPattern = `[0-9a-fA-F]{32}` SHA1HexPattern = `[0-9a-fA-F]{40}` SHA256HexPattern = `[0-9a-fA-F]{64}` GUIDPattern = `[0-9a-fA-F]{8}-?[a-fA-F0-9]{4}-?[a-fA-F0-9]{4}-?[a-fA-F0-9]{4}-?[a-fA-F0-9]{12}` ISBN13Pattern = `(?:[\d]-?){12}[\dxX]` ISBN10Pattern = `(?:[\d]-?){9}[\dxX]` MACAddressPattern = `(([a-fA-F0-9]{2}[:-]){5}([a-fA-F0-9]{2}))` IBANPattern = `[A-Z]{2}\d{2}[A-Z0-9]{4}\d{7}([A-Z\d]?){0,16}` NumericPattern = `([+\-]?((\d{1,3}(,\d{3})+))|((?:0|[1-9]\d*)(?:\.\d*)?(?:[eE][+\-]?\d+)?))` DigitsPattern = `\d+` )
Regular expression patterns; see https://github.com/mingrammer/commonregex.
const Lemma = "lemma"
Lemma is the name of the lemma processor.
const Lower = "lower"
const Norm = "norm"
const Unidecode = "unidecode"
Variables ¶
var Processors = make(map[string]Processor)
var Regexes = map[string]*regexp.Regexp{ "date": regexp.MustCompile(DatePattern), "time": regexp.MustCompile(TimePattern), "phone": regexp.MustCompile(PhonePattern), "phones_with_exts": regexp.MustCompile(PhonesWithExtsPattern), "link": regexp.MustCompile(LinkPattern), "email": regexp.MustCompile(EmailPattern), "ipv4": regexp.MustCompile(IPv4Pattern), "credit_card": regexp.MustCompile(CreditCardPattern), "btc_address": regexp.MustCompile(BtcAddressPattern), "ssn": regexp.MustCompile(SSNPattern), "md5_hex": regexp.MustCompile(MD5HexPattern), "sha1_hex": regexp.MustCompile(SHA1HexPattern), "sha256_hex": regexp.MustCompile(SHA256HexPattern), "guid": regexp.MustCompile(GUIDPattern), "isbn13": regexp.MustCompile(ISBN13Pattern), "isbn10": regexp.MustCompile(ISBN10Pattern), "visa_credit_card": regexp.MustCompile(VISACreditCardPattern), "mc_credit_card": regexp.MustCompile(MCCreditCardPattern), "mac_address": regexp.MustCompile(MACAddressPattern), "iban": regexp.MustCompile(IBANPattern), "numeric": regexp.MustCompile(NumericPattern), "digits": regexp.MustCompile(DigitsPattern), }
Regexes holds the compiled regular expressions, keyed by pattern name.
Functions ¶
Types ¶
type APITagger ¶
type APITagger struct {
// contains filtered or unexported fields
}
APITagger is a tagger that works via an HTTP interface.
func NewAPITagger ¶
NewAPITagger returns a new tagger
type DictTagger ¶
type DictTagger struct {
*d.Dictionary
}
func NewDictTagger ¶
func NewDictTagger() (*DictTagger, error)
func (*DictTagger) Process ¶
func (t *DictTagger) Process(d *Document) error
type Document ¶
type Document struct { Text string `json:"text"` Tokens []*Token `json:"tokens"` Spans []*Span `json:"spans"` Lang string `json:"lang"` Langs []string `json:"langs"` }
func NewDocument ¶
func (*Document) XRealTokens ¶
type Entity ¶
type Entity struct { Text string `json:"text"` Type string `json:"type"` Value interface{} `json:"value"` Start int `json:"start"` End int `json:"end"` }
Entity stores the NER entity
type Lemmatizer ¶
type Lemmatizer struct { }
Lemmatizer is the processor for lemmatization
func (*Lemmatizer) Process ¶
func (l *Lemmatizer) Process(d *Document) error
Process annotates the Document d.
type Normalizer ¶
type Normalizer struct { }
Normalizer is the processor for token normalization
func (*Normalizer) Process ¶
func (n *Normalizer) Process(d *Document) error
Process normalizes the tokens of Document d
type Pipeline ¶
type Pipeline struct { Annotators []string // contains filtered or unexported fields }
A Pipeline contains the configured annotators and taggers for natural language processing.
func DefaultNLP ¶
DefaultNLP returns a ling handler with norm, lemma, unidecode and regex.
func MustNLP ¶
MustNLP is like NLP but panics if the annotators are not correct. It simplifies safe initialization of global variables holding a ling handler.
type RegexTagger ¶
type RegexTagger struct { }
RegexTagger is the processor that uses regular expressions.
func (*RegexTagger) Process ¶
func (t *RegexTagger) Process(d *Document) error
Process annotates the Document d.
type Span ¶
type Token ¶
type TokenType ¶
type TokenType byte
func (TokenType) MarshalJSON ¶
MarshalJSON is generated so TokenType satisfies json.Marshaler.
func (*TokenType) UnmarshalJSON ¶
UnmarshalJSON is generated so TokenType satisfies json.Unmarshaler.
type Unidecoder ¶
type Unidecoder struct { }
func (*Unidecoder) Process ¶
func (u *Unidecoder) Process(d *Document) error