ling

package module
v0.0.0-...-dc65a0e
Published: Mar 4, 2023 License: Apache-2.0 Imports: 18 Imported by: 0

README

ling is a golang toolkit for natural language processing


Implementation references

Similar NLP tools

Multilingual text tokenization

Text normalization

Lemmatization

Stemming and lemmatization

Tagging

  • Regex tagger
    • commonregex, a collection of common regular expressions for Go.
    • xurls, a Go package of regexes for URLs.

Natural language detection

getlang is much slower than franco

Documentation

Index

Constants

const (
	DatePattern = `(?i)(?:(?:tgl)?\d{1,2}[^0-9^:]\d{1,2}[^0-9^:](?:19|20)?\d{2})|(?:(?:19|20)?\d{2}[^0-9^:]\d{1,2}[^0-9^:]\d{1,2})`
	TimePattern = `(?:(?:0?|[12])\d\s*:+\s*[0-5]\d(?:\s*:+\s*[0-5]\d(?:\.\d+)?(?:\s*(?:\+|-)(?:0?\d|1[0-2]):?(?:0|3)0)?)?)`
	//TimePattern = `(?is)((?:0?|[12])\d\s*:+\s*[0-5]\d(?:\s*:+\s*[0-5]\d)?(?:\s*[,:.]*\s*(?:am|pm))?|(?:0?|[12])\d\s*[.\s]+\s*[0-5]\d(?:\s*[,:.]*\s*(?:am|pm))+)`
	PhonePattern          = `` /* 133-byte string literal not displayed */
	PhonesWithExtsPattern = `` /* 273-byte string literal not displayed */
	LinkPattern           = `(?i)(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w\.-]*)*\/?`
	EmailPattern          = `(?i)([A-Za-z0-9!#$%&'*+\/=?^_{|.}~-]+@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)`
	IPv4Pattern           = `(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)`
	CreditCardPattern     = `(?i)(?:(?:(?:[\d\*x]{4}[- ]?){3}[\d\*x]{4}|[\d\*x]{15,16}))`
	VISACreditCardPattern = `4\d{3}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}`
	MCCreditCardPattern   = `5[1-5]\d{2}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}`
	BtcAddressPattern     = `[13][a-km-zA-HJ-NP-Z1-9]{25,34}`
	SSNPattern            = `(?:\d{3}-\d{2}-\d{4})`
	MD5HexPattern         = `[0-9a-fA-F]{32}`
	SHA1HexPattern        = `[0-9a-fA-F]{40}`
	SHA256HexPattern      = `[0-9a-fA-F]{64}`
	GUIDPattern           = `[0-9a-fA-F]{8}-?[a-fA-F0-9]{4}-?[a-fA-F0-9]{4}-?[a-fA-F0-9]{4}-?[a-fA-F0-9]{12}`
	ISBN13Pattern         = `(?:[\d]-?){12}[\dxX]`
	ISBN10Pattern         = `(?:[\d]-?){9}[\dxX]`
	MACAddressPattern     = `(([a-fA-F0-9]{2}[:-]){5}([a-fA-F0-9]{2}))`
	IBANPattern           = `[A-Z]{2}\d{2}[A-Z0-9]{4}\d{7}([A-Z\d]?){0,16}`
	NumericPattern        = `([+\-]?((\d{1,3}(,\d{3})+))|((?:0|[1-9]\d*)(?:\.\d*)?(?:[eE][+\-]?\d+)?))`
	DigitsPattern         = `\d+`
)

Regular expression patterns from https://github.com/mingrammer/commonregex
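
The pattern constants are plain regexp source strings, so they can be compiled directly with the standard regexp package. A minimal sketch, assuming the import path github.com/liuzl/ling:

package main

import (
	"fmt"
	"regexp"

	"github.com/liuzl/ling" // assumed import path
)

func main() {
	emails := regexp.MustCompile(ling.EmailPattern)
	links := regexp.MustCompile(ling.LinkPattern)

	text := "Reach me at alice@example.com or via https://example.com/about"
	fmt.Println(emails.FindAllString(text, -1)) // [alice@example.com]
	fmt.Println(links.FindAllString(text, -1))
}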

const Lemma = "lemma"

Lemma processor name

const Lower = "lower"

Lower processor name

const Norm = "norm"

Norm processor name

const Unidecode = "unidecode"

Unidecode processor name

Variables

var Processors = make(map[string]Processor)

Processors is the registry of named processors
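
A minimal sketch of looking a processor up by name, assuming the built-in processors register themselves in Processors under the exported name constants at package init time:

package main

import (
	"fmt"

	"github.com/liuzl/ling" // assumed import path
)

func main() {
	// Assumption: built-in processors are registered under the exported
	// name constants (Norm, Lower, ...) when the package initializes.
	if p, ok := ling.Processors[ling.Norm]; ok {
		d := ling.NewDocument("Hello, World!")
		fmt.Println(p.Process(d)) // <nil> on success
	}
}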

Functions

func Script

func Script(text string) string
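
A short usage sketch; the exact return values are an assumption here (Unicode script names):

package main

import (
	"fmt"

	"github.com/liuzl/ling" // assumed import path
)

func main() {
	// Script reports the writing system of the text.
	fmt.Println(ling.Script("hello")) // e.g. "Latin"
	fmt.Println(ling.Script("你好"))  // e.g. "Han"
}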

Types

type APITagger

type APITagger struct {
	// contains filtered or unexported fields
}

APITagger tags documents via an HTTP interface

func NewAPITagger

func NewAPITagger(addr string) (*APITagger, error)

NewAPITagger returns a new tagger

func (*APITagger) Process

func (t *APITagger) Process(d *Document) error

Process the input document
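
A hedged sketch of plugging an APITagger into a pipeline; the address and URL shape below are placeholders for whatever HTTP tagging service you run:

package main

import (
	"log"

	"github.com/liuzl/ling" // assumed import path
)

func main() {
	nlp, err := ling.NLP(ling.Norm)
	if err != nil {
		log.Fatal(err)
	}
	// Placeholder address; the expected endpoint shape is an assumption.
	tagger, err := ling.NewAPITagger("http://127.0.0.1:8080/tag")
	if err != nil {
		log.Fatal(err)
	}
	if err := nlp.AddTagger(tagger); err != nil {
		log.Fatal(err)
	}
	d := ling.NewDocument("ling is a golang toolkit")
	if err := nlp.Annotate(d); err != nil {
		log.Fatal(err)
	}
}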

type DictTagger

type DictTagger struct {
	*d.Dictionary
}

func NewDictTagger

func NewDictTagger() (*DictTagger, error)

func (*DictTagger) Process

func (t *DictTagger) Process(d *Document) error

type Document

type Document struct {
	Text   string   `json:"text"`
	Tokens []*Token `json:"tokens"`
	Spans  []*Span  `json:"spans"`
	Lang   string   `json:"lang"`
	Langs  []string `json:"langs"`
}

func NewDocument

func NewDocument(text string) *Document

func (*Document) NewSpan

func (d *Document) NewSpan(start, end int) *Span

func (*Document) String

func (d *Document) String() string

func (*Document) XRealTokens

func (d *Document) XRealTokens(anno string) []string

func (*Document) XTokens

func (d *Document) XTokens(anno string) []string
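
A sketch of reading annotations back out of a document, assuming the Lower constant names a registered annotator and that annotation values are stored per token under that name:

package main

import (
	"fmt"
	"log"

	"github.com/liuzl/ling" // assumed import path
)

func main() {
	nlp, err := ling.NLP(ling.Lower)
	if err != nil {
		log.Fatal(err)
	}
	d := ling.NewDocument("Autumn is a second Spring")
	if err := nlp.Annotate(d); err != nil {
		log.Fatal(err)
	}
	// XTokens collects, per token, the value stored under the given
	// annotation name; XRealTokens presumably skips whitespace tokens.
	fmt.Println(d.XTokens(ling.Lower))
}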

type Entity

type Entity struct {
	Text  string      `json:"text"`
	Type  string      `json:"type"`
	Value interface{} `json:"value"`
	Start int         `json:"start"`
	End   int         `json:"end"`
}

Entity stores the NER entity

type Lemmatizer

type Lemmatizer struct {
}

Lemmatizer is the processor for lemmatization

func (*Lemmatizer) Process

func (l *Lemmatizer) Process(d *Document) error

Process annotates Document d with lemma annotations

type Normalizer

type Normalizer struct {
}

Normalizer is the processor for token normalization

func (*Normalizer) Process

func (n *Normalizer) Process(d *Document) error

Process normalizes the tokens of Document d

type Pipeline

type Pipeline struct {
	Annotators []string
	// contains filtered or unexported fields
}

A Pipeline contains configured annotators and taggers for natural language processing

func DefaultNLP

func DefaultNLP() (*Pipeline, error)

DefaultNLP returns a ling handler with the norm, lemma, unidecode, and regex annotators

func MustNLP

func MustNLP(annotators ...string) *Pipeline

MustNLP is like NLP but panics if the annotators are not correct. It simplifies safe initialization of global variables holding the ling handler

func NLP

func NLP(annotators ...string) (*Pipeline, error)

NLP returns a ling handler with the given annotators

func (*Pipeline) AddTagger

func (p *Pipeline) AddTagger(t Processor) error

AddTagger adds a new processor t to Pipeline p

func (*Pipeline) Annotate

func (p *Pipeline) Annotate(d *Document) error

Annotate tags the Document with each configured processor and tagger
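
End-to-end usage sketch, assuming the import path github.com/liuzl/ling:

package main

import (
	"fmt"
	"log"

	"github.com/liuzl/ling" // assumed import path
)

func main() {
	// DefaultNLP wires up the norm, lemma, unidecode and regex annotators.
	nlp, err := ling.DefaultNLP()
	if err != nil {
		log.Fatal(err)
	}
	d := ling.NewDocument("It was posted on 2023-03-04 at 10:30.")
	if err := nlp.Annotate(d); err != nil {
		log.Fatal(err)
	}
	for _, tok := range d.Tokens {
		fmt.Printf("%q\t%v\t%s\n", tok.Text, tok.Type, tok.Script)
	}
}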

func (*Pipeline) AnnotatePro

func (p *Pipeline) AnnotatePro(d *Document, taggers ...Processor) error

AnnotatePro tags the Document with each configured processor plus the additional taggers passed in

type Processor

type Processor interface {
	Process(d *Document) error
}
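
Any type with a Process(*Document) error method can be plugged into a Pipeline via AddTagger. A hypothetical sketch; the UpperTagger type and its "upper" annotation name are invented for illustration:

package main

import (
	"log"
	"strings"

	"github.com/liuzl/ling" // assumed import path
)

// UpperTagger is a hypothetical processor that stores the upper-cased
// form of each token under an invented "upper" annotation.
type UpperTagger struct{}

func (u *UpperTagger) Process(d *ling.Document) error {
	for _, tok := range d.Tokens {
		if tok.Annotations == nil {
			tok.Annotations = make(map[string]string)
		}
		tok.Annotations["upper"] = strings.ToUpper(tok.Text)
	}
	return nil
}

func main() {
	nlp := ling.MustNLP(ling.Norm)
	if err := nlp.AddTagger(&UpperTagger{}); err != nil {
		log.Fatal(err)
	}
	d := ling.NewDocument("hello world")
	if err := nlp.Annotate(d); err != nil {
		log.Fatal(err)
	}
}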

type RegexTagger

type RegexTagger struct {
}

RegexTagger is the processor that tags documents with regular expressions

func (*RegexTagger) Process

func (t *RegexTagger) Process(d *Document) error

Process annotates Document d with regex-based tags

type Span

type Span struct {
	Doc         *Document              `json:"-"`
	Start       int                    `json:"start"`
	End         int                    `json:"end"`
	Annotations map[string]interface{} `json:"annotations"`
}

func (*Span) String

func (s *Span) String() string

type Token

type Token struct {
	Doc         *Document         `json:"-"`
	Text        string            `json:"text"`
	Type        TokenType         `json:"type"`
	Script      string            `json:"script"`
	I           int               `json:"i"`
	StartByte   int               `json:"start_byte"`
	EndByte     int               `json:"end_byte"`
	Annotations map[string]string `json:"annotations"`
}

func (*Token) String

func (t *Token) String() string

type TokenType

type TokenType byte
const (
	EOF TokenType = iota
	Space
	Symbol
	Number
	Letters
	Punct
	Word
)

func Type

func Type(text string) TokenType

func (TokenType) MarshalJSON

func (r TokenType) MarshalJSON() ([]byte, error)

MarshalJSON is generated so TokenType satisfies json.Marshaler.

func (TokenType) String

func (i TokenType) String() string

func (*TokenType) UnmarshalJSON

func (r *TokenType) UnmarshalJSON(data []byte) error

UnmarshalJSON is generated so TokenType satisfies json.Unmarshaler.
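
A round-trip sketch; that the generated MarshalJSON encodes the String() name of the value is an assumption here:

package main

import (
	"encoding/json"
	"fmt"

	"github.com/liuzl/ling" // assumed import path
)

func main() {
	// Assumption: MarshalJSON emits the value's String() name.
	b, err := json.Marshal(ling.Number)
	if err != nil {
		panic(err)
	}
	fmt.Println(string(b))

	var t ling.TokenType
	if err := json.Unmarshal(b, &t); err != nil {
		panic(err)
	}
	fmt.Println(t == ling.Number) // true if the round-trip is symmetric
}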

type Tokenizer

type Tokenizer struct {
}

func (*Tokenizer) Process

func (t *Tokenizer) Process(d *Document) error

type Unidecoder

type Unidecoder struct {
}

func (*Unidecoder) Process

func (u *Unidecoder) Process(d *Document) error

Directories

Path Synopsis
cmd
