Documentation ¶
Overview ¶
Package lingo provides the data structures and algorithms required for natural language processing.
Index ¶
- Constants
- Variables
- func AllocTree() depConsOpt
- func EqStringSlice(a, b []string) bool
- func FromAnnotatedSentence(s AnnotatedSentence) depConsOpt
- func InDepTypes(x DependencyType, set []DependencyType) bool
- func InPOSTags(x POSTag, set []POSTag) bool
- func InStringSlice(s string, l []string) bool
- func IsAdjective(x POSTag) bool
- func IsAdverb(x POSTag) bool
- func IsCompound(x DependencyType) bool
- func IsDeterminer(x POSTag) bool
- func IsDeterminerRel(x DependencyType) bool
- func IsIN(x POSTag) bool
- func IsInterrogative(x POSTag) bool
- func IsModifier(x DependencyType) bool
- func IsMultiword(x DependencyType) bool
- func IsNoun(x POSTag) bool
- func IsNumber(x POSTag) bool
- func IsProperNoun(x POSTag) bool
- func IsQuantifier(x DependencyType) bool
- func IsSymbol(x POSTag) bool
- func IsVerb(x POSTag) bool
- func ReadCluster(r io.Reader) map[string]Cluster
- func StringIs(s string, f is) bool
- func UnescapeSpecials(word string) string
- type AnnotatedSentence
- func (as AnnotatedSentence) Children(h int) (retVal []int)
- func (as AnnotatedSentence) Clone() AnnotatedSentence
- func (as AnnotatedSentence) Dependency() *Dependency
- func (as AnnotatedSentence) Edges() (retVal []DependencyEdge)
- func (as AnnotatedSentence) Fix()
- func (as AnnotatedSentence) Heads() []int
- func (as AnnotatedSentence) IDs() []int
- func (as AnnotatedSentence) IsValid() bool
- func (as AnnotatedSentence) Labels() []DependencyType
- func (as AnnotatedSentence) Leaves() (retVal []int)
- func (as AnnotatedSentence) LemmaString() string
- func (as AnnotatedSentence) Lemmas() []string
- func (as AnnotatedSentence) Len() int
- func (as AnnotatedSentence) Less(i, j int) bool
- func (as AnnotatedSentence) LoweredString() string
- func (as AnnotatedSentence) LoweredStringSlice() []string
- func (as AnnotatedSentence) MarshalJSON() ([]byte, error)
- func (as AnnotatedSentence) Phrase(start, end int) (AnnotatedSentence, error)
- func (as AnnotatedSentence) SetID()
- func (as AnnotatedSentence) StemString() string
- func (as AnnotatedSentence) Stems() []string
- func (as AnnotatedSentence) String() string
- func (as AnnotatedSentence) StringSlice() []string
- func (as AnnotatedSentence) Swap(i, j int)
- func (as AnnotatedSentence) Tags() []POSTag
- func (as AnnotatedSentence) Tree() *DependencyTree
- func (as *AnnotatedSentence) UnmarshalJSON(b []byte) error
- func (as AnnotatedSentence) ValueString() string
- type Annotation
- func (a *Annotation) Clone() *Annotation
- func (a *Annotation) GoString() string
- func (a *Annotation) HeadID() int
- func (a *Annotation) IsNumber() bool
- func (a *Annotation) MarshalJSON() ([]byte, error)
- func (a *Annotation) Process(f AnnotationFixer) error
- func (a *Annotation) SetHead(headAnn *Annotation)
- func (a *Annotation) String() string
- func (a *Annotation) UnmarshalJSON(b []byte) error
- type AnnotationFixer
- type AnnotationSet
- func (as AnnotationSet) Add(a *Annotation) AnnotationSet
- func (as AnnotationSet) Contains(a *Annotation) bool
- func (as AnnotationSet) Index(a *Annotation) int
- func (as AnnotationSet) Len() int
- func (as AnnotationSet) Less(i, j int) bool
- func (as AnnotationSet) Set() AnnotationSet
- func (as AnnotationSet) Swap(i, j int)
- type Cluster
- type Corpus
- type Dependency
- func (d *Dependency) AddArc(head, child int, label DependencyType)
- func (d *Dependency) AddChild(head, child int)
- func (d *Dependency) AddRel(child int, rel DependencyType)
- func (d *Dependency) Annotation(i int) *Annotation
- func (d *Dependency) HasSingleRoot() bool
- func (d *Dependency) Head(i int) int
- func (d *Dependency) IsLegal() bool
- func (d *Dependency) IsProjective() bool
- func (d *Dependency) Label(i int) DependencyType
- func (d *Dependency) Lefts() [][]int
- func (d *Dependency) N() int
- func (d *Dependency) Rights() [][]int
- func (d *Dependency) Root() int
- func (d *Dependency) Sentence() AnnotatedSentence
- func (d *Dependency) SetLefts(l [][]int)
- func (d *Dependency) SetRights(r [][]int)
- func (d *Dependency) SprintRel() string
- func (d *Dependency) WordCount() int
- type DependencyEdge
- type DependencyTree
- type DependencyType
- type DependencyTypeSet
- type Lemmatizer
- type Lexeme
- type LexemeSentence
- type LexemeType
- type POSTag
- type Sentencer
- type Shape
- type Stemmer
- type TagSet
- type WordEmbeddings
- type WordFlag
Constants ¶
const BUILD_RELSET = "universalrel"
const BUILD_TAGSET = "universaltags"
Variables ¶
var Adjectives = []POSTag{ADJ}
var Adverbs = []POSTag{ADV}
var Compounds = []DependencyType{Compound, Compound_Part}
var DeterminerRels = []DependencyType{Det, Det_PreDet}
var Determiners = []POSTag{DET}
var Interrogatives = []POSTag{PRON, DET, ADV}
var Modifiers = []DependencyType{AMod}
var MultiWord = []DependencyType{MWE, Compound, Compound_Part, Parataxis}
var Nouns = []POSTag{NOUN, PROPN}
var NumberWords = map[string]int{
"zero": 0,
"one": 1,
"two": 2,
"three": 3,
"four": 4,
"five": 5,
"six": 6,
"seven": 7,
"eight": 8,
"nine": 9,
"ten": 10,
"eleven": 11,
"twelve": 12,
"thirteen": 13,
"fourteen": 14,
"fifteen": 15,
"sixteen": 16,
"nineteen": 19,
"seventeen": 17,
"eighteen": 18,
"twenty": 20,
"thirty": 30,
"forty": 40,
"fifty": 50,
"sixty": 60,
"seventy": 70,
"eighty": 80,
"ninety": 90,
"hundred": 100,
"thousand": 1000,
"million": 1000000,
"billion": 1000000000,
"trillion": 1000000000000,
"quadrillion": 1000000000000000,
}
NumberWords was generated with this Python code:

    numberWords = {}
    simple = '''zero one two three four five six seven eight nine ten eleven twelve thirteen fourteen fifteen sixteen seventeen eighteen nineteen twenty'''.split()
    for i, word in zip(xrange(0, 20+1), simple):
        numberWords[word] = i
    tense = '''thirty forty fifty sixty seventy eighty ninety hundred'''.split()
    for i, word in zip(xrange(30, 100+1, 10), tense):
        numberWords[word] = i
    larges = '''thousand million billion trillion quadrillion quintillion sextillion septillion'''.split()
    for i, word in zip(xrange(3, 24+1, 3), larges):
        numberWords[word] = 10**i
var Numbers = []POSTag{NUM}
var ProperNouns = []POSTag{PROPN}
var QuantifingMods = []DependencyType{NumMod}
var Symbols = []POSTag{SYM, PUNCT}
var Verbs = []POSTag{VERB}
Functions ¶
func AllocTree ¶
func AllocTree() depConsOpt
AllocTree allocates the lefts and rights. Typical construction of the *Dependency doesn't allocate the trees as they're not necessary for a number of tasks.
func EqStringSlice ¶
func FromAnnotatedSentence ¶
func FromAnnotatedSentence(s AnnotatedSentence) depConsOpt
FromAnnotatedSentence creates a dependency from an AnnotatedSentence.
func InDepTypes ¶
func InDepTypes(x DependencyType, set []DependencyType) bool
func InStringSlice ¶
func IsAdjective ¶
func IsCompound ¶
func IsCompound(x DependencyType) bool
func IsDeterminer ¶
func IsDeterminerRel ¶
func IsDeterminerRel(x DependencyType) bool
func IsIN ¶
IsIN returns true if the POSTag is a subordinating conjunction. This function exists because the Stanford tagset uses the IN POSTag for this, while the Universal Dependencies tagset uses the SCONJ POSTag.
func IsInterrogative ¶
func IsModifier ¶
func IsModifier(x DependencyType) bool
func IsMultiword ¶
func IsMultiword(x DependencyType) bool
func IsProperNoun ¶
func IsQuantifier ¶
func IsQuantifier(x DependencyType) bool
func ReadCluster ¶
ReadCluster reads Percy Liang's cluster file format and returns a map of strings to Cluster.
func UnescapeSpecials ¶
Types ¶
type AnnotatedSentence ¶
type AnnotatedSentence []*Annotation
AnnotatedSentence is a sentence, but each word has been annotated.
func NewAnnotatedSentence ¶
func NewAnnotatedSentence() AnnotatedSentence
func (AnnotatedSentence) Children ¶
func (as AnnotatedSentence) Children(h int) (retVal []int)
func (AnnotatedSentence) Clone ¶
func (as AnnotatedSentence) Clone() AnnotatedSentence
func (AnnotatedSentence) Dependency ¶
func (as AnnotatedSentence) Dependency() *Dependency
func (AnnotatedSentence) Edges ¶
func (as AnnotatedSentence) Edges() (retVal []DependencyEdge)
func (AnnotatedSentence) Fix ¶
func (as AnnotatedSentence) Fix()
func (AnnotatedSentence) Heads ¶
func (as AnnotatedSentence) Heads() []int
Heads returns the head IDs of the sentence. The return value has exactly the same length as the sentence.
func (AnnotatedSentence) IDs ¶
func (as AnnotatedSentence) IDs() []int
IDs returns the list of IDs in the sentence. The return value has exactly the same length as the sentence.
func (AnnotatedSentence) IsValid ¶
func (as AnnotatedSentence) IsValid() bool
func (AnnotatedSentence) Labels ¶
func (as AnnotatedSentence) Labels() []DependencyType
Labels returns the DependencyTypes of the sentence. The return value has exactly the same length as the sentence.
func (AnnotatedSentence) Leaves ¶
func (as AnnotatedSentence) Leaves() (retVal []int)
Leaves returns the *Annotations which are leaves. If the dependency hasn't been set yet, every single *Annotation is a leaf.
func (AnnotatedSentence) LemmaString ¶
func (as AnnotatedSentence) LemmaString() string
func (AnnotatedSentence) Lemmas ¶
func (as AnnotatedSentence) Lemmas() []string
Lemmas returns the lemmas as a slice of string. The return value has exactly the same length as the sentence.
func (AnnotatedSentence) Less ¶
func (as AnnotatedSentence) Less(i, j int) bool
func (AnnotatedSentence) LoweredString ¶
func (as AnnotatedSentence) LoweredString() string
func (AnnotatedSentence) LoweredStringSlice ¶
func (as AnnotatedSentence) LoweredStringSlice() []string
LoweredStringSlice returns the lowercased version of the words in the sentence as a slice of string. The return value has exactly the same length as the sentence.
func (AnnotatedSentence) MarshalJSON ¶
func (as AnnotatedSentence) MarshalJSON() ([]byte, error)
func (AnnotatedSentence) Phrase ¶
func (as AnnotatedSentence) Phrase(start, end int) (AnnotatedSentence, error)
Phrase returns the slice of the sentence. While you can do the same by simply writing as[start:end], this method returns errors instead of panicking.
func (AnnotatedSentence) SetID ¶
func (as AnnotatedSentence) SetID()
func (AnnotatedSentence) StemString ¶
func (as AnnotatedSentence) StemString() string
func (AnnotatedSentence) Stems ¶
func (as AnnotatedSentence) Stems() []string
Stems returns the stems as a slice of string. The return value has exactly the same length as the sentence.
func (AnnotatedSentence) String ¶
func (as AnnotatedSentence) String() string
func (AnnotatedSentence) StringSlice ¶
func (as AnnotatedSentence) StringSlice() []string
StringSlice returns the original words as a slice of string. The return value has exactly the same length as the sentence.
func (AnnotatedSentence) Swap ¶
func (as AnnotatedSentence) Swap(i, j int)
func (AnnotatedSentence) Tags ¶
func (as AnnotatedSentence) Tags() []POSTag
Tags returns the POSTags of the sentence. The return value has exactly the same length as the sentence.
func (AnnotatedSentence) Tree ¶
func (as AnnotatedSentence) Tree() *DependencyTree
func (*AnnotatedSentence) UnmarshalJSON ¶
func (as *AnnotatedSentence) UnmarshalJSON(b []byte) error
func (AnnotatedSentence) ValueString ¶
func (as AnnotatedSentence) ValueString() string
type Annotation ¶
type Annotation struct { Lexeme POSTag // fields to do with an annotation being in a collection DependencyType ID int Head *Annotation // info about the annotation itself Lemma string Lowered string Stem string // auxiliary data for processing Cluster Shape WordFlag // contains filtered or unexported fields }
Annotation is the word and its metadata. This includes the position, its dependency head (if available), its lemma, POSTag, etc.
A collection of Annotations - AnnotatedSentence is also a representation of a dependency parse ¶
Every field is exported for easy gob encoding. Be very careful when setting fields directly.
func AnnotationFromLexTag ¶
func AnnotationFromLexTag(l Lexeme, t POSTag, f AnnotationFixer) *Annotation
AnnotationFromLexTag is only ever used in tests. The AnnotationFixer f is optional.
func NewAnnotation ¶
func NewAnnotation() *Annotation
func NullAnnotation ¶
func NullAnnotation() *Annotation
func RootAnnotation ¶
func RootAnnotation() *Annotation
func StartAnnotation ¶
func StartAnnotation() *Annotation
func StringToAnnotation ¶
func StringToAnnotation(s string, f AnnotationFixer) *Annotation
func (*Annotation) Clone ¶
func (a *Annotation) Clone() *Annotation
func (*Annotation) GoString ¶
func (a *Annotation) GoString() string
func (*Annotation) HeadID ¶
func (a *Annotation) HeadID() int
func (*Annotation) IsNumber ¶
func (a *Annotation) IsNumber() bool
func (*Annotation) MarshalJSON ¶
func (a *Annotation) MarshalJSON() ([]byte, error)
func (*Annotation) Process ¶
func (a *Annotation) Process(f AnnotationFixer) error
func (*Annotation) SetHead ¶
func (a *Annotation) SetHead(headAnn *Annotation)
func (*Annotation) String ¶
func (a *Annotation) String() string
func (*Annotation) UnmarshalJSON ¶
func (a *Annotation) UnmarshalJSON(b []byte) error
type AnnotationFixer ¶
type AnnotationFixer interface { Lemmatizer Stemmer Clusters() (map[string]Cluster, error) }
type AnnotationSet ¶
type AnnotationSet []*Annotation
func (AnnotationSet) Add ¶
func (as AnnotationSet) Add(a *Annotation) AnnotationSet
func (AnnotationSet) Contains ¶
func (as AnnotationSet) Contains(a *Annotation) bool
func (AnnotationSet) Index ¶
func (as AnnotationSet) Index(a *Annotation) int
func (AnnotationSet) Len ¶
func (as AnnotationSet) Len() int
func (AnnotationSet) Less ¶
func (as AnnotationSet) Less(i, j int) bool
func (AnnotationSet) Set ¶
func (as AnnotationSet) Set() AnnotationSet
func (AnnotationSet) Swap ¶
func (as AnnotationSet) Swap(i, j int)
type Corpus ¶
type Corpus interface { // ID returns the ID of a word and whether or not it was found in the corpus Id(word string) (id int, ok bool) // Word returns the word given the ID, and whether or not it was found in the corpus Word(id int) (word string, ok bool) // Add adds a word to the corpus and returns its ID. If a word was previously in the corpus, it merely updates the frequency count and returns the ID Add(word string) int // Size returns the size of the corpus. Size() int // WordFreq returns the frequency of the word. If the word wasn't in the corpus, it returns 0. WordFreq(word string) int // IDFreq returns the frequency of a word given an ID. If the word isn't in the corpus it returns 0. IDFreq(id int) int // TotalFreq returns the total number of words ever seen by the corpus. This number includes the count of repeat words. TotalFreq() int // MaxWordLength returns the length of the longest known word in the corpus MaxWordLength() int // WordProb returns the probability of a word appearing in the corpus WordProb(word string) (float64, bool) // IO stuff gob.GobEncoder gob.GobDecoder }
Corpus is the interface for the corpus.
type Dependency ¶
type Dependency struct { AnnotatedSentence // contains filtered or unexported fields }
Dependency represents the dependency parse of a sentence. While AnnotatedSentence does already do a job of representing the dependency parse of a sentence, *Dependency actually contains meta information about the dependency parse (specifically, lefts, rights) that makes parsing a dependency a lot faster
The fields are mostly left unexported for a good reason - a dependency parse SHOULD be static after it's been built
func NewDependency ¶
func NewDependency(opts ...depConsOpt) *Dependency
NewDependency creates a new *Dependency. It takes optional construction options:
FromAnnotatedSentence, AllocTree
func (*Dependency) AddArc ¶
func (d *Dependency) AddArc(head, child int, label DependencyType)
func (*Dependency) AddChild ¶
func (d *Dependency) AddChild(head, child int)
func (*Dependency) AddRel ¶
func (d *Dependency) AddRel(child int, rel DependencyType)
func (*Dependency) Annotation ¶
func (d *Dependency) Annotation(i int) *Annotation
func (*Dependency) HasSingleRoot ¶
func (d *Dependency) HasSingleRoot() bool
func (*Dependency) Head ¶
func (d *Dependency) Head(i int) int
func (*Dependency) IsLegal ¶
func (d *Dependency) IsLegal() bool
func (*Dependency) IsProjective ¶
func (d *Dependency) IsProjective() bool
func (*Dependency) Label ¶
func (d *Dependency) Label(i int) DependencyType
func (*Dependency) Lefts ¶
func (d *Dependency) Lefts() [][]int
func (*Dependency) N ¶
func (d *Dependency) N() int
func (*Dependency) Rights ¶
func (d *Dependency) Rights() [][]int
func (*Dependency) Root ¶
func (d *Dependency) Root() int
func (*Dependency) Sentence ¶
func (d *Dependency) Sentence() AnnotatedSentence
func (*Dependency) SetLefts ¶
func (d *Dependency) SetLefts(l [][]int)
Please only use SetLefts and SetRights for testing.
func (*Dependency) SetRights ¶
func (d *Dependency) SetRights(r [][]int)
func (*Dependency) SprintRel ¶
func (d *Dependency) SprintRel() string
func (*Dependency) WordCount ¶
func (d *Dependency) WordCount() int
type DependencyEdge ¶
type DependencyEdge struct { Gov *Annotation Dep *Annotation Rel DependencyType }
type DependencyTree ¶
type DependencyTree struct { Parent *DependencyTree ID int // the word number in a sentence Type DependencyType // refers to the dependency type to the parent Word *Annotation Children []*DependencyTree }
A DependencyTree is an alternate form of representing a dependency parse. This form makes it easier to traverse the tree
func NewDependencyTree ¶
func NewDependencyTree(parent *DependencyTree, ID int, ann *Annotation) *DependencyTree
func (*DependencyTree) AddChild ¶
func (d *DependencyTree) AddChild(child *DependencyTree)
func (*DependencyTree) AddRel ¶
func (d *DependencyTree) AddRel(rel DependencyType)
func (*DependencyTree) Dot ¶
func (d *DependencyTree) Dot() string
func (*DependencyTree) Walk ¶
func (d *DependencyTree) Walk(fn func(interface{}))
type DependencyType ¶
type DependencyType byte
DependencyType represents the relation between two words
const ( NoDepType DependencyType = iota Dep Root // nominal dependencies NSubj NSubjPass DObj IObj // predicate dependencies CSubj CSubjPass CComp XComp // nominal dependencies NumMod Appos NMod // predicate dependencies ACl ACl_RelCl // RCMod in stanford deps Det Det_PreDet // modifier word AMod Neg // Case Marking, preposition, possessive Case // Nominal dependencies NMod_NPMod NMod_TMod NMod_Poss // Predicate Dependencies AdvCl // Modifier Word AdvMod // Compounding and Unanalyzed Compound Compound_Part Name // Unused in English MWE Foreign // Unused in English GoesWith // Unused in English // Loose Joining Relations List Dislocated // Unused in English Parataxis Remnant // Unused in English Reparandum // Unused in English // Nominal Dependent Vocative // Unused in English Discourse Expl // Auxilliary Aux AuxPass Cop // Other Mark Punct Conj Coordination // CC CC_PreConj MAXDEPTYPE )
func (DependencyType) MarshalText ¶
func (dt DependencyType) MarshalText() ([]byte, error)
func (DependencyType) String ¶
func (i DependencyType) String() string
func (*DependencyType) UnmarshalText ¶
func (dt *DependencyType) UnmarshalText(text []byte) error
type DependencyTypeSet ¶
type DependencyTypeSet [MAXDEPTYPE]bool
DependencyTypeSet is a set of all the DependencyTypes
func (DependencyTypeSet) String ¶
func (dts DependencyTypeSet) String() string
type Lemmatizer ¶
Lemmatizer is anything that can lemmatize
type Lexeme ¶
type Lexeme struct { Value string LexemeType LexemeType Line int Col int Pos int }
func MakeLexeme ¶
func MakeLexeme(s string, t LexemeType) Lexeme
func NullLexeme ¶
func NullLexeme() Lexeme
func RootLexeme ¶
func RootLexeme() Lexeme
func StartLexeme ¶
func StartLexeme() Lexeme
type LexemeSentence ¶
type LexemeSentence []Lexeme
LexemeSentence is a sentence represented as a slice of Lexemes.
func NewLexemeSentence ¶
func NewLexemeSentence() LexemeSentence
func (LexemeSentence) String ¶
func (ls LexemeSentence) String() string
type LexemeType ¶
type LexemeType byte
const ( EOF LexemeType = iota Word Disambig URI Number Date Time Punctuation Symbol Space SystemUse )
func (LexemeType) String ¶
func (i LexemeType) String() string
type POSTag ¶
type POSTag byte
POSTag represents a Part of Speech Tag.
func POSTagShortcut ¶
POSTagShortcut is a shortcut function to help the POSTagger short-circuit some decisions about what the tag is.
func (POSTag) MarshalText ¶
func (*POSTag) UnmarshalText ¶
type Sentencer ¶
type Sentencer interface {
Sentence() AnnotatedSentence
}
Sentencer is anything that returns an AnnotatedSentence
type Shape ¶
type Shape string
Shape represents the shape of a word. It's currently implemented as a string
type WordEmbeddings ¶
type WordEmbeddings interface { Corpus // WordVector returns a vector of embeddings given the word WordVector(word string) (vec tensor.Tensor, err error) // Vector returns a vector of embeddings given the word ID Vector(id int) (vec tensor.Tensor, err error) // Embedding returns the matrix Embedding() tensor.Tensor }
WordEmbeddings is any type that is both a corpus and can return word vectors
Source Files ¶
- POSTag.go
- POSTag_universal.go
- POSTag_universal_string.go
- annotation.go
- annotationSet.go
- browncluster.go
- const.go
- dependency.go
- dependencyTree.go
- dependencyType.go
- dependencyType_universal.go
- dependencyType_universal_string.go
- errors.go
- interfaces.go
- io.go
- lexeme.go
- lexemetype_string.go
- lingo.go
- sentence.go
- sets.go
- shape.go
- stopwords.go
- utils.go
- wordFlags.go