dict

package
v1.1.0 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jun 15, 2024 License: MIT Imports: 13 Imported by: 34

Documentation

Overview

Package dict implements the dictionary of the morph analyzer.

Index

Constants

View Source
const (
	POSStartIndex      = "_pos_start"
	POSHierarchy       = "_pos_hierarchy"
	InflectionalType   = "_inflectional_type"
	InflectionalForm   = "_inflectional_form"
	BaseFormIndex      = "_base"
	ReadingIndex       = "_reading"
	PronunciationIndex = "_pronunciation"
)
View Source
const (
	// MorphDictFileName is the default file name of a morph dict.
	MorphDictFileName = "morph.dict"
	// POSDictFileName is the default file name of a part of speech dict.
	POSDictFileName = "pos.dict"
	// ContentMetaFileName is the default file name of content meta.
	ContentMetaFileName = "content.meta"
	// ContentDictFileName is the default file name of a content dict.
	ContentDictFileName = "content.dict"
	// IndexDictFileName is the default filename of a dictionary index.
	IndexDictFileName = "index.dict"
	// ConnectionDictFileName is the default filename of a connection dict.
	ConnectionDictFileName = "connection.dict"
	// CharDefDictFileName is the default filename of a char def.
	CharDefDictFileName = "chardef.dict"
	// UnkDictFileName is the default filename of an unknown dict.
	UnkDictFileName = "unk.dict"
	// DictInfoFileName is the file name of a dictionary info.
	DictInfoFileName = "dict.info"
)
View Source
const UserDictColumnSize = 4

UserDictColumnSize is the column size of the user dictionary.

Variables

This section is empty.

Functions

func NewContents

func NewContents(b []byte) [][]string

NewContents creates dictionary contents from byte slice.

Types

type CharCategory

type CharCategory []byte

CharCategory represents categories for characters.

type CharClass

type CharClass []string

CharClass represents a character class.

type CharDef

type CharDef struct {
	CharClass    CharClass
	CharCategory CharCategory
	InvokeList   InvokeList
	GroupList    GroupList
}

CharDef represents char.def.

func ReadCharDef

func ReadCharDef(r io.Reader) (*CharDef, error)

ReadCharDef reads char.def format.

func (CharDef) WriteTo

func (d CharDef) WriteTo(w io.Writer) (n int64, err error)

WriteTo implements the io.WriterTo interface.

type ConnectionTable

type ConnectionTable struct {
	Row, Col int64
	Vec      []int16
}

ConnectionTable represents a connection matrix of morphs.

func ReadConnectionTable

func ReadConnectionTable(r io.Reader) (ConnectionTable, error)

ReadConnectionTable loads ConnectionTable from io.Reader.

func (*ConnectionTable) At

func (t *ConnectionTable) At(row, col int) int16

At returns the connection cost of matrix[row, col].

func (ConnectionTable) WriteTo

func (t ConnectionTable) WriteTo(w io.Writer) (n int64, err error)

WriteTo implements the io.WriterTo interface.

type Contents

type Contents [][]string

Contents represents dictionary contents.

func ReadContents

func ReadContents(r io.Reader) (Contents, error)

ReadContents reads dictionary contents from io.Reader.

func (Contents) WriteTo

func (c Contents) WriteTo(w io.Writer) (n int64, err error)

WriteTo implements the io.WriterTo interface.

type ContentsMeta

type ContentsMeta map[string]int8

ContentsMeta represents the contents record information.

func ReadContentsMeta

func ReadContentsMeta(r io.Reader) (ContentsMeta, error)

func (ContentsMeta) WriteTo

func (c ContentsMeta) WriteTo(w io.Writer) (n int64, err error)

type Dict

type Dict struct {
	Morphs       Morphs
	POSTable     POSTable
	ContentsMeta ContentsMeta
	Contents     Contents
	Connection   ConnectionTable
	Index        IndexTable
	CharClass    CharClass
	CharCategory CharCategory
	InvokeList   InvokeList
	GroupList    GroupList
	UnkDict      UnkDict
	// contains filtered or unexported fields
}

Dict represents a dictionary of a tokenizer.

func Load

func Load(r *zip.Reader, full bool) (*Dict, error)

Load loads a dictionary from a zipped reader.

func LoadDictFile

func LoadDictFile(path string) (d *Dict, err error)

LoadDictFile loads a dictionary from a file.

func LoadShrink

func LoadShrink(path string) (d *Dict, err error)

LoadShrink loads a dictionary from a file without contents.

func (*Dict) CharacterCategory

func (d *Dict) CharacterCategory(r rune) byte

CharacterCategory returns the category of a rune.

func (Dict) Info added in v1.1.0

func (d Dict) Info() *Info

func (Dict) Save

func (d Dict) Save(zw *zip.Writer) error

Save saves a dictionary in a zipped format.

func (*Dict) SetInfo added in v1.1.0

func (d *Dict) SetInfo(info *Info)

type GroupList

type GroupList []bool

GroupList represents whether to make a new word by grouping the same character category.

type IndexTable

type IndexTable struct {
	Da  trie.DoubleArray
	Dup map[int32]int32
}

IndexTable represents a dictionary index.

func BuildIndexTable

func BuildIndexTable(sortedKeywords []string) (IndexTable, error)

BuildIndexTable constructs an index table from keywords.

func ReadIndexTable

func ReadIndexTable(r io.Reader) (IndexTable, error)

ReadIndexTable loads an index table.

func (IndexTable) CommonPrefixSearch

func (idx IndexTable) CommonPrefixSearch(input string) (lens []int, ids [][]int)

CommonPrefixSearch finds keywords sharing a common prefix in an input and returns the ids and their lengths if found.

func (IndexTable) CommonPrefixSearchCallback

func (idx IndexTable) CommonPrefixSearchCallback(input string, callback func(id, l int))

CommonPrefixSearchCallback finds keywords sharing a common prefix in an input and calls the callback with each id and length.

func (IndexTable) Search

func (idx IndexTable) Search(input string) []int

Search finds the given keyword and returns the id if found.

func (IndexTable) WriteTo

func (idx IndexTable) WriteTo(w io.Writer) (n int64, err error)

WriteTo implements the io.WriterTo interface.

type Info added in v1.1.0

type Info struct {
	Name string
	Src  string
}

Info represents the dictionary info.

func ReadDictInfo added in v1.1.0

func ReadDictInfo(r io.Reader) *Info

ReadDictInfo reads gob encoded dictionary info and returns it.

For backward compatibility, if a dictionary name is not defined or empty, it returns UndefinedDictName.

func (Info) WriteTo added in v1.1.0

func (d Info) WriteTo(w io.Writer) (n int64, err error)

WriteTo implements the io.WriterTo interface.

type InvokeList

type InvokeList []bool

InvokeList represents whether to invoke unknown word processing.

type Morph

type Morph struct {
	LeftID, RightID, Weight int16
}

Morph represents parts of speech and an occurrence cost.

type Morphs

type Morphs []Morph

Morphs represents a slice of morphs.

func ReadMorphs

func ReadMorphs(r io.Reader) (Morphs, error)

ReadMorphs loads morph data from io.Reader.

func (Morphs) WriteTo

func (m Morphs) WriteTo(w io.Writer) (n int64, err error)

WriteTo implements the io.WriterTo interface.

type POS

type POS []POSID

POS represents a vector of part of speech IDs.

type POSID

type POSID uint16

POSID represents an ID of a part of speech.

type POSMap

type POSMap map[string]POSID

POSMap represents a part of speech control table.

func (POSMap) Add

func (p POSMap) Add(pos []string) POS

Add adds a part of speech item to the POS control table and returns its id.

func (POSMap) List

func (p POSMap) List() []string

List returns a list whose index is POS ID and value is its name.

type POSTable

type POSTable struct {
	POSs     []POS
	NameList []string
}

POSTable represents a table for managing parts of speech.

func ReadPOSTable

func ReadPOSTable(r io.Reader) (POSTable, error)

ReadPOSTable loads a POS table.

func (POSTable) WriteTo

func (p POSTable) WriteTo(w io.Writer) (int64, error)

WriteTo saves a POS table.

type SizeReaderAt

type SizeReaderAt interface {
	ReadAt(p []byte, off int64) (n int, err error)
	Size() int64
}

SizeReaderAt is the interface that wraps the Size and ReadAt methods.

func MultiSizeReaderAt

func MultiSizeReaderAt(rs ...SizeReaderAt) SizeReaderAt

MultiSizeReaderAt returns a SizeReaderAt that is the logical concatenation of the provided input readers.

type UnkDict

type UnkDict struct {
	Morphs       Morphs
	Index        map[int32]int32
	IndexDup     map[int32]int32
	ContentsMeta ContentsMeta
	Contents     Contents
}

UnkDict represents an unknown word dictionary part.

func ReadUnkDic

func ReadUnkDic(r io.Reader) (UnkDict, error)

ReadUnkDic loads an unknown word dictionary.

func (UnkDict) WriteTo

func (u UnkDict) WriteTo(w io.Writer) (n int64, err error)

WriteTo implements the io.WriterTo interface.

type UserDicRecord

type UserDicRecord struct {
	Text   string   `json:"text"`
	Tokens []string `json:"tokens"`
	Yomi   []string `json:"yomi"`
	Pos    string   `json:"pos"`
}

UserDicRecord represents a record of the user dictionary file format.

type UserDict

type UserDict struct {
	Index    IndexTable
	Contents []UserDictContent
}

UserDict represents a user dictionary.

func NewUserDict

func NewUserDict(path string) (*UserDict, error)

NewUserDict builds a user dictionary from a file.

type UserDictContent

type UserDictContent struct {
	Tokens []string
	Yomi   []string
	Pos    string
}

UserDictContent represents contents of a word in a user dictionary.

type UserDictRecords

type UserDictRecords []UserDicRecord

UserDictRecords represents user dictionary data.

func NewUserDicRecords

func NewUserDicRecords(r io.Reader) (UserDictRecords, error)

NewUserDicRecords loads user dictionary data from io.Reader.

func (UserDictRecords) Len

func (u UserDictRecords) Len() int

func (UserDictRecords) Less

func (u UserDictRecords) Less(i, j int) bool

func (UserDictRecords) NewUserDict

func (u UserDictRecords) NewUserDict() (*UserDict, error)

NewUserDict builds a user dictionary.

func (UserDictRecords) Swap

func (u UserDictRecords) Swap(i, j int)

Directories

Path Synopsis
Package builder implements the dictionary builder.
Package builder implements the dictionary builder.
Package trie implements the double array trie library.
Package trie implements the double array trie library.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL