apoco

package
v0.0.6
Published: Oct 16, 2020 License: MIT Imports: 20 Imported by: 0

Documentation

Index

Constants

const (
	FirstInLine = 1 << iota
	LastInLine
	LowerCase
	UpperCase
	TitleCase
	MixedCase
)

Trait flags
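
Because each flag occupies its own bit, traits combine into a single mask and can be tested independently. A minimal sketch (the constant names mirror the block above; the combination logic is illustrative):

```go
package main

import "fmt"

// Mirrored copy of the trait flag constants (same 1 << iota pattern
// as in the package); the demo in main is illustrative.
const (
	FirstInLine = 1 << iota // 1
	LastInLine              // 2
	LowerCase               // 4
	UpperCase               // 8
	TitleCase               // 16
	MixedCase               // 32
)

func main() {
	// Traits combine as a bit mask: a title-cased token that opens a line.
	traits := FirstInLine | TitleCase
	fmt.Println(traits&TitleCase != 0) // true
	fmt.Println(traits&LowerCase != 0) // false
}
```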

Variables

This section is empty.

Functions

func AgreeingOCRs

func AgreeingOCRs(t Token, i, n int) (float64, bool)

AgreeingOCRs returns the number of OCRs that agree with the master OCR token.

func ApplyOCRToCorrection

func ApplyOCRToCorrection(ocr, sug string) string

ApplyOCRToCorrection applies the casing of the master OCR string to the correction's candidate suggestion and prepends and appends any punctuation of the master OCR to the suggestion.
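
A toy version of this behavior, assuming simplified casing and punctuation rules (applyOCRToCorrection below is a hypothetical reimplementation, not the package's actual algorithm):

```go
package main

import (
	"fmt"
	"strings"
	"unicode"
)

// applyOCRToCorrection carries the OCR token's leading/trailing
// punctuation and casing over to the suggestion. The casing rules
// (all-upper and first-letter-upper only) are simplifications.
func applyOCRToCorrection(ocr, sug string) string {
	// Split off leading and trailing punctuation from the OCR token.
	core := strings.TrimFunc(ocr, unicode.IsPunct)
	start := strings.Index(ocr, core)
	prefix, suffix := ocr[:start], ocr[start+len(core):]

	// Mirror the OCR core's casing onto the suggestion.
	switch {
	case core != "" && core == strings.ToUpper(core) && core != strings.ToLower(core):
		sug = strings.ToUpper(sug)
	case core != "" && unicode.IsUpper([]rune(core)[0]):
		r := []rune(sug)
		if len(r) > 0 {
			r[0] = unicode.ToUpper(r[0])
		}
		sug = string(r)
	}
	return prefix + sug + suffix
}

func main() {
	fmt.Println(applyOCRToCorrection("»Vnd,", "und")) // »Und,
}
```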

func CandidateAgreeingOCR

func CandidateAgreeingOCR(t Token, i, n int) (float64, bool)

CandidateAgreeingOCR returns the number of OCR tokens that agree with the specific profiler candidate of the token.

func CandidateHistPatternConf

func CandidateHistPatternConf(t Token, i, n int) (float64, bool)

CandidateHistPatternConf returns the product of the confidences of the primary OCR characters for the assumed historical rewrite pattern of the connected candidate.

func CandidateLen

func CandidateLen(t Token, i, n int) (float64, bool)

CandidateLen returns the length of the connected profiler candidate.

func CandidateLevenshteinDist

func CandidateLevenshteinDist(t Token, i, n int) (float64, bool)

CandidateLevenshteinDist returns the Levenshtein distance between the OCR token and the token's connected profiler candidate. For the master OCR, the distance stored with the profiler candidate is used, whereas for support OCRs the Levenshtein distance is calculated directly.

func CandidateMatchesOCR

func CandidateMatchesOCR(t Token, i, n int) (float64, bool)

CandidateMatchesOCR returns true if the corresponding OCR matches the connected candidate and false otherwise.

func CandidateMaxTrigramFreq

func CandidateMaxTrigramFreq(t Token, i, n int) (float64, bool)

CandidateMaxTrigramFreq returns the maximal trigram frequency for the connected candidate.

func CandidateMinTrigramFreq

func CandidateMinTrigramFreq(t Token, i, n int) (float64, bool)

CandidateMinTrigramFreq returns the minimal trigram frequency for the connected candidate.

func CandidateOCRPatternConf

func CandidateOCRPatternConf(t Token, i, n int) (float64, bool)

CandidateOCRPatternConf returns the product of the confidences of the primary OCR characters for the assumed OCR error pattern of the connected candidate.

func CandidateProfilerWeight

func CandidateProfilerWeight(t Token, i, n int) (float64, bool)

CandidateProfilerWeight returns the profiler confidence value for the token's candidate.

func CandidateTrigramFreq

func CandidateTrigramFreq(t Token, i, n int) (float64, bool)

CandidateTrigramFreq returns the product of the candidate's trigrams.

func CandidateUnigramFreq

func CandidateUnigramFreq(t Token, i, n int) (float64, bool)

CandidateUnigramFreq returns the relative frequency of the token's candidate.

func ConnectCandidates

func ConnectCandidates(ctx context.Context, g *errgroup.Group, in <-chan Token) <-chan Token

ConnectCandidates connects tokens with their respective candidates to the stream. Tokens with no candidates or tokens with only a modern interpretation are filtered from the stream.

func DocumentLexicality

func DocumentLexicality(t Token, i, n int) (float64, bool)

DocumentLexicality returns the (global) lexicality of the given token's document. Using this feature only makes sense if the training set contains more than one document.

func EachToken

func EachToken(ctx context.Context, in <-chan Token, f func(Token) error) error

EachToken iterates over the tokens in the input channel and calls the callback function for each token.

func EachTokenGroup added in v0.0.6

func EachTokenGroup(ctx context.Context, in <-chan Token, f func(string, ...Token) error) error

EachTokenGroup iterates over the tokens, grouping them by their group. The given callback function is called once for each group of tokens.

func FilterLexiconEntries

func FilterLexiconEntries(ctx context.Context, g *errgroup.Group, in <-chan Token) <-chan Token

FilterLexiconEntries filters all tokens that are lexicon entries from the stream.

func FilterShort

func FilterShort(ctx context.Context, g *errgroup.Group, in <-chan Token) <-chan Token

FilterShort filters short master OCR tokens from the input stream. Short tokens are tokens with fewer than 4 Unicode characters.
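
The 4-character threshold counts Unicode characters (runes), not bytes; a sketch of the criterion as described (isShort is a hypothetical helper, not part of the package):

```go
package main

import (
	"fmt"
	"unicode/utf8"
)

// isShort mirrors FilterShort's criterion: fewer than 4 Unicode
// characters. RuneCountInString counts runes, not bytes.
func isShort(token string) bool {
	return utf8.RuneCountInString(token) < 4
}

func main() {
	fmt.Println(isShort("daß"))  // true: 3 runes (4 bytes)
	fmt.Println(isShort("Haus")) // false: 4 runes
}
```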

func Normalize

func Normalize(ctx context.Context, g *errgroup.Group, in <-chan Token) <-chan Token

Normalize trims all leading and trailing punctuation from the tokens, converts them to lowercase, and replaces any whitespace (in the case of merges due to the alignment) with a '_'.
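
The described normalization can be sketched as follows (normalize is an illustrative helper under the assumptions above, not the package's implementation):

```go
package main

import (
	"fmt"
	"strings"
	"unicode"
)

// normalize trims leading/trailing punctuation, lowercases the token,
// and replaces any remaining whitespace with '_'.
func normalize(token string) string {
	token = strings.TrimFunc(token, unicode.IsPunct)
	token = strings.ToLower(token)
	return strings.Map(func(r rune) rune {
		if unicode.IsSpace(r) {
			return '_'
		}
		return r
	}, token)
}

func main() {
	fmt.Println(normalize("«Foo bar.»")) // foo_bar
}
```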

func OCRMaxCharConf

func OCRMaxCharConf(t Token, i, n int) (float64, bool)

OCRMaxCharConf returns the maximal character confidence of the master OCR token.

func OCRMaxTrigramFreq

func OCRMaxTrigramFreq(t Token, i, n int) (float64, bool)

OCRMaxTrigramFreq returns the maximal relative trigram frequency confidence of the tokens.

func OCRMinCharConf

func OCRMinCharConf(t Token, i, n int) (float64, bool)

OCRMinCharConf returns the minimal character confidence of the master OCR token.

func OCRMinTrigramFreq

func OCRMinTrigramFreq(t Token, i, n int) (float64, bool)

OCRMinTrigramFreq returns the minimal relative trigram frequency confidence of the tokens.

func OCRTokenConf

func OCRTokenConf(t Token, i, n int) (float64, bool)

OCRTokenConf returns the OCR confidence for the given configuration.

func OCRTokenLen

func OCRTokenLen(t Token, i, n int) (float64, bool)

OCRTokenLen returns the length of the OCR token. It operates on any configuration.

func OCRTrigramFreq

func OCRTrigramFreq(t Token, i, n int) (float64, bool)

OCRTrigramFreq returns the product of the OCR token's trigrams.

func OCRUnigramFreq

func OCRUnigramFreq(t Token, i, n int) (float64, bool)

OCRUnigramFreq returns the relative frequency of the OCR token in the unigram language model.

func Pipe

func Pipe(ctx context.Context, g *errgroup.Group, r StreamFunc, ps ...StreamFunc) <-chan Token

Pipe pipes multiple stream funcs together. The first function in the list (the reader) is called with a nil channel.

func RankingCandidateConfDiffToNext

func RankingCandidateConfDiffToNext(t Token, i, n int) (float64, bool)

RankingCandidateConfDiffToNext returns the top-ranked candidate's weight minus the weight of the next (or 0).

func RankingConf

func RankingConf(t Token, i, n int) (float64, bool)

RankingConf returns the confidence of the best ranked correction candidate for the given token.

func RankingConfDiffToNext

func RankingConfDiffToNext(t Token, i, n int) (float64, bool)

RankingConfDiffToNext returns the difference of the best ranked correction candidate's confidence to the next. If only one correction candidate is available, the next ranking's confidence is assumed to be 0.
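
The fallback for a single candidate can be sketched with a hypothetical helper (not from the package):

```go
package main

import "fmt"

// diffToNext returns the top-ranked confidence minus the runner-up's;
// with a single candidate the runner-up's confidence counts as 0.
func diffToNext(confs []float64) float64 {
	if len(confs) == 0 {
		return 0
	}
	if len(confs) == 1 {
		return confs[0] // next ranking's confidence assumed to be 0
	}
	return confs[0] - confs[1]
}

func main() {
	fmt.Println(diffToNext([]float64{0.75, 0.25})) // 0.5
	fmt.Println(diffToNext([]float64{0.75}))       // 0.75
}
```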

func SendTokens

func SendTokens(ctx context.Context, out chan<- Token, tokens ...Token) error

SendTokens writes tokens into the given output channel.

Types

type Char

type Char struct {
	Conf float64 // confidence of the rune
	Char rune    // rune
}

Char represents an OCR char with its confidence.

func (Char) String

func (char Char) String() string

type Chars

type Chars []Char

Chars represents the master OCR chars with the respective confidences.

func (Chars) String

func (chars Chars) String() string

type Config

type Config struct {
	Model          string   `json:"model,omitempty"`
	Ngrams         string   `json:"ngrams"`
	ProfilerBin    string   `json:"profilerBin"`
	ProfilerConfig string   `json:"profilerConfig"`
	RRFeatures     []string `json:"rrFeatures"`
	DMFeatures     []string `json:"dmFeatures"`
	LearningRate   float64  `json:"learningRate"`
	Ntrain         int      `json:"ntrain"`
	Nocr           int      `json:"nocr"`
	Cache          bool     `json:"cache"`
	Cautious       bool     `json:"cautious"`
}

Config defines the command's configuration.

func ReadConfig

func ReadConfig(file string) (*Config, error)

ReadConfig reads the config from a json file.

func (*Config) Overwrite

func (c *Config) Overwrite(model string, nocr int, cautious, cache bool)

Overwrite overwrites the corresponding fields in the configuration with the given values. A value overwrites a field only if it is not Go's zero value for its type.

type Correction

type Correction struct {
	Candidate *gofiler.Candidate
	Conf      float64
}

Correction represents a correction decision for tokens.

type FeatureFunc

type FeatureFunc func(t Token, i, n int) (float64, bool)

FeatureFunc defines the function a feature needs to implement. A feature func gets a token and a configuration (the current OCR index i and the total number of parallel OCRs n). The function should return the feature value for the given token and whether this feature applies for the given configuration (i and n).

type FeatureSet

type FeatureSet []FeatureFunc

FeatureSet is just a list of feature funcs.

func NewFeatureSet

func NewFeatureSet(names ...string) (FeatureSet, error)

NewFeatureSet creates a new feature set from the list of feature function names.

func (FeatureSet) Calculate

func (fs FeatureSet) Calculate(xs []float64, t Token, n int) []float64

Calculate calculates the feature vector for the given feature functions for the given token and the given number of OCRs and appends it to the given vector. Any feature function that does not apply to the given configuration (i.e. returns false as its second return value) is omitted from the resulting feature vector.
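
The skipping behavior can be sketched with a reduced signature (featureFunc, calculate, and the loop order are assumptions of this sketch):

```go
package main

import "fmt"

// featureFunc is a stand-in for apoco's FeatureFunc, reduced to the
// configuration arguments this sketch needs.
type featureFunc func(i, n int) (float64, bool)

// calculate appends, for each OCR index 0..n-1, the values of all
// feature funcs that apply to that configuration; funcs returning
// false are skipped.
func calculate(xs []float64, fs []featureFunc, n int) []float64 {
	for i := 0; i < n; i++ {
		for _, f := range fs {
			if v, ok := f(i, n); ok {
				xs = append(xs, v)
			}
		}
	}
	return xs
}

// always applies to every configuration.
func always(i, n int) (float64, bool) { return float64(i), true }

// masterOnly applies only to the master OCR (index 0).
func masterOnly(i, n int) (float64, bool) { return 7, i == 0 }

func main() {
	fmt.Println(calculate(nil, []featureFunc{always, masterOnly}, 2))
	// [0 7 1]: masterOnly contributes only for index 0
}
```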

type FreqList

type FreqList struct {
	FreqList map[string]int `json:"freqList"`
	Total    int            `json:"total"`
}

FreqList is a simple frequency map.

type LanguageModel

type LanguageModel struct {
	Profile    gofiler.Profile
	Lexicality float64
	// contains filtered or unexported fields
}

LanguageModel holds the language model for tokens.

func (*LanguageModel) AddUnigram

func (lm *LanguageModel) AddUnigram(token string)

AddUnigram adds the token to the language model's unigram map.

func (*LanguageModel) EachTrigram

func (lm *LanguageModel) EachTrigram(str string, f func(float64))

EachTrigram iterates over the trigrams of the given string and calls the callback with each trigram's frequency.

func (*LanguageModel) LoadGzippedNGram

func (lm *LanguageModel) LoadGzippedNGram(path string) error

LoadGzippedNGram loads the (gzipped) ngram model file. The expected format for each line is `%d,%s`.
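
Parsing a single line of that format can be sketched as follows (parseNGramLine is a hypothetical helper; the real loader additionally handles gzip decompression and line scanning):

```go
package main

import (
	"fmt"
	"strconv"
	"strings"
)

// parseNGramLine parses one line of the `%d,%s` format: an integer
// count, a comma, then the ngram itself.
func parseNGramLine(line string) (int, string, error) {
	parts := strings.SplitN(line, ",", 2)
	if len(parts) != 2 {
		return 0, "", fmt.Errorf("bad ngram line: %q", line)
	}
	n, err := strconv.Atoi(parts[0])
	if err != nil {
		return 0, "", fmt.Errorf("bad count in %q: %v", line, err)
	}
	return n, parts[1], nil
}

func main() {
	n, gram, err := parseNGramLine("42,und")
	fmt.Println(n, gram, err) // 42 und <nil>
}
```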

func (*LanguageModel) LoadProfile

func (lm *LanguageModel) LoadProfile(ctx context.Context, exe, config string, cache bool, tokens ...Token) error

LoadProfile loads the profile for the master OCR tokens.

func (*LanguageModel) Trigram

func (lm *LanguageModel) Trigram(str string) float64

Trigram looks up the trigrams of the given token and returns the product of the token's trigrams.

func (*LanguageModel) Unigram

func (lm *LanguageModel) Unigram(str string) float64

Unigram looks up the given token in the unigram list (or 0 if the unigram is not present).

type Model

type Model struct {
	Models map[string]map[int]modelData `json:"models"`
	Ngrams FreqList                     `json:"ngrams"`
}

Model holds the different models for the different number of OCRs.

func ReadModel

func ReadModel(model, ngrams string) (Model, error)

ReadModel reads a model from a gzip compressed input file. If the given file does not exist, an empty model is returned. If the model does not contain a valid ngram frequency list, the list is loaded from the given path.

func (Model) Get

func (m Model) Get(mod string, nocr int) (*ml.LR, FeatureSet, error)

Get loads the model and the corresponding feature set for the given configuration.

func (Model) Put

func (m Model) Put(mod string, nocr int, lr *ml.LR, fs []string)

Put inserts the weights and the corresponding feature set for the given configuration into this model.

func (Model) Write

func (m Model) Write(path string) (err error)

Write writes the model as a JSON-encoded, gzipped file to the given path, overwriting any previously existing model.

type Ranking

type Ranking struct {
	Candidate *gofiler.Candidate
	Prob      float64
}

Ranking maps correction candidates of tokens to their predicted probabilities.

type StreamFunc

type StreamFunc func(context.Context, *errgroup.Group, <-chan Token) <-chan Token

StreamFunc is a type def for stream funcs.

func ConnectCorrections

func ConnectCorrections(lr *ml.LR, fs FeatureSet, n int) StreamFunc

ConnectCorrections connects the tokens with the decider's correction decisions.

func ConnectLM

func ConnectLM(c *Config, ngrams FreqList) StreamFunc

ConnectLM loads the language model for the tokens and adds it to each token. Depending on the file group of the tokens, different language models are loaded.

func ConnectRankings

func ConnectRankings(lr *ml.LR, fs FeatureSet, n int) StreamFunc

ConnectRankings connects the tokens of the input stream with their respective rankings.

func FilterBad

func FilterBad(min int) StreamFunc

FilterBad filters tokens that do not have enough OCR and/or GT tokens.

type Token

type Token struct {
	LM      *LanguageModel // language model for this token
	Payload interface{}    // token payload; *gofiler.Candidate, []Ranking or Correction
	File    string         // the file of the token
	Group   string         // file group of the token
	ID      string         // id of the token in this file
	Chars   Chars          // master OCR tokens with confidences
	Confs   []float64      // master and support OCR confidences
	Tokens  []string       // master and support OCRs and gt
	Lines   []string       // lines of the tokens
	// contains filtered or unexported fields
}

Token represents an aligned OCR token.

func ReadToken

func ReadToken(ctx context.Context, in <-chan Token) (Token, bool, error)

ReadToken reads one token from the given channel.

func (*Token) HasTrait

func (t *Token) HasTrait(i int, trait TraitType) bool

HasTrait returns true if the token has the given trait.

func (Token) IsLexiconEntry

func (t Token) IsLexiconEntry() bool

IsLexiconEntry returns true if this token is a normal lexicon entry for its connected language model.

func (*Token) SetTrait

func (t *Token) SetTrait(i int, trait TraitType)

SetTrait sets a trait.

func (Token) String

func (t Token) String() string

type TraitType

type TraitType int64

TraitType is used to define different traits for the tokens.

Directories

Path Synopsis
Package node provides helper functions to work with queryxml.Node pointers.
