metrics

package

v0.3.1 Latest Latest Go to latest Published: Sep 27, 2023 License: MIT Imports: 5 Imported by: 70

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/adrg/strutil

Links

Open Source Insights

Documentation ¶

Index ¶

type Hamming
- func NewHamming() *Hamming
- func (m *Hamming) Compare(a, b string) float64
- func (m *Hamming) Distance(a, b string) int
type Jaccard
- func NewJaccard() *Jaccard
- func (m *Jaccard) Compare(a, b string) float64
type Jaro
- func NewJaro() *Jaro
- func (m *Jaro) Compare(a, b string) float64
type JaroWinkler
- func NewJaroWinkler() *JaroWinkler
- func (m *JaroWinkler) Compare(a, b string) float64
type Levenshtein
- func NewLevenshtein() *Levenshtein
- func (m *Levenshtein) Compare(a, b string) float64
- func (m *Levenshtein) Distance(a, b string) int
type MatchMismatch
- func (m MatchMismatch) Compare(a []rune, idxA int, b []rune, idxB int) float64
- func (m MatchMismatch) Max() float64
- func (m MatchMismatch) Min() float64
type OverlapCoefficient
- func NewOverlapCoefficient() *OverlapCoefficient
- func (m *OverlapCoefficient) Compare(a, b string) float64
type SmithWatermanGotoh
- func NewSmithWatermanGotoh() *SmithWatermanGotoh
- func (m *SmithWatermanGotoh) Compare(a, b string) float64
type SorensenDice
- func NewSorensenDice() *SorensenDice
- func (m *SorensenDice) Compare(a, b string) float64
type Substitution

Constants ¶

This section is empty.

Variables ¶

This section is empty.

Functions ¶

This section is empty.

Types ¶

type Hamming ¶ added in v0.2.2

type Hamming struct {
	// CaseSensitive specifies if the string comparison is case sensitive.
	CaseSensitive bool
}

Hamming represents the Hamming metric for measuring the similarity between sequences.

For more information see https://en.wikipedia.org/wiki/Hamming_distance.

Example ¶

package main

import (
	"fmt"

	"github.com/adrg/strutil/metrics"
)

func main() {
	// Default options.
	h := metrics.NewHamming()

	sim := h.Compare("text", "test")
	fmt.Printf("(text, test) similarity: %.2f\n", sim)

	dist := h.Distance("text", "test")
	fmt.Printf("(text, test) distance: %d\n", dist)

	// Custom options.
	h.CaseSensitive = false

	sim = h.Compare("ONE", "once")
	fmt.Printf("(ONE, once) similarity: %.2f\n", sim)

	dist = h.Distance("one", "once")
	fmt.Printf("(ONE, once) distance: %d\n", dist)

}

Output:

(text, test) similarity: 0.75
(text, test) distance: 1
(ONE, once) similarity: 0.50
(ONE, once) distance: 2

func NewHamming ¶ added in v0.2.2

func NewHamming() *Hamming

NewHamming returns a new Hamming string metric.

Default options:

CaseSensitive: true

func (*Hamming) Compare ¶ added in v0.2.2

func (m *Hamming) Compare(a, b string) float64

Compare returns the Hamming similarity of a and b. The returned similarity is a number between 0 and 1. Larger similarity numbers indicate closer matches.

func (*Hamming) Distance ¶ added in v0.2.2

func (m *Hamming) Distance(a, b string) int

Distance returns the Hamming distance between a and b. Lower distances indicate closer matches. A distance of 0 means the strings are identical.

type Jaccard ¶ added in v0.2.0

type Jaccard struct {
	// CaseSensitive specifies if the string comparison is case sensitive.
	CaseSensitive bool

	// NgramSize represents the size (in characters) of the tokens generated
	// when comparing the input sequences.
	NgramSize int
}

Jaccard represents the Jaccard index for measuring the similarity between sequences.

For more information see https://en.wikipedia.org/wiki/Jaccard_index.

Example ¶

package main

import (
	"fmt"

	"github.com/adrg/strutil/metrics"
)

func main() {
	// Default options.
	j := metrics.NewJaccard()
	sim := j.Compare("night", "alright")
	fmt.Printf("(night, alright) similarity: %.2f\n", sim)

	// Custom options.
	j.CaseSensitive = false
	j.NgramSize = 3

	sim = j.Compare("night", "alright")
	fmt.Printf("(night, alright) similarity: %.2f\n", sim)

}

Output:

(night, alright) similarity: 0.43
(night, alright) similarity: 0.33

func NewJaccard ¶ added in v0.2.0

func NewJaccard() *Jaccard

NewJaccard returns a new Jaccard string metric.

Default options:

CaseSensitive: true
NgramSize: 2

func (*Jaccard) Compare ¶ added in v0.2.0

func (m *Jaccard) Compare(a, b string) float64

Compare returns the Jaccard similarity coefficient of a and b. The returned similarity is a number between 0 and 1. Larger similarity numbers indicate closer matches. An n-gram size of 2 is used if the provided size is less than or equal to 0.

type Jaro ¶

type Jaro struct {
	// CaseSensitive specifies if the string comparison is case sensitive.
	CaseSensitive bool
}

Jaro represents the Jaro metric for measuring the similarity between sequences.

For more information see https://en.wikipedia.org/wiki/Jaro-Winkler_distance.

Example ¶

package main

import (
	"fmt"

	"github.com/adrg/strutil/metrics"
)

func main() {
	jaro := metrics.NewJaro()
	sim := jaro.Compare("sort", "shirt")
	fmt.Printf("(sort, shirt) similarity: %.2f\n", sim)

}

Output:

(sort, shirt) similarity: 0.78

func NewJaro ¶

func NewJaro() *Jaro

NewJaro returns a new Jaro string metric.

Default options:

CaseSensitive: true

func (*Jaro) Compare ¶

func (m *Jaro) Compare(a, b string) float64

Compare returns the Jaro similarity of a and b. The returned similarity is a number between 0 and 1. Larger similarity numbers indicate closer matches.

type JaroWinkler ¶

type JaroWinkler struct {
	// CaseSensitive specifies if the string comparison is case sensitive.
	CaseSensitive bool
}

JaroWinkler represents the Jaro-Winkler metric for measuring the similarity between sequences.

For more information see https://en.wikipedia.org/wiki/Jaro-Winkler_distance.

Example ¶

package main

import (
	"fmt"

	"github.com/adrg/strutil/metrics"
)

func main() {
	jw := metrics.NewJaroWinkler()
	sim := jw.Compare("sort", "shirt")
	fmt.Printf("(sort, shirt) similarity: %.2f\n", sim)

}

Output:

(sort, shirt) similarity: 0.80

func NewJaroWinkler ¶

func NewJaroWinkler() *JaroWinkler

NewJaroWinkler returns a new Jaro-Winkler string metric.

Default options:

CaseSensitive: true

func (*JaroWinkler) Compare ¶

func (m *JaroWinkler) Compare(a, b string) float64

Compare returns the Jaro-Winkler similarity of a and b. The returned similarity is a number between 0 and 1. Larger similarity numbers indicate closer matches.

type Levenshtein ¶

type Levenshtein struct {
	// CaseSensitive specifies if the string comparison is case sensitive.
	CaseSensitive bool

	// InsertCost represents the Levenshtein cost of a character insertion.
	InsertCost int

	// DeleteCost represents the Levenshtein cost of a character deletion.
	DeleteCost int

	// ReplaceCost represents the Levenshtein cost of a character substitution.
	ReplaceCost int
}

Levenshtein represents the Levenshtein metric for measuring the similarity between sequences.

For more information see https://en.wikipedia.org/wiki/Levenshtein_distance.

Example ¶

package main

import (
	"fmt"

	"github.com/adrg/strutil/metrics"
)

func main() {
	// Default options.
	lev := metrics.NewLevenshtein()

	sim := lev.Compare("book", "brick")
	fmt.Printf("(book, brick) similarity: %.2f\n", sim)

	dist := lev.Distance("book", "brick")
	fmt.Printf("(book, brick) distance: %d\n", dist)

	// Custom options.
	lev.CaseSensitive = false
	lev.ReplaceCost = 2

	sim = lev.Compare("HELLO", "jello")
	fmt.Printf("(HELLO, jello) similarity: %.2f\n", sim)

	dist = lev.Distance("HELLO", "jello")
	fmt.Printf("(HELLO, jello) distance: %d\n", dist)

}

Output:

(book, brick) similarity: 0.40
(book, brick) distance: 3
(HELLO, jello) similarity: 0.60
(HELLO, jello) distance: 2

func NewLevenshtein ¶

func NewLevenshtein() *Levenshtein

NewLevenshtein returns a new Levenshtein string metric.

Default options:

CaseSensitive: true
InsertCost: 1
DeleteCost: 1
ReplaceCost: 1

func (*Levenshtein) Compare ¶

func (m *Levenshtein) Compare(a, b string) float64

Compare returns the Levenshtein similarity of a and b. The returned similarity is a number between 0 and 1. Larger similarity numbers indicate closer matches.

func (*Levenshtein) Distance ¶

func (m *Levenshtein) Distance(a, b string) int

Distance returns the Levenshtein distance between a and b. Lower distances indicate closer matches. A distance of 0 means the strings are identical.

type MatchMismatch ¶

type MatchMismatch struct {
	// Match represents the score of equal character substitutions.
	Match float64

	// Mismatch represents the score of unequal character substitutions.
	Mismatch float64
}

MatchMismatch represents a substitution function which returns the match or mismatch value depending on the equality of the compared characters. The match value must be greater than the mismatch value.

func (MatchMismatch) Compare ¶

func (m MatchMismatch) Compare(a []rune, idxA int, b []rune, idxB int) float64

Compare returns the match value if a[idxA] is equal to b[idxB] or the mismatch value otherwise.

func (MatchMismatch) Max ¶

func (m MatchMismatch) Max() float64

Max returns the match value.

func (MatchMismatch) Min ¶

func (m MatchMismatch) Min() float64

Min returns the mismatch value.

type OverlapCoefficient ¶ added in v0.2.0

type OverlapCoefficient struct {
	// CaseSensitive specifies if the string comparison is case sensitive.
	CaseSensitive bool

	// NgramSize represents the size (in characters) of the tokens generated
	// when comparing the input sequences.
	NgramSize int
}

OverlapCoefficient represents the overlap coefficient for measuring the similarity between sequences. The metric is also known as the Szymkiewicz-Simpson coefficient.

For more information see https://en.wikipedia.org/wiki/Overlap_coefficient.

Example ¶

package main

import (
	"fmt"

	"github.com/adrg/strutil/metrics"
)

func main() {
	// Default options.
	oc := metrics.NewOverlapCoefficient()
	sim := oc.Compare("night", "alright")
	fmt.Printf("(night, alright) similarity: %.2f\n", sim)

	// Subset comparison.
	sim = oc.Compare("aa", "aaaa")
	fmt.Printf("(aa, aaaa) similarity: %.2f\n", sim)

	// Custom options.
	oc.CaseSensitive = false
	oc.NgramSize = 3

	sim = oc.Compare("night", "alright")
	fmt.Printf("(night, alright) similarity: %.2f\n", sim)

}

Output:

(night, alright) similarity: 0.75
(aa, aaaa) similarity: 1.00
(night, alright) similarity: 0.67

func NewOverlapCoefficient ¶ added in v0.2.0

func NewOverlapCoefficient() *OverlapCoefficient

NewOverlapCoefficient returns a new overlap coefficient string metric.

Default options:

CaseSensitive: true
NgramSize: 2

func (*OverlapCoefficient) Compare ¶ added in v0.2.0

func (m *OverlapCoefficient) Compare(a, b string) float64

Compare returns the overlap coefficient similarity of a and b. The returned similarity is a number between 0 and 1. Larger similarity numbers indicate closer matches. An n-gram size of 2 is used if the provided size is less than or equal to 0.

type SmithWatermanGotoh ¶

type SmithWatermanGotoh struct {
	// CaseSensitive specifies if the string comparison is case sensitive.
	CaseSensitive bool

	// GapPenalty defines a score penalty for character insertions or deletions.
	// For relevant results, the gap penalty should be a non-positive number.
	GapPenalty float64

	// Substitution represents a substitution function which is used to
	// calculate a score for character substitutions.
	Substitution Substitution
}

SmithWatermanGotoh represents the Smith-Waterman-Gotoh metric for measuring the similarity between sequences.

For more information see https://en.wikipedia.org/wiki/Smith-Waterman_algorithm.

Example ¶

package main

import (
	"fmt"

	"github.com/adrg/strutil/metrics"
)

func main() {
	// Default options.
	swg := metrics.NewSmithWatermanGotoh()

	sim := swg.Compare("a pink kitten", "a kitten")
	fmt.Printf("(a pink kitten, a kitten) similarity: %.2f\n", sim)

	// Custom options.
	swg.CaseSensitive = false
	swg.GapPenalty = -0.1
	swg.Substitution = metrics.MatchMismatch{
		Match:    1,
		Mismatch: -0.5,
	}

	sim = swg.Compare("a pink kitten", "A KITTEN")
	fmt.Printf("(a pink kitten, A KITTEN) similarity: %.2f\n", sim)

}

Output:

(a pink kitten, a kitten) similarity: 0.88
(a pink kitten, A KITTEN) similarity: 0.94

func NewSmithWatermanGotoh ¶

func NewSmithWatermanGotoh() *SmithWatermanGotoh

NewSmithWatermanGotoh returns a new Smith-Waterman-Gotoh string metric.

Default options:

CaseSensitive: true
GapPenalty: -0.5
Substitution: MatchMismatch{
	Match:    1,
	Mismatch: -2,
},

func (*SmithWatermanGotoh) Compare ¶

func (m *SmithWatermanGotoh) Compare(a, b string) float64

Compare returns the Smith-Waterman-Gotoh similarity of a and b. The returned similarity is a number between 0 and 1. Larger similarity numbers indicate closer matches.

type SorensenDice ¶

type SorensenDice struct {
	// CaseSensitive specifies if the string comparison is case sensitive.
	CaseSensitive bool

	// NgramSize represents the size (in characters) of the tokens generated
	// when comparing the input sequences.
	NgramSize int
}

SorensenDice represents the Sorensen-Dice metric for measuring the similarity between sequences.

For more information see https://en.wikipedia.org/wiki/Sorensen-Dice_coefficient.

Example ¶

package main

import (
	"fmt"

	"github.com/adrg/strutil/metrics"
)

func main() {
	// Default options.
	sd := metrics.NewSorensenDice()
	sim := sd.Compare("night", "alright")
	fmt.Printf("(night, alright) similarity: %.2f\n", sim)

	// Custom options.
	sd.CaseSensitive = false
	sd.NgramSize = 3

	sim = sd.Compare("night", "alright")
	fmt.Printf("(night, alright) similarity: %.2f\n", sim)

}

Output:

(night, alright) similarity: 0.60
(night, alright) similarity: 0.50

func NewSorensenDice ¶

func NewSorensenDice() *SorensenDice

NewSorensenDice returns a new Sorensen-Dice string metric.

Default options:

CaseSensitive: true
NgramSize: 2

func (*SorensenDice) Compare ¶

func (m *SorensenDice) Compare(a, b string) float64

Compare returns the Sorensen-Dice similarity coefficient of a and b. The returned similarity is a number between 0 and 1. Larger similarity numbers indicate closer matches. An n-gram size of 2 is used if the provided size is less than or equal to 0.

type Substitution ¶

type Substitution interface {
	// Compare returns the substitution score of characters a[idxA] and b[idxB].
	Compare(a []rune, idxA int, b []rune, idxB int) float64

	// Max returns the maximum score of a character substitution operation.
	Max() float64

	// Min returns the minimum score of a character substitution operation.
	Min() float64
}

Substitution represents a substitution function which is used to calculate a score for character substitutions.

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL