Documentation ¶
Overview ¶
simhash package implements Charikar's simhash algorithm to generate a 64-bit fingerprint of a given document.
simhash fingerprints have the property that similar documents will have a similar fingerprint. Therefore, the hamming distance between two fingerprints will be small if the documents are similar
Example (Output) ¶
for standalone test, change package to main and the next func def to, func main() {
package main import ( "fmt" "github.com/go-dedup/simhash" ) func main() { var docs = [][]byte{ []byte("this is a test phrase"), []byte("this is a test phrass"), []byte("these are test phrases"), []byte("foo bar"), } hashes := make([]uint64, len(docs)) for i, d := range docs { hashes[i] = simhash.Simhash(simhash.NewWordFeatureSet(d)) fmt.Printf("Simhash of '%s': %x\n", d, hashes[i]) } fmt.Printf("Comparison of `%s` and `%s`: %d\n", docs[0], docs[1], simhash.Compare(hashes[0], hashes[1])) fmt.Printf("Comparison of `%s` and `%s`: %d\n", docs[0], docs[2], simhash.Compare(hashes[0], hashes[2])) fmt.Printf("Comparison of `%s` and `%s`: %d\n", docs[0], docs[3], simhash.Compare(hashes[0], hashes[3])) }
Output: Simhash of 'this is a test phrase': 8c3a5f7e9ecb3f35 Simhash of 'this is a test phrass': 8c3a5f7e9ecb3f21 Simhash of 'these are test phrases': ddfdbf7fbfaffb1d Simhash of 'foo bar': d8dbe7186bad3db3 Comparison of `this is a test phrase` and `this is a test phrass`: 2 Comparison of `this is a test phrase` and `these are test phrases`: 22 Comparison of `this is a test phrase` and `foo bar`: 29
Index ¶
- func Compare(a uint64, b uint64) uint8
- func Fingerprint(v Vector) uint64
- func NewFeature(f []byte) feature
- func NewFeatureWithWeight(f []byte, weight int) feature
- func Shingle(w int, b [][]byte) [][]byte
- func Simhash(fs FeatureSet) uint64
- func SimhashBytes(b [][]byte) uint64
- type Feature
- type FeatureSet
- type UnicodeWordFeatureSet
- type Vector
- type WordFeatureSet
Examples ¶
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func Compare ¶
Compare calculates the Hamming distance between two 64-bit integers
Currently, this is calculated using the Kernighan method [1]. Other methods exist which may be more efficient and are worth exploring at some point
[1] http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetKernighan
func Fingerprint ¶
Fingerprint returns a 64-bit fingerprint of the given vector. The fingerprint f of a given 64-dimension vector v is defined as follows:
f[i] = 1 if v[i] >= 0 f[i] = 0 if v[i] < 0
func NewFeature ¶
func NewFeature(f []byte) feature
Returns a new feature representing the given byte slice, using a weight of 1
func NewFeatureWithWeight ¶
Returns a new feature representing the given byte slice with the given weight
func Shingle ¶
Shingle returns the w-shingling of the given set of bytes. For example, if the given input was {"this", "is", "a", "test"}, this returns {"this is", "is a", "a test"}
func SimhashBytes ¶
Returns a 64-bit simhash of the given bytes
Types ¶
type Feature ¶
type Feature interface { // Sum returns the 64-bit sum of this feature Sum() uint64 // Weight returns the weight of this feature Weight() int }
Feature consists of a 64-bit hash and a weight
type FeatureSet ¶
type FeatureSet interface {
GetFeatures() []Feature
}
FeatureSet represents a set of features in a given document
type UnicodeWordFeatureSet ¶
type UnicodeWordFeatureSet struct {
// contains filtered or unexported fields
}
UnicodeWordFeatureSet is a feature set in which each word is a feature, all equal weight.
See: http://blog.golang.org/normalization See: https://groups.google.com/forum/#!topic/golang-nuts/YyH1f_qCZVc
func NewUnicodeWordFeatureSet ¶
func NewUnicodeWordFeatureSet(b []byte, f norm.Form) *UnicodeWordFeatureSet
Example (InChinese) ¶
text := []byte("当山峰没有棱角的时候") fs := NewUnicodeWordFeatureSet(text, norm.NFKC) fmt.Printf("%#v\n", fs) actual := fs.GetFeatures() fmt.Printf("%#v\n", actual)
Output: &simhash.UnicodeWordFeatureSet{b:[]uint8{0xe5, 0xbd, 0x93, 0xe5, 0xb1, 0xb1, 0xe5, 0xb3, 0xb0, 0xe6, 0xb2, 0xa1, 0xe6, 0x9c, 0x89, 0xe6, 0xa3, 0xb1, 0xe8, 0xa7, 0x92, 0xe7, 0x9a, 0x84, 0xe6, 0x97, 0xb6, 0xe5, 0x80, 0x99}, f:2} []simhash.Feature{simhash.feature{sum:0xa5edea16c0c7a180, weight:1}}
Example (InWestern) ¶
text := []byte("la fin d'un bel après-midi d'été") fs := NewUnicodeWordFeatureSet(text, norm.NFKC) fmt.Printf("%#v\n", fs) actual := fs.GetFeatures() fmt.Printf("%#v\n", actual)
Output: &simhash.UnicodeWordFeatureSet{b:[]uint8{0x6c, 0x61, 0x20, 0x66, 0x69, 0x6e, 0x20, 0x64, 0x27, 0x75, 0x6e, 0x20, 0x62, 0x65, 0x6c, 0x20, 0x61, 0x70, 0x72, 0xc3, 0xa8, 0x73, 0x2d, 0x6d, 0x69, 0x64, 0x69, 0x20, 0x64, 0x27, 0xc3, 0xa9, 0x74, 0xc3, 0xa9}, f:2} []simhash.Feature{simhash.feature{sum:0x8325c07b4eb2548, weight:1}, simhash.feature{sum:0xd8cbc5186ba13198, weight:1}, simhash.feature{sum:0x15cdbd7eed98cfab, weight:1}, simhash.feature{sum:0xd8d9a1186bad324a, weight:1}, simhash.feature{sum:0x3adb901f8c8a7b5e, weight:1}, simhash.feature{sum:0x7e8f29c36ffb774e, weight:1}}
func (*UnicodeWordFeatureSet) GetFeatures ¶
func (w *UnicodeWordFeatureSet) GetFeatures() []Feature
Returns a []Feature representing each word in the byte slice
type Vector ¶
type Vector [64]int
func Vectorize ¶
Vectorize generates 64 dimension vectors given a set of features. Vectors are initialized to zero. The i-th element of the vector is then incremented by weight of the i-th feature if the i-th bit of the feature is set, and decremented by the weight of the i-th feature otherwise.
func VectorizeBytes ¶
VectorizeBytes generates 64 dimension vectors given a set of [][]byte, where each []byte is a feature with even weight.
Vectors are initialized to zero. The i-th element of the vector is then incremented by weight of the i-th feature if the i-th bit of the feature is set, and decremented by the weight of the i-th feature otherwise.
type WordFeatureSet ¶
type WordFeatureSet struct {
// contains filtered or unexported fields
}
WordFeatureSet is a feature set in which each word is a feature, all equal weight.
func NewWordFeatureSet ¶
func NewWordFeatureSet(b []byte) *WordFeatureSet
Example ¶
text := []byte("a a abc abc test test string.") fs := NewWordFeatureSet(text) fmt.Printf("%#v\n", fs) actual := fs.GetFeatures() fmt.Printf("%#v\n", actual)
Output: &simhash.WordFeatureSet{b:[]uint8{0x61, 0x20, 0x61, 0x20, 0x61, 0x62, 0x63, 0x20, 0x61, 0x62, 0x63, 0x20, 0x74, 0x65, 0x73, 0x74, 0x20, 0x74, 0x65, 0x73, 0x74, 0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x2e}} []simhash.Feature{simhash.feature{sum:0xaf63bd4c8601b7be, weight:1}, simhash.feature{sum:0xaf63bd4c8601b7be, weight:1}, simhash.feature{sum:0xd8dcca186bafadcb, weight:1}, simhash.feature{sum:0xd8dcca186bafadcb, weight:1}, simhash.feature{sum:0x8c093f7e9fccbf69, weight:1}, simhash.feature{sum:0x8c093f7e9fccbf69, weight:1}, simhash.feature{sum:0x9926dcde0a17d48e, weight:1}}
func (*WordFeatureSet) GetFeatures ¶
func (w *WordFeatureSet) GetFeatures() []Feature
Returns a []Feature representing each word in the byte slice