Documentation
¶
Overview ¶
simhashUTF -- simhash language-specific handling for UTF.
This package refactors the Unicode handling code out of the original (v1) design into this thin language-handling layer, which showcases how easy it is to extend simhash's language-specific handling functionality.
Such a modular approach (v2 design) helps to reduce and limit the size of the core code, while also making it easy to extend the core functionality.
Example (Output) ¶
For a standalone test, change the package to `main` and the next func definition to `func main() {`.
// package main package main import ( "fmt" "github.com/go-dedup/simhash" "github.com/go-dedup/simhash/simhashUTF" "golang.org/x/text/unicode/norm" ) // for standalone test, change package to `main` and the next func def to, // func main() { func main() { hashes := make([]uint64, len(docs)) sh := simhashUTF.NewUTFSimhash(norm.NFKC) for i, d := range docs { hashes[i] = sh.GetSimhash(sh.NewWordFeatureSet(d)) fmt.Printf("Simhash of '%s': %x\n", d, hashes[i]) } fmt.Printf("Comparison of `%s` and `%s`: %d\n", docs[0], docs[1], simhash.Compare(hashes[0], hashes[1])) fmt.Printf("Comparison of `%s` and `%s`: %d\n", docs[0], docs[2], simhash.Compare(hashes[0], hashes[2])) fmt.Printf("Comparison of `%s` and `%s`: %d\n", docs[0], docs[3], simhash.Compare(hashes[0], hashes[3])) } var docs = [][]byte{ []byte("la fin d'un bel après-midi d'été"), []byte("bonne après-midi"), []byte("Bonjour"), []byte("Bonsoir"), }
Output: Simhash of 'la fin d'un bel après-midi d'été': 58dbbd1fefab774a Simhash of 'bonne après-midi': fadfbfbfdf8e7b7f Simhash of 'Bonjour': ac5261af4fdd5252 Simhash of 'Bonsoir': fb42ceaf7cda4905 Comparison of `la fin d'un bel après-midi d'été` and `bonne après-midi`: 18 Comparison of `la fin d'un bel après-midi d'été` and `Bonjour`: 28 Comparison of `la fin d'un bel après-midi d'été` and `Bonsoir`: 34
Index ¶
Examples ¶
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type SimhashUTF ¶
type SimhashUTF struct { simhash.SimhashBase // contains filtered or unexported fields }
func NewUTFSimhash ¶
func NewUTFSimhash(_f norm.Form) *SimhashUTF
NewUTFSimhash makes a new SimhashUTF.
func (*SimhashUTF) NewUnicodeWordFeatureSet ¶
func (st *SimhashUTF) NewUnicodeWordFeatureSet(b []byte, f norm.Form) *UnicodeWordFeatureSet
Example (InChinese) ¶
sh := NewSimhash() text := []byte("当山峰没有棱角的时候") fs := sh.NewUnicodeWordFeatureSet(text, norm.NFKC) fmt.Printf("%#v\n", fs) actual := fs.GetFeatures() fmt.Printf("%#v\n", actual)
Output: &simhashUTF.UnicodeWordFeatureSet{WordFeatureSet:simhash.WordFeatureSet{B:[]uint8{0xe5, 0xbd, 0x93, 0xe5, 0xb1, 0xb1, 0xe5, 0xb3, 0xb0, 0xe6, 0xb2, 0xa1, 0xe6, 0x9c, 0x89, 0xe6, 0xa3, 0xb1, 0xe8, 0xa7, 0x92, 0xe7, 0x9a, 0x84, 0xe6, 0x97, 0xb6, 0xe5, 0x80, 0x99}}, f:2} []simhash.Feature{simhash.feature{sum:0xa5edea16c0c7a180, weight:1}}
Example (InWestern) ¶
sh := NewSimhash() text := []byte("la fin d'un bel après-midi d'été") fs := sh.NewUnicodeWordFeatureSet(text, norm.NFKC) fmt.Printf("%#v\n", fs) actual := fs.GetFeatures() fmt.Printf("%#v\n", actual)
Output: &simhashUTF.UnicodeWordFeatureSet{WordFeatureSet:simhash.WordFeatureSet{B:[]uint8{0x6c, 0x61, 0x20, 0x66, 0x69, 0x6e, 0x20, 0x64, 0x27, 0x75, 0x6e, 0x20, 0x62, 0x65, 0x6c, 0x20, 0x61, 0x70, 0x72, 0xc3, 0xa8, 0x73, 0x2d, 0x6d, 0x69, 0x64, 0x69, 0x20, 0x64, 0x27, 0xc3, 0xa9, 0x74, 0xc3, 0xa9}}, f:2} []simhash.Feature{simhash.feature{sum:0x8325c07b4eb2548, weight:1}, simhash.feature{sum:0xd8cbc5186ba13198, weight:1}, simhash.feature{sum:0x15cdbd7eed98cfab, weight:1}, simhash.feature{sum:0xd8d9a1186bad324a, weight:1}, simhash.feature{sum:0x3adb901f8c8a7b5e, weight:1}, simhash.feature{sum:0x7e8f29c36ffb774e, weight:1}}
func (*SimhashUTF) NewWordFeatureSet ¶
func (st *SimhashUTF) NewWordFeatureSet(b []byte) *UnicodeWordFeatureSet
type UnicodeWordFeatureSet ¶
type UnicodeWordFeatureSet struct { simhash.WordFeatureSet // contains filtered or unexported fields }
UnicodeWordFeatureSet is a feature set in which each word is a feature, all of equal weight.
See: http://blog.golang.org/normalization See: https://groups.google.com/forum/#!topic/golang-nuts/YyH1f_qCZVc
func (*UnicodeWordFeatureSet) GetFeatures ¶
func (w *UnicodeWordFeatureSet) GetFeatures() []simhash.Feature
GetFeatures returns a []Feature representing each word in the byte slice.
func (*UnicodeWordFeatureSet) Normalize ¶
func (w *UnicodeWordFeatureSet) Normalize()