go_tf_idf

package module
v0.0.0-...-dad26aa Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Apr 22, 2022 License: MIT Imports: 5 Imported by: 1

README

go-tf-idf

Coverage Status

A small Go implementation of tf-idf (term frequency–inverse document frequency) with support for comparing documents using cosine similarity.

Usage

Install with `go get -u github.com/dkgv/go-tf-idf`.

package main

import (
    "fmt"
    go_tf_idf "github.com/dkgv/go-tf-idf"
)

// main demonstrates the basic workflow: build a tf-idf container, score a
// single term in one document, and compare two documents.
func main() {
    // Initialize a tf-idf container. The comparator is registered here via
    // WithComparator because Compare only accepts the two documents.
    doc1 := "this is a document"
    doc2 := "and this is another document"
    tfidf := go_tf_idf.New(
        go_tf_idf.WithDocuments([]string{doc1, doc2}),
        go_tf_idf.WithDefaultStopWords(),
        go_tf_idf.WithComparator(go_tf_idf.CosineComparator),
    )

    // Calculate tf-idf for a term within one document.
    term := "document"
    res1 := tfidf.TermFrequencyInverseDocumentFrequencyForTerm(term, doc1)
    fmt.Printf("res1 %f\n", res1)

    // Compare the two documents using the configured comparator.
    similarity, err := tfidf.Compare(doc1, doc2)
    if err != nil {
        // The similarity value is meaningless on error, so stop here.
        fmt.Printf("compare failed: %v\n", err)
        return
    }

    fmt.Printf("similarity %f\n", similarity)
}

Documentation

Index

Constants

This section is empty.

Variables

View Source
// DefaultList is a package-level set of words keyed by string; this listing
// elides its 175 entries.
// NOTE(review): presumably the stop words installed by WithDefaultStopWords — confirm.
var DefaultList = map[string]bool{}/* 175 elements not displayed */

Functions

func CosineComparator

func CosineComparator(vec1, vec2 []float64) float64

func Tokenize

func Tokenize(s string) []string

Types

type Comparator

// Comparator is a function that takes two float64 vectors and returns a
// single float64 score. CosineComparator has this signature, and a Comparator
// can be supplied to a TfIdf through WithComparator.
type Comparator func(vector1, vector2 []float64) float64

type Document

// Document holds the token data kept for a single document.
// NOTE(review): the field descriptions below are inferred from names and
// types only; confirm against the implementation.
type Document struct {
	// AllTokens is presumably every token of the document, duplicates
	// included — TODO confirm.
	AllTokens    []string
	// TermCount presumably maps each token to its number of occurrences in
	// the document — TODO confirm.
	TermCount    map[string]int
	// UniqueTokens is presumably the de-duplicated token list — TODO confirm.
	UniqueTokens []string
}

func (Document) GetVectors

func (d Document) GetVectors(other Document) ([]float64, []float64)

func (Document) TermFrequency

func (d Document) TermFrequency(term string) float64

type Filter

// Filter is a predicate over a single string. Filters are registered on a
// StopWords value with AddIgnoreFilter.
// NOTE(review): presumably a true result means the word is ignored — confirm.
type Filter func(string) bool

type Option

// Option is a function that receives a *TfIdf. Options are passed to New and
// are produced by the With* constructors (WithComparator,
// WithDefaultStopWords, WithDocuments, WithStopWords).
type Option func(idf *TfIdf)

func WithComparator

func WithComparator(comparator Comparator) Option

func WithDefaultStopWords

func WithDefaultStopWords() Option

func WithDocuments

func WithDocuments(documents []string) Option

func WithStopWords

func WithStopWords(stopWords []string) Option

type StopWords

// StopWords bundles a word set with a list of Filter predicates. Words are
// added with AddWord/AddWords, filters with AddIgnoreFilter, and a word is
// tested with Matches.
type StopWords struct {
	// List is keyed by word.
	// NOTE(review): presumably a true value marks a stop word — confirm.
	List    map[string]bool
	// Filters holds the registered Filter predicates.
	// NOTE(review): presumably consulted by Matches alongside List — confirm.
	Filters []Filter
}

func NewEmptyStopWords

func NewEmptyStopWords() *StopWords

func (*StopWords) AddIgnoreFilter

func (w *StopWords) AddIgnoreFilter(filter Filter)

func (*StopWords) AddWord

func (w *StopWords) AddWord(word string)

func (*StopWords) AddWords

func (w *StopWords) AddWords(words []string)

func (*StopWords) Matches

func (w *StopWords) Matches(word string) bool

type TfIdf

// TfIdf is the container returned by New and DefaultOptions. Its methods add
// documents, compute tf-idf values, and compare two documents.
type TfIdf struct {
	// Documents maps a string key to its Document.
	// NOTE(review): presumably keyed by the raw document text passed to
	// AddDocument/GetDocument — confirm.
	Documents map[string]Document
	// StopWords is the stop-word configuration attached to this container.
	StopWords *StopWords
	// contains filtered or unexported fields
}

func DefaultOptions

func DefaultOptions() *TfIdf

func New

func New(opts ...Option) *TfIdf

func (TfIdf) AddDocument

func (i TfIdf) AddDocument(document string)

func (TfIdf) Compare

func (i TfIdf) Compare(document1, document2 string) (float64, error)

func (TfIdf) GetDocument

func (i TfIdf) GetDocument(document string) *Document

func (TfIdf) InverseDocumentFrequency

func (i TfIdf) InverseDocumentFrequency(term string) float64

func (TfIdf) TermFrequencyInverseDocumentFrequencyForDocument

func (i TfIdf) TermFrequencyInverseDocumentFrequencyForDocument(document string) []float64

func (TfIdf) TermFrequencyInverseDocumentFrequencyForTerm

func (i TfIdf) TermFrequencyInverseDocumentFrequencyForTerm(term string, document string) float64

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL