gcse

README

Go Search

A keyword search engine that helps people find popular and relevant Go packages.

Online service: Go Search

This is the root package with shared functions.

Subpackages are commands for running each part of the pipeline:

  • HTTP Server: searching and web service.
  • ToCrawl: finding packages to crawl.
  • Crawler: crawling package files.
  • MergeDocs: merging crawled package files with the doc DB.
  • Indexer: analyzing package information and generating indexed data for searching.

Development

You'll need to perform the following steps to get a basic server running:

  1. Create a basic conf.json file, limiting the crawler to a one-minute run (the file is written out below): { "crawler": { "due_per_run": "1m" } }
  2. Run the package finder: go run tocrawl/*.go
  3. Run the crawler: go run crawler/*.go
  4. Merge the crawled docs: go run mergedocs/*.go
  5. Run the indexer: go run indexer/*.go
  6. Run the server: go run server/*.go
  7. Visit http://localhost:8080 in your browser
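
For reference, the conf.json from step 1 written out as a file:

{
  "crawler": {
    "due_per_run": "1m"
  }
}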

LICENSE

BSD license.

Documentation

Overview

Package gcse is the core supporting library for go-code-search-engine (GCSE). Its exported types and functions are mainly for use by its subpackages. If you want some of this functionality elsewhere, copy the code out.

Sub-projects

crawler: crawling packages

indexer: creating index data for the web server

server: providing web services, including the home/top/search services

Data-flows

project   Read            Write
-------   ----            -----
crawler   fnCrawlerDB     fnCrawlerDB
          fnDocDB         fnDocDB
                          DBOutSegments

indexer   DBOutSegments   IndexSegments

server    IndexSegments

Index

Constants

View Source
const (
	KindIndex = "index"
	IndexFn   = KindIndex + ".gob"

	KindDocDB = "docdb"

	FnCrawlerDB = "crawler"
	KindPackage = "package"
	KindPerson  = "person"
	KindToCheck = "tocheck"

	FnToCrawl = "tocrawl"
	FnPackage = "package"
	FnPerson  = "person"
	// key: RawString, value: DocInfo
	FnDocs    = "docs"
	FnNewDocs = "newdocs"
)
View Source
const (
	// whole document updated
	NDA_UPDATE = iota
	// only stars updated
	NDA_STARS
	// deleted
	NDA_DEL
	// Original document
	NDA_ORIGINAL
)
View Source
const (
	IndexTextField = "text"
	IndexNameField = "name"
	IndexPkgField  = "pkg"
)
View Source
const (
	DOCS_PARTS = 128
)
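
A minimal sketch, assuming the github.com/daviddengcn/gcse import path and assuming DOCS_PARTS is the intended totalParts argument for CalcPackagePartition (see Functions below), of computing the document shard for a package:

package main

import (
	"fmt"

	"github.com/daviddengcn/gcse"
)

func main() {
	// Assumption: DOCS_PARTS is the shard count used for the docs DB.
	part := gcse.CalcPackagePartition("github.com/daviddengcn/gcse", gcse.DOCS_PARTS)
	fmt.Printf("partition %d of %d\n", part, gcse.DOCS_PARTS)
}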

Variables

View Source
var (
	ServerAddr = ":8080"
	ServerRoot = villa.Path("./server/")

	LoadTemplatePass = ""
	AutoLoadTemplate = false

	DataRoot      = villa.Path("./data/")
	CrawlerDBPath = DataRoot.Join(FnCrawlerDB)
	DocsDBPath    = DataRoot.Join(FnDocs)

	// producer: server, consumer: crawler
	ImportPath     villa.Path
	ImportSegments Segments

	// producer: crawler, consumer: indexer
	DBOutPath     villa.Path
	DBOutSegments Segments

	// producer: indexer, consumer: server.
	// The server never deletes index segments; the indexer clears updated segments.
	IndexPath     villa.Path
	IndexSegments Segments

	// crawler configuration
	CrawlByGodocApi           = true
	CrawlGithubUpdate         = true
	CrawlerDuePerRun          = 1 * time.Hour
	CrawlerGithubClientID     = ""
	CrawlerGithubClientSecret = ""

	/*
		Increase this to ignore the etags of last versions and to re-crawl
		and re-parse all packages.

		ChangeLog:
		    0    First version
		    1    Add TestImports/XTestImports to Imports
		    2    Parse markdown readme to text before selecting synopsis
		         from it
		    3    Add exported tokens to indexes
		    4    Move TestImports/XTestImports out of Imports, to TestImports
		    5    A bug of checking CrawlerVersion is fixed
	*/
	CrawlerVersion = 5

	NonCrawlHosts          = stringsp.Set{}
	NonStorePackageRegexps = []string{}
)
View Source
var (
	ErrPackageNotModifed = errors.New("package not modified")
	ErrInvalidPackage    = errors.New("invalid package")
)

Functions

func AppendPackages

func AppendPackages(pkgs []string) bool

AppendPackages appends a list of packages to the imports folder for the crawler backend to read.

func AppendTokens

func AppendTokens(tokens stringsp.Set, text []byte) stringsp.Set

func AuthorOfPackage

func AuthorOfPackage(pkg string) string

func CalcMatchScore

func CalcMatchScore(doc *HitInfo, tokenList []string,
	textIdfs, nameIdfs []float64) float64

func CalcPackagePartition

func CalcPackagePartition(pkg string, totalParts int) int

func CalcStaticScore

func CalcStaticScore(doc *HitInfo) float64

func CalcTestStaticScore

func CalcTestStaticScore(doc *HitInfo, realImported []string) float64

func CheckCamel

func CheckCamel(last, current rune) index.RuneType

func CheckRuneType

func CheckRuneType(last, current rune) index.RuneType

func ChooseImportantSentenses

func ChooseImportantSentenses(text string, name, pkg string) []string

func ClearWatcherEvents

func ClearWatcherEvents(watcher *fsnotify.Watcher)

func DumpMemStats

func DumpMemStats()

func FetchAllPackagesInGodoc

func FetchAllPackagesInGodoc(httpClient doc.HttpClient) ([]string, error)

FetchAllPackagesInGodoc fetches the list of all packages on godoc.org
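
A minimal sketch, assuming the github.com/daviddengcn/gcse import path and that an empty proxy string yields a default HTTP client, of fetching the godoc.org package list and handing it to the crawler backend via AppendPackages:

package main

import (
	"log"

	"github.com/daviddengcn/gcse"
)

func main() {
	// Assumption: an empty proxy string means "no proxy".
	client := gcse.GenHttpClient("")
	pkgs, err := gcse.FetchAllPackagesInGodoc(client)
	if err != nil {
		log.Fatal(err)
	}
	// AppendPackages writes the list to the imports folder for the crawler.
	if !gcse.AppendPackages(pkgs) {
		log.Println("AppendPackages failed")
	}
}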

func FullProjectOfPackage

func FullProjectOfPackage(pkg string) string

func GenHttpClient

func GenHttpClient(proxy string) doc.HttpClient

func GithubUpdates

func GithubUpdates() (map[string]time.Time, error)

func HostOfPackage

func HostOfPackage(pkg string) string

func IdOfPerson

func IdOfPerson(site, username string) string

func Index

func Index(docDB mr.Input) (*index.TokenSetSearcher, error)

func IsBadPackage

func IsBadPackage(err error) bool

func LikeButton

func LikeButton(httpClient doc.HttpClient, Url string) (int, error)

func NewDocInfo

func NewDocInfo() sophie.Sophier

NewDocInfo returns a new instance of DocInfo as a sophie.Sophier.

func NewNewDocAction

func NewNewDocAction() sophie.Sophier

NewNewDocAction returns a new instance of *NewDocAction as a sophie.Sophier.

func NormWord

func NormWord(word string) string

func ParsePersonId

func ParsePersonId(id string) (site, username string)
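
A small sketch, assuming the github.com/daviddengcn/gcse import path, showing how IdOfPerson and ParsePersonId round-trip; the site and username values are illustrative, not confirmed formats:

package main

import (
	"fmt"

	"github.com/daviddengcn/gcse"
)

func main() {
	// "github.com"/"daviddengcn" are hypothetical example values.
	id := gcse.IdOfPerson("github.com", "daviddengcn")
	site, username := gcse.ParsePersonId(id)
	fmt.Println(id, site, username)
}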

func Plusone

func Plusone(httpClient doc.HttpClient, url string) (int, error)

func ProjectOfPackage

func ProjectOfPackage(pkg string) string

ProjectOfPackage returns the core project of a package.

func ReadJsonFile

func ReadJsonFile(fn villa.Path, data interface{}) error

func ReadPackages

func ReadPackages(segm Segment) (pkgs []string, err error)

func ReadmeToText

func ReadmeToText(fn, data string) string

func SegmentLess

func SegmentLess(a, b Segment) bool

func SplitSentences

func SplitSentences(text string) []string

func TrimPackageName

func TrimPackageName(pkg string) string

func WaitForWatcherEvents

func WaitForWatcherEvents(watcher *fsnotify.Watcher)

func WriteJsonFile

func WriteJsonFile(fn villa.Path, data interface{}) error
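
A minimal sketch, assuming the github.com/daviddengcn/gcse and github.com/daviddengcn/go-villa import paths, of round-tripping a config struct with WriteJsonFile and ReadJsonFile; the conf struct here is hypothetical:

package main

import (
	"log"

	"github.com/daviddengcn/gcse"
	"github.com/daviddengcn/go-villa"
)

// conf is a hypothetical config struct for illustration.
type conf struct {
	DuePerRun string `json:"due_per_run"`
}

func main() {
	fn := villa.Path("conf.json")
	if err := gcse.WriteJsonFile(fn, conf{DuePerRun: "1m"}); err != nil {
		log.Fatal(err)
	}
	var c conf
	if err := gcse.ReadJsonFile(fn, &c); err != nil {
		log.Fatal(err)
	}
	log.Println(c.DuePerRun)
}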

Types

type BlackRequest

type BlackRequest struct {
	sync.RWMutex
	// contains filtered or unexported fields
}

func (*BlackRequest) Do

func (br *BlackRequest) Do(req *http.Request) (*http.Response, error)

type CrawlerDB

type CrawlerDB struct {
	PackageDB *MemDB
	PersonDB  *MemDB
}

CrawlerDB contains all the crawler entry databases.

func LoadCrawlerDB

func LoadCrawlerDB() *CrawlerDB

LoadCrawlerDB loads PackageDB and PersonDB and returns a new *CrawlerDB

func (*CrawlerDB) AppendPackage

func (cdb *CrawlerDB) AppendPackage(pkg string,
	inDocs func(pkg string) bool)

AppendPackage appends a package. If the package does not exist in either PackageDB or Docs, it is scheduled to be crawled immediately.

func (*CrawlerDB) AppendPerson

func (cdb *CrawlerDB) AppendPerson(site, username string) bool

AppendPerson appends a person to the PersonDB, scheduling a new person to be crawled immediately.

func (*CrawlerDB) SchedulePackage

func (cdb *CrawlerDB) SchedulePackage(pkg string, sTime time.Time,
	etag string) error

SchedulePackage schedules a package to be crawled at a specific time.

func (*CrawlerDB) SchedulePerson

func (cdb *CrawlerDB) SchedulePerson(id string, sTime time.Time) error

SchedulePerson schedules a person to be crawled at a specific time.

func (*CrawlerDB) Sync

func (cdb *CrawlerDB) Sync() error

Sync syncs both PackageDB and PersonDB. Returns an error if either sync fails.
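
A minimal sketch, assuming the github.com/daviddengcn/gcse import path and the default CrawlerDBPath layout, of loading the crawler databases, scheduling one package for an immediate crawl, and persisting the change:

package main

import (
	"log"
	"time"

	"github.com/daviddengcn/gcse"
)

func main() {
	cdb := gcse.LoadCrawlerDB()
	// An empty etag forces the package to be fetched on the next run.
	if err := cdb.SchedulePackage("github.com/daviddengcn/gcse", time.Now(), ""); err != nil {
		log.Fatal(err)
	}
	if err := cdb.Sync(); err != nil {
		log.Fatal(err)
	}
}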

type CrawlingEntry

type CrawlingEntry struct {
	ScheduleTime time.Time
	// if gcse.CrawlerVersion is different from this value, etag is ignored
	Version int
	Etag    string
}

func (*CrawlingEntry) ReadFrom

func (c *CrawlingEntry) ReadFrom(r sophie.Reader, l int) error

func (*CrawlingEntry) WriteTo

func (c *CrawlingEntry) WriteTo(w sophie.Writer) error

type DocDB

type DocDB interface {
	Sync() error
	Export(root villa.Path, kind string) error

	Get(key string, data interface{}) bool
	Put(key string, data interface{})
	Delete(key string)
	Iterate(output func(key string, val interface{}) error) error
}

type DocInfo

type DocInfo struct {
	Name        string
	Package     string
	Author      string
	LastUpdated time.Time
	StarCount   int
	Synopsis    string
	Description string
	ProjectURL  string
	ReadmeFn    string
	ReadmeData  string
	Imports     []string
	TestImports []string
	Exported    []string // exported tokens(funcs/types)
}

DocInfo is the information stored in backend docDB

func (*DocInfo) ReadFrom

func (d *DocInfo) ReadFrom(r sophie.Reader, l int) error

func (*DocInfo) WriteTo

func (d *DocInfo) WriteTo(w sophie.Writer) error

type HitInfo

type HitInfo struct {
	DocInfo

	Imported           []string
	TestImported       []string
	ImportantSentences []string

	AssignedStarCount float64
	StaticScore       float64
	TestStaticScore   float64
	StaticRank        int // zero-based
}

HitInfo is the information provided to frontend

type MemDB

type MemDB struct {
	sync.RWMutex
	// contains filtered or unexported fields
}

func NewMemDB

func NewMemDB(root villa.Path, kind string) *MemDB

func (*MemDB) Count

func (mdb *MemDB) Count() int

Count returns the number of entries in the DB

func (*MemDB) Delete

func (mdb *MemDB) Delete(key string)

func (*MemDB) Export

func (mdb *MemDB) Export(root villa.Path, kind string) error

Export saves the data to the given location without affecting the modified property.

func (*MemDB) Get

func (mdb *MemDB) Get(key string, data interface{}) bool

Get fetches the entry for the specified key into data, which must be a pointer. Returns false if the key does not exist.

func (*MemDB) Iterate

func (mdb *MemDB) Iterate(output func(key string, val interface{}) error) error

func (*MemDB) LastModified

func (mdb *MemDB) LastModified() time.Time

func (*MemDB) Load

func (mdb *MemDB) Load() error

func (*MemDB) Modified

func (mdb *MemDB) Modified() bool

func (*MemDB) Put

func (mdb *MemDB) Put(key string, data interface{})

func (*MemDB) Sync

func (mdb *MemDB) Sync() error
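
A minimal sketch, assuming the github.com/daviddengcn/gcse and github.com/daviddengcn/go-villa import paths, of basic MemDB usage with a DocInfo value:

package main

import (
	"fmt"

	"github.com/daviddengcn/gcse"
	"github.com/daviddengcn/go-villa"
)

func main() {
	db := gcse.NewMemDB(villa.Path("./data/"), gcse.KindDocDB)
	db.Put("github.com/daviddengcn/gcse", gcse.DocInfo{Name: "gcse"})

	var di gcse.DocInfo
	if db.Get("github.com/daviddengcn/gcse", &di) {
		fmt.Println(di.Name, db.Count())
	}
	// Sync persists pending changes to disk.
	if err := db.Sync(); err != nil {
		fmt.Println(err)
	}
}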

type NewDocAction

type NewDocAction struct {
	Action sophie.VInt
	DocInfo
}

If Action equals NDA_DEL, DocInfo is undefined.
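
A small sketch, assuming the github.com/daviddengcn/gcse import path, of constructing update and delete actions:

package main

import (
	"fmt"

	"github.com/daviddengcn/gcse"
)

func main() {
	di := gcse.DocInfo{Package: "github.com/daviddengcn/gcse"}
	upd := gcse.NewDocAction{Action: gcse.NDA_UPDATE, DocInfo: di}
	del := gcse.NewDocAction{Action: gcse.NDA_DEL} // DocInfo is undefined for deletes
	fmt.Println(upd.Action, del.Action)
}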

func (*NewDocAction) ReadFrom

func (nda *NewDocAction) ReadFrom(r sophie.Reader, l int) error

func (*NewDocAction) WriteTo

func (nda *NewDocAction) WriteTo(w sophie.Writer) error

type Package

type Package struct {
	Package     string
	Name        string
	Synopsis    string
	Doc         string
	ProjectURL  string
	StarCount   int
	ReadmeFn    string
	ReadmeData  string
	Imports     []string
	TestImports []string
	Exported    []string // exported tokens(funcs/types)

	References []string
	Etag       string
}

Package stores the information gathered by the crawler.

func CrawlPackage

func CrawlPackage(httpClient doc.HttpClient, pkg string,
	etag string) (p *Package, err error)
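
A minimal sketch, assuming the github.com/daviddengcn/gcse import path, of crawling one package and distinguishing the not-modified and bad-package cases (the etag here is empty, as on a first crawl):

package main

import (
	"log"

	"github.com/daviddengcn/gcse"
)

func main() {
	client := gcse.GenHttpClient("")
	p, err := gcse.CrawlPackage(client, "github.com/daviddengcn/gcse", "")
	switch {
	case err == gcse.ErrPackageNotModifed:
		log.Println("etag matched; nothing to do")
	case gcse.IsBadPackage(err):
		log.Println("package should be dropped:", err)
	case err != nil:
		log.Fatal(err)
	default:
		log.Println("crawled", p.Package, "etag", p.Etag)
	}
}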

type PackedDocDB

type PackedDocDB struct {
	*MemDB
}

func (PackedDocDB) Get

func (db PackedDocDB) Get(key string, data interface{}) bool

func (PackedDocDB) Iterate

func (db PackedDocDB) Iterate(
	output func(key string, val interface{}) error) error

func (PackedDocDB) Put

func (db PackedDocDB) Put(key string, data interface{})

type Person

type Person struct {
	Id       string
	Packages []string
}

func CrawlPerson

func CrawlPerson(httpClient doc.HttpClient, id string) (*Person, error)

type Segment

type Segment interface {
	Name() string
	Join(name string) villa.Path
	IsDone() bool
	Done() error
	ListFiles() ([]villa.Path, error)
	Remove() error
}

type Segments

type Segments interface {
	Watch(watcher *fsnotify.Watcher) error
	ListAll() ([]Segment, error)
	// lists all done segments
	ListDones() ([]Segment, error)
	// finds the greatest done segment
	FindMaxDone() (Segment, error)
	// generates an arbitrary new segment
	GenNewSegment() (Segment, error)
	// generates a segment greater than all existing ones
	GenMaxSegment() (Segment, error)
	// clears segments that are not done
	ClearUndones() error
}
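
A sketch, assuming the github.com/daviddengcn/gcse import path and that DBOutSegments has been initialized by the calling command's configuration, of the segment lifecycle:

package main

import (
	"log"

	"github.com/daviddengcn/gcse"
)

func main() {
	// Assumption: DBOutSegments is set up before GenNewSegment is called.
	seg, err := gcse.DBOutSegments.GenNewSegment()
	if err != nil {
		log.Fatal(err)
	}
	log.Println("writing into", seg.Join("part-00000")) // "part-00000" is illustrative
	// ... write output files into the segment's directory ...
	if err := seg.Done(); err != nil { // mark the segment as complete
		log.Fatal(err)
	}
}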

type Size

type Size int64

func (Size) String

func (s Size) String() string

type TokenIndexer

type TokenIndexer struct {
	index.TokenIndexer

	sync.RWMutex
	// contains filtered or unexported fields
}

TokenIndexer is thread-safe.

func NewTokenIndexer

func NewTokenIndexer(root villa.Path, kind string) *TokenIndexer

func (*TokenIndexer) Export

func (ti *TokenIndexer) Export(root villa.Path, kind string) error

func (*TokenIndexer) IdsOfToken

func (ti *TokenIndexer) IdsOfToken(token string) []string

func (*TokenIndexer) LastModified

func (ti *TokenIndexer) LastModified() time.Time

func (*TokenIndexer) Load

func (ti *TokenIndexer) Load() error

func (*TokenIndexer) Modified

func (ti *TokenIndexer) Modified() bool

func (*TokenIndexer) Put

func (ti *TokenIndexer) Put(id string, tokens stringsp.Set)

func (*TokenIndexer) Sync

func (ti *TokenIndexer) Sync() error

func (*TokenIndexer) TokensOfId

func (ti *TokenIndexer) TokensOfId(id string) []string
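
A minimal sketch, assuming the github.com/daviddengcn/gcse and github.com/daviddengcn/go-villa import paths and that AppendTokens, like append, may be seeded with a nil set, of indexing tokens for an ID:

package main

import (
	"fmt"

	"github.com/daviddengcn/gcse"
	"github.com/daviddengcn/go-villa"
)

func main() {
	ti := gcse.NewTokenIndexer(villa.Path("./data/"), gcse.KindIndex)
	// Assumption: AppendTokens accepts a nil set and returns the grown set.
	tokens := gcse.AppendTokens(nil, []byte("json parser"))
	ti.Put("github.com/example/jsonx", tokens) // hypothetical package ID
	fmt.Println(ti.IdsOfToken("json"))
	fmt.Println(ti.TokensOfId("github.com/example/jsonx"))
}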

Directories

Path Synopsis
GCSE Crawler background program.
GCSE HTTP server.
