Documentation ¶
Overview ¶
Package gcse is the core supporting library for go-code-search-engine (GCSE). Its exported types and functions are mainly for sub packages. If you want to use some of the functions elsewhere, copy the code into your own project.
Index ¶
- Constants
- Variables
- func AddBiValueAndProcess(aggr bi.AggregateMethod, name string, value int)
- func AppendPackages(pkgs []string) bool
- func AppendTokens(tokens stringsp.Set, text []byte) stringsp.Set
- func AuthorOfPackage(pkg string) string
- func CalcMatchScore(doc *HitInfo, tokenList []string, textIdfs, nameIdfs []float64) float64
- func CalcPackagePartition(pkg string, totalParts int) int
- func CalcStaticScore(doc *HitInfo) float64
- func CalcTestStaticScore(doc *HitInfo, realImported []string) float64
- func CheckCamel(last, current rune) index.RuneType
- func CheckRuneType(last, current rune) index.RuneType
- func ChooseImportantSentenses(text string, name, pkg string) []string
- func ClearWatcherEvents(watcher *fsnotify.Watcher)
- func CrawlRepoInfo(ctx context.Context, site, user, name string) *gpb.RepoInfo
- func FullProjectOfPackage(pkg string) string
- func GenHttpClient(proxy string) doc.HttpClient
- func HostOfPackage(pkg string) string
- func IdOfPerson(site, username string) string
- func Index(docDB mr.Input, outDir string) (*index.TokenSetSearcher, error)
- func IsBadPackage(err error) bool
- func LikeButton(httpClient doc.HttpClient, Url string) (int, error)
- func NewDocInfo() sophie.Sophier
- func NewNewDocAction() sophie.Sophier
- func NormWord(word string) string
- func ParsePersonId(id string) (site, username string)
- func Plusone(httpClient doc.HttpClient, url string) (int, error)
- func ProjectOfPackage(pkg string) string
- func ReadPackages(segm utils.Segment) ([]string, error)
- func ReadmeToText(fn, data string) string
- func SplitSentences(text string) []string
- func TrimPackageName(pkg string) string
- func WaitForWatcherEvents(watcher *fsnotify.Watcher)
- type BlackRequest
- type CrawlerDB
- func (cdb *CrawlerDB) AppendPackage(pkg string, inDocs func(pkg string) bool)
- func (cdb *CrawlerDB) AppendPerson(site, username string) bool
- func (cdb *CrawlerDB) PushToCrawlPackage(pkg string)
- func (cdb *CrawlerDB) SchedulePackage(pkg string, sTime time.Time, etag string) error
- func (cdb *CrawlerDB) SchedulePerson(id string, sTime time.Time) error
- func (cdb *CrawlerDB) Sync() error
- type CrawlingEntry
- type DocDB
- type DocInfo
- type HitInfo
- type MemDB
- func (mdb *MemDB) Count() int
- func (mdb *MemDB) Delete(key string)
- func (mdb *MemDB) Export(root villa.Path, kind string) error
- func (mdb *MemDB) Get(key string, data interface{}) bool
- func (mdb *MemDB) Iterate(output func(key string, val interface{}) error) error
- func (mdb *MemDB) LastModified() time.Time
- func (mdb *MemDB) Load() error
- func (mdb *MemDB) Modified() bool
- func (mdb *MemDB) Put(key string, data interface{})
- func (mdb *MemDB) Sync() error
- type NewDocAction
- type Package
- type PackedDocDB
- type Person
- type TokenIndexer
- func (ti *TokenIndexer) Export(root villa.Path, kind string) error
- func (ti *TokenIndexer) IdsOfToken(token string) []string
- func (ti *TokenIndexer) LastModified() time.Time
- func (ti *TokenIndexer) Load() error
- func (ti *TokenIndexer) Modified() bool
- func (ti *TokenIndexer) Put(id string, tokens stringsp.Set)
- func (ti *TokenIndexer) Sync() error
- func (ti *TokenIndexer) TokensOfId(id string) []string
Constants ¶
const ( // whole document updated NDA_UPDATE = iota // only stars updated NDA_STARS // deleted NDA_DEL // Original document NDA_ORIGINAL )
const ( KindIndex = "index" KindDocDB = "docdb" KindPackage = "package" KindPerson = "person" KindToCheck = "tocheck" IndexFn = KindIndex + ".gob" )
const ( HitsArrFn = "hits" IndexTextField = "text" IndexNameField = "name" IndexPkgField = "pkg" )
const (
/*
Increase this to ignore etag of last versions to crawl and parse all
packages.
ChangeLog:
0 First version
1 Add TestImports/XTestImports to Imports
2 Parse markdown readme to text before selecting synopsis
from it
3 Add exported tokens to indexes
4 Move TestImports/XTestImports out of Imports, to TestImports
4 A bug of checking CrawlerVersion is fixed
*/
CrawlerVersion = 5
)
const (
DOCS_PARTS = 128
)
Variables ¶
var ( ErrPackageNotModifed = errors.New("package not modified") ErrInvalidPackage = errors.New("invalid package") )
var GithubSpider *github.Spider
Functions ¶
func AddBiValueAndProcess ¶
func AppendPackages ¶
AppendPackages appends a list of packages to the imports folder for the crawler backend to read.
func AppendTokens ¶
func AppendTokens(tokens stringsp.Set, text []byte) stringsp.Set
Tokenizes text into the current token set.
func AuthorOfPackage ¶
func CalcMatchScore ¶
func CalcPackagePartition ¶
func CalcStaticScore ¶
func CalcTestStaticScore ¶
func CheckCamel ¶
func CheckCamel(last, current rune) index.RuneType
func CheckRuneType ¶
func CheckRuneType(last, current rune) index.RuneType
func ClearWatcherEvents ¶
func FullProjectOfPackage ¶
func GenHttpClient ¶
func GenHttpClient(proxy string) doc.HttpClient
func HostOfPackage ¶
func IdOfPerson ¶
func IsBadPackage ¶
func LikeButton ¶
func LikeButton(httpClient doc.HttpClient, Url string) (int, error)
func NewDocInfo ¶
Returns a new instance of DocInfo as a sophie.Sophier
func NewNewDocAction ¶
Returns a new instance of *NewDocAction as a Sophier
func ParsePersonId ¶
func ReadmeToText ¶
func SplitSentences ¶
func TrimPackageName ¶
func WaitForWatcherEvents ¶
Types ¶
type BlackRequest ¶
type CrawlerDB ¶
CrawlerDB contains all the crawler entry databases.
func LoadCrawlerDB ¶
func LoadCrawlerDB() *CrawlerDB
LoadCrawlerDB loads PackageDB and PersonDB and returns a new *CrawlerDB
func (*CrawlerDB) AppendPackage ¶
AppendPackage appends a package. If the package did not exist in either PackageDB or Docs, schedule it (immediately).
func (*CrawlerDB) AppendPerson ¶
AppendPerson appends a person to the PersonDB and schedules an immediate crawl for a new person.
func (*CrawlerDB) PushToCrawlPackage ¶
PushToCrawlPackage schedules a package to be crawled at a specific time if not scheduled earlier.
func (*CrawlerDB) SchedulePackage ¶
SchedulePackage schedules a package to be crawled at a specific time.
func (*CrawlerDB) SchedulePerson ¶
SchedulePerson schedules a person to be crawled at a specific time.
type CrawlingEntry ¶
type DocInfo ¶
type DocInfo struct { Name string // Package name Package string // Package path Author string LastUpdated time.Time StarCount int Synopsis string Description string ProjectURL string ReadmeFn string ReadmeData string Imports []string TestImports []string Exported []string // exported tokens(funcs/types) }
DocInfo is the information stored in backend docDB
type HitInfo ¶
type HitInfo struct { DocInfo Imported []string ImportedLen int TestImported []string TestImportedLen int ImportantSentences []string AssignedStarCount float64 StaticScore float64 TestStaticScore float64 StaticRank int // zero-based }
HitInfo is the information provided to frontend
type MemDB ¶
func (*MemDB) Export ¶
Export saves the data to some space, but not affecting the modified property.
func (*MemDB) Get ¶
Get fetches an entry of the specified key. data is a pointer. Returns false if the key does not exist.
func (*MemDB) LastModified ¶
type NewDocAction ¶
* If Action equals NDA_DEL, DocInfo is undefined.
type Package ¶
type Package struct { Package string Name string Synopsis string Doc string ProjectURL string StarCount int ReadmeFn string ReadmeData string Imports []string TestImports []string Exported []string // exported tokens(funcs/types) References []string Etag string }
Package stores information from crawler
func CrawlPackage ¶
func CrawlPackage(ctx context.Context, httpClient doc.HttpClient, pkg string, etag string) (p *Package, folders []*gpb.FolderInfo, err error)
type PackedDocDB ¶
type PackedDocDB struct {
*MemDB
}
func (PackedDocDB) Get ¶
func (db PackedDocDB) Get(key string, data interface{}) bool
func (PackedDocDB) Iterate ¶
func (db PackedDocDB) Iterate( output func(key string, val interface{}) error) error
func (PackedDocDB) Put ¶
func (db PackedDocDB) Put(key string, data interface{})
type Person ¶
func CrawlPerson ¶
type TokenIndexer ¶
type TokenIndexer struct { index.TokenIndexer sync.RWMutex // contains filtered or unexported fields }
TokenIndexer is thread-safe.
func NewTokenIndexer ¶
func NewTokenIndexer(root villa.Path, kind string) *TokenIndexer
func (*TokenIndexer) Export ¶
func (ti *TokenIndexer) Export(root villa.Path, kind string) error
func (*TokenIndexer) IdsOfToken ¶
func (ti *TokenIndexer) IdsOfToken(token string) []string
func (*TokenIndexer) LastModified ¶
func (ti *TokenIndexer) LastModified() time.Time
func (*TokenIndexer) Load ¶
func (ti *TokenIndexer) Load() error
func (*TokenIndexer) Modified ¶
func (ti *TokenIndexer) Modified() bool
func (*TokenIndexer) Put ¶
func (ti *TokenIndexer) Put(id string, tokens stringsp.Set)
func (*TokenIndexer) Sync ¶
func (ti *TokenIndexer) Sync() error
func (*TokenIndexer) TokensOfId ¶
func (ti *TokenIndexer) TokensOfId(id string) []string
Source Files ¶
Directories ¶
Path | Synopsis |
---|---|
Package configs defines and loads all configurations.
|
Package configs defines and loads all configurations. |
pipelines
|
|
crawler
GCSE Crawler background program.
|
GCSE Crawler background program. |
mergedocs
Input FnDocs FnNewDocs
|
Input FnDocs FnNewDocs |
service
|
|
web
GCSE HTTP server.
|
GCSE HTTP server. |
shared
|
|
proto
Package gcsepb is a generated protocol buffer package.
|
Package gcsepb is a generated protocol buffer package. |
Package store handles all the storage in the GCSE backend.
|
Package store handles all the storage in the GCSE backend. |
tools
|
|