spider

package
v1.2.2 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Aug 25, 2021 | License: MIT | Imports: 21 | Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func Crawler

func Crawler(running *int32, group *sync.WaitGroup, jbd *JobDB, dbd *DoneDB, config Config, crawl <-chan *URLInfo, parse chan<- *PageInfo,
	jobsCrawlerTotal *int32, jobsCrawlerTotalFail *int32, crawlfunc string, crawlTimeout int, crawlRetry int, stat *Stat, ctx *Content)

func GetChromeWSEndpoint

func GetChromeWSEndpoint() string

func GetSize

func GetSize(db *DB) int

func Ini

func Ini()

func InsertSpider

func InsertSpider(db *DB, title string, name string, url string)

func Parser

func Parser(running *int32, group *sync.WaitGroup, jbd *JobDB, dbd *DoneDB, config Config, crawl chan<- *URLInfo, parse <-chan *PageInfo, save chan<- interface{},
	hosturl string, stat *Stat, ctx *Content)

func Saver

func Saver(running *int32, group *sync.WaitGroup, save <-chan interface{}, stat *Stat, ctx *Content)

func Start

func Start(ctx *Content, config Config, url string, stat *Stat)

Types

type Config

// Config is the settings value that Start, Crawler and Parser each accept
// by value as their config parameter. Only Crawlfunc carries an upstream
// comment; the notes on the other fields are inferred from names and must
// be confirmed against the implementation.
type Config struct {
	// NOTE(review): presumably the number of concurrent workers — confirm.
	Threadnum    int
	// NOTE(review): presumably the capacity of the crawl/parse/save channels — confirm.
	Buffersize   int
	// NOTE(review): presumably a depth limit compared against URLInfo.Deps — confirm.
	Deps         int
	// NOTE(review): meaning not visible here — confirm.
	FocusSpider  bool
	Crawlfunc    string // simple,puppeteer
	// NOTE(review): time unit not visible here; Crawler also takes a separate
	// crawlTimeout int parameter — confirm how the two relate.
	CrawlTimeout int
	// NOTE(review): Crawler also takes a separate crawlRetry int parameter —
	// confirm how the two relate.
	CrawlRetry   int
}

type Content

// Content bundles two connection settings with three caller-supplied
// callbacks. Start, Crawler, Parser and Saver each take a *Content as their
// ctx parameter.
type Content struct {
	// Dsn and Conn have the same names and types as the first two parameters
	// of Load(dsn string, conn int, ...).
	// NOTE(review): presumably they are forwarded to Load — confirm.
	Dsn   string
	Conn  int
	// Crawl receives a *PageInfo together with a parsed goquery document and
	// returns a *PageInfo.
	Crawl func(pg *PageInfo, doc *goquery.Document) *PageInfo
	// Parse receives the host URL, a *PageInfo and a send-only channel of
	// interface{} values, and returns a bool.
	// NOTE(review): the meaning of the bool result is not visible here — confirm.
	Parse func(hosturl string, pg *PageInfo, save chan<- interface{}) bool
	// Save receives one interface{} value, the same element type as the save
	// channel taken by Parser and Saver.
	Save  func(result interface{})
}

type DB

type DB struct {
	// contains filtered or unexported fields
}

func Load

func Load(dsn string, conn int, name string, expireday int) *DB

type DBLinkInfo

// DBLinkInfo holds four string fields describing a link: Host, Title, Name
// and Url.
// NOTE(review): no exported function signature in this package listing takes
// or returns this type; its fields overlap with FindData plus Host — confirm
// where it is used.
type DBLinkInfo struct {
	Host  string
	Title string
	Name  string
	Url   string
}

type DoneDB

type DoneDB struct {
	// contains filtered or unexported fields
}

type FindData

// FindData is the element type of the slices returned by Find, Last and
// Select.
// NOTE(review): the fields parallel the title, name and url parameters of
// InsertSpider; presumably each FindData is one stored row — confirm.
type FindData struct {
	Title string
	Name  string
	URL   string
}

func Find

func Find(db *DB, str string, max int) []FindData

func Last

func Last(db *DB, n int) []FindData

func Select

func Select(db *DB, offset int, count int) []FindData

type JobDB

type JobDB struct {
	// contains filtered or unexported fields
}

type LoopSpider

type LoopSpider struct {
	Thread int
	Buffer int
	Cur    string
	// contains filtered or unexported fields
}

func NewLoopSpider

func NewLoopSpider(lss LoopSpiderSlot) *LoopSpider

func (*LoopSpider) GetLoopSpiderStatus

func (ls *LoopSpider) GetLoopSpiderStatus() LoopSpiderStatus

type LoopSpiderSlot

// LoopSpiderSlot is the interface value that NewLoopSpider accepts. All four
// methods work with strings; Crawl and NextCur take a cur argument of the same
// type that DefaultCur returns.
// NOTE(review): presumably cur is a cursor that starts at DefaultCur() and is
// advanced with NextCur after each Crawl — confirm against LoopSpider.
type LoopSpiderSlot interface {
	Name() string
	DefaultCur() string
	// NOTE(review): the meaning of the bool result is not visible here — confirm.
	Crawl(cur string) bool
	NextCur(cur string) string
}

type LoopSpiderStatus

// LoopSpiderStatus is the value returned by (*LoopSpider).GetLoopSpiderStatus.
// NOTE(review): UsedTime and Speed are strings, so presumably pre-formatted
// for display; their format and units are not visible here — confirm.
type LoopSpiderStatus struct {
	Cur      string
	// NOTE(review): presumably Cur parsed as an integer — confirm.
	CurInt   int
	UsedTime string
	Done     int64
	Speed    string
	Fail     int64
	OK       int64
}

type PageInfo

// PageInfo is the value carried on the parse channel: Crawler holds the
// send-only end (chan<- *PageInfo) and Parser the receive-only end. It is also
// the argument type of the Content.Crawl and Content.Parse callbacks.
type PageInfo struct {
	UI    URLInfo
	Title string
	// NOTE(review): presumably the links discovered on this page — confirm.
	Son   []PageLinkInfo
}

type PageLinkInfo

// PageLinkInfo is the element type of PageInfo.Son: a URLInfo paired with a
// Name string.
// NOTE(review): Name is presumably the link text — confirm.
type PageLinkInfo struct {
	UI   URLInfo
	Name string
}

type SpiderData

type SpiderData struct {
	// contains filtered or unexported fields
}

type Stat

// Stat is a flat set of numeric counters plus the CrawFunc string. A *Stat is
// passed to Start, Crawler, Parser and Saver.
//
// NOTE(review): counter widths are mixed (int and int64), and the field
// CrawRetrtyNum looks like a misspelling of "Retry". Both are part of the
// exported API, so changing either would break callers.
// NOTE(review): the Num/TotalTime/AvgTime triples suggest AvgTime is
// TotalTime divided by Num; neither that nor the time unit is visible here —
// confirm.
type Stat struct {
	CrawBePushJobNum int

	// Fields prefixed "Craw".
	CrawChannelNum  int
	CrawFunc        string
	CrawNum         int
	CrawRetrtyNum   int
	CrawOKNum       int64
	CrawFailNum     int
	CrawOKTotalTime int64
	CrawOKAvgTime   int64

	// Fields prefixed "Parse".
	ParseChannelNum int
	ParseNum        int
	ParseValidNum   int
	ParseSpawnNum   int
	ParseFinishNum  int
	ParseTooDeepNum int
	ParseJobNum     int

	// Fields prefixed "Save".
	SaveChannelNum int
	SaveNum        int

	// Fields prefixed "Insert".
	InsertNum       int64
	InsertTotalTime int64
	InsertAvgTime   int64

	// Fields prefixed "Job": Insert, Pop and Has triples.
	// NOTE(review): presumably timings of JobDB operations — confirm.
	JobInsertNum       int64
	JobInsertTotalTime int64
	JobInsertAvgTime   int64
	JobPopNum          int64
	JobPopTotalTime    int64
	JobPopAvgTime      int64
	JobHasNum          int64
	JobHasTotalTime    int64
	JobHasAvgTime      int64

	// Fields prefixed "Done": Insert and Has triples.
	// NOTE(review): presumably timings of DoneDB operations — confirm.
	DoneInsertNum       int64
	DoneInsertTotalTime int64
	DoneInsertAvgTime   int64
	DoneHasNum          int64
	DoneHasTotalTime    int64
	DoneHasAvgTime      int64
}

type URLInfo

// URLInfo is the value carried on the crawl channel: Parser holds the
// send-only end (chan<- *URLInfo) and Crawler the receive-only end. It is also
// stored by value as the UI field of PageInfo and PageLinkInfo.
type URLInfo struct {
	Url  string
	// NOTE(review): presumably the link depth at which Url was found, checked
	// against Config.Deps — confirm.
	Deps int
}

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL