spider

package
Version: v0.0.0-...-b569211 (Latest)
Warning

This package is not in the latest version of its module.

Go to latest
Published: May 9, 2020 | License: MIT | Imports: 20 | Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func CloseDone

func CloseDone(db *DoneDB)

func CloseJob

func CloseJob(db *JobDB)

func Crawler

func Crawler(running *int32, group *sync.WaitGroup, jbd *JobDB, dbd *DoneDB, config Config, crawl <-chan *URLInfo, parse chan<- *PageInfo, jobsCrawlerTotal *int32, jobsCrawlerTotalFail *int32, crawlfunc string, crawlTimeout int, crawlRetry int, stat *Stat)

func DeleteOldSpider

func DeleteOldSpider(db *DB)

func DeleteSpiderDone

func DeleteSpiderDone(db *DoneDB)

func GetChromeWSEndpoint

func GetChromeWSEndpoint() string

func GetDoneSize

func GetDoneSize(db *DoneDB) int

func GetJobSize

func GetJobSize(db *JobDB) int

func GetSize

func GetSize(db *DB) int

func HasDone

func HasDone(db *DoneDB, url string, stat *Stat) bool

func HasJob

func HasJob(db *JobDB, url string, stat *Stat) bool

func Ini

func Ini()

func InsertSpider

func InsertSpider(db *DB, title string, name string, url string, host string, stat *Stat)

func InsertSpiderDone

func InsertSpiderDone(db *DoneDB, url string, stat *Stat)

func InsertSpiderJob

func InsertSpiderJob(db *JobDB, url string, deps int, stat *Stat)

func Parser

func Parser(running *int32, group *sync.WaitGroup, jbd *JobDB, dbd *DoneDB, config Config, crawl chan<- *URLInfo, parse <-chan *PageInfo, save chan<- *DBInfo, hosturl string, stat *Stat)

func PopSpiderJob

func PopSpiderJob(db *JobDB, n int, stat *Stat) ([]string, []int)

func Saver

func Saver(running *int32, group *sync.WaitGroup, db *DB, save <-chan *DBInfo, stat *Stat)

func SetCallback

func SetCallback(cb func(host string, title string, name string, url string))

func Start

func Start(db *DB, config Config, url string, stat *Stat)

Types

type Config

// Config groups the crawl settings that are passed by value to Start,
// Crawler and Parser.
type Config struct {
	// NOTE(review): presumably the number of worker goroutines — confirm.
	Threadnum    int
	// NOTE(review): presumably the capacity of the crawl/parse/save channels — confirm.
	Buffersize   int
	// NOTE(review): presumably the maximum link depth to follow (compare
	// URLInfo.Deps) — confirm.
	Deps         int
	// NOTE(review): meaning not visible in this listing — confirm.
	FocusSpider  bool
	Crawlfunc    string // simple,puppeteer
	// NOTE(review): unit (seconds vs. milliseconds) is not visible here — confirm.
	CrawlTimeout int
	// NOTE(review): presumably the number of retries per URL — confirm.
	CrawlRetry   int
}

type DB

// DB is an opaque handle returned by Load. It is the argument of
// DeleteOldSpider, GetSize, InsertSpider, Saver, Start, Find, Last and
// Select; its methods GetSelectStmt and GetSqlDB return a *sql.Stmt and a
// *sql.DB respectively.
type DB struct {
	// contains filtered or unexported fields
}

func Load

func Load(dsn string, conn int) *DB

func (*DB) GetSelectStmt

func (db *DB) GetSelectStmt() *sql.Stmt

func (*DB) GetSqlDB

func (db *DB) GetSqlDB() *sql.DB

type DBInfo

// DBInfo is the record type carried on the save channel: Parser sends
// *DBInfo values and Saver receives them. Its four fields correspond to the
// host, title, name and url parameters of InsertSpider and of the callback
// registered with SetCallback.
type DBInfo struct {
	Host  string
	Title string
	Name  string
	Url   string
}

type DoneDB

// DoneDB is an opaque handle returned by LoadDone. It is the argument of
// CloseDone, DeleteSpiderDone, GetDoneSize, HasDone and InsertSpiderDone,
// and is also passed to Crawler and Parser.
type DoneDB struct {
	// contains filtered or unexported fields
}

func LoadDone

func LoadDone(dsn string, conn int, src string) *DoneDB

type FindData

// FindData is the element type of the slices returned by Find, Last and
// Select.
type FindData struct {
	Title string
	Name  string
	URL   string
}

func Find

func Find(db *DB, str string, max int) []FindData

func Last

func Last(db *DB, n int) []FindData

func Select

func Select(db *DB, offset int, count int) []FindData

type JobDB

// JobDB is an opaque handle returned by LoadJob. It is the argument of
// CloseJob, GetJobSize, HasJob, InsertSpiderJob and PopSpiderJob, and is
// also passed to Crawler and Parser.
type JobDB struct {
	// contains filtered or unexported fields
}

func LoadJob

func LoadJob(dsn string, conn int, src string) *JobDB

type LoopSpider

// LoopSpider is created by NewLoopSpider from a LoopSpiderSlot; its
// GetLoopSpiderStatus method returns a LoopSpiderStatus.
type LoopSpider struct {
	// NOTE(review): presumably the worker count — confirm.
	Thread int
	// NOTE(review): presumably a channel/queue capacity — confirm.
	Buffer int
	// NOTE(review): presumably the current cursor, in the same format as the
	// cur argument of LoopSpiderSlot.Crawl and NextCur — confirm.
	Cur    string
	// contains filtered or unexported fields
}

func NewLoopSpider

func NewLoopSpider(lss LoopSpiderSlot) *LoopSpider

func (*LoopSpider) GetLoopSpiderStatus

func (ls *LoopSpider) GetLoopSpiderStatus() LoopSpiderStatus

type LoopSpiderSlot

// LoopSpiderSlot is the interface accepted by NewLoopSpider. Crawl and
// NextCur both take a string cursor; DefaultCur and NextCur both return one.
type LoopSpiderSlot interface {
	// Name returns a string identifying the slot.
	Name() string
	// DefaultCur returns a cursor string.
	// NOTE(review): presumably the starting cursor when none is stored — confirm.
	DefaultCur() string
	// Crawl processes the given cursor and returns a bool.
	// NOTE(review): presumably true means success — confirm.
	Crawl(cur string) bool
	// NextCur returns a cursor string derived from cur.
	// NOTE(review): presumably the cursor to crawl after cur — confirm.
	NextCur(cur string) string
}

type LoopSpiderStatus

// LoopSpiderStatus is the value returned by LoopSpider.GetLoopSpiderStatus.
// UsedTime and Speed are strings; Done, Fail and OK are int64 counters.
// NOTE(review): CurInt is presumably a numeric form of Cur, and Done is
// presumably OK plus Fail — confirm against the implementation.
type LoopSpiderStatus struct {
	Cur      string
	CurInt   int
	UsedTime string
	Done     int64
	Speed    string
	Fail     int64
	OK       int64
}

type PageInfo

// PageInfo is the message type of the parse channel: Crawler sends *PageInfo
// values and Parser receives them.
type PageInfo struct {
	// UI holds a URLInfo by value.
	// NOTE(review): presumably the URL and depth of this page — confirm.
	UI    URLInfo
	Title string
	// NOTE(review): presumably the links found on the page — confirm.
	Son   []PageLinkInfo
}

type PageLinkInfo

// PageLinkInfo is the element type of PageInfo.Son: a URLInfo paired with a
// Name string.
// NOTE(review): Name is presumably the link text — confirm.
type PageLinkInfo struct {
	UI   URLInfo
	Name string
}

type SpiderData

// SpiderData has only unexported fields. No exported function or method in
// this listing takes or returns it.
type SpiderData struct {
	// contains filtered or unexported fields
}

type Stat

// Stat collects counters and timing totals. A *Stat is passed to Start,
// Crawler, Parser, Saver and to the insert/has/pop helpers for DB, JobDB and
// DoneDB. Fields are grouped by name prefix: Craw*, Parse*, Save*, Insert*,
// Job* and Done*.
//
// The counters mix int and int64 types. The field name CrawRetrtyNum (sic)
// is part of the exported API.
// NOTE(review): each *AvgTime field is presumably its *TotalTime divided by
// the matching *Num; the time unit is not visible here — confirm.
type Stat struct {
	CrawBePushJobNum int

	// Crawl-stage counters and timings.
	CrawChannelNum  int
	CrawFunc        string
	CrawNum         int
	CrawRetrtyNum   int
	CrawOKNum       int64
	CrawFailNum     int
	CrawOKTotalTime int64
	CrawOKAvgTime   int64

	// Parse-stage counters.
	ParseChannelNum int
	ParseNum        int
	ParseValidNum   int
	ParseSpawnNum   int
	ParseFinishNum  int
	ParseTooDeepNum int
	ParseJobNum     int

	// Save-stage counters.
	SaveChannelNum int
	SaveNum        int

	// Insert counters and timings.
	InsertNum         int64
	InsertTotalTime   int64
	InsertCBTotalTime int64
	InsertAvgTime     int64
	InsertCBAvgTime   int64

	// Job counters and timings, one Num/TotalTime/AvgTime triple each for
	// Insert, Pop and Has.
	JobInsertNum       int64
	JobInsertTotalTime int64
	JobInsertAvgTime   int64
	JobPopNum          int64
	JobPopTotalTime    int64
	JobPopAvgTime      int64
	JobHasNum          int64
	JobHasTotalTime    int64
	JobHasAvgTime      int64

	// Done counters and timings, one Num/TotalTime/AvgTime triple each for
	// Insert and Has.
	DoneInsertNum       int64
	DoneInsertTotalTime int64
	DoneInsertAvgTime   int64
	DoneHasNum          int64
	DoneHasTotalTime    int64
	DoneHasAvgTime      int64
}

type URLInfo

// URLInfo is the message type of the crawl channel: Parser sends *URLInfo
// values and Crawler receives them. It is also held by value as the UI field
// of PageInfo and PageLinkInfo.
type URLInfo struct {
	Url  string
	// NOTE(review): presumably the link depth of Url, matching the deps
	// parameter of InsertSpiderJob — confirm.
	Deps int
}

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL