spider

package
v0.0.0-...-714e105 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Aug 12, 2024 License: MIT Imports: 15 Imported by: 0

Documentation

Index

Constants

View Source
// String type identifiers exported by the package.
// NOTE(review): presumably these are the values accepted by
// BaseSpiderConf.DownloaderType / NewDownloader to pick a downloader
// implementation — confirm against the implementation.
// NOTE(review): the identifier ImocType is spelled "Imoc" while its value is
// "imroc"; the name is exported, so renaming it would break callers.
const (
	DefaultType = "default"
	RestyType   = "resty"
	ImocType    = "imroc"
)

Variables

Functions

This section is empty.

Types

type BaseSpiderConf

// BaseSpiderConf is the base spider configuration.
// It is the value handed to every Option function.
type BaseSpiderConf struct {
	// NOTE(review): presumably toggles use of the Cookie jar below — confirm.
	EnableCookie   bool
	// NOTE(review): presumably toggles routing requests through ProxyUrl — confirm.
	EnableProxy    bool
	// Proxy address as a string; expected format is not visible here.
	ProxyUrl       string
	// NOTE(review): presumably one of DefaultType, RestyType or ImocType — confirm.
	DownloaderType string

	// Cookie jar from net/http/cookiejar; may be nil as far as this declaration shows.
	Cookie *cookiejar.Jar
}

BaseSpiderConf is the base spider configuration.

type Context

// Context is the value passed to MiddlewareHandler and MiddlewareHandlerErr
// functions. It bundles a context.Context with its cancel function, an HTTP
// response, and two maps.
type Context struct {
	// Standard-library context and the CancelFunc paired with it.
	// NOTE(review): presumably Cancel cancels Ctx — confirm where they are created.
	Ctx      context.Context
	Cancel   context.CancelFunc
	// HTTP response associated with this Context.
	// NOTE(review): who closes Response.Body is not visible here — confirm.
	Response *http.Response

	// Free-form values keyed by string.
	// NOTE(review): presumably scratch space shared between handlers — confirm.
	Temp map[string]interface{}
	// String metadata.
	// NOTE(review): presumably the info map given to NewResource — confirm.
	Info map[string]string
	// contains filtered or unexported fields
}

func (*Context) Abort

func (c *Context) Abort()

func (*Context) GetDom

func (c *Context) GetDom() (*goquery.Document, error)

type MiddlewareHandler

// MiddlewareHandler is a handler function that receives the *Context and
// returns nothing. Spider.RuleHandlers stores slices of these per rule key.
type MiddlewareHandler func(ctx *Context)

type MiddlewareHandlerErr

// MiddlewareHandlerErr is a handler function that receives the *Context and
// returns an error. It is the parameter type of Spider.SetGlobalPreRun.
type MiddlewareHandlerErr func(ctx *Context) error

type Option

// Option is a function that receives a *BaseSpiderConf. NewSpider accepts a
// variadic list of Options; NewCookieJar, NewDownloader and NewProxyUrl each
// return one.
type Option func(b *BaseSpiderConf)

func NewCookieJar

func NewCookieJar(cookie *cookiejar.Jar) Option

func NewDownloader

func NewDownloader(value string) Option

func NewProxyUrl

func NewProxyUrl(value string) Option

type Resource

// Resource is the unit accepted by Schedule.AddResource and carried on
// Schedule.ResourcePoolList. It embeds a *downloader.Request.
type Resource struct {
	// NOTE(review): presumably matches Spider.UniqueKey of the owning spider — confirm.
	SpiderUniqueKey string

	// Embedded request; its fields and methods are promoted onto Resource.
	*downloader.Request
	// contains filtered or unexported fields
}

func NewResource

func NewResource(key, rule, link string, info map[string]string) Resource

type Schedule

// Schedule holds a channel of Resource values and a concurrency setting.
// Spiders are attached and detached through Register and UnReg.
type Schedule struct {
	// Channel of Resource values.
	// NOTE(review): buffering and who closes it are not visible here — confirm.
	ResourcePoolList chan Resource
	ConcurrentNum    int // concurrency count
	// contains filtered or unexported fields
}

func NewSchedule

func NewSchedule() *Schedule

func (*Schedule) AddResource

func (s *Schedule) AddResource(resource Resource) (err error)

func (*Schedule) Close

func (s *Schedule) Close()

func (*Schedule) Init

func (s *Schedule) Init()

func (*Schedule) Register

func (s *Schedule) Register(spider *Spider) *Schedule

func (*Schedule) UnReg

func (s *Schedule) UnReg(spider *Spider) *Schedule

type Spider

// Spider is the crawler type returned by NewSpider. It holds a unique key, a
// status value, a downloader, per-rule handler chains and a close callback.
type Spider struct {
	UniqueKey string // unique identifier
	STATUS    uint   // status; NOTE(review): the meaning of each value is not visible here

	Downloader    downloader.Downloader          // downloader
	RuleHandlers  map[string][]MiddlewareHandler // rule middleware, keyed by string (presumably the rule name — confirm)
	CloseCallback func(s *Spider)                // close callback
	// contains filtered or unexported fields
}

func NewSpider

func NewSpider(opts ...Option) *Spider

func (*Spider) SetConcurrent

func (s *Spider) SetConcurrent(num int) *Spider

func (*Spider) SetGlobalPreRun

func (s *Spider) SetGlobalPreRun(f MiddlewareHandlerErr) *Spider

func (*Spider) SetRangeTime

func (s *Spider) SetRangeTime(sleepTime int) *Spider

func (*Spider) SetRules

func (s *Spider) SetRules(key string, h ...MiddlewareHandler) *Spider

SetRules sets the spider's rule handlers; key is the rule name.

func (*Spider) SetTimeTicker

func (s *Spider) SetTimeTicker(num int) *Spider

SetTimeTicker sets the liveness-check interval; the default is ten seconds.

func (*Spider) Start

func (s *Spider) Start()

func (*Spider) Stop

func (s *Spider) Stop()

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL