spider

package
v0.0.0-...-278ce41 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jan 28, 2024 License: Apache-2.0 Imports: 10 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

This section is empty.

Types

type Context

// Context holds a byte slice (Body) together with a pointer to a Request (Req).
// It is the argument type of Rule.ParseFunc.
// NOTE(review): Body is presumably the fetched content for Req — confirm against the caller.
type Context struct {
	Body []byte
	Req  *Request
}

func (*Context) GetRule

func (c *Context) GetRule(ruleName string) *Rule

func (*Context) Output

func (c *Context) Output(data any) *DataCell

func (*Context) OutputJS

func (c *Context) OutputJS(reg string) ParseResult

func (*Context) ParseJSReg

func (c *Context) ParseJSReg(name, reg string) ParseResult

type DataCell

// DataCell holds one unit of output data together with a pointer to a Task.
// Data maps string keys to values of any type; it is the element type
// accepted by Storage.Save.
// NOTE(review): Task is presumably the task that produced the data — confirm.
type DataCell struct {
	Task *Task
	Data map[string]any
}

func (*DataCell) GetTableName

func (d *DataCell) GetTableName() string

func (*DataCell) GetTaskName

func (d *DataCell) GetTaskName() string

type Fetcher

// Fetcher is the interface for types that can retrieve content for a Request.
type Fetcher interface {
	// Get takes a request and returns a byte slice, or an error.
	Get(req *Request) ([]byte, error)
}

type LimitConfig

// LimitConfig groups an event count, a duration in seconds, and a bucket size.
// NOTE(review): presumably EventCount events are permitted per EventDur
// seconds by a rate limiter — confirm where this config is consumed.
type LimitConfig struct {
	EventCount int
	EventDur   int // seconds
	Bucket     int // bucket size
}

type Option

// Option is a function that receives a pointer to Options. The With*
// constructors in this package return values of this type, and NewTask
// accepts a variadic list of them.
type Option func(opts *Options)

func WithCookie

func WithCookie(cookie string) Option

func WithFetcher

func WithFetcher(f Fetcher) Option

func WithLogger

func WithLogger(logger *zap.Logger) Option

func WithMaxDepth

func WithMaxDepth(maxDepth int64) Option

func WithName

func WithName(name string) Option

func WithReload

func WithReload(reload bool) Option

func WithStorage

func WithStorage(s Storage) Option

func WithURL

func WithURL(url string) Option

func WithWaitTime

func WithWaitTime(waitTime int64) Option

type Options

// Options holds the settings of a task: JSON-tagged basic properties plus a
// Fetcher, a Storage, and a limiter.RateLimiter. It is embedded in Task.
type Options struct {
	Name     string `json:"name"` // task name; should be kept unique
	URL      string `json:"url"`
	Cookie   string `json:"cookie"`
	WaitTime int64  `json:"wait_time"` // random sleep time, in seconds
	Reload   bool   `json:"reload"`    // whether the site may be crawled repeatedly
	MaxDepth int64  `json:"max_depth"`
	Fetcher  Fetcher
	Storage  Storage
	Limit    limiter.RateLimiter
	// contains filtered or unexported fields
}

type ParseResult

// ParseResult is the value returned by a Rule's ParseFunc: a slice of
// Request pointers and a slice of items of any type.
// NOTE(review): Requests are presumably follow-up requests to schedule and
// Items the extracted data — confirm against the code that consumes it.
type ParseResult struct {
	Requests []*Request
	Items    []any
}

type Property

// Property holds the JSON-tagged basic attributes of a task. It is embedded
// in TaskModel.
type Property struct {
	Name     string `json:"name"` // name shown in the user interface; must be kept unique
	URL      string `json:"url"`
	Cookie   string `json:"cookie"`
	WaitTime int64  `json:"wait_time"` // random sleep time, in seconds
	Reload   bool   `json:"reload"`    // whether the site may be crawled repeatedly
	MaxDepth int64  `json:"max_depth"`
}

type Request

// Request describes a single request: a pointer to a Task, a URL and method,
// integer depth and priority values, a rule name, and a pointer to a Temp
// data holder.
// NOTE(review): RuleName presumably selects the Rule used to parse the
// response, and Depth is presumably compared against MaxDepth — confirm.
type Request struct {
	Task     *Task
	URL      string
	Method   string
	Depth    int64
	Priority int64
	RuleName string
	TmpData  *Temp
}

func AddJsReq

func AddJsReq(jsReq map[string]any) []*Request

func AddJsReqs

func AddJsReqs(jsReqs []map[string]any) []*Request

func (*Request) Check

func (r *Request) Check() error

func (*Request) Fetch

func (r *Request) Fetch() ([]byte, error)

func (*Request) Unique

func (r *Request) Unique() string

type Rule

// Rule is a collection rule node. It carries a list of item field names and
// a parse function that takes a Context and returns a ParseResult or an error.
type Rule struct {
	ItemFields []string
	// TODO: return *ParseResult
	ParseFunc func(*Context) (ParseResult, error) // content parsing function
}

Rule 采集规则节点

type RuleModel

// RuleModel is the JSON form of a rule: a name and a parse function held as
// a string. ParseFunc is serialized under the key "parse_script".
type RuleModel struct {
	Name      string `json:"name"`
	ParseFunc string `json:"parse_script"`
}

type RuleTree

// RuleTree is the collection rule tree: a root function that returns the
// initial requests (or an error) and a map from string keys to rules.
type RuleTree struct {
	Root  func() ([]*Request, error) // root node, the execution entry point
	Trunk map[string]*Rule           // rule hash table
}

RuleTree 采集规则树

type Storage

// Storage is the interface for types that can save DataCell values.
type Storage interface {
	// Save takes any number of DataCell pointers and returns an error.
	Save(datas ...*DataCell) error
}

type Task

// Task is a task instance. It holds a string-keyed visited map, a mutex, a
// RuleTree, and a closed flag. Options is embedded, so its fields are
// promoted onto Task.
// NOTE(review): VisitedLock presumably guards Visited — confirm at the
// call sites. Because the struct contains a sync.Mutex it should be passed by
// pointer rather than copied.
type Task struct {
	Visited     map[string]bool
	VisitedLock sync.Mutex
	Rule        RuleTree
	Closed      bool
	Options
}

Task 一个任务实例

func NewTask

func NewTask(opts ...Option) *Task

type TaskConfig

// TaskConfig is a plain-data description of a task. Unlike Options, the
// Fetcher field here is a string rather than a Fetcher value, and limits are
// given as a slice of LimitConfig entries.
// NOTE(review): Fetcher is presumably the name of a fetcher implementation
// to select — confirm where the config is loaded.
type TaskConfig struct {
	Name     string
	Cookie   string
	WaitTime int64
	Reload   bool
	MaxDepth int64
	Fetcher  string
	Limits   []LimitConfig
}

type TaskModel

// TaskModel is the JSON form of a task. It embeds Property and adds a root
// value held as a string and a list of RuleModel entries, serialized under
// the keys "root_script" and "rule".
type TaskModel struct {
	Property
	Root  string      `json:"root_script"`
	Rules []RuleModel `json:"rule"`
}

type Temp

// Temp is an opaque holder of temporary cached data. Its fields are
// unexported; values are read and written by string key through its Get and
// Set methods.
type Temp struct {
	// contains filtered or unexported fields
}

func (*Temp) Get

func (t *Temp) Get(key string) any

Get 返回临时缓存数据

func (*Temp) Set

func (t *Temp) Set(key string, val any) error

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL