Documentation ¶
Index ¶
- type Args
- type Context
- type Cookies
- type CrawlEngine
- type Crawler
- func (c *Crawler) AddItemPipeline(p ItemPipeline) *Crawler
- func (c *Crawler) AddItemPipelineFunc(f ItemPipelineFunc) *Crawler
- func (c *Crawler) ClearPipelines() *Crawler
- func (c *Crawler) CrawlURL(url string)
- func (c *Crawler) OnItem(f ItemPipelineFunc) *Crawler
- func (c *Crawler) OnItemType(itemExample interface{}, f ItemPipelineFunc) *Crawler
- func (c *Crawler) OnRequestError(callback RequestErrorCallback) *Crawler
- func (c *Crawler) OnResponse(callback ResponseCallback) *Crawler
- func (c *Crawler) OnStart(callback func(ctx *Context)) *Crawler
- func (c *Crawler) OnStop(callback func(ctx *Context)) *Crawler
- func (c *Crawler) Start(wait bool) *Crawler
- func (c *Crawler) Wait()
- func (c *Crawler) WithDefaultCallback(callback func(res *Response, ctx *Context)) *Crawler
- func (c *Crawler) WithSettings(s *Settings) *Crawler
- func (c *Crawler) WithStartRequests(callback func(ctx *Context) []*Request) *Crawler
- type Headers
- type ItemPipeline
- type ItemPipelineFunc
- type Meta
- type MongoPipeline
- type Request
- func (req *Request) OnError(callback RequestErrorCallback) *Request
- func (req *Request) OnResponse(callback ResponseCallback) *Request
- func (req *Request) WithContentType(contentType string) *Request
- func (req *Request) WithCookies(cookies map[string]string) *Request
- func (req *Request) WithHeaders(headers map[string]string) *Request
- func (req *Request) WithMeta(meta Meta) *Request
- func (req *Request) WithProxy(proxy string) *Request
- func (req *Request) WithTimeout(timeout int) *Request
- type RequestErrorCallback
- type Response
- type ResponseCallback
- type Settings
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type Context ¶
type Context struct { Engine *CrawlEngine Crawler *Crawler Settings *Settings Depth int32 LastRequest *Request LastResponse *Response }
Context 爬虫执行上下文
type CrawlEngine ¶
type CrawlEngine struct { RequestQueue chan *Request ItemQueue chan *itemWrapper RequestingCount int32 ProcessingItemCount int32 Settings *Settings RequestMetaMap *sync.Map //map[*http.Request]Meta // contains filtered or unexported fields }
CrawlEngine 爬取引擎
func (*CrawlEngine) StartProcessItems ¶
func (eng *CrawlEngine) StartProcessItems()
StartProcessItems 开始处理Items
func (*CrawlEngine) StartProcessRequests ¶
func (eng *CrawlEngine) StartProcessRequests()
StartProcessRequests 开始处理请求
type Crawler ¶
type Crawler struct { Name string StartUrls []string StartRequests func(ctx *Context) []*Request Pipelines []ItemPipeline ItemTypeFuncs map[string]ItemPipelineFunc ResponseCallback func(res *Response, ctx *Context) RequestErrorCallback RequestErrorCallback Settings *Settings Engine *CrawlEngine // contains filtered or unexported fields }
Crawler 爬虫本体
func (*Crawler) AddItemPipeline ¶
func (c *Crawler) AddItemPipeline(p ItemPipeline) *Crawler
AddItemPipeline 添加Pipeline
func (*Crawler) AddItemPipelineFunc ¶
func (c *Crawler) AddItemPipelineFunc(f ItemPipelineFunc) *Crawler
AddItemPipelineFunc 添加Pipeline func
func (*Crawler) ClearPipelines ¶
ClearPipelines 清空pipeline
func (*Crawler) OnItemType ¶
func (c *Crawler) OnItemType(itemExample interface{}, f ItemPipelineFunc) *Crawler
OnItemType 与itemExample同类型的item处理函数
func (*Crawler) OnRequestError ¶
func (c *Crawler) OnRequestError(callback RequestErrorCallback) *Crawler
func (*Crawler) OnResponse ¶
func (c *Crawler) OnResponse(callback ResponseCallback) *Crawler
func (*Crawler) WithDefaultCallback ¶
WithDefaultCallback 设置默认回调函数
func (*Crawler) WithSettings ¶
WithSettings 设置settings
type ItemPipeline ¶
type ItemPipeline interface {
ProcessItem(item interface{}, ctx *Context) interface{}
}
ItemPipeline pipeline接口
func FuncPipeline ¶
func FuncPipeline(callback ItemPipelineFunc) ItemPipeline
FuncPipeline 仅提供一个函数的pipeline
type ItemPipelineFunc ¶
type ItemPipelineFunc func(item interface{}, ctx *Context) interface{}
ItemPipelineFunc 处理item的函数
type MongoPipeline ¶
MongoPipeline 默认mongodb pipeline
func (*MongoPipeline) ProcessItem ¶
func (dmp *MongoPipeline) ProcessItem(item interface{}, ctx *Context) interface{}
ProcessItem 实现ItemPipeline接口
type Request ¶
type Request struct { Method string URL string Body []byte Headers Headers Cookies Cookies Timeout int Meta Meta Callback ResponseCallback ErrorCallback RequestErrorCallback ProxyURL string // contains filtered or unexported fields }
Request Crawler的请求
func NewRequest ¶
NewRequest creates a new Request.
func PostRequest ¶
PostRequest creates a basic POST request.
func (*Request) OnError ¶
func (req *Request) OnError(callback RequestErrorCallback) *Request
func (*Request) OnResponse ¶
func (req *Request) OnResponse(callback ResponseCallback) *Request
OnResponse sets the response callback for this request and returns the request so that calls can be chained.
func (*Request) WithContentType ¶
WithContentType sets the Content-Type of the request.
func (*Request) WithCookies ¶
WithCookies sets the cookies of the request.
func (*Request) WithHeaders ¶
WithHeaders sets the headers of the request.
func (*Request) WithTimeout ¶
WithTimeout sets the timeout of the request.
type RequestErrorCallback ¶
type Response ¶
type Response struct { StatusCode int *htmlquery.Selector URL string Status string Body []byte Request *Request Headers Headers Cookies Cookies Meta Meta NativeResponse *http.Response // contains filtered or unexported fields }
Response Crawler的响应
func NewResponse ¶
NewResponse 创建Response
func NewResponse(content []byte) *Response { return &Response{Selector: htmlquery.NewSelector(content), Body: content} }
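NewResponse creates a Response from raw body content (a []byte, not an http.Response): the bytes are stored as Body and are also used to build the embedded htmlquery Selector.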
func (*Response) WithRequest ¶
WithRequest 设置request
type ResponseCallback ¶
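ResponseCallback is the callback type used to handle a Response; it is the type accepted by Request.OnResponse and Crawler.OnResponse.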