Documentation ¶
Index ¶
- func URLJoin(url string, path string) string
- type Args
- type Context
- type Cookies
- type CrawlEngine
- type Crawler
- func (c *Crawler) AddItemPipeline(p ItemPipeline) *Crawler
- func (c *Crawler) AddItemPipelineFunc(f ItemPipelineFunc) *Crawler
- func (c *Crawler) AddRequest(req *Request)
- func (c *Crawler) ClearPipelines() *Crawler
- func (c *Crawler) CrawlURL(url string)
- func (c *Crawler) IsIdle() bool
- func (c *Crawler) OnItem(f ItemPipelineFunc) *Crawler
- func (c *Crawler) OnItemType(itemExample interface{}, f ItemPipelineFunc) *Crawler
- func (c *Crawler) OnRedirect(callback RedirectCallback) *Crawler
- func (c *Crawler) OnRequestError(callback RequestErrorCallback) *Crawler
- func (c *Crawler) OnResponse(callback ResponseCallback) *Crawler
- func (c *Crawler) OnStart(callback func(ctx *Context)) *Crawler
- func (c *Crawler) OnStop(callback func(ctx *Context)) *Crawler
- func (c *Crawler) Start(wait bool) *Crawler
- func (c *Crawler) Wait()
- func (c *Crawler) WaitTime(seconds time.Duration)
- func (c *Crawler) WithDefaultCallback(callback func(res *Response, ctx *Context)) *Crawler
- func (c *Crawler) WithStartRequests(callback func(ctx *Context) []*Request) *Crawler
- type Headers
- type History
- type HistoryItem
- type ItemPipeline
- type ItemPipelineFunc
- type Meta
- type MongoPipeline
- type RedirectCallback
- type Request
- func (req *Request) AddHeader(key string, value string) *Request
- func (req *Request) AddMeta(key string, value interface{}) *Request
- func (req *Request) Clone() *Request
- func (req *Request) OnError(callback RequestErrorCallback) *Request
- func (req *Request) OnResponse(callback ResponseCallback) *Request
- func (req *Request) WithContentType(contentType string) *Request
- func (req *Request) WithCookies(cookies map[string]string) *Request
- func (req *Request) WithHeaders(headers map[string]string) *Request
- func (req *Request) WithHost(host string) *Request
- func (req *Request) WithMeta(meta Meta) *Request
- func (req *Request) WithProxy(proxy string) *Request
- func (req *Request) WithTimeout(timeout int) *Request
- type RequestErrorCallback
- type Response
- type ResponseCallback
- type Settings
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
Types ¶
type Context ¶
type Context struct { Engine *CrawlEngine Crawler *Crawler Settings *Settings Depth int32 LastRequest *Request LastResponse *Response }
Context 爬虫执行上下文
type CrawlEngine ¶
type CrawlEngine struct { //fastHttpClient *fasthttp.Client RequestQueue chan *Request ItemQueue chan *itemWrapper //RequestingCount int32 //ProcessingItemCount int32 Settings *Settings RequestMetaMap *sync.Map //map[*http.Request]Meta // contains filtered or unexported fields }
CrawlEngine 爬取引擎
func (*CrawlEngine) StartProcessItems ¶
func (eng *CrawlEngine) StartProcessItems()
StartProcessItems 开始处理Items
func (*CrawlEngine) StartProcessRequests ¶
func (eng *CrawlEngine) StartProcessRequests()
StartProcessRequests 开始处理请求
func (*CrawlEngine) WaitTime ¶ added in v0.0.3
func (eng *CrawlEngine) WaitTime(seconds time.Duration)
WaitTime waits for the engine to finish running.
type Crawler ¶
type Crawler struct { Name string StartUrls []string StartRequests func(ctx *Context) []*Request Pipelines []ItemPipeline ItemTypeFuncs map[string]ItemPipelineFunc Settings *Settings Engine *CrawlEngine // contains filtered or unexported fields }
Crawler 爬虫本体
func (*Crawler) AddItemPipeline ¶
func (c *Crawler) AddItemPipeline(p ItemPipeline) *Crawler
AddItemPipeline 添加Pipeline
func (*Crawler) AddItemPipelineFunc ¶
func (c *Crawler) AddItemPipelineFunc(f ItemPipelineFunc) *Crawler
AddItemPipelineFunc 添加Pipeline func
func (*Crawler) AddRequest ¶ added in v0.0.3
AddRequest adds a single request to be crawled.
func (*Crawler) ClearPipelines ¶
ClearPipelines 清空pipeline
func (*Crawler) OnItemType ¶
func (c *Crawler) OnItemType(itemExample interface{}, f ItemPipelineFunc) *Crawler
OnItemType 与itemExample同类型的item处理函数
func (*Crawler) OnRedirect ¶ added in v0.0.3
func (c *Crawler) OnRedirect(callback RedirectCallback) *Crawler
OnRedirect sets the redirect callback.
func (*Crawler) OnRequestError ¶
func (c *Crawler) OnRequestError(callback RequestErrorCallback) *Crawler
OnRequestError sets the request error callback.
func (*Crawler) OnResponse ¶
func (c *Crawler) OnResponse(callback ResponseCallback) *Crawler
OnResponse sets the response callback.
func (*Crawler) WithDefaultCallback ¶
WithDefaultCallback 设置默认回调函数
type History ¶ added in v0.0.3
type History []*HistoryItem
type HistoryItem ¶ added in v0.0.3
type ItemPipeline ¶
type ItemPipeline interface {
ProcessItem(item interface{}, ctx *Context) interface{}
}
ItemPipeline pipeline接口
func FuncPipeline ¶
func FuncPipeline(callback ItemPipelineFunc) ItemPipeline
FuncPipeline 仅提供一个函数的pipeline
type ItemPipelineFunc ¶
type ItemPipelineFunc func(item interface{}, ctx *Context) interface{}
ItemPipelineFunc 处理item的函数
type MongoPipeline ¶
MongoPipeline 默认mongodb pipeline
func (*MongoPipeline) ProcessItem ¶
func (dmp *MongoPipeline) ProcessItem(item interface{}, ctx *Context) interface{}
ProcessItem 实现ItemPipeline接口
type RedirectCallback ¶ added in v0.0.3
type Request ¶
type Request struct { Method string URL string Body []byte Headers http.Header Cookies Cookies Timeout int Meta Meta Callback ResponseCallback ErrorCallback RequestErrorCallback ProxyURL string OriginURL string Host string History History // contains filtered or unexported fields }
Request Crawler的请求
func NewRequest ¶
NewRequest creates a new Request.
func PostRequest ¶
PostRequest creates a basic POST request.
func (*Request) OnError ¶
func (req *Request) OnError(callback RequestErrorCallback) *Request
func (*Request) OnResponse ¶
func (req *Request) OnResponse(callback ResponseCallback) *Request
OnResponse sets the response callback for this request.
func (*Request) WithContentType ¶
WithContentType sets the request's Content-Type.
func (*Request) WithCookies ¶
WithCookies sets the request's cookies.
func (*Request) WithHeaders ¶
WithHeaders sets the request's headers.
func (*Request) WithTimeout ¶
WithTimeout sets the request's timeout.
type RequestErrorCallback ¶
type Response ¶
type Response struct { *htmlquery.Selector StatusCode int URL string Status string Body []byte Request *Request Headers http.Header Cookies Cookies Meta Meta History History //NativeResponse *http.Response X509Certificate *x509.Certificate X509CertChan []*x509.Certificate // contains filtered or unexported fields }
Response Crawler的响应
func NewResponse ¶
NewResponse 创建Response
func NewResponse(content []byte) *Response { return &Response{Selector: htmlquery.NewSelector(content), Body: content} }
NewResponse creates a Response from raw body content: the bytes are stored as Body and also used to build the Selector.
func (*Response) WithRequest ¶
WithRequest 设置request
type ResponseCallback ¶
ResponseCallback is the callback type used to handle a Response (see Crawler.OnResponse and Request.OnResponse).