Documentation ¶
Index ¶
- Constants
- Variables
- func ReleaseRequest(req *Request)
- func ReleaseResponse(resp *Response)
- type AcquireProxies
- type Crawler
- func (c *Crawler) AfterResponse(f HandleResponse)
- func (c *Crawler) BeforeRequest(f HandleRequest)
- func (c *Crawler) ClearCache() error
- func (c *Crawler) DialWithProxy() fasthttp.DialFunc
- func (c *Crawler) DialWithProxyAndTimeout(timeout time.Duration) fasthttp.DialFunc
- func (c *Crawler) Error(err error)
- func (c *Crawler) Get(URL string) error
- func (c *Crawler) ParseHTML(selector string, f HandleHTML)
- func (c *Crawler) Post(URL string, requestData map[string]string, ctx pctx.Context) error
- func (c *Crawler) PostJSON(URL string, requestData map[string]interface{}, ctx pctx.Context) error
- func (c *Crawler) PostMultipart(URL string, form *MultipartForm, ctx pctx.Context) error
- func (c Crawler) ProxyPoolAmount() int
- func (c *Crawler) Wait()
- type CrawlerOption
- func WithCache(cc cache.Cache, compressed bool, cacheFileds ...string) CrawlerOption
- func WithConcurrency(count uint64) CrawlerOption
- func WithCookies(cookies map[string]string) CrawlerOption
- func WithLogger(lop *LogOp) CrawlerOption
- func WithProxy(proxyURL string) CrawlerOption
- func WithProxyPool(proxyURLs []string) CrawlerOption
- func WithRawCookie(cookie string) CrawlerOption
- func WithRetry(count uint32, cond RetryConditions) CrawlerOption
- func WithUserAgent(ua string) CrawlerOption
- type CustomRandomBoundary
- type HTMLParser
- type HandleHTML
- type HandleRequest
- type HandleResponse
- type LogOp
- type MultipartForm
- type Pool
- type Request
- func (r *Request) Abort()
- func (r Request) Get(u string) error
- func (r Request) Hash() (string, error)
- func (r *Request) New(method, URL string, body []byte) *Request
- func (r Request) NumberOfRetries() uint32
- func (r Request) Post(URL string, requestData map[string]string, ctx pctx.Context) error
- func (r *Request) Reset()
- func (r *Request) SetContentType(contentType string)
- func (r *Request) SetHeaders(headers map[string]string)
- type Response
- type RetryConditions
- type Task
Constants ¶
const ( RUNNING = 1 STOPED = 0 )
running status
Variables ¶
var ( InvalidProxyError = errors.New("the proxy ip should contain the protocol") UnknownProtocolError = errors.New("only support http and socks5 protocol") ProxyExpiredError = errors.New("the proxy ip has expired") OnlyOneProxyIPError = errors.New("unable to delete the only proxy ip") UnkownProxyIPError = errors.New("proxy is unkown") EmptyProxyPoolError = errors.New("after deleting the invalid proxy, the current proxy ip pool is empty") NoCacheSet = errors.New("No cache set") )
var ( // return if pool size <= 0 ErrInvalidPoolCap = errors.New("invalid pool cap") // put task but pool already closed ErrPoolAlreadyClosed = errors.New("pool already closed") )
errors
var (
IncorrectResponse = errors.New("the response status code is not 200 or 201")
)
Functions ¶
func ReleaseRequest ¶
func ReleaseRequest(req *Request)
ReleaseRequest returns req acquired via AcquireRequest to the request pool.
It is forbidden to access req and/or its members after returning it to the request pool.
func ReleaseResponse ¶
func ReleaseResponse(resp *Response)
ReleaseResponse returns resp acquired via AcquireResponse to the response pool.
It is forbidden to access resp and/or its members after returning it to the response pool.
Types ¶
type Crawler ¶
type Crawler struct { // UserAgent is the User-Agent string used by HTTP requests UserAgent string // 在多协程中这个上下文管理可以用来退出或取消多个协程 Context context.Context // contains filtered or unexported fields }
Crawler is the provider of crawlers
func NewCrawler ¶
func NewCrawler(opts ...CrawlerOption) *Crawler
NewCrawler creates a new Crawler instance with some CrawlerOptions
func (*Crawler) AfterResponse ¶
func (c *Crawler) AfterResponse(f HandleResponse)
AfterResponse is used to process the response. This method should be used when the response body is not in HTML format.
func (*Crawler) BeforeRequest ¶
func (c *Crawler) BeforeRequest(f HandleRequest)
BeforeRequest is used to process requests, such as setting headers, passing context, etc.
func (*Crawler) DialWithProxy ¶
func (*Crawler) DialWithProxyAndTimeout ¶
func (*Crawler) ParseHTML ¶
func (c *Crawler) ParseHTML(selector string, f HandleHTML)
ParseHTML parses the HTML to find the data you need and then processes that data.
func (*Crawler) PostMultipart ¶
PostMultipart
func (Crawler) ProxyPoolAmount ¶
ProxyPoolAmount returns the number of proxies in the proxy pool
type CrawlerOption ¶
type CrawlerOption func(*Crawler)
func WithCache ¶
func WithCache(cc cache.Cache, compressed bool, cacheFileds ...string) CrawlerOption
WithCache 使用缓存,可以选择是否压缩缓存的响应。使用缓存时,如果发出的是 POST 请求,最好传入能代表请求体的唯一性的缓存字段,可以是零个、一个或多个。
注意:当不传入缓存字段时,将会默认采用整个请求体作为缓存标识,但由于 map 无序,同一个请求体生成的 key 很难保证相同,所以可能会有同一个请求缓存多次,或者无法从缓存中读取已请求过的请求的响应的情况出现。
func WithConcurrency ¶
func WithConcurrency(count uint64) CrawlerOption
WithConcurrency 使用并发,参数为要创建的协程池数量
func WithCookies ¶
func WithCookies(cookies map[string]string) CrawlerOption
func WithLogger ¶
func WithLogger(lop *LogOp) CrawlerOption
func WithRawCookie ¶
func WithRawCookie(cookie string) CrawlerOption
func WithRetry ¶
func WithRetry(count uint32, cond RetryConditions) CrawlerOption
WithRetry 请求失败时重试多少次,什么条件的响应是请求失败
func WithUserAgent ¶
func WithUserAgent(ua string) CrawlerOption
type CustomRandomBoundary ¶
type CustomRandomBoundary func() string
CustomRandomBoundary generates a custom boundary
type HTMLParser ¶
type HTMLParser struct { Selector string Handle HandleHTML }
HTMLParser is used to parse html
type HandleResponse ¶
type HandleResponse func(r *Response)
HandleResponse is used to handle the response
type LogOp ¶
func (*LogOp) ToConsoleAndFile ¶
type MultipartForm ¶
type MultipartForm struct {
// contains filtered or unexported fields
}
MultipartForm 请求体的构造
func NewMultipartForm ¶
func NewMultipartForm(dash string, f CustomRandomBoundary) *MultipartForm
func (*MultipartForm) AppendFile ¶
func (mf *MultipartForm) AppendFile(name, filePath string) error
func (*MultipartForm) AppendString ¶
func (mf *MultipartForm) AppendString(name, value string)
func (*MultipartForm) Boundary ¶
func (mf *MultipartForm) Boundary() string
Boundary returns the Writer's boundary.
func (*MultipartForm) Bytes ¶
func (mf *MultipartForm) Bytes() []byte
func (*MultipartForm) FormDataContentType ¶
func (mf *MultipartForm) FormDataContentType() string
FormDataContentType returns the Content-Type for an HTTP multipart/form-data with this Writer's Boundary.
type Request ¶
type Request struct { // 访问的链接 URL string // 请求方法 Method string // 请求头 Headers *fasthttp.RequestHeader // 请求和响应之间共享的上下文 Ctx pctx.Context // 请求体 Body []byte // 唯一标识 ID uint32 // contains filtered or unexported fields }
func AcquireRequest ¶
func AcquireRequest() *Request
AcquireRequest returns an empty Request instance from request pool.
The returned Request instance may be passed to ReleaseRequest when it is no longer needed. This allows Request recycling, reduces GC pressure and usually improves performance.
func (Request) NumberOfRetries ¶
func (*Request) SetContentType ¶
func (*Request) SetHeaders ¶
type Response ¶
type Response struct { // 响应状态码 StatusCode int // 二进制响应体 Body []byte // 请求和响应之间共享的上下文 Ctx ctx.Context `json:"-"` // 响应对应的请求 Request *Request `json:"-"` // 响应头 Headers fasthttp.ResponseHeader // 是否从缓存中取得的响应 FromCache bool }
func AcquireResponse ¶
func AcquireResponse() *Response
AcquireResponse returns an empty Response instance from response pool.
The returned Response instance may be passed to ReleaseResponse when it is no longer needed. This allows Response recycling, reduces GC pressure and usually improves performance.