Documentation ¶
Index ¶
- Constants
- Variables
- func ReleaseRequest(req *Request)
- func ReleaseResponse(resp *Response)
- type AcquireProxies
- type Crawler
- func (c *Crawler) AfterResponse(f HandleResponse)
- func (c *Crawler) BeforeRequest(f HandleRequest)
- func (c *Crawler) ClearCache() error
- func (c *Crawler) Clone() *Crawler
- func (c *Crawler) DialWithProxy() fasthttp.DialFunc
- func (c *Crawler) DialWithProxyAndTimeout(timeout time.Duration) fasthttp.DialFunc
- func (c *Crawler) Error(err error)
- func (c *Crawler) Get(URL string) error
- func (c *Crawler) ParseHTML(selector string, f HandleHTML)
- func (c *Crawler) Post(URL string, requestData map[string]string, ctx pctx.Context) error
- func (c *Crawler) PostJSON(URL string, requestData map[string]interface{}, ctx pctx.Context) error
- func (c *Crawler) PostMultipart(URL string, form *MultipartForm, ctx pctx.Context) error
- func (c *Crawler) PostRaw(URL string, body []byte, ctx pctx.Context) error
- func (c Crawler) ProxyPoolAmount() int
- func (c *Crawler) Wait()
- type CrawlerOption
- func WithCache(cc cache.Cache, compressed bool, cacheFileds ...string) CrawlerOption
- func WithConcurrency(count uint64) CrawlerOption
- func WithCookies(cookies map[string]string) CrawlerOption
- func WithLogger(lop *LogOp) CrawlerOption
- func WithProxy(proxyURL string) CrawlerOption
- func WithProxyPool(proxyURLs []string) CrawlerOption
- func WithRawCookie(cookie string) CrawlerOption
- func WithRetry(count uint32, cond RetryConditions) CrawlerOption
- func WithUserAgent(ua string) CrawlerOption
- type CustomRandomBoundary
- type HTMLParser
- type HandleHTML
- type HandleRequest
- type HandleResponse
- type LogOp
- type MultipartForm
- type Pool
- type Request
- func (r *Request) Abort()
- func (r *Request) AllowRedirect(maxRedirectsCount uint)
- func (r Request) Get(u string) error
- func (r Request) Hash() (string, error)
- func (r *Request) New(method, URL string, body []byte) *Request
- func (r Request) NumberOfRetries() uint32
- func (r Request) Post(URL string, requestData map[string]string, ctx pctx.Context) error
- func (r *Request) Reset()
- func (r *Request) SetContentType(contentType string)
- func (r *Request) SetHeaders(headers map[string]string)
- type Response
- type RetryConditions
- type Task
Constants ¶
const ( RUNNING = 1 STOPED = 0 )
Running status constants for the task pool.
Variables ¶
var ( ErrInvalidProxy = errors.New("the proxy ip should contain the protocol") ErrUnknownProtocol = errors.New("only support http and socks5 protocol") ErrProxyExpired = errors.New("the proxy ip has expired") ErrOnlyOneProxyIP = errors.New("unable to delete the only proxy ip") ErrUnkownProxyIP = errors.New("proxy is unkown") ErrEmptyProxyPool = errors.New("after deleting the invalid proxy, the current proxy ip pool is empty") ErrNoCacheSet = errors.New("no cache set") )
var ( // return if pool size <= 0 ErrInvalidPoolCap = errors.New("invalid pool cap") // put task but pool already closed ErrPoolAlreadyClosed = errors.New("pool already closed") )
Errors returned by the task pool.
var (
ErrIncorrectResponse = errors.New("the response status code is not 200 or 201")
)
Functions ¶
func ReleaseRequest ¶ added in v0.1.4
func ReleaseRequest(req *Request)
ReleaseRequest returns req acquired via AcquireRequest to request pool.
It is forbidden to access req and/or its members after returning it to the request pool.
func ReleaseResponse ¶ added in v0.1.4
func ReleaseResponse(resp *Response)
ReleaseResponse returns resp acquired via AcquireResponse to response pool.
It is forbidden to access resp and/or its members after returning it to the response pool.
Types ¶
type AcquireProxies ¶ added in v0.1.4
AcquireProxies can request a specified number of proxy IPs from the APIs of some proxy websites.
type Crawler ¶
type Crawler struct { // UserAgent is the User-Agent string used by HTTP requests UserAgent string // Context can be used across multiple goroutines to exit or cancel them Context context.Context // contains filtered or unexported fields }
Crawler is the provider of crawlers
func NewCrawler ¶
func NewCrawler(opts ...CrawlerOption) *Crawler
NewCrawler creates a new Crawler instance with some CrawlerOptions
func (*Crawler) AfterResponse ¶ added in v0.0.3
func (c *Crawler) AfterResponse(f HandleResponse)
AfterResponse is used to process the response; this method should be used for response bodies in non-HTML formats.
func (*Crawler) BeforeRequest ¶ added in v0.0.3
func (c *Crawler) BeforeRequest(f HandleRequest)
BeforeRequest is used to process requests, such as setting headers, passing context, etc.
func (*Crawler) ClearCache ¶ added in v0.1.4
ClearCache will clear all cache
func (*Crawler) DialWithProxy ¶ added in v0.0.7
func (*Crawler) DialWithProxyAndTimeout ¶ added in v0.0.7
func (*Crawler) ParseHTML ¶ added in v0.0.8
func (c *Crawler) ParseHTML(selector string, f HandleHTML)
ParseHTML can parse html to find the data you need, and process the data
func (*Crawler) PostJSON ¶ added in v0.1.4
PostJSON is used to send a POST request body in json format
func (*Crawler) PostMultipart ¶
PostMultipart sends a POST request with a multipart/form-data request body.
func (Crawler) ProxyPoolAmount ¶ added in v0.0.6
ProxyPoolAmount returns the number of proxies in the proxy pool
type CrawlerOption ¶
type CrawlerOption func(*Crawler)
func WithCache ¶ added in v0.1.4
func WithCache(cc cache.Cache, compressed bool, cacheFileds ...string) CrawlerOption
WithCache enables caching, with the option to compress cached responses. When caching is enabled and a POST request is issued, it is best to pass in cache fields that uniquely identify the request body; zero, one, or more fields may be given.
Note: when no cache fields are passed in, the entire request body is used as the cache key by default. Because maps are unordered, the key generated from the same request body is hard to keep identical, so the same request may be cached more than once, or a previously requested response may fail to be read from the cache.
func WithConcurrency ¶ added in v0.0.9
func WithConcurrency(count uint64) CrawlerOption
WithConcurrency enables concurrency; the parameter is the size of the goroutine pool to create.
func WithCookies ¶
func WithCookies(cookies map[string]string) CrawlerOption
func WithLogger ¶ added in v0.1.4
func WithLogger(lop *LogOp) CrawlerOption
func WithProxyPool ¶ added in v0.0.2
func WithProxyPool(proxyURLs []string) CrawlerOption
WithProxyPool uses a pool of proxies.
func WithRawCookie ¶ added in v0.1.4
func WithRawCookie(cookie string) CrawlerOption
func WithRetry ¶
func WithRetry(count uint32, cond RetryConditions) CrawlerOption
WithRetry sets how many times to retry when a request fails, and which response conditions count as a failed request.
func WithUserAgent ¶
func WithUserAgent(ua string) CrawlerOption
type CustomRandomBoundary ¶ added in v0.0.3
type CustomRandomBoundary func() string
CustomRandomBoundary generates a custom boundary
type HTMLParser ¶ added in v0.0.8
type HTMLParser struct { Selector string Handle HandleHTML }
HTMLParser is used to parse html
type HandleHTML ¶ added in v0.0.8
type HandleHTML func(he *html.HTMLElement)
HandleHTML is used to process html
type HandleRequest ¶ added in v0.0.3
type HandleRequest func(r *Request)
HandleRequest is used to patch the request
type HandleResponse ¶ added in v0.0.3
type HandleResponse func(r *Response)
HandleResponse is used to handle the response
type LogOp ¶ added in v0.1.4
func (*LogOp) ToConsoleAndFile ¶ added in v0.1.4
type MultipartForm ¶ added in v0.1.4
type MultipartForm struct {
// contains filtered or unexported fields
}
MultipartForm builds a multipart request body.
func NewMultipartForm ¶ added in v0.1.4
func NewMultipartForm(dash string, f CustomRandomBoundary) *MultipartForm
func (*MultipartForm) AppendFile ¶ added in v0.1.4
func (mf *MultipartForm) AppendFile(name, filePath string) error
func (*MultipartForm) AppendString ¶ added in v0.1.4
func (mf *MultipartForm) AppendString(name, value string)
func (*MultipartForm) Boundary ¶ added in v0.1.4
func (mf *MultipartForm) Boundary() string
Boundary returns the Writer's boundary.
func (*MultipartForm) Bytes ¶ added in v0.1.4
func (mf *MultipartForm) Bytes() []byte
func (*MultipartForm) FormDataContentType ¶ added in v0.1.4
func (mf *MultipartForm) FormDataContentType() string
FormDataContentType returns the Content-Type for an HTTP multipart/form-data with this Writer's Boundary.
type Pool ¶ added in v0.0.9
Pool is a task pool.
func (*Pool) GetRunningWorkers ¶ added in v0.0.9
GetRunningWorkers gets the running workers.
type Request ¶ added in v0.0.3
type Request struct { // URL is the link to visit URL string // Method is the request method Method string // Headers are the request headers Headers *fasthttp.RequestHeader // Ctx is the context shared between the request and the response Ctx pctx.Context // Body is the request body Body []byte // ID is the unique identifier ID uint32 // contains filtered or unexported fields }
func AcquireRequest ¶ added in v0.1.4
func AcquireRequest() *Request
AcquireRequest returns an empty Request instance from request pool.
The returned Request instance may be passed to ReleaseRequest when it is no longer needed. This allows Request recycling, reduces GC pressure and usually improves performance.
func (*Request) AllowRedirect ¶ added in v0.1.5
AllowRedirect allows at most maxRedirectsCount redirects.
Redirects are fairly common, but they hurt crawler efficiency; set this value according to your actual situation.
func (Request) NumberOfRetries ¶ added in v0.0.5
func (*Request) SetContentType ¶ added in v0.0.3
func (*Request) SetHeaders ¶ added in v0.0.4
type Response ¶ added in v0.0.3
type Response struct { // StatusCode is the response status code StatusCode int // Body is the binary response body Body []byte // Ctx is the context shared between the request and the response Ctx ctx.Context `json:"-"` // Request is the request corresponding to this response Request *Request `json:"-"` // Headers are the response headers Headers fasthttp.ResponseHeader // FromCache reports whether the response was fetched from the cache FromCache bool }
func AcquireResponse ¶ added in v0.1.4
func AcquireResponse() *Response
AcquireResponse returns an empty Response instance from response pool.
The returned Response instance may be passed to ReleaseResponse when it is no longer needed. This allows Response recycling, reduces GC pressure and usually improves performance.