Documentation ¶
Index ¶
- Constants
- Variables
- func ReleaseRequest(req *Request)
- func ReleaseResponse(resp *Response, releaseCtx bool)
- type AcquireProxies
- type Cache
- type CacheCondition
- type CacheField
- type CacheModel
- type ComplementProxyPool
- type Crawler
- func (c *Crawler) AddProxy(newProxy string)
- func (c *Crawler) AfterResponse(f HandleResponse)
- func (c *Crawler) BeforeRequest(f HandleRequest)
- func (c *Crawler) ClearCache() error
- func (c *Crawler) Clone() *Crawler
- func (c *Crawler) Debug(msg string, args ...log.Arg)
- func (c *Crawler) Error(err error, args ...log.Arg)
- func (c *Crawler) Fatal(err error, args ...log.Arg)
- func (c *Crawler) FatalOrPanic(err error)
- func (c *Crawler) Get(URL string) error
- func (c *Crawler) GetWithCtx(URL string, ctx pctx.Context) error
- func (c *Crawler) Info(msg string, args ...log.Arg)
- func (c *Crawler) ParseHTML(selector string, f HandleHTML)
- func (c *Crawler) ParseJSON(strict bool, f HandleJSON)
- func (c *Crawler) Post(URL string, requestData map[string]string, ctx pctx.Context) error
- func (c *Crawler) PostJSON(URL string, requestData map[string]interface{}, ctx pctx.Context) error
- func (c *Crawler) PostMultipart(URL string, form *MultipartForm, ctx pctx.Context) error
- func (c *Crawler) PostRaw(URL string, body []byte, ctx pctx.Context) error
- func (c *Crawler) ProxyDialerWithTimeout(proxyAddr string, timeout time.Duration) fasthttp.DialFunc
- func (c Crawler) ProxyInUse() string
- func (c Crawler) ProxyPoolAmount() int
- func (c *Crawler) SetCache(cc Cache, compressed bool, cacheCondition CacheCondition, ...)
- func (c *Crawler) SetConcurrency(count uint64, blockPanic bool)
- func (c *Crawler) SetProxyInvalidCondition(condition ProxyInvalidCondition)
- func (c *Crawler) SetRetry(count uint32, cond RetryConditions)
- func (c *Crawler) Wait()
- func (c *Crawler) Warning(msg string, args ...log.Arg)
- type CrawlerOption
- func EnableIPv6() CrawlerOption
- func SkipVerification() CrawlerOption
- func WithCache(cc Cache, compressed bool, cacheCondition CacheCondition, ...) CrawlerOption
- func WithComplementProxyPool(f ComplementProxyPool) CrawlerOption
- func WithConcurrency(count uint64, blockPanic bool) CrawlerOption
- func WithCookies(cookies map[string]string) CrawlerOption
- func WithDefaultCache() CrawlerOption
- func WithDefaultLogger() CrawlerOption
- func WithLogger(logger *log.Logger) CrawlerOption
- func WithProxy(proxyURL string) CrawlerOption
- func WithProxyPool(proxyURLs []string) CrawlerOption
- func WithRawCookie(cookie string) CrawlerOption
- func WithRetry(count uint32, cond RetryConditions) CrawlerOption
- func WithUserAgent(ua string) CrawlerOption
- type CustomRandomBoundary
- type HTMLParser
- type HandleHTML
- type HandleJSON
- type HandleRequest
- type HandleResponse
- type JSONParser
- type MultipartForm
- type Pool
- type ProxyInvalidCondition
- type Request
- func (r *Request) Abort()
- func (r Request) AbsoluteURL(src string) string
- func (r *Request) AllowRedirect(maxRedirectsCount uint)
- func (r Request) Get(u string) error
- func (r Request) GetWithCache(URL string, cacheFields ...CacheField) error
- func (r Request) Hash() (string, error)
- func (r *Request) New(method, URL string, body []byte) *Request
- func (r Request) NumberOfRetries() uint32
- func (r Request) Post(URL string, requestData map[string]string) error
- func (r Request) PostJSON(URL string, requestData map[string]interface{}) error
- func (r Request) PostJSONWithCache(URL string, requestData map[string]interface{}, cacheFields ...CacheField) error
- func (r Request) PostMultipart(URL string, form *MultipartForm) error
- func (r Request) PostMultipartWithCache(URL string, form *MultipartForm, cacheFields ...CacheField) error
- func (r Request) PostWithCache(URL string, requestData map[string]string, cacheFields ...CacheField) error
- func (r Request) Request(method, URL string, cachedMap map[string]string, body []byte) error
- func (r *Request) Reset()
- func (r *Request) SetContentType(contentType string)
- func (r *Request) SetHeaders(headers map[string]string)
- func (r *Request) SetTimeout(t time.Duration)
- type Response
- func (r *Response) BodyGunzip() ([]byte, error)
- func (r Response) ClientIP() string
- func (r *Response) ContentType() string
- func (r *Response) GetSetCookie() string
- func (r *Response) Invalidate()
- func (r Response) IsTimeout() bool
- func (r Response) LocalIP() string
- func (r Response) Marshal() ([]byte, error)
- func (r *Response) Reset(releaseCtx bool)
- func (r *Response) Save(fileName string) error
- func (r *Response) String() string
- func (r *Response) Unmarshal(cachedBody []byte) error
- type RetryConditions
- type Task
Constants ¶
const ( // A key or field from URL query parameters QueryParam cacheFieldType = iota // A key or field from request body parameters RequestBodyParam )
const ( RUNNING = 1 STOPED = 0 )
running status
Variables ¶
var ( ErrNoCacheSet = errors.New("no cache set") ErrRequestFailed = errors.New("request failed") ErrTimeout = errors.New("timeout, and it is recommended to try a new proxy if you are using a proxy pool") ErrInvalidCacheTypeCode = errors.New("invalid cache type code") ErrNotAllowedCacheFieldType = errors.New("only query parameters are allowed as cached fields in `GET` requests") )
var ( // return if pool size <= 0 ErrInvalidPoolCap = errors.New("invalid pool cap") // put task but pool already closed ErrPoolAlreadyClosed = errors.New("pool already closed") // only the error type can be captured and processed ErrUnkownType = errors.New("recover only allows error type, but an unknown type is received") )
errors
var (
ErrIncorrectResponse = errors.New("the response status code is not 20X")
)
Functions ¶
func ReleaseRequest ¶
func ReleaseRequest(req *Request)
ReleaseRequest returns req acquired via AcquireRequest to request pool.
It is forbidden to access req and/or its members after returning it to the request pool.
func ReleaseResponse ¶
ReleaseResponse returns resp acquired via AcquireResponse to response pool.
It is forbidden to access resp and/or its members after returning it to the response pool.
Types ¶
type Cache ¶ added in v0.2.3
type Cache interface { // 是否开启压缩。压缩后能减小数据量,但压缩过程会耗时。 // 如果原数据长度很长,压缩耗时要比查询耗时低得多,此时开启压缩功能是最佳选择。 // 但如果原数据长度较短,压缩或不压缩,整体耗时区别不大。 // 是否开启压缩,需要自行测试抉择。 Compressed(yes bool) // 初始化,用来迁移数据库 / 表,和一些与数据库有关的前期准备工作 Init() error // 当前请求是否已缓存过,如果缓存过,则返回缓存中的响应 IsCached(key string) ([]byte, bool) // 将没有缓存过的请求保存到缓存中 Cache(key string, val []byte) error // 清除全部缓存 Clear() error }
type CacheCondition ¶ added in v0.2.0
type CacheField ¶ added in v0.2.4
type CacheField struct { Field string // contains filtered or unexported fields }
func NewQueryParamField ¶ added in v0.2.4
func NewQueryParamField(field string) CacheField
func NewRequestBodyParamField ¶ added in v0.2.4
func NewRequestBodyParamField(field string) CacheField
func (CacheField) String ¶ added in v0.2.4
func (cf CacheField) String() string
type CacheModel ¶ added in v0.2.3
func (CacheModel) TableName ¶ added in v0.2.3
func (CacheModel) TableName() string
type ComplementProxyPool ¶ added in v0.2.0
type ComplementProxyPool func() []string
type Crawler ¶
type Crawler struct { // UserAgent is the User-Agent string used by HTTP requests UserAgent string // 在多协程中这个上下文管理可以用来退出或取消多个协程 Context context.Context // contains filtered or unexported fields }
Crawler is the provider of crawlers
func NewCrawler ¶
func NewCrawler(opts ...CrawlerOption) *Crawler
NewCrawler creates a new Crawler instance with some CrawlerOptions
func (*Crawler) AfterResponse ¶
func (c *Crawler) AfterResponse(f HandleResponse)
AfterResponse is used to process the response. This method should be used for response bodies in non-HTML format.
func (*Crawler) BeforeRequest ¶
func (c *Crawler) BeforeRequest(f HandleRequest)
BeforeRequest is used to process requests, such as setting headers, passing context, etc.
func (*Crawler) FatalOrPanic ¶ added in v0.2.0
func (*Crawler) GetWithCtx ¶ added in v0.2.0
GetWithCtx is used to send GET requests with a context
func (*Crawler) ParseHTML ¶
func (c *Crawler) ParseHTML(selector string, f HandleHTML)
ParseHTML can parse html to find the data you need, and process the data
func (*Crawler) ParseJSON ¶ added in v0.2.4
func (c *Crawler) ParseJSON(strict bool, f HandleJSON)
ParseJSON can parse json to find the data you need, and process the data.
If you set `strict` to true, responses that do not contain `application/json` in the content-type of the response header will not be processed.
It is recommended to do full processing of the json response in one call to `ParseJSON` instead of multiple calls to `ParseJSON`.
func (*Crawler) PostMultipart ¶
PostMultipart
func (*Crawler) ProxyDialerWithTimeout ¶ added in v0.2.0
func (Crawler) ProxyInUse ¶ added in v0.2.2
func (Crawler) ProxyPoolAmount ¶
ProxyPoolAmount returns the number of proxies in the proxy pool
func (*Crawler) SetCache ¶ added in v0.2.4
func (c *Crawler) SetCache(cc Cache, compressed bool, cacheCondition CacheCondition, cacheFileds ...CacheField)
func (*Crawler) SetConcurrency ¶ added in v0.2.2
SetConcurrency 使用并发,参数为要创建的协程池数量
func (*Crawler) SetProxyInvalidCondition ¶ added in v0.2.0
func (c *Crawler) SetProxyInvalidCondition(condition ProxyInvalidCondition)
func (*Crawler) SetRetry ¶ added in v0.2.2
func (c *Crawler) SetRetry(count uint32, cond RetryConditions)
type CrawlerOption ¶
type CrawlerOption func(*Crawler)
func EnableIPv6 ¶ added in v0.2.0
func EnableIPv6() CrawlerOption
func SkipVerification ¶
func SkipVerification() CrawlerOption
SkipVerification will skip verifying the certificate when you access the `https` protocol
func WithCache ¶
func WithCache(cc Cache, compressed bool, cacheCondition CacheCondition, cacheFileds ...CacheField) CrawlerOption
WithCache 使用缓存,可以选择是否压缩缓存的响应。 使用缓存时,如果发出的是 POST 请求,最好传入能 代表请求体的唯一性的缓存字段,可以是零个、一个或多个。
注意:当不传入缓存字段时,将会默认采用整个请求体作为 缓存标识,但由于 map 无序,同一个请求体生成的 key 很 难保证相同,所以可能会有同一个请求缓存多次,或者无法 从缓存中读取已请求过的请求的响应的情况出现。
func WithComplementProxyPool ¶ added in v0.2.0
func WithComplementProxyPool(f ComplementProxyPool) CrawlerOption
func WithConcurrency ¶
func WithConcurrency(count uint64, blockPanic bool) CrawlerOption
WithConcurrency 使用并发,参数为要创建的协程池数量
func WithCookies ¶
func WithCookies(cookies map[string]string) CrawlerOption
func WithDefaultCache ¶ added in v0.2.0
func WithDefaultCache() CrawlerOption
WithDefaultCache 默认缓存为 sqlite3,不压缩
func WithDefaultLogger ¶ added in v0.2.0
func WithDefaultLogger() CrawlerOption
func WithLogger ¶
func WithLogger(logger *log.Logger) CrawlerOption
func WithRawCookie ¶
func WithRawCookie(cookie string) CrawlerOption
func WithRetry ¶
func WithRetry(count uint32, cond RetryConditions) CrawlerOption
WithRetry 请求失败时重试多少次,什么条件的响应是请求失败
func WithUserAgent ¶
func WithUserAgent(ua string) CrawlerOption
type CustomRandomBoundary ¶
type CustomRandomBoundary func() string
CustomRandomBoundary generates a custom boundary
type HTMLParser ¶
type HTMLParser struct { Selector string Handle HandleHTML }
HTMLParser is used to parse html
type HandleHTML ¶
type HandleHTML func(he *html.HTMLElement, r *Response)
HandleHTML is used to process html
type HandleJSON ¶ added in v0.2.4
type HandleJSON func(j json.JSONResult)
type HandleResponse ¶
type HandleResponse func(r *Response)
HandleResponse is used to handle the response
type JSONParser ¶ added in v0.2.4
type JSONParser struct { Handle HandleJSON // contains filtered or unexported fields }
JSONParser is used to parse json
type MultipartForm ¶
type MultipartForm struct {
// contains filtered or unexported fields
}
MultipartForm 请求体的构造
func NewMultipartForm ¶
func NewMultipartForm(dash string, f CustomRandomBoundary) *MultipartForm
func (*MultipartForm) AppendFile ¶
func (mf *MultipartForm) AppendFile(name, filePath string) error
func (*MultipartForm) AppendString ¶
func (mf *MultipartForm) AppendString(name, value string)
func (*MultipartForm) Boundary ¶
func (mf *MultipartForm) Boundary() string
Boundary returns the Writer's boundary.
func (*MultipartForm) Bytes ¶
func (mf *MultipartForm) Bytes() []byte
func (*MultipartForm) FormDataContentType ¶
func (mf *MultipartForm) FormDataContentType() string
FormDataContentType returns the Content-Type for an HTTP multipart/form-data with this Writer's Boundary.
type ProxyInvalidCondition ¶ added in v0.2.0
type Request ¶
type Request struct { // 访问的链接 URL string // 请求方法 Method string // 请求头 Headers *fasthttp.RequestHeader // 请求和响应之间共享的上下文 Ctx pctx.Context // 请求体 Body []byte // 唯一标识 ID uint32 // contains filtered or unexported fields }
func AcquireRequest ¶
func AcquireRequest() *Request
AcquireRequest returns an empty Request instance from request pool.
The returned Request instance may be passed to ReleaseRequest when it is no longer needed. This allows Request recycling, reduces GC pressure and usually improves performance.
func (Request) AbsoluteURL ¶
AbsoluteURL returns the resolved absolute URL of a URL chunk. AbsoluteURL returns an empty string if the URL chunk is a fragment or could not be parsed.
func (*Request) AllowRedirect ¶
AllowRedirect allows up to `maxRedirectsCount` times to be redirected.
func (Request) GetWithCache ¶ added in v0.2.0
func (r Request) GetWithCache(URL string, cacheFields ...CacheField) error
func (Request) NumberOfRetries ¶
func (Request) PostJSONWithCache ¶ added in v0.2.0
func (r Request) PostJSONWithCache(URL string, requestData map[string]interface{}, cacheFields ...CacheField) error
func (Request) PostMultipart ¶ added in v0.2.0
func (r Request) PostMultipart(URL string, form *MultipartForm) error
func (Request) PostMultipartWithCache ¶ added in v0.2.0
func (r Request) PostMultipartWithCache(URL string, form *MultipartForm, cacheFields ...CacheField) error
func (Request) PostWithCache ¶ added in v0.2.0
func (*Request) SetContentType ¶
func (*Request) SetHeaders ¶
func (*Request) SetTimeout ¶ added in v0.1.9
SetTimeout sets the waiting time for each request before the remote end returns a response.
The function doesn't follow redirects.
type Response ¶
type Response struct { // 响应状态码 StatusCode int // 二进制响应体 Body []byte // 请求和响应之间共享的上下文 Ctx ctx.Context `json:"-"` // 响应对应的请求 Request *Request `json:"-"` // 响应头 Headers fasthttp.ResponseHeader // 是否从缓存中取得的响应 FromCache bool // contains filtered or unexported fields }
func AcquireResponse ¶
func AcquireResponse() *Response
AcquireResponse returns an empty Response instance from response pool.
The returned Response instance may be passed to ReleaseResponse when it is no longer needed. This allows Response recycling, reduces GC pressure and usually improves performance.
func (*Response) BodyGunzip ¶ added in v0.2.4
BodyGunzip returns un-gzipped body data.
This method may be used if the response header contains 'Content-Encoding: gzip' for reading un-gzipped body. Use Body for reading gzipped response body.
func (*Response) ContentType ¶
func (*Response) GetSetCookie ¶
func (*Response) Invalidate ¶ added in v0.2.4
func (r *Response) Invalidate()
Invalidate marks the current response as invalid and skips the html parsing process