Documentation ¶
Index ¶
- func AllowedDomains(domains []string) func(*Spider)
- func BasicAuthHeader(username string, password string) string
- func BytesToString(b []byte) string
- func CloseFunc(fc func(*Spider)) func(*Spider)
- func ConcurrentItems(items int) func(*Setting)
- func ConcurrentRequests(requests int) func(*Setting)
- func ConcurrentRequestsPerDomain(requests int) func(*Setting)
- func CopyNew(old interface{}) interface{}
- func DefaultParseFunc(fc ParseFunc) func(*Spider)
- func DefaultRequestHeaders(headers http.Header) func(*Setting)
- func DepthLimit(depth int) func(*Setting)
- func DepthPriority(depth int) func(*Setting)
- func DepthStatsVerbose(depth bool) func(*Setting)
- func DownloadDelay(t time.Duration) func(*Setting)
- func DownloadMaxSize(size int) func(*Setting)
- func DownloadStats(stats bool) func(*Setting)
- func DownloadTimeout(t time.Duration) func(*Setting)
- func DownloadWarnSize(size int) func(*Setting)
- func DownloaderClass(c Downloader) func(*Setting)
- func DownloaderMiddlewares(d map[int]DownloaderMiddlewarer) func(*Setting)
- func DownloaderMiddlewaresBase(d map[int]DownloaderMiddlewarer) func(*Setting)
- func DropItem(reason string) error
- func ErrFile(file string) func(*Setting)
- func FilterClass(f DupeFilter) func(*Setting)
- func FilterEnabled(filter bool) func(*Setting)
- func FindScriptVar(b []byte, v string) string
- func GetHeaderSize(header http.Header) int
- func GetRequestSize(request *Request) int
- func GetResponseSize(response *Response) int
- func HttpAuthDomain(h []string) func(*Setting)
- func HttpError(reason string) error
- func HttpErrorAllowAll(http bool) func(*Setting)
- func HttpErrorAllowedCodes(http []int) func(*Setting)
- func HttpPass(h string) func(*Setting)
- func HttpUser(h string) func(*Setting)
- func IgnoreRequest(reason string) error
- func InitLog(logPath, errPath string, logLevel zapcore.Level) *zap.SugaredLogger
- func ItemPipelines(i map[int]ItemPipeliner) func(*Setting)
- func ItemPipelinesBase(i map[int]ItemPipeliner) func(*Setting)
- func LogEnabled(log bool) func(*Setting)
- func LogFile(log string) func(*Setting)
- func LogLevel(log zapcore.Level) func(*Setting)
- func MaxRetryTimes(retry int) func(*Setting)
- func Name(name string) func(*Spider)
- func NewRequestLogger(log *zap.SugaredLogger, req *Request) *zap.SugaredLogger
- func NewResponseLogger(log *zap.SugaredLogger, response *Response) *zap.SugaredLogger
- func RandomizeDownloadDelay(delay bool) func(*Setting)
- func ReadRequestBody(request *Request) []byte
- func ResponseMaxActiveSize(size int) func(*Setting)
- func RetryEnabled(retry bool) func(*Setting)
- func RetryHttpCodes(retry []int) func(*Setting)
- func RetryPriorityAdjust(retry int) func(*Setting)
- func SanitizeFileName(fileName string) string
- func SchedulerClass(c Scheduler) func(*Setting)
- func SchedulerPriorityQueue(pq PriorityQueuer) func(*Setting)
- func Settings(settings *Setting) func(*Spider)
- func SpiderMiddlewares(m map[int]SpiderMiddlewarer) func(*Setting)
- func SpiderMiddlewaresBase(m map[int]SpiderMiddlewarer) func(*Setting)
- func StartRequestsFunc(fc func(*Spider) []*Request) func(*Spider)
- func StartUrls(urls []string) func(*Spider)
- func StringToBytes(s string) []byte
- func UrlLengthLimit(limits int) func(*Setting)
- func UrlSafeBase64Decode(source string) []byte
- func UrlSafeBase64Encode(source []byte) string
- func UserAgent(ua string) func(*Setting)
- type BaseItemPipeline
- type BaseMiddleware
- func (mw *BaseMiddleware) FromSpider(spider *Spider)
- func (mw *BaseMiddleware) GetModuleName() string
- func (mw *BaseMiddleware) ProcessException(request *Request, err error, spider *Spider) RequestResponse
- func (mw *BaseMiddleware) ProcessRequest(request *Request, spider *Spider) RequestResponse
- func (mw *BaseMiddleware) ProcessResponse(request *Request, response *Response, spider *Spider) RequestResponse
- func (mw *BaseMiddleware) ProcessSpiderException(response *Response, err error, spider *Spider) RequestItems
- func (mw *BaseMiddleware) ProcessSpiderInput(response *Response, spider *Spider)
- func (mw *BaseMiddleware) ProcessSpiderOutput(response *Response, result RequestItems, spider *Spider) RequestItems
- func (mw *BaseMiddleware) ProcessStartRequests(result []*Request, spider *Spider) []*Request
- type CloseSpiderFunc
- type Context
- func (c *Context) ForEach(fn func(k string, v interface{}) interface{}) []interface{}
- func (c *Context) Get(key string) interface{}
- func (c *Context) GetBool(key string) bool
- func (c *Context) GetBoolWithDefault(key string, dft bool) bool
- func (c *Context) GetFloat64(key string) float64
- func (c *Context) GetFloat64WithDefault(key string, dft float64) float64
- func (c *Context) GetInt(key string) int
- func (c *Context) GetIntWithDefault(key string, dft int) int
- func (c *Context) GetString(key string) string
- func (c *Context) GetStringWithDefault(key string, dft string) string
- func (c *Context) GetWithDefault(key string, dft interface{}) interface{}
- func (c *Context) Has(key string) bool
- func (c *Context) Put(key string, value interface{})
- type DefaultDownloader
- func (d *DefaultDownloader) Close()
- func (d *DefaultDownloader) Fetch(request *Request) (*Response, error)
- func (d *DefaultDownloader) FromSpider(spider *Spider)
- func (d *DefaultDownloader) IsEmpty() bool
- func (d *DefaultDownloader) IsFree() bool
- func (d *DefaultDownloader) NextRequestCircle(heartbeat time.Duration)
- func (d *DefaultDownloader) ProcessDownloader(signal *Signal, spider *Spider)
- type DefaultDupeFilter
- type DefaultScheduler
- type DepthMiddleware
- type DownloadStatsMiddleware
- func (mw *DownloadStatsMiddleware) FromSpider(spider *Spider)
- func (mw *DownloadStatsMiddleware) ProcessException(request *Request, err error, spider *Spider) RequestResponse
- func (mw *DownloadStatsMiddleware) ProcessRequest(request *Request, spider *Spider) RequestResponse
- func (mw *DownloadStatsMiddleware) ProcessResponse(request *Request, response *Response, spider *Spider) RequestResponse
- type DownloadTimeoutMiddleware
- type Downloader
- type DownloaderMiddlewareManager
- func (d *DownloaderMiddlewareManager) FromSpider(spider *Spider)
- func (d *DownloaderMiddlewareManager) ProcessException(signal *Signal, spider *Spider) (index int)
- func (d *DownloaderMiddlewareManager) ProcessRequest(signal *Signal, spider *Spider) (index int)
- func (d *DownloaderMiddlewareManager) ProcessResponse(signal *Signal, spider *Spider) (index int)
- type DownloaderMiddlewarer
- type DownloaderSlot
- type DupeFilter
- type ErrBase
- type ErrDropItem
- type ErrHttpError
- type ErrIgnoreRequest
- type ErrorbackFunc
- type Failure
- type HttpAuthMiddleware
- type HttpErrorMiddleware
- type Item
- type ItemPipelineManager
- type ItemPipeliner
- type ItemSlot
- type LifoPriorityQueue
- func (q *LifoPriorityQueue) Close()
- func (q *LifoPriorityQueue) FromSpider(spider *Spider)
- func (q *LifoPriorityQueue) IsEmpty() bool
- func (q *LifoPriorityQueue) Peek() interface{}
- func (q *LifoPriorityQueue) Pop() interface{}
- func (q *LifoPriorityQueue) Push(v interface{}, i int)
- func (q *LifoPriorityQueue) Size() int
- type OpenSpiderFunc
- type ParseFunc
- type PriorityQueuer
- type ProcessExceptionFunc
- type ProcessItemFunc
- type ProcessRequestFunc
- type ProcessResponseFunc
- type ProcessSpiderExceptionFunc
- type ProcessSpiderInputFunc
- type ProcessSpiderOutputFunc
- type ProcessStartRequestsFunc
- type Request
- type RequestItems
- type RequestResponse
- type Response
- type ResponseSlot
- type RetryMiddleware
- type Scheduler
- type Setter
- type Setting
- type Signal
- type Spider
- type SpiderMiddlewareManager
- func (s *SpiderMiddlewareManager) FromSpider(spider *Spider)
- func (s *SpiderMiddlewareManager) ProcessSpiderException(signal *Signal, spider *Spider) (index int)
- func (s *SpiderMiddlewareManager) ProcessSpiderInput(signal *Signal, spider *Spider) (index int)
- func (s *SpiderMiddlewareManager) ProcessSpiderOutput(signal *Signal, spider *Spider) (index int)
- func (s *SpiderMiddlewareManager) ProcessStartRequests(signal *Signal, spider *Spider) (index int)
- type SpiderMiddlewarer
- type SpiderOutputData
- type StatsCollect
- func (s *StatsCollect) ClearStats()
- func (s *StatsCollect) GetStats() StatsMap
- func (s *StatsCollect) GetValue(key string, dft int) int
- func (s *StatsCollect) IncValue(key string, count int, start int)
- func (s *StatsCollect) MaxValue(key string, value int)
- func (s *StatsCollect) MinValue(key string, value int)
- func (s *StatsCollect) SetStats(stats StatsMap)
- func (s *StatsCollect) SetValue(key string, value int)
- type StatsCollector
- type StatsMap
- type UrlLengthMiddleware
- type UserAgentMiddleware
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func AllowedDomains ¶
func BasicAuthHeader ¶
func BytesToString ¶
func ConcurrentItems ¶
func ConcurrentRequests ¶
func DefaultParseFunc ¶
func DefaultRequestHeaders ¶
func DepthLimit ¶
func DepthPriority ¶
func DepthStatsVerbose ¶
func DownloadDelay ¶
func DownloadMaxSize ¶
func DownloadStats ¶
func DownloadTimeout ¶
func DownloadWarnSize ¶
func DownloaderClass ¶
func DownloaderClass(c Downloader) func(*Setting)
func DownloaderMiddlewares ¶
func DownloaderMiddlewares(d map[int]DownloaderMiddlewarer) func(*Setting)
func DownloaderMiddlewaresBase ¶
func DownloaderMiddlewaresBase(d map[int]DownloaderMiddlewarer) func(*Setting)
func FilterClass ¶
func FilterClass(f DupeFilter) func(*Setting)
func FilterEnabled ¶
func FindScriptVar ¶
func GetHeaderSize ¶
func GetRequestSize ¶
func GetResponseSize ¶
func HttpAuthDomain ¶
func HttpErrorAllowAll ¶
func HttpErrorAllowedCodes ¶
func IgnoreRequest ¶
func InitLog ¶
func InitLog(logPath, errPath string, logLevel zapcore.Level) *zap.SugaredLogger
初始化日志 logger
func ItemPipelines ¶
func ItemPipelines(i map[int]ItemPipeliner) func(*Setting)
func ItemPipelinesBase ¶
func ItemPipelinesBase(i map[int]ItemPipeliner) func(*Setting)
func LogEnabled ¶
func MaxRetryTimes ¶
func NewRequestLogger ¶
func NewRequestLogger(log *zap.SugaredLogger, req *Request) *zap.SugaredLogger
func NewResponseLogger ¶
func NewResponseLogger(log *zap.SugaredLogger, response *Response) *zap.SugaredLogger
func RandomizeDownloadDelay ¶
func ReadRequestBody ¶
func ResponseMaxActiveSize ¶
func RetryEnabled ¶
func RetryHttpCodes ¶
func RetryPriorityAdjust ¶
func SanitizeFileName ¶
SanitizeFileName replaces dangerous characters in a string so the return value can be used as a safe file name.
func SchedulerClass ¶
func SchedulerPriorityQueue ¶
func SchedulerPriorityQueue(pq PriorityQueuer) func(*Setting)
func SpiderMiddlewares ¶
func SpiderMiddlewares(m map[int]SpiderMiddlewarer) func(*Setting)
func SpiderMiddlewaresBase ¶
func SpiderMiddlewaresBase(m map[int]SpiderMiddlewarer) func(*Setting)
func StartRequestsFunc ¶
func StringToBytes ¶
func UrlLengthLimit ¶
func UrlSafeBase64Decode ¶
func UrlSafeBase64Encode ¶
Types ¶
type BaseItemPipeline ¶
type BaseItemPipeline struct {
	ModuleName string
	Logger     *zap.SugaredLogger
	Stats      StatsCollector
}
func (*BaseItemPipeline) CloseSpider ¶
func (i *BaseItemPipeline) CloseSpider(spider *Spider)
func (*BaseItemPipeline) FromSpider ¶
func (i *BaseItemPipeline) FromSpider(spider *Spider)
func (*BaseItemPipeline) GetModuleName ¶
func (i *BaseItemPipeline) GetModuleName() string
func (*BaseItemPipeline) OpenSpider ¶
func (i *BaseItemPipeline) OpenSpider(spider *Spider)
func (*BaseItemPipeline) ProcessItem ¶
func (i *BaseItemPipeline) ProcessItem(item *Item, spider *Spider)
type BaseMiddleware ¶
type BaseMiddleware struct {
	//中间件模块名称
	ModuleName string
	Logger     *zap.SugaredLogger
	Stats      StatsCollector
}
爬虫中间件及下载器中间件共用的基础中间件数据结构,实现了两者的全部接口
func (*BaseMiddleware) FromSpider ¶
func (mw *BaseMiddleware) FromSpider(spider *Spider)
func (*BaseMiddleware) GetModuleName ¶
func (mw *BaseMiddleware) GetModuleName() string
func (*BaseMiddleware) ProcessException ¶
func (mw *BaseMiddleware) ProcessException(request *Request, err error, spider *Spider) RequestResponse
func (*BaseMiddleware) ProcessRequest ¶
func (mw *BaseMiddleware) ProcessRequest(request *Request, spider *Spider) RequestResponse
func (*BaseMiddleware) ProcessResponse ¶
func (mw *BaseMiddleware) ProcessResponse(request *Request, response *Response, spider *Spider) RequestResponse
func (*BaseMiddleware) ProcessSpiderException ¶
func (mw *BaseMiddleware) ProcessSpiderException(response *Response, err error, spider *Spider) RequestItems
func (*BaseMiddleware) ProcessSpiderInput ¶
func (mw *BaseMiddleware) ProcessSpiderInput(response *Response, spider *Spider)
func (*BaseMiddleware) ProcessSpiderOutput ¶
func (mw *BaseMiddleware) ProcessSpiderOutput(response *Response, result RequestItems, spider *Spider) RequestItems
func (*BaseMiddleware) ProcessStartRequests ¶
func (mw *BaseMiddleware) ProcessStartRequests(result []*Request, spider *Spider) []*Request
type CloseSpiderFunc ¶
type CloseSpiderFunc func(*Spider)
type Context ¶
type Context struct {
// contains filtered or unexported fields
}
Context provides a tiny layer for passing data between callbacks.
func (*Context) GetBoolWithDefault ¶
func (*Context) GetFloat64 ¶
func (*Context) GetFloat64WithDefault ¶
func (*Context) GetStringWithDefault ¶
func (*Context) GetWithDefault ¶
type DefaultDownloader ¶
type DefaultDownloader struct {
// contains filtered or unexported fields
}
func (*DefaultDownloader) Close ¶
func (d *DefaultDownloader) Close()
func (*DefaultDownloader) Fetch ¶
func (d *DefaultDownloader) Fetch(request *Request) (*Response, error)
func (*DefaultDownloader) FromSpider ¶
func (d *DefaultDownloader) FromSpider(spider *Spider)
func (*DefaultDownloader) IsEmpty ¶
func (d *DefaultDownloader) IsEmpty() bool
func (*DefaultDownloader) IsFree ¶
func (d *DefaultDownloader) IsFree() bool
func (*DefaultDownloader) NextRequestCircle ¶
func (d *DefaultDownloader) NextRequestCircle(heartbeat time.Duration)
func (*DefaultDownloader) ProcessDownloader ¶
func (d *DefaultDownloader) ProcessDownloader(signal *Signal, spider *Spider)
type DefaultDupeFilter ¶
type DefaultDupeFilter struct {
// contains filtered or unexported fields
}
func (*DefaultDupeFilter) Close ¶
func (df *DefaultDupeFilter) Close()
func (*DefaultDupeFilter) FromSpider ¶
func (df *DefaultDupeFilter) FromSpider(spider *Spider)
func (*DefaultDupeFilter) Log ¶
func (df *DefaultDupeFilter) Log(request *Request)
func (*DefaultDupeFilter) RequestFingerprint ¶
func (df *DefaultDupeFilter) RequestFingerprint(request *Request) string
func (*DefaultDupeFilter) RequestSeen ¶
func (df *DefaultDupeFilter) RequestSeen(request *Request) bool
type DefaultScheduler ¶
type DefaultScheduler struct {
// contains filtered or unexported fields
}
func (*DefaultScheduler) Close ¶
func (s *DefaultScheduler) Close()
func (*DefaultScheduler) EnqueueRequest ¶
func (s *DefaultScheduler) EnqueueRequest(request *Request) bool
func (*DefaultScheduler) FromSpider ¶
func (s *DefaultScheduler) FromSpider(spider *Spider)
func (*DefaultScheduler) HasPendingRequests ¶
func (s *DefaultScheduler) HasPendingRequests() bool
func (*DefaultScheduler) NextRequest ¶
func (s *DefaultScheduler) NextRequest() *Request
type DepthMiddleware ¶
type DepthMiddleware struct {
	BaseMiddleware
	// contains filtered or unexported fields
}
func (*DepthMiddleware) FromSpider ¶
func (mw *DepthMiddleware) FromSpider(spider *Spider)
func (*DepthMiddleware) ProcessSpiderOutput ¶
func (mw *DepthMiddleware) ProcessSpiderOutput(response *Response, result RequestItems, spider *Spider) RequestItems
type DownloadStatsMiddleware ¶
type DownloadStatsMiddleware struct {
	BaseMiddleware
	// contains filtered or unexported fields
}
func (*DownloadStatsMiddleware) FromSpider ¶
func (mw *DownloadStatsMiddleware) FromSpider(spider *Spider)
func (*DownloadStatsMiddleware) ProcessException ¶
func (mw *DownloadStatsMiddleware) ProcessException(request *Request, err error, spider *Spider) RequestResponse
func (*DownloadStatsMiddleware) ProcessRequest ¶
func (mw *DownloadStatsMiddleware) ProcessRequest(request *Request, spider *Spider) RequestResponse
func (*DownloadStatsMiddleware) ProcessResponse ¶
func (mw *DownloadStatsMiddleware) ProcessResponse(request *Request, response *Response, spider *Spider) RequestResponse
type DownloadTimeoutMiddleware ¶
type DownloadTimeoutMiddleware struct {
	BaseMiddleware
	// contains filtered or unexported fields
}
下载超时中间件,对没有在Meta中指定"download_timeout"的Request添加默认超时时间
func (*DownloadTimeoutMiddleware) FromSpider ¶
func (mw *DownloadTimeoutMiddleware) FromSpider(spider *Spider)
func (*DownloadTimeoutMiddleware) ProcessRequest ¶
func (mw *DownloadTimeoutMiddleware) ProcessRequest(request *Request, spider *Spider) RequestResponse
type Downloader ¶
type DownloaderMiddlewareManager ¶
type DownloaderMiddlewareManager struct {
	ModuleName string
	// contains filtered or unexported fields
}
下载器中间件管理器
func (*DownloaderMiddlewareManager) FromSpider ¶
func (d *DownloaderMiddlewareManager) FromSpider(spider *Spider)
func (*DownloaderMiddlewareManager) ProcessException ¶
func (d *DownloaderMiddlewareManager) ProcessException(signal *Signal, spider *Spider) (index int)
func (*DownloaderMiddlewareManager) ProcessRequest ¶
func (d *DownloaderMiddlewareManager) ProcessRequest(signal *Signal, spider *Spider) (index int)
func (*DownloaderMiddlewareManager) ProcessResponse ¶
func (d *DownloaderMiddlewareManager) ProcessResponse(signal *Signal, spider *Spider) (index int)
type DownloaderMiddlewarer ¶
type DownloaderMiddlewarer interface {
	GetModuleName() string
	FromSpider(spider *Spider)
	ProcessRequest(request *Request, spider *Spider) RequestResponse
	ProcessResponse(request *Request, response *Response, spider *Spider) RequestResponse
	ProcessException(request *Request, err error, spider *Spider) RequestResponse
}
type DownloaderSlot ¶
type DownloaderSlot struct {
// contains filtered or unexported fields
}
func (*DownloaderSlot) DownloadDelay ¶
func (s *DownloaderSlot) DownloadDelay() int64
func (*DownloaderSlot) IsFree ¶
func (s *DownloaderSlot) IsFree() bool
func (*DownloaderSlot) Len ¶
func (s *DownloaderSlot) Len() int
type DupeFilter ¶
type ErrDropItem ¶
type ErrDropItem struct {
ErrBase
}
type ErrHttpError ¶
type ErrHttpError struct {
ErrBase
}
type ErrIgnoreRequest ¶
type ErrIgnoreRequest struct {
ErrBase
}
type ErrorbackFunc ¶
type ErrorbackFunc func(*Failure) RequestItems
ErrorbackFunc Request请求失败后调用的回调函数
type HttpAuthMiddleware ¶
type HttpAuthMiddleware struct {
	BaseMiddleware
	// contains filtered or unexported fields
}
Http认证中间件,如果爬虫的Setting中指定了"HttpUser"或"HttpPass"参数, 且Request的Header中未指定Authorization,则为该Request添加指定认证信息
func (*HttpAuthMiddleware) FromSpider ¶
func (mw *HttpAuthMiddleware) FromSpider(spider *Spider)
func (*HttpAuthMiddleware) ProcessRequest ¶
func (mw *HttpAuthMiddleware) ProcessRequest(request *Request, spider *Spider) RequestResponse
type HttpErrorMiddleware ¶
type HttpErrorMiddleware struct {
	BaseMiddleware
	// contains filtered or unexported fields
}
func (*HttpErrorMiddleware) FromSpider ¶
func (mw *HttpErrorMiddleware) FromSpider(spider *Spider)
func (*HttpErrorMiddleware) ProcessSpiderException ¶
func (mw *HttpErrorMiddleware) ProcessSpiderException(response *Response, err error, spider *Spider) RequestItems
func (*HttpErrorMiddleware) ProcessSpiderInput ¶
func (mw *HttpErrorMiddleware) ProcessSpiderInput(response *Response, spider *Spider)
type ItemPipelineManager ¶
type ItemPipelineManager struct {
	ModuleName string
	// contains filtered or unexported fields
}
func (*ItemPipelineManager) CloseSpider ¶
func (i *ItemPipelineManager) CloseSpider(spider *Spider) (index int)
func (*ItemPipelineManager) FromSpider ¶
func (i *ItemPipelineManager) FromSpider(spider *Spider)
func (*ItemPipelineManager) OpenSpider ¶
func (i *ItemPipelineManager) OpenSpider(spider *Spider) (index int)
func (*ItemPipelineManager) ProcessItem ¶
func (i *ItemPipelineManager) ProcessItem(item Item, spider *Spider) (index int)
type ItemPipeliner ¶
type ItemSlot ¶
type ItemSlot struct {
// contains filtered or unexported fields
}
func (*ItemSlot) FinishItem ¶
func (*ItemSlot) FromSpider ¶
type LifoPriorityQueue ¶
type LifoPriorityQueue struct {
// contains filtered or unexported fields
}
func (*LifoPriorityQueue) Close ¶
func (q *LifoPriorityQueue) Close()
func (*LifoPriorityQueue) FromSpider ¶
func (q *LifoPriorityQueue) FromSpider(spider *Spider)
func (*LifoPriorityQueue) IsEmpty ¶
func (q *LifoPriorityQueue) IsEmpty() bool
func (*LifoPriorityQueue) Peek ¶
func (q *LifoPriorityQueue) Peek() interface{}
func (*LifoPriorityQueue) Pop ¶
func (q *LifoPriorityQueue) Pop() interface{}
func (*LifoPriorityQueue) Push ¶
func (q *LifoPriorityQueue) Push(v interface{}, i int)
func (*LifoPriorityQueue) Size ¶
func (q *LifoPriorityQueue) Size() int
type OpenSpiderFunc ¶
type OpenSpiderFunc func(*Spider)
type ParseFunc ¶
type ParseFunc func(*Response, *Spider) RequestItems
ParseFunc 用户自定义用于解析Response的函数
type PriorityQueuer ¶
type ProcessExceptionFunc ¶
type ProcessExceptionFunc func(request *Request, err error, spider *Spider) RequestResponse
type ProcessItemFunc ¶
type ProcessRequestFunc ¶
type ProcessRequestFunc func(request *Request, spider *Spider) RequestResponse
type ProcessResponseFunc ¶
type ProcessResponseFunc func(request *Request, response *Response, spider *Spider) RequestResponse
type ProcessSpiderExceptionFunc ¶
type ProcessSpiderExceptionFunc func(response *Response, err error, spider *Spider) RequestItems
type ProcessSpiderInputFunc ¶
type ProcessSpiderOutputFunc ¶
type ProcessSpiderOutputFunc func(response *Response, result RequestItems, spider *Spider) RequestItems
type Request ¶
type Request struct {
	Url        *url.URL
	Method     string
	Headers    *http.Header
	Body       io.Reader
	Cookies    []http.Cookie
	Encoding   string
	Priority   int
	DontFilter bool
	Ctx        *Context
	//请求抛出错误时的回调函数,错误包括404、请求超时等
	Errback ErrorbackFunc
	//request请求下载完成后处理其response的回调函数
	//默认调用Parse()
	Callback ParseFunc
}
func DefaultStartRequests ¶
type RequestItems ¶
type RequestItems []interface{}
type RequestResponse ¶
type RequestResponse interface{}
下载器中间件各功能函数的返回值,Request、Response、nil或者ErrIgnoreRequest
type Response ¶
type Response struct {
	StatusCode int
	Body       []byte
	Ctx        *Context
	Request    *Request
	Headers    *http.Header
}
type ResponseSlot ¶
type ResponseSlot struct {
// contains filtered or unexported fields
}
func (*ResponseSlot) AddResponse ¶
func (s *ResponseSlot) AddResponse(response *Response)
func (*ResponseSlot) FinishResponse ¶
func (s *ResponseSlot) FinishResponse(response *Response)
func (*ResponseSlot) FromSpider ¶
func (s *ResponseSlot) FromSpider(spider *Spider)
func (*ResponseSlot) IsFree ¶
func (s *ResponseSlot) IsFree() bool
type RetryMiddleware ¶
type RetryMiddleware struct {
	BaseMiddleware
	// contains filtered or unexported fields
}
重试中间件,对符合要求的Response或者错误对应的Request进行重试
func (*RetryMiddleware) FromSpider ¶
func (mw *RetryMiddleware) FromSpider(spider *Spider)
func (*RetryMiddleware) ProcessException ¶
func (mw *RetryMiddleware) ProcessException(request *Request, err error, spider *Spider) RequestResponse
func (*RetryMiddleware) ProcessResponse ¶
func (mw *RetryMiddleware) ProcessResponse(request *Request, response *Response, spider *Spider) RequestResponse
type Setter ¶
type Setting ¶
type Setting struct {
	// 爬虫机器人名称
	// BotName string
	// 是否启用下载状态记录功能
	DownloadStats bool
	// 下载超时时间
	DownloadTimeout time.Duration
	// 单个域名下每个Request的请求间隔
	DownloadDelay time.Duration
	//启用后实际请求间隔会在0.5到1.5倍的DownloadDelay之间随机选取
	RandomizeDownloadDelay bool
	//单个Request最大下载数据量
	DownloadMaxSize int
	//单个Request开始在日志中输出警报信息的下载数据量
	DownloadWarnSize int
	//单个页面允许爬取的最大深度,为0代表无限制
	DepthLimit int
	//用来根据请求深度调整Request中Priority值的整数
	//调整计算公式如下:
	//Request.Priority = Request.Priority - ( Request.Depth * DepthPriority )
	DepthPriority int
	// 是否启用request深度相关记录功能
	DepthStatsVerbose bool
	//是否自动重试
	RetryEnabled bool
	//除第一次外最大重试次数
	MaxRetryTimes int
	//自动重试的状态码
	RetryHttpCodes []int
	// 重试请求优先级的调整参数
	// request.Priority += priorityAdjust
	RetryPriorityAdjust int
	//是否对Request进行去重过滤
	FilterEnabled bool
	//同时处理的最大Item个数
	ConcurrentItems int
	//爬虫下载器同时下载的最大Request个数
	ConcurrentRequests int
	//单个域名允许同时访问的请求个数
	ConcurrentRequestsPerDomain int
	// 允许下载数据占用的最大内存
	ResponseMaxActiveSize int
	// 是否启用日志功能
	LogEnabled bool
	// 除终端外额外将日志内容保存到指定文件
	// 指定ErrFile时仅将Warning以下的日志信息保存到该文件
	LogFile string
	// 指定时会将Warning及以上的日志信息保存到指定文件
	// 仅指定ErrFile时,Warning以下的日志信息仅在终端显示而不保存
	ErrFile string
	// 日志显示及保存的最低日志等级
	LogLevel zapcore.Level
	// url最大长度限制
	UrlLengthLimit        int
	HttpErrorAllowAll     bool
	HttpErrorAllowedCodes []int
	// 默认请求头
	DefaultRequestHeaders http.Header
	// 默认UserAgent
	UserAgent string
	// 请求过滤器
	FilterClass DupeFilter
	// 调度器优先级队列
	SchedulerPriorityQueue PriorityQueuer
	// 用户自定义的设置项
	ExtensionSettings         *Context
	DownloaderMiddlewaresBase map[int]DownloaderMiddlewarer
	DownloaderMiddlewares     map[int]DownloaderMiddlewarer
	SpiderMiddlewaresBase     map[int]SpiderMiddlewarer
	SpiderMiddlewares         map[int]SpiderMiddlewarer
	ItemPipelinesBase         map[int]ItemPipeliner
	ItemPipelines             map[int]ItemPipeliner
	SchedulerClass            Scheduler
	DownloaderClass           Downloader
	HttpUser                  string
	HttpPass                  string
	HttpAuthDomain            []string
}
func NewSetting ¶
type Spider ¶
type Spider struct {
	// 该spider的名称,用于日志记录
	Name string
	// 可选。包含了spider允许爬取的域名(domain)列表(list)。
	// 当OffsiteMiddleware启用时,域名不在列表中的URL不会被跟进。
	AllowedDomains []string
	// URL列表。当没有指定特定的URL时,spider将从该列表中开始进行爬取。
	// 因此,第一个被获取到的页面的URL将是该列表之一。
	// 后续的URL将会从获取到的数据中提取。
	StartUrls []string
	// 用于生成该爬虫爬取的起始Request,默认使用StartUrls中的链接生成Request
	StartRequestsFunc func(*Spider) []*Request
	// 当response没有指定回调函数时,该方法是xspider处理下载的response的默认方法。
	DefaultParseFunc ParseFunc
	// 爬取结束后执行的自定义函数
	CloseFunc func(*Spider)
	// 爬虫的设置参数,多个爬虫可共用一个相同的设置
	Settings *Setting
	// 日志记录器
	Log *zap.SugaredLogger
	// 爬虫状态记录器
	Stats StatsCollector
	// contains filtered or unexported fields
}
type SpiderMiddlewareManager ¶
type SpiderMiddlewareManager struct {
	ModuleName string
	// contains filtered or unexported fields
}
爬虫中间件管理器
func (*SpiderMiddlewareManager) FromSpider ¶
func (s *SpiderMiddlewareManager) FromSpider(spider *Spider)
func (*SpiderMiddlewareManager) ProcessSpiderException ¶
func (s *SpiderMiddlewareManager) ProcessSpiderException(signal *Signal, spider *Spider) (index int)
func (*SpiderMiddlewareManager) ProcessSpiderInput ¶
func (s *SpiderMiddlewareManager) ProcessSpiderInput(signal *Signal, spider *Spider) (index int)
func (*SpiderMiddlewareManager) ProcessSpiderOutput ¶
func (s *SpiderMiddlewareManager) ProcessSpiderOutput(signal *Signal, spider *Spider) (index int)
func (*SpiderMiddlewareManager) ProcessStartRequests ¶
func (s *SpiderMiddlewareManager) ProcessStartRequests(signal *Signal, spider *Spider) (index int)
type SpiderMiddlewarer ¶
type SpiderMiddlewarer interface {
	GetModuleName() string
	FromSpider(spider *Spider)
	ProcessSpiderInput(response *Response, spider *Spider)
	ProcessSpiderOutput(response *Response, result RequestItems, spider *Spider) RequestItems
	ProcessSpiderException(response *Response, err error, spider *Spider) RequestItems
	ProcessStartRequests(result []*Request, spider *Spider) []*Request
}
爬虫中间件接口
type SpiderOutputData ¶
type SpiderOutputData struct {
	Response *Response
	Result   RequestItems
}
type StatsCollect ¶
type StatsCollect struct {
// contains filtered or unexported fields
}
func (*StatsCollect) ClearStats ¶
func (s *StatsCollect) ClearStats()
func (*StatsCollect) GetStats ¶
func (s *StatsCollect) GetStats() StatsMap
func (*StatsCollect) MaxValue ¶
func (s *StatsCollect) MaxValue(key string, value int)
func (*StatsCollect) MinValue ¶
func (s *StatsCollect) MinValue(key string, value int)
func (*StatsCollect) SetStats ¶
func (s *StatsCollect) SetStats(stats StatsMap)
func (*StatsCollect) SetValue ¶
func (s *StatsCollect) SetValue(key string, value int)
type StatsCollector ¶
type StatsCollector interface {
	GetValue(key string, dft int) int
	GetStats() StatsMap
	SetValue(key string, value int)
	SetStats(stats StatsMap)
	IncValue(key string, count int, start int)
	MaxValue(key string, value int)
	MinValue(key string, value int)
	ClearStats()
}
func NewStatsCollector ¶
func NewStatsCollector() StatsCollector
type UrlLengthMiddleware ¶
type UrlLengthMiddleware struct {
	BaseMiddleware
	// contains filtered or unexported fields
}
func (*UrlLengthMiddleware) FromSpider ¶
func (mw *UrlLengthMiddleware) FromSpider(spider *Spider)
func (*UrlLengthMiddleware) ProcessSpiderOutput ¶
func (mw *UrlLengthMiddleware) ProcessSpiderOutput(response *Response, result RequestItems, spider *Spider) RequestItems
type UserAgentMiddleware ¶
type UserAgentMiddleware struct {
	BaseMiddleware
	// contains filtered or unexported fields
}
UserAgent中间件,对没有在Headers中指定"User-Agent"参数的Request添加默认值
func (*UserAgentMiddleware) FromSpider ¶
func (mw *UserAgentMiddleware) FromSpider(spider *Spider)
func (*UserAgentMiddleware) ProcessRequest ¶
func (mw *UserAgentMiddleware) ProcessRequest(request *Request, spider *Spider) RequestResponse
Click to show internal directories.
Click to hide internal directories.