Documentation ¶
Overview ¶
Package tegenaria is a crawler framework based on golang
tegenaria是一个基于golang开发的快速、高效率的网络爬虫框架
Index ¶
- Constants
- Variables
- func AbsFilePathTest(t *testing.T, path string) string
- func DefaultWatcher(ch chan EventType, hooker EventHooksInterface) error
- func GetEngineID() string
- func GetFunctionName(fn Parser) string
- func GetLogger(Name string) *logrus.Entry
- func GetMachineIP() (string, error)
- func GetParserByName(spider SpiderInterface, name string) reflect.Value
- func GetUUID() string
- func GoRunner(wg *conc.WaitGroup, funcs ...GoFunc) <-chan error
- func Interface2Uint(value interface{}) uint
- func MD5(s string) string
- func Map2String(m interface{}) string
- func NewTestProxyServer() *httptest.Server
- func NewTestServer() *httptest.Server
- func OptimalNumOfBits(n int, p float64) int
- func OptimalNumOfHashFunctions(n int, m int) int
- type BaseSpider
- type CacheInterface
- type CheckMasterLive
- type ComponentInterface
- type Configuration
- type Context
- type ContextOption
- type CrawlEngine
- func (e *CrawlEngine) EventsWatcherRunner() error
- func (e *CrawlEngine) Execute(spiderName string) StatisticInterface
- func (e *CrawlEngine) GetComponents() ComponentInterface
- func (e *CrawlEngine) GetCurrentSpider() SpiderInterface
- func (e *CrawlEngine) GetRuntimeStatus() *RuntimeStatus
- func (e *CrawlEngine) GetSpiders() *Spiders
- func (e *CrawlEngine) GetStatic() StatisticInterface
- func (e *CrawlEngine) RegisterDownloadMiddlewares(middlewares MiddlewaresInterface)
- func (e *CrawlEngine) RegisterPipelines(pipeline PipelinesInterface)
- func (e *CrawlEngine) RegisterSpiders(spider SpiderInterface)
- func (e *CrawlEngine) Scheduler() error
- type DefaultComponents
- func (d *DefaultComponents) CheckWorkersStop() bool
- func (d *DefaultComponents) GetDupefilter() RFPDupeFilterInterface
- func (d *DefaultComponents) GetEventHooks() EventHooksInterface
- func (d *DefaultComponents) GetLimiter() LimitInterface
- func (d *DefaultComponents) GetQueue() CacheInterface
- func (d *DefaultComponents) GetStats() StatisticInterface
- func (d *DefaultComponents) SetCurrentSpider(spider SpiderInterface)
- func (d *DefaultComponents) SpiderBeforeStart(engine *CrawlEngine, spider SpiderInterface) error
- type DefaultComponentsOption
- func DefaultComponentsWithDefaultHooks(events *DefaultHooks) DefaultComponentsOption
- func DefaultComponentsWithDefaultLimiter(limiter *DefaultLimiter) DefaultComponentsOption
- func DefaultComponentsWithDefaultQueue(queue *DefaultQueue) DefaultComponentsOption
- func DefaultComponentsWithDefaultStatistic(statistic *DefaultStatistic) DefaultComponentsOption
- func DefaultComponentsWithDupefilter(dupefilter *DefaultRFPDupeFilter) DefaultComponentsOption
- type DefaultFieldHook
- type DefaultHooks
- func (d *DefaultHooks) Error(params ...interface{}) error
- func (d *DefaultHooks) EventsWatcher(ch chan EventType) error
- func (d *DefaultHooks) Exit(params ...interface{}) error
- func (d *DefaultHooks) Heartbeat(params ...interface{}) error
- func (d *DefaultHooks) Pause(params ...interface{}) error
- func (d *DefaultHooks) SetCurrentSpider(spider SpiderInterface)
- func (d *DefaultHooks) Start(params ...interface{}) error
- type DefaultLimiter
- type DefaultQueue
- type DefaultRFPDupeFilter
- type DefaultStatistic
- type DistributedWorkerInterface
- type Downloader
- type DownloaderOption
- func DownloadWithClient(client http.Client) DownloaderOption
- func DownloadWithH2(h2 bool) DownloaderOption
- func DownloadWithTLSConfig(tls *tls.Config) DownloaderOption
- func DownloadWithTimeout(timeout time.Duration) DownloaderOption
- func DownloaderWithtransport(transport *http.Transport) DownloaderOption
- type EngineOption
- type ErrorOption
- type EventHooksInterface
- type EventType
- type EventsWatcher
- type GoFunc
- type HandleError
- type Hook
- type ItemInterface
- type ItemMeta
- type ItemPipelines
- type LimitInterface
- type Middlewares
- type MiddlewaresBase
- type MiddlewaresInterface
- type Parser
- type PipelinesBase
- type PipelinesInterface
- type ProcessResponse
- type Proxy
- type RFPDupeFilterInterface
- type RedirectError
- type Request
- type RequestMethod
- type RequestOption
- func RequestWithAllowRedirects(allowRedirects bool) RequestOption
- func RequestWithAllowedStatusCode(allowStatusCode []uint64) RequestOption
- func RequestWithBodyReader(body io.Reader) RequestOption
- func RequestWithDoNotFilter(doNotFilter bool) RequestOption
- func RequestWithMaxConnsPerHost(maxConnsPerHost int) RequestOption
- func RequestWithMaxRedirects(maxRedirects int) RequestOption
- func RequestWithParser(parser Parser) RequestOption
- func RequestWithPostForm(payload url.Values) RequestOption
- func RequestWithRequestBody(body map[string]interface{}) RequestOption
- func RequestWithRequestBytesBody(body []byte) RequestOption
- func RequestWithRequestCookies(cookies map[string]string) RequestOption
- func RequestWithRequestHeader(headers map[string]string) RequestOption
- func RequestWithRequestMeta(meta map[string]interface{}) RequestOption
- func RequestWithRequestParams(params map[string]string) RequestOption
- func RequestWithRequestProxy(proxy Proxy) RequestOption
- func RequestWithTimeout(timeout time.Duration) RequestOption
- type Response
- type RuntimeStatus
- func (r *RuntimeStatus) GetDuration() float64
- func (r *RuntimeStatus) GetRestartAt() int64
- func (r *RuntimeStatus) GetStartAt() int64
- func (r *RuntimeStatus) GetStatusOn() StatusType
- func (r *RuntimeStatus) GetStopAt() int64
- func (r *RuntimeStatus) SetDuration(duration float64)
- func (r *RuntimeStatus) SetRestartAt(startAt int64)
- func (r *RuntimeStatus) SetStartAt(startAt int64)
- func (r *RuntimeStatus) SetStatus(status StatusType)
- func (r *RuntimeStatus) SetStopAt(stopAt int64)
- type Settings
- type SpiderDownloader
- type SpiderInterface
- type Spiders
- type StatisticInterface
- type StatsFieldType
- type StatusType
- type TestDownloadMiddler
- type TestDownloadMiddler2
- type TestItemPipeline
- type TestItemPipeline2
- type TestItemPipeline3
- type TestItemPipeline4
- type TestSpider
Constants ¶
const ( // RequestStats 发起的请求总数 RequestStats string = "requests" // ItemsStats 获取到的items总数 ItemsStats string = "items" // DownloadFailStats 请求失败总数 DownloadFailStats string = "download_fail" // ErrorStats 错误总数 ErrorStats string = "errors" )
Variables ¶
var ( // ErrSpiderMiddleware 下载中间件处理异常 ErrSpiderMiddleware error = errors.New("handle spider middleware error") // ErrSpiderCrawls 抓取流程错误 ErrSpiderCrawls error = errors.New("handle spider crawl error") // ErrDuplicateSpiderName 爬虫名重复错误 ErrDuplicateSpiderName error = errors.New("register a duplicate spider name error") // ErrEmptySpiderName 爬虫名不能为空 ErrEmptySpiderName error = errors.New("register a empty spider name error") // ErrSpiderNotExist 爬虫实例不存在 ErrSpiderNotExist error = errors.New("not found spider") // ErrNotAllowStatusCode 不允许的状态码 ErrNotAllowStatusCode error = errors.New("not allow handle status code") // ErrGetCacheItem 获取item 错误 ErrGetCacheItem error = errors.New("getting item from cache error") // ErrGetHttpProxy 获取http代理错误 ErrGetHttpProxy error = errors.New("getting http proxy ") // ErrGetHttpsProxy 获取https代理错误 ErrGetHttpsProxy error = errors.New("getting https proxy ") // ErrParseSocksProxy 解析socks代理错误 ErrParseSocksProxy error = errors.New("parse socks proxy ") // ErrResponseRead 响应读取失败 ErrResponseRead error = errors.New("read response to buffer error") // ErrResponseParse 响应解析失败 ErrResponseParse error = errors.New("parse response error") // ErrNoMaterNodeLive 找不到主节点 ErrNoMaterNodeLive error = errors.New("no any master node is active") )
var ProcessId string = uuid.New().String()
Functions ¶
func DefaultWatcher ¶ added in v0.5.0
func DefaultWatcher(ch chan EventType, hooker EventHooksInterface) error
DefaultWatcher 默认的事件监听器 ch 用于接收事件 hooker 事件处理实例化接口,比如DefaultHooks
func GetFunctionName ¶ added in v0.4.1
GetFunctionName 提取解析函数名
func GetParserByName ¶ added in v0.4.1
func GetParserByName(spider SpiderInterface, name string) reflect.Value
GetParserByName 通过函数名从spider实例中获取解析函数
func Interface2Uint ¶ added in v0.5.0
func Interface2Uint(value interface{}) uint
func NewTestProxyServer ¶ added in v0.5.0
func NewTestServer ¶ added in v0.5.0
func OptimalNumOfBits ¶ added in v0.4.1
OptimalNumOfBits 计算位数组长度
func OptimalNumOfHashFunctions ¶ added in v0.4.1
OptimalNumOfHashFunctions 计算最优的布隆过滤器哈希函数个数
Types ¶
type BaseSpider ¶
BaseSpider base spider
func NewBaseSpider ¶
func NewBaseSpider(name string, feedUrls []string) *BaseSpider
NewBaseSpider 构建公共爬虫对象
type CacheInterface ¶
type CacheInterface interface { // enqueue ctx写入缓存 Enqueue(ctx *Context) error // dequeue ctx 从缓存出队列 Dequeue() (interface{}, error) // isEmpty 缓存是否为空 IsEmpty() bool // getSize 缓存大小 GetSize() uint64 // close 关闭缓存 Close() error // SetCurrentSpider 设置当前的spider SetCurrentSpider(spider SpiderInterface) }
CacheInterface request缓存组件
type CheckMasterLive ¶ added in v0.4.1
CheckMasterLive 检查所有的master节点是否都在线
type ComponentInterface ¶ added in v0.5.0
type ComponentInterface interface { // GetDupefilter 获取过滤器组件 GetDupefilter() RFPDupeFilterInterface // GetQueue 获取请求队列接口 GetQueue() CacheInterface // GetLimiter 限速器组件 GetLimiter() LimitInterface // GetStats 指标统计组件 GetStats() StatisticInterface // GetEventHooks 事件监控组件 GetEventHooks() EventHooksInterface // CheckWorkersStop 爬虫停止的条件 CheckWorkersStop() bool // SetCurrentSpider 当前正在运行的爬虫实例 SetCurrentSpider(spider SpiderInterface) // SpiderBeforeStart 启动StartRequest之前的动作 SpiderBeforeStart(engine *CrawlEngine, spider SpiderInterface) error }
ComponentInterface 系统组件接口 包含了爬虫系统运行的必要组件
type Configuration ¶
var Config *Configuration = nil
func (*Configuration) GetValue ¶ added in v0.4.1
func (c *Configuration) GetValue(key string) (interface{}, error)
type Context ¶
type Context struct { // Request 请求对象 Request *Request // Response 响应对象 Response *Response // CtxID context 唯一id由uuid生成 CtxID string // Error 处理过程中的错误信息 Error error // Cancel context.CancelFunc Cancel context.CancelFunc // Items 读写item的管道 Items chan *ItemMeta // Spider 爬虫实例 Spider SpiderInterface // contains filtered or unexported fields }
Context 在引擎中的数据流通载体,负责单个抓取任务的生命周期维护
func NewContext ¶
func NewContext(request *Request, Spider SpiderInterface, opts ...ContextOption) *Context
NewContext 从内存池中构建context对象
func NewTestRequest ¶ added in v0.5.0
func NewTestRequest(spider SpiderInterface, opts ...RequestOption) *Context
type ContextOption ¶
type ContextOption func(c *Context)
ContextOption 上下文选项
func WithContextID ¶ added in v0.5.0
func WithContextID(ctxID string) ContextOption
WithContextID 设置自定义的ctxId
func WithItemChannelSize ¶ added in v0.4.1
func WithItemChannelSize(size int) ContextOption
WithItemChannelSize 设置 items 管道的缓冲大小
type CrawlEngine ¶ added in v0.4.1
type CrawlEngine struct {
// contains filtered or unexported fields
}
CrawlEngine 引擎是整个框架数据流调度核心
func NewTestEngine ¶ added in v0.5.0
func NewTestEngine(spiderName string, opts ...EngineOption) *CrawlEngine
func (*CrawlEngine) EventsWatcherRunner ¶ added in v0.4.1
func (e *CrawlEngine) EventsWatcherRunner() error
EventsWatcherRunner 事件监听器运行组件
func (*CrawlEngine) Execute ¶ added in v0.4.1
func (e *CrawlEngine) Execute(spiderName string) StatisticInterface
func (*CrawlEngine) GetComponents ¶ added in v0.5.0
func (e *CrawlEngine) GetComponents() ComponentInterface
func (*CrawlEngine) GetCurrentSpider ¶ added in v0.5.0
func (e *CrawlEngine) GetCurrentSpider() SpiderInterface
GetCurrentSpider 获取当前正在运行的spider
func (*CrawlEngine) GetRuntimeStatus ¶ added in v0.5.0
func (e *CrawlEngine) GetRuntimeStatus() *RuntimeStatus
func (*CrawlEngine) GetSpiders ¶ added in v0.4.1
func (e *CrawlEngine) GetSpiders() *Spiders
GetSpiders 获取所有的已经注册到引擎的spider实例
func (*CrawlEngine) GetStatic ¶ added in v0.5.0
func (e *CrawlEngine) GetStatic() StatisticInterface
GetStatic 获取StatisticInterface 统计指标
func (*CrawlEngine) RegisterDownloadMiddlewares ¶ added in v0.4.1
func (e *CrawlEngine) RegisterDownloadMiddlewares(middlewares MiddlewaresInterface)
RegisterDownloadMiddlewares 注册下载中间件到引擎
func (*CrawlEngine) RegisterPipelines ¶ added in v0.4.1
func (e *CrawlEngine) RegisterPipelines(pipeline PipelinesInterface)
RegisterPipelines 注册pipelines到引擎
func (*CrawlEngine) RegisterSpiders ¶ added in v0.4.1
func (e *CrawlEngine) RegisterSpiders(spider SpiderInterface)
RegisterSpiders 将spider实例注册到引擎的 spiders
func (*CrawlEngine) Scheduler ¶ added in v0.4.1
func (e *CrawlEngine) Scheduler() error
Scheduler 调度器
type DefaultComponents ¶ added in v0.5.0
type DefaultComponents struct {
// contains filtered or unexported fields
}
DefaultComponents 默认的组件
func NewDefaultComponents ¶ added in v0.5.0
func NewDefaultComponents(opts ...DefaultComponentsOption) *DefaultComponents
func (*DefaultComponents) CheckWorkersStop ¶ added in v0.5.0
func (d *DefaultComponents) CheckWorkersStop() bool
func (*DefaultComponents) GetDupefilter ¶ added in v0.5.0
func (d *DefaultComponents) GetDupefilter() RFPDupeFilterInterface
func (*DefaultComponents) GetEventHooks ¶ added in v0.5.0
func (d *DefaultComponents) GetEventHooks() EventHooksInterface
func (*DefaultComponents) GetLimiter ¶ added in v0.5.0
func (d *DefaultComponents) GetLimiter() LimitInterface
func (*DefaultComponents) GetQueue ¶ added in v0.5.0
func (d *DefaultComponents) GetQueue() CacheInterface
func (*DefaultComponents) GetStats ¶ added in v0.5.0
func (d *DefaultComponents) GetStats() StatisticInterface
func (*DefaultComponents) SetCurrentSpider ¶ added in v0.5.0
func (d *DefaultComponents) SetCurrentSpider(spider SpiderInterface)
func (*DefaultComponents) SpiderBeforeStart ¶ added in v0.5.0
func (d *DefaultComponents) SpiderBeforeStart(engine *CrawlEngine, spider SpiderInterface) error
type DefaultComponentsOption ¶ added in v0.5.0
type DefaultComponentsOption func(d *DefaultComponents)
func DefaultComponentsWithDefaultHooks ¶ added in v0.5.0
func DefaultComponentsWithDefaultHooks(events *DefaultHooks) DefaultComponentsOption
func DefaultComponentsWithDefaultLimiter ¶ added in v0.5.0
func DefaultComponentsWithDefaultLimiter(limiter *DefaultLimiter) DefaultComponentsOption
func DefaultComponentsWithDefaultQueue ¶ added in v0.5.0
func DefaultComponentsWithDefaultQueue(queue *DefaultQueue) DefaultComponentsOption
func DefaultComponentsWithDefaultStatistic ¶ added in v0.5.0
func DefaultComponentsWithDefaultStatistic(statistic *DefaultStatistic) DefaultComponentsOption
func DefaultComponentsWithDupefilter ¶ added in v0.5.0
func DefaultComponentsWithDupefilter(dupefilter *DefaultRFPDupeFilter) DefaultComponentsOption
type DefaultFieldHook ¶
type DefaultFieldHook struct { }
func (*DefaultFieldHook) Levels ¶
func (hook *DefaultFieldHook) Levels() []logrus.Level
type DefaultHooks ¶ added in v0.4.5
type DefaultHooks struct {
// contains filtered or unexported fields
}
func NewDefaultHooks ¶ added in v0.4.5
func NewDefaultHooks() *DefaultHooks
NewDefaultHooks 构建新的默认事件监听器
func (*DefaultHooks) Error ¶ added in v0.4.5
func (d *DefaultHooks) Error(params ...interface{}) error
Error 处理ERROR事件
func (*DefaultHooks) EventsWatcher ¶ added in v0.4.5
func (d *DefaultHooks) EventsWatcher(ch chan EventType) error
EventsWatcher DefaultHooks 的事件监听器
func (*DefaultHooks) Exit ¶ added in v0.4.5
func (d *DefaultHooks) Exit(params ...interface{}) error
Exit 处理EXIT事件
func (*DefaultHooks) Heartbeat ¶ added in v0.4.5
func (d *DefaultHooks) Heartbeat(params ...interface{}) error
Heartbeat 处理HEARTBEAT事件
func (*DefaultHooks) Pause ¶ added in v0.5.0
func (d *DefaultHooks) Pause(params ...interface{}) error
Pause 处理STOP事件
func (*DefaultHooks) SetCurrentSpider ¶ added in v0.5.0
func (d *DefaultHooks) SetCurrentSpider(spider SpiderInterface)
func (*DefaultHooks) Start ¶ added in v0.4.5
func (d *DefaultHooks) Start(params ...interface{}) error
Start 处理START事件
type DefaultLimiter ¶ added in v0.5.0
type DefaultLimiter struct {
// contains filtered or unexported fields
}
DefaultLimiter 默认的限速器
func NewDefaultLimiter ¶ added in v0.4.1
func NewDefaultLimiter(limitRate int) *DefaultLimiter
NewDefaultLimiter 创建一个新的限速器 limitRate 最大请求速率
func (*DefaultLimiter) CheckAndWaitLimiterPass ¶ added in v0.5.0
func (d *DefaultLimiter) CheckAndWaitLimiterPass() error
CheckAndWaitLimiterPass 检查当前并发量,如果并发量达到上限则等待
func (*DefaultLimiter) SetCurrentSpider ¶ added in v0.5.0
func (d *DefaultLimiter) SetCurrentSpider(spider SpiderInterface)
SetCurrentSpider 设置当前的spider
type DefaultQueue ¶ added in v0.5.0
type DefaultQueue struct {
// contains filtered or unexported fields
}
DefaultQueue request缓存队列
func NewDefaultQueue ¶ added in v0.5.0
func NewDefaultQueue(size int) *DefaultQueue
NewDefaultQueue get a new DefaultQueue
func (*DefaultQueue) Dequeue ¶ added in v0.5.0
func (c *DefaultQueue) Dequeue() (interface{}, error)
Dequeue 从队列中获取request对象
func (*DefaultQueue) Enqueue ¶ added in v0.5.0
func (c *DefaultQueue) Enqueue(ctx *Context) error
Enqueue request对象入队列
func (*DefaultQueue) SetCurrentSpider ¶ added in v0.5.0
func (c *DefaultQueue) SetCurrentSpider(spider SpiderInterface)
SetCurrentSpider 设置当前的spider
type DefaultRFPDupeFilter ¶ added in v0.5.0
type DefaultRFPDupeFilter struct {
// contains filtered or unexported fields
}
DefaultRFPDupeFilter 去重组件
func NewRFPDupeFilter ¶
func NewRFPDupeFilter(bloomP float64, bloomN int) *DefaultRFPDupeFilter
NewRFPDupeFilter 新建去重组件 bloomP容错率 bloomN数据规模
func (*DefaultRFPDupeFilter) DoDupeFilter ¶ added in v0.5.0
func (f *DefaultRFPDupeFilter) DoDupeFilter(ctx *Context) (bool, error)
DoDupeFilter 通过布隆过滤器对request对象进行去重处理
func (*DefaultRFPDupeFilter) Fingerprint ¶ added in v0.5.0
func (f *DefaultRFPDupeFilter) Fingerprint(ctx *Context) ([]byte, error)
Fingerprint 计算指纹
func (*DefaultRFPDupeFilter) SetCurrentSpider ¶ added in v0.5.0
func (f *DefaultRFPDupeFilter) SetCurrentSpider(spider SpiderInterface)
type DefaultStatistic ¶ added in v0.5.0
type DefaultStatistic struct { // Metrics 指标-数值缓存 Metrics map[string]*uint64 // contains filtered or unexported fields }
DefaultStatistic 数据统计指标
func NewDefaultStatistic ¶ added in v0.5.0
func NewDefaultStatistic() *DefaultStatistic
NewDefaultStatistic 默认统计数据组件构造函数
func (*DefaultStatistic) Get ¶ added in v0.5.0
func (s *DefaultStatistic) Get(metric string) uint64
Get 获取某个指标的数值
func (*DefaultStatistic) GetAllStats ¶ added in v0.5.0
func (s *DefaultStatistic) GetAllStats() map[string]uint64
GetAllStats 格式化统计数据
func (*DefaultStatistic) Incr ¶ added in v0.5.0
func (s *DefaultStatistic) Incr(metrics string)
Incr 新增一个指标值
func (*DefaultStatistic) SetCurrentSpider ¶ added in v0.5.0
func (s *DefaultStatistic) SetCurrentSpider(spider SpiderInterface)
SetCurrentSpider 设置当前的spider
type DistributedWorkerInterface ¶ added in v0.4.1
type DistributedWorkerInterface interface { // AddNode 新增一个节点 AddNode() error // DelNode 删除当前的节点 DelNode() error // PauseNode 停止当前的节点 PauseNode() error // Heartbeat 心跳 Heartbeat() error // CheckAllNodesStop 检查所有的节点是否都已经停止 CheckAllNodesStop() (bool, error) // CheckMasterLive 检测主节点是否还在线 CheckMasterLive() (bool, error) // SetMaster 是否将当前的节点设置为主节点 SetMaster(flag bool) // SetCurrentSpider 设置当前的spider SetCurrentSpider(spider SpiderInterface) // GetWorkerID 当前工作节点的id GetWorkerID() string // IsMaster 是否是主节点 IsMaster() bool }
DistributedWorkerInterface 分布式组件接口
type Downloader ¶
type Downloader interface { // Download 下载函数 Download(ctx *Context) (*Response, error) // CheckStatus 检查响应状态码的合法性 CheckStatus(statusCode uint64, allowStatus []uint64) bool }
Downloader 下载器接口
type DownloaderOption ¶
type DownloaderOption func(d *SpiderDownloader)
DownloaderOption 下载器可选参数函数
func DownloadWithClient ¶
func DownloadWithClient(client http.Client) DownloaderOption
DownloadWithClient 设置下载器的http.Client客户端
func DownloadWithH2 ¶ added in v0.4.1
func DownloadWithH2(h2 bool) DownloaderOption
DownloadWithH2 下载器是否开启http2
func DownloadWithTLSConfig ¶ added in v0.5.0
func DownloadWithTLSConfig(tls *tls.Config) DownloaderOption
DownloadWithTLSConfig 设置下载器的tls
func DownloadWithTimeout ¶
func DownloadWithTimeout(timeout time.Duration) DownloaderOption
DownloadWithTimeout 设置下载器的网络请求超时时间
func DownloaderWithtransport ¶
func DownloaderWithtransport(transport *http.Transport) DownloaderOption
DownloaderWithtransport 为下载器设置 http.Transport
type EngineOption ¶
type EngineOption func(r *CrawlEngine)
EngineOption 引擎构造过程中的可选参数
func EngineWithComponents ¶ added in v0.5.0
func EngineWithComponents(components ComponentInterface) EngineOption
func EngineWithDownloader ¶
func EngineWithDownloader(downloader Downloader) EngineOption
EngineWithDownloader 引擎使用的下载器组件
func EngineWithReqChannelSize ¶ added in v0.5.0
func EngineWithReqChannelSize(size int) EngineOption
EngineWithReqChannelSize
func EngineWithUniqueReq ¶
func EngineWithUniqueReq(uniqueReq bool) EngineOption
EngineWithUniqueReq 是否进行去重处理, true则进行去重处理,默认值为true
type ErrorOption ¶
type ErrorOption func(e *HandleError)
ErrorOption HandleError 可选参数
func ErrorWithExtras ¶ added in v0.4.1
func ErrorWithExtras(extras map[string]interface{}) ErrorOption
ErrorWithExtras HandleError 添加额外的数据
type EventHooksInterface ¶ added in v0.4.1
type EventHooksInterface interface { // Start 处理引擎启动事件 Start(params ...interface{}) error // Pause 处理引擎暂停事件 Pause(params ...interface{}) error // Error 处理错误事件 Error(params ...interface{}) error // Exit 退出引擎事件 Exit(params ...interface{}) error // Heartbeat 心跳检查事件 Heartbeat(params ...interface{}) error // EventsWatcher 事件监听器 EventsWatcher(ch chan EventType) error SetCurrentSpider(spider SpiderInterface) }
EventHooksInterface 事件处理函数接口
type EventsWatcher ¶ added in v0.4.1
EventsWatcher 事件监听器
type HandleError ¶
type HandleError struct { // CtxID 上下文id CtxID string // Err 处理过程的错误 Err error // Extras 携带的额外信息 Extras map[string]interface{} }
HandleError 错误处理接口
func NewError ¶
func NewError(ctx *Context, err error, opts ...ErrorOption) *HandleError
NewError 构建新的HandleError实例
type ItemMeta ¶
type ItemMeta struct { // CtxID 对应的context id CtxID string // Item item对象 Item ItemInterface }
ItemMeta item元数据结构
type ItemPipelines ¶
type ItemPipelines []PipelinesInterface
func (ItemPipelines) Len ¶
func (p ItemPipelines) Len() int
func (ItemPipelines) Less ¶
func (p ItemPipelines) Less(i, j int) bool
func (ItemPipelines) Swap ¶
func (p ItemPipelines) Swap(i, j int)
type LimitInterface ¶ added in v0.4.1
type LimitInterface interface { // CheckAndWaitLimiterPass 检查当前并发量 // 如果并发量达到上限则等待 CheckAndWaitLimiterPass() error // SetCurrentSpider 设置当前正在运行的spider SetCurrentSpider(spider SpiderInterface) }
LimitInterface 限速器接口
type Middlewares ¶
type Middlewares []MiddlewaresInterface
Middlewares 下载中间件队列
func (Middlewares) Less ¶
func (p Middlewares) Less(i, j int) bool
func (Middlewares) Swap ¶
func (p Middlewares) Swap(i, j int)
type MiddlewaresBase ¶
type MiddlewaresBase struct {
Priority int
}
type MiddlewaresInterface ¶
type MiddlewaresInterface interface { // GetPriority 获取优先级,数字越小优先级越高 GetPriority() int // ProcessRequest 处理request请求对象 // 此处用于增加请求头 // 按优先级执行 ProcessRequest(ctx *Context) error // ProcessResponse 用于处理请求成功之后的response // 执行顺序为逆优先级,即优先级越高执行顺序越晚 ProcessResponse(ctx *Context, req chan<- *Context) error // GetName 获取中间件的名称 GetName() string }
MiddlewaresInterface 下载中间件的接口用于处理进入下载器之前的request对象 和下载之后的response
type PipelinesBase ¶
type PipelinesBase struct {
Priority int
}
type PipelinesInterface ¶
type PipelinesInterface interface { // GetPriority 获取当前pipeline的优先级 GetPriority() int // ProcessItem item处理单元 ProcessItem(spider SpiderInterface, item *ItemMeta) error }
PipelinesInterface pipeline 接口 pipeline 主要用于处理item,例如数据存储、数据清洗 将多个pipeline注册到引擎可以实现责任链模式的数据处理
type ProcessResponse ¶
ProcessResponse 处理下载之后的response函数
type RFPDupeFilterInterface ¶
type RFPDupeFilterInterface interface { // Fingerprint request指纹计算 Fingerprint(ctx *Context) ([]byte, error) // DoDupeFilter request去重 DoDupeFilter(ctx *Context) (bool, error) SetCurrentSpider(spider SpiderInterface) }
RFPDupeFilterInterface request 对象指纹计算和布隆过滤器去重
type Request ¶
type Request struct { // Url 请求Url Url string `json:"url"` // Headers 请求头 Headers map[string]string `json:"headers"` // Method 请求方式 Method RequestMethod `json:"method"` // Params 请求url的参数 Params map[string]string `json:"params"` // Proxy 代理实例 Proxy *Proxy `json:"-"` // Cookies 请求携带的cookies Cookies map[string]string `json:"cookies"` // Meta 请求携带的额外的信息 Meta map[string]interface{} `json:"meta"` // AllowRedirects 是否允许跳转默认允许 AllowRedirects bool `json:"allowRedirects"` // MaxRedirects 最大的跳转次数 MaxRedirects int `json:"maxRedirects"` // Parser 该请求绑定的响应解析函数,必须是一个spider实例 Parser string `json:"parser"` // MaxConnsPerHost 单个域名最大的连接数 MaxConnsPerHost int `json:"maxConnsPerHost"` // AllowStatusCode 允许的状态码 AllowStatusCode []uint64 `json:"allowStatusCode"` // Timeout 请求超时时间 Timeout time.Duration `json:"timeout"` // DoNotFilter DoNotFilter bool // contains filtered or unexported fields }
Request 请求对象的结构
func NewRequest ¶
func NewRequest(url string, method RequestMethod, parser Parser, opts ...RequestOption) *Request
NewRequest 构建新的请求对象。请注意parser函数必须是某一个SpiderInterface实例的解析函数,否则无法正常调用该解析函数
func RequestFromMap ¶ added in v0.4.1
func RequestFromMap(src map[string]interface{}, opts ...RequestOption) *Request
RequestFromMap 从map创建requests
type RequestMethod ¶ added in v0.4.1
type RequestMethod string
RequestMethod 请求方式
const ( // GET 请求 GET RequestMethod = "GET" // POST 请求 POST RequestMethod = "POST" // PUT 请求 PUT RequestMethod = "PUT" // DELETE 请求 DELETE RequestMethod = "DELETE" // OPTIONS 请求 OPTIONS RequestMethod = "OPTIONS" // HEAD 请求 HEAD RequestMethod = "HEAD" )
type RequestOption ¶ added in v0.4.1
type RequestOption func(r *Request)
RequestOption NewRequest 可选参数
func RequestWithAllowRedirects ¶
func RequestWithAllowRedirects(allowRedirects bool) RequestOption
RequestWithAllowRedirects 设置是否允许跳转 如果不允许则MaxRedirects=0
func RequestWithAllowedStatusCode ¶ added in v0.4.1
func RequestWithAllowedStatusCode(allowStatusCode []uint64) RequestOption
RequestWithAllowedStatusCode 设置AllowStatusCode
func RequestWithBodyReader ¶ added in v0.5.0
func RequestWithBodyReader(body io.Reader) RequestOption
RequestWithBodyReader set request body io.Reader
func RequestWithDoNotFilter ¶ added in v0.4.6
func RequestWithDoNotFilter(doNotFilter bool) RequestOption
RequestWithDoNotFilter 设置当前请求是否进行过滤处理, true则认为该条请求无需进入去重流程,默认值为false
func RequestWithMaxConnsPerHost ¶
func RequestWithMaxConnsPerHost(maxConnsPerHost int) RequestOption
RequestWithMaxConnsPerHost 设置MaxConnsPerHost
func RequestWithMaxRedirects ¶
func RequestWithMaxRedirects(maxRedirects int) RequestOption
RequestWithMaxRedirects 设置最大的跳转次数 若maxRedirects <= 0则认为不允许跳转AllowRedirects = false
func RequestWithParser ¶ added in v0.4.1
func RequestWithParser(parser Parser) RequestOption
RequestWithParser 设置Parser
func RequestWithPostForm ¶ added in v0.5.0
func RequestWithPostForm(payload url.Values) RequestOption
RequestWithPostForm set application/x-www-form-urlencoded request body reader
func RequestWithRequestBody ¶
func RequestWithRequestBody(body map[string]interface{}) RequestOption
RequestWithRequestBody 传入请求体到request
func RequestWithRequestBytesBody ¶ added in v0.4.1
func RequestWithRequestBytesBody(body []byte) RequestOption
RequestWithRequestBytesBody request绑定bytes body
func RequestWithRequestCookies ¶
func RequestWithRequestCookies(cookies map[string]string) RequestOption
RequestWithRequestCookies 设置cookie
func RequestWithRequestHeader ¶
func RequestWithRequestHeader(headers map[string]string) RequestOption
RequestWithRequestHeader 设置请求头
func RequestWithRequestMeta ¶
func RequestWithRequestMeta(meta map[string]interface{}) RequestOption
RequestWithRequestMeta 设置 meta
func RequestWithRequestParams ¶
func RequestWithRequestParams(params map[string]string) RequestOption
RequestWithRequestParams 设置请求的url参数
func RequestWithRequestProxy ¶
func RequestWithRequestProxy(proxy Proxy) RequestOption
RequestWithRequestProxy 设置代理
func RequestWithTimeout ¶ added in v0.4.1
func RequestWithTimeout(timeout time.Duration) RequestOption
RequestWithTimeout 设置请求超时时间 若timeout<=0则认为没有超时时间
type Response ¶
type Response struct { // Status状态码 Status int // Headers 响应头 Headers map[string][]string // Header response header // Delay 请求延迟 Delay float64 // Delay the time of handle download request // ContentLength 响应体大小 ContentLength uint64 // ContentLength response content length // URL 请求url URL string // URL of request url // Buffer 响应体缓存 Buffer *bytes.Buffer // buffer read response buffer Body io.ReadCloser // contains filtered or unexported fields }
Response 请求响应体的结构
type RuntimeStatus ¶ added in v0.5.0
type RuntimeStatus struct { StartAt int64 Duration float64 StopAt int64 RestartAt int64 // StatusOn 当前引擎的状态 StatusOn StatusType }
func NewRuntimeStatus ¶ added in v0.5.0
func NewRuntimeStatus() *RuntimeStatus
func (*RuntimeStatus) GetDuration ¶ added in v0.5.0
func (r *RuntimeStatus) GetDuration() float64
GetDuration 爬虫运行时长
func (*RuntimeStatus) GetRestartAt ¶ added in v0.5.0
func (r *RuntimeStatus) GetRestartAt() int64
GetRestartAt 获取引擎重启的时间戳
func (*RuntimeStatus) GetStartAt ¶ added in v0.5.0
func (r *RuntimeStatus) GetStartAt() int64
GetStartAt 获取引擎启动的时间戳
func (*RuntimeStatus) GetStatusOn ¶ added in v0.5.0
func (r *RuntimeStatus) GetStatusOn() StatusType
GetStatusOn 获取引擎的状态
func (*RuntimeStatus) GetStopAt ¶ added in v0.5.0
func (r *RuntimeStatus) GetStopAt() int64
GetStopAt 爬虫停止的时间戳
func (*RuntimeStatus) SetDuration ¶ added in v0.5.0
func (r *RuntimeStatus) SetDuration(duration float64)
func (*RuntimeStatus) SetRestartAt ¶ added in v0.5.0
func (r *RuntimeStatus) SetRestartAt(startAt int64)
func (*RuntimeStatus) SetStartAt ¶ added in v0.5.0
func (r *RuntimeStatus) SetStartAt(startAt int64)
func (*RuntimeStatus) SetStatus ¶ added in v0.5.0
func (r *RuntimeStatus) SetStatus(status StatusType)
SetStatus 设置引擎状态 用于控制引擎的启停
func (*RuntimeStatus) SetStopAt ¶ added in v0.5.0
func (r *RuntimeStatus) SetStopAt(stopAt int64)
type SpiderDownloader ¶
type SpiderDownloader struct { // ProxyFunc 对单个请求进行代理设置 ProxyFunc func(req *http.Request) (*url.URL, error) // contains filtered or unexported fields }
SpiderDownloader tegenaria 爬虫下载器
func (*SpiderDownloader) CheckStatus ¶
func (d *SpiderDownloader) CheckStatus(statusCode uint64, allowStatus []uint64) bool
CheckStatus 检查状态码是否合法
type SpiderInterface ¶
type SpiderInterface interface { // StartRequest 通过GetFeedUrls()获取种子 // urls并构建初始请求 StartRequest(req chan<- *Context) // Parser 默认的请求响应解析函数 // 在解析过程中生成的新的请求可以推送到req channel Parser(resp *Context, req chan<- *Context) error // ErrorHandler 错误处理函数,允许在此过程中生成新的请求 // 并推送到req channel ErrorHandler(err *Context, req chan<- *Context) // GetName 获取spider名称 GetName() string // GetFeedUrls 获取种子urls GetFeedUrls() []string }
SpiderInterface is the Tegenaria spider interface; developers must implement this interface to build a custom spider.
type Spiders ¶
type Spiders struct { // SpidersModules spider名称和spider实例的映射 SpidersModules map[string]SpiderInterface // Parsers parser函数名和函数的映射 // 用于序列化和反序列化 Parsers map[string]Parser }
Spiders 全局spiders管理器 用于接收注册的SpiderInterface实例
var SpidersList *Spiders
SpidersList 注册到引擎的爬虫列表
func (*Spiders) GetAllSpidersName ¶ added in v0.5.0
func (*Spiders) GetSpider ¶
func (s *Spiders) GetSpider(name string) (SpiderInterface, error)
GetSpider 通过爬虫名获取spider实例
func (*Spiders) Register ¶
func (s *Spiders) Register(spider SpiderInterface) error
Register spider实例注册到Spiders.SpidersModules
type StatisticInterface ¶ added in v0.4.1
type StatisticInterface interface { // GetAllStats 获取所有的指标数据 GetAllStats() map[string]uint64 // Incr 指定的指标计数器自增1 Incr(metric string) // Get 获取指标的数值 Get(metric string) uint64 // SetCurrentSpider 设置当前的爬虫实例 SetCurrentSpider(spider SpiderInterface) }
StatisticInterface 数据统计组件接口
type StatusType ¶ added in v0.5.0
type StatusType uint
StatusType 当前引擎的状态
const ( // ON_START 启动状态 ON_START StatusType = iota // ON_STOP 停止状态 ON_STOP // ON_PAUSE 暂停状态 ON_PAUSE )
func (StatusType) GetTypeName ¶ added in v0.5.0
func (p StatusType) GetTypeName() string
GetTypeName 获取引擎状态的字符串形式
type TestDownloadMiddler ¶ added in v0.5.0
func (TestDownloadMiddler) GetName ¶ added in v0.5.0
func (m TestDownloadMiddler) GetName() string
func (TestDownloadMiddler) GetPriority ¶ added in v0.5.0
func (m TestDownloadMiddler) GetPriority() int
func (TestDownloadMiddler) ProcessRequest ¶ added in v0.5.0
func (m TestDownloadMiddler) ProcessRequest(ctx *Context) error
func (TestDownloadMiddler) ProcessResponse ¶ added in v0.5.0
func (m TestDownloadMiddler) ProcessResponse(ctx *Context, req chan<- *Context) error
type TestDownloadMiddler2 ¶ added in v0.5.0
func (TestDownloadMiddler2) GetName ¶ added in v0.5.0
func (m TestDownloadMiddler2) GetName() string
func (TestDownloadMiddler2) GetPriority ¶ added in v0.5.0
func (m TestDownloadMiddler2) GetPriority() int
func (TestDownloadMiddler2) ProcessRequest ¶ added in v0.5.0
func (m TestDownloadMiddler2) ProcessRequest(ctx *Context) error
func (TestDownloadMiddler2) ProcessResponse ¶ added in v0.5.0
func (m TestDownloadMiddler2) ProcessResponse(ctx *Context, req chan<- *Context) error
type TestItemPipeline ¶ added in v0.5.0
type TestItemPipeline struct {
Priority int
}
func (*TestItemPipeline) GetPriority ¶ added in v0.5.0
func (p *TestItemPipeline) GetPriority() int
func (*TestItemPipeline) ProcessItem ¶ added in v0.5.0
func (p *TestItemPipeline) ProcessItem(spider SpiderInterface, item *ItemMeta) error
type TestItemPipeline2 ¶ added in v0.5.0
type TestItemPipeline2 struct {
Priority int
}
func (*TestItemPipeline2) GetPriority ¶ added in v0.5.0
func (p *TestItemPipeline2) GetPriority() int
func (*TestItemPipeline2) ProcessItem ¶ added in v0.5.0
func (p *TestItemPipeline2) ProcessItem(spider SpiderInterface, item *ItemMeta) error
type TestItemPipeline3 ¶ added in v0.5.0
type TestItemPipeline3 struct {
Priority int
}
func (*TestItemPipeline3) GetPriority ¶ added in v0.5.0
func (p *TestItemPipeline3) GetPriority() int
func (*TestItemPipeline3) ProcessItem ¶ added in v0.5.0
func (p *TestItemPipeline3) ProcessItem(spider SpiderInterface, item *ItemMeta) error
type TestItemPipeline4 ¶ added in v0.5.0
type TestItemPipeline4 struct {
Priority int
}
func (*TestItemPipeline4) GetPriority ¶ added in v0.5.0
func (p *TestItemPipeline4) GetPriority() int
func (*TestItemPipeline4) ProcessItem ¶ added in v0.5.0
func (p *TestItemPipeline4) ProcessItem(spider SpiderInterface, item *ItemMeta) error
type TestSpider ¶ added in v0.5.0
type TestSpider struct {
*BaseSpider
}
func (*TestSpider) ErrorHandler ¶ added in v0.5.0
func (s *TestSpider) ErrorHandler(err *Context, req chan<- *Context)
func (*TestSpider) GetFeedUrls ¶ added in v0.5.0
func (s *TestSpider) GetFeedUrls() []string
func (*TestSpider) GetName ¶ added in v0.5.0
func (s *TestSpider) GetName() string
func (*TestSpider) Parser ¶ added in v0.5.0
func (s *TestSpider) Parser(resp *Context, req chan<- *Context) error
func (*TestSpider) StartRequest ¶ added in v0.5.0
func (s *TestSpider) StartRequest(req chan<- *Context)