Documentation
¶
Index ¶
- Variables
- func CreateDataDB(path string) *leveldb.DB
- func DecodeGBK(s []byte) ([]byte, error)
- func RequestStringify(req Request) (string, error)
- type ByteHandler
- type ClientGenerator
- type ConsoleHandler
- type ConsolePipeline
- type DataStore
- func (lvdb *DataStore) Add(key string, value string) error
- func (lvdb *DataStore) BatchAdd(m map[string]string) error
- func (lvdb *DataStore) Clear(prefix string, limit ...string)
- func (lvdb *DataStore) Del(key string) error
- func (lvdb *DataStore) Get(key string) (string, error)
- func (lvdb *DataStore) List(prefix string, limit ...string) ([]string, error)
- type DefaultListener
- type Downloader
- type GBKByteHandler
- type Handler
- type HttpDownloader
- type Listener
- type Pipeline
- type Proxy
- type ProxyProvider
- type Request
- type RequestFilter
- type RequestHandle
- type RequestScheduler
- type RequestState
- type Response
- type Result
- type Scheduler
- type SimpleClientGenerator
- type SimpleProxyProvider
- type Spider
- func (s *Spider) AddHandler(handler Handler)
- func (s *Spider) AddHeader(key, value string)
- func (s *Spider) AddListener(listener Listener)
- func (s *Spider) AddPipeline(pipeline Pipeline)
- func (s *Spider) AddProxy(pxy ...Proxy)
- func (s *Spider) AddRequestStore(store Store)
- func (s *Spider) AddSeedUrl(seedUrls ...string)
- func (s *Spider) ClearRequestStore()
- func (s *Spider) Run()
- func (s *Spider) SaveHtml(savepath string, suffixGenerate func() string)
- func (s *Spider) SetByteHandler(handler ByteHandler)
- func (s *Spider) SetClientGenerator(clientGenerator ClientGenerator)
- func (s *Spider) SetCycleTime(time int)
- func (s *Spider) SetDownloader(downloader Downloader)
- func (s *Spider) SetGoroutines(n int)
- func (s *Spider) SetProxyProvider(proxyProvider ProxyProvider)
- func (s *Spider) SetRequestFilter(filter RequestFilter)
- func (s *Spider) SetScheduler(scheduler Scheduler)
- func (s *Spider) SetSleepTime(t time.Duration)
- func (s *Spider) SetTimeOut(t time.Duration)
- type Store
- type StoreRequestFilter
Constants ¶
This section is empty.
Variables ¶
View Source
var ErrorSkip = errors.New("skip")
Functions ¶
func RequestStringify ¶
Types ¶
type ByteHandler ¶
type ClientGenerator ¶
type ClientGenerator interface { Generate() *http.Client SetProxyProvider(pxyProvider ProxyProvider) }
ClientGenerator 客户端生成器
type DefaultListener ¶
type DefaultListener struct {
// contains filtered or unexported fields
}
Listener的默认实现
type Downloader ¶
type GBKByteHandler ¶
type GBKByteHandler struct { }
type HttpDownloader ¶
type HttpDownloader struct {
// contains filtered or unexported fields
}
Downloader的默认实现
func (*HttpDownloader) SetClientGenerator ¶
func (d *HttpDownloader) SetClientGenerator(generator ClientGenerator)
type Listener ¶
type Listener interface { OnError(req Request, e error, ctx context.Context) OnSuccess(req Request, ctx context.Context) }
监听接口
type ProxyProvider ¶
ProxyProvider 代理提供器
type Request ¶
type Request struct { Id string Url string //请求资源地址 Method string //请求方法, Header map[string][]string //请求头 Downloader Downloader //下载器 Extras map[string]interface{} //额外信息 Skip bool //跳过请求不处理 State RequestState //请求的状态 CycleTime int //请求失败之后重复请求的次数 }
func NewRequest ¶
func NewRequest() Request
func ParseRequest ¶
type RequestFilter ¶
FilterRequest 去重复的url
type RequestHandle ¶
type RequestHandle func(req *Request)
type RequestScheduler ¶
type RequestScheduler struct {
// contains filtered or unexported fields
}
Scheduler的默认实现
func (*RequestScheduler) Len ¶
func (s *RequestScheduler) Len() int
func (*RequestScheduler) Poll ¶
func (s *RequestScheduler) Poll() Request
func (*RequestScheduler) Push ¶
func (s *RequestScheduler) Push(reqs ...Request)
type RequestState ¶
type RequestState string
const ( RequestNormal RequestState = "normal" RequestSuccess RequestState = "success" RequestError RequestState = "error" )
type SimpleClientGenerator ¶
type SimpleClientGenerator struct {
// contains filtered or unexported fields
}
SimpleClientGenerator ClientGenerator的默认实现
func (*SimpleClientGenerator) Generate ¶
func (sg *SimpleClientGenerator) Generate() *http.Client
func (*SimpleClientGenerator) SetProxyProvider ¶
func (sg *SimpleClientGenerator) SetProxyProvider(pxyProvider ProxyProvider)
type SimpleProxyProvider ¶
type SimpleProxyProvider struct {
// contains filtered or unexported fields
}
SimpleProxyProvider ProxyProvider默认实现
func (*SimpleProxyProvider) AddProxy ¶
func (sp *SimpleProxyProvider) AddProxy(pxy ...Proxy)
func (*SimpleProxyProvider) GetProxy ¶
func (sp *SimpleProxyProvider) GetProxy() *Proxy
GetProxy 实现ProxyProvider接口
type Spider ¶
type Spider struct { RequestsStore []Store //保存请求对象数据 PreHandleRequest RequestHandle //执行请求前的请求处理 // contains filtered or unexported fields }
func (*Spider) AddRequestStore ¶
AddRequestStore 添加存储器,用于存储请求数据
func (*Spider) SaveHtml ¶
SaveHtml 是否保存html,默认false不保存。savepath:保存地址,也可以在自定义的Handler处理器中自行实现保存逻辑。suffixGenerate:名字后缀函数,html存储名字和生成的后缀拼接。
func (*Spider) SetByteHandler ¶
func (s *Spider) SetByteHandler(handler ByteHandler)
SetByteHandler 设置字节处理器 对下载的字节进行处理
func (*Spider) SetClientGenerator ¶
func (s *Spider) SetClientGenerator(clientGenerator ClientGenerator)
SetClientGenerator 客户端生成器
func (*Spider) SetDownloader ¶
func (s *Spider) SetDownloader(downloader Downloader)
SetDownloader 设置下载器
func (*Spider) SetProxyProvider ¶
func (s *Spider) SetProxyProvider(proxyProvider ProxyProvider)
SetProxyProvider 代理提供者
func (*Spider) SetRequestFilter ¶
func (s *Spider) SetRequestFilter(filter RequestFilter)
SetRequestFilter 设置请求过滤器
func (*Spider) SetTimeOut ¶
SetTimeOut 设置程序在没有数据之后退出的时间;当t<0时,程序一直运行不退出。
type Store ¶
type Store interface { Add(key string, value string) error BatchAdd(m map[string]string) error Get(key string) (string, error) Del(key string) error List(prefix string, limit ...string) ([]string, error) Clear(prefix string, limit ...string) }
key-value存储数据
type StoreRequestFilter ¶
type StoreRequestFilter struct {
// contains filtered or unexported fields
}
RequestFilter的默认实现
func (*StoreRequestFilter) Filter ¶
func (filter *StoreRequestFilter) Filter(requests ...Request) []Request
Click to show internal directories.
Click to hide internal directories.