Documentation ¶
Index ¶
- Constants
- Variables
- func AddCookieToJar(urlAddr string, cookies ...*http.Cookie) func(s *Spider)
- func GetRequestHash(r *Request) [md5.Size]byte
- func Limiter(WhiteList bool, rules ...*LimitRule) func(s *Spider)
- func RandomProxy(p ...string) func(s *Spider)
- func RandomUserAgent() func(s *Spider)
- func RedisDistributed(ro *redis.Options, sName string, useDeduplicate bool, ...) func(s *Spider)
- func RedisReqDeduplicate(r *redis.Client, sName string) func(s *Spider)
- func RefererFiller() func(s *Spider)
- func ReqDeduplicate() func(s *Spider)
- func Retry(maxTimes int, okcode ...int) func(s *Spider)
- func RobotsTxt(baseUrl, ua string) func(s *Spider)
- func SaveItemsAsCSV(f *os.File) func(s *Spider)
- func SaveItemsAsJSON(f *os.File) func(s *Spider)
- func SetDepthFirst(d bool) func(s *Spider)
- func SpiderLogError(f *os.File) func(s *Spider)
- func SpiderLogPrint() func(s *Spider)
- type BaseDownloader
- type BaseScheduler
- type Context
- type CsvItem
- type CtxHandlerFun
- type Downloader
- type DownloaderErr
- type ErrorItem
- type JsonItem
- type LimitRule
- type LimitRuleAllow
- type Manager
- type RedisScheduler
- type Request
- func (s *Request) AddCookie(c *http.Cookie) *Request
- func (s *Request) AddParam(k, v string) *Request
- func (s *Request) GetBody() []byte
- func (s *Request) SetHeader(key, value string) *Request
- func (s *Request) SetParam(p map[string]string) *Request
- func (s *Request) SetProxy(p string) *Request
- func (s *Request) SetUA(ua string) *Request
- func (s *Request) WithMeta(k string, v interface{}) *Request
- type Response
- type Scheduler
- type Spider
- func (s *Spider) AddTask(request *Request, handlers ...CtxHandlerFun)
- func (s *Spider) OnAdd(fn func(ctx *Context, t *Task) *Task)
- func (s *Spider) OnError(fn func(ctx *Context, err error))
- func (s *Spider) OnFinish(fn func(s *Spider))
- func (s *Spider) OnHTML(selector string, fn func(ctx *Context, sel *goquery.Selection))
- func (s *Spider) OnItem(fn func(i interface{}) interface{})
- func (s *Spider) OnJSON(q string, fn func(ctx *Context, j gjson.Result))
- func (s *Spider) OnReq(fn func(ctx *Context, req *Request) *Request)
- func (s *Spider) OnResp(fn CtxHandlerFun)
- func (s *Spider) OnStart(fn func(s *Spider))
- func (s *Spider) Run()
- func (s *Spider) SetItemPoolSize(i int)
- func (s *Spider) SetTaskPoolSize(i int)
- func (s *Spider) Use(fn ...func(s *Spider))
- type Task
Constants ¶
const DeduplicateSuffix = "_deduplicate"
const ItemsSuffix = "_items"
const TasksSuffix = "_tasks"
Variables ¶
var D = NewBaseDownloader()
var Do = D.Do
var ErrRunFinishedSpider = errors.New("running a spider which is finished,you could recreate this spider and run the new one")
var GetReq = Get
Deprecated: will be removed in the next major version.
var Log = logging.MustGetLogger("goribot")
var PostReq = Post
Deprecated: will be removed in the next major version.
Functions ¶
func AddCookieToJar ¶
AddCookieToJar is an extension that adds cookies to the downloader's cookie jar.
func GetRequestHash ¶
GetRequestHash returns a hash of the URL, header, cookie and body data of a request.
func RandomProxy ¶
RandomProxy is an extension that sets a random proxy URL for new tasks.
func RandomUserAgent ¶
func RandomUserAgent() func(s *Spider)
RandomUserAgent is an extension that sets a random User-Agent for new tasks.
func RedisDistributed ¶
func RedisReqDeduplicate ¶
RedisReqDeduplicate is an extension that deduplicates new tasks using Redis, to support distributed operation.
func RefererFiller ¶
func RefererFiller() func(s *Spider)
RefererFiller is an extension that adds a Referer header to new tasks.
func ReqDeduplicate ¶
func ReqDeduplicate() func(s *Spider)
ReqDeduplicate is an extension that deduplicates new tasks.
func SaveItemsAsCSV ¶
SaveItemsAsCSV is an extension that saves items to a CSV file.
func SaveItemsAsJSON ¶
SaveItemsAsJSON is an extension that saves items to a JSON file.
func SetDepthFirst ¶
SetDepthFirst is an extension that changes the Scheduler's DepthFirst setting.
func SpiderLogError ¶
SpiderLogError is an extension that logs special or error responses.
func SpiderLogPrint ¶
func SpiderLogPrint() func(s *Spider)
SpiderLogPrint is an extension that prints the spider's working status.
Types ¶
type BaseDownloader ¶
BaseDownloader is the default downloader of goribot.
func NewBaseDownloader ¶
func NewBaseDownloader() *BaseDownloader
func (*BaseDownloader) AddMiddleware ¶
type BaseScheduler ¶
type BaseScheduler struct { // DepthFirst sets push new tasks to the top of the queue DepthFirst bool // contains filtered or unexported fields }
BaseScheduler is the default scheduler of goribot.
func NewBaseScheduler ¶
func NewBaseScheduler(depthFirst bool) *BaseScheduler
func (*BaseScheduler) AddItem ¶
func (s *BaseScheduler) AddItem(i interface{})
func (*BaseScheduler) AddTask ¶
func (s *BaseScheduler) AddTask(t *Task)
func (*BaseScheduler) GetItem ¶
func (s *BaseScheduler) GetItem() interface{}
func (*BaseScheduler) GetTask ¶
func (s *BaseScheduler) GetTask() *Task
func (*BaseScheduler) IsItemEmpty ¶
func (s *BaseScheduler) IsItemEmpty() bool
func (*BaseScheduler) IsTaskEmpty ¶
func (s *BaseScheduler) IsTaskEmpty() bool
type Context ¶
type Context struct { // Req is the origin request Req *Request // Resp is the response object Resp *Response // Meta the request task created by NewTaskWithMeta func will have a k-y pair Meta map[string]interface{} Handlers []CtxHandlerFun // contains filtered or unexported fields }
Context is a wrapper around the response, the original request, new tasks, etc.
func (*Context) Abort ¶
func (c *Context) Abort()
Abort aborts this context to break the handler chain and stop handling.
func (*Context) AddItem ¶
func (c *Context) AddItem(i interface{})
AddItem adds an item to the new item list. After every handler func returns, the spider will collect these items and call the OnItem handler func.
func (*Context) AddTask ¶
func (c *Context) AddTask(request *Request, handlers ...CtxHandlerFun)
AddTask adds a task to the new task list. After every handler func returns, the spider will collect these tasks.
type CtxHandlerFun ¶
type CtxHandlerFun func(ctx *Context)
type Downloader ¶
type Downloader interface { Do(req *Request) (resp *Response, err error) AddMiddleware(func(req *Request, next func(req *Request) (resp *Response, err error)) (resp *Response, err error)) }
Downloader is a tool that downloads the response for a request.
type DownloaderErr ¶
type DownloaderErr struct { // Request is the Request object when the error occurred Request *Request // Response is the Response object when the error occurred. It could be nil. Response *Response // contains filtered or unexported fields }
DownloaderErr is an error created by the Downloader.
type LimitRule ¶
type LimitRuleAllow ¶
type LimitRuleAllow uint8
const ( NotSet LimitRuleAllow = iota Allow Disallow )
type Manager ¶
type Manager struct {
// contains filtered or unexported fields
}
func (*Manager) SetItemPoolSize ¶
type RedisScheduler ¶
type RedisScheduler struct {
// contains filtered or unexported fields
}
RedisScheduler is the Redis-based scheduler of goribot.
func NewRedisScheduler ¶
func NewRedisScheduler(redis *redis.Client, sName string, bs int, fn ...CtxHandlerFun) *RedisScheduler
func (*RedisScheduler) AddItem ¶
func (s *RedisScheduler) AddItem(i interface{})
func (*RedisScheduler) AddTask ¶
func (s *RedisScheduler) AddTask(t *Task)
func (*RedisScheduler) GetItem ¶
func (s *RedisScheduler) GetItem() interface{}
func (*RedisScheduler) GetTask ¶
func (s *RedisScheduler) GetTask() *Task
func (*RedisScheduler) IsItemEmpty ¶
func (s *RedisScheduler) IsItemEmpty() bool
func (*RedisScheduler) IsTaskEmpty ¶
func (s *RedisScheduler) IsTaskEmpty() bool
type Request ¶
type Request struct { *http.Request Depth int // ResponseCharacterEncoding is the character encoding of the response body. // Leave it blank to allow automatic character encoding of the response body. // It is empty by default and it can be set in OnRequest callback. ResponseCharacterEncoding string // ProxyURL is the proxy address that handles the request ProxyURL string // Meta contains data between a Request and a Response Meta map[string]interface{} Err error // contains filtered or unexported fields }
Request is an object representing an HTTP request.
func PostFormReq ¶
PostFormReq creates a post request with form data
func PostJsonReq ¶
PostJsonReq creates a post request with json data
func PostRawReq ¶
PostRawReq creates a post request with raw data
func (*Request) SetHeader ¶
SetHeader sets the header entries associated with key to the single element value.
func (*Request) SetParam ¶
SetParam sets the query params of the request URL. Deprecated: will be removed in the next major version.
type Response ¶
type Response struct { *http.Response // Body is the content of the Response Body []byte // Text is the content of the Response parsed as string Text string // Request is the Req object from goribot of the response.Tip: there is another Request attr come from *http.Response Req *Request // Dom is the parsed html object Dom *goquery.Document // Meta contains data between a Request and a Response Meta map[string]interface{} }
Response is an object representing an HTTP response.
func (*Response) DecodeAndParse ¶
DecodeAndParse decodes the body to text and tries to parse it as HTML or JSON.
type Scheduler ¶
type Scheduler interface { // GetTask pops a task GetTask() *Task // GetItem pops a item GetItem() interface{} // AddTask push a task AddTask(t *Task) // AddItem push a item AddItem(i interface{}) // IsTaskEmpty returns is tasks queue empty IsTaskEmpty() bool // IsItemEmpty returns is items queue empty IsItemEmpty() bool }
Scheduler is a queue of tasks and items
type Spider ¶
type Spider struct { Scheduler Scheduler Downloader Downloader AutoStop bool // contains filtered or unexported fields }
func (*Spider) AddTask ¶
func (s *Spider) AddTask(request *Request, handlers ...CtxHandlerFun)
func (*Spider) OnAdd ¶
***********************************************************************************
func (*Spider) OnError ¶
***********************************************************************************
func (*Spider) OnFinish ¶
***********************************************************************************
func (*Spider) OnItem ¶
func (s *Spider) OnItem(fn func(i interface{}) interface{})
***********************************************************************************
func (*Spider) OnReq ¶
***********************************************************************************
func (*Spider) OnResp ¶
func (s *Spider) OnResp(fn CtxHandlerFun)
***********************************************************************************
func (*Spider) OnStart ¶
***********************************************************************************
func (*Spider) SetItemPoolSize ¶
func (*Spider) SetTaskPoolSize ¶
type Task ¶
type Task struct { Request *Request Handlers []CtxHandlerFun }
func NewTask ¶
func NewTask(request *Request, handlers ...CtxHandlerFun) *Task