Documentation ¶
Index ¶
- Variables
- func AutoMigrateHack(s *gorm.DB, rule *TaskRule) *gorm.DB
- func CancelTask(taskID uint64) bool
- func GetTaskRuleKeys() []string
- func NewConstraints(columns []string, sizeOrSQLConstraint ...interface{}) (constraints map[string]*OutputConstraint)
- func NewSQLString(size int, defaultValue ...string) (sql string)
- func NewStringsConstraints(columns []string, size ...int) (constraints map[string]*OutputConstraint)
- func Register(rule *TaskRule)
- type CSVConf
- type Context
- func (ctx *Context) Abort()
- func (ctx *Context) AbsoluteURL(u string) string
- func (ctx *Context) GetAnyReqContextValue(key string) interface{}
- func (ctx *Context) GetOutputDB() *sql.DB
- func (ctx *Context) GetReqContextValue(key string) string
- func (ctx *Context) GetRequest() *Request
- func (ctx *Context) Output(value interface{}, namespace ...string) error
- func (ctx *Context) OutputCustom(o Outputer, namespace ...string) error
- func (ctx *Context) OutputDefault(row map[int]interface{}, namespace ...string) error
- func (ctx *Context) Post(URL string, requestData map[string]string) error
- func (ctx *Context) PostForNext(URL string, requestData map[string]string) error
- func (ctx *Context) PostForNextWithContext(URL string, requestData map[string]string) error
- func (ctx *Context) PostMultipartForNext(URL string, requestData map[string][]byte) error
- func (ctx *Context) PostRawForNext(URL string, requestData []byte) error
- func (ctx *Context) PostRawForNextWithContext(URL string, requestData []byte) error
- func (ctx *Context) PostWithContext(URL string, requestData map[string]string) error
- func (ctx *Context) PutReqContextValue(key string, value interface{})
- func (ctx *Context) Request(method, URL string, requestData io.Reader, hdr http.Header) error
- func (ctx *Context) RequestForNext(method, URL string, requestData io.Reader, hdr http.Header) error
- func (ctx *Context) RequestForNextWithContext(method, URL string, requestData io.Reader, hdr http.Header) error
- func (ctx *Context) RequestWithContext(method, URL string, requestData io.Reader, hdr http.Header) error
- func (ctx *Context) Retry() error
- func (ctx *Context) SetResponseCharacterEncoding(encoding string)
- func (ctx *Context) Visit(URL string) error
- func (ctx *Context) VisitForNext(URL string) error
- func (ctx *Context) VisitForNextWithContext(URL string) error
- func (ctx *Context) VisitWithContext(URL string) error
- type HTMLElement
- func (h *HTMLElement) Attr(k string) string
- func (h *HTMLElement) ChildAttr(goquerySelector, attrName string) string
- func (h *HTMLElement) ChildAttrs(goquerySelector, attrName string) []string
- func (h *HTMLElement) ChildText(goquerySelector string) string
- func (h *HTMLElement) ForEach(goquerySelector string, callback func(int, *HTMLElement))
- type Limit
- type MultipleNamespaceConf
- type Node
- type Option
- type OutputConfig
- type OutputConstraint
- type Outputer
- type Request
- type Response
- type Rule
- type Spider
- type Task
- type TaskConfig
- type TaskRule
- type XMLElement
Constants ¶
This section is empty.
Variables ¶
var ( // ErrTaskRuleNotExist is the error type for task rule not exist ErrTaskRuleNotExist = errors.New("task rule not exist") // ErrTaskRuleIsNil is the error thrown when a nil rule registered ErrTaskRuleIsNil = errors.New("task rule is nil") // ErrTaskRuleNameIsEmpty is the error thrown when the ruleName is empty ErrTaskRuleNameIsEmpty = errors.New("task rule name is empty") // ErrTaskRuleNameDuplicated is the error thrown if the rule name is duplicated ErrTaskRuleNameDuplicated = errors.New("task rule name is Duplicated") // ErrTaskRuleHeadIsNil is the error thrown if the rule's head is nil ErrTaskRuleHeadIsNil = errors.New("task rule head is nil") // ErrTaskRuleNodesLenInvalid is the error thrown if the rule's nodes len is invalid ErrTaskRuleNodesLenInvalid = errors.New("task rule nodes len is invalid") // ErrTaskRuleNodesKeyInvalid is the error thrown if the rule's key len is invalid ErrTaskRuleNodesKeyInvalid = errors.New("task rule nodes key should start from 0 and monotonically increasing") // ErrTaskRunningTimeout is the error type for task running timeout ErrTaskRunningTimeout = errors.New("task running timeout") )
var ( // ErrOutputFieldsNotMatchOutputRow is the error type for output fields not match out put row ErrOutputFieldsNotMatchOutputRow = errors.New("output fields not match out put row") // ErrTooManyOutputNamespace is the error type for too many output namespace ErrTooManyOutputNamespace = errors.New("too many output namespace") // ErrOutputToMultipleTableDisabled is the error thrown if "OutputToMultipleTable" is false ErrOutputToMultipleTableDisabled = errors.New("output to multiple tables disabled") // ErrOutputTypeNotSupported is the error type for unknown output type ErrOutputTypeNotSupported = errors.New("output type not supported") // ErrMultConfNamespaceNotFound is the error type for mult conf namespace not found ErrMultConfNamespaceNotFound = errors.New("mult conf namespace not found") // ErrOutputParamNotSupported is the error type for unknown output param ErrOutputParamNotSupported = errors.New("output param not supported ") )
Functions ¶
func AutoMigrateHack ¶
AutoMigrateHack auto create table of the rule
func NewConstraints ¶
func NewConstraints(columns []string, sizeOrSQLConstraint ...interface{}) (constraints map[string]*OutputConstraint)
NewConstraints is the convenience func to return the custom constraints
func NewSQLString ¶
NewSQLString is the convenience func to return varchar sql string
func NewStringsConstraints ¶
func NewStringsConstraints(columns []string, size ...int) (constraints map[string]*OutputConstraint)
NewStringsConstraints is the convenience func to return varchar sql string of a batch columns
Types ¶
type Context ¶
type Context struct {
// contains filtered or unexported fields
}
Context is the gospider context of each callback
func (*Context) AbsoluteURL ¶
AbsoluteURL return the absolute URL of u
func (*Context) GetAnyReqContextValue ¶
GetAnyReqContextValue return the interface value for a key on ctx
func (*Context) GetOutputDB ¶
GetOutputDB get database of current context
func (*Context) GetReqContextValue ¶
GetReqContextValue return the string value for a key on ctx
func (*Context) GetRequest ¶
GetRequest return the request on this context
func (*Context) OutputCustom ¶
OutputCustom outputs custom data defined by the user
func (*Context) OutputDefault ¶
OutputDefault outputs a row of data by default
func (*Context) PostForNext ¶
PostForNext issues a POST to the specified URL for next step
func (*Context) PostForNextWithContext ¶
PostForNextWithContext issues a POST to the specified URL for next step with previous context
func (*Context) PostMultipartForNext ¶
PostMultipartForNext issues a multipart POST to the specified URL for next step
func (*Context) PostRawForNext ¶
PostRawForNext issues a rawData POST to the specified URL for next step
func (*Context) PostRawForNextWithContext ¶
PostRawForNextWithContext issues a rawData POST to the specified URL for next step with previous context
func (*Context) PostWithContext ¶
PostWithContext issues a POST to the specified URL with current context
func (*Context) PutReqContextValue ¶
PutReqContextValue sets the value for a key
func (*Context) RequestForNext ¶
func (ctx *Context) RequestForNext(method, URL string, requestData io.Reader, hdr http.Header) error
RequestForNext low level method to send HTTP request for next step
func (*Context) RequestForNextWithContext ¶
func (ctx *Context) RequestForNextWithContext(method, URL string, requestData io.Reader, hdr http.Header) error
RequestForNextWithContext low level method to send HTTP request for next step with previous context
func (*Context) RequestWithContext ¶
func (ctx *Context) RequestWithContext(method, URL string, requestData io.Reader, hdr http.Header) error
RequestWithContext low level method to send HTTP request with context
func (*Context) SetResponseCharacterEncoding ¶
SetResponseCharacterEncoding sets the response character encoding on the request
func (*Context) VisitForNext ¶
VisitForNext issues a GET to the specified URL for next step
func (*Context) VisitForNextWithContext ¶
VisitForNextWithContext issues a GET to the specified URL for next step with previous context
func (*Context) VisitWithContext ¶
VisitWithContext issues a GET to the specified URL with current context
type HTMLElement ¶
type HTMLElement struct { Name string Text string Request *Request Response *Response DOM *goquery.Selection // contains filtered or unexported fields }
HTMLElement the html element object
func (*HTMLElement) Attr ¶
func (h *HTMLElement) Attr(k string) string
Attr return the html element attr value
func (*HTMLElement) ChildAttr ¶
func (h *HTMLElement) ChildAttr(goquerySelector, attrName string) string
ChildAttr the child attr value of h
func (*HTMLElement) ChildAttrs ¶
func (h *HTMLElement) ChildAttrs(goquerySelector, attrName string) []string
ChildAttrs the child attr list of h
func (*HTMLElement) ChildText ¶
func (h *HTMLElement) ChildText(goquerySelector string) string
ChildText the child text content of h
func (*HTMLElement) ForEach ¶
func (h *HTMLElement) ForEach(goquerySelector string, callback func(int, *HTMLElement))
ForEach calls callback on each goquerySelector element
type Limit ¶
type Limit struct { Enable bool // DomainRegexp is a regular expression to match against domains DomainRegexp string // DomainGlob is a glob pattern to match against domains DomainGlob string // Delay is the duration to wait before creating a new request to the matching domains Delay time.Duration // RandomDelay is the extra randomized duration to wait added to Delay before creating a new request RandomDelay time.Duration // Parallelism is the number of the maximum allowed concurrent requests of the matching domains Parallelism int }
Limit is the limit of a task
type MultipleNamespaceConf ¶
type MultipleNamespaceConf struct { OutputFields []string OutputConstraints map[string]*OutputConstraint OutputTableOpts string }
MultipleNamespaceConf is the multiple namespace conf
type Node ¶
type Node struct { OnRequest func(ctx *Context, req *Request) OnError func(ctx *Context, res *Response, err error) error OnResponse func(ctx *Context, res *Response) error OnHTML map[string]func(ctx *Context, el *HTMLElement) error OnXML map[string]func(ctx *Context, el *XMLElement) error OnScraped func(ctx *Context, res *Response) error }
Node the rule node of a task
type Option ¶
type Option struct { UserAgent string MaxDepth int AllowedDomains []string URLFilters []*regexp.Regexp AllowURLRevisit bool MaxBodySize int IgnoreRobotsTxt bool InsecureSkipVerify bool ParseHTTPErrorResponse bool DisableCookies bool RequestTimeout time.Duration }
Option is the config option of a task
type OutputConfig ¶
OutputConfig is the output config of a task
type OutputConstraint ¶
OutputConstraint is the output constraint of db
type Request ¶
type Request struct { URL *url.URL Headers *http.Header Method string Body io.Reader ID uint32 // contains filtered or unexported fields }
Request the object of each request
type Response ¶
type Response struct { StatusCode int Body []byte Request *Request Headers *http.Header // contains filtered or unexported fields }
Response the object of each response
type Spider ¶
type Spider struct {
// contains filtered or unexported fields
}
Spider is the spider definition
type TaskConfig ¶
type TaskConfig struct { CronSpec string Option Option Limit Limit ProxyURLs []string OutputConfig OutputConfig }
TaskConfig is the config of a task
type TaskRule ¶
type TaskRule struct { Name string Description string OutputToMultipleNamespace bool MultipleNamespaceConf map[string]*MultipleNamespaceConf Namespace string OutputFields []string OutputConstraints map[string]*OutputConstraint OutputTableOpts string DisableCookies bool AllowURLRevisit bool IgnoreRobotsTxt bool InsecureSkipVerify bool ParseHTTPErrorResponse bool Rule *Rule }
TaskRule is the task rule definition
func GetTaskRule ¶
GetTaskRule get task rule by ruleName
type XMLElement ¶
type XMLElement struct { Name string Text string Request *Request Response *Response DOM interface{} // contains filtered or unexported fields }
XMLElement the xml element object
func (*XMLElement) Attr ¶
func (x *XMLElement) Attr(k string) string
Attr return the xml element attr value
func (*XMLElement) ChildAttr ¶
func (x *XMLElement) ChildAttr(xpathQuery, attrName string) string
ChildAttr the child attr value of x
func (*XMLElement) ChildAttrs ¶
func (x *XMLElement) ChildAttrs(xpathQuery, attrName string) []string
ChildAttrs the child attr list of x
func (*XMLElement) ChildText ¶
func (x *XMLElement) ChildText(xpathQuery string) string
ChildText the child text content of x