Documentation ¶
Index ¶
- Constants
- Variables
- func PutContext(ctx *Context)
- type Bell
- type Clock
- type Context
- func (context *Context) AddQueue(req *request.Request) *Context
- func (context *Context) Aid(aid map[string]interface{}, ruleName ...string) interface{}
- func (context *Context) CopyRequest() *request.Request
- func (context *Context) CopyTemps() request.Temp
- func (context *Context) CreatItem(item map[int]interface{}, ruleName ...string) map[string]interface{}
- func (context *Context) FileOutput(name ...string)
- func (context *Context) GetCookie() string
- func (context *Context) GetDom() *goquery.Document
- func (context *Context) GetError() error
- func (context *Context) GetHeader() http.Header
- func (context *Context) GetHost() string
- func (context *Context) GetItemField(index int, ruleName ...string) (field string)
- func (context *Context) GetItemFieldIndex(field string, ruleName ...string) (index int)
- func (context *Context) GetItemFields(ruleName ...string) []string
- func (context *Context) GetKeyIn() string
- func (context *Context) GetLimit() int
- func (context *Context) GetMethod() string
- func (context *Context) GetName() string
- func (context *Context) GetReferer() string
- func (context *Context) GetRequest() *request.Request
- func (context *Context) GetRequestHeader() http.Header
- func (context *Context) GetResponse() *http.Response
- func (context *Context) GetRule(ruleName string) (*Rule, bool)
- func (context *Context) GetRuleName() string
- func (context *Context) GetRules() map[string]*Rule
- func (context *Context) GetSpider() *Spider
- func (context *Context) GetStatusCode() int
- func (context *Context) GetTemp(key string, defaultValue interface{}) interface{}
- func (context *Context) GetTemps() request.Temp
- func (context *Context) GetText() string
- func (context *Context) GetURL() string
- func (context *Context) JsAddQueue(jreq map[string]interface{}) *Context
- func (*Context) Log() logs.Logs
- func (context *Context) Output(item interface{}, ruleName ...string)
- func (context *Context) Parse(ruleName ...string) *Context
- func (context *Context) PullFiles() (fs []data.FileCell)
- func (context *Context) PullItems() (ds []data.DataCell)
- func (context *Context) ResetText(body string) *Context
- func (context *Context) RunTimer(id string) bool
- func (context *Context) SetError(err error)
- func (context *Context) SetKeyIn(keyIn string) *Context
- func (context *Context) SetLimit(max int) *Context
- func (context *Context) SetPausetime(pause int64, runtime ...bool) *Context
- func (context *Context) SetReferer(referer string) *Context
- func (context *Context) SetResponse(resp *http.Response) *Context
- func (context *Context) SetTemp(key string, value interface{}) *Context
- func (context *Context) SetTimer(id string, tol time.Duration, bell *Bell) bool
- func (context *Context) SetURL(url string) *Context
- func (context *Context) UpsertItemField(field string, ruleName ...string) (index int)
- type Rule
- type RuleModel
- type RuleTree
- type Spider
- func (spider *Spider) CanStop() bool
- func (spider *Spider) Copy() *Spider
- func (spider *Spider) Defer()
- func (spider *Spider) DoHistory(req *request.Request, ok bool) bool
- func (spider *Spider) GetDescription() string
- func (spider *Spider) GetEnableCookie() bool
- func (spider *Spider) GetID() int
- func (spider *Spider) GetItemField(rule *Rule, index int) (field string)
- func (spider *Spider) GetItemFieldIndex(rule *Rule, field string) (index int)
- func (spider *Spider) GetItemFields(rule *Rule) []string
- func (spider *Spider) GetKeyIn() string
- func (spider *Spider) GetLimit() int64
- func (spider *Spider) GetName() string
- func (spider *Spider) GetRule(ruleName string) (*Rule, bool)
- func (spider *Spider) GetRules() map[string]*Rule
- func (spider *Spider) GetSubName() string
- func (spider *Spider) IsStopping() bool
- func (spider *Spider) MustGetRule(ruleName string) *Rule
- func (spider *Spider) OutDefaultField() bool
- func (spider Spider) Register() *Spider
- func (spider *Spider) ReqmatrixInit() *Spider
- func (spider *Spider) RequestFree()
- func (spider *Spider) RequestLen() int
- func (spider *Spider) RequestPull() *request.Request
- func (spider *Spider) RequestPush(req *request.Request)
- func (spider *Spider) RequestUse()
- func (spider *Spider) RunTimer(id string) bool
- func (spider *Spider) SetID(id int)
- func (spider *Spider) SetKeyIn(keyword string)
- func (spider *Spider) SetLimit(max int64)
- func (spider *Spider) SetPausetime(pause int64, runtime ...bool)
- func (spider *Spider) SetTimer(id string, tol time.Duration, bell *Bell) bool
- func (spider *Spider) Start()
- func (spider *Spider) Stop()
- func (spider *Spider) TryFlushFailure()
- func (spider *Spider) TryFlushSuccess()
- func (spider *Spider) UpsertItemField(rule *Rule, field string) (index int)
- type SpiderModel
- type SpiderSpecies
- type Timer
Constants ¶
const ( KeyIn = util.UseKeyIn // If Spider.KeyIn is used, set the initial value to USE_KeyIn in the rule Limit = math.MaxInt64 // If you want to customize the control limit in the rule, the Limit initial value must be Limit ForcedStop = "- Take the initiative to terminate Spider -" )
const ( // alarm clock A = iota // countdown T )
Variables ¶
var Species = &SpiderSpecies{ list: []*Spider{}, hash: map[string]*Spider{}, }
Species is the global spider species instance.
Functions ¶
func PutContext ¶
func PutContext(ctx *Context)
Types ¶
type Context ¶
type Context struct { Request *request.Request // original request Response *http.Response // response stream, where URL is copied from * request.Request sync.Mutex // contains filtered or unexported fields }
Context is a struct ...
func (*Context) AddQueue ¶
AddQueue generates a request and adds it to the queue. Request.URL and Request.Rule must be set. Request.Spider does not need to be set manually (the system sets it automatically). Request.EnableCookie is set uniformly in the Spider field; a value specified in a rule's request has no effect. The following fields have default values and may be left unset: Request.Method defaults to GET; Request.DialTimeout defaults to the constant request.DefaultDialTimeout, and a value less than 0 means the wait time is not limited; Request.ConnTimeout defaults to the constant request.DefaultConnTimeout, and a value less than 0 means the download timeout is not limited; Request.TryTimes defaults to the constant request.DefaultTryTimes, and a value less than 0 means the number of retries after failure is not limited; Request.RedirectTimes does not limit the number of redirects by default, and a value less than 0 prohibits redirects; Request.RetryPause defaults to the constant request.DefaultRetryPause; Request.DownloaderID specifies the downloader ID: 0 is the default Surf downloader (high concurrency, full functionality), and 1 is the PhantomJS downloader (strong at getting past anti-crawling defenses, but slow and low concurrency). The Referer is filled in automatically by default.
func (*Context) Aid ¶
Aid calls the helper function AidFunc() of a rule. ruleName selects the rule whose AidFunc is called; when it is empty, the current rule is used.
func (*Context) CopyRequest ¶
Get a copy of the original request.
func (*Context) CreatItem ¶
func (context *Context) CreatItem(item map[int]interface{}, ruleName ...string) map[string]interface{}
CreatItem generates a text result. ruleName selects the rule whose ItemFields are used to match the fields; when it is empty, the current rule is used.
func (*Context) FileOutput ¶
FileOutput outputs a file. name specifies the file name; when it is empty, the original file name is kept unchanged.
func (*Context) GetItemField ¶
Gets the result field name at the given index; returns an empty string if it does not exist. If ruleName is empty, the current rule is used.
func (*Context) GetItemFieldIndex ¶
Gets the index of the given result field name; returns -1 if the field does not exist. If ruleName is empty, the current rule is used.
func (*Context) GetItemFields ¶
Get the list of result field names.
func (*Context) GetReferer ¶
func (*Context) GetRequest ¶
Get the original request.
func (*Context) GetRequestHeader ¶
Get the request header information.
func (*Context) GetResponse ¶
Get the response stream.
func (*Context) GetRuleName ¶
Get the current rule name.
func (*Context) GetStatusCode ¶
Get the response status code.
func (*Context) GetTemp ¶
Gets the temporary cached data stored in the request. defaultValue must not be interface{}(nil).
func (*Context) GetURL ¶
Gets the URL from the original request, which ensures that the URL is exactly the same before and after the request and that Chinese characters are not encoded.
func (*Context) JsAddQueue ¶
Adds a request to the queue; intended for use by dynamic rules.
func (*Context) Output ¶
Outputs a text result. When item is of type map[int]interface{}, it is output according to the existing ItemFields of the rule named by ruleName. When item is of type map[string]interface{}, any fields missing from that rule's ItemFields are added automatically. When ruleName is empty, the current rule is used.
func (*Context) Parse ¶
Parses the response stream. ruleName selects the rule whose ParseFunc is called; by default Root() is called.
func (*Context) SetPausetime ¶
Sets a custom pause interval (random: Pausetime / 2 ~ Pausetime * 2), which takes priority over the value passed in externally. Existing values are overwritten if and only if runtime[0] is true.
func (*Context) SetReferer ¶
func (*Context) SetTimer ¶
Sets a timer. @id is the unique identifier of the timer. When @bell == nil the timer is a countdown, and @tol is the sleep duration; when @bell != nil the timer is an alarm, and @tol specifies the wake-up time (the tol-th bell encountered counting from now).
func (*Context) UpsertItemField ¶
Dynamically appends a result field name to the specified rule and returns its index; if the field already exists, its existing index is returned. If ruleName is empty, the current rule is used.
type Rule ¶
type Rule struct { ItemFields []string // result field list (optional, write guaranteed field order) ParseFunc func(*Context) // Content parsing function AidFunc func(*Context, map[string]interface{}) interface{} // General helper function }
Rule is a node of the collection rule tree.
type RuleModel ¶
type RuleModel struct { Name string `xml:"name,attr"` ParseFunc string `xml:"ParseFunc>Script"` AidFunc string `xml:"AidFunc>Script"` }
RuleModel is a struct for ...
type RuleTree ¶
type RuleTree struct { Root func(*Context) // root node (execute entry) Trunk map[string]*Rule // node hash table (execution acquisition process) }
RuleTree is the collection rule tree.
type Spider ¶
type Spider struct { // The following fields are defined by the user Name string // User interface displays the name (should be guaranteed uniqueness) Description string // The user interface displays the description Pausetime int64 // random pause interval (50% ~ 200%), if the rules are directly defined, it is not covered by interface Limit int64 // default limit request number, 0 is not limited; if the rule is defined as Limit, then use the rules of the custom limit program KeyIn string // Customize the input configuration information, set the initial value to KeyIn in the rule before use EnableCookie bool // all requests are using cookie records NotDefaultField bool // whether to disable the output of the default field currentLink / parentLink / downloadTime Namespace func(spider *Spider) string // namespace, used to output files, named paths SubNamespace func(spider *Spider, dataCell map[string]interface{}) string // Secondary naming, used to output files, named paths, can depend on specific data content RuleTree *RuleTree // define a specific collection rule tree // contains filtered or unexported fields }
spider rules
func (*Spider) DoHistory ¶
Returns whether the request was added as a new failure to the end of the queue
func (*Spider) GetDescription ¶
Get the spider description
func (*Spider) GetEnableCookie ¶
control whether all requests use cookies
func (*Spider) GetItemField ¶
Returns the result field name at the given index; returns an empty string if it does not exist.
func (*Spider) GetItemFieldIndex ¶
Returns the index of the given result field name; returns -1 if the field does not exist.
func (*Spider) GetItemFields ¶
Gets the list of result field names for the specified rule.
func (*Spider) GetLimit ¶
Gets the collection limit. A value < 0 means the number of requests is limited; a value > 0 means the custom limit scheme defined in the rule is used.
func (*Spider) IsStopping ¶
func (*Spider) MustGetRule ¶
returns the specified rule
func (*Spider) OutDefaultField ¶
whether to output the default added field URL / ParentURL / downloadTime
func (*Spider) ReqmatrixInit ¶
func (*Spider) RequestFree ¶
func (spider *Spider) RequestFree()
func (*Spider) RequestLen ¶
func (*Spider) RequestPull ¶
func (*Spider) RequestPush ¶
func (*Spider) RequestUse ¶
func (spider *Spider) RequestUse()
func (*Spider) SetLimit ¶
Sets the collection limit. A value < 0 means the number of requests is limited; a value > 0 means the custom limit scheme defined in the rule is used.
func (*Spider) SetPausetime ¶
Sets a custom pause time of pause[0] ~ (pause[0] + pause[1]), which takes priority over the value passed in externally. Existing values are overwritten if and only if runtime[0] is true.
func (*Spider) SetTimer ¶
Sets a timer. @id is the unique identifier of the timer. When @bell == nil the timer is a countdown, and @tol is the sleep duration; when @bell != nil the timer is an alarm, and @tol specifies the wake-up time (the tol-th bell encountered counting from now).
func (*Spider) TryFlushFailure ¶
func (spider *Spider) TryFlushFailure()
func (*Spider) TryFlushSuccess ¶
func (spider *Spider) TryFlushSuccess()
type SpiderModel ¶
type SpiderModel struct { Name string `xml:"Name"` Description string `xml:"Description"` Pausetime int64 `xml:"Pausetime"` EnableLimit bool `xml:"EnableLimit"` EnableKeyIn bool `xml:"EnableKeyIn"` EnableCookie bool `xml:"EnableCookie"` NotDefaultField bool `xml:"NotDefaultField"` Namespace string `xml:"Namespace>Script"` SubNamespace string `xml:"SubNamespace>Script"` Root string `xml:"Root>Script"` Trunk []RuleModel `xml:"Rule"` }
SpiderModel is the rule interpreter model.
type SpiderSpecies ¶
type SpiderSpecies struct {
// contains filtered or unexported fields
}
List of spider species
func (*SpiderSpecies) Add ¶
func (spiderSpecies *SpiderSpecies) Add(sp *Spider) *Spider
Add a new category to the spider list
func (*SpiderSpecies) Get ¶
func (spiderSpecies *SpiderSpecies) Get() []*Spider
Get all spider species
func (*SpiderSpecies) GetByName ¶
func (spiderSpecies *SpiderSpecies) GetByName(name string) *Spider