Versions in this module: v1 — v1.2.0 (Feb 13, 2019). Changes in this version: + const ProxyURLKey + var ErrAlreadyVisited = errors.New("URL already visited") + var ErrForbiddenDomain = errors.New("Forbidden domain") + var ErrForbiddenURL = errors.New("ForbiddenURL") + var ErrMaxDepth = errors.New("Max depth limit reached") + var ErrMissingURL = errors.New("Missing URL") + var ErrNoCookieJar = errors.New("Cookie jar is not available") + var ErrNoPattern = errors.New("No pattern defined in LimitRule") + var ErrNoURLFiltersMatch = errors.New("No URLFilters match") + var ErrRobotsTxtBlocked = errors.New("URL blocked by robots.txt") + func AllowURLRevisit() func(*Collector) + func AllowedDomains(domains ...string) func(*Collector) + func Async(a bool) func(*Collector) + func CacheDir(path string) func(*Collector) + func Debugger(d debug.Debugger) func(*Collector) + func DetectCharset() func(*Collector) + func DisallowedDomains(domains ...string) func(*Collector) + func DisallowedURLFilters(filters ...*regexp.Regexp) func(*Collector) + func ID(id uint32) func(*Collector) + func IgnoreRobotsTxt() func(*Collector) + func MaxBodySize(sizeInBytes int) func(*Collector) + func MaxDepth(depth int) func(*Collector) + func ParseHTTPErrorResponse() func(*Collector) + func SanitizeFileName(fileName string) string + func URLFilters(filters ...*regexp.Regexp) func(*Collector) + func UnmarshalHTML(v interface{}, s *goquery.Selection) error + func UserAgent(ua string) func(*Collector) + type Collector struct + AllowURLRevisit bool + AllowedDomains []string + Async bool + CacheDir string + CheckHead bool + DetectCharset bool + DisallowedDomains []string + DisallowedURLFilters []*regexp.Regexp + ID uint32 + IgnoreRobotsTxt bool + MaxBodySize int + MaxDepth int + ParseHTTPErrorResponse bool + RedirectHandler func(req *http.Request, via []*http.Request) error + URLFilters []*regexp.Regexp + UserAgent string + func NewCollector(options ...func(*Collector)) *Collector + func 
(c *Collector) Appengine(ctx context.Context) + func (c *Collector) Clone() *Collector + func (c *Collector) Cookies(URL string) []*http.Cookie + func (c *Collector) DisableCookies() + func (c *Collector) Head(URL string) error + func (c *Collector) Init() + func (c *Collector) Limit(rule *LimitRule) error + func (c *Collector) Limits(rules []*LimitRule) error + func (c *Collector) OnError(f ErrorCallback) + func (c *Collector) OnHTML(goquerySelector string, f HTMLCallback) + func (c *Collector) OnHTMLDetach(goquerySelector string) + func (c *Collector) OnRequest(f RequestCallback) + func (c *Collector) OnResponse(f ResponseCallback) + func (c *Collector) OnScraped(f ScrapedCallback) + func (c *Collector) OnXML(xpathQuery string, f XMLCallback) + func (c *Collector) OnXMLDetach(xpathQuery string) + func (c *Collector) Post(URL string, requestData map[string]string) error + func (c *Collector) PostMultipart(URL string, requestData map[string][]byte) error + func (c *Collector) PostRaw(URL string, requestData []byte) error + func (c *Collector) Request(method, URL string, requestData io.Reader, ctx *Context, hdr http.Header) error + func (c *Collector) SetCookieJar(j *cookiejar.Jar) + func (c *Collector) SetCookies(URL string, cookies []*http.Cookie) error + func (c *Collector) SetDebugger(d debug.Debugger) + func (c *Collector) SetProxy(proxyURL string) error + func (c *Collector) SetProxyFunc(p ProxyFunc) + func (c *Collector) SetRequestTimeout(timeout time.Duration) + func (c *Collector) SetStorage(s storage.Storage) error + func (c *Collector) String() string + func (c *Collector) UnmarshalRequest(r []byte) (*Request, error) + func (c *Collector) Visit(URL string) error + func (c *Collector) Wait() + func (c *Collector) WithTransport(transport http.RoundTripper) + type Context struct + func NewContext() *Context + func (c *Context) ForEach(fn func(k string, v interface{}) interface{}) []interface{} + func (c *Context) Get(key string) string + func (c *Context) 
GetAny(key string) interface{} + func (c *Context) MarshalBinary() (_ []byte, _ error) + func (c *Context) Put(key string, value interface{}) + func (c *Context) UnmarshalBinary(_ []byte) error + type ErrorCallback func(*Response, error) + type HTMLCallback func(*HTMLElement) + type HTMLElement struct + DOM *goquery.Selection + Index int + Name string + Request *Request + Response *Response + Text string + func NewHTMLElementFromSelectionNode(resp *Response, s *goquery.Selection, n *html.Node, idx int) *HTMLElement + func (h *HTMLElement) Attr(k string) string + func (h *HTMLElement) ChildAttr(goquerySelector, attrName string) string + func (h *HTMLElement) ChildAttrs(goquerySelector, attrName string) []string + func (h *HTMLElement) ChildText(goquerySelector string) string + func (h *HTMLElement) ForEach(goquerySelector string, callback func(int, *HTMLElement)) + func (h *HTMLElement) ForEachWithBreak(goquerySelector string, callback func(int, *HTMLElement) bool) + func (h *HTMLElement) Unmarshal(v interface{}) error + type LimitRule struct + Delay time.Duration + DomainGlob string + DomainRegexp string + Parallelism int + RandomDelay time.Duration + func (r *LimitRule) Init() error + func (r *LimitRule) Match(domain string) bool + type ProxyFunc func(*http.Request) (*url.URL, error) + type Request struct + Body io.Reader + Ctx *Context + Depth int + Headers *http.Header + ID uint32 + Method string + ProxyURL string + ResponseCharacterEncoding string + URL *url.URL + func (r *Request) Abort() + func (r *Request) AbsoluteURL(u string) string + func (r *Request) Do() error + func (r *Request) Marshal() ([]byte, error) + func (r *Request) New(method, URL string, body io.Reader) (*Request, error) + func (r *Request) Post(URL string, requestData map[string]string) error + func (r *Request) PostMultipart(URL string, requestData map[string][]byte) error + func (r *Request) PostRaw(URL string, requestData []byte) error + func (r *Request) Retry() error + func (r *Request) 
Visit(URL string) error + type RequestCallback func(*Request) + type Response struct + Body []byte + Ctx *Context + Headers *http.Header + Request *Request + StatusCode int + func (r *Response) FileName() string + func (r *Response) Save(fileName string) error + type ResponseCallback func(*Response) + type ScrapedCallback func(*Response) + type XMLCallback func(*XMLElement) + type XMLElement struct + DOM interface{} + Name string + Request *Request + Response *Response + Text string + func NewXMLElementFromHTMLNode(resp *Response, s *html.Node) *XMLElement + func NewXMLElementFromXMLNode(resp *Response, s *xmlquery.Node) *XMLElement + func (h *XMLElement) Attr(k string) string + func (h *XMLElement) ChildAttr(xpathQuery, attrName string) string + func (h *XMLElement) ChildAttrs(xpathQuery, attrName string) []string + func (h *XMLElement) ChildText(xpathQuery string) string + func (h *XMLElement) ChildTexts(xpathQuery string) []string