Documentation ¶
Index ¶
- Variables
- func AbsoluteURL(u string, base *url.URL) string
- func HandleElementForm(dom *goquery.Selection, baseURL *url.URL, ...) (method, requestURL, contentType string, body *bytes.Buffer, err error)
- func HandleFormUrlEncoded(method string, actionAbsURL string, selects *goquery.Selection, ...) (requestURL string, body *bytes.Buffer, contentType string, err error)
- func HandleMultipartFormData(selects *goquery.Selection) (body *bytes.Buffer, contentType string, err error)
- func HandleRequestResult(isHttps bool, reqBytes, rspBytes []byte) ([][]byte, error)
- func HostToWildcardGlobs(host string) []glob.Glob
- func RoundRobinProxySwitcher(ProxyURLs ...string) (func(r *http.Request) (*url.URL, error), error)
- func WithAutoLogin(username, password string, flags ...string) configOpt
- func WithBasicAuth(user, pass string) configOpt
- func WithBodySize(size int) configOpt
- func WithConcurrent(concurrent int) configOpt
- func WithConnectTimeout(f float64) configOpt
- func WithDisallowMIMEType(d []string) configOpt
- func WithDisallowSuffix(d []string) configOpt
- func WithDomainBlackList(domain string) configOpt
- func WithDomainWhiteList(domain string) configOpt
- func WithExtraSuffixForEveryPath(path ...string) configOpt
- func WithExtraSuffixForEveryRootPath(path ...string) configOpt
- func WithFixedCookie(k, v string) configOpt
- func WithForbiddenFromParent(b bool) configOpt
- func WithHeader(k, v string) configOpt
- func WithMaxDepth(depth int) configOpt
- func WithMaxRedirectTimes(maxRedirectTimes int) configOpt
- func WithMaxRequestCount(limit int) configOpt
- func WithMaxRetry(limit int) configOpt
- func WithMaxUrlCount(limit int) configOpt
- func WithOnRequest(f func(req *Req)) configOpt
- func WithProxy(proxies ...string) configOpt
- func WithResponseTimeout(f float64) configOpt
- func WithUrlExtractor(f func(*Req) []interface{}) configOpt
- func WithUrlRegexpBlackList(re string) configOpt
- func WithUrlRegexpWhiteList(re string) configOpt
- func WithUserAgent(ua string) configOpt
- type Config
- type Crawler
- type Req
- func (r *Req) AbsoluteURL(u string) string
- func (r *Req) Hash() string
- func (r *Req) IsForm() bool
- func (r *Req) IsHttps() bool
- func (r *Req) IsLoginForm() bool
- func (r *Req) IsUploadForm() bool
- func (r *Req) Request() *http.Request
- func (r *Req) RequestRaw() []byte
- func (r *Req) Response() (*http.Response, error)
- func (r *Req) ResponseBody() []byte
- func (r *Req) ResponseRaw() []byte
- func (r *Req) SameWildcardOrigin(s *Req) bool
- func (r *Req) Url() string
- type RequestIf
- type Result
Constants ¶
This section is empty.
Variables ¶
View Source
// Default exclusion lists.
// NOTE(review): presumably these are the defaults that WithDisallowSuffix and
// WithDisallowMIMEType replace — the consuming code is not visible here.
var (
	// ExcludedSuffix lists file suffixes that should not be crawled.
	// Every entry carries its leading dot so that a suffix comparison cannot
	// match an unrelated path that merely ends in the same letters.
	ExcludedSuffix = []string{
		".js", ".css",
		".jpg", ".jpeg", ".png",
		".mp3", ".mp4", ".flv", ".aac", ".ogg",
		".svg", ".ico", ".gif",
		".doc", ".docx", ".pptx", ".ppt", ".pdf",
	}

	// ExcludedMIME lists MIME-type patterns that should not be crawled.
	// Entries containing `*` look like wildcard patterns — the matching
	// rules live in the consumer; confirm there.
	ExcludedMIME = []string{
		"image/*",
		"audio/*",
		"video/*",
		"*octet-stream*",
		"application/ogg",
		"application/pdf",
		"application/msword",
		"application/x-ppt",
		"video/avi",
		"application/x-ico",
		"*zip",
	}
)
View Source
var Exports = map[string]interface{}{ "Start": func(url string, opt ...configOpt) (chan *Req, error) { ch := make(chan *Req) opt = append(opt, WithOnRequest(func(req *Req) { ch <- req })) crawler, err := NewCrawler(url, opt...) if err != nil { return nil, utils.Errorf("create crawler failed: %s", err) } go func() { defer close(ch) err := crawler.Run() if err != nil { log.Error(err) } }() return ch, nil }, "basicAuth": WithBasicAuth, "bodySize": WithBodySize, "concurrent": WithConcurrent, "connectTimeout": WithConnectTimeout, "timeout": WithConnectTimeout, "domainExclude": WithDomainBlackList, "domainInclude": WithDomainWhiteList, "cookie": WithFixedCookie, "forbiddenFromParent": WithForbiddenFromParent, "disallowSuffix": WithDisallowSuffix, "header": WithHeader, "urlExtractor": WithUrlExtractor, "maxDepth": WithMaxDepth, "maxRedirect": WithMaxRedirectTimes, "maxRequest": WithMaxRequestCount, "maxRetry": WithMaxRetry, "maxUrls": WithMaxUrlCount, "proxy": WithProxy, "responseTimeout": WithResponseTimeout, "urlRegexpExclude": WithUrlRegexpBlackList, "urlRegexpInclude": WithUrlRegexpWhiteList, "userAgent": WithUserAgent, "ua": WithUserAgent, "autoLogin": WithAutoLogin, "RequestsFromFlow": HandleRequestResult, }
View Source
var (
	// URLPattern matches URL-like substrings. Its alternatives, in order:
	//  1. an absolute or protocol-relative URL ("scheme://" or "//") whose
	//     host part ends in a dot plus at least two letters;
	//  2. a path starting with "/", "../" or "./";
	//  3. a relative path containing a "/" and ending in a 1-4 letter
	//     extension (or "action"), optionally followed by a query or sub-path;
	//  4. a bare name followed by a dot.
	//
	// NOTE(review): the fourth alternative requires 1-10 literal dots after
	// the first dot (`\.(?:\.{1,10})`), which looks like it was meant to be a
	// file-extension class — confirm the intent before changing it.
	//
	// MustCompile is used instead of discarding Compile's error so that a
	// broken pattern fails loudly at init rather than leaving a nil regexp.
	URLPattern = regexp.MustCompile(`(((?:[a-zA-Z]{1,10}://|//)[^"'/]{1,}\.[a-zA-Z]{2,}[^"']{0,})|((?:/|\.\./|\./)[^"'><,;|*()(%%$^/\\\[\]][^"'><,;|()]{1,})|([a-zA-Z0-9_\-/]{1,}/[a-zA-Z0-9_\-/]{1,}\.(?:[a-zA-Z]{1,4}|action)(?:[\?|/][^"|']{0,}|))|([a-zA-Z0-9_\-]{1,}\.(?:\.{1,10})(?:\?[^"|']{0,}|)))`)
)
Functions ¶
func HandleElementForm ¶
func HandleFormUrlEncoded ¶
func HandleMultipartFormData ¶
func HandleRequestResult ¶
func HostToWildcardGlobs ¶
func RoundRobinProxySwitcher ¶
RoundRobinProxySwitcher creates a proxy switcher function which rotates ProxyURLs on every request. The proxy type is determined by the URL scheme. "http", "https" and "socks5" are supported. If the scheme is empty, "http" is assumed.
func WithAutoLogin ¶
func WithBasicAuth ¶
func WithBasicAuth(user, pass string) configOpt
func WithBodySize ¶
func WithBodySize(size int) configOpt
func WithConcurrent ¶
func WithConcurrent(concurrent int) configOpt
func WithConnectTimeout ¶
func WithConnectTimeout(f float64) configOpt
func WithDisallowMIMEType ¶
func WithDisallowMIMEType(d []string) configOpt
func WithDisallowSuffix ¶
func WithDisallowSuffix(d []string) configOpt
func WithDomainBlackList ¶
func WithDomainBlackList(domain string) configOpt
func WithDomainWhiteList ¶
func WithDomainWhiteList(domain string) configOpt
func WithExtraSuffixForEveryPath ¶
func WithExtraSuffixForEveryPath(path ...string) configOpt
func WithExtraSuffixForEveryRootPath ¶
func WithExtraSuffixForEveryRootPath(path ...string) configOpt
func WithFixedCookie ¶
func WithFixedCookie(k, v string) configOpt
func WithForbiddenFromParent ¶
func WithForbiddenFromParent(b bool) configOpt
func WithHeader ¶
func WithHeader(k, v string) configOpt
func WithMaxDepth ¶
func WithMaxDepth(depth int) configOpt
func WithMaxRedirectTimes ¶
func WithMaxRedirectTimes(maxRedirectTimes int) configOpt
func WithMaxRequestCount ¶
func WithMaxRequestCount(limit int) configOpt
func WithMaxRetry ¶
func WithMaxRetry(limit int) configOpt
func WithMaxUrlCount ¶
func WithMaxUrlCount(limit int) configOpt
func WithOnRequest ¶
func WithOnRequest(f func(req *Req)) configOpt
func WithResponseTimeout ¶
func WithResponseTimeout(f float64) configOpt
func WithUrlExtractor ¶
func WithUrlExtractor(f func(*Req) []interface{}) configOpt
func WithUrlRegexpBlackList ¶
func WithUrlRegexpBlackList(re string) configOpt
func WithUrlRegexpWhiteList ¶
func WithUrlRegexpWhiteList(re string) configOpt
func WithUserAgent ¶
func WithUserAgent(ua string) configOpt
Types ¶
type Config ¶
// Config holds the package's configuration. Only the basic-auth fields are
// exported; everything else is unexported.
// NOTE(review): presumably populated through the With* option constructors —
// configOpt itself is not shown here.
type Config struct {
	// Basic authentication
	BasicAuth    bool
	AuthUsername string
	AuthPassword string
	// contains filtered or unexported fields
}
func (*Config) CreateHTTPClient ¶
type Req ¶
// Req is the value passed to WithOnRequest callbacks and WithUrlExtractor
// functions. All of its state is unexported; inspect it through its methods
// (Request, Response, Url, and the other accessors).
type Req struct {
// contains filtered or unexported fields
}
func (*Req) AbsoluteURL ¶
func (*Req) IsLoginForm ¶
func (*Req) IsUploadForm ¶
func (*Req) RequestRaw ¶
func (*Req) ResponseBody ¶
func (*Req) ResponseRaw ¶
func (*Req) SameWildcardOrigin ¶
Click to show internal directories.
Click to hide internal directories.