Documentation ¶
Index ¶
- Variables
- func AbsoluteURL(u string, base *url.URL) string
- func Exec(https bool, req []byte, ...) error
- func HandleElementForm(dom *goquery.Selection, baseURL *url.URL, ...) (method, requestURL, contentType string, body *bytes.Buffer, err error)
- func HandleFormUrlEncoded(method string, actionAbsURL string, selects *goquery.Selection, ...) (requestURL string, body *bytes.Buffer, contentType string, err error)
- func HandleJSGetNewRequest(isHttps bool, req []byte, code string, cb ...func(bool, []byte))
- func HandleMultipartFormData(selects *goquery.Selection) (body *bytes.Buffer, contentType string, err error)
- func HandleRequestResult(isHttps bool, reqBytes, rspBytes []byte) ([][]byte, error)
- func HostToWildcardGlobs(host string) []glob.Glob
- func NewHTTPRequest(https bool, req []byte, rsp []byte, url string) (bool, []byte, error)
- func PageInformationWalker(mimeType string, page string, opts ...PageInfoFetchOption) error
- func StartCrawler(url string, opt ...ConfigOpt) (chan *Req, error)
- type Config
- type ConfigOpt
- func WithAutoLogin(username, password string, flags ...string) ConfigOpt
- func WithBasicAuth(user, pass string) ConfigOpt
- func WithBodySize(size int) ConfigOpt
- func WithConcurrent(concurrent int) ConfigOpt
- func WithConnectTimeout(f float64) ConfigOpt
- func WithDisallowMIMEType(d []string) ConfigOpt
- func WithDisallowSuffix(d []string) ConfigOpt
- func WithDomainBlackList(domain string) ConfigOpt
- func WithDomainWhiteList(domain string) ConfigOpt
- func WithExtraSuffixForEveryPath(path ...string) ConfigOpt
- func WithExtraSuffixForEveryRootPath(path ...string) ConfigOpt
- func WithFixedCookie(k, v string) ConfigOpt
- func WithForbiddenFromParent(b bool) ConfigOpt
- func WithHeader(k, v string) ConfigOpt
- func WithJSParser(enable ...bool) ConfigOpt
- func WithMaxDepth(depth int) ConfigOpt
- func WithMaxRedirectTimes(maxRedirectTimes int) ConfigOpt
- func WithMaxRequestCount(limit int) ConfigOpt
- func WithMaxRetry(limit int) ConfigOpt
- func WithMaxUrlCount(limit int) ConfigOpt
- func WithOnRequest(f func(req *Req)) ConfigOpt
- func WithProxy(proxies ...string) ConfigOpt
- func WithResponseTimeout(f float64) ConfigOpt
- func WithRuntimeID(id string) ConfigOpt
- func WithUrlExtractor(f func(*Req) []interface{}) ConfigOpt
- func WithUrlRegexpBlackList(re string) ConfigOpt
- func WithUrlRegexpWhiteList(re string) ConfigOpt
- func WithUserAgent(ua string) ConfigOpt
- type Crawler
- type JavaScriptContent
- type PageInfoFetchOption
- type Req
- func (r *Req) AbsoluteURL(u string) string
- func (r *Req) Hash() string
- func (r *Req) IsForm() bool
- func (r *Req) IsHttps() bool
- func (r *Req) IsLoginForm() bool
- func (r *Req) IsUploadForm() bool
- func (r *Req) Request() *http.Request
- func (r *Req) RequestRaw() []byte
- func (r *Req) Response() (*http.Response, error)
- func (r *Req) ResponseBody() []byte
- func (r *Req) ResponseRaw() []byte
- func (r *Req) SameWildcardOrigin(s *Req) bool
- func (r *Req) Url() string
- type RequestIf
- type Result
Constants ¶
This section is empty.
Variables ¶
var (
	ExcludedSuffix = []string{
		".css",
		".jpg", ".jpeg", ".png",
		".mp3", ".mp4", ".flv", ".aac", ".ogg",
		".svg", "ico", ".gif",
		".doc", "docx", ".pptx", ".ppt", ".pdf",
	}
	ExcludedMIME = []string{
		"image/*",
		"audio/*", "video/*", "*octet-stream*",
		"application/ogg", "application/pdf", "application/msword",
		"application/x-ppt", "video/avi", "application/x-ico",
		"*zip",
	}
)
var Exports = map[string]interface{}{
	"Start": StartCrawler,

	"basicAuth":           WithBasicAuth,
	"bodySize":            WithBodySize,
	"concurrent":          WithConcurrent,
	"connectTimeout":      WithConnectTimeout,
	"timeout":             WithConnectTimeout,
	"domainExclude":       WithDomainBlackList,
	"domainInclude":       WithDomainWhiteList,
	"cookie":              WithFixedCookie,
	"forbiddenFromParent": WithForbiddenFromParent,
	"disallowSuffix":      WithDisallowSuffix,
	"header":              WithHeader,
	"urlExtractor":        WithUrlExtractor,
	"maxDepth":            WithMaxDepth,
	"maxRedirect":         WithMaxRedirectTimes,
	"maxRequest":          WithMaxRequestCount,
	"maxRetry":            WithMaxRetry,
	"maxUrls":             WithMaxUrlCount,
	"proxy":               WithProxy,
	"responseTimeout":     WithResponseTimeout,
	"urlRegexpExclude":    WithUrlRegexpBlackList,
	"urlRegexpInclude":    WithUrlRegexpWhiteList,
	"userAgent":           WithUserAgent,
	"ua":                  WithUserAgent,
	"autoLogin":           WithAutoLogin,
	"jsParser":            WithJSParser,

	"RequestsFromFlow": HandleRequestResult,
}
var URLPattern, _ = regexp.Compile(`(((?:[a-zA-Z]{1,10}://|//)[^"'/]{1,}\.[a-zA-Z]{2,}[^"']{0,})|((?:/|\.\./|\./)[^"'><,;|*()(%%$^/\\\[\]][^"'><,;|()]{1,})|([a-zA-Z0-9_\-/]{1,}/[a-zA-Z0-9_\-/]{1,}\.(?:[a-zA-Z]{1,4}|action)(?:[\?|/][^"|']{0,}|))|([a-zA-Z0-9_\-]{1,}\.(?:\.{1,10})(?:\?[^"|']{0,}|)))`)
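URLPattern drives the built-in link extraction: it matches absolute URLs, protocol-relative URLs, and several relative-path shapes. As a rough illustration, a minimal Go sketch applying it to a response body (the import path and the extractCandidates helper are assumptions, not part of this package):
```
import (
	"fmt"

	"github.com/yaklang/yaklang/common/crawler" // assumed import path
)

// extractCandidates is a hypothetical helper: it returns every
// non-overlapping URLPattern match found in an HTTP response body.
func extractCandidates(body []byte) []string {
	return crawler.URLPattern.FindAllString(string(body), -1)
}

func demo() {
	body := []byte(`<a href="https://example.com/login">login</a>
<script src="/static/app.js"></script>`)
	for _, m := range extractCandidates(body) {
		fmt.Println(m)
	}
}
```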
Functions ¶
func HandleElementForm ¶
func HandleFormUrlEncoded ¶
func HandleJSGetNewRequest ¶ added in v1.3.1
func HandleMultipartFormData ¶
func HandleRequestResult ¶
RequestsFromFlow tries to crawl out every request that can be derived from a single request/response pair, returning the raw packets of all candidate requests along with any error.

Example:
```
reqs, err = crawler.RequestsFromFlow(false, reqBytes, rspBytes)
```
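At the Go level this is HandleRequestResult; a minimal sketch of feeding a captured flow back in and iterating the extracted raw requests (the import path and the mineRequests helper are assumptions):
```
import (
	"fmt"

	"github.com/yaklang/yaklang/common/crawler" // assumed import path
)

// mineRequests is a hypothetical helper wrapping HandleRequestResult,
// which backs the yak-level RequestsFromFlow.
func mineRequests(reqBytes, rspBytes []byte) error {
	reqs, err := crawler.HandleRequestResult(false, reqBytes, rspBytes)
	if err != nil {
		return err
	}
	for _, raw := range reqs {
		fmt.Printf("candidate request:\n%s\n", raw)
	}
	return nil
}
```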
func HostToWildcardGlobs ¶
func NewHTTPRequest ¶ added in v1.3.0
func PageInformationWalker ¶ added in v1.3.0
func PageInformationWalker(mimeType string, page string, opts ...PageInfoFetchOption) error
func StartCrawler ¶ added in v1.3.0
Start launches a crawl of the given URL. It also accepts zero or more option functions that influence crawling behavior, and returns a channel of *Req references along with any error.

Example:
```
ch, err := crawler.Start("https://www.baidu.com", crawler.concurrent(10))
for req in ch {
    println(req.Response()~)
}
```
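From Go, the same crawl is driven through StartCrawler and the With* option constructors; a minimal sketch (import path assumed):
```
package main

import (
	"fmt"

	"github.com/yaklang/yaklang/common/crawler" // assumed import path
)

func main() {
	ch, err := crawler.StartCrawler("https://example.com",
		crawler.WithConcurrent(10),
		crawler.WithMaxDepth(5),
	)
	if err != nil {
		panic(err)
	}
	// Consume discovered requests; the channel is expected to be
	// closed by the crawler once the crawl finishes.
	for req := range ch {
		fmt.Println(req.Url())
	}
}
```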
Types ¶
type Config ¶
type Config struct {
	// basic authentication
	BasicAuth    bool
	AuthUsername string
	AuthPassword string
	// contains filtered or unexported fields
}
func (*Config) GetLowhttpConfig ¶ added in v1.3.0
func (c *Config) GetLowhttpConfig() []lowhttp.LowhttpOpt
type ConfigOpt ¶ added in v1.3.0
type ConfigOpt func(c *Config)
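ConfigOpt is a plain functional option over Config, so custom options can be written by hand. Because only the basic-auth fields of Config are exported, a hand-rolled option is limited to those; a hypothetical sketch equivalent to WithBasicAuth:
```
import "github.com/yaklang/yaklang/common/crawler" // assumed import path

// customBasicAuth mimics WithBasicAuth using only Config's exported fields.
func customBasicAuth(user, pass string) crawler.ConfigOpt {
	return func(c *crawler.Config) {
		c.BasicAuth = true
		c.AuthUsername = user
		c.AuthPassword = pass
	}
}
```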
func WithAutoLogin ¶
autoLogin is an option function that tells the crawler to automatically fill in any login form it encounters.

Example:
```
crawler.Start("https://example.com", crawler.autoLogin("admin", "admin"))
```
func WithBasicAuth ¶
basicAuth is an option function that sets the username and password the crawler uses for HTTP basic authentication.

Example:
```
crawler.Start("https://example.com", crawler.basicAuth("admin", "admin"))
```
func WithBodySize ¶
bodySize is an option function that sets the maximum response body size; the default is 10MB.

Example:
```
crawler.Start("https://example.com", crawler.bodySize(1024 * 1024))
```
func WithConcurrent ¶
concurrent is an option function that sets the crawler's concurrency; the default is 20.

Example:
```
crawler.Start("https://example.com", crawler.concurrent(10))
```
func WithConnectTimeout ¶
connectTimeout is an option function that sets the connection timeout in seconds; the default is 10s.

Example:
```
crawler.Start("https://example.com", crawler.connectTimeout(5))
```
func WithDisallowMIMEType ¶
func WithDisallowSuffix ¶
disallowSuffix is an option function that sets a blacklist of URL suffixes the crawler skips.

Example:
```
crawler.Start("https://example.com", crawler.disallowSuffix(".css", ".jpg", ".png")) // the crawler will skip css, jpg and png files
```
func WithDomainBlackList ¶
domainExclude is an option function that sets a domain blacklist for the crawl. domain supports glob syntax, e.g. *.example.com.

Example:
```
crawler.Start("https://example.com", crawler.domainExclude("*.baidu.com"))
```
func WithDomainWhiteList ¶
domainInclude is an option function that sets a domain whitelist for the crawl. domain supports glob syntax, e.g. *.example.com.

Example:
```
crawler.Start("https://example.com", crawler.domainInclude("*.example.com"))
```
func WithFixedCookie ¶
cookie is an option function that sets a cookie sent with the crawler's requests.

Example:
```
crawler.Start("https://example.com", crawler.cookie("key", "value"))
```
func WithForbiddenFromParent ¶
forbiddenFromParent is an option function that controls whether the crawler is forbidden from issuing requests from the root path; the default is false. If the starting URL does not begin at the root path and requests from the root path are not forbidden, the crawler will also crawl from the root path.

Example:
```
crawler.Start("https://example.com/a/b/c", crawler.forbiddenFromParent(false)) // this will start crawling from https://example.com/
```
func WithHeader ¶
header is an option function that sets a request header used by the crawler.

Example:
```
crawler.Start("https://example.com", crawler.header("User-Agent", "yaklang-crawler"))
```
func WithJSParser ¶ added in v1.3.2
jsParser is an option function that controls whether the crawler parses JavaScript code. Passing the option with no argument enables parsing; passing false forces it off.

Example:
```
crawler.Start("https://example.com", crawler.jsParser())      // enabled
crawler.Start("https://example.com", crawler.jsParser(true))  // enabled
crawler.Start("https://example.com", crawler.jsParser(false)) // disabled
```
func WithMaxDepth ¶
maxDepth is an option function that sets the crawler's maximum depth; the default is 5.

Example:
```
crawler.Start("https://example.com", crawler.maxDepth(10))
```
func WithMaxRedirectTimes ¶
maxRedirect is an option function that sets the maximum number of redirects to follow; the default is 5.

Example:
```
crawler.Start("https://example.com", crawler.maxRedirect(10))
```
func WithMaxRequestCount ¶
maxRequest is an option function that sets the maximum number of requests; the default is 1000.

Example:
```
crawler.Start("https://example.com", crawler.maxRequest(10000))
```
func WithMaxRetry ¶
maxRetry is an option function that sets the maximum number of retries; the default is 3.

Example:
```
crawler.Start("https://example.com", crawler.maxRetry(10))
```
func WithMaxUrlCount ¶
maxUrls is an option function that sets the maximum number of links to collect; the default is 10000.

Example:
```
crawler.Start("https://example.com", crawler.maxUrls(20000))
```
func WithOnRequest ¶
func WithProxy ¶
proxy is an option function that sets the proxy used by the crawler.

Example:
```
crawler.Start("https://example.com", crawler.proxy("http://127.0.0.1:8080"))
```
func WithResponseTimeout ¶
responseTimeout is an option function that sets the response timeout in seconds; the default is 10s. ! Not yet implemented.

Example:
```
crawler.Start("https://example.com", crawler.responseTimeout(5))
```
func WithRuntimeID ¶ added in v1.3.0
func WithUrlExtractor ¶
urlExtractor is an option function that takes a function as its argument, adding an extra link-extraction rule to the crawler.

Example:
```
crawler.Start("https://example.com", crawler.urlExtractor(func(req) {
    // write your own rule here to extract extra links from the response (req.Response() or req.ResponseRaw())
}))
```
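At the Go level this corresponds to WithUrlExtractor; a hypothetical extractor that pulls extra API paths out of the raw response with a custom regexp (the pattern, helper name, and import path are assumptions):
```
import (
	"regexp"

	"github.com/yaklang/yaklang/common/crawler" // assumed import path
)

// apiPath is a hypothetical pattern for links the built-in extractor might miss.
var apiPath = regexp.MustCompile(`"(/api/v\d+/[^"]+)"`)

func withAPIExtractor() crawler.ConfigOpt {
	return crawler.WithUrlExtractor(func(req *crawler.Req) []interface{} {
		var found []interface{}
		for _, m := range apiPath.FindAllSubmatch(req.ResponseRaw(), -1) {
			// AbsoluteURL resolves the captured path against the request URL.
			found = append(found, req.AbsoluteURL(string(m[1])))
		}
		return found
	})
}
```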
func WithUrlRegexpBlackList ¶
urlRegexpExclude is an option function that sets a URL regexp blacklist for the crawl.

Example:
```
crawler.Start("https://example.com", crawler.urlRegexpExclude(`\.jpg`))
```
func WithUrlRegexpWhiteList ¶
urlRegexpInclude is an option function that sets a URL regexp whitelist for the crawl.

Example:
```
crawler.Start("https://example.com", crawler.urlRegexpInclude(`\.html`))
```
func WithUserAgent ¶
userAgent is an option function that sets the User-Agent used by the crawler.

Example:
```
crawler.Start("https://example.com", crawler.userAgent("yaklang-crawler"))
```
type JavaScriptContent ¶ added in v1.3.0
func (*JavaScriptContent) String ¶ added in v1.3.0
func (s *JavaScriptContent) String() string
type PageInfoFetchOption ¶ added in v1.3.0
type PageInfoFetchOption func(config *infoFetcherConfig)
func WithFetcher_HtmlTag ¶ added in v1.3.0
func WithFetcher_HtmlTag(f func(string, *html.Node)) PageInfoFetchOption
func WithFetcher_HtmlText ¶ added in v1.3.0
func WithFetcher_HtmlText(f func(*html.Node)) PageInfoFetchOption
func WithFetcher_JavaScript ¶ added in v1.3.0
func WithFetcher_JavaScript(f func(content *JavaScriptContent)) PageInfoFetchOption
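Taken together, these options let PageInformationWalker report a page's HTML tags and embedded scripts. A minimal Go sketch (the "text/html" MIME value, the walkPage helper, and the import path are assumptions; *html.Node comes from golang.org/x/net/html):
```
import (
	"fmt"

	"github.com/yaklang/yaklang/common/crawler" // assumed import path
	"golang.org/x/net/html"
)

// walkPage is a hypothetical helper printing every tag name and script body.
func walkPage(page string) error {
	return crawler.PageInformationWalker(
		"text/html", page,
		crawler.WithFetcher_HtmlTag(func(tag string, node *html.Node) {
			fmt.Println("tag:", tag)
		}),
		crawler.WithFetcher_JavaScript(func(content *crawler.JavaScriptContent) {
			fmt.Println("script:", content.String())
		}),
	)
}
```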
type Req ¶
type Req struct {
// contains filtered or unexported fields
}
func (*Req) AbsoluteURL ¶
AbsoluteURL resolves the given relative path against the URL of the current request and returns the absolute URL.

Example:
```
req.AbsoluteURL("/a/b/c")
```
func (*Req) IsLoginForm ¶
IsLoginForm reports whether the current request is a login form.

Example:
```
req.IsLoginForm()
```
func (*Req) IsUploadForm ¶
IsUploadForm reports whether the current request is an upload form.

Example:
```
req.IsUploadForm()
```
func (*Req) RequestRaw ¶
RequestRaw returns the raw request packet of the current request.

Example:
```
req.RequestRaw()
```
func (*Req) ResponseBody ¶
ResponseBody returns the raw response body of the current request.

Example:
```
req.ResponseBody()
```
func (*Req) ResponseRaw ¶
ResponseRaw returns the raw response packet of the current request.

Example:
```
req.ResponseRaw()
```
func (*Req) SameWildcardOrigin ¶
SameWildcardOrigin reports whether the current request and the given request share the same (wildcard-matched) origin.

Example:
```
req1.SameWildcardOrigin(req2)
```