Documentation ¶
Index ¶
- Variables
- func AbsoluteURL(u string, base *url.URL) string
- func Exec(https bool, req []byte, ...) error
- func HandleElementForm(dom *goquery.Selection, baseURL *url.URL, ...) (method, requestURL, contentType string, body *bytes.Buffer, err error)
- func HandleFormUrlEncoded(method string, actionAbsURL string, selects *goquery.Selection, ...) (requestURL string, body *bytes.Buffer, contentType string, err error)
- func HandleJSGetNewRequest(isHttps bool, req []byte, code string, cb ...func(bool, []byte))
- func HandleMultipartFormData(selects *goquery.Selection) (body *bytes.Buffer, contentType string, err error)
- func HandleRequestResult(isHttps bool, reqBytes, rspBytes []byte) ([][]byte, error)
- func HostToWildcardGlobs(host string) []glob.Glob
- func NewHTTPRequest(https bool, req []byte, rsp []byte, url string) (bool, []byte, error)
- func PageInformationWalker(mimeType string, page string, opts ...PageInfoFetchOption) error
- func StartCrawler(url string, opt ...ConfigOpt) (chan *Req, error)
- type Config
- type ConfigOpt
- func WithAutoLogin(username, password string, flags ...string) ConfigOpt
- func WithBasicAuth(user, pass string) ConfigOpt
- func WithBodySize(size int) ConfigOpt
- func WithConcurrent(concurrent int) ConfigOpt
- func WithConnectTimeout(f float64) ConfigOpt
- func WithDisallowMIMEType(d []string) ConfigOpt
- func WithDisallowSuffix(d []string) ConfigOpt
- func WithDomainBlackList(domain string) ConfigOpt
- func WithDomainWhiteList(domain string) ConfigOpt
- func WithExtraSuffixForEveryPath(path ...string) ConfigOpt
- func WithExtraSuffixForEveryRootPath(path ...string) ConfigOpt
- func WithFixedCookie(k, v string) ConfigOpt
- func WithForbiddenFromParent(b bool) ConfigOpt
- func WithHeader(k, v string) ConfigOpt
- func WithJSParser(enable ...bool) ConfigOpt
- func WithMaxDepth(depth int) ConfigOpt
- func WithMaxRedirectTimes(maxRedirectTimes int) ConfigOpt
- func WithMaxRequestCount(limit int) ConfigOpt
- func WithMaxRetry(limit int) ConfigOpt
- func WithMaxUrlCount(limit int) ConfigOpt
- func WithOnRequest(f func(req *Req)) ConfigOpt
- func WithProxy(proxies ...string) ConfigOpt
- func WithResponseTimeout(f float64) ConfigOpt
- func WithRuntimeID(id string) ConfigOpt
- func WithUrlExtractor(f func(*Req) []interface{}) ConfigOpt
- func WithUrlRegexpBlackList(re string) ConfigOpt
- func WithUrlRegexpWhiteList(re string) ConfigOpt
- func WithUserAgent(ua string) ConfigOpt
- type Crawler
- type JavaScriptContent
- type PageInfoFetchOption
- type Req
- func (r *Req) AbsoluteURL(u string) string
- func (r *Req) Hash() string
- func (r *Req) IsForm() bool
- func (r *Req) IsHttps() bool
- func (r *Req) IsLoginForm() bool
- func (r *Req) IsUploadForm() bool
- func (r *Req) Request() *http.Request
- func (r *Req) RequestRaw() []byte
- func (r *Req) Response() (*http.Response, error)
- func (r *Req) ResponseBody() []byte
- func (r *Req) ResponseRaw() []byte
- func (r *Req) SameWildcardOrigin(s *Req) bool
- func (r *Req) Url() string
- type RequestIf
- type Result
Constants ¶
This section is empty.
Variables ¶
var (
	ExcludedSuffix = []string{
		".css",
		".jpg", ".jpeg", ".png",
		".mp3", ".mp4", ".flv", ".aac", ".ogg",
		".svg", "ico", ".gif",
		".doc", "docx", ".pptx", ".ppt", ".pdf",
	}
	ExcludedMIME = []string{
		"image/*",
		"audio/*", "video/*", "*octet-stream*",
		"application/ogg", "application/pdf", "application/msword",
		"application/x-ppt", "video/avi", "application/x-ico",
		"*zip",
	}
)
var Exports = map[string]interface{}{
	"Start": StartCrawler,

	"basicAuth":           WithBasicAuth,
	"bodySize":            WithBodySize,
	"concurrent":          WithConcurrent,
	"connectTimeout":      WithConnectTimeout,
	"timeout":             WithConnectTimeout,
	"domainExclude":       WithDomainBlackList,
	"domainInclude":       WithDomainWhiteList,
	"cookie":              WithFixedCookie,
	"forbiddenFromParent": WithForbiddenFromParent,
	"disallowSuffix":      WithDisallowSuffix,
	"header":              WithHeader,
	"urlExtractor":        WithUrlExtractor,
	"maxDepth":            WithMaxDepth,
	"maxRedirect":         WithMaxRedirectTimes,
	"maxRequest":          WithMaxRequestCount,
	"maxRetry":            WithMaxRetry,
	"maxUrls":             WithMaxUrlCount,
	"proxy":               WithProxy,
	"responseTimeout":     WithResponseTimeout,
	"urlRegexpExclude":    WithUrlRegexpBlackList,
	"urlRegexpInclude":    WithUrlRegexpWhiteList,
	"userAgent":           WithUserAgent,
	"ua":                  WithUserAgent,
	"autoLogin":           WithAutoLogin,
	"jsParser":            WithJSParser,

	"RequestsFromFlow": HandleRequestResult,
}
var URLPattern, _ = regexp.Compile(`(((?:[a-zA-Z]{1,10}://|//)[^"'/]{1,}\.[a-zA-Z]{2,}[^"']{0,})|((?:/|\.\./|\./)[^"'><,;|*()(%%$^/\\\[\]][^"'><,;|()]{1,})|([a-zA-Z0-9_\-/]{1,}/[a-zA-Z0-9_\-/]{1,}\.(?:[a-zA-Z]{1,4}|action)(?:[\?|/][^"|']{0,}|))|([a-zA-Z0-9_\-]{1,}\.(?:\.{1,10})(?:\?[^"|']{0,}|)))`)
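URLPattern drives the built-in link extraction: it matches absolute URLs, protocol-relative URLs, and several relative-path shapes. As a rough illustration, a minimal Go sketch applying it to a response body (the import path and the extractCandidates helper are assumptions, not part of this package):
```
import (
	"fmt"

	"github.com/yaklang/yaklang/common/crawler" // assumed import path
)

// extractCandidates is a hypothetical helper: it returns every
// non-overlapping URLPattern match found in an HTTP response body.
func extractCandidates(body []byte) []string {
	return crawler.URLPattern.FindAllString(string(body), -1)
}

func demo() {
	body := []byte(`<a href="https://example.com/login">login</a>
<script src="/static/app.js"></script>`)
	for _, m := range extractCandidates(body) {
		fmt.Println(m)
	}
}
```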
Functions ¶
func HandleElementForm ¶
func HandleFormUrlEncoded ¶
func HandleJSGetNewRequest ¶ added in v1.3.1
func HandleMultipartFormData ¶
func HandleRequestResult ¶
RequestsFromFlow tries to crawl out every request that can be derived from a single request/response pair, returning the raw packets of all candidate requests along with any error.

Example:
```
reqs, err = crawler.RequestsFromFlow(false, reqBytes, rspBytes)
```
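At the Go level this is HandleRequestResult; a minimal sketch of feeding a captured flow back in and iterating the extracted raw requests (the import path and the mineRequests helper are assumptions):
```
import (
	"fmt"

	"github.com/yaklang/yaklang/common/crawler" // assumed import path
)

// mineRequests is a hypothetical helper wrapping HandleRequestResult,
// which backs the yak-level RequestsFromFlow.
func mineRequests(reqBytes, rspBytes []byte) error {
	reqs, err := crawler.HandleRequestResult(false, reqBytes, rspBytes)
	if err != nil {
		return err
	}
	for _, raw := range reqs {
		fmt.Printf("candidate request:\n%s\n", raw)
	}
	return nil
}
```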
func HostToWildcardGlobs ¶
func NewHTTPRequest ¶ added in v1.3.0
func PageInformationWalker ¶ added in v1.3.0
func PageInformationWalker(mimeType string, page string, opts ...PageInfoFetchOption) error
func StartCrawler ¶ added in v1.3.0
Start launches a crawl of the given URL. It also accepts zero or more option functions that influence crawling behavior, and returns a channel of *Req references along with any error.

Example:
```
ch, err := crawler.Start("https://www.baidu.com", crawler.concurrent(10))
for req in ch {
    println(req.Response()~)
}
```
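From Go, the same crawl is driven through StartCrawler and the With* option constructors; a minimal sketch (import path assumed):
```
package main

import (
	"fmt"

	"github.com/yaklang/yaklang/common/crawler" // assumed import path
)

func main() {
	ch, err := crawler.StartCrawler("https://example.com",
		crawler.WithConcurrent(10),
		crawler.WithMaxDepth(5),
	)
	if err != nil {
		panic(err)
	}
	// Consume discovered requests; the channel is expected to be
	// closed by the crawler once the crawl finishes.
	for req := range ch {
		fmt.Println(req.Url())
	}
}
```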
Types ¶
type Config ¶
type Config struct {
	// basic authentication
	BasicAuth    bool
	AuthUsername string
	AuthPassword string
	// contains filtered or unexported fields
}
func (*Config) GetLowhttpConfig ¶ added in v1.3.0
func (c *Config) GetLowhttpConfig() []lowhttp.LowhttpOpt
type ConfigOpt ¶ added in v1.3.0
type ConfigOpt func(c *Config)
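ConfigOpt is a plain functional option over Config, so custom options can be written by hand. Because only the basic-auth fields of Config are exported, a hand-rolled option is limited to those; a hypothetical sketch equivalent to WithBasicAuth:
```
import "github.com/yaklang/yaklang/common/crawler" // assumed import path

// customBasicAuth mimics WithBasicAuth using only Config's exported fields.
func customBasicAuth(user, pass string) crawler.ConfigOpt {
	return func(c *crawler.Config) {
		c.BasicAuth = true
		c.AuthUsername = user
		c.AuthPassword = pass
	}
}
```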
func WithAutoLogin ¶
autoLogin is an option function that tells the crawler to automatically fill in any login form it encounters.

Example:
```
crawler.Start("https://example.com", crawler.autoLogin("admin", "admin"))
```
func WithBasicAuth ¶
basicAuth is an option function that sets the username and password the crawler uses for HTTP basic authentication.

Example:
```
crawler.Start("https://example.com", crawler.basicAuth("admin", "admin"))
```
func WithBodySize ¶
bodySize is an option function that sets the maximum response body size; the default is 10MB.

Example:
```
crawler.Start("https://example.com", crawler.bodySize(1024 * 1024))
```
func WithConcurrent ¶
concurrent is an option function that sets the crawler's concurrency; the default is 20.

Example:
```
crawler.Start("https://example.com", crawler.concurrent(10))
```
func WithConnectTimeout ¶
connectTimeout is an option function that sets the connection timeout in seconds; the default is 10s.

Example:
```
crawler.Start("https://example.com", crawler.connectTimeout(5))
```
func WithDisallowMIMEType ¶
func WithDisallowSuffix ¶
disallowSuffix is an option function that sets a blacklist of URL suffixes the crawler skips.

Example:
```
crawler.Start("https://example.com", crawler.disallowSuffix(".css", ".jpg", ".png")) // the crawler will skip css, jpg and png files
```
func WithDomainBlackList ¶
domainExclude is an option function that sets a domain blacklist for the crawl. domain supports glob syntax, e.g. *.example.com.

Example:
```
crawler.Start("https://example.com", crawler.domainExclude("*.baidu.com"))
```
func WithDomainWhiteList ¶
domainInclude is an option function that sets a domain whitelist for the crawl. domain supports glob syntax, e.g. *.example.com.

Example:
```
crawler.Start("https://example.com", crawler.domainInclude("*.example.com"))
```
func WithFixedCookie ¶
cookie is an option function that sets a cookie sent with the crawler's requests.

Example:
```
crawler.Start("https://example.com", crawler.cookie("key", "value"))
```
func WithForbiddenFromParent ¶
forbiddenFromParent is an option function that controls whether the crawler is forbidden from issuing requests from the root path; the default is false. If the starting URL does not begin at the root path and requests from the root path are not forbidden, the crawler will also crawl from the root path.

Example:
```
crawler.Start("https://example.com/a/b/c", crawler.forbiddenFromParent(false)) // this will start crawling from https://example.com/
```
func WithHeader ¶
header is an option function that sets a request header used by the crawler.

Example:
```
crawler.Start("https://example.com", crawler.header("User-Agent", "yaklang-crawler"))
```
func WithJSParser ¶ added in v1.3.2
jsParser is an option function that controls whether the crawler parses JavaScript code. Passing the option with no argument enables parsing; passing false forces it off.

Example:
```
crawler.Start("https://example.com", crawler.jsParser())      // enabled
crawler.Start("https://example.com", crawler.jsParser(true))  // enabled
crawler.Start("https://example.com", crawler.jsParser(false)) // disabled
```
func WithMaxDepth ¶
maxDepth is an option function that sets the crawler's maximum depth; the default is 5.

Example:
```
crawler.Start("https://example.com", crawler.maxDepth(10))
```
func WithMaxRedirectTimes ¶
maxRedirect is an option function that sets the maximum number of redirects to follow; the default is 5.

Example:
```
crawler.Start("https://example.com", crawler.maxRedirect(10))
```
func WithMaxRequestCount ¶
maxRequest is an option function that sets the maximum number of requests; the default is 1000.

Example:
```
crawler.Start("https://example.com", crawler.maxRequest(10000))
```
func WithMaxRetry ¶
maxRetry is an option function that sets the maximum number of retries; the default is 3.

Example:
```
crawler.Start("https://example.com", crawler.maxRetry(10))
```
func WithMaxUrlCount ¶
maxUrls is an option function that sets the maximum number of links to collect; the default is 10000.

Example:
```
crawler.Start("https://example.com", crawler.maxUrls(20000))
```
func WithOnRequest ¶
func WithProxy ¶
proxy is an option function that sets the proxy used by the crawler.

Example:
```
crawler.Start("https://example.com", crawler.proxy("http://127.0.0.1:8080"))
```
func WithResponseTimeout ¶
responseTimeout is an option function that sets the response timeout in seconds; the default is 10s. ! Not yet implemented.

Example:
```
crawler.Start("https://example.com", crawler.responseTimeout(5))
```
func WithRuntimeID ¶ added in v1.3.0
func WithUrlExtractor ¶
urlExtractor is an option function that takes a function as its argument, adding an extra link-extraction rule to the crawler.

Example:
```
crawler.Start("https://example.com", crawler.urlExtractor(func(req) {
    // write your own rule here to extract extra links from the response (req.Response() or req.ResponseRaw())
}))
```
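At the Go level this corresponds to WithUrlExtractor; a hypothetical extractor that pulls extra API paths out of the raw response with a custom regexp (the pattern, helper name, and import path are assumptions):
```
import (
	"regexp"

	"github.com/yaklang/yaklang/common/crawler" // assumed import path
)

// apiPath is a hypothetical pattern for links the built-in extractor might miss.
var apiPath = regexp.MustCompile(`"(/api/v\d+/[^"]+)"`)

func withAPIExtractor() crawler.ConfigOpt {
	return crawler.WithUrlExtractor(func(req *crawler.Req) []interface{} {
		var found []interface{}
		for _, m := range apiPath.FindAllSubmatch(req.ResponseRaw(), -1) {
			// AbsoluteURL resolves the captured path against the request URL.
			found = append(found, req.AbsoluteURL(string(m[1])))
		}
		return found
	})
}
```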
func WithUrlRegexpBlackList ¶
urlRegexpExclude is an option function that sets a URL regexp blacklist for the crawl.

Example:
```
crawler.Start("https://example.com", crawler.urlRegexpExclude(`\.jpg`))
```
func WithUrlRegexpWhiteList ¶
urlRegexpInclude is an option function that sets a URL regexp whitelist for the crawl.

Example:
```
crawler.Start("https://example.com", crawler.urlRegexpInclude(`\.html`))
```
func WithUserAgent ¶
userAgent is an option function that sets the User-Agent used by the crawler.

Example:
```
crawler.Start("https://example.com", crawler.userAgent("yaklang-crawler"))
```
type JavaScriptContent ¶ added in v1.3.0
func (*JavaScriptContent) String ¶ added in v1.3.0
func (s *JavaScriptContent) String() string
type PageInfoFetchOption ¶ added in v1.3.0
type PageInfoFetchOption func(config *infoFetcherConfig)
func WithFetcher_HtmlTag ¶ added in v1.3.0
func WithFetcher_HtmlTag(f func(string, *html.Node)) PageInfoFetchOption
func WithFetcher_HtmlText ¶ added in v1.3.0
func WithFetcher_HtmlText(f func(*html.Node)) PageInfoFetchOption
func WithFetcher_JavaScript ¶ added in v1.3.0
func WithFetcher_JavaScript(f func(content *JavaScriptContent)) PageInfoFetchOption
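Taken together, these options let PageInformationWalker report a page's HTML tags and embedded scripts. A minimal Go sketch (the "text/html" MIME value, the walkPage helper, and the import path are assumptions; *html.Node comes from golang.org/x/net/html):
```
import (
	"fmt"

	"github.com/yaklang/yaklang/common/crawler" // assumed import path
	"golang.org/x/net/html"
)

// walkPage is a hypothetical helper printing every tag name and script body.
func walkPage(page string) error {
	return crawler.PageInformationWalker(
		"text/html", page,
		crawler.WithFetcher_HtmlTag(func(tag string, node *html.Node) {
			fmt.Println("tag:", tag)
		}),
		crawler.WithFetcher_JavaScript(func(content *crawler.JavaScriptContent) {
			fmt.Println("script:", content.String())
		}),
	)
}
```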
type Req ¶
type Req struct {
// contains filtered or unexported fields
}
func (*Req) AbsoluteURL ¶
AbsoluteURL resolves the given relative path against the URL of the current request and returns the absolute URL.

Example:
```
req.AbsoluteURL("/a/b/c")
```
func (*Req) IsLoginForm ¶
IsLoginForm reports whether the current request is a login form.

Example:
```
req.IsLoginForm()
```
func (*Req) IsUploadForm ¶
IsUploadForm reports whether the current request is an upload form.

Example:
```
req.IsUploadForm()
```
func (*Req) RequestRaw ¶
RequestRaw returns the raw request packet of the current request.

Example:
```
req.RequestRaw()
```
func (*Req) ResponseBody ¶
ResponseBody returns the raw response body of the current request.

Example:
```
req.ResponseBody()
```
func (*Req) ResponseRaw ¶
ResponseRaw returns the raw response packet of the current request.

Example:
```
req.ResponseRaw()
```
func (*Req) SameWildcardOrigin ¶
SameWildcardOrigin reports whether the current request and the given request share the same (wildcard-matched) origin.

Example:
```
req1.SameWildcardOrigin(req2)
```