crawler

package
v1.3.4-alpha5
Published: Jun 18, 2024 License: AGPL-3.0 Imports: 28 Imported by: 0

Documentation

Constants

This section is empty.

Variables

var (
	ExcludedSuffix = []string{
		".css",
		".jpg", ".jpeg", ".png",
		".mp3", ".mp4",
		".flv", ".aac", ".ogg",
		".svg", "ico", ".gif",
		".doc", "docx", ".pptx",
		".ppt", ".pdf",
	}
	ExcludedMIME = []string{
		"image/*",
		"audio/*", "video/*", "*octet-stream*",
		"application/ogg", "application/pdf", "application/msword",
		"application/x-ppt", "video/avi", "application/x-ico",
		"*zip",
	}
)
var Exports = map[string]interface{}{
	"Start":               StartCrawler,
	"basicAuth":           WithBasicAuth,
	"bodySize":            WithBodySize,
	"concurrent":          WithConcurrent,
	"connectTimeout":      WithConnectTimeout,
	"timeout":             WithConnectTimeout,
	"domainExclude":       WithDomainBlackList,
	"domainInclude":       WithDomainWhiteList,
	"cookie":              WithFixedCookie,
	"forbiddenFromParent": WithForbiddenFromParent,
	"disallowSuffix":      WithDisallowSuffix,
	"header":              WithHeader,
	"urlExtractor":        WithUrlExtractor,
	"maxDepth":            WithMaxDepth,
	"maxRedirect":         WithMaxRedirectTimes,
	"maxRequest":          WithMaxRequestCount,
	"maxRetry":            WithMaxRetry,
	"maxUrls":             WithMaxUrlCount,
	"proxy":               WithProxy,
	"responseTimeout":     WithResponseTimeout,
	"urlRegexpExclude":    WithUrlRegexpBlackList,
	"urlRegexpInclude":    WithUrlRegexpWhiteList,
	"userAgent":           WithUserAgent,
	"ua":                  WithUserAgent,
	"autoLogin":           WithAutoLogin,
	"jsParser":            WithJSParser,
	"RequestsFromFlow":    HandleRequestResult,
}
var URLPattern, _ = regexp.Compile(`(((?:[a-zA-Z]{1,10}://|//)[^"'/]{1,}\.[a-zA-Z]{2,}[^"']{0,})|((?:/|\.\./|\./)[^"'><,;|*()(%%$^/\\\[\]][^"'><,;|()]{1,})|([a-zA-Z0-9_\-/]{1,}/[a-zA-Z0-9_\-/]{1,}\.(?:[a-zA-Z]{1,4}|action)(?:[\?|/][^"|']{0,}|))|([a-zA-Z0-9_\-]{1,}\.(?:\.{1,10})(?:\?[^"|']{0,}|)))`)
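
URLPattern is the fallback regular expression the crawler uses to pull URL-like strings out of page content. A minimal sketch of using it directly, assuming the package lives at github.com/yaklang/yaklang/common/crawler and that URLPattern is the standard-library *regexp.Regexp the declaration above suggests:

```go
package main

import (
	"fmt"

	"github.com/yaklang/yaklang/common/crawler" // assumed import path
)

func main() {
	page := `<a href="https://example.com/login">login</a> <script src="/static/app.js"></script>`
	// Print every URL-like substring the pattern matches in the page.
	for _, m := range crawler.URLPattern.FindAllString(page, -1) {
		fmt.Println(m)
	}
}
```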

Functions

func AbsoluteURL

func AbsoluteURL(u string, base *url.URL) string
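
A small sketch of resolving a relative reference against a base URL (import path assumed, as above):

```go
package main

import (
	"fmt"
	"net/url"

	"github.com/yaklang/yaklang/common/crawler" // assumed import path
)

func main() {
	base, _ := url.Parse("https://example.com/docs/index.html")
	// Resolve a relative reference against the base URL of the page it was found on.
	fmt.Println(crawler.AbsoluteURL("../img/logo.png", base))
}
```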

func Exec added in v1.3.0

func Exec(https bool, req []byte, callback func(response *lowhttp.LowhttpResponse, https bool, req []byte)) error

func HandleElementForm

func HandleElementForm(dom *goquery.Selection, baseURL *url.URL, guessParams ...func(user, pass string, extra map[string][]string)) (method, requestURL, contentType string, body *bytes.Buffer, err error)

func HandleFormUrlEncoded

func HandleFormUrlEncoded(method string, actionAbsURL string, selects *goquery.Selection, guessParams ...func(username, password string, extra map[string][]string)) (requestURL string, body *bytes.Buffer, contentType string, err error)

func HandleJSGetNewRequest added in v1.3.1

func HandleJSGetNewRequest(isHttps bool, req []byte, code string, cb ...func(bool, []byte))

func HandleMultipartFormData

func HandleMultipartFormData(selects *goquery.Selection) (body *bytes.Buffer, contentType string, err error)

func HandleRequestResult

func HandleRequestResult(isHttps bool, reqBytes, rspBytes []byte) ([][]byte, error)

RequestsFromFlow tries to extract every possible request from a single request/response pair and returns the raw packets of all candidate requests along with an error. Example: ``` reqs, err = crawler.RequestsFromFlow(false, reqBytes, rspBytes) ```
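
From Go, the same behavior is available through HandleRequestResult. A hedged sketch that feeds one raw request/response pair and prints each candidate request packet (import path assumed):

```go
package main

import (
	"fmt"
	"log"

	"github.com/yaklang/yaklang/common/crawler" // assumed import path
)

func main() {
	reqBytes := []byte("GET / HTTP/1.1\r\nHost: example.com\r\n\r\n")
	rspBytes := []byte("HTTP/1.1 200 OK\r\nContent-Type: text/html\r\n\r\n<a href=\"/next\">next</a>")

	// Extract follow-up requests discovered in the response.
	reqs, err := crawler.HandleRequestResult(false, reqBytes, rspBytes)
	if err != nil {
		log.Fatal(err)
	}
	for _, raw := range reqs {
		fmt.Printf("%s\n\n", raw)
	}
}
```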

func HostToWildcardGlobs

func HostToWildcardGlobs(host string) []glob.Glob
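
HostToWildcardGlobs expands a host into the wildcard glob patterns used for same-origin checks. A rough sketch, assuming the returned globs expose the gobwas/glob Match method:

```go
package main

import (
	"fmt"

	"github.com/yaklang/yaklang/common/crawler" // assumed import path
)

func main() {
	for _, g := range crawler.HostToWildcardGlobs("www.example.com") {
		// Each glob matches hosts the crawler treats as the same wildcard origin.
		fmt.Println("api.example.com matched:", g.Match("api.example.com"))
	}
}
```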

func NewHTTPRequest added in v1.3.0

func NewHTTPRequest(https bool, req []byte, rsp []byte, url string) (bool, []byte, error)

func PageInformationWalker added in v1.3.0

func PageInformationWalker(mimeType string, page string, opts ...PageInfoFetchOption) error
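
PageInformationWalker walks page content and dispatches the pieces it finds to the registered fetcher callbacks. A hedged sketch that collects inline JavaScript from an HTML page; the "text/html" MIME string and the import path are assumptions:

```go
package main

import (
	"fmt"
	"log"

	"github.com/yaklang/yaklang/common/crawler" // assumed import path
)

func main() {
	page := `<html><body><script>var next = "/api/v1/users";</script></body></html>`
	err := crawler.PageInformationWalker(
		"text/html", page, // MIME type string is an assumption
		crawler.WithFetcher_JavaScript(func(content *crawler.JavaScriptContent) {
			// Called for each piece of JavaScript discovered in the page.
			fmt.Println("js:", content.Code)
		}),
	)
	if err != nil {
		log.Fatal(err)
	}
}
```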

func StartCrawler added in v1.3.0

func StartCrawler(url string, opt ...ConfigOpt) (chan *Req, error)

Start launches a crawler against the given URL. It accepts zero or more option functions that shape the crawling behavior, and returns a channel of Req references together with an error. Example: ``` ch, err := crawler.Start("https://www.baidu.com", crawler.concurrent(10)) for req in ch { println(req.Response()~) } ```
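
From Go, the entry point behind Start is StartCrawler, which returns a channel of *Req that is closed when the crawl finishes. A minimal sketch under the same import-path assumption:

```go
package main

import (
	"fmt"
	"log"

	"github.com/yaklang/yaklang/common/crawler" // assumed import path
)

func main() {
	ch, err := crawler.StartCrawler("https://example.com",
		crawler.WithConcurrent(10),
		crawler.WithMaxDepth(3),
		crawler.WithMaxRequestCount(200),
	)
	if err != nil {
		log.Fatal(err)
	}
	// Drain the channel; it is closed when crawling ends.
	for req := range ch {
		fmt.Println(req.Url())
	}
}
```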

Types

type Config

type Config struct {
	// basic authentication
	BasicAuth    bool
	AuthUsername string
	AuthPassword string
	// contains filtered or unexported fields
}

func (*Config) CheckShouldBeHandledURL

func (c *Config) CheckShouldBeHandledURL(u *url.URL) bool

func (*Config) GetLowhttpConfig added in v1.3.0

func (c *Config) GetLowhttpConfig() []lowhttp.LowhttpOpt

type ConfigOpt added in v1.3.0

type ConfigOpt func(c *Config)

func WithAutoLogin

func WithAutoLogin(username, password string, flags ...string) ConfigOpt

autoLogin is an option function that tells the crawler to automatically fill in any login form it may encounter. Example: ``` crawler.Start("https://example.com", crawler.autoLogin("admin", "admin")) ```

func WithBasicAuth

func WithBasicAuth(user, pass string) ConfigOpt

basicAuth is an option function that sets the username and password the crawler fills in for HTTP basic authentication. Example: ``` crawler.Start("https://example.com", crawler.basicAuth("admin", "admin")) ```

func WithBodySize

func WithBodySize(size int) ConfigOpt

bodySize is an option function that sets the maximum response body size the crawler will read, 10MB by default. Example: ``` crawler.Start("https://example.com", crawler.bodySize(1024 * 1024)) ```

func WithConcurrent

func WithConcurrent(concurrent int) ConfigOpt

concurrent is an option function that sets the crawler's concurrency, 20 by default. Example: ``` crawler.Start("https://example.com", crawler.concurrent(10)) ```

func WithConnectTimeout

func WithConnectTimeout(f float64) ConfigOpt

connectTimeout is an option function that sets the connection timeout, 10s by default. Example: ``` crawler.Start("https://example.com", crawler.connectTimeout(5)) ```

func WithDisallowMIMEType

func WithDisallowMIMEType(d []string) ConfigOpt

func WithDisallowSuffix

func WithDisallowSuffix(d []string) ConfigOpt

disallowSuffix is an option function that sets the crawler's suffix blacklist. Example: ``` crawler.Start("https://example.com", crawler.disallowSuffix(".css", ".jpg", ".png")) // the crawler will not fetch css, jpg or png files ```

func WithDomainBlackList

func WithDomainBlackList(domain string) ConfigOpt

domainExclude is an option function that sets the crawler's domain blacklist. domain supports glob syntax, e.g. *.example.com. Example: ``` crawler.Start("https://example.com", crawler.domainExclude("*.baidu.com")) ```

func WithDomainWhiteList

func WithDomainWhiteList(domain string) ConfigOpt

domainInclude is an option function that sets the crawler's domain whitelist. domain supports glob syntax, e.g. *.example.com. Example: ``` crawler.Start("https://example.com", crawler.domainInclude("*.example.com")) ```

func WithExtraSuffixForEveryPath

func WithExtraSuffixForEveryPath(path ...string) ConfigOpt

func WithExtraSuffixForEveryRootPath

func WithExtraSuffixForEveryRootPath(path ...string) ConfigOpt

func WithFixedCookie

func WithFixedCookie(k, v string) ConfigOpt

cookie is an option function that sets a fixed cookie sent with every crawler request. Example: ``` crawler.Start("https://example.com", crawler.cookie("key", "value")) ```

func WithForbiddenFromParent

func WithForbiddenFromParent(b bool) ConfigOpt

forbiddenFromParent is an option function that controls whether the crawler is forbidden from issuing requests to the root path, false by default. If a start URL does not begin at the root path and root-path requests are not forbidden, the crawler will also crawl from the root path. Example: ``` crawler.Start("https://example.com/a/b/c", crawler.forbiddenFromParent(false)) // this also starts crawling from https://example.com/ ```

func WithHeader

func WithHeader(k, v string) ConfigOpt

header is an option function that sets a request header sent with every crawler request. Example: ``` crawler.Start("https://example.com", crawler.header("User-Agent", "yaklang-crawler")) ```

func WithJSParser added in v1.3.2

func WithJSParser(enable ...bool) ConfigOpt

jsParser is an option function that controls whether the crawler parses JavaScript code. Passing the option enables parsing; passing false forces it off. Example: ``` crawler.Start("https://example.com", crawler.jsParser()) // enabled crawler.Start("https://example.com", crawler.jsParser(true)) // enabled crawler.Start("https://example.com", crawler.jsParser(false)) // disabled ```

func WithMaxDepth

func WithMaxDepth(depth int) ConfigOpt

maxDepth is an option function that sets the crawler's maximum depth, 5 by default. Example: ``` crawler.Start("https://example.com", crawler.maxDepth(10)) ```

func WithMaxRedirectTimes

func WithMaxRedirectTimes(maxRedirectTimes int) ConfigOpt

maxRedirect is an option function that sets the maximum number of redirects to follow, 5 by default. Example: ``` crawler.Start("https://example.com", crawler.maxRedirect(10)) ```

func WithMaxRequestCount

func WithMaxRequestCount(limit int) ConfigOpt

maxRequest is an option function that sets the maximum number of requests the crawler will issue, 1000 by default. Example: ``` crawler.Start("https://example.com", crawler.maxRequest(10000)) ```

func WithMaxRetry

func WithMaxRetry(limit int) ConfigOpt

maxRetry is an option function that sets the maximum number of retries, 3 by default. Example: ``` crawler.Start("https://example.com", crawler.maxRetry(10)) ```

func WithMaxUrlCount

func WithMaxUrlCount(limit int) ConfigOpt

maxUrls is an option function that sets the maximum number of URLs to crawl, 10000 by default. Example: ``` crawler.Start("https://example.com", crawler.maxUrls(20000)) ```

func WithOnRequest

func WithOnRequest(f func(req *Req)) ConfigOpt

func WithProxy

func WithProxy(proxies ...string) ConfigOpt

proxy is an option function that sets the proxy used by the crawler. Example: ``` crawler.Start("https://example.com", crawler.proxy("http://127.0.0.1:8080")) ```

func WithResponseTimeout

func WithResponseTimeout(f float64) ConfigOpt

responseTimeout is an option function that sets the response timeout, 10s by default. Note: not yet implemented. Example: ``` crawler.Start("https://example.com", crawler.responseTimeout(5)) ```

func WithRuntimeID added in v1.3.0

func WithRuntimeID(id string) ConfigOpt

func WithUrlExtractor

func WithUrlExtractor(f func(*Req) []interface{}) ConfigOpt

urlExtractor is an option function that takes a function as its argument, adding an extra link-extraction rule to the crawler. Example: ``` crawler.Start("https://example.com", crawler.urlExtractor(func(req) { // write your own rule here to extract extra links from the response body (req.Response() or req.ResponseRaw()) })) ```
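
In Go the same hook is WithUrlExtractor; the extractor receives each *Req and returns extra link candidates to enqueue. A hedged sketch that pulls /api/... paths out of raw responses (the regular expression and import path are illustrative assumptions):

```go
package main

import (
	"log"
	"regexp"

	"github.com/yaklang/yaklang/common/crawler" // assumed import path
)

func main() {
	apiPath := regexp.MustCompile(`"(/api/[^"]+)"`)
	ch, err := crawler.StartCrawler("https://example.com",
		crawler.WithUrlExtractor(func(req *crawler.Req) []interface{} {
			var extra []interface{}
			// Feed additional /api/... paths found in the raw response back to the crawler.
			for _, m := range apiPath.FindAllStringSubmatch(string(req.ResponseRaw()), -1) {
				extra = append(extra, req.AbsoluteURL(m[1]))
			}
			return extra
		}),
	)
	if err != nil {
		log.Fatal(err)
	}
	for range ch {
		// drain until the crawl finishes
	}
}
```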

func WithUrlRegexpBlackList

func WithUrlRegexpBlackList(re string) ConfigOpt

urlRegexpExclude is an option function that sets the crawler's URL regexp blacklist. Example: ``` crawler.Start("https://example.com", crawler.urlRegexpExclude(`\.jpg`)) ```

func WithUrlRegexpWhiteList

func WithUrlRegexpWhiteList(re string) ConfigOpt

urlRegexpInclude is an option function that sets the crawler's URL regexp whitelist. Example: ``` crawler.Start("https://example.com", crawler.urlRegexpInclude(`\.html`)) ```

func WithUserAgent

func WithUserAgent(ua string) ConfigOpt

userAgent is an option function that sets the crawler's User-Agent. Example: ``` crawler.Start("https://example.com", crawler.userAgent("yaklang-crawler")) ```

type Crawler

type Crawler struct {
	// contains filtered or unexported fields
}

func NewCrawler

func NewCrawler(urls string, opts ...ConfigOpt) (*Crawler, error)

func (*Crawler) Run

func (c *Crawler) Run() error
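
NewCrawler builds a crawler without starting it, and Run drives it to completion; combined with WithOnRequest this gives a per-request callback instead of a channel. A hedged sketch under the same import-path assumption:

```go
package main

import (
	"fmt"
	"log"

	"github.com/yaklang/yaklang/common/crawler" // assumed import path
)

func main() {
	c, err := crawler.NewCrawler("https://example.com",
		crawler.WithMaxRequestCount(100),
		crawler.WithOnRequest(func(req *crawler.Req) {
			// Invoked for every request the crawler issues.
			fmt.Println(req.Url())
		}),
	)
	if err != nil {
		log.Fatal(err)
	}
	if err := c.Run(); err != nil {
		log.Fatal(err)
	}
}
```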

type JavaScriptContent added in v1.3.0

type JavaScriptContent struct {
	IsCodeText bool
	Code       string
	UrlPath    string
	Node       *html.Node
}

func (*JavaScriptContent) String added in v1.3.0

func (s *JavaScriptContent) String() string

type PageInfoFetchOption added in v1.3.0

type PageInfoFetchOption func(config *infoFetcherConfig)

func WithFetcher_HtmlTag added in v1.3.0

func WithFetcher_HtmlTag(f func(string, *html.Node)) PageInfoFetchOption

func WithFetcher_HtmlText added in v1.3.0

func WithFetcher_HtmlText(f func(*html.Node)) PageInfoFetchOption

func WithFetcher_JavaScript added in v1.3.0

func WithFetcher_JavaScript(f func(content *JavaScriptContent)) PageInfoFetchOption

type Req

type Req struct {
	// contains filtered or unexported fields
}

func (*Req) AbsoluteURL

func (r *Req) AbsoluteURL(u string) string

AbsoluteURL resolves the given relative path against the URL of the current request and returns the absolute URL. Example: ``` req.AbsoluteURL("/a/b/c") ```

func (*Req) Hash

func (r *Req) Hash() string

Hash returns the hash of the current request, computed from its URL and request method. Example: ``` req.Hash() ```

func (*Req) IsForm

func (r *Req) IsForm() bool

IsForm reports whether the current request is a form submission. Example: ``` req.IsForm() ```

func (*Req) IsHttps

func (r *Req) IsHttps() bool

IsHttps reports whether the current request is an HTTPS request. Example: ``` req.IsHttps() ```

func (*Req) IsLoginForm

func (r *Req) IsLoginForm() bool

IsLoginForm reports whether the current request is a login form. Example: ``` req.IsLoginForm() ```

func (*Req) IsUploadForm

func (r *Req) IsUploadForm() bool

IsUploadForm reports whether the current request is an upload form. Example: ``` req.IsUploadForm() ```

func (*Req) Request

func (r *Req) Request() *http.Request

Request returns a reference to the underlying request structure of the current request. Example: ``` req.Request() ```

func (*Req) RequestRaw

func (r *Req) RequestRaw() []byte

RequestRaw returns the raw request packet of the current request. Example: ``` req.RequestRaw() ```

func (*Req) Response

func (r *Req) Response() (*http.Response, error)

Response returns a reference to the response structure of the current request, along with an error. Example: ``` resp, err = req.Response() ```

func (*Req) ResponseBody

func (r *Req) ResponseBody() []byte

ResponseBody returns the raw response body of the current request. Example: ``` req.ResponseBody() ```

func (*Req) ResponseRaw

func (r *Req) ResponseRaw() []byte

ResponseRaw returns the raw response packet of the current request. Example: ``` req.ResponseRaw() ```

func (*Req) SameWildcardOrigin

func (r *Req) SameWildcardOrigin(s *Req) bool

SameWildcardOrigin reports whether the current request and the given request share the same wildcard origin. Example: ``` req1.SameWildcardOrigin(req2) ```

func (*Req) Url

func (r *Req) Url() string

Url returns the URL string of the current request. Example: ``` req.Url() ```

type RequestIf

type RequestIf interface {
	Url() string
	Request() *http.Request
	ResponseBody() []byte
	Response() (*http.Response, error)
	IsHttps() bool
	ResponseRaw() []byte
	RequestRaw() []byte
}

type Result

type Result struct {
	FoundUrls []string
	Requests  []*Req
}
