crawlergo

package
v1.0.4 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Feb 29, 2024 License: AGPL-3.0 Imports: 15 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func AllDomainCollect

func AllDomainCollect(reqList []*model.Request) []string

func GetPathsByFuzz

func GetPathsByFuzz(navReq model2.Request) []*model2.Request

GetPathsByFuzz fuzzes paths using a list of common paths.

func GetPathsByFuzzDict

func GetPathsByFuzzDict(navReq model2.Request, dictPath string) []*model2.Request

GetPathsByFuzzDict fuzzes paths using a dictionary list.

func GetPathsFromRobots

func GetPathsFromRobots(navReq model2.Request) []*model2.Request

GetPathsFromRobots retrieves path information from the robots.txt file.

func SubDomainCollect

func SubDomainCollect(reqList []*model.Request, HostLimit string) []string

Types

type CrawlerTask

// CrawlerTask bundles the state of one crawl task: the browser, the input
// targets, the configuration, a goroutine pool, the start time, the
// accumulated result and a result callback.
type CrawlerTask struct {
	Browser    *engine.Browser  // browser handle; NOTE(review): presumably the browser that drives the crawl — confirm
	RootDomain string           // root domain currently being crawled; used for subdomain collection
	Targets    []*model.Request // input targets
	Result     *Result          // final result
	Config     *TaskConfig      // configuration

	Pool *ants.Pool // goroutine pool

	Start    time.Time // start time
	// OnResult is the result callback; NOTE(review): presumably invoked as results are produced — confirm.
	OnResult OnResultCallback
	// contains filtered or unexported fields
}

func NewCrawlerTask

func NewCrawlerTask(targets []*model.Request, taskConf TaskConfig, onResult OnResultCallback) (*CrawlerTask, error)

NewCrawlerTask creates a new crawler task.

func (*CrawlerTask) Run

func (t *CrawlerTask) Run()

Run starts the current task.

type OnResultCallback

type OnResultCallback func(*OutResult)

OnResultCallback is a callback function that receives a *OutResult.

type OutResult

// OutResult is the value handed to an OnResultCallback.
type OutResult struct {
	ReqList *model.Request // returned same-domain result; note this is a single request despite the "List" name
}

type Result

// Result aggregates the output of a crawl: same-domain requests, requests
// across all domains, and the collected domain and subdomain lists.
type Result struct {
	ReqList       []*model.Request // returned same-domain results
	AllReqList    []*model.Request // requests for all domains
	AllDomainList []string         // list of all domains
	SubDomainList []string         // list of subdomains
	// contains filtered or unexported fields
}

type TaskConfig

// TaskConfig holds the configurable options of a crawl task.
type TaskConfig struct {
	MaxCrawlCount           int    // maximum crawl count
	FilterMode              string // one of: simple, smart, strict
	ExtraHeaders            map[string]interface{}
	ExtraHeadersString      string
	AllDomainReturn         bool // collect all domains
	SubDomainReturn         bool // collect subdomains
	NoHeadless              bool // headless mode toggle; NOTE(review): name suggests true disables headless — confirm
	DomContentLoadedTimeout time.Duration
	TabRunTimeout           time.Duration     // timeout for a single tab
	PathByFuzz              bool              // fuzz paths using a dictionary
	FuzzDictPath            string            // dictionary used for directory fuzzing
	PathFromRobots          bool              // parse the robots file to discover paths
	MaxTabsCount            int               // maximum number of tabs allowed to be open, i.e. the number of concurrent crawls
	ChromiumPath            string            // path to the Chromium executable, e.g. `/home/zhusiyu1/chrome-linux/chrome`
	ChromiumWSUrl           string            // Websocket debugging URL for a running chrome session
	EventTriggerMode        string            // how events are triggered: asynchronous or sequential
	EventTriggerInterval    time.Duration     // interval between event triggers
	BeforeExitDelay         time.Duration     // wait time before exiting, to let the DOM render and outgoing XHRs be captured
	EncodeURLWithCharset    bool              // automatically encode URLs using the detected charset
	IgnoreKeywords          []string          // keywords to ignore; on a match the URL is not scanned and no request is sent
	Proxy                   string            // request proxy
	CustomFormValues        map[string]string // custom form-fill parameters
	CustomFormKeywordValues map[string]string // custom form-fill content by keyword
	MaxRunTime              int64             // maximum crawl time in seconds; on timeout the task ends gracefully (a URL still being processed is not cut off; the task ends only after the current request completes)
}

func NewTaskConfig

func NewTaskConfig(optFuncs ...TaskConfigOptFunc) *TaskConfig

type TaskConfigOptFunc

// TaskConfigOptFunc is an option function that receives a *TaskConfig;
// NewTaskConfig accepts any number of them.
type TaskConfigOptFunc func(*TaskConfig)

func WithAllDomainReturn

func WithAllDomainReturn(gen bool) TaskConfigOptFunc

func WithBeforeExitDelay

func WithBeforeExitDelay(gen time.Duration) TaskConfigOptFunc

func WithChromiumPath

func WithChromiumPath(gen string) TaskConfigOptFunc

func WithCustomFormKeywordValues

func WithCustomFormKeywordValues(gen map[string]string) TaskConfigOptFunc

func WithCustomFormValues

func WithCustomFormValues(gen map[string]string) TaskConfigOptFunc

func WithDomContentLoadedTimeout

func WithDomContentLoadedTimeout(gen time.Duration) TaskConfigOptFunc

func WithEncodeURLWithCharset

func WithEncodeURLWithCharset(gen bool) TaskConfigOptFunc

func WithEventTriggerInterval

func WithEventTriggerInterval(gen time.Duration) TaskConfigOptFunc

func WithEventTriggerMode

func WithEventTriggerMode(gen string) TaskConfigOptFunc

func WithExtraHeaders

func WithExtraHeaders(gen map[string]interface{}) TaskConfigOptFunc

func WithExtraHeadersString

func WithExtraHeadersString(gen string) TaskConfigOptFunc

func WithFilterMode

func WithFilterMode(gen string) TaskConfigOptFunc

func WithFuzzDictPath

func WithFuzzDictPath(gen string) TaskConfigOptFunc

func WithIgnoreKeywords

func WithIgnoreKeywords(gen []string) TaskConfigOptFunc

func WithMaxCrawlCount

func WithMaxCrawlCount(maxCrawlCount int) TaskConfigOptFunc

func WithMaxTabsCount

func WithMaxTabsCount(gen int) TaskConfigOptFunc

func WithNoHeadless

func WithNoHeadless(gen bool) TaskConfigOptFunc

func WithPathByFuzz

func WithPathByFuzz(gen bool) TaskConfigOptFunc

func WithPathFromRobots

func WithPathFromRobots(gen bool) TaskConfigOptFunc

func WithProxy

func WithProxy(gen string) TaskConfigOptFunc

func WithSubDomainReturn

func WithSubDomainReturn(gen bool) TaskConfigOptFunc

func WithTabRunTimeout

func WithTabRunTimeout(gen time.Duration) TaskConfigOptFunc

Directories

Path Synopsis

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL