crawler

package module
v0.0.3 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jan 4, 2022 License: MIT Imports: 16 Imported by: 0

README

go-crawler

A simple crawler library implemented in Go, with some features similar to Scrapy (a Python crawler framework).

usage

import "github.com/qhzhyt/go-crawler"

// parse is the default response callback: it collects the href attribute
// of every <a> element in the page and emits them as an item.
// NOTE(review): the original named this value "title" while extracting
// hrefs — renamed to "links" so the example is not misleading.
func parse(res *crawler.Response, ctx *crawler.Context) {
	links := res.CSS("a").Attrs("href")
	ctx.Emit(map[string]interface{}{"links": links})
}

func StartRequests(ctx *crawler.Context) []*crawler.Request {
	return crawler.GetURLS(
        "http://www.baidu.com/",
        "http://www.qq.com/"
    )
}

// main wires the example crawler together: initial requests, the default
// parse callback, an item handler that prints each item, and a 1000ms
// delay between requests, then starts and blocks until done.
func main() {
	crawler.NewCrawler("test").
		// Documented method is WithStartRequests; the original called
		// WithStartRequest(startRequest), neither of which exists.
		WithStartRequests(StartRequests).
		WithDefaultCallback(parse).
		OnItem(func(item interface{}, ctx *crawler.Context) interface{} {
			fmt.Println(item)
			// Returning nil stops further pipeline processing of this item.
			return nil
		}).
		WithSettings(&crawler.Settings{RequestDelay: 1000}).
		// true => wait for the engine to become idle before returning.
		Start(true)
}


Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func URLJoin added in v0.0.3

func URLJoin(url string, path string) string

Types

type Args

type Args url.Values

Args is http post form

type Context

type Context struct {
	Engine       *CrawlEngine
	Crawler      *Crawler
	Settings     *Settings
	Depth        int32
	LastRequest  *Request
	LastResponse *Response
}

Context 爬虫执行上下文

func (*Context) AddItem

func (ctx *Context) AddItem(item interface{})

AddItem 处理item

func (*Context) AddRequest

func (ctx *Context) AddRequest(req *Request)

AddRequest 添加请求

func (*Context) Emit

func (ctx *Context) Emit(item interface{})

Emit 提交Request或item

type Cookies

type Cookies map[string]string

Cookies request or response Cookies

type CrawlEngine

type CrawlEngine struct {

	//fastHttpClient *fasthttp.Client
	RequestQueue chan *Request
	ItemQueue    chan *itemWrapper
	//RequestingCount     int32
	//ProcessingItemCount int32
	Settings       *Settings
	RequestMetaMap *sync.Map //map[*http.Request]Meta
	// contains filtered or unexported fields
}

CrawlEngine 爬取引擎

func (*CrawlEngine) IsIdle

func (eng *CrawlEngine) IsIdle() bool

IsIdle 判断引擎是否进入空闲状态

func (*CrawlEngine) Start

func (eng *CrawlEngine) Start()

Start 启动引擎

func (*CrawlEngine) StartProcessItems

func (eng *CrawlEngine) StartProcessItems()

StartProcessItems 开始处理Items

func (*CrawlEngine) StartProcessRequests

func (eng *CrawlEngine) StartProcessRequests()

StartProcessRequests 开始处理请求

func (*CrawlEngine) Wait

func (eng *CrawlEngine) Wait()

Wait 等待引擎执行结束

func (*CrawlEngine) WaitTime added in v0.0.3

func (eng *CrawlEngine) WaitTime(seconds time.Duration)

WaitTime 等待引擎执行结束，最多等待指定时长

type Crawler

type Crawler struct {
	Name          string
	StartUrls     []string
	StartRequests func(ctx *Context) []*Request

	Pipelines     []ItemPipeline
	ItemTypeFuncs map[string]ItemPipelineFunc

	Settings *Settings
	Engine   *CrawlEngine
	// contains filtered or unexported fields
}

Crawler 爬虫本体

func NewCrawler

func NewCrawler(settings *Settings) *Crawler

NewCrawler 创建一个爬虫

func (*Crawler) AddItemPipeline

func (c *Crawler) AddItemPipeline(p ItemPipeline) *Crawler

AddItemPipeline 添加Pipeline

func (*Crawler) AddItemPipelineFunc

func (c *Crawler) AddItemPipelineFunc(f ItemPipelineFunc) *Crawler

AddItemPipelineFunc 添加Pipeline func

func (*Crawler) AddRequest added in v0.0.3

func (c *Crawler) AddRequest(req *Request)

AddRequest add a request to the crawl queue

func (*Crawler) ClearPipelines

func (c *Crawler) ClearPipelines() *Crawler

ClearPipelines 清空pipeline

func (*Crawler) CrawlURL

func (c *Crawler) CrawlURL(url string)

CrawlURL crawl one url

func (*Crawler) IsIdle added in v0.0.3

func (c *Crawler) IsIdle() bool

func (*Crawler) OnItem

func (c *Crawler) OnItem(f ItemPipelineFunc) *Crawler

OnItem 默认item处理函数

func (*Crawler) OnItemType

func (c *Crawler) OnItemType(itemExample interface{}, f ItemPipelineFunc) *Crawler

OnItemType 与itemExample同类型的item处理函数

func (*Crawler) OnRedirect added in v0.0.3

func (c *Crawler) OnRedirect(callback RedirectCallback) *Crawler

OnRedirect Set redirect callback

func (*Crawler) OnRequestError

func (c *Crawler) OnRequestError(callback RequestErrorCallback) *Crawler

OnRequestError Set request error callback

func (*Crawler) OnResponse

func (c *Crawler) OnResponse(callback ResponseCallback) *Crawler

OnResponse Set response callback

func (*Crawler) OnStart

func (c *Crawler) OnStart(callback func(ctx *Context)) *Crawler

OnStart 设置start回调

func (*Crawler) OnStop

func (c *Crawler) OnStop(callback func(ctx *Context)) *Crawler

OnStop 设置stop回调

func (*Crawler) Start

func (c *Crawler) Start(wait bool) *Crawler

Start 启动爬虫

func (*Crawler) Wait

func (c *Crawler) Wait()

Wait 等待引擎进入空闲状态

func (*Crawler) WaitTime added in v0.0.3

func (c *Crawler) WaitTime(seconds time.Duration)

WaitTime 等待引擎进入空闲状态，最多等待指定时长

func (*Crawler) WithDefaultCallback

func (c *Crawler) WithDefaultCallback(callback func(res *Response, ctx *Context)) *Crawler

WithDefaultCallback 设置默认回调函数

func (*Crawler) WithStartRequests

func (c *Crawler) WithStartRequests(callback func(ctx *Context) []*Request) *Crawler

WithStartRequests 自定义request

type Headers

type Headers map[string]interface{}

Headers request or response headers

type History added in v0.0.3

type History []*HistoryItem

func (History) Append added in v0.0.3

func (h History) Append(request *Request, response *Response) History

type HistoryItem added in v0.0.3

type HistoryItem struct {
	Request  *Request
	Response *Response
}

type ItemPipeline

type ItemPipeline interface {
	ProcessItem(item interface{}, ctx *Context) interface{}
}

ItemPipeline pipeline接口

func DefaultPipeLines

func DefaultPipeLines() []ItemPipeline

DefaultPipeLines 默认的pipelines

func FuncPipeline

func FuncPipeline(callback ItemPipelineFunc) ItemPipeline

FuncPipeline 仅提供一个函数的pipeline

type ItemPipelineFunc

type ItemPipelineFunc func(item interface{}, ctx *Context) interface{}

ItemPipelineFunc 处理item的函数

type Meta

type Meta map[string]interface{}

Meta request or response Meta

func (Meta) Has added in v0.0.3

func (m Meta) Has(key string) bool

type MongoPipeline

type MongoPipeline struct {
	MongoDBURI string
	Database   string
	Collection string
}

MongoPipeline 默认mongodb pipeline

func (*MongoPipeline) ProcessItem

func (dmp *MongoPipeline) ProcessItem(item interface{}, ctx *Context) interface{}

ProcessItem 实现ItemPipeline接口

type RedirectCallback added in v0.0.3

type RedirectCallback func(res *Response, req *Request, ctx *Context) *Request

type Request

type Request struct {
	Method        string
	URL           string
	Body          []byte
	Headers       http.Header
	Cookies       Cookies
	Timeout       int
	Meta          Meta
	Callback      ResponseCallback
	ErrorCallback RequestErrorCallback

	ProxyURL  string
	OriginURL string
	Host      string
	History   History
	// contains filtered or unexported fields
}

Request Crawler的请求

func FormRequest

func FormRequest(url string, form Args) *Request

FormRequest form post request

func GetRequest

func GetRequest(url string, args Args) *Request

GetRequest create a GET request with query args

func GetURL

func GetURL(url string) *Request

GetURL GET url

func GetURLs

func GetURLs(urls ...string) []*Request

GetURLs create GET requests for multiple urls

func NewRequest

func NewRequest(method string, url string, body []byte) *Request

NewRequest NewRequest

func PostRequest

func PostRequest(url string, body []byte) *Request

PostRequest basic post request

func (*Request) AddHeader added in v0.0.3

func (req *Request) AddHeader(key string, value string) *Request

AddHeader add a single header key/value

func (*Request) AddMeta added in v0.0.3

func (req *Request) AddMeta(key string, value interface{}) *Request

AddMeta add a single Meta key/value

func (*Request) Clone added in v0.0.3

func (req *Request) Clone() *Request

func (*Request) OnError

func (req *Request) OnError(callback RequestErrorCallback) *Request

func (*Request) OnResponse

func (req *Request) OnResponse(callback ResponseCallback) *Request

OnResponse set Response callback

func (*Request) WithContentType

func (req *Request) WithContentType(contentType string) *Request

WithContentType set Content-Type

func (*Request) WithCookies

func (req *Request) WithCookies(cookies map[string]string) *Request

WithCookies set Cookies

func (*Request) WithHeaders

func (req *Request) WithHeaders(headers map[string]string) *Request

WithHeaders set Headers

func (*Request) WithHost added in v0.0.3

func (req *Request) WithHost(host string) *Request

WithHost set Host

func (*Request) WithMeta

func (req *Request) WithMeta(meta Meta) *Request

WithMeta set Meta

func (*Request) WithProxy

func (req *Request) WithProxy(proxy string) *Request

WithProxy set proxy URL

func (*Request) WithTimeout

func (req *Request) WithTimeout(timeout int) *Request

WithTimeout set timeout

type RequestErrorCallback

type RequestErrorCallback func(req *Request, err error, ctx *Context)

type Response

type Response struct {
	*htmlquery.Selector
	StatusCode int
	URL        string
	Status     string
	Body       []byte
	Request    *Request
	Headers    http.Header
	Cookies    Cookies
	Meta       Meta

	History History
	//NativeResponse  *http.Response
	X509Certificate *x509.Certificate
	X509CertChan    []*x509.Certificate
	// contains filtered or unexported fields
}

Response Crawler的响应

func NewResponse

func NewResponse(res *http.Response) *Response

NewResponse 创建Response

func NewResponse(content []byte) *Response {
	return &Response{Selector: htmlquery.NewSelector(content), Body: content}
}

NewResponse create a Response from raw content bytes

func (*Response) Redirect added in v0.0.3

func (res *Response) Redirect(url string) *Request

func (*Response) Text

func (res *Response) Text() string

Text 获取响应文本

func (*Response) WithRequest

func (res *Response) WithRequest(req *Request) *Response

WithRequest 设置request

func (*Response) WithStatus

func (res *Response) WithStatus(code int, status string) *Response

WithStatus 设置响应状态

type ResponseCallback

type ResponseCallback func(res *Response, ctx *Context)

ResponseCallback ResponseCallback

type Settings

type Settings struct {
	MaxConcurrentRequests     int32
	RequestDelay              int
	RequestTimeout            int
	MaxConcurrentProcessItems int
	MaxRetryTimes             int
	MaxRedirectTimes          int
	AutoParseHtml             bool
	SkipTLSVerify             bool
	Transport                 *http.Transport
}

Settings 爬虫配置

func DefaultSettings

func DefaultSettings() *Settings

DefaultSettings 创建默认Setting

Directories

Path Synopsis
Package htmlquery provides extract data from HTML documents using XPath expression.
Package htmlquery provides extract data from HTML documents using XPath expression.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL