crawler

package module
v0.0.2
Published: Jan 4, 2022 License: MIT Imports: 14 Imported by: 0

README

go-crawler

A simple crawler library implemented in Go, with some features modeled after Scrapy (a Python crawler framework).

Usage

import "github.com/qhzhyt/go-crawler"

func parse(res *crawler.Response, ctx *crawler.Context) {
	title := res.CSS("a").Attrs("href")
	ctx.Emit(map[string]interface{}{"title": title})
}

func StartRequests(ctx *crawler.Context) []*crawler.Request {
	return crawler.GetURLS(
        "http://www.baidu.com/",
        "http://www.qq.com/"
    )
}

func main() {
    crawler.NewCrawler("test").
		WithStartRequest(startRequest).
		WithDefaultCallback(parse).
		OnItem(func(item interface{}, ctx *crawler.Context) interface{} {
				fmt.Println(item)
				return nil
		}).
		WithSettings(&crawler.Settings{RequestDelay: 1000}).
		Start(true)
}


Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

This section is empty.

Types

type Args

type Args url.Values

Args holds HTTP form values (an alias of url.Values), used for GET query parameters and POST forms.

type Context

type Context struct {
	Engine *CrawlEngine

	Crawler  *Crawler
	Settings *Settings

	Depth        int32
	LastRequest  *Request
	LastResponse *Response
}

Context is the crawler execution context.

func (*Context) AddItem

func (ctx *Context) AddItem(item interface{})

AddItem submits an item for processing.

func (*Context) AddRequest

func (ctx *Context) AddRequest(req *Request)

AddRequest enqueues a request.

func (*Context) Emit

func (ctx *Context) Emit(item interface{})

Emit submits either a *Request or an item.
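
Based on the doc comment above, Emit appears to dispatch on the value's dynamic type: a *Request is scheduled like AddRequest, and anything else is treated as an item like AddItem. A minimal sketch of a callback using both (the URL is a placeholder; the crawler import follows the README):

func parseListing(res *crawler.Response, ctx *crawler.Context) {
	// A plain value is treated as an item and handed to the pipelines.
	ctx.Emit(map[string]interface{}{"url": res.URL, "status": res.StatusCode})

	// A *Request is scheduled for crawling instead.
	ctx.Emit(crawler.GetURL("http://example.com/next-page"))
}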

type Cookies

type Cookies map[string]string

Cookies holds request or response cookies.

type CrawlEngine

type CrawlEngine struct {
	RequestQueue        chan *Request
	ItemQueue           chan *itemWrapper
	RequestingCount     int32
	ProcessingItemCount int32
	Settings            *Settings
	RequestMetaMap      *sync.Map //map[*http.Request]Meta
	// contains filtered or unexported fields
}

CrawlEngine is the crawl engine.

func (*CrawlEngine) IsIdle

func (eng *CrawlEngine) IsIdle() bool

IsIdle reports whether the engine has become idle.

func (*CrawlEngine) Start

func (eng *CrawlEngine) Start()

Start starts the engine.

func (*CrawlEngine) StartProcessItems

func (eng *CrawlEngine) StartProcessItems()

StartProcessItems starts processing items.

func (*CrawlEngine) StartProcessRequests

func (eng *CrawlEngine) StartProcessRequests()

StartProcessRequests starts processing requests.

func (*CrawlEngine) Wait

func (eng *CrawlEngine) Wait()

Wait blocks until the engine has finished.

type Crawler

type Crawler struct {
	Name          string
	StartUrls     []string
	StartRequests func(ctx *Context) []*Request

	Pipelines            []ItemPipeline
	ItemTypeFuncs        map[string]ItemPipelineFunc
	ResponseCallback     func(res *Response, ctx *Context)
	RequestErrorCallback RequestErrorCallback
	Settings             *Settings
	Engine               *CrawlEngine
	// contains filtered or unexported fields
}

Crawler is the crawler itself.

func NewCrawler

func NewCrawler(name string) *Crawler

NewCrawler creates a crawler with the given name.

func (*Crawler) AddItemPipeline

func (c *Crawler) AddItemPipeline(p ItemPipeline) *Crawler

AddItemPipeline appends an item pipeline.

func (*Crawler) AddItemPipelineFunc

func (c *Crawler) AddItemPipelineFunc(f ItemPipelineFunc) *Crawler

AddItemPipelineFunc appends a pipeline function.

func (*Crawler) ClearPipelines

func (c *Crawler) ClearPipelines() *Crawler

ClearPipelines removes all registered pipelines.

func (*Crawler) CrawlURL

func (c *Crawler) CrawlURL(url string)

func (*Crawler) OnItem

func (c *Crawler) OnItem(f ItemPipelineFunc) *Crawler

OnItem sets the default item handler.

func (*Crawler) OnItemType

func (c *Crawler) OnItemType(itemExample interface{}, f ItemPipelineFunc) *Crawler

OnItemType registers a handler for items with the same type as itemExample.
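
A sketch of type-based routing, assuming OnItemType matches on the dynamic type of the emitted item. Article is an illustrative type, not part of the library; fmt and the crawler import are assumed as in the README:

type Article struct {
	Title string
	URL   string
}

func newTypedCrawler() *crawler.Crawler {
	return crawler.NewCrawler("typed").
		OnItemType(Article{}, func(item interface{}, ctx *crawler.Context) interface{} {
			if a, ok := item.(Article); ok {
				fmt.Println("article:", a.Title)
			}
			return nil // nil, following the README's OnItem example
		})
}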

func (*Crawler) OnRequestError

func (c *Crawler) OnRequestError(callback RequestErrorCallback) *Crawler

func (*Crawler) OnResponse

func (c *Crawler) OnResponse(callback ResponseCallback) *Crawler

func (*Crawler) OnStart

func (c *Crawler) OnStart(callback func(ctx *Context)) *Crawler

OnStart sets the start callback.

func (*Crawler) OnStop

func (c *Crawler) OnStop(callback func(ctx *Context)) *Crawler

OnStop sets the stop callback.

func (*Crawler) Start

func (c *Crawler) Start(wait bool) *Crawler

Start starts the crawler. If wait is true, Start blocks until crawling finishes.

func (*Crawler) Wait

func (c *Crawler) Wait()

Wait blocks until the engine becomes idle.
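
Given Start(wait bool) and Wait above, a non-blocking start can apparently be paired with an explicit Wait. A hedged sketch (the callback and URL are placeholders, and the wait=false semantics are inferred, not documented):

func main() {
	c := crawler.NewCrawler("demo").
		WithStartRequests(func(ctx *crawler.Context) []*crawler.Request {
			return crawler.GetURLs("http://example.com/")
		}).
		WithDefaultCallback(func(res *crawler.Response, ctx *crawler.Context) {
			fmt.Println(res.URL, res.Status)
		}).
		Start(false) // assuming wait=false returns without blocking

	// ...do other work while the crawl runs...

	c.Wait() // block until the engine is idle
}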

func (*Crawler) WithDefaultCallback

func (c *Crawler) WithDefaultCallback(callback func(res *Response, ctx *Context)) *Crawler

WithDefaultCallback sets the default response callback.

func (*Crawler) WithSettings

func (c *Crawler) WithSettings(s *Settings) *Crawler

WithSettings applies the given settings.

func (*Crawler) WithStartRequests

func (c *Crawler) WithStartRequests(callback func(ctx *Context) []*Request) *Crawler

WithStartRequests sets a callback that produces the initial requests.

type Headers

type Headers map[string]interface{}

Headers request or response headers

func (Headers) Get

func (h Headers) Get(name string) string

Get a header value by name

func (Headers) GetList

func (h Headers) GetList(name string) []string

func (Headers) Set

func (h Headers) Set(name string, value string)

Set a header value by name
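
Since Headers is a map type with string accessors, it can be built as a literal and populated via Set. A small sketch using only the calls documented above (fmt is assumed imported):

func headersDemo() {
	h := crawler.Headers{}
	h.Set("User-Agent", "go-crawler-demo")
	h.Set("Accept", "text/html")

	fmt.Println(h.Get("User-Agent")) // the value stored under the name
	fmt.Println(h.GetList("Accept")) // every value stored under the name
}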

type ItemPipeline

type ItemPipeline interface {
	ProcessItem(item interface{}, ctx *Context) interface{}
}

ItemPipeline is the pipeline interface.
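
Any type with a matching ProcessItem method satisfies the interface. A sketch of a logging pipeline, assuming (as in Scrapy) that the returned value is what gets passed along; jsonLogPipeline is illustrative, and encoding/json and fmt are assumed imported:

type jsonLogPipeline struct{}

func (jsonLogPipeline) ProcessItem(item interface{}, ctx *crawler.Context) interface{} {
	if b, err := json.Marshal(item); err == nil {
		fmt.Println(string(b)) // one JSON line per item
	}
	return item // hand the item on unchanged
}

// Registered with: crawler.NewCrawler("demo").AddItemPipeline(jsonLogPipeline{})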

func DefaultPipeLines

func DefaultPipeLines() []ItemPipeline

DefaultPipeLines returns the default pipelines.

func FuncPipeline

func FuncPipeline(callback ItemPipelineFunc) ItemPipeline

FuncPipeline wraps a single function as a pipeline.

type ItemPipelineFunc

type ItemPipelineFunc func(item interface{}, ctx *Context) interface{}

ItemPipelineFunc is a function that processes an item.

type Meta

type Meta map[string]interface{}

Meta holds request or response metadata.

type MongoPipeline

type MongoPipeline struct {
	MongoDBURI string
	Database   string
	Collection string
}

MongoPipeline is the default MongoDB pipeline.

func (*MongoPipeline) ProcessItem

func (dmp *MongoPipeline) ProcessItem(item interface{}, ctx *Context) interface{}

ProcessItem implements the ItemPipeline interface.
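
Since ProcessItem has a pointer receiver, a *MongoPipeline can be registered directly. A sketch with placeholder connection details:

func withMongo() *crawler.Crawler {
	return crawler.NewCrawler("mongo-demo").
		AddItemPipeline(&crawler.MongoPipeline{
			MongoDBURI: "mongodb://localhost:27017", // placeholder URI
			Database:   "crawl",
			Collection: "items",
		})
}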

type Request

type Request struct {
	Method        string
	URL           string
	Body          []byte
	Headers       Headers
	Cookies       Cookies
	Timeout       int
	Meta          Meta
	Callback      ResponseCallback
	ErrorCallback RequestErrorCallback

	ProxyURL string
	// contains filtered or unexported fields
}

Request is a crawler request.

func FormRequest

func FormRequest(url string, form Args) *Request

FormRequest creates a POST request from form values.

func GetRequest

func GetRequest(url string, args Args) *Request

GetRequest creates a GET request with query arguments.

func GetURL

func GetURL(url string) *Request

GetURL creates a GET request for url.

func GetURLs

func GetURLs(urls ...string) []*Request

GetURLs creates a GET request for each of urls.

func NewRequest

func NewRequest(method string, url string, body []byte) *Request

NewRequest creates a request with the given method, URL, and body.

func PostRequest

func PostRequest(url string, body []byte) *Request

PostRequest creates a basic POST request.
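
The constructors above cover the common verbs; since Args is url.Values, its literals use string-slice values. A sketch with placeholder endpoints and credentials:

func buildRequests() []*crawler.Request {
	return []*crawler.Request{
		// GET with query arguments: ?q=golang
		crawler.GetRequest("http://example.com/search", crawler.Args{"q": {"golang"}}),

		// POST with a URL-encoded form body
		crawler.FormRequest("http://example.com/login", crawler.Args{
			"user": {"alice"},
			"pass": {"secret"},
		}),

		// POST with a raw body and an explicit Content-Type
		crawler.PostRequest("http://example.com/api", []byte(`{"k":"v"}`)).
			WithContentType("application/json"),
	}
}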

func (*Request) OnError

func (req *Request) OnError(callback RequestErrorCallback) *Request

func (*Request) OnResponse

func (req *Request) OnResponse(callback ResponseCallback) *Request

OnResponse sets the response callback.

func (*Request) WithContentType

func (req *Request) WithContentType(contentType string) *Request

WithContentType sets the Content-Type header.

func (*Request) WithCookies

func (req *Request) WithCookies(cookies map[string]string) *Request

WithCookies sets the request cookies.

func (*Request) WithHeaders

func (req *Request) WithHeaders(headers map[string]string) *Request

WithHeaders sets the request headers.

func (*Request) WithMeta

func (req *Request) WithMeta(meta Meta) *Request

WithMeta sets the request metadata.

func (*Request) WithProxy

func (req *Request) WithProxy(proxy string) *Request

WithProxy sets the proxy URL.

func (*Request) WithTimeout

func (req *Request) WithTimeout(timeout int) *Request

WithTimeout sets the request timeout.
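
The With* and On* methods all return *Request, so a fully configured request can be built in one chain. A sketch (the URL, proxy, and timeout values are placeholders; the timeout unit is undocumented, though the README's RequestDelay of 1000 hints at milliseconds):

func buildDetailRequest() *crawler.Request {
	return crawler.GetURL("http://example.com/detail/1").
		WithHeaders(map[string]string{"User-Agent": "go-crawler-demo"}).
		WithCookies(map[string]string{"session": "abc123"}).
		WithMeta(crawler.Meta{"page": 1}).
		WithProxy("http://127.0.0.1:8080"). // placeholder proxy
		WithTimeout(5000).                  // unit undocumented; presumably ms
		OnResponse(func(res *crawler.Response, ctx *crawler.Context) {
			fmt.Println(res.Status)
		}).
		OnError(func(req *crawler.Request, err error, ctx *crawler.Context) {
			fmt.Println("request failed:", err)
		})
}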

type RequestErrorCallback

type RequestErrorCallback func(req *Request, err error, ctx *Context)

type Response

type Response struct {
	StatusCode int
	*htmlquery.Selector
	URL     string
	Status  string
	Body    []byte
	Request *Request
	Headers Headers
	Cookies Cookies
	Meta    Meta

	NativeResponse *http.Response
	// contains filtered or unexported fields
}

Response is a crawler response.

func NewResponse

func NewResponse(res *http.Response) *Response

NewResponse creates a Response from an *http.Response.

func (*Response) Text

func (res *Response) Text() string

Text returns the response body as a string.
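
A typical callback reads the fields and helpers above. A short sketch using only the documented members (fmt is assumed imported):

func handle(res *crawler.Response, ctx *crawler.Context) {
	if res.StatusCode != 200 {
		fmt.Println("skipping", res.URL, res.Status)
		return
	}
	fmt.Println(res.Headers.Get("Content-Type"))

	body := res.Text() // the body decoded as a string
	fmt.Println(len(body), "characters from", res.URL)
}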

func (*Response) WithRequest

func (res *Response) WithRequest(req *Request) *Response

WithRequest attaches the originating request.

func (*Response) WithStatus

func (res *Response) WithStatus(code int, status string) *Response

WithStatus sets the response status.

type ResponseCallback

type ResponseCallback func(res *Response, ctx *Context)

ResponseCallback is a callback invoked with a response and its context.

type Settings

type Settings struct {
	MaxConcurrentRequests     int32
	RequestDelay              int
	RequestTimeout            int
	MaxConcurrentProcessItems int
	MaxRetryTimes             int
}

Settings is the crawler configuration.

func DefaultSettings

func DefaultSettings() *Settings

DefaultSettings creates the default settings.
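
Starting from DefaultSettings and overriding individual fields keeps the rest at their defaults. A sketch (the values are illustrative; units are undocumented, though the README suggests milliseconds for the delay):

func tunedCrawler() *crawler.Crawler {
	s := crawler.DefaultSettings()
	s.MaxConcurrentRequests = 8
	s.RequestDelay = 500 // presumably milliseconds, as in the README example
	s.MaxRetryTimes = 3
	return crawler.NewCrawler("tuned").WithSettings(s)
}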

Directories

Path	Synopsis
htmlquery	Package htmlquery extracts data from HTML documents using XPath expressions.
