tegenaria

package module
v0.2.4
Published: Jan 28, 2022 License: Apache-2.0 Imports: 29 Imported by: 1

README

Tegenaria crawl framework

Tegenaria is a crawler framework written in Go.

Installation

To install the Tegenaria package, you need to have Go installed and your Go workspace set up first.

  1. Go 1.13+ is required. With Go installed, run the command below to install Tegenaria.
go get -u github.com/wetrycode/tegenaria
  2. Import it in your code:
import "github.com/wetrycode/tegenaria"

Quick start

See the tegenaria example project.
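
The upstream examples are not reproduced here. The snippet below is a minimal, untested sketch assembled only from the signatures documented on this page; ExampleSpider, the feed URL, and the way requests are fed to the engine are illustrative assumptions, not code taken from the tegenaria example repository.

package main

import "github.com/wetrycode/tegenaria"

// ExampleSpider is a hypothetical spider built against the documented SpiderInterface.
type ExampleSpider struct {
	Name     string
	FeedUrls []string
}

// StartRequest builds the initial requests from the feed urls and hands them to the engine.
func (s *ExampleSpider) StartRequest(req chan<- *tegenaria.Context) {
	for _, url := range s.FeedUrls {
		request := tegenaria.NewRequest(url, tegenaria.GET, s.Parser)
		req <- tegenaria.NewContext(request)
	}
}

// Parser forwards the whole response body as a single item, purely for demonstration.
func (s *ExampleSpider) Parser(resp *tegenaria.Context, item chan<- *tegenaria.ItemMeta, req chan<- *tegenaria.Context) error {
	item <- tegenaria.NewItem(resp, resp.DownloadResult.Response.String())
	return nil
}

// ErrorHandler receives errors forwarded by the engine; this sketch simply ignores them.
func (s *ExampleSpider) ErrorHandler(err *tegenaria.HandleError, req chan<- *tegenaria.Context) {}

// GetName returns the name used by SpiderEngine.Start.
func (s *ExampleSpider) GetName() string { return s.Name }

func main() {
	engine := tegenaria.NewSpiderEngine()
	engine.RegisterSpider(&ExampleSpider{Name: "example", FeedUrls: []string{"https://example.com"}})
	engine.Start("example")
}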

Document

Contribution

Feel free to open pull requests and raise issues.
You can also email me directly at vforfreedom96@gmail.com.

License

Apache2.0 © geebytes

Documentation

Overview

Copyright 2022 geebytes Licensed under the Apache License, Version 2.0 (the 'License'); you may not use this file except in compliance with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

Index

Constants

const (
	GET     string = "GET"
	POST    string = "POST"
	PUT     string = "PUT"
	DELETE  string = "DELETE"
	OPTIONS string = "OPTIONS"
	HEAD    string = "HEAD"
)

Request method constant definitions

Variables

var (
	ErrSpiderMiddleware    error = errors.New("handle spider middleware error")
	ErrSpiderCrawls        error = errors.New("handle spider crawl error")
	ErrDuplicateSpiderName error = errors.New("register a duplicate spider name error")
	ErrEmptySpiderName     error = errors.New("register a empty spider name error")
	ErrSpiderNotExist      error = errors.New("not found spider")
	ErrNotAllowStatusCode  error = errors.New("not allow handle status code")
	ErrGetCacheItem        error = errors.New("getting item from cache error")
	ErrGetHttpProxy        error = errors.New("getting http proxy ")
	ErrGetHttpsProxy       error = errors.New("getting https proxy ")
	ErrParseSocksProxy     error = errors.New("parse socks proxy ")
	ErrResponseRead        error = errors.New("read response to buffer error")
	ErrResponseParse       error = errors.New("parse response error")
)
var ProcessId string = uuid.New().String()

Functions

func DefaultErrorHandler

func DefaultErrorHandler(spider SpiderInterface, err *HandleError)

DefaultErrorHandler the default error handler

func GetLogger

func GetLogger(Name string) *logrus.Entry

func GetUUID

func GetUUID() string

func NewRequestCache

func NewRequestCache() *requestCache

NewRequestCache get a new requestCache

Types

type BaseSpider

type BaseSpider struct {
	// Name spider name
	Name string

	// FeedUrls feed urls
	FeedUrls []string
}

BaseSpider base spider

func NewBaseSpider

func NewBaseSpider(name string, feedUrls []string) *BaseSpider

func (*BaseSpider) ErrorHandler

func (s *BaseSpider) ErrorHandler(err *HandleError)

func (*BaseSpider) Parser

func (s *BaseSpider) Parser(resp *Context, item chan<- *ItemMeta, req chan<- *Context) error

Parser parses the request response; it will send items or new requests to the engine

func (*BaseSpider) StartRequest

func (s *BaseSpider) StartRequest(req chan<- *Context)

type CacheInterface

type CacheInterface interface {
	// contains filtered or unexported methods
}

CacheInterface request cache interface; you can use Redis as the cache backend

type Configuration

type Configuration struct {
	Log *Logger `ymal:"log"`
}
var Config *Configuration = &Configuration{
	Log: &Logger{
		Path:  "/var/log",
		Level: "warn",
	},
}

type Context

type Context struct {
	// Request
	Request *Request

	// DownloadResult downloader handler result
	DownloadResult *RequestResult

	// Item
	Item ItemInterface

	//Ctx parent context
	Ctx context.Context

	// CtxId
	CtxId string

	// Error
	Error error
}

Context spider crawl request scheduling unit; it is used across the whole data flow

func NewContext

func NewContext(request *Request, opts ...ContextOption) *Context

func (*Context) Deadline

func (c *Context) Deadline() (deadline time.Time, ok bool)

Deadline reports that there is no deadline (ok==false) when c has no Context.

func (*Context) Done

func (c *Context) Done() <-chan struct{}

Done returns nil (a channel which will wait forever) when c.Request has no Context.

func (*Context) Err

func (c *Context) Err() error

Err returns nil when c has no Context.

func (Context) GetCtxId

func (c Context) GetCtxId() string

func (*Context) Value

func (c *Context) Value(key interface{}) interface{}

Value returns the value associated with this context for key, or nil if no value is associated with key. Successive calls to Value with the same key return the same result.

type ContextOption

type ContextOption func(c *Context)

func ContextWithItem added in v0.2.4

func ContextWithItem(item ItemInterface) ContextOption

func WithContext

func WithContext(ctx context.Context) ContextOption
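
As a hedged illustration (buildContext and payload are hypothetical names; the parser implementation is assumed to exist elsewhere), these options might be combined like this:

import (
	"context"

	"github.com/wetrycode/tegenaria"
)

func buildContext(parser tegenaria.Parser, payload interface{}) *tegenaria.Context {
	request := tegenaria.NewRequest("https://example.com", tegenaria.GET, parser)
	return tegenaria.NewContext(
		request,
		tegenaria.WithContext(context.Background()), // attach a parent context.Context
		tegenaria.ContextWithItem(payload),          // attach an item; ItemInterface is an empty interface
	)
}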

type DefaultFieldHook

type DefaultFieldHook struct {
}

func (*DefaultFieldHook) Fire

func (hook *DefaultFieldHook) Fire(entry *logrus.Entry) error

func (*DefaultFieldHook) Levels

func (hook *DefaultFieldHook) Levels() []logrus.Level

type Downloader

type Downloader interface {
	// Download core funcation
	Download(ctx *Context, result chan<- *Context)

	// CheckStatus check response status code if allow handle
	CheckStatus(statusCode uint64, allowStatus []uint64) bool
	// contains filtered or unexported methods
}

Downloader interface

func NewDownloader

func NewDownloader(opts ...DownloaderOption) Downloader

NewDownloader get a new spider downloader

type DownloaderOption

type DownloaderOption func(d *SpiderDownloader)

DownloaderOption optional parameters of the downloader

func DownloadWithClient

func DownloadWithClient(client http.Client) DownloaderOption

DownloadWithClient set http client for downloader

func DownloadWithTimeout

func DownloadWithTimeout(timeout time.Duration) DownloaderOption

DownloadWithTimeout set request download timeout

func DownloadWithTlsConfig

func DownloadWithTlsConfig(tls *tls.Config) DownloaderOption

DownloadWithTlsConfig set the TLS configuration for the downloader

func DownloaderWithStreamThreshold

func DownloaderWithStreamThreshold(streamThreshold uint64) DownloaderOption

DownloaderWithStreamThreshold set the maximum response body size above which streaming download is used

func DownloaderWithtransport

func DownloaderWithtransport(transport *http.Transport) DownloaderOption

DownloaderWithtransport set the downloader transport (http.Transport)
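
A non-authoritative sketch of composing these options (the timeout, TLS settings, and threshold are arbitrary values, and newCustomDownloader is a hypothetical helper):

import (
	"crypto/tls"
	"time"

	"github.com/wetrycode/tegenaria"
)

func newCustomDownloader() tegenaria.Downloader {
	return tegenaria.NewDownloader(
		tegenaria.DownloadWithTimeout(30*time.Second),                          // per-request download timeout
		tegenaria.DownloadWithTlsConfig(&tls.Config{InsecureSkipVerify: true}), // custom TLS settings
		tegenaria.DownloaderWithStreamThreshold(10*1024*1024),                  // streaming threshold, here 10 MiB
	)
}

The resulting Downloader can then be handed to the engine via EngineWithDownloader.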

type EngineOption

type EngineOption func(r *SpiderEngine)

EngineOption the optional parameters of NewSpiderEngine

func EngineWithAllowStatusCode

func EngineWithAllowStatusCode(allowStatusCode []uint64) EngineOption

EngineWithAllowStatusCode set the allowed response status codes

func EngineWithContext

func EngineWithContext(ctx context.Context) EngineOption

EngineWithContext set engine context

func EngineWithDownloader

func EngineWithDownloader(downloader Downloader) EngineOption

EngineWithDownloader set spider engine downloader

func EngineWithReadCacheNum

func EngineWithReadCacheNum(cacheReadNum uint) EngineOption

EngineWithReadCacheNum set cache reader number

func EngineWithRequestNum

func EngineWithRequestNum(requestNum uint) EngineOption

EngineWithRequestNum set the request channel buffer size; defaults to 1024

func EngineWithSchedulerNum

func EngineWithSchedulerNum(schedulerNum uint) EngineOption

EngineWithSchedulerNum set the engine scheduler number; defaults to the number of CPUs

func EngineWithTimeout

func EngineWithTimeout(timeout time.Duration) EngineOption

EngineWithTimeout set request download timeout

func EngineWithUniqueReq

func EngineWithUniqueReq(uniqueReq bool) EngineOption

EngineWithUniqueReq set request unique flag
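
A hedged sketch of combining several engine options (all values are arbitrary; newCustomEngine is a hypothetical helper):

import (
	"time"

	"github.com/wetrycode/tegenaria"
)

func newCustomEngine() *tegenaria.SpiderEngine {
	return tegenaria.NewSpiderEngine(
		tegenaria.EngineWithRequestNum(2048),                    // request channel buffer size
		tegenaria.EngineWithSchedulerNum(4),                     // number of schedulers
		tegenaria.EngineWithAllowStatusCode([]uint64{403, 404}), // additionally allowed status codes
		tegenaria.EngineWithUniqueReq(true),                     // enable request deduplication
		tegenaria.EngineWithTimeout(60*time.Second),             // download timeout
	)
}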

type ErrorHandler

type ErrorHandler func(spider SpiderInterface, err *HandleError)

ErrorHandler a customizable error handler function that receives errors from the error channels
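
For illustration only, a custom handler with this signature could look as follows; logErrors is a hypothetical name, and wiring it up by assigning it to the exported ErrorHandler field of SpiderEngine is an assumption based on that field being exported.

import "github.com/wetrycode/tegenaria"

// logErrors only logs the failure; it mirrors the signature of DefaultErrorHandler.
func logErrors(spider tegenaria.SpiderInterface, err *tegenaria.HandleError) {
	tegenaria.GetLogger(spider.GetName()).Errorf("request %s failed: %v", err.CtxId, err.Err)
}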

type ErrorOption

type ErrorOption func(e *HandleError)

func ErrorWithItem

func ErrorWithItem(item *ItemMeta) ErrorOption

func ErrorWithRequest

func ErrorWithRequest(request *Request) ErrorOption

func ErrorWithResponse

func ErrorWithResponse(response *Response) ErrorOption

type HandleError

type HandleError struct {
	CtxId    string
	Err      error
	Request  *Request
	Response *Response
	Item     *ItemMeta
}

func NewError

func NewError(ctxId string, err error, opts ...ErrorOption) *HandleError

func (*HandleError) Error

func (e *HandleError) Error() string

type ItemInterface

type ItemInterface interface {
}

ItemInterface the metadata processing interface for items

type ItemMeta

type ItemMeta struct {
	CtxId string
	Item  ItemInterface
}

func NewItem

func NewItem(ctx *Context, item ItemInterface) *ItemMeta

type ItemPipelines

type ItemPipelines []PipelinesInterface

func (ItemPipelines) Len

func (p ItemPipelines) Len() int

func (ItemPipelines) Less

func (p ItemPipelines) Less(i, j int) bool

func (ItemPipelines) Swap

func (p ItemPipelines) Swap(i, j int)

type Logger

type Logger struct {
	Path  string `yaml:"path"`
	Level string `yaml:"level"`
}

func (*Logger) GetValue added in v0.2.4

func (l *Logger) GetValue(key string) (string, error)

type Middlewares

type Middlewares []MiddlewaresInterface

func (Middlewares) Len

func (p Middlewares) Len() int

func (Middlewares) Less

func (p Middlewares) Less(i, j int) bool

func (Middlewares) Swap

func (p Middlewares) Swap(i, j int)

type MiddlewaresBase

type MiddlewaresBase struct {
	Priority int
}

type MiddlewaresInterface

type MiddlewaresInterface interface {
	// GetPriority get middlerware priority
	GetPriority() int

	// ProcessRequest process request before request to do download
	ProcessRequest(ctx *Context) error

	// ProcessResponse process response before response to parse
	ProcessResponse(ctx *Context, req chan<- *Context) error

	// GetName get middlerware name
	GetName() string
}

MiddlewaresInterface Download middleware interface for Request and Response processing; the smaller the priority number, the higher the priority
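
A hedged example of a middleware satisfying this interface (HeaderMiddleware and the injected header value are illustrative, not part of tegenaria):

import "github.com/wetrycode/tegenaria"

// HeaderMiddleware is a hypothetical download middleware that injects a User-Agent header.
type HeaderMiddleware struct {
	tegenaria.MiddlewaresBase
}

func (m *HeaderMiddleware) GetPriority() int { return m.Priority }
func (m *HeaderMiddleware) GetName() string  { return "header" }

// ProcessRequest runs before the request is downloaded.
func (m *HeaderMiddleware) ProcessRequest(ctx *tegenaria.Context) error {
	if ctx.Request.Header == nil {
		ctx.Request.Header = map[string]string{}
	}
	ctx.Request.Header["User-Agent"] = "tegenaria-example-bot"
	return nil
}

// ProcessResponse runs before the response is parsed; nothing to do in this sketch.
func (m *HeaderMiddleware) ProcessResponse(ctx *tegenaria.Context, req chan<- *tegenaria.Context) error {
	return nil
}

It would presumably be registered with engine.RegisterDownloadMiddlewares(&HeaderMiddleware{MiddlewaresBase: tegenaria.MiddlewaresBase{Priority: 1}}).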

type Option

type Option func(r *Request)

Option NewRequest options

func RequestWithAllowRedirects

func RequestWithAllowRedirects(allowRedirects bool) Option

func RequestWithMaxConnsPerHost

func RequestWithMaxConnsPerHost(maxConnsPerHost int) Option

func RequestWithMaxRedirects

func RequestWithMaxRedirects(maxRedirects int) Option

func RequestWithRequestBody

func RequestWithRequestBody(body map[string]interface{}) Option

func RequestWithRequestCookies

func RequestWithRequestCookies(cookies map[string]string) Option

func RequestWithRequestHeader

func RequestWithRequestHeader(header map[string]string) Option

func RequestWithRequestMeta

func RequestWithRequestMeta(meta map[string]interface{}) Option

func RequestWithRequestParams

func RequestWithRequestParams(params map[string]string) Option

func RequestWithRequestProxy

func RequestWithRequestProxy(proxy Proxy) Option

func RequestWithResponseWriter

func RequestWithResponseWriter(write io.Writer) Option

type Parser

type Parser func(resp *Context, item chan<- *ItemMeta, req chan<- *Context) error

Parser response parse handler

type PipelinesBase

type PipelinesBase struct {
	Priority int
}

type PipelinesInterface

type PipelinesInterface interface {
	// GetPriority get pipeline Priority
	GetPriority() int
	// ProcessItem item handler
	ProcessItem(spider SpiderInterface, item *ItemMeta) error
}

PipelinesInterface pipeline interface. Pipelines are mainly used for processing items; the engine schedules ProcessItem according to pipeline priority, from highest to lowest
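
A hedged example of a pipeline satisfying this interface (LogPipeline is illustrative and not part of tegenaria):

import "github.com/wetrycode/tegenaria"

// LogPipeline is a hypothetical pipeline that just logs every scraped item.
type LogPipeline struct {
	tegenaria.PipelinesBase
}

func (p *LogPipeline) GetPriority() int { return p.Priority }

// ProcessItem receives each ItemMeta scheduled by the engine.
func (p *LogPipeline) ProcessItem(spider tegenaria.SpiderInterface, item *tegenaria.ItemMeta) error {
	tegenaria.GetLogger(spider.GetName()).Infof("item from ctx %s: %v", item.CtxId, item.Item)
	return nil
}

It would presumably be registered with engine.RegisterPipelines(&LogPipeline{PipelinesBase: tegenaria.PipelinesBase{Priority: 0}}).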

type ProcessResponse

type ProcessResponse func(ctx *Context) error

type Proxy

type Proxy struct {
	ProxyUrl string
}

type RFPDupeFilter

type RFPDupeFilter struct {
	// contains filtered or unexported fields
}

func NewRFPDupeFilter

func NewRFPDupeFilter(bloomM uint, bloomK uint) *RFPDupeFilter

func (*RFPDupeFilter) DoDupeFilter

func (f *RFPDupeFilter) DoDupeFilter(request *Request) (bool, error)

DoDupeFilter filter duplicate requests using a Bloom filter

func (*RFPDupeFilter) Fingerprint

func (f *RFPDupeFilter) Fingerprint(request *Request) ([]byte, error)

type RFPDupeFilterInterface

type RFPDupeFilterInterface interface {
	// Fingerprint compute request fingerprint
	Fingerprint(request *Request) ([]byte, error)

	// DoDupeFilter do request fingerprint duplicates filter
	DoDupeFilter(request *Request) (bool, error)
}

RFPDupeFilterInterface Request Fingerprint duplicates filter interface
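
A hedged sketch of using the filter on its own; the bloomM/bloomK values are illustrative guesses, and the assumption that DoDupeFilter reports true for an already seen fingerprint is not confirmed by this page.

import "github.com/wetrycode/tegenaria"

var dupeFilter = tegenaria.NewRFPDupeFilter(1<<20, 5) // bit count and hash-function count are guesses

// isDuplicate reports whether the request fingerprint has been seen before (assumed semantics).
func isDuplicate(request *tegenaria.Request) bool {
	seen, err := dupeFilter.DoDupeFilter(request)
	return err == nil && seen
}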

type RedirectError

type RedirectError struct {
	RedirectNum int
}

func (*RedirectError) Error

func (e *RedirectError) Error() string

type Request

type Request struct {
	Url            string                 // Set request URL
	Header         map[string]string      // Set request header
	Method         string                 // Set request Method
	Body           []byte                 // Set request body
	Params         map[string]string      // Set request query params
	Proxy          *Proxy                 // Set request proxy addr
	Cookies        map[string]string      // Set request cookie
	Meta           map[string]interface{} // Set other data
	AllowRedirects bool                   // Set if allow redirects. default is true
	MaxRedirects   int                    // Set max allow redirects number

	BodyReader     io.Reader // Set request body reader
	ResponseWriter io.Writer // Set request response body writer,like file
	// contains filtered or unexported fields
}

Request a spider request config

func NewRequest

func NewRequest(url string, method string, parser Parser, opts ...Option) *Request

NewRequest create a new Request. It will get a request from the requestPool and then initialize its params
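
A hedged sketch of combining several request options (newSearchRequest is hypothetical, the URL and proxy are placeholders, and whether RequestWithRequestBody is serialized as JSON is an assumption):

import "github.com/wetrycode/tegenaria"

func newSearchRequest(parser tegenaria.Parser) *tegenaria.Request {
	return tegenaria.NewRequest(
		"https://httpbin.org/post",
		tegenaria.POST,
		parser,
		tegenaria.RequestWithRequestHeader(map[string]string{"Content-Type": "application/json"}),
		tegenaria.RequestWithRequestBody(map[string]interface{}{"query": "tegenaria"}),
		tegenaria.RequestWithRequestProxy(tegenaria.Proxy{ProxyUrl: "http://127.0.0.1:8080"}),
		tegenaria.RequestWithMaxRedirects(3),
	)
}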

type RequestResult

type RequestResult struct {
	Error    *HandleError // Error error exception during request
	Response *Response    // Response network request response object
}

RequestResult network request response result

func NewDownloadResult

func NewDownloadResult() *RequestResult

type Response

type Response struct {
	Status        int                 // Status request response status code
	Header        map[string][]string // Header response header
	Delay         float64             // Delay the time of handle download request
	ContentLength int                 // ContentLength response content length
	URL           string              // URL of request url
	Buffer        *bytes.Buffer       // buffer read response buffer
}

Response the Request download response data

func NewResponse

func NewResponse() *Response

NewResponse create a new Response from responsePool

func (*Response) Json

func (r *Response) Json() map[string]interface{}

Json deserialize the response body to json

func (*Response) String

func (r *Response) String() string

String get response text from response body
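
As a hedged illustration, a Parser might consume the response through these helpers like this (parseJSON is a hypothetical name and no nil checks are shown):

import "github.com/wetrycode/tegenaria"

// parseJSON forwards the decoded JSON body as an item; String() would return the raw text instead.
func parseJSON(resp *tegenaria.Context, item chan<- *tegenaria.ItemMeta, req chan<- *tegenaria.Context) error {
	response := resp.DownloadResult.Response
	item <- tegenaria.NewItem(resp, response.Json())
	return nil
}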

type Settings

type Settings interface {
	GetValue(key string) (error, string)
}

type SpiderDownloader

type SpiderDownloader struct {
	// StreamThreshold read body threshold for streaming download (TODO)
	// if the content length is bigger than this, the downloader will read the response by streaming
	// it is a feature planned for the future
	StreamThreshold uint64

	// ProxyFunc update proxy for per request
	ProxyFunc func(req *http.Request) (*url.URL, error)
	// contains filtered or unexported fields
}

SpiderDownloader tegenaria spider downloader

func (*SpiderDownloader) CheckStatus

func (d *SpiderDownloader) CheckStatus(statusCode uint64, allowStatus []uint64) bool

CheckStatus check response status

func (*SpiderDownloader) Download

func (d *SpiderDownloader) Download(ctx *Context, result chan<- *Context)

Download network downloader

type SpiderEngine

type SpiderEngine struct {

	// Ctx context.Context
	Ctx context.Context

	// DownloadTimeout the request handle timeout value
	DownloadTimeout time.Duration

	// RFPDupeFilter request fingerprint BloomFilter
	// it will work if filterDuplicateReq is true
	RFPDupeFilter RFPDupeFilterInterface

	// Stats spider status counter and recorder
	Stats *SpiderStats

	// ErrorHandler see ErrorHandler funcation description
	ErrorHandler ErrorHandler
	// contains filtered or unexported fields
}
var (
	Engine *SpiderEngine // Engine the global singleton spider engine

)

func NewSpiderEngine

func NewSpiderEngine(opts ...EngineOption) *SpiderEngine

func (*SpiderEngine) Close

func (e *SpiderEngine) Close()

Close engine and close all channels

func (*SpiderEngine) RegisterDownloadMiddlewares

func (e *SpiderEngine) RegisterDownloadMiddlewares(middlewares MiddlewaresInterface)

RegisterDownloadMiddlewares add a download middleware

func (*SpiderEngine) RegisterPipelines

func (e *SpiderEngine) RegisterPipelines(pipeline PipelinesInterface)

RegisterPipelines add an item-handling pipeline

func (*SpiderEngine) RegisterSpider

func (e *SpiderEngine) RegisterSpider(spider SpiderInterface)

RegisterSpider add a spider

func (*SpiderEngine) SetAllowedStatus

func (e *SpiderEngine) SetAllowedStatus(allowedStatusCode []uint64)

SetAllowedStatus set allowed response status codes

func (*SpiderEngine) SetDownloadTimeout

func (e *SpiderEngine) SetDownloadTimeout(timeout time.Duration)

SetDownloadTimeout set download timeout

func (*SpiderEngine) Start

func (e *SpiderEngine) Start(spiderName string)

Start start the spider engine. It will schedule the whole spider system

type SpiderInterface

type SpiderInterface interface {
	// StartRequest make new request by feed urls
	StartRequest(req chan<- *Context)

	// Parser parse the response; it can generate ItemMeta and send it to the engine
	// it can also generate new Requests
	Parser(resp *Context, item chan<- *ItemMeta, req chan<- *Context) error

	// ErrorHandler it is used to handle all errors received from the engine
	ErrorHandler(err *HandleError, req chan<- *Context)

	// GetName get spider name
	GetName() string
}

type SpiderStats

type SpiderStats struct {
	ItemScraped uint64 // ItemScraped scraped item counter

	RequestDownloaded uint64 // RequestDownloaded request download counter

	NetworkTraffic int64 // NetworkTraffic network traffic counter

	ErrorCount uint64 // ErrorCount count of all errors received

}

SpiderStats is spiders running stats

type Spiders

type Spiders struct {
	SpidersModules map[string]SpiderInterface
}
var SpidersList *Spiders

func NewSpiders

func NewSpiders() *Spiders

func (*Spiders) GetSpider

func (s *Spiders) GetSpider(name string) (SpiderInterface, error)

func (*Spiders) Register

func (s *Spiders) Register(spider SpiderInterface) error
