Documentation
¶
Overview ¶
Copyright 2022 geebytes Licensed under the Apache License, Version 2.0 (the 'License'); you may not use this file except in compliance with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
Index ¶
- Constants
- Variables
- func DefaultErrorHandler(spider SpiderInterface, err *HandleError)
- func GetLogger(Name string) *logrus.Entry
- func GetUUID() string
- func NewRequestCache() *requestCache
- type BaseSpider
- type CacheInterface
- type Configuration
- type Context
- type ContextOption
- type DefaultFieldHook
- type Downloader
- type DownloaderOption
- func DownloadWithClient(client http.Client) DownloaderOption
- func DownloadWithTimeout(timeout time.Duration) DownloaderOption
- func DownloadWithTlsConfig(tls *tls.Config) DownloaderOption
- func DownloaderWithStreamThreshold(streamThreshold uint64) DownloaderOption
- func DownloaderWithtransport(transport *http.Transport) DownloaderOption
- type EngineOption
- func EngineWithAllowStatusCode(allowStatusCode []uint64) EngineOption
- func EngineWithContext(ctx context.Context) EngineOption
- func EngineWithDownloader(downloader Downloader) EngineOption
- func EngineWithReadCacheNum(cacheReadNum uint) EngineOption
- func EngineWithRequestNum(requestNum uint) EngineOption
- func EngineWithSchedulerNum(schedulerNum uint) EngineOption
- func EngineWithTimeout(timeout time.Duration) EngineOption
- func EngineWithUniqueReq(uniqueReq bool) EngineOption
- type ErrorHandler
- type ErrorOption
- type HandleError
- type ItemInterface
- type ItemMeta
- type ItemPipelines
- type Logger
- type Middlewares
- type MiddlewaresBase
- type MiddlewaresInterface
- type Option
- func RequestWithAllowRedirects(allowRedirects bool) Option
- func RequestWithMaxConnsPerHost(maxConnsPerHost int) Option
- func RequestWithMaxRedirects(maxRedirects int) Option
- func RequestWithRequestBody(body map[string]interface{}) Option
- func RequestWithRequestCookies(cookies map[string]string) Option
- func RequestWithRequestHeader(header map[string]string) Option
- func RequestWithRequestMeta(meta map[string]interface{}) Option
- func RequestWithRequestParams(params map[string]string) Option
- func RequestWithRequestProxy(proxy Proxy) Option
- func RequestWithResponseWriter(write io.Writer) Option
- type Parser
- type PipelinesBase
- type PipelinesInterface
- type ProcessResponse
- type Proxy
- type RFPDupeFilter
- type RFPDupeFilterInterface
- type RedirectError
- type Request
- type RequestResult
- type Response
- type Settings
- type SpiderDownloader
- type SpiderEngine
- func (e *SpiderEngine) Close()
- func (e *SpiderEngine) RegisterDownloadMiddlewares(middlewares MiddlewaresInterface)
- func (e *SpiderEngine) RegisterPipelines(pipeline PipelinesInterface)
- func (e *SpiderEngine) RegisterSpider(spider SpiderInterface)
- func (e *SpiderEngine) SetAllowedStatus(allowedStatusCode []uint64)
- func (e *SpiderEngine) SetDownloadTimeout(timeout time.Duration)
- func (e *SpiderEngine) Start(spiderName string)
- type SpiderInterface
- type SpiderStats
- type Spiders
Constants ¶
const ( GET string = "GET" POST string = "POST" PUT string = "PUT" DELETE string = "DELETE" OPTIONS string = "OPTIONS" HEAD string = "HEAD" )
Request method constant definition
Variables ¶
var ( ErrSpiderMiddleware error = errors.New("handle spider middleware error") ErrSpiderCrawls error = errors.New("handle spider crawl error") ErrDuplicateSpiderName error = errors.New("register a duplicate spider name error") ErrEmptySpiderName error = errors.New("register a empty spider name error") ErrSpiderNotExist error = errors.New("not found spider") ErrNotAllowStatusCode error = errors.New("not allow handle status code") ErrGetCacheItem error = errors.New("getting item from cache error") ErrGetHttpProxy error = errors.New("getting http proxy ") ErrGetHttpsProxy error = errors.New("getting https proxy ") ErrParseSocksProxy error = errors.New("parse socks proxy ") ErrResponseRead error = errors.New("read response to buffer error") ErrResponseParse error = errors.New("parse response error") )
var ProcessId string = uuid.New().String()
Functions ¶
func DefaultErrorHandler ¶
func DefaultErrorHandler(spider SpiderInterface, err *HandleError)
DefaultErrorHandler is the default error handler.
Types ¶
type BaseSpider ¶
BaseSpider base spider
func NewBaseSpider ¶
func NewBaseSpider(name string, feedUrls []string) *BaseSpider
func (*BaseSpider) ErrorHandler ¶
func (s *BaseSpider) ErrorHandler(err *HandleError)
func (*BaseSpider) Parser ¶
func (s *BaseSpider) Parser(resp *Context, item chan<- *ItemMeta, req chan<- *Context) error
Parser parses the response of a request; it sends items or new requests to the engine.
func (*BaseSpider) StartRequest ¶
func (s *BaseSpider) StartRequest(req chan<- *Context)
type CacheInterface ¶
type CacheInterface interface {
// contains filtered or unexported methods
}
CacheInterface is the request cache interface; you can use Redis to implement the cache.
type Configuration ¶
type Configuration struct {
Log *Logger `ymal:"log"`
}
var Config *Configuration = &Configuration{ Log: &Logger{ Path: "/var/log", Level: "warn", }, }
type Context ¶
type Context struct { // Request Request *Request // DownloadResult downloader handler result DownloadResult *RequestResult // CtxId CtxId string // Error Error error // contains filtered or unexported fields }
Context is the scheduling unit of a spider crawl request; it is used throughout the whole data flow.
func NewContext ¶
func NewContext(request *Request, opts ...ContextOption) *Context
func (*Context) Deadline ¶
Deadline returns that there is no deadline (ok==false) when c has no Context.
type ContextOption ¶
type ContextOption func(c *Context)
func WithContext ¶
func WithContext(ctx context.Context) ContextOption
type DefaultFieldHook ¶
type DefaultFieldHook struct { }
func (*DefaultFieldHook) Levels ¶
func (hook *DefaultFieldHook) Levels() []logrus.Level
type Downloader ¶
type Downloader interface { // Download core funcation Download(ctx *Context, result chan<- *Context) // CheckStatus check response status code if allow handle CheckStatus(statusCode uint64, allowStatus []uint64) bool // contains filtered or unexported methods }
Downloader interface
func NewDownloader ¶
func NewDownloader(opts ...DownloaderOption) Downloader
NewDownloader gets a new spider downloader.
type DownloaderOption ¶
type DownloaderOption func(d *SpiderDownloader)
DownloaderOption optional parameters of the downloader
func DownloadWithClient ¶
func DownloadWithClient(client http.Client) DownloaderOption
DownloadWithClient set http client for downloader
func DownloadWithTimeout ¶
func DownloadWithTimeout(timeout time.Duration) DownloaderOption
DownloadWithTimeout set request download timeout
func DownloadWithTlsConfig ¶
func DownloadWithTlsConfig(tls *tls.Config) DownloaderOption
DownloadWithTlsConfig set tls configure for downloader
func DownloaderWithStreamThreshold ¶
func DownloaderWithStreamThreshold(streamThreshold uint64) DownloaderOption
DownloaderWithStreamThreshold sets StreamThreshold, the response content length above which the body is read by streaming download.
func DownloaderWithtransport ¶
func DownloaderWithtransport(transport *http.Transport) DownloaderOption
DownloaderWithtransport configures the http.Transport used by the downloader.
type EngineOption ¶
type EngineOption func(r *SpiderEngine)
EngineOption is an optional parameter of NewSpiderEngine.
func EngineWithAllowStatusCode ¶
func EngineWithAllowStatusCode(allowStatusCode []uint64) EngineOption
EngineWithAllowStatusCode sets the response status codes that are allowed to be handled.
func EngineWithContext ¶
func EngineWithContext(ctx context.Context) EngineOption
EngineWithContext set engine context
func EngineWithDownloader ¶
func EngineWithDownloader(downloader Downloader) EngineOption
EngineWithDownloader set spider engine downloader
func EngineWithReadCacheNum ¶
func EngineWithReadCacheNum(cacheReadNum uint) EngineOption
EngineWithReadCacheNum set cache reader number
func EngineWithRequestNum ¶
func EngineWithRequestNum(requestNum uint) EngineOption
EngineWithRequestNum sets the request channel buffer size; the default is 1024.
func EngineWithSchedulerNum ¶
func EngineWithSchedulerNum(schedulerNum uint) EngineOption
EngineWithSchedulerNum sets the number of engine schedulers; the default is the number of CPUs.
func EngineWithTimeout ¶
func EngineWithTimeout(timeout time.Duration) EngineOption
EngineWithTimeout set request download timeout
func EngineWithUniqueReq ¶
func EngineWithUniqueReq(uniqueReq bool) EngineOption
EngineWithUniqueReq set request unique flag
type ErrorHandler ¶
type ErrorHandler func(spider SpiderInterface, err *HandleError)
ErrorHandler is a customizable error handler function that receives errors from errchans.
type ErrorOption ¶
type ErrorOption func(e *HandleError)
func ErrorWithItem ¶
func ErrorWithItem(item *ItemMeta) ErrorOption
func ErrorWithRequest ¶
func ErrorWithRequest(request *Request) ErrorOption
func ErrorWithResponse ¶
func ErrorWithResponse(response *Response) ErrorOption
type HandleError ¶
type HandleError struct { CtxId string Err error Request *Request Response *Response Item *ItemMeta }
func NewError ¶
func NewError(ctxId string, err error, opts ...ErrorOption) *HandleError
func (*HandleError) Error ¶
func (e *HandleError) Error() string
type ItemMeta ¶
type ItemMeta struct { CtxId string Item ItemInterface Ctx context.Context }
func NewItem ¶
func NewItem(ctx *Context, item ItemInterface) *ItemMeta
type ItemPipelines ¶
type ItemPipelines []PipelinesInterface
func (ItemPipelines) Len ¶
func (p ItemPipelines) Len() int
func (ItemPipelines) Less ¶
func (p ItemPipelines) Less(i, j int) bool
func (ItemPipelines) Swap ¶
func (p ItemPipelines) Swap(i, j int)
type Middlewares ¶
type Middlewares []MiddlewaresInterface
func (Middlewares) Len ¶
func (p Middlewares) Len() int
func (Middlewares) Less ¶
func (p Middlewares) Less(i, j int) bool
func (Middlewares) Swap ¶
func (p Middlewares) Swap(i, j int)
type MiddlewaresBase ¶
type MiddlewaresBase struct {
Priority int
}
type MiddlewaresInterface ¶
type MiddlewaresInterface interface { // GetPriority get middlerware priority GetPriority() int // ProcessRequest process request before request to do download ProcessRequest(ctx *Context) error // ProcessResponse process response before response to parse ProcessResponse(ctx *Context, req chan<- *Context) error // GetName get middlerware name GetName() string }
MiddlewaresInterface Download middleware interface for Request and Response processing, the smaller the priority number the higher the priority
type Option ¶
type Option func(r *Request)
Option NewRequest options
func RequestWithMaxRedirects ¶
func RequestWithRequestBody ¶
func RequestWithRequestMeta ¶
func RequestWithRequestProxy ¶
type PipelinesBase ¶
type PipelinesBase struct {
Priority int
}
type PipelinesInterface ¶
type PipelinesInterface interface { // GetPriority get pipeline Priority GetPriority() int // ProcessItem item handler ProcessItem(spider SpiderInterface, item *ItemMeta) error }
PipelinesInterface is the pipeline interface. A pipeline is mainly used for processing items; the engine schedules ProcessItem according to the priority of the pipelines, from highest to lowest.
type ProcessResponse ¶
type RFPDupeFilter ¶
type RFPDupeFilter struct {
// contains filtered or unexported fields
}
func NewRFPDupeFilter ¶
func NewRFPDupeFilter(bloomM uint, bloomK uint) *RFPDupeFilter
func (*RFPDupeFilter) DoDupeFilter ¶
func (f *RFPDupeFilter) DoDupeFilter(request *Request) (bool, error)
DoDupeFilter filters duplicate requests using a bloom filter.
func (*RFPDupeFilter) Fingerprint ¶
func (f *RFPDupeFilter) Fingerprint(request *Request) ([]byte, error)
type RFPDupeFilterInterface ¶
type RFPDupeFilterInterface interface { // Fingerprint compute request fingerprint Fingerprint(request *Request) ([]byte, error) // DoDupeFilter do request fingerprint duplicates filter DoDupeFilter(request *Request) (bool, error) }
RFPDupeFilterInterface Request Fingerprint duplicates filter interface
type RedirectError ¶
type RedirectError struct {
RedirectNum int
}
func (*RedirectError) Error ¶
func (e *RedirectError) Error() string
type Request ¶
type Request struct { Url string // Set request URL Header map[string]string // Set request header Method string // Set request Method Body []byte // Set request body Params map[string]string // Set request query params Proxy *Proxy // Set request proxy addr Cookies map[string]string // Set request cookie Meta map[string]interface{} // Set other data AllowRedirects bool // Set if allow redirects. default is true MaxRedirects int // Set max allow redirects number BodyReader io.Reader // Set request body reader ResponseWriter io.Writer // Set request response body writer,like file // contains filtered or unexported fields }
Request a spider request config
type RequestResult ¶
type RequestResult struct { Error *HandleError // Error error exception during request Response *Response // Response network request response object }
RequestResult network request response result
func NewDownloadResult ¶
func NewDownloadResult() *RequestResult
type Response ¶
type Response struct { Status int // Status request response status code Header map[string][]string // Header response header Delay float64 // Delay the time of handle download request ContentLength int // ContentLength response content length URL string // URL of request url Buffer *bytes.Buffer // buffer read response buffer }
Response the Request download response data
type SpiderDownloader ¶
type SpiderDownloader struct { // StreamThreshold read body threshold using streaming TODO // if content length is bigger that,download will read response by streaming // it is a feature in the future StreamThreshold uint64 // ProxyFunc update proxy for per request ProxyFunc func(req *http.Request) (*url.URL, error) // contains filtered or unexported fields }
SpiderDownloader tegenaria spider downloader
func (*SpiderDownloader) CheckStatus ¶
func (d *SpiderDownloader) CheckStatus(statusCode uint64, allowStatus []uint64) bool
CheckStatus check response status
func (*SpiderDownloader) Download ¶
func (d *SpiderDownloader) Download(ctx *Context, result chan<- *Context)
Download network downloader
type SpiderEngine ¶
type SpiderEngine struct { // Ctx context.Context Ctx context.Context // DownloadTimeout the request handle timeout value DownloadTimeout time.Duration // RFPDupeFilter request fingerprint BloomFilter // it will work if filterDuplicateReq is true RFPDupeFilter RFPDupeFilterInterface // Stats spider status counter and recorder Stats *SpiderStats // ErrorHandler see ErrorHandler funcation description ErrorHandler ErrorHandler // contains filtered or unexported fields }
var (
Engine *SpiderEngine // SpiderEngine global and once spider engine
)
func NewSpiderEngine ¶
func NewSpiderEngine(opts ...EngineOption) *SpiderEngine
func (*SpiderEngine) RegisterDownloadMiddlewares ¶
func (e *SpiderEngine) RegisterDownloadMiddlewares(middlewares MiddlewaresInterface)
RegisterDownloadMiddlewares adds a download middleware.
func (*SpiderEngine) RegisterPipelines ¶
func (e *SpiderEngine) RegisterPipelines(pipeline PipelinesInterface)
RegisterPipelines adds an item-handling pipeline.
func (*SpiderEngine) RegisterSpider ¶
func (e *SpiderEngine) RegisterSpider(spider SpiderInterface)
RegisterSpider adds a spider.
func (*SpiderEngine) SetAllowedStatus ¶
func (e *SpiderEngine) SetAllowedStatus(allowedStatusCode []uint64)
SetAllowedStatus set allowed response status codes
func (*SpiderEngine) SetDownloadTimeout ¶
func (e *SpiderEngine) SetDownloadTimeout(timeout time.Duration)
SetDownloadTimeout set download timeout
func (*SpiderEngine) Start ¶
func (e *SpiderEngine) Start(spiderName string)
Start starts the spider engine. It schedules the whole spider system.
type SpiderInterface ¶
type SpiderInterface interface { // StartRequest make new request by feed urls StartRequest(req chan<- *Context) // Parser parse response ,it can generate ItemMeta and send to engine // it also can generate new Request Parser(resp *Context, item chan<- *ItemMeta, req chan<- *Context) error // ErrorHandler it is used to handler all error recive from engine ErrorHandler(err *HandleError, req chan<- *Context) // GetName get spider name GetName() string }
type SpiderStats ¶
type SpiderStats struct { ItemScraped uint64 // ItemScraped scraped item counter RequestDownloaded uint64 // RequestDownloaded request download counter NetworkTraffic int64 // NetworkTraffic network traffic counter ErrorCount uint64 // ErrorCount count all error recvice }
SpiderStats is spiders running stats
type Spiders ¶
type Spiders struct {
SpidersModules map[string]SpiderInterface
}
var SpidersList *Spiders
func NewSpiders ¶
func NewSpiders() *Spiders
func (*Spiders) Register ¶
func (s *Spiders) Register(spider SpiderInterface) error