Documentation ¶
Index ¶
- Constants
- Variables
- func ContextWithLogger(ctx context.Context, logger logging.Logger) context.Context
- func GetLoggerFromContext(ctx context.Context) logging.Logger
- func WithCache(cache Cacher) func(*ScrapeMate) error
- func WithConcurrency(concurrency int) func(*ScrapeMate) error
- func WithContext(ctx context.Context, cancelFn context.CancelCauseFunc) func(*ScrapeMate) error
- func WithExitBecauseOfInactivity(duration time.Duration) func(*ScrapeMate) error
- func WithFailed() func(*ScrapeMate) error
- func WithHTMLParser(parser HTMLParser) func(*ScrapeMate) error
- func WithHTTPFetcher(client HTTPFetcher) func(*ScrapeMate) error
- func WithInitJob(job IJob) func(*ScrapeMate) error
- func WithJobProvider(provider JobProvider) func(*ScrapeMate) error
- func WithLogger(log logging.Logger) func(*ScrapeMate) error
- type Cacher
- type CsvCapable
- type HTMLParser
- type HTTPFetcher
- type IJob
- type Job
- func (j *Job) BrowserActions(_ context.Context, page playwright.Page) Response
- func (j *Job) DoCheckResponse(resp *Response) bool
- func (j *Job) DoScreenshot() bool
- func (j *Job) GetBody() []byte
- func (j *Job) GetCacheKey() string
- func (j *Job) GetFullURL() string
- func (j *Job) GetHeaders() map[string]string
- func (j *Job) GetID() string
- func (j *Job) GetMaxRetries() int
- func (j *Job) GetMaxRetryDelay() time.Duration
- func (j *Job) GetMethod() string
- func (j *Job) GetParentID() string
- func (j *Job) GetPriority() int
- func (j *Job) GetRetryPolicy() RetryPolicy
- func (j *Job) GetTimeout() time.Duration
- func (j *Job) GetURL() string
- func (j *Job) GetURLParams() map[string]string
- func (j *Job) Process(_ context.Context, _ *Response) (any, []IJob, error)
- func (j *Job) ProcessOnFetchError() bool
- func (j *Job) String() string
- func (j *Job) UseInResults() bool
- type JobProvider
- type ProxyRotator
- type Response
- type Result
- type ResultWriter
- type RetryPolicy
- type ScrapeMate
- func (s *ScrapeMate) Close() error
- func (s *ScrapeMate) Concurrency() int
- func (s *ScrapeMate) DoJob(ctx context.Context, job IJob) (result any, next []IJob, err error)
- func (s *ScrapeMate) Done() <-chan struct{}
- func (s *ScrapeMate) Err() error
- func (s *ScrapeMate) Failed() <-chan IJob
- func (s *ScrapeMate) Results() <-chan Result
- func (s *ScrapeMate) Start() error
Constants ¶
const (
	// DefaultUserAgent is the default user agent scrapemate uses
	DefaultUserAgent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.79 Safari/537.36"
	// RetryJob retries a job
	RetryJob = 0
	// DiscardJob discards a job when crawling fails
	DiscardJob = 1
	// RefreshIP refreshes the IP and then retries the job
	RefreshIP = 2
	// StopScraping exits scraping completely when an error happens
	StopScraping = 3
	// DefaultMaxRetryDelay is the default maximum delay between two consecutive retries
	DefaultMaxRetryDelay = 2 * time.Second
	// PriorityHigh high priority
	PriorityHigh = 0
	// PriorityMedium medium priority
	PriorityMedium = 1
	// PriorityLow low priority
	PriorityLow = 2
)
Variables ¶
var (
	// ErrorNoJobProvider is returned when you do not set a job provider during initialization
	ErrorNoJobProvider = errors.New("no job provider set")
	// ErrorExitSignal is returned when scrapemate exits because of a system interrupt
	ErrorExitSignal = errors.New("exit signal received")
	// ErrorNoLogger is returned when you try to initialize with a nil logger
	ErrorNoLogger = errors.New("no logger set")
	// ErrorNoContext is returned when you try to initialize with a nil context
	ErrorNoContext = errors.New("no context set")
	// ErrorConcurrency is returned when you try to initialize with concurrency < 1
	ErrorConcurrency = errors.New("concurrency must be greater than 0")
	// ErrorNoHTMLFetcher is returned when you try to initialize with a nil HTTPFetcher
	ErrorNoHTMLFetcher = errors.New("no http fetcher set")
	// ErrorNoHTMLParser is returned when you try to initialize with a nil HTMLParser
	ErrorNoHTMLParser = errors.New("no html parser set")
	// ErrorNoCacher is returned when you try to initialize with a nil Cacher
	ErrorNoCacher = errors.New("no cacher set")
	// ErrorNotCsvCapable is returned when you try to write a csv file without csv-capable Data
	ErrorNotCsvCapable = errors.New("not csv capable")
	// ErrInactivityTimeout is returned when the system exits because of inactivity
	ErrInactivityTimeout = errors.New("inactivity timeout")
)
Functions ¶
func ContextWithLogger ¶ added in v0.4.3
ContextWithLogger returns a new context with the logger
func GetLoggerFromContext ¶ added in v0.2.2
GetLoggerFromContext returns a logger from the context or a default logger
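For illustration, a minimal sketch of combining the two helpers. The appLogger value and the import for the logging package are assumptions; only ContextWithLogger and GetLoggerFromContext come from this page:

// Sketch: assumes imports of "context", "github.com/gosom/scrapemate" and the
// package that provides logging.Logger.
func propagateLogger(ctx context.Context, appLogger logging.Logger) context.Context {
	// Attach the logger to the context so downstream code can retrieve it.
	ctx = scrapemate.ContextWithLogger(ctx, appLogger)

	// Downstream: returns the stored logger, or a default logger if none was set.
	logger := scrapemate.GetLoggerFromContext(ctx)
	_ = logger // the concrete logging.Logger methods are not documented on this page

	return ctx
}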
func WithCache ¶ added in v0.1.1
func WithCache(cache Cacher) func(*ScrapeMate) error
WithCache sets the cache for the scrapemate
func WithConcurrency ¶
func WithConcurrency(concurrency int) func(*ScrapeMate) error
WithConcurrency sets the concurrency for the scrapemate
func WithContext ¶
func WithContext(ctx context.Context, cancelFn context.CancelCauseFunc) func(*ScrapeMate) error
WithContext sets the context for the scrapemate
func WithExitBecauseOfInactivity ¶ added in v0.5.0
func WithExitBecauseOfInactivity(duration time.Duration) func(*ScrapeMate) error
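WithExitBecauseOfInactivity makes scrapemate exit when there is no activity for the given duration (see ErrInactivityTimeout).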
func WithFailed ¶
func WithFailed() func(*ScrapeMate) error
WithFailed sets the failed jobs channel for the scrapemate
func WithHTMLParser ¶ added in v0.4.0
func WithHTMLParser(parser HTMLParser) func(*ScrapeMate) error
WithHTMLParser sets the html parser for the scrapemate
func WithHTTPFetcher ¶ added in v0.4.0
func WithHTTPFetcher(client HTTPFetcher) func(*ScrapeMate) error
WithHTTPFetcher sets the http fetcher for the scrapemate
func WithInitJob ¶ added in v0.4.3
func WithInitJob(job IJob) func(*ScrapeMate) error
WithInitJob sets the first job to be processed. It will be processed before the jobs from the job provider. It is useful if you want to start the scraper with a specific job instead of the first one from the job provider. A real use case is when you want to obtain some cookies before starting the scraping process (e.g. login). Important: the results from this job will be discarded!
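As a hedged sketch (loginJob and the ctx/cancel/provider/fetcher/parser values are hypothetical), an init job is passed like any other option:

func buildScraper(ctx context.Context, cancel context.CancelCauseFunc,
	provider scrapemate.JobProvider, fetcher scrapemate.HTTPFetcher, parser scrapemate.HTMLParser,
) (*scrapemate.ScrapeMate, error) {
	return scrapemate.New(
		scrapemate.WithContext(ctx, cancel),
		scrapemate.WithJobProvider(provider),
		scrapemate.WithHTTPFetcher(fetcher),
		scrapemate.WithHTMLParser(parser),
		scrapemate.WithConcurrency(4),
		// loginJob is a hypothetical IJob whose Process obtains session cookies;
		// its results are discarded by scrapemate.
		scrapemate.WithInitJob(&loginJob{}),
	)
}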
func WithJobProvider ¶
func WithJobProvider(provider JobProvider) func(*ScrapeMate) error
WithJobProvider sets the job provider for the scrapemate
func WithLogger ¶
func WithLogger(log logging.Logger) func(*ScrapeMate) error
WithLogger sets the logger for the scrapemate
Types ¶
type Cacher ¶ added in v0.1.1
type Cacher interface {
	Close() error
	Get(ctx context.Context, key string) (Response, error)
	Set(ctx context.Context, key string, value *Response) error
}
Cacher is an interface for cache
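The interface is small enough that an in-memory implementation makes a reasonable sketch; the memoryCache type below is illustrative only and not part of the package:

// memoryCache is a hypothetical in-memory Cacher.
type memoryCache struct {
	mu   sync.RWMutex
	data map[string]scrapemate.Response
}

func newMemoryCache() *memoryCache {
	return &memoryCache{data: make(map[string]scrapemate.Response)}
}

func (c *memoryCache) Get(ctx context.Context, key string) (scrapemate.Response, error) {
	c.mu.RLock()
	defer c.mu.RUnlock()
	resp, ok := c.data[key]
	if !ok {
		return scrapemate.Response{}, errors.New("cache miss")
	}
	return resp, nil
}

func (c *memoryCache) Set(ctx context.Context, key string, value *scrapemate.Response) error {
	c.mu.Lock()
	defer c.mu.Unlock()
	c.data[key] = *value
	return nil
}

func (c *memoryCache) Close() error { return nil }

Such a cache could then be plugged in with WithCache(newMemoryCache()).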
type CsvCapable ¶ added in v0.2.1
CsvCapable is an interface for types that can be converted to csv. It is used to convert the Data of a Result to csv.
type HTMLParser ¶ added in v0.4.0
HTMLParser is an interface for html parsers
type HTTPFetcher ¶ added in v0.4.0
HTTPFetcher is an interface for http fetchers
type IJob ¶
type IJob interface {
	fmt.Stringer
	// GetID returns the unique identifier of the job.
	GetID() string
	// GetParentID returns the parent id of the job
	GetParentID() string
	// GetMethod returns the http method to use
	GetMethod() string
	// GetBody returns the body of the request
	GetBody() []byte
	// GetURL returns the url to request
	GetURL() string
	// GetHeaders returns the headers to use
	GetHeaders() map[string]string
	// GetURLParams returns the url params to use
	GetURLParams() map[string]string
	// GetFullURL returns the full url to request
	// it includes the url params
	GetFullURL() string
	// GetTimeout returns the timeout of the job
	GetTimeout() time.Duration
	// GetPriority returns the priority of the job
	GetPriority() int
	// DoCheckResponse checks the response of the job
	DoCheckResponse(resp *Response) bool
	// GetRetryPolicy returns the action to perform on the response
	GetRetryPolicy() RetryPolicy
	// GetMaxRetries returns the max retries of the job
	GetMaxRetries() int
	// Process processes the job
	Process(ctx context.Context, resp *Response) (any, []IJob, error)
	// GetMaxRetryDelay returns the delay to wait before retrying
	GetMaxRetryDelay() time.Duration
	// BrowserActions performs actions in the browser (used by the js fetcher)
	BrowserActions(ctx context.Context, page playwright.Page) Response
	// DoScreenshot takes a screenshot of the page
	// Only works if the scraper uses jsfetcher
	DoScreenshot() bool
	// GetCacheKey returns the key to use for caching
	GetCacheKey() string
	// UseInResults returns true if the job should be used in the results
	UseInResults() bool
	// ProcessOnFetchError returns true if the job should be processed even if the fetch failed
	ProcessOnFetchError() bool
}
IJob is a job to be processed by the scrapemate
type Job ¶
type Job struct {
	// ID is an identifier for the job
	ID string
	// ParentID is the parent id of the job
	ParentID string
	// Method can be any valid HTTP method
	Method string
	// Body is the request's body
	Body []byte
	// URL is the url to send a request to
	URL string
	// Headers is the map of headers to use in HTTP
	Headers map[string]string
	// URLParams are the url parameters to use in the query string
	URLParams map[string]string
	// Timeout is the timeout of that job, i.e. the time a single crawl
	// is allowed to take
	Timeout time.Duration
	// Priority is a number indicating the priority; by convention the lower
	// the number the higher the priority
	Priority int
	// MaxRetries defines the maximum number of retries when a job fails
	MaxRetries int
	// CheckResponse is a function that takes a Response as input and returns:
	// true: when the response is to be accepted
	// false: when the response is to be rejected
	// By default a response is accepted if the status code is 200
	CheckResponse func(resp *Response) bool
	// RetryPolicy can be one of:
	// RetryJob: retry the job until it is successful
	// DiscardJob: discard responses that are not accepted and do not retry the job
	// RefreshIP: similar to RetryJob with one important difference:
	// before the job is retried the IP is refreshed
	RetryPolicy RetryPolicy
	// MaxRetryDelay: by default a rejected job is retried with an exponential backoff
	// for at most MaxRetries times. If the sleep time between retries exceeds
	// MaxRetryDelay it is capped to that value. (Default is 2 seconds)
	MaxRetryDelay time.Duration
	// TakeScreenshot if true takes a screenshot of the page
	TakeScreenshot bool

	Response Response
}
Job is the base job that we may use
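Since Job already satisfies most of IJob, a concrete job can embed it and override Process. The exampleJob type below is a hypothetical sketch, not part of the package:

// exampleJob embeds Job for the IJob plumbing and overrides Process.
type exampleJob struct {
	scrapemate.Job
}

func (j *exampleJob) Process(ctx context.Context, resp *scrapemate.Response) (any, []scrapemate.IJob, error) {
	// Return the raw body as the result and schedule no follow-up jobs.
	return string(resp.Body), nil, nil
}

// Constructing it only requires filling the base Job fields you care about.
func newExampleJob(u string) *exampleJob {
	return &exampleJob{
		Job: scrapemate.Job{
			ID:         "example-1",
			Method:     "GET",
			URL:        u,
			MaxRetries: 2,
			Priority:   scrapemate.PriorityHigh,
		},
	}
}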
func (*Job) BrowserActions ¶
BrowserActions is the function that will be executed in the browser. This default implementation just returns the response; override it to perform actions in the browser.
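A hedged sketch of such an override follows (jsJob is hypothetical, and the override only matters when the scraper uses the playwright-based js fetcher); it uses the standard playwright-go Goto and Content calls:

func (j *jsJob) BrowserActions(ctx context.Context, page playwright.Page) scrapemate.Response {
	var resp scrapemate.Response

	// Navigate to the job's URL in the real browser.
	pageResp, err := page.Goto(j.GetFullURL())
	if err != nil {
		resp.Error = err
		return resp
	}
	resp.URL = pageResp.URL()
	resp.StatusCode = pageResp.Status()

	// Grab the rendered HTML so the html parser and Process can work on it.
	body, err := page.Content()
	if err != nil {
		resp.Error = err
		return resp
	}
	resp.Body = []byte(body)

	return resp
}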
func (*Job) DoCheckResponse ¶
DoCheckResponse checks the response of the job
func (*Job) DoScreenshot ¶
DoScreenshot reports whether a screenshot should be taken. It's here since it's a common use case.
func (*Job) GetCacheKey ¶ added in v0.1.1
GetCacheKey returns the key to use for caching
func (*Job) GetFullURL ¶ added in v0.2.2
func (*Job) GetHeaders ¶
GetHeaders returns the headers to use
func (*Job) GetMaxRetries ¶
GetMaxRetries returns the max retries of the job
func (*Job) GetMaxRetryDelay ¶
GetMaxRetryDelay returns the maximum delay to wait before retrying
func (*Job) GetParentID ¶ added in v0.5.1
GetParentID returns the parent id of the job
func (*Job) GetPriority ¶
GetPriority returns the priority of the job
func (*Job) GetRetryPolicy ¶
func (j *Job) GetRetryPolicy() RetryPolicy
GetRetryPolicy returns the action to perform on the response
func (*Job) GetTimeout ¶
GetTimeout returns the timeout of the job
func (*Job) GetURLParams ¶ added in v0.4.0
GetURLParams returns the url params to use
func (*Job) ProcessOnFetchError ¶ added in v0.5.3
ProcessOnFetchError returns true if the job should be processed even if the fetch failed
func (*Job) UseInResults ¶ added in v0.2.1
UseInResults returns true if the job should be used in the results
type JobProvider ¶
type JobProvider interface {
	Jobs(ctx context.Context) (<-chan IJob, <-chan error)
	// Push pushes a job to the job provider
	Push(ctx context.Context, job IJob) error
}
JobProvider is an interface for job providers. A job provider is a service that provides jobs to scrapemate; scrapemate will call the job provider to get jobs.
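A minimal channel-backed provider makes the contract concrete. chanProvider is an illustrative sketch, not the provider shipped with the package:

// chanProvider is a hypothetical in-memory JobProvider backed by a channel.
type chanProvider struct {
	jobs chan scrapemate.IJob
	errs chan error
}

func newChanProvider(buffer int) *chanProvider {
	return &chanProvider{
		jobs: make(chan scrapemate.IJob, buffer),
		errs: make(chan error, 1),
	}
}

func (p *chanProvider) Push(ctx context.Context, job scrapemate.IJob) error {
	select {
	case p.jobs <- job:
		return nil
	case <-ctx.Done():
		return ctx.Err()
	}
}

// Jobs hands scrapemate the channels it will consume from.
// A real provider would also report fetch/storage errors on the error channel.
func (p *chanProvider) Jobs(ctx context.Context) (<-chan scrapemate.IJob, <-chan error) {
	return p.jobs, p.errs
}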
type ProxyRotator ¶ added in v0.7.0
type ProxyRotator interface {
	GetCredentials() (string, string)
	RoundTrip(req *http.Request) (*http.Response, error)
	Next() string
}
ProxyRotator is an interface for proxy rotators
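As an illustration only (the credential semantics below are an assumption), a round-robin rotator could look like this:

// roundRobinRotator is a hypothetical ProxyRotator cycling over fixed proxy URLs.
type roundRobinRotator struct {
	proxies    []string
	counter    atomic.Uint64
	user, pass string // assumed to be what GetCredentials should return
}

func (r *roundRobinRotator) Next() string {
	n := r.counter.Add(1)
	return r.proxies[int(n-1)%len(r.proxies)]
}

func (r *roundRobinRotator) GetCredentials() (string, string) {
	return r.user, r.pass
}

// RoundTrip sends the request through the next proxy. Building a Transport per
// request keeps the sketch short; a real implementation would reuse transports.
func (r *roundRobinRotator) RoundTrip(req *http.Request) (*http.Response, error) {
	proxyURL, err := url.Parse(r.Next())
	if err != nil {
		return nil, err
	}
	t := &http.Transport{Proxy: http.ProxyURL(proxyURL)}
	return t.RoundTrip(req)
}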
type Response ¶
type Response struct {
	URL        string
	StatusCode int
	Headers    http.Header
	Duration   time.Duration
	Body       []byte
	Error      error
	Meta       map[string]any
	Screenshot []byte

	// Document is the parsed document.
	// If you don't set an html parser the document will be nil.
	// Since each html parser has its own document type, the document is an
	// interface and you need to cast it to the type of the parser you are using.
	// For example if you are using goquery you need to cast it to *goquery.Document.
	// If you are using the stdlib parser net/html then it will be *html.Node.
	Document any
}
Response is the struct that is returned when crawling finishes.
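Following the Document field's note, a Process implementation with a goquery-based parser would cast before querying. titleJob is a hypothetical job type used for illustration:

func (j *titleJob) Process(ctx context.Context, resp *scrapemate.Response) (any, []scrapemate.IJob, error) {
	// Document is only populated when an html parser is configured.
	doc, ok := resp.Document.(*goquery.Document)
	if !ok {
		return nil, nil, errors.New("expected a *goquery.Document")
	}

	// Extract the page title as this job's result; no follow-up jobs.
	title := doc.Find("title").Text()

	return title, nil, nil
}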
type ResultWriter ¶ added in v0.2.1
ResultWriter is an interface for result writers
type RetryPolicy ¶
type RetryPolicy int
type ScrapeMate ¶ added in v0.2.1
type ScrapeMate struct {
// contains filtered or unexported fields
}
ScrapeMate contains unexported fields.
func New ¶
func New(options ...func(*ScrapeMate) error) (*ScrapeMate, error)
New creates a new scrapemate
func (*ScrapeMate) Close ¶ added in v0.7.1
func (s *ScrapeMate) Close() error
func (*ScrapeMate) Concurrency ¶ added in v0.2.1
func (s *ScrapeMate) Concurrency() int
Concurrency returns how many workers are running in parallel
func (*ScrapeMate) Done ¶ added in v0.2.1
func (s *ScrapeMate) Done() <-chan struct{}
Done returns a channel that's closed when the work is done
func (*ScrapeMate) Err ¶ added in v0.2.1
func (s *ScrapeMate) Err() error
Err returns the error that caused scrapemate's context cancellation
func (*ScrapeMate) Failed ¶ added in v0.2.1
func (s *ScrapeMate) Failed() <-chan IJob
Failed returns the channel that contains the jobs that failed. It's nil unless you use the WithFailed option.
func (*ScrapeMate) Results ¶ added in v0.2.1
func (s *ScrapeMate) Results() <-chan Result
Results returns a channel containing the results
func (*ScrapeMate) Start ¶ added in v0.2.1
func (s *ScrapeMate) Start() error
Start starts the scraper
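Putting the pieces together, a hedged end-to-end sketch (constructing the ScrapeMate itself is shown under WithInitJob above) consumes Results concurrently and then blocks on Start:

func run(mate *scrapemate.ScrapeMate) error {
	defer mate.Close()

	// Drain results while the workers run; Start blocks until scraping finishes.
	go func() {
		for result := range mate.Results() {
			_ = result.Data // Data is the value produced by the job's Process
		}
	}()

	return mate.Start()
}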
Source Files ¶
Directories ¶
Path | Synopsis
---|---
adapters |
| Package mock is a generated GoMock package.
| Package mock is a generated GoMock package.