Documentation ¶
Index ¶
- Constants
- Variables
- func WithCache(cache Cacher) func(*ScrapeMate) error
- func WithConcurrency(concurrency int) func(*ScrapeMate) error
- func WithContext(ctx context.Context, cancelFn context.CancelCauseFunc) func(*ScrapeMate) error
- func WithFailed() func(*ScrapeMate) error
- func WithHtmlParser(parser HtmlParser) func(*ScrapeMate) error
- func WithHttpFetcher(client HttpFetcher) func(*ScrapeMate) error
- func WithJobProvider(provider JobProvider) func(*ScrapeMate) error
- func WithLogger(log logging.Logger) func(*ScrapeMate) error
- type Cacher
- type CsvCapable
- type HtmlParser
- type HttpFetcher
- type IJob
- type Job
- func (j *Job) BrowserActions(browser playwright.Browser) Response
- func (j *Job) DoCheckResponse(resp Response) bool
- func (j *Job) DoScreenshot() bool
- func (j *Job) GetBody() []byte
- func (j *Job) GetCacheKey() string
- func (j *Job) GetHeaders() map[string]string
- func (j *Job) GetID() string
- func (j *Job) GetMaxRetries() int
- func (j *Job) GetMaxRetryDelay() time.Duration
- func (j *Job) GetMethod() string
- func (j *Job) GetPriority() int
- func (j *Job) GetRetryPolicy() RetryPolicy
- func (j *Job) GetTimeout() time.Duration
- func (j *Job) GetURL() string
- func (j *Job) GetUrlParams() map[string]string
- func (j *Job) Process(ctx context.Context, resp Response) (any, []IJob, error)
- func (j *Job) String() string
- func (j *Job) UseInResults() bool
- type JobProvider
- type Response
- type Result
- type ResultWriter
- type RetryPolicy
- type ScrapeMate
- func (s *ScrapeMate) Concurrency() int
- func (s *ScrapeMate) DoJob(ctx context.Context, job IJob) (result any, next []IJob, err error)
- func (s *ScrapeMate) Done() <-chan struct{}
- func (s *ScrapeMate) Err() error
- func (s *ScrapeMate) Failed() <-chan IJob
- func (s *ScrapeMate) Results() <-chan Result
- func (s *ScrapeMate) Start() error
Constants ¶
const (
    // DefaultUserAgent is the default user agent scrapemate uses
    DefaultUserAgent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.79 Safari/537.36"
    // RetryJob retries a job
    RetryJob = 0
    // DiscardJob discards the job when crawling fails
    DiscardJob = 1
    // RefreshIP refreshes the IP and then retries the job
    RefreshIP = 2
    // StopScraping exits scraping completely when an error happens
    StopScraping = 3
    // DefaultMaxRetryDelay is the default maximum delay between two consecutive retries
    DefaultMaxRetryDelay = 2 * time.Second
)
Variables ¶
var (
    // ErrorNoJobProvider is returned when no job provider is set during initialization
    ErrorNoJobProvider = errors.New("no job provider set")
    // ErrorExitSignal is returned when scrapemate exits because of a system interrupt
    ErrorExitSignal = errors.New("exit signal received")
    // ErrorNoLogger is returned when you try to initialize with a nil logger
    ErrorNoLogger = errors.New("no logger set")
    // ErrorNoContext is returned when you try to initialize with a nil context
    ErrorNoContext = errors.New("no context set")
    // ErrorConcurrency is returned when you try to initialize with concurrency < 1
    ErrorConcurrency = errors.New("concurrency must be greater than 0")
    // ErrorNoHttpFetcher is returned when you try to initialize with a nil HttpFetcher
    ErrorNoHttpFetcher = errors.New("no http fetcher set")
    // ErrorNoHtmlParser is returned when you try to initialize with a nil HtmlParser
    ErrorNoHtmlParser = errors.New("no html parser set")
    // ErrorNoCacher is returned when you try to initialize with a nil Cacher
    ErrorNoCacher = errors.New("no cacher set")
    // ErrorNotCsvCapable is returned when you try to write a csv file and the Data is not csv capable
    ErrorNotCsvCapable = errors.New("not csv capable")
)
Functions ¶
func WithCache ¶ added in v0.1.1
func WithCache(cache Cacher) func(*ScrapeMate) error
WithCache sets the cache for the scrapemate
func WithConcurrency ¶
func WithConcurrency(concurrency int) func(*ScrapeMate) error
WithConcurrency sets the concurrency for the scrapemate
func WithContext ¶
func WithContext(ctx context.Context, cancelFn context.CancelCauseFunc) func(*ScrapeMate) error
WithContext sets the context for the scrapemate
func WithFailed ¶
func WithFailed() func(*ScrapeMate) error
WithFailed sets the failed jobs channel for the scrapemate
func WithHtmlParser ¶
func WithHtmlParser(parser HtmlParser) func(*ScrapeMate) error
WithHtmlParser sets the html parser for the scrapemate
func WithHttpFetcher ¶
func WithHttpFetcher(client HttpFetcher) func(*ScrapeMate) error
WithHttpFetcher sets the http fetcher for the scrapemate
func WithJobProvider ¶
func WithJobProvider(provider JobProvider) func(*ScrapeMate) error
WithJobProvider sets the job provider for the scrapemate
func WithLogger ¶
func WithLogger(log logging.Logger) func(*ScrapeMate) error
WithLogger sets the logger for the scrapemate
Types ¶
type Cacher ¶ added in v0.1.1
type Cacher interface {
    Close() error
    Get(ctx context.Context, key string) (Response, error)
    Set(ctx context.Context, key string, value Response) error
}
Cacher is an interface for a response cache
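For illustration, a minimal in-memory Cacher could look like the sketch below. The memoryCache type, its map-based storage, and the "cache miss" error are assumptions made for the example (and the module import path is assumed to be github.com/gosom/scrapemate); they are not part of the package.

import (
    "context"
    "errors"
    "sync"

    "github.com/gosom/scrapemate"
)

// memoryCache is a hypothetical in-memory Cacher implementation.
type memoryCache struct {
    mu    sync.RWMutex
    items map[string]scrapemate.Response
}

func newMemoryCache() *memoryCache {
    return &memoryCache{items: make(map[string]scrapemate.Response)}
}

// Get returns the cached response for key, or an error on a miss.
func (c *memoryCache) Get(ctx context.Context, key string) (scrapemate.Response, error) {
    c.mu.RLock()
    defer c.mu.RUnlock()
    resp, ok := c.items[key]
    if !ok {
        return scrapemate.Response{}, errors.New("cache miss")
    }
    return resp, nil
}

// Set stores the response under key.
func (c *memoryCache) Set(ctx context.Context, key string, value scrapemate.Response) error {
    c.mu.Lock()
    defer c.mu.Unlock()
    c.items[key] = value
    return nil
}

func (c *memoryCache) Close() error { return nil }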
type CsvCapable ¶ added in v0.2.1
CsvCapable is an interface for types that can be converted to csv. It is used to convert the Data of a Result to csv.
type HtmlParser ¶
HtmlParser is an interface for html parsers
type HttpFetcher ¶
HttpFetcher is an interface for http fetchers
type IJob ¶
type IJob interface {
    fmt.Stringer
    // GetID returns the unique identifier of the job.
    GetID() string
    // GetMethod returns the http method to use
    GetMethod() string
    // GetBody returns the body of the request
    GetBody() []byte
    // GetURL returns the url to request
    GetURL() string
    // GetHeaders returns the headers to use
    GetHeaders() map[string]string
    // GetUrlParams returns the url params to use
    GetUrlParams() map[string]string
    // GetTimeout returns the timeout of the job
    GetTimeout() time.Duration
    // GetPriority returns the priority of the job
    GetPriority() int
    // DoCheckResponse checks the response of the job
    DoCheckResponse(resp Response) bool
    // GetRetryPolicy returns the retry policy to apply when the response is rejected
    GetRetryPolicy() RetryPolicy
    // GetMaxRetries returns the max retries of the job
    GetMaxRetries() int
    // Process processes the job
    Process(ctx context.Context, resp Response) (any, []IJob, error)
    // GetMaxRetryDelay returns the delay to wait before retrying
    GetMaxRetryDelay() time.Duration
    // BrowserActions performs actions in the browser
    BrowserActions(browser playwright.Browser) Response
    // DoScreenshot takes a screenshot of the page.
    // Only works if the scraper uses jsfetcher
    DoScreenshot() bool
    // GetCacheKey returns the key to use for caching
    GetCacheKey() string
    // UseInResults returns true if the job should be used in the results
    UseInResults() bool
}
IJob is a job to be processed by the scrapemate
type Job ¶
type Job struct {
    // ID is an identifier for the job
    ID string
    // Method can be any valid HTTP method
    Method string
    // Body is the request's body
    Body []byte
    // URL is the url to send a request to
    URL string
    // Headers is the map of headers to use in HTTP
    Headers map[string]string
    // UrlParams are the url parameters to use in the query string
    UrlParams map[string]string
    // Timeout is the timeout of that job. By timeout we mean the time
    // it takes to finish a single crawl
    Timeout time.Duration
    // Priority is a number indicating the priority of the job
    Priority int
    // MaxRetries defines the maximum number of retries when a job fails
    MaxRetries int
    // CheckResponse is a function that takes a Response as input and returns:
    // true: when the response is to be accepted
    // false: when the response is to be rejected
    // By default a response is accepted if the status code is 200
    CheckResponse func(resp Response) bool
    // RetryPolicy can be one of:
    // RetryJob: retry the job until it is successful
    // DiscardJob: discard rejected responses and do not retry the job
    // RefreshIP: similar to RetryJob, with an important difference:
    // before the job is retried the IP is refreshed
    RetryPolicy RetryPolicy
    // MaxRetryDelay: by default, when a job is rejected it is retried with an exponential
    // backoff up to MaxRetries times. If the sleep time between retries exceeds
    // MaxRetryDelay then it is capped to that value. (Default is 2 seconds)
    MaxRetryDelay time.Duration
    // TakeScreenshot if true takes a screenshot of the page
    TakeScreenshot bool

    Response Response
}
Job is the base job that you can use as a starting point for your own jobs
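The usual pattern is to embed Job in your own job type and override Process (and any other IJob methods you need). Below is a minimal sketch; the titleJob type is purely illustrative, the goquery adapter is assumed to be the configured html parser, and the module import path is assumed to be github.com/gosom/scrapemate.

import (
    "context"
    "errors"

    "github.com/PuerkitoBio/goquery"
    "github.com/gosom/scrapemate"
)

// titleJob is a hypothetical job that extracts the page title.
type titleJob struct {
    scrapemate.Job
}

// Process parses the response and returns the extracted data plus any follow-up jobs.
func (j *titleJob) Process(ctx context.Context, resp scrapemate.Response) (any, []scrapemate.IJob, error) {
    // Document is set only when an html parser is configured; with the
    // goquery adapter it is assumed to be a *goquery.Document.
    doc, ok := resp.Document.(*goquery.Document)
    if !ok {
        return nil, nil, errors.New("expected a *goquery.Document")
    }
    title := doc.Find("title").Text()
    // No follow-up jobs in this example, so the second return value is nil.
    return map[string]string{"url": resp.URL, "title": title}, nil, nil
}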
func (*Job) BrowserActions ¶
BrowserActions is the function that is executed in the browser. This default implementation simply returns the response; override it to perform actions in the browser.
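Continuing the titleJob sketch above, an override that drives the browser before returning the page content might look roughly like this. It assumes the playwright-go Page API (NewPage, Goto, Content, Close) and keeps error handling minimal; it is a sketch, not the package's implementation.

func (j *titleJob) BrowserActions(browser playwright.Browser) scrapemate.Response {
    var resp scrapemate.Response
    page, err := browser.NewPage()
    if err != nil {
        resp.Error = err
        return resp
    }
    defer page.Close()

    pageResp, err := page.Goto(j.GetURL())
    if err != nil {
        resp.Error = err
        return resp
    }
    // Perform clicks, scrolling, waits, etc. here before reading the content.
    content, err := page.Content()
    if err != nil {
        resp.Error = err
        return resp
    }
    resp.URL = j.GetURL()
    resp.StatusCode = pageResp.Status()
    resp.Body = []byte(content)
    return resp
}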
func (*Job) DoCheckResponse ¶
DoCheckResponse checks the response of the job
func (*Job) DoScreenshot ¶
DoScreenshot reports whether a screenshot should be taken. It's here since it's a common use case.
func (*Job) GetCacheKey ¶ added in v0.1.1
GetCacheKey returns the key to use for caching
func (*Job) GetHeaders ¶
GetHeaders returns the headers to use
func (*Job) GetMaxRetries ¶
GetMaxRetries returns the maximum number of retries of the job
func (*Job) GetMaxRetryDelay ¶
GetMaxRetryDelay returns the maximum delay to wait before retrying
func (*Job) GetPriority ¶
GetPriority returns the priority of the job
func (*Job) GetRetryPolicy ¶
func (j *Job) GetRetryPolicy() RetryPolicy
GetRetryPolicy returns the retry policy to apply when the response is rejected
func (*Job) GetTimeout ¶
GetTimeout returns the timeout of the job
func (*Job) GetUrlParams ¶
GetUrlParams returns the url params to use
func (*Job) UseInResults ¶ added in v0.2.1
UseInResults returns true if the job should be used in the results
type JobProvider ¶
type JobProvider interface {
    // Jobs returns a channel of jobs to process and a channel of errors
    Jobs(ctx context.Context) (<-chan IJob, <-chan error)
    // Push pushes a job to the job provider
    Push(ctx context.Context, job IJob) error
}
JobProvider is an interface for job providers. A job provider is a service that provides jobs to scrapemate; scrapemate calls the job provider to get jobs.
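A minimal in-memory provider might look like the sketch below. The memoryProvider type and its buffered channel are illustrative assumptions; a real provider would likely add priorities and persistence.

import (
    "context"

    "github.com/gosom/scrapemate"
)

// memoryProvider is a hypothetical in-memory JobProvider backed by a buffered channel.
type memoryProvider struct {
    jobs chan scrapemate.IJob
}

func newMemoryProvider() *memoryProvider {
    return &memoryProvider{jobs: make(chan scrapemate.IJob, 1024)}
}

// Push enqueues a job, or fails if the context is cancelled first.
func (p *memoryProvider) Push(ctx context.Context, job scrapemate.IJob) error {
    select {
    case p.jobs <- job:
        return nil
    case <-ctx.Done():
        return ctx.Err()
    }
}

// Jobs streams queued jobs until the context is cancelled.
func (p *memoryProvider) Jobs(ctx context.Context) (<-chan scrapemate.IJob, <-chan error) {
    out := make(chan scrapemate.IJob)
    errc := make(chan error, 1)
    go func() {
        defer close(out)
        for {
            select {
            case <-ctx.Done():
                errc <- ctx.Err()
                return
            case job := <-p.jobs:
                select {
                case out <- job:
                case <-ctx.Done():
                    errc <- ctx.Err()
                    return
                }
            }
        }
    }()
    return out, errc
}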
type Response ¶
type Response struct {
    URL        string
    StatusCode int
    Headers    http.Header
    Duration   time.Duration
    Body       []byte
    Error      error
    Meta       map[string]any
    Screenshot []byte

    // Document is the parsed document.
    // If you don't set an html parser the document will be nil.
    // Since each html parser has its own document type,
    // the document is an interface and you need to cast it to the
    // type of the parser you are using.
    // For example, if you are using goquery you need to cast it to *goquery.Document.
    // If you are using the stdlib parser net/html then it will be *html.Node.
    Document any
}
Response is the struct that is returned when crawling finishes
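For example, with the stdlib net/html parser the cast inside Process would look roughly like the sketch below; rawJob is a hypothetical job type and golang.org/x/net/html is assumed to be the node type produced by that parser.

func (j *rawJob) Process(ctx context.Context, resp scrapemate.Response) (any, []scrapemate.IJob, error) {
    if resp.Document == nil {
        // no html parser was configured, fall back to the raw body
        return string(resp.Body), nil, nil
    }
    root, ok := resp.Document.(*html.Node)
    if !ok {
        return nil, nil, fmt.Errorf("unexpected document type %T", resp.Document)
    }
    _ = root // traverse the node tree here
    return nil, nil, nil
}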
type ResultWriter ¶ added in v0.2.1
ResultWriter is an interface for result writers
type RetryPolicy ¶
type RetryPolicy int
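RetryPolicy takes one of the constants listed above (RetryJob, DiscardJob, RefreshIP, StopScraping). For example, a job that retries up to 3 times with the backoff capped at 5 seconds could be configured roughly like this (the field values are illustrative):

job := &scrapemate.Job{
    URL:           "https://example.com",
    Method:        http.MethodGet,
    MaxRetries:    3,
    RetryPolicy:   scrapemate.RetryJob,
    MaxRetryDelay: 5 * time.Second,
}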
type ScrapeMate ¶ added in v0.2.1
type ScrapeMate struct {
// contains filtered or unexported fields
}
ScrapeMate contains unexported fields
func New ¶
func New(options ...func(*ScrapeMate) error) (*ScrapeMate, error)
New creates a new scrapemate
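A sketch of wiring the options together is shown below. The provider, fetcher, parser, and logger values are placeholders for whatever adapters you plug in (for example the in-memory provider sketched earlier); only the option functions themselves come from this package.

ctx, cancel := context.WithCancelCause(context.Background())

mate, err := scrapemate.New(
    scrapemate.WithContext(ctx, cancel),
    scrapemate.WithConcurrency(4),
    scrapemate.WithJobProvider(provider), // any JobProvider implementation
    scrapemate.WithHttpFetcher(fetcher),  // any HttpFetcher implementation
    scrapemate.WithHtmlParser(parser),    // any HtmlParser implementation
    scrapemate.WithLogger(logger),        // a logging.Logger
    scrapemate.WithFailed(),              // expose the Failed() channel
)
if err != nil {
    log.Fatal(err)
}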
func (*ScrapeMate) Concurrency ¶ added in v0.2.1
func (s *ScrapeMate) Concurrency() int
Concurrency returns how many workers are running in parallel
func (*ScrapeMate) Done ¶ added in v0.2.1
func (s *ScrapeMate) Done() <-chan struct{}
Done returns a channel that's closed when the work is done
func (*ScrapeMate) Err ¶ added in v0.2.1
func (s *ScrapeMate) Err() error
Err returns the error that caused scrapemate's context cancellation
func (*ScrapeMate) Failed ¶ added in v0.2.1
func (s *ScrapeMate) Failed() <-chan IJob
Failed returns the channel that contains the jobs that failed. It's nil if you don't use the WithFailed option.
func (*ScrapeMate) Results ¶ added in v0.2.1
func (s *ScrapeMate) Results() <-chan Result
Results returns a channel containing the results
func (*ScrapeMate) Start ¶ added in v0.2.1
func (s *ScrapeMate) Start() error
Start starts the scraper
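Since Start returns an error when scrapemate exits (for example ErrorExitSignal on a system interrupt), a common pattern is to run it in a goroutine and drain the Results and Failed channels concurrently. A sketch continuing the New example above:

go func() {
    if err := mate.Start(); err != nil {
        log.Println("scrapemate stopped:", err)
    }
}()

go func() {
    // Failed() is non-nil here because WithFailed() was used above.
    for job := range mate.Failed() {
        log.Println("failed job:", job.GetID())
    }
}()

for result := range mate.Results() {
    // each Result carries what the job's Process returned
    fmt.Printf("%+v\n", result)
}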
Source Files ¶
Directories ¶
Path | Synopsis
---|---
adapters |
 | Package mock is a generated GoMock package.
 | Package mock is a generated GoMock package.