Documentation ¶
Index ¶
- Constants
- Variables
- func ContextWithLogger(ctx context.Context, logger logging.Logger) context.Context
- func GetLoggerFromContext(ctx context.Context) logging.Logger
- func WithCache(cache Cacher) func(*ScrapeMate) error
- func WithConcurrency(concurrency int) func(*ScrapeMate) error
- func WithContext(ctx context.Context, cancelFn context.CancelCauseFunc) func(*ScrapeMate) error
- func WithExitBecauseOfInactivity(duration time.Duration) func(*ScrapeMate) error
- func WithFailed() func(*ScrapeMate) error
- func WithHTMLParser(parser HTMLParser) func(*ScrapeMate) error
- func WithHTTPFetcher(client HTTPFetcher) func(*ScrapeMate) error
- func WithInitJob(job IJob) func(*ScrapeMate) error
- func WithJobProvider(provider JobProvider) func(*ScrapeMate) error
- func WithLogger(log logging.Logger) func(*ScrapeMate) error
- type Cacher
- type CsvCapable
- type HTMLParser
- type HTTPFetcher
- type IJob
- type Job
- func (j *Job) BrowserActions(_ context.Context, page playwright.Page) Response
- func (j *Job) DoCheckResponse(resp *Response) bool
- func (j *Job) DoScreenshot() bool
- func (j *Job) GetBody() []byte
- func (j *Job) GetCacheKey() string
- func (j *Job) GetFullURL() string
- func (j *Job) GetHeaders() map[string]string
- func (j *Job) GetID() string
- func (j *Job) GetMaxRetries() int
- func (j *Job) GetMaxRetryDelay() time.Duration
- func (j *Job) GetMethod() string
- func (j *Job) GetParentID() string
- func (j *Job) GetPriority() int
- func (j *Job) GetRetryPolicy() RetryPolicy
- func (j *Job) GetTimeout() time.Duration
- func (j *Job) GetURL() string
- func (j *Job) GetURLParams() map[string]string
- func (j *Job) Process(_ context.Context, _ *Response) (any, []IJob, error)
- func (j *Job) ProcessOnFetchError() bool
- func (j *Job) String() string
- func (j *Job) UseInResults() bool
- type JobProvider
- type ProxyRotator
- type Response
- type Result
- type ResultWriter
- type RetryPolicy
- type ScrapeMate
- func (s *ScrapeMate) Close() error
- func (s *ScrapeMate) Concurrency() int
- func (s *ScrapeMate) DoJob(ctx context.Context, job IJob) (result any, next []IJob, err error)
- func (s *ScrapeMate) Done() <-chan struct{}
- func (s *ScrapeMate) Err() error
- func (s *ScrapeMate) Failed() <-chan IJob
- func (s *ScrapeMate) Results() <-chan Result
- func (s *ScrapeMate) Start() error
Constants ¶
const (
	// DefaultUserAgent is the default user agent scrapemate uses
	DefaultUserAgent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.79 Safari/537.36"
	// RetryJob retries a job
	RetryJob = 0
	// DiscardJob discards a job when crawling fails
	DiscardJob = 1
	// RefreshIP refreshes the IP and then retries the job
	RefreshIP = 2
	// StopScraping exits scraping completely when an error happens
	StopScraping = 3
	// DefaultMaxRetryDelay is the default maximum delay between two consecutive retries
	DefaultMaxRetryDelay = 2 * time.Second
	// PriorityHigh high priority
	PriorityHigh = 0
	// PriorityMedium medium priority
	PriorityMedium = 1
	// PriorityLow low priority
	PriorityLow = 2
)
Variables ¶
var (
	// ErrorNoJobProvider is returned when you do not set a job provider during initialization
	ErrorNoJobProvider = errors.New("no job provider set")
	// ErrorExitSignal is returned when scrapemate exits because of a system interrupt
	ErrorExitSignal = errors.New("exit signal received")
	// ErrorNoLogger is returned when you try to initialize with a nil logger
	ErrorNoLogger = errors.New("no logger set")
	// ErrorNoContext is returned when you try to initialize with a nil context
	ErrorNoContext = errors.New("no context set")
	// ErrorConcurrency is returned when you try to initialize with concurrency < 1
	ErrorConcurrency = errors.New("concurrency must be greater than 0")
	// ErrorNoHTMLFetcher is returned when you try to initialize with a nil HTTPFetcher
	ErrorNoHTMLFetcher = errors.New("no http fetcher set")
	// ErrorNoHTMLParser is returned when you try to initialize with a nil HTMLParser
	ErrorNoHTMLParser = errors.New("no html parser set")
	// ErrorNoCacher is returned when you try to initialize with a nil Cacher
	ErrorNoCacher = errors.New("no cacher set")
	// ErrorNotCsvCapable is returned when you try to write a csv file without csv-capable Data
	ErrorNotCsvCapable = errors.New("not csv capable")
	// ErrInactivityTimeout is returned when the system exits because of inactivity
	ErrInactivityTimeout = errors.New("inactivity timeout")
)
Functions ¶
func ContextWithLogger ¶ added in v0.4.3
ContextWithLogger returns a new context with the logger
func GetLoggerFromContext ¶ added in v0.2.2
GetLoggerFromContext returns a logger from the context or a default logger
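For illustration, a minimal sketch of combining the two helpers. The appLogger value and the import for the logging package are assumptions; only ContextWithLogger and GetLoggerFromContext come from this page:

// Sketch: assumes imports of "context", "github.com/gosom/scrapemate" and the
// package that provides logging.Logger.
func propagateLogger(ctx context.Context, appLogger logging.Logger) context.Context {
	// Attach the logger to the context so downstream code can retrieve it.
	ctx = scrapemate.ContextWithLogger(ctx, appLogger)

	// Downstream: returns the stored logger, or a default logger if none was set.
	logger := scrapemate.GetLoggerFromContext(ctx)
	_ = logger // the concrete logging.Logger methods are not documented on this page

	return ctx
}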
func WithCache ¶ added in v0.1.1
func WithCache(cache Cacher) func(*ScrapeMate) error
WithCache sets the cache for the scrapemate
func WithConcurrency ¶
func WithConcurrency(concurrency int) func(*ScrapeMate) error
WithConcurrency sets the concurrency for the scrapemate
func WithContext ¶
func WithContext(ctx context.Context, cancelFn context.CancelCauseFunc) func(*ScrapeMate) error
WithContext sets the context for the scrapemate
func WithExitBecauseOfInactivity ¶ added in v0.5.0
func WithExitBecauseOfInactivity(duration time.Duration) func(*ScrapeMate) error
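WithExitBecauseOfInactivity makes scrapemate exit when there is no activity for the given duration (see ErrInactivityTimeout).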
func WithFailed ¶
func WithFailed() func(*ScrapeMate) error
WithFailed sets the failed jobs channel for the scrapemate
func WithHTMLParser ¶ added in v0.4.0
func WithHTMLParser(parser HTMLParser) func(*ScrapeMate) error
WithHTMLParser sets the html parser for the scrapemate
func WithHTTPFetcher ¶ added in v0.4.0
func WithHTTPFetcher(client HTTPFetcher) func(*ScrapeMate) error
WithHTTPFetcher sets the http fetcher for the scrapemate
func WithInitJob ¶ added in v0.4.3
func WithInitJob(job IJob) func(*ScrapeMate) error
WithInitJob sets the first job to be processed. It will be processed before the jobs from the job provider. It is useful if you want to start the scraper with a specific job instead of the first one from the job provider. A real use case is when you want to obtain some cookies before starting the scraping process (e.g. login). Important: the results from this job will be discarded!
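As a hedged sketch (loginJob and the ctx/cancel/provider/fetcher/parser values are hypothetical), an init job is passed like any other option:

func buildScraper(ctx context.Context, cancel context.CancelCauseFunc,
	provider scrapemate.JobProvider, fetcher scrapemate.HTTPFetcher, parser scrapemate.HTMLParser,
) (*scrapemate.ScrapeMate, error) {
	return scrapemate.New(
		scrapemate.WithContext(ctx, cancel),
		scrapemate.WithJobProvider(provider),
		scrapemate.WithHTTPFetcher(fetcher),
		scrapemate.WithHTMLParser(parser),
		scrapemate.WithConcurrency(4),
		// loginJob is a hypothetical IJob whose Process obtains session cookies;
		// its results are discarded by scrapemate.
		scrapemate.WithInitJob(&loginJob{}),
	)
}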
func WithJobProvider ¶
func WithJobProvider(provider JobProvider) func(*ScrapeMate) error
WithJobProvider sets the job provider for the scrapemate
func WithLogger ¶
func WithLogger(log logging.Logger) func(*ScrapeMate) error
WithLogger sets the logger for the scrapemate
Types ¶
type Cacher ¶ added in v0.1.1
type Cacher interface {
	Close() error
	Get(ctx context.Context, key string) (Response, error)
	Set(ctx context.Context, key string, value *Response) error
}
Cacher is an interface for cache
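The interface is small enough that an in-memory implementation makes a reasonable sketch; the memoryCache type below is illustrative only and not part of the package:

// memoryCache is a hypothetical in-memory Cacher.
type memoryCache struct {
	mu   sync.RWMutex
	data map[string]scrapemate.Response
}

func newMemoryCache() *memoryCache {
	return &memoryCache{data: make(map[string]scrapemate.Response)}
}

func (c *memoryCache) Get(ctx context.Context, key string) (scrapemate.Response, error) {
	c.mu.RLock()
	defer c.mu.RUnlock()
	resp, ok := c.data[key]
	if !ok {
		return scrapemate.Response{}, errors.New("cache miss")
	}
	return resp, nil
}

func (c *memoryCache) Set(ctx context.Context, key string, value *scrapemate.Response) error {
	c.mu.Lock()
	defer c.mu.Unlock()
	c.data[key] = *value
	return nil
}

func (c *memoryCache) Close() error { return nil }

Such a cache could then be plugged in with WithCache(newMemoryCache()).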
type CsvCapable ¶ added in v0.2.1
CsvCapable is an interface for types that can be converted to csv. It is used to convert the Data of a Result to csv.
type HTMLParser ¶ added in v0.4.0
HTMLParser is an interface for html parsers
type HTTPFetcher ¶ added in v0.4.0
HTTPFetcher is an interface for http fetchers
type IJob ¶
type IJob interface {
	fmt.Stringer
	// GetID returns the unique identifier of the job.
	GetID() string
	// GetParentID returns the parent id of the job
	GetParentID() string
	// GetMethod returns the http method to use
	GetMethod() string
	// GetBody returns the body of the request
	GetBody() []byte
	// GetURL returns the url to request
	GetURL() string
	// GetHeaders returns the headers to use
	GetHeaders() map[string]string
	// GetURLParams returns the url params to use
	GetURLParams() map[string]string
	// GetFullURL returns the full url to request
	// it includes the url params
	GetFullURL() string
	// GetTimeout returns the timeout of the job
	GetTimeout() time.Duration
	// GetPriority returns the priority of the job
	GetPriority() int
	// DoCheckResponse checks the response of the job
	DoCheckResponse(resp *Response) bool
	// GetRetryPolicy returns the action to perform on the response
	GetRetryPolicy() RetryPolicy
	// GetMaxRetries returns the max retries of the job
	GetMaxRetries() int
	// Process processes the job
	Process(ctx context.Context, resp *Response) (any, []IJob, error)
	// GetMaxRetryDelay returns the delay to wait before retrying
	GetMaxRetryDelay() time.Duration
	// BrowserActions performs actions in the browser (used by the js fetcher)
	BrowserActions(ctx context.Context, page playwright.Page) Response
	// DoScreenshot takes a screenshot of the page
	// Only works if the scraper uses jsfetcher
	DoScreenshot() bool
	// GetCacheKey returns the key to use for caching
	GetCacheKey() string
	// UseInResults returns true if the job should be used in the results
	UseInResults() bool
	// ProcessOnFetchError returns true if the job should be processed even if the fetch failed
	ProcessOnFetchError() bool
}
IJob is a job to be processed by the scrapemate
type Job ¶
type Job struct {
	// ID is an identifier for the job
	ID string
	// ParentID is the parent id of the job
	ParentID string
	// Method can be any valid HTTP method
	Method string
	// Body is the request's body
	Body []byte
	// URL is the url to send a request to
	URL string
	// Headers is the map of headers to use in HTTP
	Headers map[string]string
	// URLParams are the url parameters to use in the query string
	URLParams map[string]string
	// Timeout is the timeout of that job, i.e. the time a single crawl
	// is allowed to take
	Timeout time.Duration
	// Priority is a number indicating the priority; by convention the lower
	// the number the higher the priority
	Priority int
	// MaxRetries defines the maximum number of retries when a job fails
	MaxRetries int
	// CheckResponse is a function that takes a Response as input and returns:
	// true: when the response is to be accepted
	// false: when the response is to be rejected
	// By default a response is accepted if the status code is 200
	CheckResponse func(resp *Response) bool
	// RetryPolicy can be one of:
	// RetryJob: retry the job until it is successful
	// DiscardJob: discard responses that are not accepted and do not retry the job
	// RefreshIP: similar to RetryJob with one important difference:
	// before the job is retried the IP is refreshed
	RetryPolicy RetryPolicy
	// MaxRetryDelay: by default a rejected job is retried with an exponential backoff
	// for at most MaxRetries times. If the sleep time between retries exceeds
	// MaxRetryDelay it is capped to that value. (Default is 2 seconds)
	MaxRetryDelay time.Duration
	// TakeScreenshot if true takes a screenshot of the page
	TakeScreenshot bool

	Response Response
}
Job is the base job that we may use
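Since Job already satisfies most of IJob, a concrete job can embed it and override Process. The exampleJob type below is a hypothetical sketch, not part of the package:

// exampleJob embeds Job for the IJob plumbing and overrides Process.
type exampleJob struct {
	scrapemate.Job
}

func (j *exampleJob) Process(ctx context.Context, resp *scrapemate.Response) (any, []scrapemate.IJob, error) {
	// Return the raw body as the result and schedule no follow-up jobs.
	return string(resp.Body), nil, nil
}

// Constructing it only requires filling the base Job fields you care about.
func newExampleJob(u string) *exampleJob {
	return &exampleJob{
		Job: scrapemate.Job{
			ID:         "example-1",
			Method:     "GET",
			URL:        u,
			MaxRetries: 2,
			Priority:   scrapemate.PriorityHigh,
		},
	}
}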
func (*Job) BrowserActions ¶
BrowserActions is the function that will be executed in the browser. This default implementation just returns the response; override it to perform actions in the browser.
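A hedged sketch of such an override follows (jsJob is hypothetical, and the override only matters when the scraper uses the playwright-based js fetcher); it uses the standard playwright-go Goto and Content calls:

func (j *jsJob) BrowserActions(ctx context.Context, page playwright.Page) scrapemate.Response {
	var resp scrapemate.Response

	// Navigate to the job's URL in the real browser.
	pageResp, err := page.Goto(j.GetFullURL())
	if err != nil {
		resp.Error = err
		return resp
	}
	resp.URL = pageResp.URL()
	resp.StatusCode = pageResp.Status()

	// Grab the rendered HTML so the html parser and Process can work on it.
	body, err := page.Content()
	if err != nil {
		resp.Error = err
		return resp
	}
	resp.Body = []byte(body)

	return resp
}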
func (*Job) DoCheckResponse ¶
DoCheckResponse checks the response of the job
func (*Job) DoScreenshot ¶
DoScreenshot reports whether a screenshot should be taken. It's here since it's a common use case.
func (*Job) GetCacheKey ¶ added in v0.1.1
GetCacheKey returns the key to use for caching
func (*Job) GetFullURL ¶ added in v0.2.2
func (*Job) GetHeaders ¶
GetHeaders returns the headers to use
func (*Job) GetMaxRetries ¶
GetMaxRetries returns the max retries of the job
func (*Job) GetMaxRetryDelay ¶
GetMaxRetryDelay returns the maximum delay to wait before retrying
func (*Job) GetParentID ¶ added in v0.5.1
GetParentID returns the parent id of the job
func (*Job) GetPriority ¶
GetPriority returns the priority of the job
func (*Job) GetRetryPolicy ¶
func (j *Job) GetRetryPolicy() RetryPolicy
GetRetryPolicy returns the action to perform on the response
func (*Job) GetTimeout ¶
GetTimeout returns the timeout of the job
func (*Job) GetURLParams ¶ added in v0.4.0
GetURLParams returns the url params to use
func (*Job) ProcessOnFetchError ¶ added in v0.5.3
ProcessOnFetchError returns true if the job should be processed even if the fetch failed
func (*Job) UseInResults ¶ added in v0.2.1
UseInResults returns true if the job should be used in the results
type JobProvider ¶
type JobProvider interface {
	Jobs(ctx context.Context) (<-chan IJob, <-chan error)
	// Push pushes a job to the job provider
	Push(ctx context.Context, job IJob) error
}
JobProvider is an interface for job providers. A job provider is a service that provides jobs to scrapemate; scrapemate will call the job provider to get jobs.
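A minimal channel-backed provider makes the contract concrete. chanProvider is an illustrative sketch, not the provider shipped with the package:

// chanProvider is a hypothetical in-memory JobProvider backed by a channel.
type chanProvider struct {
	jobs chan scrapemate.IJob
	errs chan error
}

func newChanProvider(buffer int) *chanProvider {
	return &chanProvider{
		jobs: make(chan scrapemate.IJob, buffer),
		errs: make(chan error, 1),
	}
}

func (p *chanProvider) Push(ctx context.Context, job scrapemate.IJob) error {
	select {
	case p.jobs <- job:
		return nil
	case <-ctx.Done():
		return ctx.Err()
	}
}

// Jobs hands scrapemate the channels it will consume from.
// A real provider would also report fetch/storage errors on the error channel.
func (p *chanProvider) Jobs(ctx context.Context) (<-chan scrapemate.IJob, <-chan error) {
	return p.jobs, p.errs
}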
type ProxyRotator ¶ added in v0.7.0
type ProxyRotator interface {
	GetCredentials() (string, string)
	RoundTrip(req *http.Request) (*http.Response, error)
	Next() string
}
ProxyRotator is an interface for proxy rotators
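As an illustration only (the credential semantics below are an assumption), a round-robin rotator could look like this:

// roundRobinRotator is a hypothetical ProxyRotator cycling over fixed proxy URLs.
type roundRobinRotator struct {
	proxies    []string
	counter    atomic.Uint64
	user, pass string // assumed to be what GetCredentials should return
}

func (r *roundRobinRotator) Next() string {
	n := r.counter.Add(1)
	return r.proxies[int(n-1)%len(r.proxies)]
}

func (r *roundRobinRotator) GetCredentials() (string, string) {
	return r.user, r.pass
}

// RoundTrip sends the request through the next proxy. Building a Transport per
// request keeps the sketch short; a real implementation would reuse transports.
func (r *roundRobinRotator) RoundTrip(req *http.Request) (*http.Response, error) {
	proxyURL, err := url.Parse(r.Next())
	if err != nil {
		return nil, err
	}
	t := &http.Transport{Proxy: http.ProxyURL(proxyURL)}
	return t.RoundTrip(req)
}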
type Response ¶
type Response struct {
	URL        string
	StatusCode int
	Headers    http.Header
	Duration   time.Duration
	Body       []byte
	Error      error
	Meta       map[string]any
	Screenshot []byte

	// Document is the parsed document.
	// If you don't set an html parser the document will be nil.
	// Since each html parser has its own document type, the document is an
	// interface and you need to cast it to the type of the parser you are using.
	// For example if you are using goquery you need to cast it to *goquery.Document.
	// If you are using the stdlib parser net/html then it will be *html.Node.
	Document any
}
Response is the struct that is returned when crawling finishes.
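Following the Document field's note, a Process implementation with a goquery-based parser would cast before querying. titleJob is a hypothetical job type used for illustration:

func (j *titleJob) Process(ctx context.Context, resp *scrapemate.Response) (any, []scrapemate.IJob, error) {
	// Document is only populated when an html parser is configured.
	doc, ok := resp.Document.(*goquery.Document)
	if !ok {
		return nil, nil, errors.New("expected a *goquery.Document")
	}

	// Extract the page title as this job's result; no follow-up jobs.
	title := doc.Find("title").Text()

	return title, nil, nil
}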
type ResultWriter ¶ added in v0.2.1
ResultWriter is an interface for result writers
type RetryPolicy ¶
type RetryPolicy int
type ScrapeMate ¶ added in v0.2.1
type ScrapeMate struct {
// contains filtered or unexported fields
}
ScrapeMate contains unexported fields.
func New ¶
func New(options ...func(*ScrapeMate) error) (*ScrapeMate, error)
New creates a new scrapemate
func (*ScrapeMate) Close ¶ added in v0.7.1
func (s *ScrapeMate) Close() error
func (*ScrapeMate) Concurrency ¶ added in v0.2.1
func (s *ScrapeMate) Concurrency() int
Concurrency returns how many workers are running in parallel
func (*ScrapeMate) Done ¶ added in v0.2.1
func (s *ScrapeMate) Done() <-chan struct{}
Done returns a channel that's closed when the work is done
func (*ScrapeMate) Err ¶ added in v0.2.1
func (s *ScrapeMate) Err() error
Err returns the error that caused scrapemate's context cancellation
func (*ScrapeMate) Failed ¶ added in v0.2.1
func (s *ScrapeMate) Failed() <-chan IJob
Failed returns the channel that contains the jobs that failed. It's nil unless you use the WithFailed option.
func (*ScrapeMate) Results ¶ added in v0.2.1
func (s *ScrapeMate) Results() <-chan Result
Results returns a channel containing the results
func (*ScrapeMate) Start ¶ added in v0.2.1
func (s *ScrapeMate) Start() error
Start starts the scraper
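Putting the pieces together, a hedged end-to-end sketch (constructing the ScrapeMate itself is shown under WithInitJob above) consumes Results concurrently and then blocks on Start:

func run(mate *scrapemate.ScrapeMate) error {
	defer mate.Close()

	// Drain results while the workers run; Start blocks until scraping finishes.
	go func() {
		for result := range mate.Results() {
			_ = result.Data // Data is the value produced by the job's Process
		}
	}()

	return mate.Start()
}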
Source Files ¶
Directories ¶
Path | Synopsis
---|---
adapters |
| Package mock is a generated GoMock package.
| Package mock is a generated GoMock package.