scrapemate

package module
v0.8.2 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Nov 3, 2024 License: MIT Imports: 17 Imported by: 14

README

scrapemate

GoDoc build Go Report Card

Scrapemate is a web crawling and scraping framework written in Golang. It is designed to be simple and easy to use, yet powerful enough to handle complex scraping tasks.

Features

  • Low level API & Easy High Level API
  • Customizable retry and error handling
  • Javascript Rendering with ability to control the browser
  • Screenshots support (when JS rendering is enabled)
  • Capability to write your own result exporter
  • Capability to write results in multiple sinks
  • Default CSV writer
  • Caching (File/LevelDB/Custom)
  • Custom job providers (memory provider included)
  • Headless and Headful support when using JS rendering
  • Automatic cookie and session handling
  • Rotating HTTP/HTTPS/SOCKS5 proxy support

Installation

go get github.com/gosom/scrapemate

Quickstart

package main

import (
	"context"
	"encoding/csv"
	"errors"
	"fmt"
	"net/http"
	"os"
	"strings"
	"time"

	"github.com/PuerkitoBio/goquery"
	"github.com/gosom/scrapemate"
	"github.com/gosom/scrapemate/adapters/writers/csvwriter"
	"github.com/gosom/scrapemate/scrapemateapp"
)

// main is the quickstart entry point: it sends results to a CSV writer on
// stdout, builds a scrapemate app around that writer and runs it with a
// single seed job.
func main() {
	// Results are written as CSV to stdout so the output can be redirected to a file.
	csvWriter := csvwriter.NewCsvWriter(csv.NewWriter(os.Stdout))

	cfg, err := scrapemateapp.NewConfig(
		[]scrapemate.ResultWriter{csvWriter},
	)
	if err != nil {
		panic(err)
	}
	app, err := scrapemateapp.NewScrapeMateApp(cfg)
	if err != nil {
		panic(err)
	}
	seedJobs := []scrapemate.IJob{
		&SimpleCountryJob{
			Job: scrapemate.Job{
				ID:     "identity",
				Method: http.MethodGet,
				URL:    "https://www.scrapethissite.com/pages/simple/",
				Headers: map[string]string{
					"User-Agent": scrapemate.DefaultUserAgent,
				},
				Timeout:    10 * time.Second,
				MaxRetries: 3,
			},
		},
	}
	err = app.Start(context.Background(), seedJobs...)
	// ErrorExitSignal is what scrapemate returns after a system interrupt
	// (CTRL-C), so it is a normal way to stop and not a failure. errors.Is is
	// used instead of != so the check still matches if the sentinel is wrapped.
	if err != nil && !errors.Is(err, scrapemate.ErrorExitSignal) {
		panic(err)
	}
}

// SimpleCountryJob is the quickstart job. It embeds scrapemate.Job for the
// request settings and provides its own Process to extract the countries.
type SimpleCountryJob struct {
	scrapemate.Job
}

// Process converts the parsed page in resp into a slice of Country, one per
// "div.col-md-4.country" element. It never returns follow-up jobs.
// An error is returned when resp.Document is not a *goquery.Document.
func (j *SimpleCountryJob) Process(ctx context.Context, resp *scrapemate.Response) (any, []scrapemate.IJob, error) {
	page, isGoquery := resp.Document.(*goquery.Document)
	if !isGoquery {
		return nil, nil, fmt.Errorf("failed to cast response document to goquery document")
	}

	// textOf returns the whitespace-trimmed text selected by selector inside card.
	textOf := func(card *goquery.Selection, selector string) string {
		return strings.TrimSpace(card.Find(selector).Text())
	}

	var found []Country
	page.Find("div.col-md-4.country").Each(func(_ int, card *goquery.Selection) {
		found = append(found, Country{
			Name:       textOf(card, "h3.country-name"),
			Capital:    textOf(card, "div.country-info span.country-capital"),
			Population: textOf(card, "div.country-info span.country-population"),
			Area:       textOf(card, "div.country-info span.country-area"),
		})
	})
	return found, nil, nil
}

// Country is one scraped row. Every value is stored as a string.
type Country struct {
	Name, Capital, Population, Area string
}

// CsvHeaders returns the CSV column titles, in the same order as CsvRow.
func (Country) CsvHeaders() []string {
	return []string{
		"Name",
		"Capital",
		"Population",
		"Area",
	}
}

// CsvRow returns the field values of c in the column order of CsvHeaders.
func (c Country) CsvRow() []string {
	row := make([]string, 4)
	row[0], row[1], row[2], row[3] = c.Name, c.Capital, c.Population, c.Area
	return row
}

go mod tidy
go run main.go 1>countries.csv

(hit CTRL-C to exit)

Documentation

For the High Level API see this example.

Read also how to use high level api

For the Low Level API see books.toscrape.com

Additionally, for low level API you can read the blogpost

See an example of how you can use scrapemate to scrape Google Maps: https://github.com/gosom/google-maps-scraper

Contributing

Contributions are welcome.

Licence

Scrapemate is licensed under the MIT License. See LICENCE file

Documentation

Index

Constants

View Source
const (
	// DefaultUserAgent is the default user agent scrapemate uses.
	DefaultUserAgent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.79 Safari/537.36"

	// RetryJob retries the job.
	RetryJob = 0
	// DiscardJob just discards the job in case crawling fails.
	DiscardJob = 1
	// RefreshIP refreshes the IP and then retries the job.
	RefreshIP = 2
	// StopScraping exits scraping completely when an error happens.
	StopScraping = 3

	// DefaultMaxRetryDelay is the default max delay between 2 consecutive retries.
	DefaultMaxRetryDelay = 2 * time.Second

	// PriorityHigh is the high priority. Note that it has the lowest numeric value.
	PriorityHigh = 0
	// PriorityMedium is the medium priority.
	PriorityMedium = 1
	// PriorityLow is the low priority. Note that it has the highest numeric value.
	PriorityLow = 2
)

Variables

View Source
var (
	// ErrorNoJobProvider is returned when you do not set a job provider in initialization.
	ErrorNoJobProvider = errors.New("no job provider set")
	// ErrorExitSignal is returned when scrapemate exits because of a system interrupt.
	ErrorExitSignal = errors.New("exit signal received")
	// ErrorNoLogger is returned when you try to initialize it with a nil logger.
	ErrorNoLogger = errors.New("no logger set")
	// ErrorNoContext is returned when you try to initialize it with a nil context.
	ErrorNoContext = errors.New("no context set")
	// ErrorConcurrency is returned when you try to initialize it with concurrency < 1.
	ErrorConcurrency = errors.New("concurrency must be greater than 0")
	// ErrorNoHTMLFetcher is returned when you try to initialize it with a nil HTTPFetcher.
	ErrorNoHTMLFetcher = errors.New("no http fetcher set")
	// ErrorNoHTMLParser is returned when you try to initialize it with a nil HTMLParser.
	ErrorNoHTMLParser = errors.New("no html parser set")
	// ErrorNoCacher is returned when you try to initialize it with a nil Cacher.
	ErrorNoCacher = errors.New("no cacher set")
	// ErrorNotCsvCapable is returned when you try to write a csv file without csv capable Data.
	ErrorNotCsvCapable = errors.New("not csv capable")
	// ErrInactivityTimeout is returned when the system exits because of inactivity.
	ErrInactivityTimeout = errors.New("inactivity timeout")
)

Functions

func ContextWithLogger added in v0.4.3

func ContextWithLogger(ctx context.Context, logger logging.Logger) context.Context

ContextWithLogger returns a new context with the logger

func GetLoggerFromContext added in v0.2.2

func GetLoggerFromContext(ctx context.Context) logging.Logger

GetLoggerFromContext returns a logger from the context or a default logger

func WithCache added in v0.1.1

func WithCache(cache Cacher) func(*ScrapeMate) error

WithCache sets the cache for the scrapemate

func WithConcurrency

func WithConcurrency(concurrency int) func(*ScrapeMate) error

WithConcurrency sets the concurrency for the scrapemate

func WithContext

func WithContext(ctx context.Context, cancelFn context.CancelCauseFunc) func(*ScrapeMate) error

WithContext sets the context for the scrapemate

func WithExitBecauseOfInactivity added in v0.5.0

func WithExitBecauseOfInactivity(duration time.Duration) func(*ScrapeMate) error

func WithFailed

func WithFailed() func(*ScrapeMate) error

WithFailed sets the failed jobs channel for the scrapemate

func WithHTMLParser added in v0.4.0

func WithHTMLParser(parser HTMLParser) func(*ScrapeMate) error

WithHTMLParser sets the html parser for the scrapemate

func WithHTTPFetcher added in v0.4.0

func WithHTTPFetcher(client HTTPFetcher) func(*ScrapeMate) error

WithHTTPFetcher sets the http fetcher for the scrapemate

func WithInitJob added in v0.4.3

func WithInitJob(job IJob) func(*ScrapeMate) error

WithInitJob sets the first job to be processed. It will be processed before the jobs from the job provider. It is useful if you want to start the scraper with a specific job instead of the first one from the job provider. A real use case is when you want to obtain some cookies before starting the scraping process (e.g. login). Important: the results from this job will be discarded!

func WithJobProvider

func WithJobProvider(provider JobProvider) func(*ScrapeMate) error

WithJobProvider sets the job provider for the scrapemate

func WithLogger

func WithLogger(log logging.Logger) func(*ScrapeMate) error

WithLogger sets the logger for the scrapemate

Types

type Cacher added in v0.1.1

// Cacher is an interface for a cache of responses, keyed by string.
// Note the asymmetry: Get returns a Response by value while Set takes a pointer.
type Cacher interface {
	// Close releases the cache. NOTE(review): exact cleanup depends on the implementation.
	Close() error
	// Get returns the Response stored under key.
	// NOTE(review): the error returned on a cache miss is not visible here — confirm per implementation.
	Get(ctx context.Context, key string) (Response, error)
	// Set stores value under key.
	Set(ctx context.Context, key string, value *Response) error
}

Cacher is an interface for cache

type CsvCapable added in v0.2.1

// CsvCapable is an interface for types that can be converted to csv.
// It is used to convert the Data of a Result to csv.
type CsvCapable interface {
	// CsvHeaders returns the column headers.
	CsvHeaders() []string
	// CsvRow returns the values of one row.
	// Presumably in the same order as CsvHeaders — confirm against the csv writer.
	CsvRow() []string
}

CsvCapable is an interface for types that can be converted to csv. It is used to convert the Data of a Result to csv.

type HTMLParser added in v0.4.0

// HTMLParser is an interface for html parsers.
type HTMLParser interface {
	// Parse parses body and returns the parsed document as any.
	// The concrete type depends on the parser (see the Document field of Response).
	Parse(ctx context.Context, body []byte) (any, error)
}

HTMLParser is an interface for html parsers

type HTTPFetcher added in v0.4.0

// HTTPFetcher is an interface for http fetchers.
type HTTPFetcher interface {
	// Fetch fetches the given job and returns a Response by value.
	// There is no separate error return; presumably failures are reported
	// through the Error field of Response — confirm per implementation.
	Fetch(ctx context.Context, job IJob) Response
	// Close releases the fetcher. NOTE(review): exact cleanup depends on the implementation.
	Close() error
}

HTTPFetcher is an interface for http fetchers

type IJob

// IJob is a job to be processed by the scrapemate.
type IJob interface {
	fmt.Stringer
	// GetID returns the unique identifier of the job.
	GetID() string
	// GetParentID returns the parent id of the job
	GetParentID() string
	// GetMethod returns the http method to use
	GetMethod() string
	// GetBody returns the body of the request
	GetBody() []byte
	// GetURL returns the url to request
	GetURL() string
	// GetHeaders returns the headers to use
	GetHeaders() map[string]string
	// GetURLParams returns the url params to use
	GetURLParams() map[string]string
	// GetFullURL returns the full url to request
	// it includes the url params
	GetFullURL() string
	// GetTimeout returns the timeout of the job
	GetTimeout() time.Duration
	// GetPriority returns the priority of the job
	GetPriority() int
	// DoCheckResponse checks the response of the job
	DoCheckResponse(resp *Response) bool
	// GetRetryPolicy returns the action to perform on the response
	GetRetryPolicy() RetryPolicy
	// GetMaxRetries returns the max retries of the job
	GetMaxRetries() int
	// Process processes the job
	Process(ctx context.Context, resp *Response) (any, []IJob, error)
	// GetMaxRetryDelay returns the delay to wait before retrying
	GetMaxRetryDelay() time.Duration
	// BrowserActions is the function that will be executed in the browser.
	// It receives the playwright page and returns the Response.
	BrowserActions(ctx context.Context, page playwright.Page) Response
	// DoScreenshot takes a screenshot of the page
	// Only works if the scraper uses jsfetcher
	DoScreenshot() bool
	// GetCacheKey returns the key to use for caching
	GetCacheKey() string
	// UseInResults returns true if the job should be used in the results
	UseInResults() bool
	// ProcessOnFetchError returns true if the job should be processed even if the job failed
	ProcessOnFetchError() bool
}

IJob is a job to be processed by the scrapemate

type Job

// Job is the base job that we may use.
type Job struct {
	// ID is an identifier for the job
	ID string
	// ParentID is the parent id of the job
	ParentID string
	// Method can be any valid HTTP method
	Method string
	// Body is the request's body
	Body []byte
	// URL is the url to send a request to
	URL string
	// Headers is the map of headers to use in HTTP
	Headers map[string]string
	// URLParams are the url parameters to use in the query string
	URLParams map[string]string
	// Timeout is the timeout of that job. By timeout we mean the time
	// it takes to finish a single crawl
	Timeout time.Duration
	// Priority is a number indicating the priority. See the PriorityHigh (0),
	// PriorityMedium (1) and PriorityLow (2) constants: by convention the
	// lower the number the higher the priority.
	Priority int
	// MaxRetries defines the maximum number of retries when a job fails
	MaxRetries int
	// CheckResponse is a function that takes as an input a Response and returns:
	// true: when the response is to be accepted
	// false: when the response is to be rejected
	// By default a response is accepted if status code is 200
	CheckResponse func(resp *Response) bool
	// RetryPolicy can be one of:
	// RetryJob: to retry the job until it's successful
	// DiscardJob: for not accepted responses just discard them and do not retry the job
	// RefreshIP: Similar to RetryJob with an important difference
	// 				Before the job is retried the IP is refreshed.
	RetryPolicy RetryPolicy
	// MaxRetryDelay By default when a job is rejected it is retried with an exponential backoff
	// for a MaxRetries number of times. If the sleep time between the retries is more than
	// MaxRetryDelay then it's capped to that. (Default is 2 seconds)
	MaxRetryDelay time.Duration
	// TakeScreenshot if true takes a screenshot of the page
	TakeScreenshot bool
	Response       Response
}

Job is the base job that we may use

func (*Job) BrowserActions

func (j *Job) BrowserActions(_ context.Context, page playwright.Page) Response

BrowserActions is the function that will be executed in the browser. This is a default implementation that will just return the response; override this function to perform actions in the browser.

func (*Job) DoCheckResponse

func (j *Job) DoCheckResponse(resp *Response) bool

DoCheckResponse checks the response of the job

func (*Job) DoScreenshot

func (j *Job) DoScreenshot() bool

DoScreenshot is used to check if we need a screenshot. It's here since it's a common use case.

func (*Job) GetBody

func (j *Job) GetBody() []byte

GetBody returns the body of the request

func (*Job) GetCacheKey added in v0.1.1

func (j *Job) GetCacheKey() string

GetCacheKey returns the key to use for caching

func (*Job) GetFullURL added in v0.2.2

func (j *Job) GetFullURL() string

func (*Job) GetHeaders

func (j *Job) GetHeaders() map[string]string

GetHeaders returns the headers to use

func (*Job) GetID

func (j *Job) GetID() string

GetID returns the unique identifier of the job.

func (*Job) GetMaxRetries

func (j *Job) GetMaxRetries() int

GetMaxRetries returns the max retries of the job

func (*Job) GetMaxRetryDelay

func (j *Job) GetMaxRetryDelay() time.Duration

GetMaxRetryDelay returns the delay to wait before retrying

func (*Job) GetMethod

func (j *Job) GetMethod() string

GetMethod returns the http method to use

func (*Job) GetParentID added in v0.5.1

func (j *Job) GetParentID() string

GetParentID returns the parent id of the job

func (*Job) GetPriority

func (j *Job) GetPriority() int

GetPriority returns the priority of the job

func (*Job) GetRetryPolicy

func (j *Job) GetRetryPolicy() RetryPolicy

GetRetryPolicy returns the action to perform on the response

func (*Job) GetTimeout

func (j *Job) GetTimeout() time.Duration

GetTimeout returns the timeout of the job

func (*Job) GetURL

func (j *Job) GetURL() string

GetURL returns the url to request

func (*Job) GetURLParams added in v0.4.0

func (j *Job) GetURLParams() map[string]string

GetURLParams returns the url params to use

func (*Job) Process

func (j *Job) Process(_ context.Context, _ *Response) (any, []IJob, error)

Process processes the job

func (*Job) ProcessOnFetchError added in v0.5.3

func (j *Job) ProcessOnFetchError() bool

ProcessOnFetchError returns true if the job should be processed even if the job failed

func (*Job) String

func (j *Job) String() string

String returns the string representation of the job

func (*Job) UseInResults added in v0.2.1

func (j *Job) UseInResults() bool

UseInResults returns true if the job should be used in the results

type JobProvider

// JobProvider is an interface for job providers.
// A job provider is a service that provides jobs to scrapemate;
// scrapemate will call the job provider to get jobs.
type JobProvider interface {
	// Jobs returns two receive-only channels: one delivering jobs and one delivering errors.
	// NOTE(review): when these channels are closed is not visible here — confirm per implementation.
	Jobs(ctx context.Context) (<-chan IJob, <-chan error)
	// Push pushes a job to the job provider
	Push(ctx context.Context, job IJob) error
}

JobProvider is an interface for job providers. A job provider is a service that provides jobs to scrapemate; scrapemate will call the job provider to get jobs.

type Proxy added in v0.8.0

// Proxy describes a single proxy: its URL plus a username and password.
// A Proxy can be built from a string with NewProxy.
// NOTE(review): presumably Username/Password are the proxy credentials — confirm.
type Proxy struct {
	URL      string
	Username string
	Password string
}

Proxy is a struct for proxy

func NewProxy added in v0.8.0

func NewProxy(u string) (Proxy, error)

type ProxyRotator added in v0.7.0

// ProxyRotator is an interface for proxy rotators.
type ProxyRotator interface {
	// RoundTrip has the same signature as the method of http.RoundTripper.
	RoundTrip(req *http.Request) (*http.Response, error)
	// Next returns a Proxy.
	// Presumably the next one in the rotation — confirm per implementation.
	Next() Proxy
}

ProxyRotator is an interface for proxy rotators

type Response

// Response is the struct that is returned when crawling finishes.
type Response struct {
	URL        string
	StatusCode int
	Headers    http.Header
	Duration   time.Duration
	Body       []byte
	Error      error
	Meta       map[string]any
	Screenshot []byte

	// Document is the parsed document.
	// If you don't set an html parser the document will be nil.
	// Since each html parser has its own document type
	// the document is an interface.
	// You need to cast it to the type of the parser you are using.
	// For example if you are using goquery you need to cast it to *goquery.Document.
	// If you are using the stdlib parser net/html then it will be *html.Node.
	Document any
}

Response is the struct that is returned when crawling finishes

type Result

// Result is the type of the items delivered on the Results channel.
// It pairs a job with its Data; see CsvCapable for converting Data to csv.
type Result struct {
	Job  IJob
	Data any
}

Result is the type of the items that the Results channel delivers

type ResultWriter added in v0.2.1

// ResultWriter is an interface for result writers.
type ResultWriter interface {
	// Run is given a receive-only channel of Result values to write.
	// Presumably it consumes from in until the channel is closed or ctx is done — confirm per implementation.
	Run(ctx context.Context, in <-chan Result) error
}

ResultWriter is an interface for result writers

type RetryPolicy

// RetryPolicy is the action to perform on a response; it is what GetRetryPolicy returns.
// See the RetryJob, DiscardJob, RefreshIP and StopScraping constants.
type RetryPolicy int

type ScrapeMate added in v0.2.1

// ScrapeMate contains only unexported fields; create one with New.
type ScrapeMate struct {
	// contains filtered or unexported fields
}

ScrapeMate contains unexported fields

func New

func New(options ...func(*ScrapeMate) error) (*ScrapeMate, error)

New creates a new scrapemate

func (*ScrapeMate) Close added in v0.7.1

func (s *ScrapeMate) Close() error

func (*ScrapeMate) Concurrency added in v0.2.1

func (s *ScrapeMate) Concurrency() int

Concurrency returns how many workers are running in parallel

func (*ScrapeMate) DoJob added in v0.2.1

func (s *ScrapeMate) DoJob(ctx context.Context, job IJob) (result any, next []IJob, err error)

DoJob scrapes a job and returns its result

func (*ScrapeMate) Done added in v0.2.1

func (s *ScrapeMate) Done() <-chan struct{}

Done returns a channel that's closed when the work is done

func (*ScrapeMate) Err added in v0.2.1

func (s *ScrapeMate) Err() error

Err returns the error that caused scrapemate's context cancellation

func (*ScrapeMate) Failed added in v0.2.1

func (s *ScrapeMate) Failed() <-chan IJob

Failed returns the channel that contains the jobs that failed. It's nil if you don't use the WithFailed option

func (*ScrapeMate) Results added in v0.2.1

func (s *ScrapeMate) Results() <-chan Result

Results returns a channel containing the results

func (*ScrapeMate) Start added in v0.2.1

func (s *ScrapeMate) Start() error

Start starts the scraper

Directories

Path Synopsis
adapters
Package mock is a generated GoMock package.
Package mock is a generated GoMock package.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL