crawler

package
v0.0.0-...-b1bbf48 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Oct 21, 2024 License: MIT Imports: 12 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

View Source
var ErrInputChannelClosed = errors.New("input channel closed")

Functions

This section is empty.

Types

type FetchDatasource

type FetchDatasource interface {
	DownloadPage(url string) (string, error)
	Close() error
}

type FetchResult

type FetchResult struct {
	Err error

	RespBody string
	// contains filtered or unexported fields
}

type HTMLPipe

type HTMLPipe chan string

type Interceptor

type Interceptor struct {
	// contains filtered or unexported fields
}

func NewInterceptor

func NewInterceptor() *Interceptor

func (Interceptor) CreateObserver

func (i Interceptor) CreateObserver(
	on *url.URL, useBasePath bool,
	optWatcherKey ...string,
) (pipe HTMLPipe)

func (Interceptor) Dispose

func (i Interceptor) Dispose()

func (Interceptor) HandleResponse

func (i Interceptor) HandleResponse(htmlStr string, target *url.URL)

type InterceptorCallback

type InterceptorCallback = func(htmlStr string, target *url.URL)

type ParallelFetch

type ParallelFetch struct {
	// contains filtered or unexported fields
}

func NewParallelFetch

func NewParallelFetch(factory datatypes.Factory[FetchDatasource]) *ParallelFetch

func (*ParallelFetch) Fetch

func (pf *ParallelFetch) Fetch(urls ...string)

func (*ParallelFetch) Responses

func (pf *ParallelFetch) Responses() iter.Seq2[string, FetchResult]

Responses returns an iterator over the elements received on the output channel.

func (*ParallelFetch) Start

func (pf *ParallelFetch) Start(ctx context.Context, numWorkers int) error

func (*ParallelFetch) Stop

func (pf *ParallelFetch) Stop()

type WebCrawler

type WebCrawler struct {
	// contains filtered or unexported fields
}

func NewWebCrawler

func NewWebCrawler(baseURL, userAgent string) *WebCrawler

func (WebCrawler) Crawl

func (wc WebCrawler) Crawl() error

func (WebCrawler) LoadRobotsTXT

func (wc WebCrawler) LoadRobotsTXT() (*robotstxt.RobotsData, error)

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL