crawler

package v0.0.0-...-1ec42cb

Published: Nov 26, 2024 License: MIT Imports: 12 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

var ErrBlockedByRobotstxt = errors.New("blocked by robots.txt")
var ErrDomainNotAllowed = errors.New("domain not allowed")
var ErrVisited = errors.New("URL already visited")

Functions

This section is empty.

Types

type BasicClient

type BasicClient struct {
	Options *ClientOptions
	// contains filtered or unexported fields
}

func NewBasicClient

func NewBasicClient(options *ClientOptions, client HTTPRequester) *BasicClient

func (*BasicClient) Get

func (c *BasicClient) Get(urlStr string) (*ClientResponse, error)

Get makes a GET request to a URL and returns the HTTP response or an error.

func (*BasicClient) GetUA

func (c *BasicClient) GetUA() string

GetUA returns the user-agent set for this client.

func (*BasicClient) Head

func (c *BasicClient) Head(urlStr string) (*ClientResponse, error)

Head makes a HEAD request to a URL and returns the HTTP response or an error.
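A minimal usage sketch; the import path is hypothetical (the real module path is abbreviated on this page), and it assumes *http.Client can be passed as the HTTPRequester, since its Do method matches the interface:

package main

import (
	"fmt"
	"log"
	"net/http"

	"example.com/crawler" // hypothetical import path; use the module's real one
)

func main() {
	options := &crawler.ClientOptions{UserAgent: "mybot/1.0"}

	// *http.Client satisfies HTTPRequester (see below), so the
	// default client can be used directly.
	client := crawler.NewBasicClient(options, http.DefaultClient)

	resp, err := client.Get("https://example.com/")
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Response.Body.Close()

	fmt.Println(resp.Response.StatusCode, resp.TTFB)
}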

type Client

type Client interface {
	Get(urlStr string) (*ClientResponse, error)
	Head(urlStr string) (*ClientResponse, error)
	GetUA() string
}

type ClientOptions

type ClientOptions struct {
	UserAgent        string
	BasicAuthDomains []string
	AuthUser         string
	AuthPass         string
}

type ClientResponse

type ClientResponse struct {
	Response *http.Response
	TTFB     int
}

type Crawler

type Crawler struct {
	Client Client
	// contains filtered or unexported fields
}

func NewCrawler

func NewCrawler(parsedURL *url.URL, options *Options, client Client) *Crawler

func (*Crawler) AddRequest

func (c *Crawler) AddRequest(r *RequestMessage) error

AddRequest processes a request message for the crawler. It checks whether the URL has already been visited, validates the domain, and checks whether the URL is blocked by the robots.txt rules, returning an error if any of these checks fails. Otherwise, it adds the request to the processing queue.
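The three exported error variables correspond to AddRequest's checks, so callers can tell them apart with errors.Is. A hedged sketch (imports: errors, log, net/url, plus the package itself):

// seed enqueues one URL, distinguishing the documented sentinel errors.
func seed(c *crawler.Crawler, rawURL string) {
	u, err := url.Parse(rawURL)
	if err != nil {
		log.Println("parse:", err)
		return
	}
	err = c.AddRequest(&crawler.RequestMessage{URL: u, Method: crawler.GET})
	switch {
	case errors.Is(err, crawler.ErrVisited):
		// Already queued or crawled; safe to ignore.
	case errors.Is(err, crawler.ErrDomainNotAllowed):
		// Outside the crawl's allowed domain.
	case errors.Is(err, crawler.ErrBlockedByRobotstxt):
		// Disallowed by the site's robots.txt rules.
	case err != nil:
		log.Println("add request:", err)
	}
}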

func (*Crawler) GetStatus

func (c *Crawler) GetStatus() Status

GetStatus returns the current crawler status.

func (*Crawler) OnResponse

func (c *Crawler) OnResponse(r ResponseCallback)

OnResponse sets the callback that the crawler will call for every response.

func (*Crawler) RobotstxtExists

func (c *Crawler) RobotstxtExists() bool

RobotstxtExists returns true if the robots.txt file exists.

func (*Crawler) SitemapExists

func (c *Crawler) SitemapExists() bool

SitemapExists returns true if the sitemap.xml file exists.

func (*Crawler) SitemapIsBlocked

func (c *Crawler) SitemapIsBlocked() bool

SitemapIsBlocked returns true if any of the website's sitemaps is blocked in the robots.txt file.

func (*Crawler) Start

func (c *Crawler) Start()

Start starts crawling the URL and reports each crawled URL through the response callback. It ends when there are no more URLs to crawl or the crawl limit is hit.

func (*Crawler) Stop

func (c *Crawler) Stop()

Stop stops the crawler by canceling the crawler's context.
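Putting the pieces together, a minimal end-to-end sketch. The import path is hypothetical, and it assumes that AddRequest is called to seed the crawl before Start and that Start blocks until the crawl finishes:

package main

import (
	"fmt"
	"log"
	"net/http"
	"net/url"

	"example.com/crawler" // hypothetical import path
)

func main() {
	seed, err := url.Parse("https://example.com/")
	if err != nil {
		log.Fatal(err)
	}

	client := crawler.NewBasicClient(
		&crawler.ClientOptions{UserAgent: "mybot/1.0"},
		http.DefaultClient,
	)

	c := crawler.NewCrawler(seed, &crawler.Options{
		CrawlLimit:   100,
		CrawlSitemap: true,
	}, client)

	// Called for every response the crawler produces.
	c.OnResponse(func(r *crawler.ResponseMessage) {
		if r.Error != nil {
			log.Println(r.URL, r.Error)
			return
		}
		fmt.Println(r.Response.StatusCode, r.URL)
	})

	if err := c.AddRequest(&crawler.RequestMessage{URL: seed, Method: crawler.GET}); err != nil {
		log.Fatal(err)
	}

	c.Start() // assumed to block until the crawl ends

	s := c.GetStatus()
	fmt.Printf("crawled %d of %d discovered URLs\n", s.Crawled, s.Discovered)
}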

type HTTPRequester

type HTTPRequester interface {
	Do(req *http.Request) (*http.Response, error)
}
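Note that *http.Client from the standard library satisfies this interface as-is, since (*http.Client).Do has exactly this signature. A client with custom settings can therefore be passed straight to NewBasicClient (imports: net/http, time):

requester := &http.Client{Timeout: 10 * time.Second}
client := crawler.NewBasicClient(&crawler.ClientOptions{UserAgent: "mybot/1.0"}, requester)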

type Method

type Method int
const (
	// Supported HTTP methods.
	GET Method = iota
	HEAD
)

type Options

type Options struct {
	CrawlLimit      int
	IgnoreRobotsTxt bool
	FollowNofollow  bool
	IncludeNoindex  bool
	CrawlSitemap    bool
	AllowSubdomains bool
}

type Queue

type Queue struct {
	// contains filtered or unexported fields
}

func NewQueue

func NewQueue() *Queue

func (*Queue) Ack

func (q *Queue) Ack(s string)

Ack acknowledges that a message has been processed.

func (*Queue) Active

func (q *Queue) Active() bool

Active returns true if the queue is not empty or has active elements.

func (*Queue) Count

func (q *Queue) Count() int

Count returns the number of items currently in the queue.

func (*Queue) Done

func (q *Queue) Done()

Done stops the queue and closes all of its channels.

func (*Queue) Poll

func (q *Queue) Poll() *RequestMessage

Poll returns the first element in the queue.

func (*Queue) Push

func (q *Queue) Push(value *RequestMessage)

Push adds a new value to the end of the queue.
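Taken together, the Queue methods suggest a push/poll/ack workflow. A hedged sketch; that Ack takes the message's URL string as its key, and that Poll blocks until an element is available, are assumptions not stated in the documentation:

q := crawler.NewQueue()
defer q.Done()

u, _ := url.Parse("https://example.com/")
q.Push(&crawler.RequestMessage{URL: u, Method: crawler.GET})

for q.Active() {
	msg := q.Poll()
	// ... process msg ...
	q.Ack(msg.URL.String()) // assumed: the ack key is the URL string
}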

type RequestMessage

type RequestMessage struct {
	URL          *url.URL
	IgnoreDomain bool
	Method       Method
	Data         interface{}
}

type ResponseCallback

type ResponseCallback func(r *ResponseMessage)

type ResponseMessage

type ResponseMessage struct {
	URL       *url.URL
	Response  *http.Response
	Error     error
	TTFB      int
	Blocked   bool
	InSitemap bool
	Timeout   bool
	Data      interface{}
}

type RobotsChecker

type RobotsChecker struct {
	// contains filtered or unexported fields
}

func NewRobotsChecker

func NewRobotsChecker(client Client) *RobotsChecker

func (*RobotsChecker) Exists

func (r *RobotsChecker) Exists(u *url.URL) bool

Exists returns true if the robots.txt file exists and is valid.

func (*RobotsChecker) GetSitemaps

func (r *RobotsChecker) GetSitemaps(u *url.URL) []string

GetSitemaps returns a list of sitemaps found in the robots.txt file.

func (*RobotsChecker) IsBlocked

func (r *RobotsChecker) IsBlocked(u *url.URL) bool

IsBlocked returns true if the URL is blocked by the robots.txt rules.
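A short sketch combining the three methods; client is a Client set up as in the BasicClient example above:

robots := crawler.NewRobotsChecker(client)

u, _ := url.Parse("https://example.com/private/page")
if robots.Exists(u) && robots.IsBlocked(u) {
	fmt.Println("blocked by robots.txt")
}
for _, s := range robots.GetSitemaps(u) {
	fmt.Println("sitemap:", s)
}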

type SitemapChecker

type SitemapChecker struct {
	// contains filtered or unexported fields
}

func NewSitemapChecker

func NewSitemapChecker(client Client, limit int) *SitemapChecker

func (*SitemapChecker) ParseSitemaps

func (sc *SitemapChecker) ParseSitemaps(URLs []string, callback func(u string))

ParseSitemaps parses the given sitemaps, calling the callback function on each entry. For each URL provided, it checks whether it is a sitemap index.

func (*SitemapChecker) SitemapExists

func (sc *SitemapChecker) SitemapExists(URLs []string) bool

SitemapExists checks whether any of the provided sitemap URLs exist.
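A sketch of checking and walking a site's sitemaps, reusing client from the examples above; that limit caps the number of sitemap entries parsed is an assumption:

sc := crawler.NewSitemapChecker(client, 1000)

sitemaps := []string{"https://example.com/sitemap.xml"}
if sc.SitemapExists(sitemaps) {
	sc.ParseSitemaps(sitemaps, func(u string) {
		fmt.Println("in sitemap:", u)
	})
}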

type Status

type Status struct {
	Crawled    int
	Crawling   bool
	Discovered int
}

type URLStorage

type URLStorage struct {
	// contains filtered or unexported fields
}

func NewURLStorage

func NewURLStorage() *URLStorage

func (*URLStorage) Add

func (s *URLStorage) Add(u string)

Add adds a URL string to the seen map.

func (*URLStorage) Iterate

func (s *URLStorage) Iterate(f func(string))

Iterate iterates over the seen map, applying the function f to each element.

func (*URLStorage) Seen

func (s *URLStorage) Seen(u string) bool

Seen returns true if a URL string has already been added.
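URLStorage appears to back the crawler's visited-URL check (see ErrVisited above). A minimal standalone sketch:

seen := crawler.NewURLStorage()
seen.Add("https://example.com/a")

if !seen.Seen("https://example.com/b") {
	seen.Add("https://example.com/b")
}

seen.Iterate(func(u string) {
	fmt.Println(u)
})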
