Documentation ¶
Index ¶
- Variables
- type BasicClient
- type Client
- type ClientOptions
- type ClientResponse
- type Crawler
- func (c *Crawler) AddRequest(r *RequestMessage) error
- func (c *Crawler) GetStatus() Status
- func (c *Crawler) OnResponse(r ResponseCallback)
- func (c *Crawler) RobotstxtExists() bool
- func (c *Crawler) SitemapExists() bool
- func (c *Crawler) SitemapIsBlocked() bool
- func (c *Crawler) Start()
- func (c *Crawler) Stop()
- type HTTPRequester
- type Method
- type Options
- type Queue
- type RequestMessage
- type ResponseCallback
- type ResponseMessage
- type RobotsChecker
- type SitemapChecker
- type Status
- type URLStorage
Constants ¶
This section is empty.
Variables ¶
var ErrBlockedByRobotstxt = errors.New("blocked by robots.txt")
var ErrDomainNotAllowed = errors.New("domain not allowed")
var ErrVisited = errors.New("URL already visited")
Functions ¶
This section is empty.
Types ¶
type BasicClient ¶
type BasicClient struct {
	Options *ClientOptions
	// contains filtered or unexported fields
}
func NewBasicClient ¶
func NewBasicClient(options *ClientOptions, client HTTPRequester) *BasicClient
func (*BasicClient) Get ¶
func (c *BasicClient) Get(urlStr string) (*ClientResponse, error)
Get makes a GET request to a URL and returns the HTTP response or an error.
func (*BasicClient) GetUA ¶
func (c *BasicClient) GetUA() string
GetUA returns the user-agent set for this client.
func (*BasicClient) Head ¶
func (c *BasicClient) Head(urlStr string) (*ClientResponse, error)
Head makes a HEAD request to a URL and returns the HTTP response or an error.
type Client ¶
type Client interface {
	Get(urlStr string) (*ClientResponse, error)
	Head(urlStr string) (*ClientResponse, error)
	GetUA() string
}
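A minimal sketch of consuming the Client interface (for example through a *BasicClient). The fields of ClientResponse are not documented in this section, so the response is returned without inspection:

// fetch issues a HEAD request followed by a GET request through any Client
// implementation. The ClientResponse fields are not shown here, so the
// response is passed back to the caller unexamined.
func fetch(c Client, urlStr string) (*ClientResponse, error) {
	if _, err := c.Head(urlStr); err != nil {
		return nil, err
	}
	return c.Get(urlStr)
}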
type ClientOptions ¶
type ClientResponse ¶
type Crawler ¶
type Crawler struct {
	Client Client
	// contains filtered or unexported fields
}
func (*Crawler) AddRequest ¶
func (c *Crawler) AddRequest(r *RequestMessage) error
AddRequest processes a request message for the crawler. It checks whether the URL has already been visited, validates the domain, and checks whether the URL is blocked by the robots.txt rules, returning an error if any of these checks fails. Otherwise, it adds the request to the processing queue.
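A sketch of submitting a request and distinguishing the package's exported errors, using the standard library errors and log packages. Construction of the RequestMessage itself is not shown, since its fields are not documented in this section:

// enqueue adds a request to the crawler and reports why it was rejected,
// if it was, by matching against the exported error variables.
func enqueue(c *Crawler, r *RequestMessage) {
	switch err := c.AddRequest(r); {
	case err == nil:
		// queued for crawling
	case errors.Is(err, ErrVisited):
		log.Println("skipping: URL already visited")
	case errors.Is(err, ErrDomainNotAllowed):
		log.Println("skipping: domain not allowed")
	case errors.Is(err, ErrBlockedByRobotstxt):
		log.Println("skipping: blocked by robots.txt")
	default:
		log.Println("unexpected error:", err)
	}
}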
func (*Crawler) OnResponse ¶
func (c *Crawler) OnResponse(r ResponseCallback)
OnResponse sets the callback that the crawler will call for every response.
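A sketch of wiring a callback before starting the crawl. The fields of ResponseMessage are not documented here, and Start is taken from the index above; whether it blocks until the crawl finishes is not specified in this section:

// crawl registers a response callback and then starts the crawler.
func crawl(c *Crawler) {
	c.OnResponse(func(r *ResponseMessage) {
		// inspect r here; the ResponseMessage fields are not shown in this section
	})
	c.Start()
}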
func (*Crawler) RobotstxtExists ¶
func (c *Crawler) RobotstxtExists() bool
RobotstxtExists returns true if the robots.txt file exists.
func (*Crawler) SitemapExists ¶
func (c *Crawler) SitemapExists() bool
SitemapExists returns true if the sitemap.xml file exists.
func (*Crawler) SitemapIsBlocked ¶
func (c *Crawler) SitemapIsBlocked() bool
SitemapIsBlocked returns true if any of the website's sitemaps is blocked in the robots.txt file.
type Queue ¶
type Queue struct {
// contains filtered or unexported fields
}
func (*Queue) Push ¶
func (q *Queue) Push(value *RequestMessage)
Push adds a new value to the end of the queue.
type RequestMessage ¶
type ResponseCallback ¶
type ResponseCallback func(r *ResponseMessage)
type ResponseMessage ¶
type RobotsChecker ¶
type RobotsChecker struct {
// contains filtered or unexported fields
}
func NewRobotsChecker ¶
func NewRobotsChecker(client Client) *RobotsChecker
func (*RobotsChecker) Exists ¶
func (r *RobotsChecker) Exists(u *url.URL) bool
Exists returns true if the robots.txt file exists and is valid.
func (*RobotsChecker) GetSitemaps ¶
func (r *RobotsChecker) GetSitemaps(u *url.URL) []string
GetSitemaps returns a list of sitemaps found in the robots.txt file.
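A sketch of using a RobotsChecker with an existing Client; url.Parse is from the standard library net/url package:

// sitemapsFor returns the sitemap URLs declared in a site's robots.txt,
// or nil if the site URL cannot be parsed or the robots.txt file does not
// exist or is not valid.
func sitemapsFor(client Client, site string) []string {
	u, err := url.Parse(site)
	if err != nil {
		return nil
	}
	checker := NewRobotsChecker(client)
	if !checker.Exists(u) {
		return nil
	}
	return checker.GetSitemaps(u)
}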
type SitemapChecker ¶
type SitemapChecker struct {
// contains filtered or unexported fields
}
func NewSitemapChecker ¶
func NewSitemapChecker(client Client, limit int) *SitemapChecker
func (*SitemapChecker) ParseSitemaps ¶
func (sc *SitemapChecker) ParseSitemaps(URLs []string, callback func(u string))
ParseSitemaps parses the sitemaps, calling the callback function on each entry. For each URL provided, it checks whether it is an index sitemap.
func (*SitemapChecker) SitemapExists ¶
func (sc *SitemapChecker) SitemapExists(URLs []string) bool
SitemapExists checks whether any of the provided sitemap URLs exist.
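A sketch combining SitemapExists and ParseSitemaps. The limit passed to NewSitemapChecker is an arbitrary example value, as its exact meaning is not documented in this section:

// collectSitemapURLs parses the given sitemaps and gathers every entry.
func collectSitemapURLs(client Client, sitemaps []string) []string {
	sc := NewSitemapChecker(client, 100) // 100 is an arbitrary example limit
	if !sc.SitemapExists(sitemaps) {
		return nil
	}
	var urls []string
	sc.ParseSitemaps(sitemaps, func(u string) {
		urls = append(urls, u)
	})
	return urls
}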
type URLStorage ¶
type URLStorage struct {
// contains filtered or unexported fields
}
func NewURLStorage ¶
func NewURLStorage() *URLStorage
func (*URLStorage) Iterate ¶
func (s *URLStorage) Iterate(f func(string))
Iterate iterates over the seen URLs, applying the provided function f to each element.
func (*URLStorage) Seen ¶
func (s *URLStorage) Seen(u string) bool
Seen returns true if a URL string has already been added.
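A sketch of checking and listing seen URLs with the standard library fmt package. How URLs are added to the storage is not shown in this section, so only the documented methods are exercised:

// listSeen reports whether a specific URL has been seen and prints every
// URL recorded in the storage.
func listSeen(s *URLStorage, u string) {
	if s.Seen(u) {
		fmt.Println("already seen:", u)
	}
	s.Iterate(func(stored string) {
		fmt.Println("seen:", stored)
	})
}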