crawler

package v0.0.0-...-1ec42cb

Published: Nov 26, 2024 License: MIT Imports: 12 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

var ErrBlockedByRobotstxt = errors.New("blocked by robots.txt")
var ErrDomainNotAllowed = errors.New("domain not allowed")
var ErrVisited = errors.New("URL already visited")

Functions

This section is empty.

Types

type BasicClient

type BasicClient struct {
	Options *ClientOptions
	// contains filtered or unexported fields
}

func NewBasicClient

func NewBasicClient(options *ClientOptions, client HTTPRequester) *BasicClient

func (*BasicClient) Get

func (c *BasicClient) Get(urlStr string) (*ClientResponse, error)

Get makes a GET request to a URL and returns the HTTP response or an error.

func (*BasicClient) GetUA

func (c *BasicClient) GetUA() string

GetUA returns the user-agent set for this client.

func (*BasicClient) Head

func (c *BasicClient) Head(urlStr string) (*ClientResponse, error)

Head makes a HEAD request to a URL and returns the HTTP response or an error.
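A minimal usage sketch; the import path is hypothetical (the real module path is abbreviated on this page), and it assumes *http.Client can be passed as the HTTPRequester, since its Do method matches the interface:

package main

import (
	"fmt"
	"log"
	"net/http"

	"example.com/crawler" // hypothetical import path; use the module's real one
)

func main() {
	options := &crawler.ClientOptions{UserAgent: "mybot/1.0"}

	// *http.Client satisfies HTTPRequester (see below), so the
	// default client can be used directly.
	client := crawler.NewBasicClient(options, http.DefaultClient)

	resp, err := client.Get("https://example.com/")
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Response.Body.Close()

	fmt.Println(resp.Response.StatusCode, resp.TTFB)
}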

type Client

type Client interface {
	Get(urlStr string) (*ClientResponse, error)
	Head(urlStr string) (*ClientResponse, error)
	GetUA() string
}

type ClientOptions

type ClientOptions struct {
	UserAgent        string
	BasicAuthDomains []string
	AuthUser         string
	AuthPass         string
}

type ClientResponse

type ClientResponse struct {
	Response *http.Response
	TTFB     int
}

type Crawler

type Crawler struct {
	Client Client
	// contains filtered or unexported fields
}

func NewCrawler

func NewCrawler(parsedURL *url.URL, options *Options, client Client) *Crawler

func (*Crawler) AddRequest

func (c *Crawler) AddRequest(r *RequestMessage) error

AddRequest processes a request message for the crawler. It checks whether the URL has already been visited, validates the domain, and checks whether the URL is blocked by the robots.txt rules, returning an error if any of these checks fails. Otherwise, it adds the request to the processing queue.
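The three exported error variables correspond to AddRequest's checks, so callers can tell them apart with errors.Is. A hedged sketch (imports: errors, log, net/url, plus the package itself):

// seed enqueues one URL, distinguishing the documented sentinel errors.
func seed(c *crawler.Crawler, rawURL string) {
	u, err := url.Parse(rawURL)
	if err != nil {
		log.Println("parse:", err)
		return
	}
	err = c.AddRequest(&crawler.RequestMessage{URL: u, Method: crawler.GET})
	switch {
	case errors.Is(err, crawler.ErrVisited):
		// Already queued or crawled; safe to ignore.
	case errors.Is(err, crawler.ErrDomainNotAllowed):
		// Outside the crawl's allowed domain.
	case errors.Is(err, crawler.ErrBlockedByRobotstxt):
		// Disallowed by the site's robots.txt rules.
	case err != nil:
		log.Println("add request:", err)
	}
}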

func (*Crawler) GetStatus

func (c *Crawler) GetStatus() Status

GetStatus returns the current crawler status.

func (*Crawler) OnResponse

func (c *Crawler) OnResponse(r ResponseCallback)

OnResponse sets the callback that the crawler will call for every response.

func (*Crawler) RobotstxtExists

func (c *Crawler) RobotstxtExists() bool

RobotstxtExists returns true if the robots.txt file exists.

func (*Crawler) SitemapExists

func (c *Crawler) SitemapExists() bool

SitemapExists returns true if the sitemap.xml file exists.

func (*Crawler) SitemapIsBlocked

func (c *Crawler) SitemapIsBlocked() bool

SitemapIsBlocked returns true if any of the website's sitemaps is blocked in the robots.txt file.

func (*Crawler) Start

func (c *Crawler) Start()

Start starts crawling the URL and reports each crawled URL through the response callback. It ends when there are no more URLs to crawl or the crawl limit is hit.

func (*Crawler) Stop

func (c *Crawler) Stop()

Stop stops the crawler by canceling the crawler's context.
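Putting the pieces together, a minimal end-to-end sketch. The import path is hypothetical, and it assumes that AddRequest is called to seed the crawl before Start and that Start blocks until the crawl finishes:

package main

import (
	"fmt"
	"log"
	"net/http"
	"net/url"

	"example.com/crawler" // hypothetical import path
)

func main() {
	seed, err := url.Parse("https://example.com/")
	if err != nil {
		log.Fatal(err)
	}

	client := crawler.NewBasicClient(
		&crawler.ClientOptions{UserAgent: "mybot/1.0"},
		http.DefaultClient,
	)

	c := crawler.NewCrawler(seed, &crawler.Options{
		CrawlLimit:   100,
		CrawlSitemap: true,
	}, client)

	// Called for every response the crawler produces.
	c.OnResponse(func(r *crawler.ResponseMessage) {
		if r.Error != nil {
			log.Println(r.URL, r.Error)
			return
		}
		fmt.Println(r.Response.StatusCode, r.URL)
	})

	if err := c.AddRequest(&crawler.RequestMessage{URL: seed, Method: crawler.GET}); err != nil {
		log.Fatal(err)
	}

	c.Start() // assumed to block until the crawl ends

	s := c.GetStatus()
	fmt.Printf("crawled %d of %d discovered URLs\n", s.Crawled, s.Discovered)
}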

type HTTPRequester

type HTTPRequester interface {
	Do(req *http.Request) (*http.Response, error)
}
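Note that *http.Client from the standard library satisfies this interface as-is, since (*http.Client).Do has exactly this signature. A client with custom settings can therefore be passed straight to NewBasicClient (imports: net/http, time):

requester := &http.Client{Timeout: 10 * time.Second}
client := crawler.NewBasicClient(&crawler.ClientOptions{UserAgent: "mybot/1.0"}, requester)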

type Method

type Method int
const (
	// Supported HTTP methods.
	GET Method = iota
	HEAD
)

type Options

type Options struct {
	CrawlLimit      int
	IgnoreRobotsTxt bool
	FollowNofollow  bool
	IncludeNoindex  bool
	CrawlSitemap    bool
	AllowSubdomains bool
}

type Queue

type Queue struct {
	// contains filtered or unexported fields
}

func NewQueue

func NewQueue() *Queue

func (*Queue) Ack

func (q *Queue) Ack(s string)

Ack acknowledges that a message has been processed.

func (*Queue) Active

func (q *Queue) Active() bool

Active returns true if the queue is not empty or has active elements.

func (*Queue) Count

func (q *Queue) Count() int

Count returns the number of items currently in the queue.

func (*Queue) Done

func (q *Queue) Done()

Done stops the queue and closes all of its channels.

func (*Queue) Poll

func (q *Queue) Poll() *RequestMessage

Poll returns the first element in the queue.

func (*Queue) Push

func (q *Queue) Push(value *RequestMessage)

Push adds a new value to the end of the queue.
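Taken together, the Queue methods suggest a push/poll/ack workflow. A hedged sketch; that Ack takes the message's URL string as its key, and that Poll blocks until an element is available, are assumptions not stated in the documentation:

q := crawler.NewQueue()
defer q.Done()

u, _ := url.Parse("https://example.com/")
q.Push(&crawler.RequestMessage{URL: u, Method: crawler.GET})

for q.Active() {
	msg := q.Poll()
	// ... process msg ...
	q.Ack(msg.URL.String()) // assumed: the ack key is the URL string
}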

type RequestMessage

type RequestMessage struct {
	URL          *url.URL
	IgnoreDomain bool
	Method       Method
	Data         interface{}
}

type ResponseCallback

type ResponseCallback func(r *ResponseMessage)

type ResponseMessage

type ResponseMessage struct {
	URL       *url.URL
	Response  *http.Response
	Error     error
	TTFB      int
	Blocked   bool
	InSitemap bool
	Timeout   bool
	Data      interface{}
}

type RobotsChecker

type RobotsChecker struct {
	// contains filtered or unexported fields
}

func NewRobotsChecker

func NewRobotsChecker(client Client) *RobotsChecker

func (*RobotsChecker) Exists

func (r *RobotsChecker) Exists(u *url.URL) bool

Exists returns true if the robots.txt file exists and is valid.

func (*RobotsChecker) GetSitemaps

func (r *RobotsChecker) GetSitemaps(u *url.URL) []string

GetSitemaps returns a list of sitemaps found in the robots.txt file.

func (*RobotsChecker) IsBlocked

func (r *RobotsChecker) IsBlocked(u *url.URL) bool

IsBlocked returns true if the URL is blocked by the robots.txt rules.
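A short sketch combining the three methods; client is a Client set up as in the BasicClient example above:

robots := crawler.NewRobotsChecker(client)

u, _ := url.Parse("https://example.com/private/page")
if robots.Exists(u) && robots.IsBlocked(u) {
	fmt.Println("blocked by robots.txt")
}
for _, s := range robots.GetSitemaps(u) {
	fmt.Println("sitemap:", s)
}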

type SitemapChecker

type SitemapChecker struct {
	// contains filtered or unexported fields
}

func NewSitemapChecker

func NewSitemapChecker(client Client, limit int) *SitemapChecker

func (*SitemapChecker) ParseSitemaps

func (sc *SitemapChecker) ParseSitemaps(URLs []string, callback func(u string))

ParseSitemaps parses the given sitemaps, calling the callback function on each entry. For each URL provided, it checks whether it is a sitemap index.

func (*SitemapChecker) SitemapExists

func (sc *SitemapChecker) SitemapExists(URLs []string) bool

SitemapExists checks whether any of the provided sitemap URLs exist.
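A sketch of checking and walking a site's sitemaps, reusing client from the examples above; that limit caps the number of sitemap entries parsed is an assumption:

sc := crawler.NewSitemapChecker(client, 1000)

sitemaps := []string{"https://example.com/sitemap.xml"}
if sc.SitemapExists(sitemaps) {
	sc.ParseSitemaps(sitemaps, func(u string) {
		fmt.Println("in sitemap:", u)
	})
}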

type Status

type Status struct {
	Crawled    int
	Crawling   bool
	Discovered int
}

type URLStorage

type URLStorage struct {
	// contains filtered or unexported fields
}

func NewURLStorage

func NewURLStorage() *URLStorage

func (*URLStorage) Add

func (s *URLStorage) Add(u string)

Add adds a URL string to the seen map.

func (*URLStorage) Iterate

func (s *URLStorage) Iterate(f func(string))

Iterate iterates over the seen map, applying the function f to each element.

func (*URLStorage) Seen

func (s *URLStorage) Seen(u string) bool

Seen returns true if a URL string has already been added.
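URLStorage appears to back the crawler's visited-URL check (see ErrVisited above). A minimal standalone sketch:

seen := crawler.NewURLStorage()
seen.Add("https://example.com/a")

if !seen.Seen("https://example.com/b") {
	seen.Add("https://example.com/b")
}

seen.Iterate(func(u string) {
	fmt.Println(u)
})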
