Documentation ¶
Index ¶
- Constants
- func LongestCommonPrefix(path1 string, path2 string) string
- func ReduceURL(base *neturl.URL, url *neturl.URL) string
- type Crawler
- type Downloaded
- func Download(input *Input) *Downloaded
- func (d *Downloaded) AddHeader(key string, value string)
- func (d *Downloaded) GetAssetURLs() []*neturl.URL
- func (d *Downloaded) GetDiscoveredURLs() []*neturl.URL
- func (d *Downloaded) GetHeaderKeys() []string
- func (d *Downloaded) GetHeaderValues(key string) []string
- func (d *Downloaded) ProcessURL(context urlContext, url string) (string, error)
- func (d *Downloaded) Reduce(url *neturl.URL) string
- type Input
- type Link
- type QueueItem
Constants ¶
const (
	// CSSUri url from url()
	CSSUri urlContext = 1 + iota
	// HTMLTagA url from <a href=""></a>
	HTMLTagA
	// HTMLTagForm url from <form action="" />
	HTMLTagForm
	// HTMLTagImg url from <img src="" />
	HTMLTagImg
	// HTMLTagLinkStylesheet url from <link rel="stylesheet" href="" />
	HTMLTagLinkStylesheet
	// HTMLTagScript url from <script src="" />
	HTMLTagScript
	// HTTP3xxLocation url from HTTP response code 3xx
	HTTP3xxLocation
)
Variables ¶
This section is empty.
Functions ¶
func LongestCommonPrefix ¶
LongestCommonPrefix returns the common path elements between two paths.
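A minimal usage sketch; the import path below is hypothetical (adjust it to this package's actual module path) and the inputs are illustrative:

package main

import (
	"fmt"

	crawler "example.com/crawler" // hypothetical import path
)

func main() {
	// Both paths share the /assets/css elements.
	prefix := crawler.LongestCommonPrefix("/assets/css/site.css", "/assets/css/print.css")
	fmt.Println(prefix) // prints the shared path prefix
}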
Types ¶
type Crawler ¶
type Crawler interface {
	GetClientTimeout() time.Duration

	SetAutoDownloadDepth(uint64)
	GetAutoDownloadDepth() uint64
	SetNoCrossHost(bool)
	GetNoCrossHost() bool

	AddRequestHeader(string, string)
	SetRequestHeader(string, string)
	GetRequestHeaderValues(string) []string

	SetWorkerCount(uint64) error
	GetWorkerCount() uint64

	SetURLRewriter(func(*url.URL))
	SetOnURLShouldQueue(func(*url.URL) bool)
	SetOnURLShouldDownload(func(*url.URL) bool)
	SetOnDownload(func(*url.URL))
	SetOnDownloaded(func(*Downloaded))

	GetEnqueuedCount() uint64
	GetDownloadedCount() uint64
	GetLinkFoundCount() uint64

	HasStarted() bool
	HasStopped() bool
	IsRunning() bool
	IsBusy() bool

	Start()
	Stop()
	Enqueue(QueueItem)
	Download(QueueItem) *Downloaded
	Downloaded() (*Downloaded, bool)
	DownloadedNotBlocking() *Downloaded
	// contains filtered or unexported methods
}
Crawler represents an object that can process download requests.
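This documentation does not show how a Crawler is constructed, so the sketch below takes an existing instance and wires illustrative settings and callbacks onto it; the depth, worker count, header, and callback bodies are all assumptions:

package example

import (
	"log"
	neturl "net/url"
	"strings"

	crawler "example.com/crawler" // hypothetical import path
)

// configure is a sketch: Crawler construction is not covered here,
// so an existing instance is taken as a parameter.
func configure(c crawler.Crawler) {
	c.SetAutoDownloadDepth(1)
	c.SetNoCrossHost(true)
	c.AddRequestHeader("User-Agent", "my-mirror/1.0") // illustrative header

	if err := c.SetWorkerCount(4); err != nil {
		log.Fatal(err)
	}

	// Only queue URLs under a whitelisted path (illustrative rule).
	c.SetOnURLShouldQueue(func(u *neturl.URL) bool {
		return strings.HasPrefix(u.Path, "/docs/")
	})

	// Log each completed download.
	c.SetOnDownloaded(func(d *crawler.Downloaded) {
		log.Printf("downloaded %s: status %d", d.BaseURL, d.StatusCode)
	})

	c.Start()
}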
type Downloaded ¶
type Downloaded struct {
	Input   *Input
	BaseURL *url.URL
	Body    string
	Error   error

	LinksAssets     map[string]Link
	LinksDiscovered map[string]Link

	StatusCode int
	// contains filtered or unexported fields
}
Downloaded represents processed data after downloading.
func Download ¶
func Download(input *Input) *Downloaded
Download returns parsed data after downloading the specified URL.
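A sketch of a one-off download; the target URL is a placeholder and the import path is hypothetical:

package main

import (
	"fmt"
	"log"
	"net/http"
	neturl "net/url"

	crawler "example.com/crawler" // hypothetical import path
)

func main() {
	u, err := neturl.Parse("https://example.com/") // placeholder URL
	if err != nil {
		log.Fatal(err)
	}

	d := crawler.Download(&crawler.Input{
		Client: http.DefaultClient,
		URL:    u,
	})
	if d.Error != nil {
		log.Fatal(d.Error)
	}
	fmt.Println(d.StatusCode, len(d.Body))
}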
func (*Downloaded) AddHeader ¶
func (d *Downloaded) AddHeader(key string, value string)
AddHeader adds a new header.
func (*Downloaded) GetAssetURLs ¶
func (d *Downloaded) GetAssetURLs() []*neturl.URL
GetAssetURLs returns the resolved asset URLs.
func (*Downloaded) GetDiscoveredURLs ¶
func (d *Downloaded) GetDiscoveredURLs() []*neturl.URL
GetDiscoveredURLs returns the resolved URLs of discovered links.
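Given a *Downloaded from a previous call, both this getter and GetAssetURLs can be iterated directly; a sketch, reusing the hypothetical crawler import alias from above:

package example

import (
	"fmt"

	crawler "example.com/crawler" // hypothetical import path
)

func printURLs(d *crawler.Downloaded) {
	for _, u := range d.GetAssetURLs() {
		fmt.Println("asset:", u)
	}
	for _, u := range d.GetDiscoveredURLs() {
		fmt.Println("discovered:", u)
	}
}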
func (*Downloaded) GetHeaderKeys ¶
func (d *Downloaded) GetHeaderKeys() []string
GetHeaderKeys returns all header keys.
func (*Downloaded) GetHeaderValues ¶
func (d *Downloaded) GetHeaderValues(key string) []string
GetHeaderValues returns the values of the specified header key.
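A sketch that adds an illustrative header and then walks all stored headers via the two getters:

package example

import (
	"fmt"

	crawler "example.com/crawler" // hypothetical import path
)

func dumpHeaders(d *crawler.Downloaded) {
	d.AddHeader("X-Example", "value") // illustrative key/value

	for _, key := range d.GetHeaderKeys() {
		for _, value := range d.GetHeaderValues(key) {
			fmt.Printf("%s: %s\n", key, value)
		}
	}
}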
func (*Downloaded) ProcessURL ¶
func (d *Downloaded) ProcessURL(context urlContext, url string) (string, error)
ProcessURL validates the URL and returns its rewritten string representation.
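The context argument takes one of the exported constants listed above; a sketch resolving an <a href> value found in a downloaded page (the relative link is illustrative):

package example

import (
	crawler "example.com/crawler" // hypothetical import path
)

func rewriteLink(d *crawler.Downloaded) (string, error) {
	// HTMLTagA marks the value as coming from an <a href=""> attribute.
	return d.ProcessURL(crawler.HTMLTagA, "../about.html")
}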
type Input ¶
type Input struct {
	Client      *http.Client
	Header      http.Header
	NoCrossHost bool
	Rewriter    *func(*url.URL)
	URL         *url.URL
}
Input represents a download request ready to be processed.
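Note that Rewriter is a pointer to a func, so a func value's address must be taken. A sketch of a fully populated Input; the timeout, header, and rewrite rule are illustrative:

package example

import (
	"net/http"
	neturl "net/url"
	"time"

	crawler "example.com/crawler" // hypothetical import path
)

func newInput(target *neturl.URL) *crawler.Input {
	rewrite := func(u *neturl.URL) {
		u.Scheme = "https" // e.g. force https on every processed URL
	}

	return &crawler.Input{
		Client:      &http.Client{Timeout: 30 * time.Second},
		Header:      http.Header{"User-Agent": {"my-mirror/1.0"}},
		NoCrossHost: true,
		Rewriter:    &rewrite, // pointer to the func value
		URL:         target,
	}
}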