crawler

package
v0.0.0-...-6d375a0
Published: Nov 23, 2024 License: MIT Imports: 20 Imported by: 0

Documentation

Index

Constants

const (
	DefaultParallelism = 2
	DefaultDelay       = 3000 * time.Millisecond
)

Variables

This section is empty.

Functions

This section is empty.

Types

type CollectorInterface

type CollectorInterface interface {
	GetCollector() *colly.Collector
	Visit(url string) error
}

CollectorInterface defines the interface for the crawling logic.
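
Since CollectorWrapper (below) provides both methods, a compile-time assertion is a conventional way to document conformance; this line is an illustrative Go idiom, not part of the package:

	var _ CollectorInterface = (*CollectorWrapper)(nil)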

type CollectorWrapper

type CollectorWrapper struct {
	Logger               loggo.LoggerInterface
	DisallowedURLFilters []*regexp.Regexp
	// contains filtered or unexported fields
}

CollectorWrapper is a wrapper around colly.Collector that implements the CollectorInterface.

func NewCollectorWrapper

func NewCollectorWrapper(collector *colly.Collector, logger loggo.LoggerInterface, disallowedURLFilters []*regexp.Regexp) *CollectorWrapper
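
A minimal usage sketch; the logger is assumed to come from your own loggo setup, and the filter pattern is a placeholder:

	var logger loggo.LoggerInterface // provided by your logging setup

	filters := []*regexp.Regexp{
		regexp.MustCompile(`\.(png|jpg|css)$`), // placeholder: skip asset URLs
	}

	wrapper := NewCollectorWrapper(colly.NewCollector(), logger, filters)
	if err := wrapper.Visit("https://example.com"); err != nil {
		// handle the error; Visit logs and times the request per its doc below
	}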

func (*CollectorWrapper) GetCollector

func (cw *CollectorWrapper) GetCollector() *colly.Collector

GetCollector implements the CollectorInterface method.

func (*CollectorWrapper) Visit

func (cw *CollectorWrapper) Visit(URL string) error

Visit wraps the underlying collector's Visit call, adding logging and timing.

type CrawlManager

type CrawlManager struct {
	CollectorInstance *CollectorWrapper
	CrawlingMu        *sync.Mutex
	DBManager         dbmanager.DatabaseManagerInterface
	Logger            loggo.LoggerInterface
	Options           *CrawlOptions
	Results           *Results
	StatsManager      *StatsManager
	Storage           *redisstorage.Storage
	TermMatcher       *termmatcher.TermMatcher // matches crawled content against the configured search terms
}

func NewCrawlManager

func NewCrawlManager(
	logger loggo.LoggerInterface,
	dbManager dbmanager.DatabaseManagerInterface,
	collectorInstance *CollectorWrapper,
	options *CrawlOptions,
	storage *redisstorage.Storage,
) *CrawlManager
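
A wiring sketch under stated assumptions: the logger and database manager come from your own setup, the Redis address is a placeholder, and the Storage fields follow the gocolly/redisstorage package:

	var (
		logger    loggo.LoggerInterface              // your logging setup
		dbManager dbmanager.DatabaseManagerInterface // your database setup
	)

	storage := &redisstorage.Storage{
		Address: "localhost:6379", // placeholder Redis address
		Prefix:  "crawler",
	}

	wrapper := NewCollectorWrapper(colly.NewCollector(), logger, nil)
	cm := NewCrawlManager(logger, dbManager, wrapper, &CrawlOptions{
		StartURL: "https://example.com", // placeholder start URL
		MaxDepth: 2,
	}, storage)

	if err := cm.Crawl(); err != nil {
		// handle the crawl error
	}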

func (*CrawlManager) Crawl

func (cm *CrawlManager) Crawl() error

func (*CrawlManager) GetDBManager

func (cm *CrawlManager) GetDBManager() dbmanager.DatabaseManagerInterface

func (*CrawlManager) GetLogger

func (cm *CrawlManager) GetLogger() loggo.LoggerInterface

func (*CrawlManager) GetOptions

func (cm *CrawlManager) GetOptions() *CrawlOptions

GetOptions returns the current crawl options.

func (*CrawlManager) SetOptions

func (cm *CrawlManager) SetOptions(options *CrawlOptions) error

SetOptions sets the crawl options.
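
A small sketch of adjusting options on an existing manager cm; whether SetOptions validates its argument is not documented here, so the error handling is a conservative assumption:

	opts := cm.GetOptions()
	opts.MaxDepth = 3
	if err := cm.SetOptions(opts); err != nil {
		// handle invalid options
	}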

func (*CrawlManager) UpdateStats

func (cm *CrawlManager) UpdateStats(_ *CrawlOptions, matchingTerms []string)

type CrawlManagerInterface

type CrawlManagerInterface interface {
	Crawl() error
	GetDBManager() dbmanager.DatabaseManagerInterface
	GetLogger() loggo.LoggerInterface
	SetOptions(options *CrawlOptions) error
	UpdateStats(options *CrawlOptions, matchingTerms []string)
}
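
Callers can depend on this interface rather than the concrete CrawlManager, which makes testing with fakes straightforward; the function below is a hypothetical sketch:

	// runCrawl is a hypothetical caller that needs only the interface.
	func runCrawl(m CrawlManagerInterface) error {
		if err := m.SetOptions(&CrawlOptions{StartURL: "https://example.com"}); err != nil {
			return err
		}
		return m.Crawl()
	}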

type CrawlOptions

type CrawlOptions struct {
	CrawlSiteID           string
	Debug                 bool
	DelayBetweenRequests  time.Duration
	MaxConcurrentRequests int
	MaxDepth              int
	SearchTerms           []string
	StartURL              string
}

CrawlOptions represents the configuration for a crawl.
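
A configuration sketch; the site ID, start URL, and search terms are placeholders, while the defaults come from this package's constants:

	opts := &CrawlOptions{
		CrawlSiteID:           "example-site",        // placeholder ID
		StartURL:              "https://example.com", // placeholder URL
		MaxDepth:              2,
		MaxConcurrentRequests: DefaultParallelism,
		DelayBetweenRequests:  DefaultDelay,
		SearchTerms:           []string{"golang", "crawler"},
		Debug:                 false,
	}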

type MockStatsManager

type MockStatsManager struct{}

func (m *MockStatsManager) IncrementTotalLinks()

type Results

type Results struct {
	Pages []models.PageData
}

Results holds the results of the crawling process.

func NewResults

func NewResults() *Results

NewResults creates a new instance of Results.

type StatsManager

type StatsManager struct {
	LinkStats   *stats.Stats
	LinkStatsMu sync.RWMutex
}

StatsManager is a struct that manages crawling statistics. It includes fields for link statistics and a mutex for thread safety.

func NewStatsManager

func NewStatsManager() *StatsManager

NewStatsManager creates a new StatsManager with initialized fields.
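
A minimal sketch of reading the shared statistics under the exported read-write mutex; the fields inside stats.Stats are not documented here, so only the guarded access pattern is shown:

	sm := NewStatsManager()

	sm.LinkStatsMu.RLock()
	snapshot := sm.LinkStats // guarded read of the shared stats
	sm.LinkStatsMu.RUnlock()
	_ = snapshot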
