Documentation
¶
Index ¶
- Constants
- type CollectorInterface
- type CollectorWrapper
- type CrawlManager
- func (cm *CrawlManager) Crawl() error
- func (cm *CrawlManager) GetDBManager() dbmanager.DatabaseManagerInterface
- func (cm *CrawlManager) GetLogger() loggo.LoggerInterface
- func (cm *CrawlManager) GetOptions() *CrawlOptions
- func (cm *CrawlManager) SetOptions(options *CrawlOptions) error
- func (cm *CrawlManager) UpdateStats(_ *CrawlOptions, matchingTerms []string)
- type CrawlManagerInterface
- type CrawlOptions
- type MockStatsManager
- type Results
- type StatsManager
Constants ¶
View Source
const ( DefaultParallelism = 2 DefaultDelay = 3000 * time.Millisecond )
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type CollectorInterface ¶
CollectorInterface defines the interface for the crawling logic.
type CollectorWrapper ¶
type CollectorWrapper struct { Logger loggo.LoggerInterface DisallowedURLFilters []*regexp.Regexp // contains filtered or unexported fields }
CollectorWrapper is a wrapper around a colly.Collector that implements the CollectorInterface.
func NewCollectorWrapper ¶
func NewCollectorWrapper(collector *colly.Collector, logger loggo.LoggerInterface, disallowedURLFilters []*regexp.Regexp) *CollectorWrapper
func (*CollectorWrapper) GetCollector ¶
func (cw *CollectorWrapper) GetCollector() *colly.Collector
GetCollector implements the CollectorInterface method.
func (*CollectorWrapper) Visit ¶
func (cw *CollectorWrapper) Visit(URL string) error
Visit visits the given URL, with logging and timing.
type CrawlManager ¶
type CrawlManager struct { CollectorInstance *CollectorWrapper CrawlingMu *sync.Mutex DBManager dbmanager.DatabaseManagerInterface Logger loggo.LoggerInterface Options *CrawlOptions Results *Results StatsManager *StatsManager Storage *redisstorage.Storage TermMatcher *termmatcher.TermMatcher // Ensure TermMatcher is included }
func NewCrawlManager ¶
func NewCrawlManager( logger loggo.LoggerInterface, dbManager dbmanager.DatabaseManagerInterface, collectorInstance *CollectorWrapper, options *CrawlOptions, storage *redisstorage.Storage, ) *CrawlManager
func (*CrawlManager) Crawl ¶
func (cm *CrawlManager) Crawl() error
func (*CrawlManager) GetDBManager ¶
func (cm *CrawlManager) GetDBManager() dbmanager.DatabaseManagerInterface
func (*CrawlManager) GetLogger ¶
func (cm *CrawlManager) GetLogger() loggo.LoggerInterface
func (*CrawlManager) GetOptions ¶
func (cm *CrawlManager) GetOptions() *CrawlOptions
GetOptions returns the current crawl options.
func (*CrawlManager) SetOptions ¶
func (cm *CrawlManager) SetOptions(options *CrawlOptions) error
SetOptions sets the crawl options.
func (*CrawlManager) UpdateStats ¶
func (cm *CrawlManager) UpdateStats(_ *CrawlOptions, matchingTerms []string)
type CrawlManagerInterface ¶
type CrawlManagerInterface interface { Crawl() error GetDBManager() dbmanager.DatabaseManagerInterface GetLogger() loggo.LoggerInterface SetOptions(options *CrawlOptions) error UpdateStats(options *CrawlOptions, matchingTerms []string) }
type CrawlOptions ¶
type CrawlOptions struct { CrawlSiteID string Debug bool DelayBetweenRequests time.Duration MaxConcurrentRequests int MaxDepth int SearchTerms []string StartURL string }
CrawlOptions represents the configuration for a crawl.
type MockStatsManager ¶
type MockStatsManager struct { }
func (*MockStatsManager) IncrementTotalLinks ¶
func (m *MockStatsManager) IncrementTotalLinks()
type StatsManager ¶
StatsManager is a struct that manages crawling statistics. It includes fields for link statistics and a mutex for thread safety.
func NewStatsManager ¶
func NewStatsManager() *StatsManager
NewStatsManager creates a new StatsManager with initialized fields.
Click to show internal directories.
Click to hide internal directories.