Documentation ¶
Index ¶
- Constants
- func CrawlURL(target string, client CacheAbstraction, profile CrawlingProfile)
- func GetMemStats() int64
- func GoodTLD(domain string, tldlist, countrylist map[string]string) (value, description string, result bool)
- func HEADCheck(target string, userAgent string) int
- func IntSecondsToDuration(seconds int) time.Duration
- func RandomSleep(seconds int)
- func SafeDescription(description string) (safeDescription string)
- func URLConverters(urlString string) string
- type CacheAbstraction
- type CrawlingProfile
- type DomainsCrawler
- func (dc *DomainsCrawler) CountDomainsParent(domain string, client CacheAbstraction) (cnt int64)
- func (dc *DomainsCrawler) Elapsed() int64
- func (dc *DomainsCrawler) GetRobots() error
- func (dc *DomainsCrawler) OKToCrawl() bool
- func (dc *DomainsCrawler) OnRequest(r *colly.Request)
- func (dc *DomainsCrawler) RunTicker()
- func (dc *DomainsCrawler) SetTarget(target string)
- func (dc *DomainsCrawler) Stop()
- type RoboTester
- type TargetMetaData
Constants ¶
View Source
const HundredPercent = 100
View Source
const (
OKToCrawlDivisor float64 = 10
)
View Source
const (
One = iota + 1
)
Variables ¶
This section is empty.
Functions ¶
func CrawlURL ¶
func CrawlURL(target string, client CacheAbstraction, profile CrawlingProfile)
func GetMemStats ¶
func GetMemStats() int64
func GoodTLD ¶
func GoodTLD(domain string, tldlist, countrylist map[string]string) (value, description string, result bool)
func HEADCheck ¶
func HEADCheck(target string, userAgent string) int
func IntSecondsToDuration ¶
func IntSecondsToDuration(seconds int) time.Duration
func RandomSleep ¶ added in v1.1.2
func RandomSleep(seconds int)
func SafeDescription ¶
func SafeDescription(description string) (safeDescription string)
func URLConverters ¶
func URLConverters(urlString string) string
Types ¶
type CacheAbstraction ¶
type CrawlingProfile ¶
type CrawlingProfile struct {
	DomainWhitelist []string // e.g. DomainWhitelist = []string{"tumblr.com", ...}
	GenericTLDList map[string]string // e.g. GenericTLDList = map[string]string{"aaa": "generic_aaa", "aarp": "generic_aarp", ...}
	TLDList map[string]string // e.g. TLDList = map[string]string{"ac": "Ascension Island", ...}
	SecondLevelWhiteList []string // e.g. SecondLevelWhiteList = []string{"com", "net", ...}
	AllowedPorts map[string]string // e.g. AllowedPorts = map[string]string{"80": "1", "81": "1", ...}
	DisallowedPorts map[string]string // e.g. DisallowedPorts = map[string]string{"25": "1", "465": "1", "587": "1", ...} (SMTP)
	PermanentDomainBlacklist map[string]string // e.g. PermanentDomainBlacklist = map[string]string{"localhost": "1", "127.0.0.1": "1", ...}
	BlockedTLDs map[string]string // e.g. BlockedTLDs = map[string]string{"facebook": "1", ...}
	ConcurrentRequests int
	RandomDelay int
	CheckEvery int
	RequestTimeout int
	MaxRunTime int
	MaxResults int
	MaxDomainsPerParent int
	MaxTimeToDiscoverDomains int
	GracePeriod int
	// Bytes
	MaxMemory int64
	// in %
	MaxMemoryPressure int
	// Both HEAD and GET
	UserAgent string
}
type DomainsCrawler ¶
type DomainsCrawler struct {
// contains filtered or unexported fields
}
func NewCrawler ¶
func NewCrawler(cache *memcache.CacheType, client CacheAbstraction, profile CrawlingProfile) (crawler *DomainsCrawler)
func (*DomainsCrawler) CountDomainsParent ¶
func (dc *DomainsCrawler) CountDomainsParent(domain string, client CacheAbstraction) (cnt int64)
func (*DomainsCrawler) Elapsed ¶
func (dc *DomainsCrawler) Elapsed() int64
func (*DomainsCrawler) GetRobots ¶ added in v1.0.7
func (dc *DomainsCrawler) GetRobots() error
func (*DomainsCrawler) OKToCrawl ¶
func (dc *DomainsCrawler) OKToCrawl() bool
func (*DomainsCrawler) OnRequest ¶
func (dc *DomainsCrawler) OnRequest(r *colly.Request)
func (*DomainsCrawler) RunTicker ¶
func (dc *DomainsCrawler) RunTicker()
func (*DomainsCrawler) SetTarget ¶
func (dc *DomainsCrawler) SetTarget(target string)
func (*DomainsCrawler) Stop ¶
func (dc *DomainsCrawler) Stop()
type RoboTester ¶ added in v1.0.7
type RoboTester struct {
// contains filtered or unexported fields
}
func NewRoboTester ¶ added in v1.0.7
func NewRoboTester(fullURL, userAgent string) (*RoboTester, error)
func (*RoboTester) GetRobots ¶ added in v1.0.7
func (rt *RoboTester) GetRobots(path string) (robots *robotstxt.RobotsData, err error)
func (*RoboTester) Test ¶ added in v1.0.7
func (rt *RoboTester) Test(path string) bool
type TargetMetaData ¶
type TargetMetaData struct {
CrawledTS int64 `json:"crawled_ts"`
}
Source Files ¶
Click to show internal directories.
Click to hide internal directories.